Java编程实现文件编码转换

来源:互联网 发布:淘宝运营需要学美工吗 编辑:程序博客网 时间:2024/06/05 08:26

最近在做自然语言处理的工程,数据源是由各个工作人员编写汇报上来的中文文本,存在文本编码不统一的问题。借助cpDetector工具自动识别文本的编码,编写函数JudgeFileCode,并实现文本编码的转换,统一编码为UTF-8,写个博客保存一下。

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;


import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;


/**
 * Command-line utility that re-encodes text files in place.
 *
 * <p>Each file is read using its source charset (auto-detected with cpdetector
 * when none is given) and then overwritten with the same text encoded in the
 * target charset. The original bytes are not backed up.
 */
public class FileCharsetConverter {

    /** Size, in chars, of the buffer used while reading a file's content. */
    private static final int READ_BUFFER_SIZE = 8192;

    /** Whether the detectors have already been added to the detector proxy. */
    private static boolean detectorsRegistered = false;

    /**
     * Converts a single file, or every accepted entry of a directory, from one
     * charset to another, overwriting the files in place.
     *
     * <p>For a directory, the entries are listed through {@code filter} (when
     * one is given) and each listed entry is processed recursively. Note that
     * the filter is therefore applied to sub-directory names as well.
     *
     * @param file            file or directory to convert
     * @param fromCharsetName charset the file is currently encoded in, or
     *                        {@code null} to auto-detect it for each file
     * @param toCharsetName   charset to re-encode the file in
     * @param filter          optional name filter; {@code null} accepts everything
     * @throws Exception if a file cannot be read or written, or if an explicitly
     *                   given charset is not supported
     */
    private static void convert(File file, String fromCharsetName, String toCharsetName,
            FilenameFilter filter) throws Exception {
        if (file.isDirectory()) {
            File[] children = (filter == null) ? file.listFiles() : file.listFiles(filter);
            if (children == null) {
                // listFiles() returns null when the directory cannot be read.
                System.err.println("Cannot list directory, skipping: " + file);
                return;
            }
            for (File child : children) {
                convert(child, fromCharsetName, toCharsetName, filter);
            }
            return;
        }

        if (filter != null && !filter.accept(file.getParentFile(), file.getName())) {
            return;
        }

        String sourceCharset = fromCharsetName;
        if (sourceCharset == null) {
            sourceCharset = JudgeFileCode(file);
            // Skip, rather than abort the whole batch, when detection failed or
            // reported a name this JVM cannot decode.
            // NOTE(review): cpdetector presumably reports a placeholder charset
            // for undetectable input - confirm against the library docs.
            if (sourceCharset == null || !Charset.isSupported(sourceCharset)) {
                System.err.println("Could not detect a usable encoding (" + sourceCharset
                        + "), skipping: " + file);
                return;
            }
        }

        String fileContent = getFileContentFromCharset(file, sourceCharset);
        saveFile2Charset(file, toCharsetName, fileContent);
    }

    /**
     * Writes text to a file using the given charset, replacing any existing
     * content.
     *
     * @param file          file to write
     * @param toCharsetName charset used to encode {@code content}
     * @param content       text to write
     * @throws UnsupportedCharsetException if {@code toCharsetName} is not supported
     * @throws Exception                   if the file cannot be opened or written
     */
    private static void saveFile2Charset(File file, String toCharsetName, String content)
            throws Exception {
        if (!Charset.isSupported(toCharsetName)) {
            throw new UnsupportedCharsetException(toCharsetName);
        }
        // try-with-resources closes the stream even when write() throws.
        try (FileOutputStream outputStream = new FileOutputStream(file);
                OutputStreamWriter writer = new OutputStreamWriter(outputStream, toCharsetName)) {
            writer.write(content);
        }
    }

    /**
     * Reads a whole file as text using the given charset.
     *
     * @param file            file to read
     * @param fromCharsetName charset used to decode the file's bytes
     * @return the decoded content with leading and trailing whitespace removed
     * @throws UnsupportedCharsetException if {@code fromCharsetName} is not supported
     * @throws Exception                   if the file cannot be opened or read
     */
    private static String getFileContentFromCharset(File file, String fromCharsetName)
            throws Exception {
        if (!Charset.isSupported(fromCharsetName)) {
            throw new UnsupportedCharsetException(fromCharsetName);
        }
        StringBuilder content = new StringBuilder();
        try (FileInputStream inputStream = new FileInputStream(file);
                InputStreamReader reader = new InputStreamReader(inputStream, fromCharsetName)) {
            // A single read() may return fewer chars than requested, so keep
            // reading until end of stream instead of sizing one buffer from the
            // file's byte length.
            char[] buffer = new char[READ_BUFFER_SIZE];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                content.append(buffer, 0, count);
            }
        }
        // Kept from the original implementation: surrounding whitespace is dropped.
        return content.toString().trim();
    }

    /**
     * Returns the detector proxy, adding the individual detectors to it the
     * first time it is requested instead of on every detection.
     */
    private static CodepageDetectorProxy configuredDetector() {
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        if (!detectorsRegistered) {
            detector.add(new ParsingDetector(false));
            detector.add(JChardetFacade.getInstance());
            detector.add(ASCIIDetector.getInstance());
            detector.add(UnicodeDetector.getInstance());
            detectorsRegistered = true;
        }
        return detector;
    }

    /**
     * Detects the charset of a file with cpdetector.
     *
     * @param file file to inspect
     * @return the name of the detected charset, or {@code null} when detection
     *         threw or produced no result
     */
    private static String JudgeFileCode(File file) {
        Charset charset = null;
        try {
            // toURI().toURL() escapes characters that are illegal in URLs;
            // the deprecated File.toURL() does not.
            charset = configuredDetector().detectCodepage(file.toURI().toURL());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return (charset != null) ? charset.name() : null;
    }

    /**
     * Converts every entry of the {@code sourceText} directory whose name ends
     * with {@code txt} to UTF-8, detecting each file's current encoding.
     *
     * @param args unused
     * @throws Exception if the source directory cannot be listed or a file
     *                   cannot be converted
     */
    public static void main(String[] args) throws Exception {
        File sourceDir = new File("sourceText");
        File[] entries = sourceDir.listFiles();
        if (entries == null) {
            throw new IllegalStateException(
                    "Not a readable directory: " + sourceDir.getAbsolutePath());
        }

        FilenameFilter txtOnly = new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith("txt");
            }
        };

        for (File entry : entries) {
            // Passing null lets convert() detect the encoding per file, and only
            // for files the filter accepts.
            convert(entry, null, "utf-8", txtOnly);
        }
    }
}

参考博客:http://weict1988.iteye.com/blog/1003379

http://blog.csdn.net/wfdtxz/article/details/26166853

http://blog.csdn.net/mhmyqn/article/details/37917947#java



原创粉丝点击