检查文件的编码格式

来源：互联网 ｜ 发布：ubuntu 清空系统 ｜ 编辑：程序博客网 ｜ 时间：2024/05/01 21:25

可以使用开源项目 cpdetector 来检测文件编码，它的主页是：http://cpdetector.sourceforge.net/。它的类库很小，只有 500K 左右。cpdetector 基于统计学原理进行探测，因此不保证结果完全正确。利用该类库判定文本文件编码的代码如下：

/** * 利用第三方开源包cpdetector获取文件编码格式 *  * @param path *            要判断文件编码格式的源文件的路径 * @author tg * @version 2012-7-12 14:05 */public static String getFileEncode(String path) {/* * detector是探测器，它把探测任务交给具体的探测实现类的实例完成。 * cpDetector内置了一些常用的探测实现类，这些探测实现类的实例可以通过add方法 加进来，如ParsingDetector、 * JChardetFacade、ASCIIDetector、UnicodeDetector。 * detector按照“谁最先返回非空的探测结果，就以该结果为准”的原则返回探测到的 * 字符集编码。使用需要用到三个第三方JAR包：antlr.jar、chardet.jar和cpdetector.jar * cpDetector是基于统计学原理的，不保证完全正确。 */CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();/* * ParsingDetector可用于检查HTML、XML等文件或字符流的编码,构造方法中的参数用于 * 指示是否显示探测过程的详细信息，为false不显示。 */detector.add(new ParsingDetector(false));/* * JChardetFacade封装了由Mozilla组织提供的JChardet，它可以完成大多数文件的编码 * 测定。所以，一般有了这个探测器就可满足大多数项目的要求，如果你还不放心，可以 * 再多加几个探测器，比如下面的ASCIIDetector、UnicodeDetector等。 */detector.add(JChardetFacade.getInstance());// 用到antlr.jar、chardet.jar// ASCIIDetector用于ASCII编码测定detector.add(ASCIIDetector.getInstance());// UnicodeDetector用于Unicode家族编码的测定detector.add(UnicodeDetector.getInstance());java.nio.charset.Charset charset = null;File f = new File(path);try {charset = detector.detectCodepage(f.toURI().toURL());} catch (Exception ex) {ex.printStackTrace();}if (charset != null)return charset.name();elsereturn null;}/** * 利用第三方开源包cpdetector获取URL对应的文件编码 *  * @param path *            要判断文件编码格式的源文件的URL * @author tg * @version 2012-7-12 14:05 */public static String getFileEncode(URL url) {/* * detector是探测器，它把探测任务交给具体的探测实现类的实例完成。 * cpDetector内置了一些常用的探测实现类，这些探测实现类的实例可以通过add方法 加进来，如ParsingDetector、 * JChardetFacade、ASCIIDetector、UnicodeDetector。 * detector按照“谁最先返回非空的探测结果，就以该结果为准”的原则返回探测到的 * 字符集编码。使用需要用到三个第三方JAR包：antlr.jar、chardet.jar和cpdetector.jar * cpDetector是基于统计学原理的，不保证完全正确。 */CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();/* * ParsingDetector可用于检查HTML、XML等文件或字符流的编码,构造方法中的参数用于 * 指示是否显示探测过程的详细信息，为false不显示。 */detector.add(new ParsingDetector(false));/* * JChardetFacade封装了由Mozilla组织提供的JChardet，它可以完成大多数文件的编码 * 
测定。所以，一般有了这个探测器就可满足大多数项目的要求，如果你还不放心，可以 * 再多加几个探测器，比如下面的ASCIIDetector、UnicodeDetector等。 */detector.add(JChardetFacade.getInstance());// 用到antlr.jar、chardet.jar// ASCIIDetector用于ASCII编码测定detector.add(ASCIIDetector.getInstance());// UnicodeDetector用于Unicode家族编码的测定detector.add(UnicodeDetector.getInstance());java.nio.charset.Charset charset = null;try {charset = detector.detectCodepage(url);// 就这一行url不同，getFileEncode是根据文件路径来} catch (Exception ex) {ex.printStackTrace();}if (charset != null)return charset.name();elsereturn null;}/** * 根据编码 和文件路径读取对应的文件 *  * @param configFilePath *            文件路径 * @param encoding *            编码 */public static void readFile(String configFilePath, String encoding) {FileInputStream inputStream = null;BufferedReader reader = null;try {inputStream = new FileInputStream(configFilePath);reader = new BufferedReader(new InputStreamReader(inputStream,"utf-8"));String tempString = null;int line = 1;// 一次读入一行，直到读入null为文件结束while ((tempString = reader.readLine()) != null) {// 显示行号System.out.println("line " + line + ": " + tempString);line++;}} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (UnsupportedEncodingException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();} finally {if (inputStream != null) {try {inputStream.close();} catch (IOException e1) {}}if (reader != null) {try {reader.close();} catch (IOException e1) {}}}}public static void main(String[] args) {// String configFilePath="E:\\ftpshare\\ANZBS20131113.txt.backup";// String configFilePath="E:\\ftpshare\\测试编码\\ANZBS20131113.txt";// String configFilePath="D:\\output\\ANZBS20131113.txt";String configFilePath = "D:\\input\\ANZBS20131113.txt";String charsetName = getFileEncode(configFilePath);System.out.println(charsetName);URL url = TestEncoding.class.getResource("../../../mule-config.xml" );try {//URLConnection urlConnection = url.openConnection();String 
charsetName1 = getFileEncode(url);System.out.println(charsetName1);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}}

另外，还可以把目录中的文件批量从 GBK 转换为 UTF-8，具体可参见下面的代码（原文附件）：

/**
 * Converts GBK-encoded text files to UTF-8, either a single file or a whole
 * directory tree, writing the results under a separate target path.
 */
public class FileGBK2UTF8 {

    /** Size in bytes of the buffer used when reading a file into memory. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Converts everything under {@code D:\input} into {@code D:\output}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Directory holding the files to convert.
        String fromPath = "D:\\input";
        // Directory receiving the converted files.
        String toPath = "D:\\output";
        info("start transform [from path]={0} [to path]={1}", fromPath, toPath);
        // Recursively convert every file found.
        transform(fromPath, toPath);
    }

    /**
     * Converts a file, or recursively every file in a directory, from GBK to
     * UTF-8 and writes the result to the corresponding location under
     * {@code toPath}.
     *
     * @param fromPath source file or directory
     * @param toPath   target file or directory
     * @return {@code true} when every file was converted and saved;
     *         {@code false} when the source does not exist, cannot be listed
     *         or read, or at least one file failed (remaining files are still
     *         attempted)
     */
    public static boolean transform(String fromPath, String toPath) {
        if (fromPath == null || !new File(fromPath).exists()) {
            info("转换文件路径错误！");
            return false;
        }
        info("frompath is [{0}], topath is [{1}]", fromPath, toPath);

        File source = new File(fromPath);
        if (source.isFile()) {
            byte[] value = fileToBytes(fromPath);
            if (value == null) {
                // Reading failed: do not overwrite the target with an empty file.
                return false;
            }
            // saveFileUtf8 rejects a null content, so a failed conversion
            // is reported as false as well.
            return saveFileUtf8(toPath, convEncoding(value, "gbk", "utf-8"));
        }

        File[] children = source.listFiles();
        if (children == null) {
            // listFiles() returns null on an I/O error or for a non-directory.
            return false;
        }
        boolean allConverted = true;
        for (File child : children) {
            String childFrom = fromPath + "/" + child.getName();
            String childTo = toPath + "/" + child.getName();
            if (!transform(childFrom, childTo)) {
                allConverted = false;
            }
        }
        return allConverted;
    }

    /**
     * Writes {@code content} to {@code fileName} encoded as UTF-8 (without a
     * BOM), replacing any existing file. Missing parent directories are
     * created first.
     *
     * @param fileName path of the file to write; backslashes are treated as
     *                 path separators
     * @param content  text to write
     * @return {@code true} when the file was written and closed successfully;
     *         {@code false} for a null/empty file name, a null content, or an
     *         I/O failure
     */
    public static boolean saveFileUtf8(String fileName, String content) {
        if (fileName == null || fileName.length() == 0 || content == null) {
            return false;
        }
        // Normalise backslashes to forward slashes.
        fileName = fileName.replace('\\', '/');
        // A bare file name has no parent directory to create; previously this
        // case failed with a StringIndexOutOfBoundsException.
        int lastSlash = fileName.lastIndexOf('/');
        if (lastSlash > 0) {
            createPath(fileName.substring(0, lastSlash));
        }

        boolean written = false;
        FileOutputStream out = null;
        try {
            // FileOutputStream creates the file or truncates an existing one.
            out = new FileOutputStream(new File(fileName));
            out.write(content.getBytes("UTF-8"));
            out.flush();
            written = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                    written = false;
                }
            }
        }
        return written;
    }

    /**
     * Reads a whole file into a byte array.
     *
     * @param fileName path of the file to read
     * @return the file content, or {@code null} when {@code fileName} is null
     *         or the file cannot be read
     */
    public static byte[] fileToBytes(String fileName) {
        if (fileName == null) {
            return null;
        }
        FileInputStream ins = null;
        try {
            ins = new FileInputStream(new File(fileName));
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            byte[] buf = new byte[BUFFER_SIZE];
            int len;
            while ((len = ins.read(buf, 0, buf.length)) > -1) {
                bos.write(buf, 0, len);
            }
            return bos.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (ins != null) {
                try {
                    ins.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Ensures that the given directory path exists, creating any missing
     * directories along the way.
     *
     * @param filePath directory path; backslashes are treated as path separators
     * @return {@code true} when the directory exists after the call;
     *         {@code false} for a null/empty path or when it could not be created
     */
    public static boolean createPath(String filePath) {
        if (filePath == null || filePath.length() == 0) {
            return false;
        }
        File dir = new File(filePath.replace('\\', '/'));
        // mkdirs() returns false when the directory already exists, so the
        // outcome is judged by the final state instead of its return value.
        dir.mkdirs();
        return dir.isDirectory();
    }

    /**
     * Extracts the file name (the part after the last path separator).
     *
     * @param path file path including the file name; may use {@code /} or
     *             {@code \} as separator
     * @return the file name, the whole input when it contains no separator,
     *         or an empty string for a null/empty path
     */
    public static String getFileName(String path) {
        if (path == null || path.length() == 0) {
            return "";
        }
        String normalized = path.replace('\\', '/');
        // lastIndexOf yields -1 when there is no separator, so +1 gives 0.
        return normalized.substring(normalized.lastIndexOf('/') + 1);
    }

    /**
     * Decodes bytes using one charset and returns the text as it survives an
     * encode/decode round trip through another charset.
     *
     * @param value      bytes to decode
     * @param oldCharset name of the charset {@code value} is encoded in
     * @param newCharset name of the target charset
     * @return the decoded text, or {@code null} when any argument is null or a
     *         charset name is not supported
     */
    public static String convEncoding(byte[] value, String oldCharset,
            String newCharset) {
        if (value == null || oldCharset == null || newCharset == null) {
            return null;
        }
        try {
            String decoded = new String(value, oldCharset);
            // Round-trip through the target charset so that characters it
            // cannot represent are replaced the same way they would be when
            // the text is finally written in that charset.
            return new String(decoded.getBytes(newCharset), newCharset);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Prints a message to standard output.
     *
     * @param message {@link MessageFormat} pattern
     * @param params  values substituted into the pattern
     */
    private static void info(String message, Object... params) {
        System.out.println(MessageFormat.format(message, params));
    }
}