Java 自动探测文件的字符编码

来源:互联网 发布:betterzip 3.1.2 mac 编辑:程序博客网 时间:2024/05/29 15:08
import java.io.BufferedInputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import org.mozilla.intl.chardet.nsDetector;import org.mozilla.intl.chardet.nsICharsetDetectionObserver;public class FileCharsetDetector {    private boolean found = false;    private String encoding = null;    public static void main(String[] argv) throws Exception {        File file1 = new File("C:\\test1.txt");                System.out.println("文件编码:" + new FileCharsetDetector().guessFileEncoding(file1));    }    /**     * 传入一个文件(File)对象,检查文件编码     *      * @param file     *            File对象实例     * @return 文件编码,若无,则返回null     * @throws FileNotFoundException     * @throws IOException     */    public String guessFileEncoding(File file) throws FileNotFoundException, IOException {        return guessFileEncoding(file, new nsDetector());    }    /**     * <pre>     * 获取文件的编码     * @param file     *            File对象实例     * @param languageHint     *            语言提示区域代码 @see #nsPSMDetector ,取值如下:     *             1 : Japanese     *             2 : Chinese     *             3 : Simplified Chinese     *             4 : Traditional Chinese     *             5 : Korean     *             6 : Dont know(default)     * </pre>     *      * @return 文件编码,eg:UTF-8,GBK,GB2312形式(不确定的时候,返回可能的字符编码序列);若无,则返回null     * @throws FileNotFoundException     * @throws IOException     */    public String guessFileEncoding(File file, int languageHint) throws FileNotFoundException, IOException {        return guessFileEncoding(file, new nsDetector(languageHint));    }    /**     * 获取文件的编码     *      * @param file     * @param det     * @return     * @throws FileNotFoundException     * @throws IOException     */    private String guessFileEncoding(File file, nsDetector det) throws FileNotFoundException, IOException {        // Set an observer...        // The Notify() will be called when a matching charset is found.        
det.Init(new nsICharsetDetectionObserver() {            public void Notify(String charset) {                encoding = charset;                found = true;            }        });        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));        byte[] buf = new byte[1024];        int len;        boolean done = false;        boolean isAscii = false;        while ((len = imp.read(buf, 0, buf.length)) != -1) {            // Check if the stream is only ascii.            isAscii = det.isAscii(buf, len);            if (isAscii) {                break;            }            // DoIt if non-ascii and not done yet.            done = det.DoIt(buf, len, false);            if (done) {                break;            }        }        imp.close();        det.DataEnd();        if (isAscii) {            encoding = "ASCII";            found = true;        }        if (!found) {            String[] prob = det.getProbableCharsets();            //这里将可能的字符集组合起来返回            for (int i = 0; i < prob.length; i++) {                if (i == 0) {                    encoding = prob[i];                } else {                    encoding += "," + prob[i];                }            }            if (prob.length > 0) {                // 在没有发现情况下,也可以只取第一个可能的编码,这里返回的是一个可能的序列                return encoding;            } else {                return null;            }        }        return encoding;    }}

0 0
原创粉丝点击