android获取文本编码

来源：互联网发布：周星驰锵锵三人行知乎编辑：程序博客网时间：2024/06/07 15:13

转载来源：http://blog.csdn.net/u012587637/article/details/40107557

方法1：利用windows文本文件编码特点

windows下，Unicode（UTF-16LE）、Unicode big endian（UTF-16BE）和UTF-8编码的txt文件的开头会多出几个字节（即BOM，字节顺序标记），分别是FF、FE（Unicode）、FE、FF（Unicode big endian）、EF、BB、BF（UTF-8）。

public static String getCharset(File file) {        String charset = "GBK";        byte[] first3Bytes = new byte[3];        try {            boolean checked = false;            BufferedInputStream bis = new BufferedInputStream(                  new FileInputStream(file));            bis.mark(0);            int read = bis.read(first3Bytes, 0, 3);            if (read == -1)                return charset;            if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {                charset = "UTF-16LE";                checked = true;            } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1]                == (byte) 0xFF) {                charset = "UTF-16BE";                checked = true;            } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1]                    == (byte) 0xBB                    && first3Bytes[2] == (byte) 0xBF) {                charset = "UTF-8";                checked = true;            }            bis.reset();            if (!checked) {                int loc = 0;                while ((read = bis.read()) != -1) {                    loc++;                    if (read >= 0xF0)                        break;                    //单独出现BF以下的，也算是GBK                    if (0x80 <= read && read <= 0xBF)                        break;                    if (0xC0 <= read && read <= 0xDF) {                        read = bis.read();                        if (0x80 <= read && read <= 0xBF)// 双字节 (0xC0 - 0xDF)                            // (0x80 -                            // 0xBF),也可能在GB编码内                            continue;                        else                            break;                     // 也有可能出错，但是几率较小                    } else if (0xE0 <= read && read <= 0xEF) {                        read = bis.read();                        if (0x80 <= read && read <= 0xBF) {                            read = bis.read();                            if (0x80 <= read && read <= 0xBF) {                            
    charset = "UTF-8";                                break;                            } else                                break;                        } else                            break;                    }                }                System.out.println(loc + " " + Integer.toHexString(read));            }            bis.close();        } catch (Exception e) {            e.printStackTrace();        }        return charset;    }

缺点：linux下的文本文件通常不带BOM，所以不能用这种方式去探测linux下的文件。

方法2：开源工程 JCharDet

package org.mozilla.intl.chardet;import java.io.BufferedInputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;/** * 借助JCharDet获取文件字符集 * @author icer * PS: * JCharDet 是mozilla自动字符集探测算法代码的java移植，其官方主页为： *      http://jchardet.sourceforge.net/ * @date    2008/11/13  */public class FileCharsetDetector {    private boolean found = false;    /**     * 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性      */    private String encoding = null;    public static void main(String[] argv) throws Exception {        if (argv.length != 1 && argv.length != 2) {            System.out                    .println("Usage: FileCharsetDetector <path> [<languageHint>]");            System.out.println("");            System.out.println("Where <path> is d:/demo.txt");            System.out.println("For optional <languageHint>. Use following...");            System.out.println("        1 => Japanese");            System.out.println("        2 => Chinese");            System.out.println("        3 => Simplified Chinese");            System.out.println("        4 => Traditional Chinese");            System.out.println("        5 => Korean");            System.out.println("        6 => Dont know (default)");            return;        } else {            String encoding = null;            if (argv.length == 2) {                encoding = new FileCharsetDetector().guestFileEncoding(argv[0],                        Integer.valueOf(argv[1]));            } else {                encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);            }            System.out.println("文件编码:" + encoding);        }    }    /**     * 传入一个文件(File)对象，检查文件编码     *      * @param file     *            File对象实例     * @return 文件编码，若无，则返回null     * @throws FileNotFoundException     * @throws IOException     */    public String guestFileEncoding(File file) throws FileNotFoundException,            IOException {        return 
geestFileEncoding(file, new nsDetector());    }    /**     * 获取文件的编码     *      * @param file     *            File对象实例     * @param languageHint     *            语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;     *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)     * @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null     * @throws FileNotFoundException     * @throws IOException     */    public String guestFileEncoding(File file, int languageHint)            throws FileNotFoundException, IOException {        return geestFileEncoding(file, new nsDetector(languageHint));    }    /**     * 获取文件的编码     *      * @param path     *            文件路径     * @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null     * @throws FileNotFoundException     * @throws IOException     */    public String guestFileEncoding(String path) throws FileNotFoundException,            IOException {        return guestFileEncoding(new File(path));    }    /**     * 获取文件的编码     *      * @param path     *            文件路径     * @param languageHint     *            语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;     *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)     * @return     * @throws FileNotFoundException     * @throws IOException     */    public String guestFileEncoding(String path, int languageHint)            throws FileNotFoundException, IOException {        return guestFileEncoding(new File(path), languageHint);    }    /**     * 获取文件的编码     *      * @param file     * @param det     * @return     * @throws FileNotFoundException     * @throws IOException     */    private String geestFileEncoding(File file, nsDetector det)            throws FileNotFoundException, IOException {        // Set an observer...        // The Notify() will be called when a matching charset is found.        
det.Init(new nsICharsetDetectionObserver() {            public void Notify(String charset) {                found = true;                encoding = charset;            }        });        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(                file));        byte[] buf = new byte[1024];        int len;        boolean done = false;        boolean isAscii = true;        while ((len = imp.read(buf, 0, buf.length)) != -1) {            // Check if the stream is only ascii.            if (isAscii)                isAscii = det.isAscii(buf, len);            // DoIt if non-ascii and not done yet.            if (!isAscii && !done)                done = det.DoIt(buf, len, false);        }        det.DataEnd();        if (isAscii) {            encoding = "ASCII";            found = true;        }        if (!found) {            String prob[] = det.getProbableCharsets();            if (prob.length > 0) {                // 在没有发现情况下，则取第一个可能的编码                encoding = prob[0];            } else {                return null;            }        }        return encoding;    }}

jar包下载地址：http://download.csdn.net/detail/u012587637/8047697

方法3：开源工程juniversalchardet

官网：http://code.google.com/p/juniversalchardet/

public static String getFileIncode(File file) {        if (!file.exists()) {            System.err.println("getFileIncode: file not exists!");            return null;        }        byte[] buf = new byte[4096];        FileInputStream fis = null;        try {            fis = new FileInputStream(file);            // (1)            UniversalDetector detector = new UniversalDetector(null);            // (2)            int nread;            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {                detector.handleData(buf, 0, nread);            }            // (3)            detector.dataEnd();            // (4)            String encoding = detector.getDetectedCharset();            if (encoding != null) {                System.out.println("Detected encoding = " + encoding);            } else {                System.out.println("No encoding detected.");            }            // (5)            detector.reset();            fis.close();            return encoding;        } catch (Exception e) {            e.printStackTrace();        }        return null;    }

jar包下载：http://download.csdn.net/detail/u012587637/8041181

方法4：使用Apache的commons-io包

具体参考：http://blog.csdn.net/21aspnet/article/details/50612867

阅读全文

0 0