java 文件编码判断

来源:互联网 发布:大数据分析导论 感想 编辑:程序博客网 时间:2024/06/11 01:05

Java 自带的 Unicode 处理缺少对无头(无 BOM)UTF-8 编码的判断。
下面的工具类实现了 Java 对文件编码的判断。

编码格式判断工具类方法

public static String getTxtEncode(InputStream in) throws IOException {        String dc = Charset.defaultCharset().name();        UnicodeInputStream uin = new UnicodeInputStream(in, dc);        if ("UTF-8".equals(uin.getEncoding())) {            uin.close();            return "UTF-8";        }        uin.close();        byte[] head = new byte[3];        in.read(head);        in.reset();        String code = "GBK";        if (head[0] == -1 && head[1] == -2)            code = "UTF-16";        if (head[0] == -2 && head[1] == -1)            code = "Unicode";        // 带BOM        if (head[0] == -17 && head[1] == -69 && head[2] == -65)            code = "UTF-8";        if ("Unicode".equals(code)) {            code = "UTF-16";        }        return code;    }

此处代码实现对无头 UTF-8 编码的判断(注意:未检测到 BOM 时,getEncoding() 返回的是构造时传入的默认编码)。

import java.io.*;

/**
 * An input stream that recognises a leading Unicode byte-order mark (BOM) and
 * skips it, provided {@link #getEncoding()} is called before any of the
 * {@code read(...)} methods.
 *
 * <p>Usage pattern:
 * <pre>
 * String enc = "ISO-8859-1"; // or null to use the system default
 * FileInputStream fis = new FileInputStream(file);
 * UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 * enc = uin.getEncoding(); // check and skip possible BOM bytes
 * InputStreamReader in;
 * if (enc == null) in = new InputStreamReader(uin);
 * else in = new InputStreamReader(uin, enc);
 * </pre>
 *
 * <p>Only the BOM is examined; the content itself is never inspected. For a
 * stream without a BOM (including BOM-less UTF-8), {@link #getEncoding()}
 * returns the default encoding given to the constructor.
 */
public class UnicodeInputStream extends InputStream {
    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /** Longest BOM that is recognised (UTF-32), and the push-back capacity. */
    private static final int BOM_SIZE = 4;

    /**
     * @param in the stream to wrap
     * @param defaultEnc encoding reported when no BOM is found; may be {@code null}
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /** Returns the default encoding that was passed to the constructor. */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Detects the encoding from the BOM (skipping it) on first use and returns it.
     *
     * <p>If {@code read} or {@code close} was called first, no detection is
     * performed and the result is {@code null}.
     *
     * @return the detected encoding, or the default encoding when no BOM is present
     * @throws IllegalStateException if reading the stream head fails; the
     *         underlying {@link IOException} is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Attach the real failure. The previous initCause(ise) used the
                // exception as its own cause, which Throwable rejects.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks for a BOM. Bytes that are not part
     * of a BOM are pushed back onto the stream; only BOM bytes are skipped.
     *
     * @throws IOException if the underlying stream cannot be read
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        // Fill the buffer as far as the stream allows; one read() call may
        // return fewer bytes than requested even when more are coming.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < BOM_SIZE) {
            int count = internalIn.read(bom, n, BOM_SIZE - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        // Every pattern is guarded by the number of bytes actually read, so the
        // zero padding of a short stream cannot complete a longer BOM (for
        // example "FF FE" alone must not be taken for the UTF-32LE mark).
        int skip;
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            skip = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            skip = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            skip = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            skip = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            skip = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            skip = 0;
        }

        int unread = n - skip;
        if (unread > 0) {
            internalIn.unread(bom, skip, unread);
        }
        isInited = true;
    }

    /** Closes the wrapped stream; BOM detection is not performed afterwards. */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte. If called before {@link #getEncoding()}, BOM detection is
     * disabled and any BOM bytes are returned as ordinary data.
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Bulk read that goes straight to the wrapped stream instead of the
     * byte-at-a-time default; it disables BOM detection exactly as {@link #read()} does.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }
}

参考此文: http://jybzjf.iteye.com/blog/2262392