http://www.2cto.com/kf/201410/344061.html

来源:互联网 发布:gp88s调整软件 编辑:程序博客网 时间:2024/04/30 14:35

方法1:利用windows文本文件编码特点。

Windows下,用记事本保存为Unicode(即UTF-16LE)、Unicode big endian(即UTF-16BE)和UTF-8编码的txt文件,开头会多出几个字节的BOM(字节顺序标记),分别是FF、FE(UTF-16LE),FE、FF(UTF-16BE),EF、BB、BF(UTF-8)。

public static String getCharset(File file) {

        String charset = "GBK";

        byte[] first3Bytes =new byte[3];

        try {

            boolean checked =false;

            BufferedInputStream bis =new BufferedInputStream(

                  new FileInputStream(file));

            bis.mark(0);

            int read = bis.read(first3Bytes,0,3);

            if (read == -1)

                return charset;

            if (first3Bytes[0] == (byte)0xFF && first3Bytes[1] == (byte)0xFE) {

                charset = "UTF-16LE";

                checked = true;

            } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1]

                == (byte)0xFF) {

                charset = "UTF-16BE";

                checked = true;

            } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1]

                    == (byte)0xBB

                    && first3Bytes[2] == (byte)0xBF) {

                charset = "UTF-8";

                checked = true;

            }

            bis.reset();

            if (!checked) {

                int loc =0;

                while ((read = bis.read()) != -1) {

                    loc++;

                    if (read >=0xF0)

                        break;

                    //单独出现BF以下的,也算是GBK

                    if (0x80 <= read && read <= 0xBF)

                        break;

                    if (0xC0 <= read && read <= 0xDF) {

                        read = bis.read();

                        if (0x80 <= read && read <= 0xBF)// 双字节 (0xC0 - 0xDF)

                            // (0x80 -

                            // 0xBF),也可能在GB编码内

                            continue;

                        else

                            break;

                     // 也有可能出错,但是几率较小

                    } else if (0xE0 <= read && read <= 0xEF) {

                        read = bis.read();

                        if (0x80 <= read && read <= 0xBF) {

                            read = bis.read();

                            if (0x80 <= read && read <= 0xBF) {

                                charset ="UTF-8";

                                break;

                            } else

                                break;

                        } else

                            break;

                    }

                }

                System.out.println(loc +" " + Integer.toHexString(read));

            }

            bis.close();

        } catch (Exception e) {

            e.printStackTrace();

        }

        return charset;

    }

缺点:该方法主要依赖文件开头的BOM;linux下生成的文本文件通常不带BOM,此时只能依靠后半段的启发式判断,结果并不可靠。

方法2:开源工程 JCharDet

http://www.iteye.com/topic/266501

package org.mozilla.intl.chardet;

import java.io.BufferedInputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

/**

 * 借助JCharDet获取文件字符集

 * @author icer

 * PS:

 * JCharDet 是mozilla自动字符集探测算法代码的java移植,其官方主页为:

 *     http://jchardet.sourceforge.net/

 * @date    2008/11/13

 */

public class FileCharsetDetector {

    private boolean found = false;

    /**

     * 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性

     */

    private String encoding =null;

    public static void main(String[] argv) throws Exception {

        if (argv.length !=1 && argv.length !=2) {

            System.out

                    .println("Usage: FileCharsetDetector <path> [<languagehint>]");

            System.out.println("");

            System.out.println("Where <path> is d:/demo.txt");

            System.out.println("For optional <languagehint>. Use following...");

            System.out.println("        1 => Japanese");

            System.out.println("        2 => Chinese");

            System.out.println("        3 => Simplified Chinese");

            System.out.println("        4 => Traditional Chinese");

            System.out.println("        5 => Korean");

            System.out.println("        6 => Dont know (default)");

            return;

        } else {

            String encoding = null;

            if (argv.length ==2) {

                encoding = new FileCharsetDetector().guestFileEncoding(argv[0],

                        Integer.valueOf(argv[1]));

            } else {

                encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);

            }

            System.out.println("文件编码:" + encoding);

        }

    }

    /**

     * 传入一个文件(File)对象,检查文件编码

     *

     * @param file

     *            File对象实例

     * @return 文件编码,若无,则返回null

     * @throws FileNotFoundException

     * @throws IOException

     */

    public String guestFileEncoding(File file)throws FileNotFoundException,

            IOException {

        return geestFileEncoding(file,new nsDetector());

    }

    /**

     * 获取文件的编码

     *

     * @param file

     *            File对象实例

     * @param languageHint

     *            语言提示区域代码 eg:1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;

     *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)

     * @return 文件编码,eg:UTF-8,GBK,GB2312形式,若无,则返回null

     * @throws FileNotFoundException

     * @throws IOException

     */

    public String guestFileEncoding(File file,int languageHint)

            throws FileNotFoundException, IOException {

        return geestFileEncoding(file,new nsDetector(languageHint));

    }

    /**

     * 获取文件的编码

     *

     * @param path

     *            文件路径

     * @return 文件编码,eg:UTF-8,GBK,GB2312形式,若无,则返回null

     * @throws FileNotFoundException

     * @throws IOException

     */

    public String guestFileEncoding(String path)throws FileNotFoundException,

            IOException {

        return guestFileEncoding(new File(path));

    }

    /**

     * 获取文件的编码

     *

     * @param path

     *            文件路径

     * @param languageHint

     *            语言提示区域代码 eg:1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;

     *            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)

     * @return

     * @throws FileNotFoundException

     * @throws IOException

     */

    public String guestFileEncoding(String path,int languageHint)

            throws FileNotFoundException, IOException {

        return guestFileEncoding(new File(path), languageHint);

    }

    /**

     * 获取文件的编码

     *

     * @param file

     * @param det

     * @return

     * @throws FileNotFoundException

     * @throws IOException

     */

    private String geestFileEncoding(File file, nsDetector det)

            throws FileNotFoundException, IOException {

        // Set an observer...

        // The Notify() will be called when a matching charset is found.

        det.Init(new nsICharsetDetectionObserver() {

            public void Notify(String charset) {

                found = true;

                encoding = charset;

            }

        });

        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(

                file));

        byte[] buf =new byte[1024];

        int len;

        boolean done =false;

        boolean isAscii =true;

        while ((len = imp.read(buf,0, buf.length)) != -1) {

            // Check if the stream is only ascii.

            if (isAscii)

                isAscii = det.isAscii(buf, len);

            // DoIt if non-ascii and not done yet.

            if (!isAscii && !done)

                done = det.DoIt(buf, len,false);

        }

        det.DataEnd();

        if (isAscii) {

          encoding = "ASCII";

            found = true;

        }

        if (!found) {

            String prob[] = det.getProbableCharsets();

           if (prob.length >0) {

                // 在没有发现情况下,则取第一个可能的编码

                encoding = prob[0];

            } else {

                return null;

            }

        }

        return encoding;

    }

}

jar包下载地址:http://download.csdn.net/detail/u012587637/8041169

方法3:开源工程juniversalchardet

http://code.google.com/p/juniversalchardet/

public static String getFileIncode(File file) {

        if (!file.exists()) {

            System.err.println("getFileIncode: file not exists!");

            return null;

        }

        byte[] buf =new byte[4096];

        FileInputStream fis = null;

        try {

            fis = new FileInputStream(file);         

            UniversalDetector detector = new UniversalDetector(null);

            int nread;

            while ((nread = fis.read(buf)) >0 && !detector.isDone()) {

                detector.handleData(buf,0, nread);

            }

            detector.dataEnd();

            String encoding = detector.getDetectedCharset();

            if (encoding !=null) {

                System.out.println("Detected encoding = " + encoding);

            } else {

                System.out.println("No encoding detected.");

            }

            detector.reset();

            fis.close();

            return encoding;

        } catch (Exception e) {

            e.printStackTrace();

        }

        return null;

    }

jar包下载:http://download.csdn.net/detail/u012587637/8041181

说明:第三个方法要比第二个速度快些,也比较新,所以推荐使用第三个。


0 0
原创粉丝点击