http://www.2cto.com/kf/201410/344061.html

来源：互联网　发布：gp88s调整软件　编辑：程序博客网　时间：2024/04/30 14:35

方法1：利用windows文本文件编码特点。

Windows 下，以 Unicode（即 UTF-16LE）、Unicode big endian（即 UTF-16BE）和 UTF-8 编码保存的 txt 文件，开头会多出几个字节的 BOM（字节顺序标记），分别是 FF FE（Unicode）、FE FF（Unicode big endian）、EF BB BF（UTF-8）。

public static String getCharset(File file) {

String charset = "GBK";

byte[] first3Bytes =new byte[3];

try {

boolean checked =false;

BufferedInputStream bis =new BufferedInputStream(

new FileInputStream(file));

bis.mark(0);

int read = bis.read(first3Bytes,0,3);

if (read == -1)

return charset;

if (first3Bytes[0] == (byte)0xFF && first3Bytes[1] == (byte)0xFE) {

charset = "UTF-16LE";

checked = true;

} else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1]

== (byte)0xFF) {

charset = "UTF-16BE";

checked = true;

} else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1]

== (byte)0xBB

&& first3Bytes[2] == (byte)0xBF) {

charset = "UTF-8";

checked = true;

}

bis.reset();

if (!checked) {

int loc =0;

while ((read = bis.read()) != -1) {

loc++;

if (read >=0xF0)

break;

//单独出现BF以下的，也算是GBK

if (0x80 <= read && read <= 0xBF)

break;

if (0xC0 <= read && read <= 0xDF) {

read = bis.read();

if (0x80 <= read && read <= 0xBF)// 双字节 (0xC0 - 0xDF)

// (0x80 -

// 0xBF),也可能在GB编码内

continue;

else

break;

// 也有可能出错，但是几率较小

} else if (0xE0 <= read && read <= 0xEF) {

read = bis.read();

if (0x80 <= read && read <= 0xBF) {

read = bis.read();

if (0x80 <= read && read <= 0xBF) {

charset ="UTF-8";

break;

} else

break;

} else

break;

}

System.out.println(loc +" " + Integer.toHexString(read));

}

bis.close();

} catch (Exception e) {

e.printStackTrace();

}

return charset;

}

缺点：这种方法主要依赖文件开头的BOM，而Linux下的文本文件通常不带BOM，因此不能可靠地用它来探测Linux下的文件。

方法2：开源工程 JCharDet

http://www.iteye.com/topic/266501

package org.mozilla.intl.chardet;

import java.io.BufferedInputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

/**

* 借助JCharDet获取文件字符集

* @author icer

* PS:

* JCharDet 是mozilla自动字符集探测算法代码的java移植，其官方主页为：

* http://jchardet.sourceforge.net/

* @date 2008/11/13

public class FileCharsetDetector {

private boolean found = false;

/**

* 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性

private String encoding =null;

public static void main(String[] argv) throws Exception {

if (argv.length !=1 && argv.length !=2) {

System.out

.println("Usage: FileCharsetDetector <path> [<languagehint>]");

System.out.println("");

System.out.println("Where <path> is d:/demo.txt");

System.out.println("For optional <languagehint>. Use following...");

System.out.println(" 1 => Japanese");

System.out.println(" 2 => Chinese");

System.out.println(" 3 => Simplified Chinese");

System.out.println(" 4 => Traditional Chinese");

System.out.println(" 5 => Korean");

System.out.println(" 6 => Dont know (default)");

return;

} else {

String encoding = null;

if (argv.length ==2) {

encoding = new FileCharsetDetector().guestFileEncoding(argv[0],

Integer.valueOf(argv[1]));

} else {

encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);

}

System.out.println("文件编码:" + encoding);

}

/**

* 传入一个文件(File)对象，检查文件编码

* @param file

* File对象实例

* @return 文件编码，若无，则返回null

* @throws FileNotFoundException

* @throws IOException

public String guestFileEncoding(File file)throws FileNotFoundException,

IOException {

return geestFileEncoding(file,new nsDetector());

}

/**

* 获取文件的编码

* @param file

* File对象实例

* @param languageHint

* 语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;

* 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)

* @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null

* @throws FileNotFoundException

* @throws IOException

public String guestFileEncoding(File file,int languageHint)

throws FileNotFoundException, IOException {

return geestFileEncoding(file,new nsDetector(languageHint));

}

/**

* 获取文件的编码

* @param path

* 文件路径

* @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null

* @throws FileNotFoundException

* @throws IOException

public String guestFileEncoding(String path)throws FileNotFoundException,

IOException {

return guestFileEncoding(new File(path));

}

/**

* 获取文件的编码

* @param path

* 文件路径

* @param languageHint

* 语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;

* 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)

* @return

* @throws FileNotFoundException

* @throws IOException

public String guestFileEncoding(String path,int languageHint)

throws FileNotFoundException, IOException {

return guestFileEncoding(new File(path), languageHint);

}

/**

* 获取文件的编码

* @param file

* @param det

* @return

* @throws FileNotFoundException

* @throws IOException

private String geestFileEncoding(File file, nsDetector det)

throws FileNotFoundException, IOException {

// Set an observer...

// The Notify() will be called when a matching charset is found.

det.Init(new nsICharsetDetectionObserver() {

public void Notify(String charset) {

found = true;

encoding = charset;

}

});

BufferedInputStream imp = new BufferedInputStream(new FileInputStream(

file));

byte[] buf =new byte[1024];

int len;

boolean done =false;

boolean isAscii =true;

while ((len = imp.read(buf,0, buf.length)) != -1) {

// Check if the stream is only ascii.

if (isAscii)

isAscii = det.isAscii(buf, len);

// DoIt if non-ascii and not done yet.

if (!isAscii && !done)

done = det.DoIt(buf, len,false);

}

det.DataEnd();

if (isAscii) {

encoding = "ASCII";

found = true;

}

if (!found) {

String prob[] = det.getProbableCharsets();

if (prob.length >0) {

// 在没有发现情况下，则取第一个可能的编码

encoding = prob[0];

} else {

return null;

}

return encoding;

}

jar包下载地址：http://download.csdn.net/detail/u012587637/8041169

方法3：开源工程 juniversalchardet

http://code.google.com/p/juniversalchardet/

/**
 * Detects the encoding of a file with juniversalchardet's {@code UniversalDetector}.
 *
 * <p>The file is fed to the detector in 4096-byte chunks until the detector reports
 * that it is done or the file ends. The outcome is also printed to standard output.
 *
 * @param file the file to examine
 * @return the detected charset name, or {@code null} when the file does not exist,
 *         nothing was detected, or reading failed (the exception is printed)
 */
public static String getFileIncode(File file) {
    if (!file.exists()) {
        System.err.println("getFileIncode: file not exists!");
        return null;
    }

    byte[] buf = new byte[4096];
    // try-with-resources closes the stream even when reading or detection throws.
    try (FileInputStream fis = new FileInputStream(file)) {
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        // Check isDone() first so no chunk is read only to be thrown away.
        while (!detector.isDone() && (nread = fis.read(buf)) > 0) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();

        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }
        detector.reset();
        return encoding;
    } catch (Exception e) {
        // Best effort: report the problem and signal "unknown" to the caller.
        e.printStackTrace();
    }
    return null;
}

jar包下载：http://download.csdn.net/detail/u012587637/8041181

说明：第三个方法要比第二个速度快些，也比较新，所以推荐使用第三个。

0 0