java 文件编码判断
来源:互联网 发布:大数据分析导论 感想 编辑:程序博客网 时间:2024/06/11 01:05
Java 自带的 Unicode 处理缺少对无 BOM(无头)UTF-8 编码文件的判断。
工具类实现java对文件编码的判断
编码格式判断工具类方法
public static String getTxtEncode(InputStream in) throws IOException { String dc = Charset.defaultCharset().name(); UnicodeInputStream uin = new UnicodeInputStream(in, dc); if ("UTF-8".equals(uin.getEncoding())) { uin.close(); return "UTF-8"; } uin.close(); byte[] head = new byte[3]; in.read(head); in.reset(); String code = "GBK"; if (head[0] == -1 && head[1] == -2) code = "UTF-16"; if (head[0] == -2 && head[1] == -1) code = "Unicode"; // 带BOM if (head[0] == -17 && head[1] == -69 && head[2] == -65) code = "UTF-8"; if ("Unicode".equals(code)) { code = "UTF-16"; } return code; }
此处代码实现对无头(无 BOM)UTF-8 编码的判断
import java.io.*;

/**
 * An input stream that recognizes a leading Unicode byte-order mark (BOM) and
 * skips it, provided {@link #getEncoding()} is called before any of the
 * {@code read(...)} methods.
 *
 * <p>Usage pattern:
 * <pre>
 * String enc = "ISO-8859-1"; // or null to use the system default
 * FileInputStream fis = new FileInputStream(file);
 * UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 * enc = uin.getEncoding(); // check and skip possible BOM bytes
 * InputStreamReader in;
 * if (enc == null) in = new InputStreamReader(uin);
 * else in = new InputStreamReader(uin, enc);
 * </pre>
 *
 * <p>NOTE(review): the original header stated that detection of BOM-less UTF-8
 * files had been added, but this class only inspects BOM bytes; a stream
 * without a BOM reports the default encoding passed to the constructor.
 */
public class UnicodeInputStream extends InputStream {
    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /** Length of the longest BOM (UTF-32) and size of the pushback buffer. */
    private static final int BOM_SIZE = 4;

    /**
     * Wraps {@code in} so that a leading BOM can be detected and skipped.
     *
     * @param in the stream to wrap
     * @param defaultEnc the encoding reported when no BOM is found; may be null
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the default encoding given to the constructor
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding indicated by the BOM, detecting (and skipping) it on
     * the first call.
     *
     * <p>If a read or close happened before the first call, no detection is
     * performed and the value is whatever {@code encoding} currently holds
     * ({@code null} unless set elsewhere).
     *
     * @return the BOM encoding, or the default encoding when no BOM is present
     * @throws IllegalStateException if reading the BOM bytes fails; the
     *     underlying {@link IOException} is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Chain the real cause. The previous code called
                // ise.initCause(ise), which is rejected as self-causation.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
     * bytes are skipped; any extra bytes are pushed back onto the stream.
     *
     * @throws IOException if the underlying stream cannot be read
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        // A single read may return fewer bytes than requested, so keep reading
        // until the buffer is full or the stream ends.
        while (n < bom.length) {
            int r = internalIn.read(bom, n, bom.length - n);
            if (r <= 0) {
                break;
            }
            n += r;
        }

        // Each check is limited to the n bytes actually read, so the zero
        // padding of a short stream can never be mistaken for BOM bytes.
        // The UTF-32LE test must precede UTF-16LE because they share FF FE.
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes.
            encoding = defaultEnc;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }
        isInited = true;
    }

    /** Returns true when the first {@code available} bytes of {@code buf} begin with {@code marker}. */
    private static boolean startsWith(byte[] buf, int available, int... marker) {
        if (available < marker.length) {
            return false;
        }
        for (int i = 0; i < marker.length; i++) {
            if (buf[i] != (byte) marker[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the underlying stream. BOM detection is not performed by this
     * call and is disabled afterwards.
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte. If {@link #getEncoding()} has not been called yet, BOM
     * detection is disabled and any BOM bytes are returned as data.
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Bulk read that delegates to the pushback stream instead of the inherited
     * byte-at-a-time loop. Disables BOM detection in the same way as
     * {@link #read()}.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }
}
参考此文: http://jybzjf.iteye.com/blog/2262392
阅读全文
0 0
- java判断文件编码
- java判断文件编码
- java 文件编码判断
- Java:判断文件的编码
- Java判断文件编码格式
- java判断文件编码集
- java判断文件编码格式
- Java判断文件编码格式
- JAVA判断文件编码类型
- java判断文件编码格式
- java判断文件编码格式
- java判断文件的编码
- Java判断文件编码格式
- JAVA判断文件编码类型
- JAVA判断文件编码类型
- java判断文件编码格式
- java判断文件编码格式
- 判断java文件编码格式
- hdu 1069 Monkey and Banana
- 欢迎使用CSDN-markdown编辑器
- 链表基础 回顾
- StringUtils工具类用法
- SQL总结--存储过程
- java 文件编码判断
- 通过Excel生成批量SQL语句
- 如何安装a.vim
- 项目随笔错误
- [bzoj1509][NOI2003]逃学的小孩 树的直径
- R Manage Data
- 约瑟夫环
- 我们一起来学习CC认证之“安全认证难,到底难在哪里?”
- nano编辑器