android获取文本编码
来源:互联网 发布:周星驰 锵锵三人行知乎 编辑:程序博客网 时间:2024/06/07 15:13
转载来源:http://blog.csdn.net/u012587637/article/details/40107557
方法1:利用windows文本文件编码特点
Windows下,用记事本保存的Unicode(即UTF-16LE)、Unicode big endian(即UTF-16BE)和UTF-8编码的txt文件,开头会多出几个字节的BOM(字节顺序标记),分别是FF、FE(Unicode),FE、FF(Unicode big endian),EF、BB、BF(UTF-8)。
public static String getCharset(File file) { String charset = "GBK"; byte[] first3Bytes = new byte[3]; try { boolean checked = false; BufferedInputStream bis = new BufferedInputStream( new FileInputStream(file)); bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) return charset; if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; checked = true; } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; checked = true; } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc++; if (read >= 0xF0) break; //单独出现BF以下的,也算是GBK if (0x80 <= read && read <= 0xBF) break; if (0xC0 <= read && read <= 0xDF) { read = bis.read(); if (0x80 <= read && read <= 0xBF)// 双字节 (0xC0 - 0xDF) // (0x80 - // 0xBF),也可能在GB编码内 continue; else break; // 也有可能出错,但是几率较小 } else if (0xE0 <= read && read <= 0xEF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else break; } else break; } } System.out.println(loc + " " + Integer.toHexString(read)); } bis.close(); } catch (Exception e) { e.printStackTrace(); } return charset; }
缺点:Linux下的文本文件通常不带BOM,因此不能依靠这种方式去探测Linux下的文件。
方法2:开源工程 JCharDet
package org.mozilla.intl.chardet;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Detects the character set of a file with the help of JCharDet.
 *
 * <p>JCharDet is a Java port of Mozilla's automatic charset detection
 * algorithm; its home page is http://jchardet.sourceforge.net/
 *
 * <p>An instance keeps the result of the last detection in its fields, so it
 * should not be shared between threads.
 *
 * @author icer
 * @date 2008/11/13
 */
public class FileCharsetDetector {

    /** Set to {@code true} once a charset has been reported for the current file. */
    private boolean found = false;

    /**
     * Name of the charset reported by the detector for the current file, or
     * {@code null} (the default) when nothing matched, e.g. for a binary file.
     */
    private String encoding = null;

    /**
     * Command-line entry point: prints the detected encoding of a file.
     *
     * @param argv {@code <path> [<languageHint>]}
     * @throws Exception if the file cannot be read or the hint is not a number
     */
    public static void main(String[] argv) throws Exception {
        if (argv.length != 1 && argv.length != 2) {
            System.out.println("Usage: FileCharsetDetector <path> [<languageHint>]");
            System.out.println("");
            System.out.println("Where <path> is d:/demo.txt");
            System.out.println("For optional <languageHint>. Use following...");
            System.out.println(" 1 => Japanese");
            System.out.println(" 2 => Chinese");
            System.out.println(" 3 => Simplified Chinese");
            System.out.println(" 4 => Traditional Chinese");
            System.out.println(" 5 => Korean");
            System.out.println(" 6 => Dont know (default)");
            return;
        }

        String encoding;
        if (argv.length == 2) {
            encoding = new FileCharsetDetector().guestFileEncoding(argv[0],
                    Integer.parseInt(argv[1]));
        } else {
            encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);
        }
        System.out.println("文件编码:" + encoding);
    }

    /**
     * Detects the encoding of the given file.
     *
     * @param file the file to inspect
     * @return the encoding name, or {@code null} if none could be determined
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    public String guestFileEncoding(File file) throws FileNotFoundException,
            IOException {
        return detectFileEncoding(file, new nsDetector());
    }

    /**
     * Detects the encoding of the given file using a language hint.
     *
     * @param file the file to inspect
     * @param languageHint language hint code, e.g. 1: Japanese; 2: Chinese;
     *        3: Simplified Chinese; 4: Traditional Chinese; 5: Korean;
     *        6: Dont know (default)
     * @return the encoding name such as UTF-8, GBK or GB2312, or {@code null}
     *         if none could be determined
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    public String guestFileEncoding(File file, int languageHint)
            throws FileNotFoundException, IOException {
        return detectFileEncoding(file, new nsDetector(languageHint));
    }

    /**
     * Detects the encoding of the file at the given path.
     *
     * @param path path of the file to inspect
     * @return the encoding name such as UTF-8, GBK or GB2312, or {@code null}
     *         if none could be determined
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    public String guestFileEncoding(String path) throws FileNotFoundException,
            IOException {
        return guestFileEncoding(new File(path));
    }

    /**
     * Detects the encoding of the file at the given path using a language hint.
     *
     * @param path path of the file to inspect
     * @param languageHint language hint code, e.g. 1: Japanese; 2: Chinese;
     *        3: Simplified Chinese; 4: Traditional Chinese; 5: Korean;
     *        6: Dont know (default)
     * @return the encoding name, or {@code null} if none could be determined
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    public String guestFileEncoding(String path, int languageHint)
            throws FileNotFoundException, IOException {
        return guestFileEncoding(new File(path), languageHint);
    }

    /**
     * Feeds the file to the detector and returns the resulting encoding.
     *
     * @param file the file to inspect
     * @param det the detector to drive
     * @return the encoding name, or {@code null} if none could be determined
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    private String detectFileEncoding(File file, nsDetector det)
            throws FileNotFoundException, IOException {
        // Drop the result of any earlier call on this instance; otherwise a
        // reused detector would report the previous file's encoding.
        found = false;
        encoding = null;

        // Set an observer: Notify() is called when a matching charset is found.
        det.Init(new nsICharsetDetectionObserver() {
            @Override
            public void Notify(String charset) {
                found = true;
                encoding = charset;
            }
        });

        boolean isAscii = true;
        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));
        try {
            byte[] buf = new byte[1024];
            int len;
            boolean done = false;
            // Once the detector reports it is done, further chunks are never
            // handed to it, so reading can stop.
            while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {
                // Check if the stream is only ascii.
                if (isAscii) {
                    isAscii = det.isAscii(buf, len);
                }
                // DoIt if non-ascii and not done yet.
                if (!isAscii) {
                    done = det.DoIt(buf, len, false);
                }
            }
        } finally {
            imp.close();
        }
        det.DataEnd();

        if (isAscii) {
            encoding = "ASCII";
            found = true;
        }

        if (!found) {
            String[] prob = det.getProbableCharsets();
            // NOTE(review): jchardet appears to return the single placeholder
            // "nomatch" when no charset is probable - confirm against the
            // library; it is treated here as "no encoding".
            if (prob.length > 0 && !"nomatch".equals(prob[0])) {
                // Nothing was confirmed, so take the first probable charset.
                encoding = prob[0];
            } else {
                return null;
            }
        }
        return encoding;
    }
}
jar包下载地址:http://download.csdn.net/detail/u012587637/8047697
方法3:开源工程juniversalchardet
官网:http://code.google.com/p/juniversalchardet/
public static String getFileIncode(File file) { if (!file.exists()) { System.err.println("getFileIncode: file not exists!"); return null; } byte[] buf = new byte[4096]; FileInputStream fis = null; try { fis = new FileInputStream(file); // (1) UniversalDetector detector = new UniversalDetector(null); // (2) int nread; while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } // (3) detector.dataEnd(); // (4) String encoding = detector.getDetectedCharset(); if (encoding != null) { System.out.println("Detected encoding = " + encoding); } else { System.out.println("No encoding detected."); } // (5) detector.reset(); fis.close(); return encoding; } catch (Exception e) { e.printStackTrace(); } return null; }
jar包下载:http://download.csdn.net/detail/u012587637/8041181
方法4:使用Apache的commons-io包
具体参考:http://blog.csdn.net/21aspnet/article/details/50612867
阅读全文
0 0
- android获取文本编码
- Android文本翻页编码实现
- 获取txt文本文档的编码类型
- 获取普通文本文档的编码类型
- Android 获取唯一编码
- android获取对话框文本注意事项
- 文本编码
- 文本编码
- 文本编码
- 文本编码
- 文本编码
- 获取txt文本文档的编码类型(c++,c#)
- (1)Tika获取文件的类型、编码、文本内容
- Android之绘制文本(FontMetrics) 获取文本高度
- Android之绘制文本(FontMetrics) 获取文本高度
- Android之获取、设置EditText的文本...
- Android 设备从 Server获取文本
- Android 获取文本内容及配置信息
- 欢迎使用CSDN-markdown编辑器
- 使用脚本把项目托管到Github上
- android手机安装google play服务
- sublime 多处同时操作和替换
- highcharts(1)------- 通过 Ajax 加载数据
- android获取文本编码
- VMware Workstation 11 中 Ubuntu 14.04 的 VMware Tools 问题 : 共享文件夹
- 为toolbar菜单栏添加图标
- 反射
- 解决MySql 数据库 提示:1045 access denied for user 'root'@'localhost' using password yes
- 软件架构设计原则和大数据平台架构层
- Android 7.0调用系统相机返回路径问题
- C#相关的网站、资源和书籍
- AIX:struct dirent d_type