java实现汉字字典

来源：互联网发布：潜韵耳机的官方淘宝店编辑：程序博客网时间：2024/04/29 05:53

环境：eclipse, jdk1.6, 没有使用第三方的包，都是JDK有的。

注意，项目源文件我都使用的是UTF-8的编码格式，如果不是，代码里面的汉字注释会显示乱码。

设置UTF-8：windows->Preferences->General->Workspace 页面上Text file encoding，选择Other UTF-8

项目结构：

1.字典文件

dic.txt 下载地址:http://download.csdn.net/detail/wssiqi/5056993

这里只摘录一部分内容，里面共收录了20902个汉字

[plain] view plain copy
19968,一,一,1,1,GGLL,A,yi1,yī  
19969,丁,一,2,12,SGH,AI,ding1,dīng,zheng1,zhēng  
19970,丂,一,2,15,GNV,AZVV,kao3,kǎo,qiao3,qiǎo,yu2,yú  
19971,七,一,2,15,AGN,HD,qi1,qī  
19972,丄,一,2,21,HGD,IAVV,shang4,shàng  
19973,丅,一,2,12,GHK,AIAA,xia4,xià  
19974,丆,一,2,13,DGT,GDAA,han3,hǎn  
19975,万,一,3,153,DNV,,wan4,wàn,mo4,mò  
19976,丈,一,3,134,DYI,AOS,zhang4,zhàng  
19977,三,一,3,111,DGGG,CD,san1,sān  
19978,上,一,3,211,HHGG,IDA,shang3,shǎng,shang4,shàng  
19979,下,一,3,124,GHI,AID,xia4,xià  
19980,丌,一,3,132,GJK,AND,ji1,jī,qi2,qí  
19981,不,一,4,1324,GII,GI,fou3,fǒu,bu4,bù  
19982,与,一,3,151,GNGD,AZA,yu4,yù,yu3,yǔ,yu2,yú  
19983,丏,一,4,1255,GHNN,AIZY,mian3,miǎn  
19984,丐,一,4,1215,GHNV,AIZ,gai4,gài  
19985,丑,一,4,5211,NFD,XED,chou3,chǒu  
19986,丒,一,4,5341,VYGF,YDSA,chou3,chǒu  

2.Dic.java

[java] view plain copy
package com.siqi.dict;  
  
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
  
/**
 * Local Chinese character dictionary.
 * <p>
 * The data originates from <a href="http://www.zdic.net/search/?c=2">zdic.net</a> and is
 * read from {@link #DIC_FILENAME} (UTF-8, one character per line) when the class is
 * initialized. Each line is a comma separated record such as
 * {@code 25171,打,扌,5,12112,RSH,DAI,da3,dǎ,da2,dá}; the {@code INDEX_*} fields name
 * its columns. Lookups (pinyin, first letter, radical, stroke count, stroke order,
 * Wubi and Zhengma codes) are answered from an in-memory index keyed by code point.
 *
 * @author siqi
 */
public class Dic {

    /** Whether the static initializer reports a successful load on standard output. */
    private static boolean DEBUG = true;

    /** Encoding used to decode the dictionary file. */
    public static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /** Smallest UTF-16 value treated as a Chinese character. */
    public static final int CN_U16_CODE_MIN = 0x4e00;

    /** Largest UTF-16 value treated as a Chinese character. */
    public static final int CN_U16_CODE_MAX = 0x9fa5;

    /** Name of the dictionary file, resolved against the working directory. */
    public static final String DIC_FILENAME = "dic.txt";

    /** Raw content of the dictionary file as read by the static initializer. */
    public static byte[] bytes = new byte[0];

    /** Number of lines found in the dictionary file. */
    public static int count = 0;

    /** Column holding the decimal code point of the character. */
    public static int INDEX_UNICODE = 0;
    /** Column holding the character itself. */
    public static int INDEX_CHARACTER = 1;
    /** Column holding the radical. */
    public static int INDEX_BUSHOU = 2;
    /** Column holding the stroke count. */
    public static int INDEX_BIHUA = 3;
    /** Column holding the stroke order. */
    public static int INDEX_BISHUN = 4;
    /** Column holding the Wubi code. */
    public static int INDEX_WUBI = 5;
    /** Column holding the Zhengma code. */
    public static int INDEX_ZHENGMA = 6;
    /** Column holding the first pinyin written with ASCII letters and a tone digit. */
    public static int INDEX_PINYIN_EN = 7;
    /** Column holding the first pinyin written with tone marks. */
    public static int INDEX_PINYIN_CN = 8;

    /**
     * Dictionary records keyed by code point. Filled once by the static initializer
     * and only read afterwards, so a lookup no longer re-decodes the whole file.
     */
    private static final Map<Integer, String> RECORDS = new HashMap<Integer, String>();

    /*
     * Loads the dictionary when the class is initialized. A failure is reported on the
     * console and leaves the dictionary empty instead of failing class initialization.
     */
    static {
        long time = System.currentTimeMillis();

        try {
            LoadDictionary();
            count = buildIndex();
            if (DEBUG) {
                System.out.println("成功载入字典" + new File(DIC_FILENAME).getCanonicalPath() + " ，用时："
                        + (System.currentTimeMillis() - time) + "毫秒，载入字符数"+count);
            }
        } catch (Exception e) {
            try {
                System.out.println("载入字典失败" + new File(DIC_FILENAME).getCanonicalPath()+"\r\n");
            } catch (Exception ignored) {
                // The path is only decoration for the message; the real failure is printed below.
            }
            e.printStackTrace();
        }

    }

    /**
     * Returns the decimal code point stored for a character.
     *
     * @param ch a Chinese character
     * @return the code point column of the record
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetUnicode(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_UNICODE);
    }

    /**
     * Returns the first pinyin of a character in ASCII form, e.g. {@code "da3"}.
     *
     * @param ch a Chinese character
     * @return the ASCII pinyin with tone digit
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetPinyinEn(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_PINYIN_EN);
    }

    /**
     * Replaces every Chinese character of a string by its ASCII pinyin followed by a
     * space and copies all other characters unchanged. Only the first reading of a
     * character is used, so polyphonic characters may come out wrong.
     *
     * @param str text to convert
     * @return the converted text with surrounding whitespace trimmed
     * @throws Exception if a Chinese character has no usable record
     */
    public static String GetPinyinEn(String str) throws Exception {
        return toPinyin(str, INDEX_PINYIN_EN);
    }

    /**
     * Returns the first pinyin of a character written with tone marks, e.g. {@code "dǎ"}.
     *
     * @param ch a Chinese character
     * @return the pinyin with tone marks
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetPinyinCn(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_PINYIN_CN);
    }

    /**
     * Same as {@link #GetPinyinEn(String)} but produces pinyin with tone marks.
     *
     * @param str text to convert
     * @return the converted text with surrounding whitespace trimmed
     * @throws Exception if a Chinese character has no usable record
     */
    public static String GetPinyinCn(String str) throws Exception {
        return toPinyin(str, INDEX_PINYIN_CN);
    }

    /**
     * Returns the first letter of the ASCII pinyin of a character.
     *
     * @param ch any character
     * @return the first pinyin letter, or an empty string when {@code ch} is not a
     *         Chinese character or its pinyin column is empty
     * @throws Exception if the record of the character is too short
     */
    public static String GetFirstLetter(Character ch) throws Exception {
        if (!isChineseChar(ch)) {
            return "";
        }
        String pinyin = GetPinyinEn(ch);
        // An empty pinyin column must not blow up with StringIndexOutOfBoundsException.
        return pinyin.length() == 0 ? "" : pinyin.substring(0, 1);
    }

    /**
     * Concatenates the first pinyin letters of all Chinese characters in a string;
     * every other character is skipped.
     *
     * @param str text to convert
     * @return the pinyin initials
     * @throws Exception if a Chinese character has no usable record
     */
    public static String GetFirstLetter(String str) throws Exception {
        StringBuilder initials = new StringBuilder();
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (isChineseChar(ch)) {
                initials.append(GetFirstLetter(ch));
            }
        }
        return initials.toString().trim();
    }

    /**
     * Returns the radical of a character.
     *
     * @param ch a Chinese character
     * @return the radical column of the record
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetBushou(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_BUSHOU);
    }

    /**
     * Returns the stroke count of a character.
     *
     * @param ch a Chinese character
     * @return the stroke count column of the record
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetBihua(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_BIHUA);
    }

    /**
     * Returns the stroke order of a character.
     *
     * @param ch a Chinese character
     * @return the stroke order column of the record
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetBishun(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_BISHUN);
    }

    /**
     * Returns the Wubi code of a character.
     *
     * @param ch a Chinese character
     * @return the Wubi column of the record
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetWubi(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_WUBI);
    }

    /**
     * Returns the Zhengma code of a character.
     *
     * @param ch a Chinese character
     * @return the Zhengma column of the record
     * @throws Exception if {@code ch} is not a Chinese character or its record is too short
     */
    public static String GetZhengma(Character ch) throws Exception {
        return GetCharInfo(ch, INDEX_ZHENGMA);
    }

    /**
     * Returns the complete dictionary record of a character, e.g.
     * {@code "25171,打,扌,5,12112,RSH,DAI,da3,dǎ,da2,dá"}: code point, character,
     * radical, stroke count, stroke order, Wubi code, Zhengma code, then the readings
     * as pairs of ASCII pinyin and pinyin with tone marks.
     *
     * @param ch a Chinese character
     * @return the record, or an empty string when the dictionary has no entry
     * @throws Exception if {@code ch} is not a Chinese character
     */
    public static String GetCharInfo(Character ch) throws Exception {
        if (!isChineseChar(ch)) {
            throw new Exception("'" + ch + "' 不是一个汉字！");
        }
        String record = RECORDS.get(Integer.valueOf((int) ch.charValue()));
        return record == null ? "" : record;
    }

    /**
     * Returns one column of the dictionary record of a character.
     *
     * @param ch    a Chinese character
     * @param index zero-based column, see the {@code INDEX_*} fields
     * @return the requested column
     * @throws Exception if {@code ch} is not a Chinese character or the record has no
     *                   such column
     */
    private static String GetCharInfo(Character ch, int index) throws Exception {
        String[] columns = GetCharInfo(ch).split(",");
        if (index < 0 || index >= columns.length) {
            throw new Exception("请查看字典中" + ch + "汉字记录是否正确！");
        }
        return columns[index];
    }

    /**
     * Shared implementation of the string-to-pinyin conversions.
     *
     * @param str   text to convert
     * @param index pinyin column to use
     * @return the converted, trimmed text
     * @throws Exception if a Chinese character has no usable record
     */
    private static String toPinyin(String str, int index) throws Exception {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (isChineseChar(ch)) {
                out.append(GetCharInfo(ch, index)).append(' ');
            } else {
                out.append(ch);
            }
        }
        return out.toString().trim();
    }

    /**
     * Reads the dictionary file completely into {@link #bytes}.
     *
     * @throws Exception if the file cannot be opened or read
     */
    private static void LoadDictionary() throws Exception {
        File file = new File(DIC_FILENAME);
        byte[] data = new byte[(int) file.length()];
        FileInputStream fis = new FileInputStream(file);
        try {
            // A single read() may return fewer bytes than requested, so keep reading.
            int filled = 0;
            while (filled < data.length) {
                int n = fis.read(data, filled, data.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            if (filled < data.length) {
                byte[] shorter = new byte[filled];
                System.arraycopy(data, 0, shorter, 0, filled);
                data = shorter;
            }
        } finally {
            fis.close();
        }
        bytes = data;
    }

    /**
     * Decodes {@link #bytes} as UTF-8 and indexes every record by its code point.
     * A byte order mark in front of the first line is ignored, lines without a numeric
     * first column are skipped, and the first record wins when a code point repeats.
     *
     * @return the number of lines read
     * @throws IOException if decoding the in-memory data fails
     */
    private static int buildIndex() throws IOException {
        RECORDS.clear();
        int lines = 0;
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new ByteArrayInputStream(bytes), DEFAULT_CHARSET));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                lines++;
                if (lines == 1 && line.length() > 0 && line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
                int comma = line.indexOf(',');
                if (comma <= 0) {
                    continue;
                }
                Integer code;
                try {
                    code = Integer.valueOf(line.substring(0, comma).trim());
                } catch (NumberFormatException ignored) {
                    // Not a dictionary record; skip the line.
                    continue;
                }
                if (!RECORDS.containsKey(code)) {
                    RECORDS.put(code, line);
                }
            }
        } finally {
            reader.close();
        }
        return lines;
    }

    /**
     * Tells whether a character lies in the range
     * {@link #CN_U16_CODE_MIN}..{@link #CN_U16_CODE_MAX}.
     *
     * @param ch character to test, may be {@code null}
     * @return {@code true} for a character inside the range, {@code false} otherwise
     *         (including {@code null})
     */
    public static boolean isChineseChar(Character ch) {
        if (ch == null) {
            return false;
        }
        char value = ch.charValue();
        return value >= CN_U16_CODE_MIN && value <= CN_U16_CODE_MAX;
    }
}

3.Sample.java

如何使用字典

[java] view plain copy
package com.siqi.dict;  
  
/** 
 * 包含两个实例，示例如何获取汉字的拼音等信息。 
 * @author siqi 
 * 
 */  
public class Sample {  
  
    /** 
     * 字典使用实例 
     *  
     * @param args 
     */  
    public static void main(String[] args) {  
        try {  
            long time = System.currentTimeMillis();  
  
            char ch = '打';  
            //汉字单个字符  
            System.out.println("====打字信息开始====");  
            System.out.println("首字母："+Dic.GetFirstLetter(ch));  
            System.out.println("拼音（中）："+Dic.GetPinyinCn(ch));  
            System.out.println("拼音（英）："+Dic.GetPinyinEn(ch));  
            System.out.println("部首："+Dic.GetBushou(ch));  
            System.out.println("笔画数目："+Dic.GetBihua(ch));  
            System.out.println("笔画："+Dic.GetBishun(ch));  
            System.out.println("五笔："+Dic.GetWubi(ch));  
            System.out.println("====打字信息结束====");  
              
            //汉字字符串  
            System.out.println("\r\n====汉字字符串====");  
            System.out.println(Dic.GetPinyinEn("返回汉字字符串的拼音。"));  
            System.out.println(Dic.GetPinyinCn("返回汉字字符串的拼音。"));  
            System.out.println(Dic.GetFirstLetter("返回汉字字符串的拼音。"));  
            System.out.println("====汉字字符串====\r\n");  
              
            System.out.println("用时："+(System.currentTimeMillis()-time)+"毫秒");  
              
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
  
    }  
}  

4.结果

[html] view plain copy
====打字信息开始====  
成功载入字典C:\workspaces\01_java\DictLocal\dic.txt ，用时：15毫秒，载入字符数20902  
首字母：d  
拼音（中）：dǎ  
拼音（英）：da3  
部首：扌  
笔画数目：5  
笔画：12112  
五笔：RSH  
====打字信息结束====  
  
====汉字字符串====  
fan3 hui2 han4 zi4 zi4 fu2 chuan4 di2 pin1 yin1 。  
fǎn huí hàn zì zì fú chuàn dí pīn yīn 。  
fhhzzfcdpy  
====汉字字符串====  
  
Memory(Used/Total) : 1539/15872 KB  
用时：218毫秒  

待会再上传如何获取字典文件的，我是通过收集http://www.zdic.net/zd/的网页来获取的

=============补充，如何获取汉字的信息================

=============所有的信息都是从汉典网站上获取的=========

目录结构为：

环境：eclipse, jdk1.6, 没有使用第三方的包，都是JDK有的。

注意，项目源文件我都使用的是UTF-8的编码格式，如果不是，代码里面的汉字注释会显示乱码。

设置UTF-8：windows->Preferences->General->Workspace 页面上Text file encoding，选择Other UTF-8

包说明：

com.siqi.http

Httpclient.Java是我写的一个简单的获取网页的类，用来获取网页内容；

com.siqi.dict

DictMain.java用来下载汉字网页，从中获取汉字的拼音信息，并保存到data.txt中（之后需要手动改名为data.dat）

DownloadThread.java用来下载网页（多线程）

com.siqi.pinyin

PinYin.java在执行过DictMain.java后，会生成一个data.dat，把这个文件拷贝到com.siqi.pinyin包下面，就可以调用PinYin.java里面的函数得到汉字的拼音了

PinYinEle.java一个汉字->拼音->Unicode的模型

源码：

Httpclient.java 可以用来获取网页，可以得到网页内容，网页编码和网页的header，简版

[java] view plain copy
package com.siqi.http;  
  
import java.io.IOException;  
import java.io.InputStream;  
import java.net.Socket;  
import java.net.URLEncoder;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
/**
 * Minimal HTTP GET/POST client built directly on a {@link Socket}.
 * <p>
 * It always connects to port 80 and speaks plain HTTP/1.0, so {@code https} URLs and
 * URLs with an explicit port are not really supported. After a call to
 * {@link #processUrl(String, int)} the response header, body and detected charset
 * are available through the getters. Instances are not safe for concurrent use.
 *
 * @author siqi
 */
public class Httpclient {

    /** {@code method} value of {@link #processUrl(String, int)} selecting HTTP GET. */
    public static final int METHOD_GET = 0;
    /** {@code method} value of {@link #processUrl(String, int)} selecting HTTP POST. */
    public static final int METHOD_POST = 1;
    /** Simplified GET request; the placeholders are path and host. */
    public static final String HEADER_GET = "GET %s HTTP/1.0\r\nHOST: %s\r\n\r\n";
    /** Simplified POST request without body; the placeholders are path and host. */
    public static final String HEADER_POST = "POST %s HTTP/1.0\r\nHOST: %s\r\nContent-Length: 0\r\n\r\n";
    /** Separator between response header and response body. */
    public static final String CONTENT_SEPARATOR = "\r\n\r\n";
    /** Charset assumed when the page does not declare a usable one. */
    public static final String CHARSET_DEFAULT = "UTF-8";

    /** Matches a URL that consists of scheme and host only. */
    private static final Pattern SCHEME_AND_HOST = Pattern.compile("https?://[^/]+");
    /** Matches the host part following the scheme. */
    private static final Pattern HOST = Pattern.compile("(?<=https?://)[^/]+");
    /** Matches scheme and host of a URL that also has a path. */
    private static final Pattern PREFIX_BEFORE_PATH = Pattern.compile("https?://.+?(?=/)");
    /** Captures the charset name of a {@code <meta>} tag, with or without quotes. */
    private static final Pattern META_CHARSET = Pattern.compile(
            "<meta.{0,100}?charset=[\"']?([a-z0-9][a-z0-9_.:-]*)", Pattern.CASE_INSENSITIVE);
    /** Byte-transparent charset used to look for the charset declaration. */
    private static final java.nio.charset.Charset LATIN1 =
            java.nio.charset.Charset.forName("ISO-8859-1");
    /** Number of leading response bytes searched for the charset declaration. */
    private static final int SNIFF_LIMIT = 4096;

    /** Raw bytes of the last response. */
    private byte[] bytes = new byte[0];
    /** Header of the last response. */
    private String header = "";
    /** Body of the last response. */
    private String content = "";
    /** Charset used to decode the last response. */
    private String charset = CHARSET_DEFAULT;

    /**
     * Example: fetches the mobile Baidu start page and a search result page.
     *
     * @param args ignored
     * @throws Exception if a request fails
     */
    public static void main(String[] args) throws Exception {
        Httpclient httpclient = new Httpclient();
        // Request the Baidu start page (mobile version).
        httpclient.processUrl("http://m.baidu.com/");
        System.out.println("获取网页http://m.baidu.com/");
        System.out.println("报头为：\r\n" + httpclient.getHeader());
        System.out.println("内容为：\r\n" + httpclient.getContent());
        System.out.println("编码为：\r\n" + httpclient.getCharset());
        System.out.println("************************************");

        // Search Baidu (mobile version); the search box of that page is
        // <input id="word" type="text" size="20" maxlength="64" name="word">.
        String url = String.format("http://m.baidu.com/s?word=%s",
                URLEncoder.encode("中国", CHARSET_DEFAULT));
        httpclient.processUrl(url, METHOD_POST);
        System.out.println("获取网页http://m.baidu.com/s?word=中国");
        System.out.println("报头为：\r\n" + httpclient.getHeader());
        System.out.println("内容为：\r\n" + httpclient.getContent());
        System.out.println("编码为：\r\n" + httpclient.getCharset());
    }

    /** Resets the state kept from a previous request. */
    private void init() {
        this.bytes = new byte[0];
        this.charset = CHARSET_DEFAULT;
        this.header = "";
        this.content = "";
    }

    /**
     * @return the header of the last response, empty before the first request
     */
    public String getHeader() {
        return header;
    }

    /**
     * @return the body of the last response, empty before the first request
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the charset used to decode the last response
     */
    public String getCharset() {
        return charset;
    }

    /**
     * Fetches a page with HTTP GET.
     *
     * @param url address of the page
     * @throws Exception if the URL is unusable or the request fails
     */
    public void processUrl(String url) throws Exception {
        processUrl(url, METHOD_GET);
    }

    /**
     * Fetches a page over a plain socket, e.g.
     * {@code processUrl("http://www.baidu.com/", METHOD_GET)}.
     *
     * @param url    address of the page
     * @param method {@link #METHOD_POST} for POST, anything else for GET
     * @throws IllegalArgumentException if no host can be extracted from {@code url}
     * @throws Exception                if connecting, sending or reading fails
     */
    public void processUrl(String url, int method) throws Exception {

        init();

        // Normalize "http://www.baidu.com" to "http://www.baidu.com/".
        Matcher bare = SCHEME_AND_HOST.matcher(url);
        if (bare.find() && bare.group().equals(url)) {
            url += "/";
        }

        String host = getHostUrl(url);
        if (host.length() == 0) {
            throw new IllegalArgumentException("Not an http(s) URL: " + url);
        }

        // Build the request, see the HTTP specification (RFC 2616).
        String template = (method == METHOD_POST) ? HEADER_POST : HEADER_GET;
        String request = String.format(template, getSubUrl(url), host);

        Socket socket = new Socket(host, 80);
        try {
            socket.setSoTimeout(3000); // give up after 3 seconds without data
            socket.getOutputStream().write(request.getBytes(CHARSET_DEFAULT));
            socket.getOutputStream().flush();
            this.bytes = InputStream2ByteArray(socket.getInputStream());
        } finally {
            // Release the connection even when sending or reading failed.
            socket.close();
        }

        // Decode with the charset the page declares, then split header and body.
        this.charset = detectCharset(this.bytes);
        String response = new String(this.bytes, this.charset);
        int headerEnd = response.indexOf(CONTENT_SEPARATOR);
        if (headerEnd < 0) {
            // No separator: treat everything as header instead of failing in substring.
            this.header = response;
            this.content = "";
        } else {
            this.header = response.substring(0, headerEnd);
            this.content = response.substring(headerEnd + CONTENT_SEPARATOR.length());
        }
    }

    /**
     * Looks for a {@code <meta ... charset=...>} declaration in the first 4096 bytes
     * of a response.
     *
     * @param response raw response bytes, may be {@code null}
     * @return the declared charset name if this JVM supports it, otherwise
     *         {@link #CHARSET_DEFAULT}
     */
    public static String detectCharset(byte[] response) {
        if (response == null || response.length == 0) {
            return CHARSET_DEFAULT;
        }
        String head = new String(response, 0, Math.min(response.length, SNIFF_LIMIT), LATIN1);
        Matcher declared = META_CHARSET.matcher(head);
        if (declared.find()) {
            String name = declared.group(1);
            try {
                if (java.nio.charset.Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException ignored) {
                // Syntactically illegal charset name: fall back to the default.
            }
        }
        return CHARSET_DEFAULT;
    }

    /**
     * Extracts the host of a URL, e.g. {@code m.weathercn.com} from
     * {@code http://m.weathercn.com/common/province.jsp}.
     *
     * @param url an http or https URL, with or without a path
     * @return the host, or an empty string if the URL has no http(s) scheme
     */
    public static String getHostUrl(String url) {
        Matcher mat = HOST.matcher(url);
        return mat.find() ? mat.group() : "";
    }

    /**
     * Extracts the path of a URL, e.g. {@code /common/province.jsp} from
     * {@code http://m.weathercn.com/common/province.jsp}.
     *
     * @param url an http or https URL
     * @return the part after the host, or an empty string if the URL has no path
     */
    public static String getSubUrl(String url) {
        Matcher mat = PREFIX_BEFORE_PATH.matcher(url);
        return mat.find() ? url.substring(mat.group().length()) : "";
    }

    /**
     * Concatenates two byte arrays.
     *
     * @param b1 first part
     * @param b2 second part
     * @return a new array holding {@code b1} followed by {@code b2}
     */
    public static byte[] ByteArrayCat(byte[] b1, byte[] b2) {
        byte[] joined = new byte[b1.length + b2.length];
        System.arraycopy(b1, 0, joined, 0, b1.length);
        System.arraycopy(b2, 0, joined, b1.length, b2.length);
        return joined;
    }

    /**
     * Reads a stream until its end. Bytes are returned instead of a string because
     * the encoding of the stream is not known at this point.
     *
     * @param is stream to drain; it is not closed
     * @return all bytes read
     * @throws IOException if reading fails
     */
    public static byte[] InputStream2ByteArray(InputStream is)
            throws IOException {
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[4096];

        int len;
        while ((len = is.read(chunk)) != -1) {
            collected.write(chunk, 0, len);
        }

        return collected.toByteArray();
    }
}

DictMain.java

[java] view plain copy
package com.siqi.dict;  
  
import java.io.File;  
import java.io.FileReader;  
import java.io.FileWriter;  
import java.io.IOException;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
/** 
 * 从汉典下载汉字网页，并提取拼音信息 
 * @author siqi 
 * 
 */  
public class DictMain {  
    /** 
     * 网页保存路径 
     */  
    public static final String SAVEPATH = "dict/pages/";  
    /** 
     * 下载的汉字网页名称 
     */  
    public static final String FILEPATH = SAVEPATH + "%s.html";  
      
    /** 
     * 字典数据文件名称 
     */  
    public static final String DATA_FILENAME = "data.txt";  
      
    /** 
     * 汉字unicode最小 
     */  
    public static final int UNICODE_MIN = 0x4E00;  
      
    /** 
     * 汉字unicode最大 
     */  
    public static final int UNICODE_MAX = 0x9FFF;  
      
    /** 
     * 准备工作: 
     * 1.从汉典网站下载所有汉字的页面，注意，不要在eclipse中打开保存页面的文件夹， 
     * 因为每个汉字一个页面，总共有20000+个页面，容易卡死eclipse 
     * 2.从汉字页面获取汉字拼音信息，生成data.dat文件 
     * 3.生成的data.dat复制到com.siqi.pinyin下面 
     * 4.可以使用com.siqi.pinyin.PinYin.java了 
     */  
    static{  
        // 下载网页  
        for (int i = UNICODE_MIN; i <= UNICODE_MAX; i++) {  
            // 检查是否已经存在  
            String filePath = String.format(FILEPATH, i); // 文件名  
            File file = new File(filePath);  
            if (!file.exists()) {  
                new DownloadThread(i).start();  
            }  
        }  
          
        //解析网页，得到拼音信息，并保存到data.dat  
        StringBuffer sb = new StringBuffer();  
        for (int i = UNICODE_MIN; i <= UNICODE_MAX; i++) {  
            String word = new String(Character.toChars(i));  
            String pinyin = getPinYinFromWebpageFile(String.format(FILEPATH, i));  
            String str = String.format("%s,%s,%s\r\n", i,word,pinyin);  
            System.out.print(str);  
            sb.append(str);  
        }  
          
        //保存到data.dat  
        try {  
            FileWriter fw = new FileWriter(DATA_FILENAME);  
            fw.write(sb.toString());  
            fw.close();  
        } catch (IOException e) {  
            e.printStackTrace();  
        }  
          
    }  
      
    public static void main(String[] args){  
          
        System.out.println("All prepared!");  
    }  
      
    /** 
     * 从网页文件获取拼音信息 
     * @param file 
     * @return 
     */  
    private static String getPinYinFromWebpageFile(String file) {  
        try {  
              
            char[] buff = new char[(int) new File(file).length()];  
              
            FileReader reader = new FileReader(file);  
            reader.read(buff);  
            reader.close();  
              
            String content = new String(buff);  
            // spf("yi1")  
            Matcher mat = Pattern.compile("(?<=spf\\(\")[a-z1-4]{0,100}",  
                    Pattern.CASE_INSENSITIVE).matcher(content);  
            if (mat.find()) {  
                return mat.group();  
            }  
            //<span class="dicpy">cal</span> spf("xin1")  
            mat = Pattern.compile("(?<=class=\"dicpy\">)[a-z1-4]{0,100}",  
                    Pattern.CASE_INSENSITIVE).matcher(content);  
            if (mat.find()) {  
                return mat.group();  
            }  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
          
        return "";  
  
    }  
}  

DownloadThread.java

[java] view plain copy
package com.siqi.dict;  
  
import java.io.File;  
import java.io.FileWriter;  
import java.net.URLEncoder;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
import com.siqi.http.Httpclient;  
  
/** 
 * 将汉字页面从汉典网站下载下来，存储到本地 
 * http://www.zdic.net/search/?c=2 
 * @author siqi 
 * 
 */  
public class DownloadThread extends Thread{  
      
    /** 
     * 线程最大数目 
     */  
    public static int THREAD_MAX = 10;  
      
    /** 
     * 下载最大重复次数 
     */  
    public static int RETRY_MAX = 5;  
      
    /** 
     * 汉典网站搜索网址 
     */  
    public static String SEARCH_URL = "http://www.zdic.net/search/?q=%s";  
      
    /** 
     * 当前线程数目 
     */  
    private static int threadCnt = 0;  
      
    /** 
     * 当前线程处理汉字的unicode编码 
     */  
    private int unicode = 0;  
      
    /** 
     * 如果PATH文件夹不存在，那么创建它 
     */  
    static{  
        try {  
            File file = new File(DictMain.SAVEPATH);  
            if (!file.exists()) {  
                file.mkdirs();  
            }  
        } catch (Exception e) {  
  
        }  
    }  
      
    /** 
     * 返回当前线程数量 
     * @param i 修改当前线程数量 threadCnt += i; 
     * @return 返回修改后线程数量 
     */  
    public static synchronized int threadCnt(int i){  
        threadCnt += i;  
        return threadCnt;  
    }  
      
    /** 
     * 下载UNICODE编码为unicode的汉字网页 
     * @param unicode 
     */  
    public DownloadThread(int unicode){  
        //等待，直到当前线程数量小于THREAD_MAX  
        while(threadCnt(0)>THREAD_MAX){  
            try {  
                Thread.sleep(500);  
            } catch (InterruptedException e) {  
            }  
        }  
          
        threadCnt(1);   //线程数量+1  
        this.unicode = unicode;  
    }  
  
    @Override  
    public void run() {  
        long t1 = System.currentTimeMillis(); // 记录时间  
  
        String filePath = String.format(DictMain.FILEPATH, unicode); // 文件名  
  
        String word = new String(Character.toChars(unicode)); // 将unicode转换为数字  
  
        boolean downloaded = false;  
        int retryCnt = 0; // 下载失败重复次数  
        while (!downloaded && retryCnt < RETRY_MAX) {  
            try {  
                String content = DownloadPage(word);  
                SaveToFile(filePath, content);  
                downloaded = true;  
  
                threadCnt(-1);  
                System.out.println(String.format("%s, %s, 下载成功！线程数目：%s 用时：%s",  
                        unicode, word, threadCnt(0), System.currentTimeMillis()  
                                - t1));  
                return;  
            } catch (Exception e) {  
                retryCnt++;  
            }  
        }  
  
        threadCnt(-1);  
        System.err.println(String.format("%s, %s, 下载失败！线程数目：%s 用时：%s", unicode,  
                word, threadCnt(0), System.currentTimeMillis() - t1));  
    }  
      
    /** 
     * 在汉典网站上查找汉字，返回汉字字典页面内容 
     * @param word 
     * @return 
     * @throws Exception 
     */  
    public String DownloadPage(String word) throws Exception{  
        //查找word  
        Httpclient httpclient = new Httpclient();  
        String url = String.format(SEARCH_URL, URLEncoder.encode(word, "UTF-8"));  
        httpclient.processUrl(url, Httpclient.METHOD_POST);  
          
        //返回的是一个跳转页  
        //获取跳转的链接  
        Matcher mat = Pattern.compile("(?<=HREF=\")[^\"]+").matcher(httpclient.getContent());  
        if(mat.find()){  
            httpclient.processUrl(mat.group());  
        }  
          
        return httpclient.getContent();  
    }  
      
    /** 
     * 将内容content写入file文件 
     * @param file 
     * @param content 
     */  
    public void SaveToFile(String file, String content){  
        try {  
            FileWriter fw = new FileWriter(file);  
            fw.write(content);  
            fw.close();  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
    }  
}  

PinYin.java

[java] view plain copy
package com.siqi.pinyin;  
  
import java.io.BufferedReader;  
import java.io.InputStreamReader;  
import java.util.HashMap;  
import java.util.Map;  
  
public class PinYin {  
  
    private static Map<Integer, PinYinEle> map = new HashMap<Integer, PinYinEle>();  
  
    /** 
     * 载入pinyin数据文件 
     */  
    static {  
        try {  
            BufferedReader bReader = new BufferedReader(new InputStreamReader(  
                    PinYin.class.getResourceAsStream("data.dat")));  
            String aLine = null;  
            while ((aLine = bReader.readLine()) != null) {  
                PinYinEle ele = new PinYinEle(aLine);  
                map.put(ele.getUnicode(), ele);  
            }  
            bReader.close();  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
    }  
  
    /** 
     * 去掉注释可以测试一下 
     *  
     * @param args 
     */  
    public static void main(String[] args) {  
        System.out.println("　包含声调：" + PinYin.getPinYin("大家haome12345"));  
        System.out.println("不包含声调：" + PinYin.getPinYin("大家haome12345", false));  
    }  
  
    /** 
     * 获取汉字字符串的拼音，containsNumber是否获取拼音中的声调1、2、3、4 
     *  
     * @param str 
     * @param containsNumber 
     *            true = 包含声调，false = 不包含声调 
     * @return 
     */  
    public static String getPinYin(String str, boolean containsNumber) {  
        StringBuffer sb = new StringBuffer();  
        for (Character ch : str.toCharArray()) {  
            sb.append(getPinYin(ch, containsNumber));  
        }  
  
        return sb.toString();  
    }  
  
    /** 
     * 获取字符串的拼音 
     *  
     * @param str 
     * @return 
     */  
    public static String getPinYin(String str) {  
        StringBuffer sb = new StringBuffer();  
        for (Character ch : str.toCharArray()) {  
            sb.append(getPinYin(ch));  
        }  
  
        return sb.toString();  
    }  
  
    /** 
     * 获取单个汉字的拼音，包含声调 
     *  
     * @param ch 
     * @return 
     */  
    public static String getPinYin(Character ch) {  
        return getPinYin(ch, true);  
    }  
  
    /** 
     * 获取单个汉字的拼音 
     *  
     * @param ch 
     *            汉字. 如果输入非汉字，返回ch. 如果输入null，返回空字符串； 
     * @param containsNumber 
     *            true = 包含声调，false = 不包含声调 
     * @return 
     */  
    public static String getPinYin(Character ch, boolean containsNumber) {  
        if (ch != null) {  
            int code = ch.hashCode();  
            if (map.containsKey(code)) {  
                if (containsNumber) {  
                    return map.get(code).getPinyin();  
                } else {  
                    return map.get(code).getPinyin().replaceAll("[0-9]", "");  
                }  
            } else {  
                return ch.toString();  
            }  
        }  
        return "";  
    }  
}  

PinYinEle.java

[java] view plain copy
package com.siqi.pinyin;  
  
/**
 * One record of the pinyin data file: code point, character and pinyin.
 */
public class PinYinEle {
    /** Code point of the character; 0 when unknown. */
    private int unicode;
    /** The character itself. */
    private String ch;
    /** Pinyin of the character, usually with a trailing tone digit. */
    private String pinyin;

    /** Creates an empty record. */
    public PinYinEle(){}

    /**
     * Parses a line of the form {@code code,character,pinyin}, e.g.
     * {@code 19968,一,yi1}. A leading byte order mark and whitespace around the
     * fields are ignored. A line that does not have exactly three fields leaves the
     * record empty; a non-numeric code leaves the code point at 0.
     *
     * @param str line to parse, may be {@code null}
     */
    public PinYinEle(String str){
        if (str == null) {
            return;
        }
        String line = str;
        // The first line of a UTF-8 file may start with a byte order mark.
        if (line.length() > 0 && line.charAt(0) == '\uFEFF') {
            line = line.substring(1);
        }
        String[] strs = line.split(",");
        if (strs.length != 3) {
            return;
        }
        try {
            this.unicode = Integer.parseInt(strs[0].trim());
        } catch (NumberFormatException ignored) {
            // Malformed code: keep 0 so that callers can recognise the record as invalid.
        }
        this.ch = strs[1].trim();
        this.pinyin = strs[2].trim();
    }

    /** @return the code point of the character, 0 when unknown */
    public int getUnicode() {
        return unicode;
    }

    /** @param unicode the code point of the character */
    public void setUnicode(int unicode) {
        this.unicode = unicode;
    }

    /** @return the character, {@code null} when unset */
    public String getCh() {
        return ch;
    }

    /** @param ch the character */
    public void setCh(String ch) {
        this.ch = ch;
    }

    /** @return the pinyin, {@code null} when unset */
    public String getPinyin() {
        return pinyin;
    }

    /** @param pinyin the pinyin */
    public void setPinyin(String pinyin) {
        this.pinyin = pinyin;
    }

}

生成的data.dat里面内容（部分）为：

[java] view plain copy
﻿19968,一,yi1  
19969,丁,ding1  
19970,丂,kao3  
19971,七,qi1  
19972,丄,shang4  
19973,丅,xia4  
19974,丆,han3  
19975,万,wan4  
19976,丈,zhang4  
19977,三,san1  
19978,上,shang4  
19979,下,xia4  
19980,丌,qi2  
19981,不,bu4  

运行DictMain.java结果

执行时间可能会有几十分钟到几小时不等，总共会下载200+M的网页（20000+个网页），每次运行都会先判断以前下载过没有，所以结束掉程序不会有影响

显示All prepared!表示已经准备好了，刷新项目文件夹，可以看到网页保存在dict/pages下面，不建议在eclipse中打开那个文件夹，因为里面有2万多个文件，会卡死eclipse，

还可以看到生成了data.txt文件，改为data.dat并复制到pinyin文件夹下面

运行PinYin.java

可以看到"大家haome12345"的拼音：

[java] view plain copy
包含声调：da4jia1haome12345  
不包含声调：dajiahaome12345  

上面只是显示了如何获取拼音，获取笔画等的方法类似，在这里就不演示了。

1 0