汉字转拼音,主要是多音字的处理

来源:互联网 发布:时时彩冷热数据 编辑:程序博客网 时间:2024/04/28 00:07
package com.dt.luochen.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Converts Chinese text to toneless, lower-case pinyin using pinyin4j, choosing the reading of
 * polyphonic characters (characters with several readings) with the help of a phrase dictionary.
 *
 * <p>The dictionary is read once, at class-load time, from the classpath resource
 * {@code /duoyinzi_dic.txt} (UTF-8). Each line has the form
 * {@code pinyin#phrase1 phrase2 ...}: the text before {@code #} is a reading and the
 * space-separated phrases after it are words in which a character takes that reading.
 *
 * <p>The conversion methods keep no state between calls; every call returns the result for its
 * own argument only.
 */
public class LuochenPinyinUtils {

    /** Reading -> phrases in which a polyphonic character is pronounced with that reading. */
    private static final Map<String, List<String>> pinyinMap = new HashMap<String, List<String>>();

    /**
     * Characters removed from every returned string: comma, space, '[' and ']'.
     * Earlier versions produced their result by formatting a list and stripping the list
     * punctuation with this pattern, which also removed these characters when they came from
     * the input itself. The stripping is kept so existing callers see unchanged output.
     */
    private static final String regx = "(,| |\\[|\\])";

    /**
     * Phrase windows tried around a polyphonic character at index {@code i}, as
     * {startOffset, endOffsetExclusive} pairs relative to {@code i}, in priority order:
     * the character plus two following, plus one following, plus two preceding,
     * plus one preceding, and finally one preceding and one following.
     */
    private static final int[][] MATCH_WINDOWS = {
        {0, 3},
        {0, 2},
        {-2, 1},
        {-1, 1},
        {-1, 2},
    };

    /*
     * Load the polyphonic phrase dictionary. A missing or unreadable resource is reported on
     * stderr and leaves the dictionary empty (polyphonic characters then fall back to their
     * first listed reading) instead of making the class fail to initialize.
     */
    static {
        InputStream in = LuochenPinyinUtils.class.getResourceAsStream("/duoyinzi_dic.txt");
        if (in == null) {
            System.err.println("LuochenPinyinUtils: classpath resource /duoyinzi_dic.txt not found; "
                    + "polyphonic characters will use their first listed reading");
        } else {
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                String line;
                while ((line = br.readLine()) != null) {
                    addDictionaryLine(line);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    // Closing the reader also closes the stream; close the raw stream only
                    // when the reader could not be constructed.
                    if (br != null) {
                        br.close();
                    } else {
                        in.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void main(String[] args) {
        String str = "重庆银行";

        String py = getPinyin(str);
        System.out.println(str + " = " + py);
    }

    /**
     * Returns the full pinyin of {@code chinese}, syllables concatenated without separators.
     *
     * <p>Characters with code point 128 or below are copied through unchanged; other characters
     * for which pinyin4j reports no reading are dropped. Comma, space, '[' and ']' are removed
     * from the result.
     *
     * @param chinese the text to convert; {@code null} is treated as empty
     * @return the converted text, never {@code null}
     */
    public static String getPinyin(String chinese) {
        StringBuilder full = new StringBuilder();
        convert(chinese, full, new StringBuilder());
        return full.toString().replaceAll(regx, "");
    }

    /**
     * Returns the first letter of the pinyin of every character of {@code chinese},
     * concatenated without separators. Pass-through, dropping and stripping rules are the same
     * as for {@link #getPinyin(String)}.
     *
     * @param chinese the text to convert; {@code null} is treated as empty
     * @return the initials, never {@code null}
     */
    public static String getHeadPinyin(String chinese) {
        StringBuilder head = new StringBuilder();
        convert(chinese, new StringBuilder(), head);
        return head.toString().replaceAll(regx, "");
    }

    /**
     * Runs the conversion and discards the result.
     *
     * <p>This method used to append into shared static buffers that were never cleared, which
     * made every later {@link #getPinyin(String)} / {@link #getHeadPinyin(String)} call return
     * the output of all previous calls as well. It is kept only so existing callers still
     * compile.
     *
     * @param chinese the text to convert; {@code null} is ignored
     * @deprecated use {@link #getPinyin(String)} or {@link #getHeadPinyin(String)}
     */
    @Deprecated
    public static void convertChineseToPinyin(String chinese) {
        convert(chinese, new StringBuilder(), new StringBuilder());
    }

    // Parses one "pinyin#phrase1 phrase2 ..." dictionary line; lines without both parts are skipped.
    private static void addDictionaryLine(String line) {
        String[] parts = line.split("#");
        if (parts.length < 2) {
            return;
        }
        String[] words = parts[1].split(" ");
        List<String> phrases = new ArrayList<String>();
        for (int i = 0; i < words.length; i++) {
            String word = words[i].trim();
            if (word.length() > 0) {
                phrases.add(word);
            }
        }
        // A later line with the same reading replaces an earlier one.
        pinyinMap.put(parts[0], phrases);
    }

    // Converts chinese character by character, appending full readings to full and initials to head.
    private static void convert(String chinese, StringBuilder full, StringBuilder head) {
        if (chinese == null) {
            return;
        }

        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        int length = chinese.length();
        for (int i = 0; i < length; i++) {
            char ch = chinese.charAt(i);
            if (ch <= 128) {
                // Not a candidate for conversion: copy through unchanged.
                append(String.valueOf(ch), full, head);
                continue;
            }

            String[] readings;
            try {
                readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                e.printStackTrace();
                continue;
            }
            if (readings == null || readings.length == 0) {
                // pinyin4j has no reading for this character; it is omitted from the output.
                continue;
            }
            append(chooseReading(readings, chinese, i), full, head);
        }
    }

    // Picks the reading for the character at index: the first distinct reading whose dictionary
    // phrases contain one of the windows around the character, else the first listed reading.
    private static String chooseReading(String[] readings, String chinese, int index) {
        // With tones removed several entries can be identical; keep first occurrences in order.
        List<String> candidates = new ArrayList<String>();
        for (int k = 0; k < readings.length; k++) {
            if (!candidates.contains(readings[k])) {
                candidates.add(readings[k]);
            }
        }

        if (candidates.size() > 1) {
            int length = chinese.length();
            for (int x = 0; x < candidates.size(); x++) {
                String candidate = candidates.get(x);
                for (int w = 0; w < MATCH_WINDOWS.length; w++) {
                    int start = index + MATCH_WINDOWS[w][0];
                    int end = index + MATCH_WINDOWS[w][1];
                    if (start >= 0 && end <= length && matchPinyins(candidate, chinese, start, end)) {
                        return candidate;
                    }
                }
            }
        }
        // Single reading, or no phrase matched: use the first reading pinyin4j listed.
        return candidates.get(0);
    }

    // Returns true when chinese[m, n) is one of the dictionary phrases registered for reading py.
    private static boolean matchPinyins(String py, String chinese, int m, int n) {
        List<String> phrases = pinyinMap.get(py);
        return phrases != null && phrases.contains(chinese.substring(m, n));
    }

    // Appends one syllable (or pass-through character) to full and its first character to head.
    private static void append(String py, StringBuilder full, StringBuilder head) {
        if (py == null || py.length() == 0) {
            return;
        }
        full.append(py);
        head.append(py.charAt(0));
    }

}
0 0
原创粉丝点击