简单关键词匹配算法

来源:互联网 发布:在线谐音英文名软件 编辑:程序博客网 时间:2024/05/20 04:51

本文给出一个针对微博这类短文本编写的简单分词与关键词匹配算法。与面向整篇文档的复杂分词算法相比,它能在效率和可用性之间取得较好的平衡。


package com.sina.tblog.sentiment;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import com.sina.tblog.sentiment.constant.Constant;

/**
 * Dictionary-based keyword matcher for short texts.
 *
 * <p>Keywords are loaded once, when the class is initialized, from the files
 * listed in {@code Constant.KeyWordsFiles} (one keyword per line). Text is
 * segmented by looking up substrings of the input in that dictionary; see
 * {@link #segmentStrQuickMatch} and {@link #segmentStrFullMatch}.
 *
 * <p>All state is held in static collections and no synchronization is
 * performed in this class.
 *
 * <p>NOTE(review): files are read and written with the platform default
 * charset ({@code FileReader} / {@code OutputStreamWriter} without an explicit
 * charset). Confirm the encoding of the keyword files before pinning one.
 */
public class KeyWordFilter {

    /** Every keyword, exactly as read from the keyword files or added at runtime. */
    public static HashSet<String> KeyWordsList = null;

    /**
     * Upper-cased copies of the keywords that contain two adjacent ASCII
     * letters; consulted only for case-insensitive lookups.
     */
    public static HashSet<String> letterKeyWordsList = null;

    /**
     * Matches two adjacent ASCII letters in either case. Compiled once instead
     * of once per keyword. Must be declared before the static initializer
     * below, which uses it while loading the dictionary.
     */
    private static final Pattern LETTER_PAIR = Pattern.compile("(?i)[a-z][A-Z]");

    /*
     * Loads the keyword dictionary when the class is initialized. A failure to
     * read a keyword file is printed and otherwise ignored, leaving whatever
     * was loaded up to that point.
     */
    static {
        try {
            initKeyWords(Constant.KeyWordsFiles);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes a keyword from the in-memory dictionary.
     *
     * <p>NOTE(review): after removing the word this method appends it to
     * {@code Constant.newWordsFile}, the same file {@link #addWord} appends to.
     * That is the original behavior and is kept unchanged; confirm whether a
     * deletion is really meant to be recorded there.
     *
     * @param word the keyword to remove
     * @return {@code 1} if the word was removed and recorded, {@code 0} if it
     *         was not in the dictionary, {@code -1} if {@code word} is
     *         {@code null}, shorter than 2 or longer than 10 characters, or if
     *         writing the file failed (the in-memory removal is not undone)
     */
    public static int deleteNewWord(String word) {
        if (word == null || word.length() > 10 || word.length() < 2) {
            return -1;
        }
        if (!KeyWordsList.contains(word)) {
            return 0;
        }
        KeyWordsList.remove(word);
        if (hasLetterPair(word)) {
            letterKeyWordsList.remove(upper(word));
        }
        return appendToNewWordsFile(word) ? 1 : -1;
    }

    /**
     * Adds a keyword to the in-memory dictionary and appends it to
     * {@code Constant.newWordsFile}.
     *
     * @param word the keyword to add
     * @return {@code 1} if the word was added and recorded, {@code 0} if it
     *         was already in the dictionary, {@code -1} if {@code word} is
     *         {@code null} or longer than 10 characters, or if writing the
     *         file failed (the in-memory addition is not undone)
     */
    public static int addWord(String word) {
        if (word == null || word.length() > 10) {
            return -1;
        }
        if (KeyWordsList.contains(word)) {
            return 0;
        }
        KeyWordsList.add(word);
        if (hasLetterPair(word)) {
            letterKeyWordsList.add(upper(word));
        }
        return appendToNewWordsFile(word) ? 1 : -1;
    }

    /**
     * Clears (or creates) both dictionaries and reloads them from the given
     * files, one keyword per line. Lines are stored verbatim, without trimming.
     *
     * @param files paths of the keyword files to read
     * @throws IOException if a file cannot be opened or read
     */
    private static void initKeyWords(String[] files) throws IOException {
        if (KeyWordsList != null) {
            KeyWordsList.clear();
        } else {
            KeyWordsList = new HashSet<String>();
        }
        if (letterKeyWordsList != null) {
            letterKeyWordsList.clear();
        } else {
            letterKeyWordsList = new HashSet<String>();
        }

        for (int i = 0; i < files.length; i++) {
            BufferedReader reader = new BufferedReader(new FileReader(new File(files[i])));
            try {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    KeyWordsList.add(line);
                    if (hasLetterPair(line)) {
                        letterKeyWordsList.add(upper(line));
                    }
                }
            } finally {
                // Close even when readLine() throws, so the file handle is not leaked.
                reader.close();
            }
        }
    }

    /**
     * Looks a candidate term up in the dictionary.
     *
     * @param str        the candidate term
     * @param ignoreCase when {@code true}, a term that has no exact match is
     *                   also looked up, upper-cased, in the letter-keyword index
     * @return {@code true} if the term is a known keyword
     */
    private static boolean findWord(String str, boolean ignoreCase) {
        if (KeyWordsList.contains(str)) {
            return true;
        }
        return ignoreCase && letterKeyWordsList.contains(upper(str));
    }

    /**
     * Greedy forward longest-match segmentation.
     *
     * <p>At each position the longest dictionary term is taken, trying lengths
     * from {@code Constant.max_len} down to {@code Constant.min_len}, and
     * scanning resumes right after it. When nothing matches, the scan advances
     * by a single character so that keywords embedded in mixed Chinese/English
     * text can still be found; characters are not otherwise filtered.
     *
     * <p>Lengths are counted in {@code char}s. Candidates are never longer
     * than the remaining text and never shorter than the minimum length.
     *
     * @param str_line   the text to segment; {@code null} yields an empty list
     * @param ignoreCase whether lookups also try the case-insensitive index
     * @return the matched keywords in order of appearance
     */
    public static List<String> segmentStrQuickMatch(String str_line, boolean ignoreCase) {
        List<String> list = new ArrayList<String>();
        if (str_line == null) {
            return list;
        }
        int size = str_line.length();
        // A floor of 1 guarantees the scan always advances, even if min_len is misconfigured.
        int minLen = Math.max(1, Constant.min_len);

        int left = 0;
        while (left < size) {
            int matchedLen = 0;
            for (int len = Math.min(Constant.max_len, size - left); len >= minLen; len--) {
                String term = str_line.substring(left, left + len);
                if (findWord(term, ignoreCase)) {
                    list.add(term);
                    matchedLen = len;
                    break;
                }
            }
            // Dictionary hit: skip past the term. Miss: slide forward one character.
            left += (matchedLen > 0) ? matchedLen : 1;
        }
        return list;
    }

    /**
     * Exhaustive segmentation: reports every dictionary term found at every
     * start position, so overlapping and nested keywords are all returned.
     *
     * <p>For each start position, lengths are tried from
     * {@code Constant.max_len} down to {@code Constant.min_len}, so at a given
     * position longer terms are listed before shorter ones. Lengths are
     * counted in {@code char}s. Candidates are never longer than the remaining
     * text and never shorter than the minimum length.
     *
     * @param str_line   the text to segment; {@code null} yields an empty list
     * @param ignoreCase whether lookups also try the case-insensitive index
     * @return all matched keywords, ordered by start position
     */
    public static List<String> segmentStrFullMatch(String str_line, boolean ignoreCase) {
        List<String> list = new ArrayList<String>();
        if (str_line == null) {
            return list;
        }
        int size = str_line.length();
        int minLen = Math.max(1, Constant.min_len);

        for (int left = 0; left < size; left++) {
            for (int len = Math.min(Constant.max_len, size - left); len >= minLen; len--) {
                String term = str_line.substring(left, left + len);
                if (findWord(term, ignoreCase)) {
                    list.add(term);
                }
            }
        }
        return list;
    }

    /** Returns whether the text contains two adjacent ASCII letters. */
    private static boolean hasLetterPair(String text) {
        return LETTER_PAIR.matcher(text).find();
    }

    /**
     * Upper-cases with a fixed locale so that indexing and lookup agree
     * regardless of the JVM default locale (e.g. Turkish dotted/dotless i).
     */
    private static String upper(String text) {
        return text.toUpperCase(Locale.ENGLISH);
    }

    /**
     * Appends a newline followed by the word to {@code Constant.newWordsFile}.
     *
     * @return {@code true} on success, {@code false} if the write failed (the
     *         exception is printed)
     */
    private static boolean appendToNewWordsFile(String word) {
        FileOutputStream stream = null;
        try {
            stream = new FileOutputStream(Constant.newWordsFile, true);
            OutputStreamWriter writer = new OutputStreamWriter(stream);
            writer.write("\n" + word);
            // close() flushes the encoder; a failure here must count as a failed write.
            writer.close();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            // Redundant after a successful writer.close(); releases the file on the failure path.
            closeQuietly(stream);
        }
    }

    /** Closes the resource if non-null, ignoring any failure: the caller has already decided the outcome. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close here.
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(segmentStrFullMatch("中华人民共和国", true));
    }
}


原创粉丝点击