简单关键词匹配算法
来源:互联网 发布:在线谐音英文名软件 编辑:程序博客网 时间:2024/05/20 04:51
本文针对微博这类短文本,给出一个简单的分词与关键词匹配算法。与面向长文档的复杂分词算法相比,它能在效率和可用性之间取得较好的平衡。
package com.sina.tblog.sentiment;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import com.sina.tblog.sentiment.constant.Constant;

/**
 * Dictionary-based keyword matcher for short texts.
 *
 * <p>Keywords are loaded once, in the static initializer, from the files named by
 * {@code Constant.KeyWordsFiles} (one keyword per line). Text is then segmented by
 * sliding a window of {@code Constant.max_len} down to {@code Constant.min_len}
 * characters over the input and looking each candidate up in the dictionary.
 *
 * <p>All state is held in static, unsynchronized collections.
 */
public class KeyWordFilter {

    /** Keywords exactly as they were loaded or added. */
    public static HashSet<String> KeyWordsList = null;

    /**
     * Upper-cased forms of the keywords that contain at least one ASCII letter;
     * consulted for case-insensitive lookups.
     */
    public static HashSet<String> letterKeyWordsList = null;

    /**
     * Matches any ASCII letter. The previous pattern, {@code (?i)[a-z][A-Z]}, required
     * two adjacent letters, so keywords with a single isolated letter were never
     * indexed for case-insensitive matching. Compiled once instead of per word.
     */
    private static final Pattern ASCII_LETTER = Pattern.compile("[a-zA-Z]");

    /*
     * Load the keyword dictionary when the class is initialized. A failure is
     * reported but deliberately not rethrown, so the class stays usable with
     * whatever part of the dictionary was read before the error.
     */
    static {
        try {
            initKeyWords(Constant.KeyWordsFiles);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes a keyword from the in-memory dictionary.
     *
     * <p>NOTE(review): after removing the word this method appends it to
     * {@code Constant.newWordsFile}, the same file {@link #addWord(String)} appends
     * to. That looks like a copy-paste of the add path; the behavior is kept as-is
     * here and should be confirmed against the intended file format.
     *
     * <p>NOTE(review): the upper-cased entry is removed even if another keyword that
     * differs only in case is still present.
     *
     * @param word keyword to remove
     * @return {@code -1} if {@code word} is null, shorter than 2 or longer than 10
     *         characters, or if the file write fails (the in-memory removal has
     *         already happened in that case); {@code 0} if the word is not in the
     *         dictionary; {@code 1} on success
     */
    public static int deleteNewWord(String word) {
        if (word == null || word.length() > 10 || word.length() < 2) {
            return -1;
        }
        if (!KeyWordsList.remove(word)) {
            return 0;
        }
        if (containsAsciiLetter(word)) {
            letterKeyWordsList.remove(upper(word));
        }
        return appendToNewWordsFile(word) ? 1 : -1;
    }

    /**
     * Adds a keyword to the in-memory dictionary and appends it to
     * {@code Constant.newWordsFile}.
     *
     * @param word keyword to add
     * @return {@code -1} if {@code word} is null or longer than 10 characters, or if
     *         the file write fails (the word has already been added in memory in that
     *         case); {@code 0} if the word is already present; {@code 1} on success
     */
    public static int addWord(String word) {
        if (word == null || word.length() > 10) {
            return -1;
        }
        if (!KeyWordsList.add(word)) {
            return 0;
        }
        indexForIgnoreCase(word);
        return appendToNewWordsFile(word) ? 1 : -1;
    }

    /**
     * Initializes or reloads the keyword dictionary from the given files, one
     * keyword per line. Existing sets are cleared and reused.
     *
     * @param files paths of the dictionary files
     * @throws IOException if a file cannot be opened or read
     */
    private static void initKeyWords(String files[]) throws IOException {
        if (KeyWordsList != null) {
            KeyWordsList.clear();
        } else {
            KeyWordsList = new HashSet<String>();
        }
        if (letterKeyWordsList != null) {
            letterKeyWordsList.clear();
        } else {
            letterKeyWordsList = new HashSet<String>();
        }

        for (int i = 0; i < files.length; i++) {
            // Reads with the platform default charset, as the original code did.
            BufferedReader reader = new BufferedReader(new FileReader(new File(files[i])));
            try {
                String line = reader.readLine();
                while (line != null) {
                    KeyWordsList.add(line);
                    indexForIgnoreCase(line);
                    line = reader.readLine();
                }
            } finally {
                // Close even when readLine() fails, so the file handle is not leaked.
                reader.close();
            }
        }
    }

    /** Returns whether {@code word} contains at least one ASCII letter. */
    private static boolean containsAsciiLetter(String word) {
        return ASCII_LETTER.matcher(word).find();
    }

    /** Upper-cases independently of the JVM default locale (e.g. Turkish dotted i). */
    private static String upper(String word) {
        return word.toUpperCase(Locale.ROOT);
    }

    /** Registers the upper-cased form of {@code word} if it contains an ASCII letter. */
    private static void indexForIgnoreCase(String word) {
        if (containsAsciiLetter(word)) {
            letterKeyWordsList.add(upper(word));
        }
    }

    /**
     * Appends a newline followed by {@code word} to {@code Constant.newWordsFile},
     * using the platform default charset.
     *
     * @return {@code true} if the word was written, {@code false} on I/O failure
     */
    private static boolean appendToNewWordsFile(String word) {
        OutputStreamWriter writer = null;
        try {
            writer = new OutputStreamWriter(new FileOutputStream(Constant.newWordsFile, true));
            writer.write("\n" + word);
            // Flush inside the try so a failed write is reported through the return value.
            writer.flush();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Looks a candidate term up in the dictionary.
     *
     * @param str        candidate term
     * @param ignoreCase when {@code true}, fall back to a case-insensitive lookup if
     *                   the exact form is not found
     * @return whether the term is a known keyword
     */
    private static boolean findWord(String str, boolean ignoreCase) {
        if (KeyWordsList.contains(str)) {
            return true;
        }
        return ignoreCase && letterKeyWordsList.contains(upper(str));
    }

    /**
     * Segments a line with forward longest-match: at each position the longest
     * dictionary term wins and scanning resumes after it.
     *
     * @param str_line   text to segment; {@code null} yields an empty list
     * @param ignoreCase whether lookups ignore ASCII letter case
     * @return matched terms in order of appearance (non-overlapping)
     */
    public static List<String> segmentStrQuickMatch(String str_line, boolean ignoreCase) {
        List<String> terms = new ArrayList<String>();
        if (str_line == null) {
            return terms;
        }
        int size = str_line.length();
        int left = 0;
        while (left < size) {
            String matched = null;
            int len = Constant.max_len;
            // Try each candidate length, longest first.
            while (len >= Constant.min_len) {
                int right = left + len;
                int overflow = 0;
                if (right > size) {
                    overflow = right - size;
                    right = size;
                }
                String candidate = str_line.substring(left, right);
                if (findWord(candidate, ignoreCase)) {
                    matched = candidate;
                    break;
                }
                // When the window was clamped to the end of the line, skip every
                // length that would clamp to the same substring again.
                len -= (overflow > 0) ? overflow + 1 : 1;
            }
            if (matched == null) {
                // No dictionary hit: advance one character so that mixed
                // Chinese/English words can still be split out. Invalid characters
                // are not filtered here (known limitation, to be improved).
                left += 1;
            } else {
                // Dictionary hit: record it and continue after the term. Always
                // advance at least one character so the scan terminates even if a
                // zero-length term were ever matched.
                terms.add(matched);
                left += Math.max(1, matched.length());
            }
        }
        return terms;
    }

    /**
     * Segments a line exhaustively: at every position, every dictionary term of
     * every candidate length is reported, so results may overlap.
     *
     * @param str_line   text to segment; {@code null} yields an empty list
     * @param ignoreCase whether lookups ignore ASCII letter case
     * @return all matched terms, ordered by start position and then longest first
     */
    public static List<String> segmentStrFullMatch(String str_line, boolean ignoreCase) {
        List<String> terms = new ArrayList<String>();
        if (str_line == null) {
            return terms;
        }
        int size = str_line.length();
        for (int left = 0; left < size; left++) {
            int len = Constant.max_len;
            // Try each candidate length, longest first.
            while (len >= Constant.min_len) {
                int right = left + len;
                int overflow = 0;
                if (right > size) {
                    overflow = right - size;
                    right = size;
                }
                String candidate = str_line.substring(left, right);
                if (findWord(candidate, ignoreCase)) {
                    terms.add(candidate);
                }
                // Skip lengths that would clamp to the same end-of-line substring.
                len -= (overflow > 0) ? overflow + 1 : 1;
            }
        }
        return terms;
    }

    /** Small demo: prints every dictionary term found in a sample phrase. */
    public static void main(String[] args) throws IOException {
        System.out.println(segmentStrFullMatch("中华人民共和国", true));
    }
}
- 简单关键词匹配算法
- DFA 算法实现关键词匹配
- 简单的网页关键词匹配工具
- 简单的网页关键词匹配工具
- 简单模式匹配算法
- 简单匹配算法测试
- 字符串简单匹配算法
- 简单匹配算法
- 关键字过滤系统(二)关键词匹配算法
- 简单的括号匹配算法
- 简单的字符串匹配算法
- 串匹配(简单算法)
- 简单的字符串匹配算法
- 简单匹配算法(BF)
- 海量字符串匹配或海量关键词匹配(脏字/敏感词汇搜索算法)
- 怎么匹配关键词
- 正则表达式匹配关键词
- 简单模式匹配算法和KMP模式匹配算法
- 线段树思想实现矩形切割
- 手机AP处理器简介:MTK-Marvell-Mtekvision
- 字符编码
- SetUnhandledExceptionFilter,设置异常处理函数
- Java中split的使用
- 简单关键词匹配算法
- <iOS>响应事件传递, nextResponder研究
- gunicorn.werkzeug和quixote的结合使用-框架篇
- 漫谈QWidget及其派生类(二)
- 08 - 基本控件:GridView表格视图
- (转)Android:Activity/Service/Broadcast Receiver/Content Provider
- 九、网络编程
- Base16Encoder.java
- Flash Air ActionScript3 二进制数据写入文件