Java、Scala 通过 ansj 分词并去除停用词

来源:互联网 发布:京东数据开放平台 编辑:程序博客网 时间:2024/06/05 07:30
下载 ansj jar,或在 Maven 项目中添加以下依赖:
<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>0.9</version>
</dependency>

java版本

// Segments a sample sentence with ansj, filters out stop words loaded from a
// GBK-encoded file, then merges every adverb ("d") with the adjective ("a")
// that immediately follows it and prints the resulting words one per line.
List<Term> parse = NlpAnalysis.parse("山东威海隧道发生交通事故 造成9死1伤3人失踪");
new NatureRecognition(parse).recognition(); // part-of-speech tagging

// Stop-word dictionary: every line of the stop-word file is mapped to "_stop"
// (presumably the nature FilterModifWord treats as removable - confirm against ansj docs).
HashMap<String, String> strHashMap = new HashMap<String, String>();
String stopWordTable = "F://360downloads/StopWordTable.txt";
File f = new File(stopWordTable);
try {
    // Read the stop-word file line by line.
    BufferedReader stopWordReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(f), "GBK"));
    try {
        String stopWord;
        while ((stopWord = stopWordReader.readLine()) != null) {
            strHashMap.put(stopWord, "_stop");
        }
    } finally {
        // Close even when readLine() fails, so the file handle is never leaked.
        stopWordReader.close();
    }

    FilterModifWord.setUpdateDic(strHashMap);
    List<Term> term = FilterModifWord.modifResult(parse);

    // Business rule based on part of speech: "adverb + adjective" becomes one word.
    List<String> list = new ArrayList<String>();
    int termCount = term.size();
    for (int i = 0; i < termCount; i++) {
        String word = term.get(i).getName(); // the word itself
        // The nature string may carry a ":"-separated suffix; keep only the tag.
        String nature = term.get(i).getNatrue().toString().split(":")[0];
        // System.out.println(word + ":" + nature);
        if ("d".equals(nature)) {
            // Guard i + 1: without it an adverb in last position throws
            // IndexOutOfBoundsException and no result is printed at all.
            if (i + 1 < termCount) {
                Term next = term.get(i + 1);
                if ("a".equals(next.getNatrue().toString().split(":")[0])) {
                    list.add(word + next.getName());
                }
            }
            // An adverb not followed by an adjective is intentionally dropped
            // (same as the original behavior).
        } else {
            boolean alreadyMerged = i > 0
                    && "d".equals(term.get(i - 1).getNatrue().toString().split(":")[0])
                    && "a".equals(nature);
            if (alreadyMerged) {
                // This adjective was already emitted together with the preceding adverb.
                System.out.println("移除这一条数据");
            } else {
                list.add(word);
            }
        }
    }

    for (String w : list) {
        System.out.println(w);
    }
} catch (Exception e) {
    e.printStackTrace();
}

scala版本

// Segments a sample sentence with ansj and removes the stop words listed in a
// GBK-encoded stop-word file, then prints the remaining terms.
// The sentence must stay on a single line: an ordinary "..." literal cannot span lines.
val parse = ToAnalysis.parse("山东威海隧道发生交通事故 造成9死1伤3人失踪")
// new NatureRecognition(parse).recognition() // part-of-speech tagging (disabled here)

// Stop-word dictionary: every line of the file is mapped to "_stop"
// (presumably the nature FilterModifWord treats as removable - confirm against ansj docs).
val strHashMap = new util.HashMap[String, String]
val file = Source.fromFile("F://360downloads/StopWordTable.txt", "GBK")
try {
  file.getLines().foreach(line => strHashMap.put(line, "_stop"))
} finally {
  // Release the file handle even if reading fails part-way through.
  file.close()
}

FilterModifWord.setUpdateDic(strHashMap)
val term = FilterModifWord.modifResult(parse)
println(term)

原创粉丝点击