Java、Scala 通过 ansj 分词并去除停用词

来源：互联网发布：京东数据开放平台编辑：程序博客网时间：2024/06/05 07:30

下载 ansj jar 包（Maven 依赖）：

<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>0.9</version>
</dependency>

java版本

// Segment the sample sentence with ansj's NlpAnalysis, then run NatureRecognition
// over the resulting terms.
List<Term> parse = NlpAnalysis.parse("山东威海隧道发生交通事故 造成9死1伤3人失踪");
new NatureRecognition(parse).recognition(); // part-of-speech tagging

// Stop-word dictionary handed to FilterModifWord: each line of the stop-word file
// is mapped to the marker "_stop".
// NOTE(review): presumably FilterModifWord drops terms tagged "_stop" -- confirm against the ansj docs.
HashMap<String, String> strHashMap = new HashMap<String, String>();

// Read the stop-word file.
String stopWordTable = "F://360downloads/StopWordTable.txt";
File f = new File(stopWordTable);
try {
    // The stop-word file is decoded as GBK. The stream is closed in a finally
    // block so it is released even when reading fails part-way through.
    FileInputStream fileInputStream = new FileInputStream(f);
    try {
        BufferedReader stopWordFileBr =
                new BufferedReader(new InputStreamReader(fileInputStream, "GBK"));
        String stopWord;
        while ((stopWord = stopWordFileBr.readLine()) != null) {
            strHashMap.put(stopWord, "_stop");
        }
    } finally {
        fileInputStream.close();
    }

    FilterModifWord.setUpdateDic(strHashMap);
    List<Term> term = FilterModifWord.modifResult(parse);

    // Business rule based on the nature tag: a term tagged "d" that is immediately
    // followed by a term tagged "a" is merged with it into a single entry.
    // NOTE(review): "d"/"a" are presumably adverb/adjective in the ansj tag set -- confirm.
    List<String> list = new ArrayList<String>();
    for (int i = 0; i < term.size(); i++) {
        String word = term.get(i).getName(); // the word itself
        // Nature.toString() is colon-separated; the first field is the tag.
        String nature = term.get(i).getNatrue().toString().split(":")[0];
        // System.out.println(word + ":" + nature);
        if (nature.equals("d")) {
            // Guard against "d" being the last term: without the bounds check,
            // term.get(i + 1) throws IndexOutOfBoundsException and the results
            // below are never printed.
            if (i + 1 < term.size()
                    && term.get(i + 1).getNatrue().toString().split(":")[0].equals("a")) {
                list.add(word + term.get(i + 1).getName());
            }
        } else {
            boolean alreadyMerged = i > 0
                    && term.get(i - 1).getNatrue().toString().split(":")[0].equals("d")
                    && nature.equals("a");
            if (alreadyMerged) {
                // This "a" term was already emitted together with the preceding "d" term.
                System.out.println("移除这一条数据");
            } else {
                list.add(word);
            }
        }
    }

    for (int i = 0; i < list.size(); i++) {
        System.out.println(list.get(i));
    }
} catch (Exception e) {
    e.printStackTrace();
}

scala版本

// Segment the sample sentence with ansj's ToAnalysis.
val parse = ToAnalysis.parse("山东威海隧道发生交通事故 造成9死1伤3人失踪")
// Part-of-speech tagging was left commented out in this version:
// new NatureRecognition(parse).recognition()

// Stop-word dictionary handed to FilterModifWord: each line of the GBK-encoded
// stop-word file is mapped to the marker "_stop".
// NOTE(review): presumably FilterModifWord drops terms tagged "_stop" -- confirm against the ansj docs.
val strHashMap = new java.util.HashMap[String, String]
val file = Source.fromFile("F://360downloads/StopWordTable.txt", "GBK")
try {
  for (line <- file.getLines()) {
    strHashMap.put(line, "_stop")
  }
} finally {
  // Release the file handle even if reading fails part-way through.
  file.close()
}

FilterModifWord.setUpdateDic(strHashMap)
val term = FilterModifWord.modifResult(parse)
System.out.println(term)

阅读全文

0 0

java，scala通过ansj分词 并做去除停词处理

java，scala通过ansj分词并做去除停词处理