HanLPTokenizer HanLP分词器

来源:互联网 发布:软件演示平台 编辑:程序博客网 时间:2024/06/05 05:02
HanLP在功能上的扩展主要体现在以下几个方面:
•关键词提取 
•自动摘要
•短语提取 
•拼音转换
•简繁转换

•文本推荐


下面是 HanLP 分词器的代码

注:使用maven依赖 

<dependency>  
   <groupId>com.hankcs</groupId>  
   <artifactId>hanlp</artifactId>  
   <version>portable-1.3.4</version>  
</dependency> 

使用了java8进行处理

import java.util.ArrayList;import java.util.List;import java.util.stream.Collectors;import org.apache.commons.lang3.StringUtils;import com.hankcs.hanlp.seg.Segment;import com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment;import com.hankcs.hanlp.seg.NShort.NShortSegment;import com.hankcs.hanlp.tokenizer.IndexTokenizer;import com.hankcs.hanlp.tokenizer.NLPTokenizer;import com.hankcs.hanlp.tokenizer.SpeedTokenizer;import com.hankcs.hanlp.tokenizer.StandardTokenizer;public class HanLPTokenizer {private static final Segment N_SHORT_SEGMENT = new NShortSegment().enableCustomDictionary(false).enablePlaceRecognize(true).enableOrganizationRecognize(true);private static final Segment DIJKSTRA_SEGMENT = new DijkstraSegment().enableCustomDictionary(false).enablePlaceRecognize(true).enableOrganizationRecognize(true);/*** 标准分词* @param text* @return*/public static List<String> standard(String text) {List<String> list = new ArrayList<String>();StandardTokenizer.segment(text).forEach(term -> {if (StringUtils.isNotBlank(term.word)) {list.add(term.word);}});return list.stream().distinct().collect(Collectors.toList());}/*** NLP分词* @param text* @return*/public static List<String> nlp(String text) {List<String> list = new ArrayList<String>();NLPTokenizer.segment(text).forEach(term -> {if (StringUtils.isNotBlank(term.word)) {list.add(term.word);}});return list.stream().distinct().collect(Collectors.toList());}/*** 索引分词* @param text* @return*/public static List<String> index(String text) {List<String> list = new ArrayList<String>();IndexTokenizer.segment(text).forEach(term -> {if (StringUtils.isNotBlank(term.word)) {list.add(term.word);}});return list.stream().distinct().collect(Collectors.toList());}/*** 极速词典分词* @param text* @return*/public static List<String> speed(String text) {List<String> list = new ArrayList<String>();SpeedTokenizer.segment(text).forEach(term -> {if (StringUtils.isNotBlank(term.word)) {list.add(term.word);}});return list;}/*** N-最短路径分词* @param text* @return*/public 
static List<String> nShort(String text) {List<String> list = new ArrayList<String>();N_SHORT_SEGMENT.seg(text).forEach(term -> {if (StringUtils.isNotBlank(term.word)) {list.add(term.word);}});return list.stream().distinct().collect(Collectors.toList());}/*** 最短路径分词* @param text* @return*/public static List<String> shortest(String text) {List<String> list = new ArrayList<String>();DIJKSTRA_SEGMENT.seg(text).forEach(term -> {if (StringUtils.isNotBlank(term.word)) {list.add(term.word);}});return list.stream().distinct().collect(Collectors.toList());}public static void main(String[] args) {String text = "测试勿动12";System.out.println("标准分词:" + standard(text));System.out.println("NLP分词:" + nlp(text));System.out.println("索引分词:" + index(text));System.out.println("N-最短路径分词:" + nShort(text));System.out.println("最短路径分词分词:" + shortest(text));System.out.println("极速词典分词:" + speed(text));}}


原创粉丝点击