Integrating Lucene 7.0 with the HanLP Analyzer: Building Index Files from a Database


HanLP official site: http://hanlp.linrunsoft.com/

HanLP on GitHub: https://github.com/hankcs/HanLP

HanLP Lucene plugin: https://github.com/hankcs/hanlp-lucene-plugin


The following jar packages are required; in Maven-coordinate form they correspond roughly to the list below.
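These coordinates are an assumption reconstructed from the imports in the code, not taken from the original post; match the versions to your own environment:

- org.apache.lucene:lucene-core:7.0.1
- org.apache.lucene:lucene-queryparser:7.0.1 (only needed for the search sketch at the end of this post)
- com.hankcs:hanlp:portable-1.6.8
- com.hankcs.nlp:hanlp-lucene-plugin:1.1.7
- mysql:mysql-connector-java:5.1.46
- junit:junit:4.12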



package com.kyd.demo.hanLP;

import java.io.IOException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;

import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;

/**
 * Builds Lucene index files from database fields.
 *
 * @author zhengzhen
 */
public class JdbcIndexDemo {

    public static void main(String[] args) {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://192.168.100.69:3306/xxxx?useUnicode=true&characterEncoding=utf8&autoReconnect=true&failOverReadOnly=false";
            String userName = "root";
            String password = "root";
            String sql = "select * from xxxx";
            try (Connection conn = DriverManager.getConnection(url, userName, password);
                 PreparedStatement sta = conn.prepareStatement(sql);
                 ResultSet rs = sta.executeQuery()) {
                /*
                 * 1. Set the path where the index files are stored.
                 */
                Directory directory = FSDirectory.open(Paths.get("xxxx_index"));
                /*
                 * 2. Create the analyzer.
                 */
                Analyzer analyzer = new HanLPIndexAnalyzer();
                /*
                 * 3. Configure the writer with the analyzer.
                 */
                IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
                indexWriterConfig.setOpenMode(OpenMode.CREATE);
                /*
                 * 4. Create the index writer.
                 */
                IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
                /*
                 * 5. Loop over the result set, building one document per row.
                 */
                while (rs.next()) {
                    /*
                     * 5.1 Create the document.
                     */
                    Document document = new Document();
                    /*
                     * 5.2 Add the fields.
                     */
                    Long id = rs.getLong("unitId");
                    IndexableField unitIdField = new StringField("unitId", String.valueOf(id), Store.YES);
                    document.add(unitIdField);

                    String title = rs.getString("title");
                    if (title != null) {
                        IndexableField sectionNameField = new TextField("sectionName", title, Store.YES);
                        document.add(sectionNameField);
                    }
                    String unitName = rs.getString("unitName");
                    if (unitName != null) {
                        IndexableField unitNameField = new TextField("unitName", unitName, Store.YES);
                        document.add(unitNameField);
                    }
                    String courseName = rs.getString("courseName");
                    if (courseName != null) {
                        IndexableField courseNameField = new TextField("courseName", courseName, Store.YES);
                        document.add(courseNameField);
                    }
                    String startPage = rs.getString("startPage");
                    if (startPage != null) {
                        IndexableField startPageField = new StringField("startPage", startPage, Store.YES);
                        document.add(startPageField);
                    }
                    String endPage = rs.getString("endPage");
                    if (endPage != null) {
                        IndexableField endPageField = new StringField("endPage", endPage, Store.YES);
                        document.add(endPageField);
                    }
                    indexWriter.addDocument(document);
                }
                indexWriter.commit();
                // Release the index files once writing is done.
                indexWriter.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        } catch (ClassNotFoundException e1) {
            e1.printStackTrace();
        }
    }

    /**
     * HanLPAnalyzer does not split long words: e.g. the long word
     * "中华人民共和国" is kept as a single token.
     *
     * @throws IOException
     */
    @Test
    public void hanLPAnalyzerTest() throws IOException {
        String text = "中华人民共和国很辽阔";
        for (int i = 0; i < text.length(); ++i) {
            System.out.print(text.charAt(i) + "" + i + " ");
        }
        System.out.println();
        Analyzer analyzer = new HanLPAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // Character offsets
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // Position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            System.out.println(attribute + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + positionAttr.getPositionIncrement());
        }
        /* Output:
         * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9
         * 中华人民共和国 0 7 1
         * 很 7 8 1
         * 辽阔 8 10 1
         */
    }

    /**
     * HanLPIndexAnalyzer splits long words for indexing: "中华人民共和国"
     * is tokenized into "中华人民共和国", "中华", "人民", and so on.
     *
     * @throws IOException
     */
    @Test
    public void hanLPIndexAnalyzerTest() throws IOException {
        String text = "中华人民共和国很辽阔";
        for (int i = 0; i < text.length(); ++i) {
            System.out.print(text.charAt(i) + "" + i + " ");
        }
        System.out.println();
        Analyzer analyzer = new HanLPIndexAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // Character offsets
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // Position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            System.out.println(attribute + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + positionAttr.getPositionIncrement());
        }
        /* Output:
         * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9
         * 中华人民共和国 0 7 1
         * 中华人民 0 4 1
         * 中华 0 2 1
         * 华人 1 3 1
         * 人民共和国 2 7 1
         * 人民 2 4 1
         * 共和国 4 7 1
         * 共和 4 6 1
         * 很 7 8 1
         * 辽阔 8 10 1
         */
    }
}
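For completeness, here is a minimal search sketch against the index built above. It is not part of the original post: the index path "xxxx_index" and the field names "unitId"/"sectionName" are simply reused from the indexing code, and the class name JdbcSearchDemo is made up for illustration.

package com.kyd.demo.hanLP;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.hankcs.lucene.HanLPAnalyzer;

/**
 * Minimal search sketch against the index built by JdbcIndexDemo.
 * Path and field names are reused from the indexing code above;
 * adjust them to your own schema.
 */
public class JdbcSearchDemo {

    public static void main(String[] args) throws Exception {
        // Open the index directory written by JdbcIndexDemo.
        try (Directory directory = FSDirectory.open(Paths.get("xxxx_index"));
             DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Query-time analysis uses HanLPAnalyzer, so the query string is
            // segmented into whole words, which then match the finer-grained
            // tokens that HanLPIndexAnalyzer produced at index time.
            Analyzer analyzer = new HanLPAnalyzer();
            QueryParser parser = new QueryParser("sectionName", analyzer);
            Query query = parser.parse("中华人民共和国");
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println(doc.get("unitId") + "\t" + doc.get("sectionName"));
            }
        }
    }
}

Indexing fine-grained while querying coarse-grained is the common pairing for Chinese search: the fine-grained index keeps recall high, while the coarse-grained query analyzer avoids breaking the user's words into fragments.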

