全文检索Lucene(二)--特定项搜索与queryParser搜索
来源:互联网 发布:光能使者知乎 编辑:程序博客网 时间:2024/05/17 21:06
一、对特定项的搜索
1、准备数据源信息,这里准备了7个txt文档(纯英文)。
2、建立索引信息:使用标准分词器,通过三个文本域建立索引,分别是fileName(存储在索引目录中)、fullPath(存储)、contents(不存储)。
package com.feiyang.lucene;import java.io.File;import java.io.FileReader;import java.io.IOException;import java.nio.file.Paths;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;public class Indexer {private IndexWriter writer;// 写索引实例/** * 构造方法 实例化写索引 * @param indexDir * @throws Exception */public Indexer(String indexDir) throws Exception {Directory dir = FSDirectory.open(Paths.get(indexDir));Analyzer analyzer = new StandardAnalyzer();//标准分词器IndexWriterConfig conf = new IndexWriterConfig(analyzer);writer = new IndexWriter(dir, conf);}//关闭写索引public void close() throws Exception{writer.close();}/** * 索引指定目录的所有文件 * @param dataDir * @throws Exception */public int index(String dataDir)throws Exception{File[] files = new File(dataDir).listFiles();for(File f:files){indexFile(f);}return writer.numDocs();}/** * 索引指定文件 * @param f * @throws IOException */private void indexFile(File f) throws IOException {// TODO Auto-generated method stubSystem.out.println("索引文件:"+f.getCanonicalPath());Document document = getDocument(f);writer.addDocument(document);}/** * 获取文档 ,文档里再设置每个字段 * @param f */private Document getDocument(File f) throws IOException{// TODO Auto-generated method stubDocument doc = new Document();doc.add(new TextField("contents",new FileReader(f)));doc.add(new TextField("fileName", f.getName(),Field.Store.YES));doc.add(new TextField("fullPath", f.getCanonicalPath(),Field.Store.YES));return doc;}public static void main(String[] args) {String indexDir="D:\\lucene4";String dataDir="D:\\lucene4\\data";Indexer indexer = null;int numIndexed = 0;Long startTime = System.currentTimeMillis();try {indexer = new 
Indexer(indexDir);numIndexed = indexer.index(dataDir);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}finally {try {indexer.close();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}}Long endTime = System.currentTimeMillis();System.out.println("索引:"+numIndexed+"个文件,花费了"+(endTime-startTime)+"毫秒");//System.out.println("");}}
测试结果为:可以看到我们的七个文件都已经成功添加索引。
3、对特定项进行搜索。
package com.feiyang.lucene;import static org.junit.Assert.*;import java.nio.file.Paths;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.junit.After;import org.junit.Before;import org.junit.Test;public class SearchTest {private Directory dir;private IndexReader reader ;private IndexSearcher search;@Beforepublic void setUp() throws Exception {dir = FSDirectory.open(Paths.get("D:\\lucene4"));reader = DirectoryReader.open(dir);search = new IndexSearcher(reader);}@Testpublic void test() {fail("Not yet implemented");}@Afterpublic void tearDown() throws Exception {reader.close();}/** * 对特定项搜索 * @throws Exception */@Testpublic void testTermQuery() throws Exception{String searchField = "contents";String q = "particular";Term term = new Term(searchField, q);Query query = new TermQuery(term);TopDocs hits = search.search(query, 10);System.out.println("匹配 '"+q+"',总共查询到"+hits.totalHits+"个文档");for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=search.doc(scoreDoc.doc);System.out.println(doc.get("fullPath"));}}}
测试结果:成功匹配到4个文本中包含特定项‘particular’。
这个时候我们将特定项‘particular’改成‘particula’,去掉了最后一个字母,再进行测试。
测试结果:我们可以看到少了最后一个字母的特定项,没有检索出数据。
解释:我们的分词器会将我们指定的fileName、fullPath、contents的内容进行分词,而基于特定项的检索(TermQuery)不会对查询词再做分词,只会与分词后得到的词项进行精确匹配,因此‘particula’无法匹配到‘particular’。所以这种方法在我们的环境中不经常使用。
二、查询表达式:queryParser
通过 Query query = parser.parse("查询的关键词") 解析查询表达式,可以对关键词进行表达式设计(例如使用 AND、~ 等语法),从而搜索信息。
/** * 解析查询 * @throws Exception */@Testpublic void testQueryParse()throws Exception{Analyzer analyzer = new StandardAnalyzer();String searchField = "contents";String[] q ={"particular","particular java","particular AND commercial ","particula","particula~"};//查询特定项particular信息//String q1 = "particular";//查询获取particular或者java的信息//String q2 = "particular java";//查询获取particular与java的信息//String q3 = "particular AND java";//查询特定项particula的信息//String q4 = "particula";//查询particula相近的信息//String q5 = "particula~";QueryParser parser = new QueryParser(searchField, analyzer);for(int i=0;i<q.length;i++){Query query = parser.parse(q[i]);TopDocs hits = search.search(query, 10);System.out.println("---------查询到'"+q[i]+"',共有"+hits.totalHits+"条信息---------");for(ScoreDoc scoreDoc :hits.scoreDocs){Document document = search.doc(scoreDoc.doc);System.out.println(document.get("fullPath"));}}}
测试结果:
三、其他查询方式:
1、指定项范围查询TermRangeQuery
2、指定数字范围查询NumericRangeQuery
3、指定字符串开头搜索PrefixQuery
4、组合查询BooleanQuery
package com.feiyang.lucene;import static org.junit.Assert.fail;import java.io.IOException;import java.nio.file.Paths;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.search.BooleanClause;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.NumericRangeQuery;import org.apache.lucene.search.PrefixQuery;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermRangeQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.junit.After;import org.junit.Before;import org.junit.Test;public class SearchTest2 {private Directory dir;private IndexReader reader ;private IndexSearcher search;@Beforepublic void setUp() throws Exception {dir = FSDirectory.open(Paths.get("D:\\lucene5"));reader = DirectoryReader.open(dir);search = new IndexSearcher(reader);}@Testpublic void test() {fail("Not yet implemented");}@Afterpublic void tearDown() throws Exception {reader.close();}/** * 对特定项范围搜索:TermRangeQuery * @throws IOException * @throws Exception */@Testpublic void testTermRangeQuery() throws IOException{TermRangeQuery query = new TermRangeQuery("desc", new BytesRef("a".getBytes()), new BytesRef("c".getBytes()), true, true);TopDocs hits = search.search(query, 10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=search.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}/** * 指定数字范围查询:NumbericRangeQuery * @throws Exception */@Testpublic void testNumbericRangeQuery()throws Exception{NumericRangeQuery<Integer> query = NumericRangeQuery.newIntRange("id", 1, 2, true, true);TopDocs hits = search.search(query, 10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document 
doc=search.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}/** * 指定字符串开头搜索:PrefixQuery * @throws Exception */@Testpublic void testPrefixQuery()throws Exception{String searchField = "city";String q = "q";Term term = new Term(searchField, q);PrefixQuery query = new PrefixQuery(term);TopDocs hits = search.search(query, 10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=search.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}/** * 组合查询 * @throws Exception */@Testpublic void testBooleanQuery()throws Exception{NumericRangeQuery<Integer> query = NumericRangeQuery.newIntRange("id", 1, 2, true, true);BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();//BooleanClause.Occur.MUST:包含//BooleanClause.Occur.MUST_NOT:不包含//BooleanClause.Occur.SHOULD:或者booleanQuery.add(query, BooleanClause.Occur.MUST);TopDocs hits = search.search(booleanQuery.build(), 10);for(ScoreDoc scoreDoc:hits.scoreDocs){Document doc=search.doc(scoreDoc.doc);System.out.println(doc.get("id"));System.out.println(doc.get("city"));System.out.println(doc.get("desc"));}}}
0 1
- 全文检索Lucene(二)--特定项搜索与queryParser搜索
- Lucene学习笔记之(四)特定项进行搜索
- Lucene全文搜索学习笔记(二)
- [搜索] Lucene全文检索的基本原理
- 分布式搜索 Lucene全文检索基本原理
- 分布式搜索 Lucene全文检索基本原理
- lucene,基于QueryParser的搜索
- 【Lucene】Apache Lucene全文检索引擎架构之搜索功能
- 【搜索那些事】细谈lucene(一)初识全文资源检索框架lucene
- 【搜索那些事】细谈lucene(一)初识全文资源检索框架lucene
- 【搜索那些事】细谈lucene(一)初识全文资源检索框架lucene
- Lucene高级搜素(Query、QueryParser和分页搜索)
- Lucene的QueryParser搜索时出现ParseException
- 使用Lucene进行全文检索(三)---进行搜索
- 全文检索Lucene入门之创建索引及简单搜索
- Apache Lucene全文检索引擎架构之搜索功能
- 项目中用到的全文搜索(lucene与Compass)
- 全文搜索---Solr(它与lucene的关系)
- HDU 1312 Red and Black
- java中volatile关键字的含义
- VBS获取当前文件目录
- RationalRose2003破解文件
- 游戏编程模式:前言(架构,性能和游戏)(Part III)
- 全文检索Lucene(二)--特定项搜索与queryParser搜索
- c3p0配置在spring配置文件中的详细配置
- Android权限大全
- 排序算法之快速排序、归并排序(java实现)
- 本程序员要回家啦
- 浅谈当前情况下的视频内容收费
- Android Studio 表格布局实现登录界面
- Adapter中常见遇到的NullPointerException
- Python入门小项目