PyLucene学习之二
来源:互联网 发布:linux 韦东山 编辑:程序博客网 时间:2024/05/20 00:12
文档和域
文档是Lucene索引和搜索的原子单位,文档为包含一个或多个域的容器,而域则依次包含“真正的”被索引内容。
索引
提取文本->创建对应Document实例->通过分析将域文本处理成大量语汇单元->将语汇单元加入段结构
使用倒排索引的数据结构进行存储,能够有效地利用磁盘空间,把文档中提取出的语汇单元作为查询关键字
索引步骤
1 首先创建Directory对象用于存放索引
store=SimpleFSDirectory(File(storeDir))
2 接下来在Directory对象上创建IndexWriter对象
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
3 创建Document对象和Fields对象,并将Document加入索引
域
域索引选项
Field.Index.* 通过倒排索引来控制域文本是否可被索引。
Index.ANALYZED:被分析器分析,分析器提供的主要功能是将文本处理成大量语汇单元,例如文章正文等就需要被解析。
Index.NOT_ANALYZED:对当前域不进行分析,例如一些不想被改变的内容。
域存储选项
Field.Store.* 用来确定是否需要存储域的真实值, 以便后续搜索时能恢复这个值。
域的项向量选项
项向量是介于索引域和存储域的一个中间结构。
域选项组合
例如content的FieldType:
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
索引代码:
#!/usr/bin/env python#coding:utf-8INDEX_DIR = "IndexFiles.index"import sys, os, lucene, threading, timefrom datetime import datetimefrom java.io import Filefrom org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzerfrom org.apache.lucene.analysis.core import WhitespaceAnalyzerfrom org.apache.lucene.document import Document, Field, FieldTypefrom org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfigfrom org.apache.lucene.store import SimpleFSDirectoryfrom org.apache.lucene.util import Version"""This class is loosely based on the Lucene (java implementation) demo classorg.apache.lucene.demo.IndexFiles. It will take a directory as an argumentand will index all of the files in that directory and downward recursively.It will index on the file path, the file name and the file contents. Theresulting Lucene index will be placed in the current directory and called'index'."""class Ticker(object): def __init__(self): self.tick = True def run(self): while self.tick: sys.stdout.write('.') sys.stdout.flush() time.sleep(1.0)class IndexFiles(object): """Usage: python IndexFiles <doc_dand will index all of the files in that directory and downward recursively.irectory>""" def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.indexDocs(root, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done' def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) 
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): for filename in filenames: #print filename if not filename.endswith('.txt'): continue print "adding", filename #try: path = os.path.join(root, filename) file = open(path) # contents = unicode(file.read(), 'iso-8859-1') contents = unicode(file.read(), 'utf-8') file.close() if len(contents) > 0: sentences = contents.split('###') i = 0 for sentence in sentences: i += 1 #print i doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) doc.add(Field("sentence_id", str(i), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("contents", sentence, t2)) writer.addDocument(doc) else: print "warning: no content in %s" % filenameif __name__ == '__main__': lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION start = datetime.now() #try: base_dir = os.path.dirname(os.path.abspath('.')) print base_dir IndexFiles(".", os.path.join(base_dir, INDEX_DIR), WhitespaceAnalyzer(Version.LUCENE_CURRENT)) end = datetime.now() print end - start
搜索
#!/usr/bin/env python#coding:utf-8INDEX_DIR = "IndexFiles.index"import sys, os, lucenefrom java.io import Filefrom org.apache.lucene.analysis.standard import StandardAnalyzerfrom org.apache.lucene.analysis.core import WhitespaceAnalyzerfrom org.apache.lucene.index import DirectoryReader, IndexReader, Termfrom org.apache.lucene.queryparser.classic import QueryParserfrom org.apache.lucene.store import SimpleFSDirectoryfrom org.apache.lucene.search import IndexSearcher, Explanationfrom org.apache.lucene.util import Version"""This script is loosely based on the Lucene (java implementation) demo classorg.apache.lucene.demo.SearchFiles. It will prompt for a search query, then itwill search the Lucene index in the current directory called 'index' for thesearch query entered against the 'contents' field. It will then display the'path' and 'name' fields for each of the hits it finds in the index. Note thatsearch.close() is currently commented out because it causes a stack overflow insome cases."""def run(searcher, analyzer, reader): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") if command == '': return print print "Searching for:", command term = Term("contents", command) print term.toString() term_vector = reader.totalTermFreq(term) print "%s total terms" % term_vector query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command) scoreDocs = searcher.search(query, 10000).scoreDocs print "%s total matching documents." 
% len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) explanation = searcher.explain(query, scoreDoc.doc) #print explanation.toString() print 'path:', doc.get("path"), 'name:', doc.get("name"), doc.get("sentence_id")if __name__ == '__main__': lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION base_dir = os.path.dirname(os.path.abspath(".")) print base_dir directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR))) searcher = IndexSearcher(DirectoryReader.open(directory)) reader = IndexReader.open(directory) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) run(searcher, analyzer, reader) del searcher
很好的参考文献:http://www.cppblog.com/baby-fly/archive/2010/03/08/109189.html
0 0
- PyLucene学习之二
- PyLucene学习之三
- PyLucene学习之一
- PyLucene学习笔记 文件索引及检索
- pylucene入门
- PyLucene安装
- PyLucene实战
- [pylucene]Pylucene不能导入SmartChineseAnalyzer的解决办法
- [PyLucene]使用PyLucene中JVM的疑问
- STL学习之(二)
- DWR学习 之二
- 学习STL之二
- JCL学习之二
- ETL学习之二
- JSON学习之二
- 学习ext之二
- 批处理学习之二
- CUDA学习之二
- nodejs小问题:Ubuntu15下安装Node.js4.1.1
- 二维码扫描
- 策略模式
- 数据结构-散列表
- 《6》命令模式
- PyLucene学习之二
- Java中队列的使用
- 01.CocosCreator开发环境搭建
- 我们编程吧 之 git 学习手册v1.0
- MySQL分片水很深
- 快速漂亮的找出Linux下的大文件
- java synchronized的理解以及内置锁和对象锁
- c#之字符串的不可变性
- 【C#之值类型vs引用类型】