Learning PyLucene, Part 2


Documents and Fields

A document is the atomic unit of indexing and searching in Lucene. A document is a container holding one or more fields, and each field in turn contains the content that is actually indexed.
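As a minimal sketch (using the classic Field constructor that also appears in the indexing code later in this post; the field names and values here are made up for illustration):

doc = Document()    # the atomic unit of indexing and searching
# an analyzed text field: its value will be broken into tokens
doc.add(Field("title", "PyLucene notes", Field.Store.YES, Field.Index.ANALYZED))
# an identifier field kept verbatim as a single token
doc.add(Field("doc_id", "doc-001", Field.Store.YES, Field.Index.NOT_ANALYZED))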


Indexing

Extract the text -> create a corresponding Document instance -> analyze the field text into a stream of tokens -> add the tokens to the segment structure.
The index is stored as an inverted index, a data structure that makes efficient use of disk space and treats the tokens extracted from documents as the lookup keys for queries.
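To make the inverted-index idea concrete, here is its shape sketched as a plain Python dict. This is illustrative only: Lucene keeps this structure in segment files on disk, and its analysis is far richer than a whitespace split.

docs = {1: "hello world", 2: "hello lucene"}

inverted = {}
for doc_id, text in docs.items():
    for token in text.split():    # "analysis" reduced to a whitespace split
        inverted.setdefault(token, []).append(doc_id)

# maps each token to the ids of the documents that contain it,
# e.g. inverted["hello"] == [1, 2]
print inverted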

Indexing steps

1 First, create a Directory object to hold the index:

store = SimpleFSDirectory(File(storeDir))

2 Next, create an IndexWriter on top of the Directory:

config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)

3 Create Document and Field objects, and add each Document to the index, as sketched below.
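A minimal sketch of step 3 (t1 and t2 are the FieldType objects defined in the full indexing code further down; filename and sentence stand in for real values):

doc = Document()
doc.add(Field("name", filename, t1))        # file name: stored, not tokenized
doc.add(Field("contents", sentence, t2))    # body text: tokenized, not stored
writer.addDocument(doc)                     # hand the finished document to the IndexWriter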


Field indexing options

Field.Index.* controls whether, and how, a field's text is made searchable through the inverted index.
Index.ANALYZED: the field is run through an analyzer, whose main job is to break the text into a stream of tokens; running text such as an article body should be analyzed.
Index.NOT_ANALYZED: the field is not analyzed and is indexed as a single token, for values that must not be altered (see the sketch below).
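A short sketch of the difference, reusing the classic Field constructor that the indexing code below applies to sentence_id (the field names here are hypothetical):

# running text: let the analyzer break it into tokens
doc.add(Field("body", body_text, Field.Store.YES, Field.Index.ANALYZED))
# an identifier: index it verbatim as a single token
doc.add(Field("sku", "A-1234", Field.Store.YES, Field.Index.NOT_ANALYZED))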

Field storage options

Field.Store.* determines whether the field's original value is stored in the index, so that the value can be recovered later at search time.
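With the field setup used in this post (name stored, contents indexed but not stored), the effect shows up when reading back a hit at search time; a sketch, assuming the searcher and scoreDoc from the search code at the end of this post:

hit = searcher.doc(scoreDoc.doc)    # re-materialize the stored fields of a hit
print hit.get("name")               # works: "name" was stored (Store.YES)
print hit.get("contents")           # None: "contents" is searchable but not stored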

Field term vector options

A term vector is an intermediate structure between an indexed field and a stored field: for each document it records the field's terms, rather like a miniature inverted index scoped to that one document.
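In the FieldType-based API this post uses, term vectors are switched on per field type; a sketch using the standard Lucene 4.x FieldType setters:

t = FieldType()
t.setIndexed(True)
t.setTokenized(True)
t.setStoreTermVectors(True)           # record a per-document term vector
t.setStoreTermVectorPositions(True)   # with token positions...
t.setStoreTermVectorOffsets(True)     # ...and character offsets (WITH_POSITIONS_OFFSETS in the table below)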

Combining field options

Index option            Store option   Term vector              Example use
NOT_ANALYZED_NO_NORMS   YES            NO                       identifiers, names, phone numbers, dates
ANALYZED                YES            WITH_POSITIONS_OFFSETS   document title, abstract
ANALYZED                NO             WITH_POSITIONS_OFFSETS   document body
NO                      YES            NO                       document type, database primary key
NOT_ANALYZED            NO             NO                       hidden keywords

For example, the FieldType for the contents field:

t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

Indexing code:

#!/usr/bin/env python
# coding:utf-8

INDEX_DIR = "IndexFiles.index"

import sys, os, lucene, threading, time
from datetime import datetime

from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles.  It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents.  The
resulting Lucene index will be placed in the current directory and called
'index'.
"""


class Ticker(object):

    def __init__(self):
        self.tick = True

    def run(self):
        # print a dot every second while the index is being committed
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)


class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""

    def __init__(self, root, storeDir, analyzer):
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'

    def indexDocs(self, root, writer):
        # t1: file name and path -- stored and indexed, but not tokenized
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # t2: contents -- tokenized and indexed with positions, not stored
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                if len(contents) > 0:
                    # each file holds sentences separated by '###';
                    # every sentence becomes its own Document
                    sentences = contents.split('###')
                    i = 0
                    for sentence in sentences:
                        i += 1
                        doc = Document()
                        doc.add(Field("name", filename, t1))
                        doc.add(Field("path", root, t1))
                        doc.add(Field("sentence_id", str(i),
                                      Field.Store.YES, Field.Index.NOT_ANALYZED))
                        doc.add(Field("contents", sentence, t2))
                        writer.addDocument(doc)
                else:
                    print "warning: no content in %s" % filename


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    base_dir = os.path.dirname(os.path.abspath('.'))
    print base_dir
    IndexFiles(".", os.path.join(base_dir, INDEX_DIR),
               WhitespaceAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    print end - start

Searching

#!/usr/bin/env python
# coding:utf-8

INDEX_DIR = "IndexFiles.index"

import sys, os, lucene

from java.io import File
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, Explanation
from org.apache.lucene.util import Version

"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles.  It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field.  It will then display the
'path' and 'name' fields for each of the hits it finds in the index.  Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""


def run(searcher, analyzer, reader):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        term = Term("contents", command)
        print term.toString()
        # totalTermFreq: total number of occurrences of this term across all documents
        freq = reader.totalTermFreq(term)
        print "%s total occurrences of this term" % freq

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 10000).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            explanation = searcher.explain(query, scoreDoc.doc)
            # print explanation.toString()  # uncomment to see how the score was computed
            print 'path:', doc.get("path"), 'name:', doc.get("name"), doc.get("sentence_id")


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath("."))
    print base_dir
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    reader = DirectoryReader.open(directory)    # one reader, shared by the searcher
    searcher = IndexSearcher(reader)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer, reader)
    del searcher

A good reference: http://www.cppblog.com/baby-fly/archive/2010/03/08/109189.html
