Lucene 在Ubuntu+Python2的环境下进行搜索

来源:互联网 发布:java web开发与实战 编辑:程序博客网 时间:2024/06/15 17:54

Lucene 在Ubuntu+Python2的环境下进行搜索

IndexFiles.py:

import sys, os, lucene, threading, time, tracebackfrom datetime import datetimefrom java.io import Filefrom org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzerfrom org.apache.lucene.analysis.standard import StandardAnalyzerfrom org.apache.lucene.document import TextField, Document, Field, FieldTypefrom org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfigfrom org.apache.lucene.store import SimpleFSDirectoryfrom org.apache.lucene.util import Versionfrom java.nio.file import Paths"""This class is loosely based on the Lucene (java implementation) demo classorg.apache.lucene.demo.IndexFiles.  It will take a directory as an argumentand will index all of the files in that directory and downward recursively.It will index on the file path, the file name and the file contents.  Theresulting Lucene index will be placed in the current directory and called'index'."""class Ticker(object):    def __init__(self):        self.tick = True    def run(self):        while self.tick:            sys.stdout.write('.')            sys.stdout.flush()            time.sleep(1.0)class IndexFiles(object):    """Usage: python IndexFiles <doc_directory>"""    def __init__(self, root, storeDir, analyzer):        if not os.path.exists(storeDir):            os.mkdir(storeDir)        path = Paths.get(storeDir)        store = SimpleFSDirectory(path)        config = IndexWriterConfig(analyzer)        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)        writer = IndexWriter(store, config)        self.indexDocs(root, writer)        ticker = Ticker()        print 'optimizing index',        threading.Thread(target=ticker.run).start()        writer.close()        ticker.tick = False        print 'done'    def indexDocs(self, root, writer):        for root, dirnames, filenames in os.walk(root):            print root            try:                sroot = unicode(root, 'GBK')                print sroot            except:                print 
"*****************************unicode error"                print root                continue            #FieldType:            t1 = FieldType()            t1.setStored(True)            t1.setTokenized(False)            t2 = FieldType()            t2.setStored(True)            t2.setTokenized(True)            #add dir            doc = Document()            #doc.add(TextField("path", sroot,Field.Store.YES))            #doc.add(TextField("name", sroot,Field.Store.YES))            #writer.addDocument(doc)            for filename in filenames:                try:                    filename = unicode(filename, 'GBK')                    print filename                except:                    print "*****************************unicode error"                    print filename                    continue                print "adding", filename                try:                    path = unicode(root, 'GBK')                    path =os.path.join(sroot, filename)                    print "read file: ", path                    file = open(path)                    #contents = unicode(file.read(), 'iso-8859-1')                    #contents = unicode(file.read(), 'GBK')                    contents = file.read()                    print "contents is:", contents                    file.close()                    doc = Document()                    doc.add(TextField("path", path,                                           Field.Store.YES))                    doc.add(TextField("name", filename,                                           Field.Store.YES))                    if len(contents) > 0:                        doc.add(TextField("contents", contents,                                               Field.Store.YES))                    else:                        print "warning: no content in %s" % filename                    writer.addDocument(doc)                    writer.commit()                    print "[THIS FILE]:",doc                except Exception, e:                
    print "Failed in indexDocs:", e__debug = 0if __name__ == '__main__':    if __debug != 1:        if len(sys.argv) < 2:            print IndexFiles.__doc__            sys.exit(1)    print 'Lucene', lucene.VERSION    start = datetime.now()    lucene.initVM()    try:        if __debug != 1:            IndexFiles(sys.argv[1], "index", StandardAnalyzer())        else:            IndexFiles(r'../corpus/', "index", StandardAnalyzer())        end = datetime.now()        print end - start    except Exception, e:        print "Failed: ", e        print "traceback: ", traceback.print_exc()

SearchFiles.py:

import sys, os, lucenefrom java.io import Filefrom org.apache.lucene.analysis.standard import StandardAnalyzerfrom org.apache.lucene.index import DirectoryReaderfrom org.apache.lucene.index import Termfrom org.apache.lucene.queryparser.classic import QueryParserfrom org.apache.lucene.store import SimpleFSDirectoryfrom org.apache.lucene.search import IndexSearcherfrom org.apache.lucene.search import Query, TermQueryfrom org.apache.lucene.util import Versionfrom java.nio.file import Paths"""This script is loosely based on the Lucene (java implementation) demo classorg.apache.lucene.demo.SearchFiles.  It will prompt for a search query, then itwill search the Lucene index in the current directory called 'index' for thesearch query entered against the 'contents' field.  It will then display the'path' and 'name' fields for each of the hits it finds in the index.  Note thatsearch.close() is currently commented out because it causes a stack overflow insome cases."""def run(searcher, analyzer):    while True:        print        print "Hit enter with no input to quit."        command = raw_input("Query:")        command = unicode(command, 'GBK')        if command == '':            return        print        print "Searching for:", command        parser = QueryParser("contents",analyzer)        query = parser.parse(command)        #query = QueryParser("name", analyzer).parse(command)        hits = searcher.search(query, 1000)        print "%s total matching documents." % hits.totalHits        scoredocs = hits.scoreDocs        for docs in scoredocs:            doc = searcher.doc(docs.doc)            print "path: ",doc.get("path"),"name: ",doc.get("name")if __name__ == '__main__':    STORE_DIR = "index"    print 'Lucene', lucene.VERSION    lucene.initVM()    directory = SimpleFSDirectory(Paths.get(STORE_DIR))    searcher = IndexSearcher(DirectoryReader.open(directory))    analyzer = StandardAnalyzer()    run(searcher, analyzer)

运行过程中首先运行IndexFiles.py建立索引:

sudo python IndexFiles.py ../corpus/

corpus下放的是语料文件。

然后进行搜索:

sudo python SearchFiles.py

然后在Query后面写相关搜索内容。

原创粉丝点击