pylucene的使用

来源:互联网 发布:梦龙网络计划软件 编辑:程序博客网 时间:2024/05/17 02:23

前一段时间做东西用到了pylucene,包括建立索引,检索,高亮显示等等。贴两段代码,希望对大家有用。

pylucene的安装就不多说了,我用的版本是PyLucene-1.9.1。

建立索引:

#!/usr/bin/env python

import os
import PyLucene


class IndexFiles:
    
"""
    create index by PyLucene, just need your dir path,
    the result files saved in the directory index in
    current path
    
"""

    
def __init__(self, root, storeDir, analyzer):
        
if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        analyzer 
= PyLucene.StandardAnalyzer()       
        store 
= PyLucene.FSDirectory.getDirectory(storeDir , True)
        writer 
= PyLucene.IndexWriter(store, analyzer, True)
        self.indexDocs(root, writer)
        
print 'optimizing index',
        writer.optimize()
        writer.close()
        
print 'done'

    
def indexDocs(self, root, writer):
        
for root, dirnames, filenames in os.walk(root):
            
for filename in filenames:
                
if not filename.endswith('.txt'):
                    
continue
                
print "adding", filename
                
try:
                    path 
= os.path.join(root, filename)
                    file 
= open(path)
                    contents 
= unicode(file.read(), 'gbk')
                    file.close()
                    doc 
= PyLucene.Document()
                    doc.add(PyLucene.Field.Keyword(u
"name", filename.decode('gbk')))
                    doc.add(PyLucene.Field.Text(u
"path", path.decode('gbk')))
                    
if len(contents) > 0:
                        
pass
                        doc.add(PyLucene.Field.Text(u
"contents", contents))
                    
else:
                        
print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                
except Exception, e:
                    
print "Failed in indexDocs:", e

def indexmain(path):
    
try:
        IndexFiles(path, 
"index", PyLucene.StandardAnalyzer())
        
return ''
    
except Exception, e:
        
return e
    
if __name__ == '__main__':
    indexpath 
= raw_input("path: ")
    indexmain(indexpath)

检索,高亮显示:

 

#!/usr/bin/env python

import time
from StringIO import StringIO
from PyLucene import *


class TestFormatter(Formatter):
    """Highlighter formatter that wraps matched terms in red ``<font>`` markup."""

    def __init__(self):
        # The formatter keeps no state.
        # NOTE(review): the base Formatter initializer is not invoked here --
        # confirm that PyLucene does not require it.
        pass

    def highlightTerm(self, originalText, group):
        """Return ``originalText``, marked up when it belongs to a scoring term.

        originalText -- the token text as it appears in the document
        group        -- token group whose ``getTotalScore()`` decides whether
                        the text is highlighted (score <= 0 means no markup)
        """
        if group.getTotalScore() <= 0:
            return originalText
        # Single-quoted literals so the double quotes of the HTML attribute
        # do not terminate the string.
        return '<font color="red">' + originalText + '</font>'

class Search:
    """Run queries against the index in the ``index`` directory.

    Matching documents are returned together with highlighted fragments of
    their ``contents`` field.
    """

    def __init__(self):
        """Open the ``index`` directory and set the highlighting options."""
        STORE_DIR = "index"
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        # NOTE(review): the indexing script in this file builds the index with
        # StandardAnalyzer -- confirm ChineseAnalyzer tokenizes compatibly.
        self.analyzer = ChineseAnalyzer()
        self.maxNumFragmentsRequired = 2
        self.fragmentSeparator = u"..."

    def search(self, query, start, count=10):
        """Search the ``contents`` field and return one page of results.

        query -- query text; a byte string is decoded as GBK, a unicode
                 string is used as it is
        start -- index of the first hit to return
        count -- maximum number of hits to return (default 10)

        Returns ``(resultdic, totalnum, usetime)``:
        resultdic -- dict mapping a hit's score to ``[fragments, path]``;
                     a score that is already taken is nudged upwards in
                     steps of 0.0001 until it is unique, so no hit is lost
        totalnum  -- total number of hits for the query
        usetime   -- seconds spent searching and highlighting
        """
        # Decoding an already-unicode query would first encode it as ASCII
        # and fail on Chinese text, so only byte strings are decoded.
        if isinstance(query, str):
            query = query.decode('gbk')
        searcher = IndexSearcher(self.directory)
        try:
            query = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(query)
            highlighter = Highlighter(TestFormatter(), QueryScorer(query))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for index in range(start, min(start + count, totalnum)):
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Best effort: a hit that cannot be loaded is skipped.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream("contents",
                                                        StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                score = hits.score(index)
                # Keep bumping until the key is free; a single bump could
                # still collide when three or more hits share a score.
                while score in resultdic:
                    score += 0.0001
                resultdic[score] = [result, doc.get("path")]
            usetime = time.time() - starttime
        finally:
            # Release the searcher even when parsing or searching fails.
            searcher.close()
        return resultdic, totalnum, usetime
        

if __name__ == '__main__':
    tt 
= Search()
    command 
= raw_input("Query:").decode('gbk')
    tt.search(command, 0)

处理中文时注意编码:上面的代码假定文件内容、文件名和控制台输入都是 GBK 编码,需要先解码成 unicode 再交给 PyLucene;如果你的环境使用其他编码(例如 UTF-8),请相应修改解码时使用的编码名。

 

原创粉丝点击