pylucene的使用
来源:互联网 发布:梦龙网络计划软件 编辑:程序博客网 时间:2024/05/17 02:23
前一段时间做东西用到了pylucene,包括建立索引,检索,高亮显示等等。贴两段代码,希望对大家有用。
pylucene的安装就不多说了,我用的版本是PyLucene-1.9.1。
建立索引:
#!/usr/bin/env python
import os
import PyLucene
class IndexFiles:
"""
create index by PyLucene, just need your dir path,
the result files saved in the directory index in
current path
"""
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
analyzer = PyLucene.StandardAnalyzer()
store = PyLucene.FSDirectory.getDirectory(storeDir , True)
writer = PyLucene.IndexWriter(store, analyzer, True)
self.indexDocs(root, writer)
print 'optimizing index',
writer.optimize()
writer.close()
print 'done'
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.txt'):
continue
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'gbk')
file.close()
doc = PyLucene.Document()
doc.add(PyLucene.Field.Keyword(u"name", filename.decode('gbk')))
doc.add(PyLucene.Field.Text(u"path", path.decode('gbk')))
if len(contents) > 0:
pass
doc.add(PyLucene.Field.Text(u"contents", contents))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
def indexmain(path):
try:
IndexFiles(path, "index", PyLucene.StandardAnalyzer())
return ''
except Exception, e:
return e
if __name__ == '__main__':
indexpath = raw_input("path: ")
indexmain(indexpath)
import os
import PyLucene
class IndexFiles:
"""
create index by PyLucene, just need your dir path,
the result files saved in the directory index in
current path
"""
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
analyzer = PyLucene.StandardAnalyzer()
store = PyLucene.FSDirectory.getDirectory(storeDir , True)
writer = PyLucene.IndexWriter(store, analyzer, True)
self.indexDocs(root, writer)
print 'optimizing index',
writer.optimize()
writer.close()
print 'done'
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.txt'):
continue
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'gbk')
file.close()
doc = PyLucene.Document()
doc.add(PyLucene.Field.Keyword(u"name", filename.decode('gbk')))
doc.add(PyLucene.Field.Text(u"path", path.decode('gbk')))
if len(contents) > 0:
pass
doc.add(PyLucene.Field.Text(u"contents", contents))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
def indexmain(path):
try:
IndexFiles(path, "index", PyLucene.StandardAnalyzer())
return ''
except Exception, e:
return e
if __name__ == '__main__':
indexpath = raw_input("path: ")
indexmain(indexpath)
检索,高亮显示:
#!/usr/bin/env python
import time
from StringIO import StringIO
from PyLucene import *
class TestFormatter(Formatter):
    """
    Highlighter formatter that wraps matching terms in a red <font> tag.
    """

    def __init__(self):
        pass

    def highlightTerm(self, originalText, group):
        """
        Return ``originalText`` marked up for display.

        Text whose token group has a total score of 0 or less is returned
        unchanged; anything else is wrapped in ``<font color="red">``.
        """
        if group.getTotalScore() <= 0:
            return originalText
        # Single-quoted literal so the attribute's double quotes need no
        # escaping (the unescaped form was a syntax error).
        return '<font color="red">' + originalText + '</font>'
class Search:
    """
    Query the index stored in the local ``index`` directory and return
    highlighted fragments for one page of hits.
    """

    def __init__(self):
        """Open the index directory and set up analyzer and fragment options."""
        STORE_DIR = "index"
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        self.analyzer = ChineseAnalyzer()
        self.maxNumFragmentsRequired = 2
        self.fragmentSeparator = u"..."

    def search(self, query, start, pageSize=10):
        """
        Run ``query`` against the ``contents`` field.

        query    -- query text; a byte string is decoded from GBK, a
                    unicode string is used as it is
        start    -- index of the first hit to return
        pageSize -- maximum number of hits to return (default 10)

        Returns ``(resultdic, totalnum, usetime)``: ``resultdic`` maps a
        hit's score to ``[highlighted fragments, path]``, ``totalnum`` is
        the total number of hits and ``usetime`` the elapsed seconds.
        """
        # Only byte strings need decoding; calling decode() on unicode
        # would first encode to ASCII and fail for Chinese text.
        if isinstance(query, str):
            query = query.decode('gbk')
        searcher = IndexSearcher(self.directory)
        try:
            query = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(query)
            highlighter = Highlighter(TestFormatter(), QueryScorer(query))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for index in range(start, min(start + pageSize, totalnum)):
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Best effort: skip a hit whose document cannot be loaded.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream("contents", StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                score = hits.score(index)
                # Scores are the dict keys; keep nudging until the key is
                # free so that hits with equal scores never overwrite
                # each other.
                while score in resultdic:
                    score += 0.0001
                resultdic[score] = [result, doc.get("path")]
            usetime = time.time() - starttime
        finally:
            # Release the searcher even when parsing or searching fails.
            searcher.close()
        return resultdic, totalnum, usetime
if __name__ == '__main__':
    tt = Search()
    # Pass the raw byte string through: Search.search() performs the GBK
    # decoding itself, and decoding twice raises UnicodeEncodeError for
    # non-ASCII queries.
    command = raw_input("Query:")
    tt.search(command, 0)
import time
from StringIO import StringIO
from PyLucene import *
class TestFormatter(Formatter):
    """
    Highlighter formatter that wraps matching terms in a red <font> tag.
    """

    def __init__(self):
        pass

    def highlightTerm(self, originalText, group):
        """
        Return ``originalText`` marked up for display.

        Text whose token group has a total score of 0 or less is returned
        unchanged; anything else is wrapped in ``<font color="red">``.
        """
        if group.getTotalScore() <= 0:
            return originalText
        # Single-quoted literal so the attribute's double quotes need no
        # escaping (the unescaped form was a syntax error).
        return '<font color="red">' + originalText + '</font>'
class Search:
    """
    Query the index stored in the local ``index`` directory and return
    highlighted fragments for one page of hits.
    """

    def __init__(self):
        """Open the index directory and set up analyzer and fragment options."""
        STORE_DIR = "index"
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        self.analyzer = ChineseAnalyzer()
        self.maxNumFragmentsRequired = 2
        self.fragmentSeparator = u"..."

    def search(self, query, start, pageSize=10):
        """
        Run ``query`` against the ``contents`` field.

        query    -- query text; a byte string is decoded from GBK, a
                    unicode string is used as it is
        start    -- index of the first hit to return
        pageSize -- maximum number of hits to return (default 10)

        Returns ``(resultdic, totalnum, usetime)``: ``resultdic`` maps a
        hit's score to ``[highlighted fragments, path]``, ``totalnum`` is
        the total number of hits and ``usetime`` the elapsed seconds.
        """
        # Only byte strings need decoding; calling decode() on unicode
        # would first encode to ASCII and fail for Chinese text.
        if isinstance(query, str):
            query = query.decode('gbk')
        searcher = IndexSearcher(self.directory)
        try:
            query = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(query)
            highlighter = Highlighter(TestFormatter(), QueryScorer(query))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for index in range(start, min(start + pageSize, totalnum)):
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Best effort: skip a hit whose document cannot be loaded.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream("contents", StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                score = hits.score(index)
                # Scores are the dict keys; keep nudging until the key is
                # free so that hits with equal scores never overwrite
                # each other.
                while score in resultdic:
                    score += 0.0001
                resultdic[score] = [result, doc.get("path")]
            usetime = time.time() - starttime
        finally:
            # Release the searcher even when parsing or searching fails.
            searcher.close()
        return resultdic, totalnum, usetime
if __name__ == '__main__':
    tt = Search()
    # Pass the raw byte string through: Search.search() performs the GBK
    # decoding itself, and decoding twice raises UnicodeEncodeError for
    # non-ASCII queries.
    command = raw_input("Query:")
    tt.search(command, 0)
处理中文时注意编码:上面的例子中,文件名、文件内容和查询字符串都是先按 GBK 解码成 unicode,再交给 PyLucene 处理的。
- [PyLucene]使用PyLucene中JVM的疑问
- pylucene的使用
- PyLucene中使用自己的Analyzer
- PyLucene安装及使用
- [pylucene]Pylucene不能导入SmartChineseAnalyzer的解决办法
- PyLucene是基于Python的Lucene
- pylucene在python2.7下的安装
- 安装PyLucene时执行make的错误代码
- 在pyLucene中使用中文分词器(在pyLucene中引用Jar包)
- pylucene分别安装在linux和windows(官方无Windows版的pylucene,要自编译)
- pylucene入门
- PyLucene安装
- PyLucene实战
- pylucene安装手记
- PyLucene win版安装
- Linux安装配置Pylucene
- PyLucene安装与初试
- PyLucene学习之一
- IE中javascript不能运行问题如何解决?
- Java语言中所有常用术语解释大全
- C#指定窗口显示位置
- 暂时从风上找的 .net动态编绎 的文章
- 用观察者模式学习asp.net的控件技巧
- pylucene的使用
- 突破与局限 网关安全守卫在边缘
- 微软,您的.net为中国程序员带来了什么?
- 一天只关注走一步
- Tencent之行
- 免费短信控件(支持Wavecom模块)
- IronPython 2.0 Alpha 2 Released
- 换行
- 請讓軟件的幫助,更友善些吧