Lucene 在Ubuntu+Python2的环境下进行搜索
来源:互联网 发布:java web开发与实战 编辑:程序博客网 时间:2024/06/15 17:54
Lucene 在Ubuntu+Python2的环境下进行搜索
IndexFiles.py:
"""Index every file under a directory tree with PyLucene (Python 2).

This script is loosely based on the Lucene (Java implementation) demo class
org.apache.lucene.demo.IndexFiles.  It takes a directory as an argument and
indexes all of the files in that directory and downward recursively.  Each
file is indexed on its path, its name and its contents.  The resulting Lucene
index is placed in the current directory and called 'index'.

The code relies on the Python 2 ``unicode`` builtin.
"""
from __future__ import print_function

import os
import sys
import threading
import time
import traceback
from datetime import datetime

# Keep ``lucene`` ahead of the java/org imports, as in the original listing.
import lucene
from java.io import File
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import TextField, Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

# Encodings tried, in order, when turning a byte-string file name into
# unicode.  The file system's own encoding comes first; GBK (the only
# encoding the original script tried) stays as the last resort.
NAME_ENCODINGS = (sys.getfilesystemencoding() or 'ascii', 'utf-8', 'GBK')


def _decode_name(raw):
    """Return *raw* as unicode, trying each of NAME_ENCODINGS in turn.

    A value that is already unicode is returned unchanged.

    Raises:
        UnicodeError: if none of the encodings can decode *raw*.
    """
    if isinstance(raw, unicode):
        return raw
    for encoding in NAME_ENCODINGS:
        try:
            return raw.decode(encoding)
        except UnicodeError:
            continue
    raise UnicodeError("cannot decode file name %r" % (raw,))


class Ticker(object):
    # Progress indicator: run() writes one '.' per second to stdout until
    # another thread sets ``tick`` to False.

    def __init__(self):
        self.tick = True

    def run(self):
        """Write dots to stdout once a second while ``self.tick`` is true."""
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)


# NOTE: the class docstring below doubles as the usage message printed by the
# command-line entry point, so it is kept to that single line.
class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""

    def __init__(self, root, storeDir, analyzer):
        """Build the index; all of the work happens in the constructor.

        Args:
            root: directory whose files are indexed recursively.
            storeDir: directory that receives the index; created if missing.
            analyzer: Lucene analyzer handed to the IndexWriterConfig.
        """
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        config = IndexWriterConfig(analyzer)
        # OpenMode.CREATE is requested, so an index already present in
        # storeDir is presumably replaced rather than appended to.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)

        ticker = Ticker()
        print('optimizing index', end=' ')
        threading.Thread(target=ticker.run).start()
        try:
            writer.close()
        finally:
            # Always stop the ticker; otherwise a failing close() would leave
            # its (non-daemon) thread printing dots forever.
            ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Walk *root* and add one document per readable file to *writer*.

        Directories or files whose names cannot be decoded (or echoed to the
        console) are reported and skipped.  A failure while indexing a single
        file is reported and does not stop the walk.
        """
        for dirpath, dirnames, filenames in os.walk(root):
            print(dirpath)
            try:
                shown_dir = _decode_name(dirpath)
                print(shown_dir)
            except UnicodeError:
                print("*****************************unicode error")
                print(dirpath)
                continue

            for raw_name in filenames:
                try:
                    filename = _decode_name(raw_name)
                    print(filename)
                except UnicodeError:
                    print("*****************************unicode error")
                    print(raw_name)
                    continue

                print("adding", filename)
                try:
                    self._indexFile(writer, dirpath, raw_name,
                                    shown_dir, filename)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the run.
                    print("Failed in indexDocs:", e)

    def _indexFile(self, writer, dirpath, raw_name, shown_dir, filename):
        """Add a single file to the index as a path/name/contents document.

        *dirpath* and *raw_name* are the untouched values from os.walk and are
        used to open the file; *shown_dir* and *filename* are their unicode
        forms and are what gets stored in the index.
        """
        path = os.path.join(shown_dir, filename)
        print("read file: ", path)

        # Open through the original names: a path that was decoded and is then
        # re-encoded by open() need not match the bytes stored on disk.
        with open(os.path.join(dirpath, raw_name)) as handle:
            contents = handle.read()
        print("contents is:", contents)

        doc = Document()
        doc.add(TextField("path", path, Field.Store.YES))
        doc.add(TextField("name", filename, Field.Store.YES))
        if contents:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("warning: no content in %s" % filename)
        writer.addDocument(doc)
        # Committed per file, as in the original; presumably so that files
        # indexed so far survive a later failure.  Confirm before batching.
        writer.commit()
        print("[THIS FILE]:", doc)


# Set to 1 to index the hard-coded '../corpus/' directory instead of argv[1].
__debug = 0

if __name__ == '__main__':
    if __debug != 1 and len(sys.argv) < 2:
        print(IndexFiles.__doc__)
        sys.exit(1)
    print('Lucene', lucene.VERSION)
    start = datetime.now()
    lucene.initVM()
    try:
        if __debug != 1:
            IndexFiles(sys.argv[1], "index", StandardAnalyzer())
        else:
            IndexFiles(r'../corpus/', "index", StandardAnalyzer())
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: ", e)
        # format_exc() returns the text; print_exc() returns None, which the
        # original ended up printing after the label.
        print("traceback: ", traceback.format_exc())
SearchFiles.py:
"""Interactively query a Lucene index with PyLucene (Python 2).

This script is loosely based on the Lucene (Java implementation) demo class
org.apache.lucene.demo.SearchFiles.  It prompts for a search query, then
searches the Lucene index in the current directory called 'index' for that
query against the 'contents' field.  It then displays the 'path' and 'name'
fields of each hit it finds in the index.

Nothing here closes the searcher or its reader; the original listing notes
that closing the searcher caused a stack overflow in some cases.

The code relies on the Python 2 ``unicode`` and ``raw_input`` builtins.
"""
from __future__ import print_function

import os
import sys

# Keep ``lucene`` ahead of the java/org imports, as in the original listing.
import lucene
from java.io import File
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import Query, TermQuery
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

# Upper bound on the number of hits requested per query.
MAX_HITS = 1000


def _decode_query(raw):
    """Return the typed query *raw* (a byte string) as unicode.

    The terminal's declared encoding is tried first, then UTF-8, then GBK
    (the only encoding the original script tried).

    Raises:
        UnicodeError: if none of the encodings can decode *raw*.
    """
    for encoding in (sys.stdin.encoding, 'utf-8', 'GBK'):
        if not encoding:
            # stdin.encoding is None when input is not a terminal.
            continue
        try:
            return unicode(raw, encoding)
        except UnicodeError:
            continue
    raise UnicodeError("cannot decode query %r" % (raw,))


def run(searcher, analyzer):
    """Prompt for queries until an empty line (or end of input) is entered.

    Args:
        searcher: IndexSearcher used to execute each query.
        analyzer: analyzer given to the QueryParser for the 'contents' field.

    Each query prints the total hit count followed by the stored 'path' and
    'name' of up to MAX_HITS matching documents.  Undecodable or malformed
    queries are reported and the prompt is shown again.
    """
    # The parser does not depend on the query text, so build it once.
    parser = QueryParser("contents", analyzer)

    while True:
        print()
        print("Hit enter with no input to quit.")
        try:
            raw = raw_input("Query:")
        except EOFError:
            # Ctrl-D / exhausted stdin: leave as if an empty line was typed.
            print()
            return
        if raw == '':
            return

        try:
            command = _decode_query(raw)
        except UnicodeError as e:
            print("Failed to decode query:", e)
            continue

        print()
        print("Searching for:", command)
        try:
            query = parser.parse(command)
        except lucene.JavaError as e:
            # Java-side failures (e.g. a query syntax error) surface as
            # JavaError; report and keep the session alive.
            print("Invalid query:", e)
            continue

        hits = searcher.search(query, MAX_HITS)
        print("%s total matching documents." % hits.totalHits)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("path: ", doc.get("path"), "name: ", doc.get("name"))


if __name__ == '__main__':
    STORE_DIR = "index"
    print('Lucene', lucene.VERSION)
    lucene.initVM()
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    run(searcher, analyzer)
运行过程中首先运行IndexFiles.py建立索引:
sudo python IndexFiles.py ../corpus/
corpus下放的是语料文件。
然后进行搜索:
sudo python SearchFiles.py
然后在Query后面写相关搜索内容。
阅读全文
0 0
- Lucene 在Ubuntu+Python2的环境下进行搜索
- 在ubuntu环境下进行Opencv分类器的训练
- Lucene在同一索引多个域上进行搜索
- ubuntu下配置python2.7.13神经网络环境
- Ubuntu环境下python2和python3切换
- Ubuntu下进行Github的环境配置
- Lucene在Linux下环境的搭建和运行
- ubuntu+python2+pyqt4+eric6的环境搭建
- 在anaconda2下和cmd下进行sklearn安装的不同--python2.7
- 关于在WindowsXp 环境下Python2.6 + web.py 构建WEB开发环境遇到的问题
- lucene实现 在某一范围进行搜索 RangeQuery
- Lucene在多个索引上进行搜索
- 在ubuntu中使用virtualenv创建python2和python3的虚拟环境
- 在ubuntu中使用virtualenv创建python2和python3的虚拟环境
- Ubuntu下Python3和Python2的共存
- Python3在ubuntu环境下的安装
- Android在Ubuntu下的环境搭配
- NDK在Ubuntu环境下的配置
- ReadFile ReadFileEx异步操作 控制台程序
- 分析tensorflow代码(Hello world)
- 通过npm安装webpack
- MySql存储过程的异常处理理论及实例
- android 增加自己的应用签名校验
- Lucene 在Ubuntu+Python2的环境下进行搜索
- 关于微信支付
- 基础认证伪造工具phishery
- 阿里热修复
- JavaScript基础(一)
- 网页弹幕实现(PHP+JS)
- 阿里云前端周刊
- ubuntu永久修改主机名
- 用java程序生成助记码,即输入张三时,得到的结果是JS;获得名称的首字母大写