用PyLucene实现本地文件名、目录名全文索引
来源:互联网 发布:代县数据库工程师招聘 编辑:程序博客网 时间:2024/04/28 08:16
用PyLucene实现本地文件名全文索引
功能
如果我们想在硬盘上查找一个文件或目录,可以使用Windows下的“搜索”功能,但每次搜索都要遍历整个硬盘,速度很慢;另外,如果要同时查找多个关键字也很困难。能否做一个类似于“本地搜索”功能的简单搜索程序,在空闲的时候对硬盘的目录名和文件名建立索引,当需要查找文件名时可以“非常”快地查找到文件的位置?这可以通过Lucene实现。
实现
我们用Python来实现。首先安装PyLucene,可以参考《PyLucene安装及使用》。
PyLucene Samples目录下的IndexFiles.py和SearchFiles.py实现了对指定目录下.txt文件内容的索引和检索,我们可以修改这两个文件来实现上面的功能。另外,为了能够检索简体中文、繁体中文的文件名和目录名,对文件名和目录名进行了Unicode编码。源码如下:
IndexFiles.py
# -*- coding:GB2312 -*-
import sys, os, PyLucene, threading, time
from datetime import datetime
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the directories and files in that directory and
downward recursively. It will index on the path and the name of each entry;
file contents are not indexed. The resulting Lucene index will be placed in
the current directory and called 'index'.
"""
class Ticker(object):
    """Progress indicator that prints one dot per second to stdout.

    run() is meant to be used as a thread target; another thread stops it
    by setting the ``tick`` attribute to False.
    """

    def __init__(self):
        # Cleared by the controlling thread to make run() return.
        self.tick = True

    def run(self):
        """Write '.' to stdout once a second for as long as ``tick`` is true."""
        while self.tick:
            sys.stdout.write('.')
            # Flush so each dot shows up immediately instead of being buffered.
            sys.stdout.flush()
            time.sleep(1.0)
# Builds a Lucene index of directory and file names found under a root
# directory.  Constructing an instance walks `root`, adds one document per
# directory and per file (fields "path" and "name"), optimizes the index and
# closes it.
#
# NOTE: the class docstring below is printed as the command-line usage
# message, so it must stay a one-line usage string.
class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""

    # Encoding used to decode the byte-string names returned by os.walk().
    ENCODING = 'GBK'

    def __init__(self, root, storeDir, analyzer):
        # root     -- directory whose tree is indexed
        # storeDir -- directory holding the Lucene index (created if missing)
        # analyzer -- PyLucene analyzer used to tokenize the "name" field
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # Opening an IndexWriter with create=False on a directory that holds
        # no index fails, so start a new index when the store is empty and
        # append to the existing one otherwise.
        create = not os.listdir(storeDir)
        store = PyLucene.FSDirectory.getDirectory(storeDir, False)
        writer = PyLucene.IndexWriter(store, analyzer, create)
        writer.setMaxFieldLength(1048576)
        ticker = Ticker()
        try:
            self.indexDocs(root, writer)
            sys.stdout.write('optimizing index')
            sys.stdout.flush()
            progress = threading.Thread(target=ticker.run)
            # Daemon thread: a stuck ticker must never keep the process alive.
            progress.setDaemon(True)
            progress.start()
            writer.optimize()
        finally:
            # Always release the index and stop the ticker, even when
            # indexing or optimizing raised.
            try:
                writer.close()
            finally:
                ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Add one document per directory and per file found under root."""
        for dirpath, dirnames, filenames in os.walk(root):
            self._echo(dirpath)
            sroot = self._decode(dirpath)
            if sroot is None:
                # The directory name cannot be decoded: skip its entries too.
                continue
            self._echo(sroot)
            # A directory is indexed under its full path in both fields.
            self._addEntry(writer, sroot, sroot)
            for rawname in filenames:
                filename = self._decode(rawname)
                if filename is None:
                    continue
                self._echo("adding %s" % filename)
                self._addEntry(writer, os.path.join(sroot, filename), filename)

    def _decode(self, name):
        """Return name as unicode, or None (after reporting) if undecodable."""
        if isinstance(name, unicode):
            return name
        try:
            return unicode(name, self.ENCODING)
        except UnicodeError:
            print("*****************************unicode error")
            print(name)
            return None

    def _echo(self, text):
        """Print text; fall back to repr() if the console cannot encode it."""
        try:
            print(text)
        except UnicodeError:
            print(repr(text))

    def _addEntry(self, writer, path, name):
        """Add a document with an exact-match "path" and a tokenized "name"."""
        try:
            doc = PyLucene.Document()
            doc.add(PyLucene.Field("path", path,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("name", name,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
        except Exception:
            # Best effort: one bad entry must not abort the whole run.
            print("Failed in indexDocs: %s" % (sys.exc_info()[1],))
# Set to 1 to index the hard-coded test directory instead of sys.argv[1].
__debug = 0

if __name__ == '__main__':
    # In normal mode the directory to index must be given on the command line.
    if __debug != 1 and len(sys.argv) < 2:
        print(IndexFiles.__doc__)
        sys.exit(1)
    print('PyLucene %s Lucene %s' % (PyLucene.VERSION, PyLucene.LUCENE_VERSION))
    start = datetime.now()
    try:
        if __debug != 1:
            IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer())
        else:
            IndexFiles(r'c:/testccc', "index", PyLucene.StandardAnalyzer())
        end = datetime.now()
        # Report how long indexing took.
        print(end - start)
    except Exception:
        print("Failed:  %s" % (sys.exc_info()[1],))
        # Signal the failure to the calling shell.
        sys.exit(1)
import sys, os, PyLucene, threading, time
from datetime import datetime
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the directories and files in that directory and
downward recursively. It will index on the path and the name of each entry;
file contents are not indexed. The resulting Lucene index will be placed in
the current directory and called 'index'.
"""
class Ticker(object):
    """Progress indicator that prints one dot per second to stdout.

    run() is meant to be used as a thread target; another thread stops it
    by setting the ``tick`` attribute to False.
    """

    def __init__(self):
        # Cleared by the controlling thread to make run() return.
        self.tick = True

    def run(self):
        """Write '.' to stdout once a second for as long as ``tick`` is true."""
        while self.tick:
            sys.stdout.write('.')
            # Flush so each dot shows up immediately instead of being buffered.
            sys.stdout.flush()
            time.sleep(1.0)
# Builds a Lucene index of directory and file names found under a root
# directory.  Constructing an instance walks `root`, adds one document per
# directory and per file (fields "path" and "name"), optimizes the index and
# closes it.
#
# NOTE: the class docstring below is printed as the command-line usage
# message, so it must stay a one-line usage string.
class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""

    # Encoding used to decode the byte-string names returned by os.walk().
    ENCODING = 'GBK'

    def __init__(self, root, storeDir, analyzer):
        # root     -- directory whose tree is indexed
        # storeDir -- directory holding the Lucene index (created if missing)
        # analyzer -- PyLucene analyzer used to tokenize the "name" field
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # Opening an IndexWriter with create=False on a directory that holds
        # no index fails, so start a new index when the store is empty and
        # append to the existing one otherwise.
        create = not os.listdir(storeDir)
        store = PyLucene.FSDirectory.getDirectory(storeDir, False)
        writer = PyLucene.IndexWriter(store, analyzer, create)
        writer.setMaxFieldLength(1048576)
        ticker = Ticker()
        try:
            self.indexDocs(root, writer)
            sys.stdout.write('optimizing index')
            sys.stdout.flush()
            progress = threading.Thread(target=ticker.run)
            # Daemon thread: a stuck ticker must never keep the process alive.
            progress.setDaemon(True)
            progress.start()
            writer.optimize()
        finally:
            # Always release the index and stop the ticker, even when
            # indexing or optimizing raised.
            try:
                writer.close()
            finally:
                ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Add one document per directory and per file found under root."""
        for dirpath, dirnames, filenames in os.walk(root):
            self._echo(dirpath)
            sroot = self._decode(dirpath)
            if sroot is None:
                # The directory name cannot be decoded: skip its entries too.
                continue
            self._echo(sroot)
            # A directory is indexed under its full path in both fields.
            self._addEntry(writer, sroot, sroot)
            for rawname in filenames:
                filename = self._decode(rawname)
                if filename is None:
                    continue
                self._echo("adding %s" % filename)
                self._addEntry(writer, os.path.join(sroot, filename), filename)

    def _decode(self, name):
        """Return name as unicode, or None (after reporting) if undecodable."""
        if isinstance(name, unicode):
            return name
        try:
            return unicode(name, self.ENCODING)
        except UnicodeError:
            print("*****************************unicode error")
            print(name)
            return None

    def _echo(self, text):
        """Print text; fall back to repr() if the console cannot encode it."""
        try:
            print(text)
        except UnicodeError:
            print(repr(text))

    def _addEntry(self, writer, path, name):
        """Add a document with an exact-match "path" and a tokenized "name"."""
        try:
            doc = PyLucene.Document()
            doc.add(PyLucene.Field("path", path,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("name", name,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
        except Exception:
            # Best effort: one bad entry must not abort the whole run.
            print("Failed in indexDocs: %s" % (sys.exc_info()[1],))
# Set to 1 to index the hard-coded test directory instead of sys.argv[1].
__debug = 0

if __name__ == '__main__':
    # In normal mode the directory to index must be given on the command line.
    if __debug != 1 and len(sys.argv) < 2:
        print(IndexFiles.__doc__)
        sys.exit(1)
    print('PyLucene %s Lucene %s' % (PyLucene.VERSION, PyLucene.LUCENE_VERSION))
    start = datetime.now()
    try:
        if __debug != 1:
            IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer())
        else:
            IndexFiles(r'c:/testccc', "index", PyLucene.StandardAnalyzer())
        end = datetime.now()
        # Report how long indexing took.
        print(end - start)
    except Exception:
        print("Failed:  %s" % (sys.exc_info()[1],))
        # Signal the failure to the calling shell.
        sys.exit(1)
SearchFiles.py
from PyLucene import QueryParser, IndexSearcher, StandardAnalyzer, FSDirectory
from PyLucene import VERSION, LUCENE_VERSION
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'name' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. The
searcher is closed once the prompt loop ends.
"""
def run(searcher, analyzer):
    """Prompt for queries and print matching index entries until told to quit.

    Each query is read from stdin, decoded from GBK and parsed against the
    "name" field; the "path" and "name" of every hit are printed.  An empty
    line (or end of input) ends the loop.

    searcher -- open PyLucene IndexSearcher to query
    analyzer -- analyzer handed to QueryParser to tokenize the query text
    """
    import sys  # local import: SearchFiles.py does not import sys at the top

    while True:
        print('')
        print("Hit enter with no input to quit.")
        try:
            command = raw_input("Query:")
        except EOFError:
            # End of input (Ctrl-D / Ctrl-Z) is treated like an empty line.
            return
        command = unicode(command, 'GBK')
        if command == '':
            return
        print('')
        print("Searching for: %s" % command)
        try:
            query = QueryParser("name", analyzer).parse(command)
        except Exception:
            # A malformed query (e.g. a lone "AND") must not end the session.
            print("Invalid query: %s" % (sys.exc_info()[1],))
            continue
        hits = searcher.search(query)
        print("%s total matching documents." % hits.length())
        for i, doc in hits:
            print('path: %s name: %s' % (doc.get("path"), doc.get("name")))
if __name__ == '__main__':
    # Directory (relative to the working directory) that holds the index.
    STORE_DIR = "index"
    print('PyLucene %s Lucene %s' % (VERSION, LUCENE_VERSION))
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    try:
        run(searcher, analyzer)
    finally:
        # Release the index even if the prompt loop raised.
        searcher.close()
from PyLucene import VERSION, LUCENE_VERSION
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'name' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. The
searcher is closed once the prompt loop ends.
"""
def run(searcher, analyzer):
    """Prompt for queries and print matching index entries until told to quit.

    Each query is read from stdin, decoded from GBK and parsed against the
    "name" field; the "path" and "name" of every hit are printed.  An empty
    line (or end of input) ends the loop.

    searcher -- open PyLucene IndexSearcher to query
    analyzer -- analyzer handed to QueryParser to tokenize the query text
    """
    import sys  # local import: SearchFiles.py does not import sys at the top

    while True:
        print("Hit enter with no input to quit.")
        try:
            command = raw_input("Query:")
        except EOFError:
            # End of input (Ctrl-D / Ctrl-Z) is treated like an empty line.
            return
        command = unicode(command, 'GBK')
        if command == '':
            return
        print("Searching for: %s" % command)
        try:
            query = QueryParser("name", analyzer).parse(command)
        except Exception:
            # A malformed query (e.g. a lone "AND") must not end the session.
            print("Invalid query: %s" % (sys.exc_info()[1],))
            continue
        hits = searcher.search(query)
        print("%s total matching documents." % hits.length())
        for i, doc in hits:
            print('path: %s name: %s' % (doc.get("path"), doc.get("name")))
if __name__ == '__main__':
    # Directory (relative to the working directory) that holds the index.
    STORE_DIR = "index"
    print('PyLucene %s Lucene %s' % (VERSION, LUCENE_VERSION))
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    try:
        run(searcher, analyzer)
    finally:
        # Release the index even if the prompt loop raised.
        searcher.close()
建立索引,运行:
python IndexFiles.py c:/
查找的时候,运行:
python SearchFiles.py
如果只查找一个关键词则直接输入;如果想同时查找两个关键词,如Python 网络,则输入:Python AND 网络;如果想查找Python或网络则:Python 网络,也可以Python OR 网络。
其他的查询请参考API文档。
如果上面代码中用到的函数不太明白可以参考《实战 Lucene》
- 用PyLucene实现本地文件名、目录名全文索引
- 目录 - 博客全文索引
- Oracle实现全文索引
- solr实现全文索引
- 用OpenFileDialog 取得文件名后 分别取出目录名和文件名
- 配置数据库的全文目录和索引
- 备份和还原全文目录和索引
- 关于SQLSERVER的全文目录跟全文索引的区别
- 关于SQLSERVER的全文目录跟全文索引的区别
- linux_取得路径文件名和目录名
- GetFiles GetDirectories 获取文件名,目录名排序
- shell 提取文件名和目录名
- shell递归输出文件名和目录名
- shell提取文件名和目录名
- Linux shell 提取文件名和目录名
- Linux shell 提取文件名和目录名
- shell截取文件名或目录名
- java实现web图片下载和url(文件名、目录名或卷标语法不正确)问题
- 《struts2权威指南》学习笔记之struts2多文件上传--使用数组方式
- [转]C/C++ 误区五:检查 new 的返回值
- 利用STRUTS实现国际化支持
- MessageClient
- 计算机基础知识 [比赛用]
- 用PyLucene实现本地文件名、目录名全文索引
- C中常用宏定义
- 用ASP.NET 2.0设计网络在线投票系统
- ColumnManager
- SVN使用实况
- 用C# 编写 Windows 服务
- liferay中portlet的action处理流程
- liferay 中的配置文件
- liferay中的proferences处理