用PyLucene实现本地文件名、目录名全文索引

来源：互联网发布：代县数据库工程师招聘编辑：程序博客网时间：2024/04/28 08:16

用PyLucene实现本地文件名全文索引

功能

如果我们想在硬盘上查找一个文件或目录，可以使用Windows下的“搜索”功能，但每次搜索都要遍历整个硬盘，速度很慢；另外，如果要查找多个关键字也很困难。能否做一个类似于“本地搜索”功能的简单搜索程序，空闲的时候对硬盘的目录名和文件名进行索引，当需要查找文件名时可以“非常”快地查找到文件的位置？这可以通过Lucene实现。

实现

我们用Python来实现。首先安装PyLucene，可以参考《PyLucene安装及使用》。

PyLucene Samples目录下的IndexFiles.py和SearchFiles.py实现了对指定目录下.txt文件内容的索引和检索，我们可以修改这两个文件来实现上面的功能。另外，为了能够检索简体中文、繁体中文的文件名和目录名，对文件名和目录名进行了Unicode编码。源码如下：

IndexFiles.py

# -*- coding:GB2312 -*-

import sys, os, PyLucene, threading, time

from datetime import datetime

"""

This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the directories and files in that directory and
downward recursively. It will index on the path and the name only; file
contents are not read. The resulting Lucene index will be placed in the
current directory and called 'index'.

"""

class Ticker(object):
    """Print a dot to stdout at a fixed interval until told to stop.

    run() is meant to be used as a thread target; the owner stops the loop
    by setting the tick attribute to False.
    """

    def __init__(self, interval=1.0):
        """Create a ticker.

        interval -- seconds to sleep between dots (default 1.0)
        """
        # Loop flag polled by run(); set it to False to end the loop.
        self.tick = True
        self.interval = interval

    def run(self):
        """Write '.' to stdout every interval seconds while tick is true."""
        while self.tick:
            sys.stdout.write('.')
            # Flush so each dot shows up immediately instead of being buffered.
            sys.stdout.flush()
            time.sleep(self.interval)

class IndexFiles(object):

"""Usage: python IndexFiles <doc_directory>"""

def __init__(self, root, storeDir, analyzer):

if not os.path.exists(storeDir):

os.mkdir(storeDir)

store = PyLucene.FSDirectory.getDirectory(storeDir, False)

writer = PyLucene.IndexWriter(store, analyzer, False)

writer.setMaxFieldLength(1048576)

self.indexDocs(root, writer)

ticker = Ticker()

print 'optimizing index',

threading.Thread(target=ticker.run).start()

writer.optimize()

writer.close()

ticker.tick = False

print 'done'

def indexDocs(self, root, writer):

for root, dirnames, filenames in os.walk(root):

print root

try:

sroot = unicode(root, 'GBK')

print sroot

except:

print "*****************************unicode error"

print root

continue

#add dir

doc = PyLucene.Document()

doc.add(PyLucene.Field("path", sroot,

PyLucene.Field.Store.YES,

PyLucene.Field.Index.UN_TOKENIZED))

doc.add(PyLucene.Field("name", sroot,

PyLucene.Field.Store.YES,

PyLucene.Field.Index.TOKENIZED))

writer.addDocument(doc)

for filename in filenames:

try:

filename = unicode(filename, 'GBK')

except:

print "*****************************unicode error"

print filename

continue

print "adding", filename

try:

#path = unicode(root, 'GB2312')#

path =os.path.join(sroot, filename)

#file = open(path)

#contents = unicode(file.read(), 'iso-8859-1')

#contents = unicode(file.read(), 'GBK')

#file.close()

doc = PyLucene.Document()

doc.add(PyLucene.Field("path", path,

PyLucene.Field.Store.YES,

PyLucene.Field.Index.UN_TOKENIZED))

doc.add(PyLucene.Field("name", filename,

PyLucene.Field.Store.YES,

PyLucene.Field.Index.TOKENIZED))

'''

if len(contents) > 0:

doc.add(PyLucene.Field("contents", contents,

PyLucene.Field.Store.YES,

PyLucene.Field.Index.TOKENIZED))

else:

print "warning: no content in %s" % filename

'''

writer.addDocument(doc)

except Exception, e:

print "Failed in indexDocs:", e

__debug = 0

if __name__ == '__main__':

if __debug != 1:

if len(sys.argv) < 2:

print IndexFiles.__doc__

sys.exit(1)

print 'PyLucene', PyLucene.VERSION, 'Lucene', PyLucene.LUCENE_VERSION

start = datetime.now()

try:

if __debug != 1:

IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer())

else:

IndexFiles(r'c:/testccc', "index", PyLucene.StandardAnalyzer())

end = datetime.now()

print end - start

except Exception, e:

print "Failed: ", e

SearchFiles.py

from PyLucene import QueryParser, IndexSearcher, StandardAnalyzer, FSDirectory

from PyLucene import VERSION, LUCENE_VERSION

"""

This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'name' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
searcher.close() is called once the prompt loop ends; the upstream sample left
it commented out because it caused a stack overflow in some cases.

"""

def run(searcher, analyzer):

while True:

print "Hit enter with no input to quit."

command = raw_input("Query:")

command = unicode(command, 'GBK')

if command == '':

return

print "Searching for:", command

#query = QueryParser("contents", analyzer).parse(command)

query = QueryParser("name", analyzer).parse(command)

hits = searcher.search(query)

print "%s total matching documents." % hits.length()

for i, doc in hits:

print 'path:', doc.get("path"), 'name:', doc.get("name")

if __name__ == '__main__':

STORE_DIR = "index"

print 'PyLucene', VERSION, 'Lucene', LUCENE_VERSION

directory = FSDirectory.getDirectory(STORE_DIR, False)

searcher = IndexSearcher(directory)

analyzer = StandardAnalyzer()

run(searcher, analyzer)

searcher.close()

建立索引，运行：

python IndexFiles.py c:/

查找的时候，运行：

python SearchFiles.py

如果只查找一个关键词，直接输入即可；如果想同时查找两个关键词，如Python和网络，则输入：Python AND 网络；如果想查找包含Python或网络的名称，则输入：Python 网络，也可以输入：Python OR 网络。

其他的查询请参考API文档。

如果上面代码中用到的函数不太明白可以参考《实战 Lucene》