使用python实现短语查询
来源:互联网 发布:python生成多个随机数 编辑:程序博客网 时间:2024/04/30 12:07
searchPhrase函数实现了短语查询功能:
def word_split(text):
    """Split *text* into normalized words with their ordinal positions.

    A word is a maximal run of alphanumeric characters; every other
    character acts as a separator. Words are lower-cased.

    Returns a list of ``(position, word)`` tuples, where ``position`` is the
    1-based ordinal of the word within the text (not a character offset).
    """
    word_list = []
    current = []
    for char in text:
        if char.isalnum():
            current.append(char)
        elif current:
            # A separator ends the word being accumulated.
            word_list.append((len(word_list) + 1, u''.join(current).lower()))
            current = []
    if current:
        # The text ended in the middle of a word: flush it.
        word_list.append((len(word_list) + 1, u''.join(current).lower()))
    return word_list


def inverted_index(text):
    """Build the inverted index of a single document.

    Returns a dict ``{word: [positions]}`` using the word positions produced
    by ``word_split``; each position list is in ascending order.
    """
    inverted = {}
    for position, word in word_split(text):
        inverted.setdefault(word, []).append(position)
    return inverted


def inverted_index_add(inverted, doc_id, doc_index):
    """Merge one document's index into the multi-document index.

    ``inverted`` has the shape ``{word: {doc_id: [positions]}}`` and is
    updated in place; ``doc_index`` is the result of ``inverted_index`` for
    the document identified by ``doc_id``. Returns ``inverted``.
    """
    # .items() instead of the Python 2-only .iteritems().
    for word, locations in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = locations
    return inverted


def search(inverted, query):
    """Return the set of document ids containing every word of *query*.

    An empty set is returned when the query has no words or when any query
    word is absent from the index (no document can contain all the words).
    """
    words = [word for _, word in word_split(query)]
    if not words or any(word not in inverted for word in words):
        return set()
    # Intersect the per-word document sets to keep the documents in common.
    return set.intersection(*(set(inverted[word]) for word in words))


def _phrase_in_doc(inverted, words, doc_id):
    """Return True if *words* occur consecutively in document *doc_id*.

    Every word must be a key of ``inverted`` and ``doc_id`` must be a
    document containing ``words[0]``.
    """
    following = []
    for word in words[1:]:
        positions = inverted[word].get(doc_id)
        if positions is None:
            # The document lacks one of the words entirely.
            return False
        following.append(set(positions))
    # Try every occurrence of the first word as the start of the phrase;
    # the n-th following word must sit exactly n positions later.
    for start in inverted[words[0]][doc_id]:
        if all(start + offset in positions
               for offset, positions in enumerate(following, 1)):
            return True
    return False


def searchPhrase(inverted, query):
    """Return the list of document ids that contain *query* as a phrase.

    The query words (as split by ``word_split``) must appear in the document
    at consecutive word positions, in order. A single-word query returns
    every document containing that word. An empty list is returned when the
    query has no words or when any query word is absent from the index.

    Document ids are unique and follow the iteration order of the first
    word's entry in ``inverted``.
    """
    words = [word for _, word in word_split(query)]
    if not words or any(word not in inverted for word in words):
        return []
    return [doc_id for doc_id in inverted[words[0]]
            if _phrase_in_doc(inverted, words, doc_id)]


doc1 = (
    "Niners head coach Mike Singletary will let Alex Smith remain his "
    "starting quarterback, but his vote of confidence is anything but a "
    "long-term mandate.Smith now will work on a week-to-week basis, because "
    "Singletary has voided his year-long lease on the job.\"I think from "
    "this point on, you have to do what's best for the football team,\""
    "Singletary said Monday, one day after threatening to bench Smith "
    "during a 27-24 loss to the visiting Eagles."
)
doc2 = (
    "The fifth edition of West Coast \n"
    "Green, a conference focusing on \"green\" home innovations and "
    "products, rolled into San Francisco's Fort Mason last week intent, per "
    "usual, on making our living spaces more environmentally friendly - one "
    "used-tire house at a time.Zero-rated buildings To that end, there were "
    "presentations on topics such as water efficiency and the burgeoning "
    "future of Net Zero-rated buildings that consume no energy and produce "
    "no carbon emissions.on a job,on the job"
)

# Build the multi-document index for the sample documents.
inverted = {}
documents = {'doc1': doc1, 'doc2': doc2}
for doc_id, text in documents.items():
    doc_index = inverted_index(text)
    inverted_index_add(inverted, doc_id, doc_index)


def main():
    """Run the demo queries against the sample index and print the results."""
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("*****search common words*****")
    for query in ['Week', 'Niners', 'coast']:
        result_docs = search(inverted, query)
        print("Search for '%s': %r" % (query, result_docs))

    print("")
    print("*****search phrases*****")
    for query in ['Zero-rated buildings', 'on the job', 'West Coast']:
        result_docs = searchPhrase(inverted, query)
        print("Search for '%s': %r" % (query, result_docs))


if __name__ == "__main__":
    main()
1 0
- 使用python实现短语查询
- Lucene使用单字分词及短语查询实现类似全模糊查询效果
- python 短语查询(中文版本+英文版本)
- lucene使用PhraseQuery设置slop进行短语查询
- lucene-PhraseQuery通过短语查询
- elasticsearch 短语查询(match_phrase)
- CTE和WITH AS短语结合使用提高SQL查询性能
- CTE和WITH AS短语结合使用提高SQL查询性能
- 短语
- 短语
- 短语
- 短语
- python实现DNS查询
- python实现关键字查询
- python 实现mysql 查询
- python实现查询天气
- Python实现日历查询
- Python使用Com组件及Access查询分析类实现
- 你意想不到的的编程问题
- 观察者设计模式实现缓存机制
- 使用VS开发Qt项目时编译速度慢的问题解决
- cocos2d-x v3.9 与ActionInterval的孩子们之间的对话(2)
- iOS/OS X内存管理(二):借助工具解决内存问题
- 使用python实现短语查询
- linux命令-网络相关
- Fragment学习进阶<一>-------静态
- Scala语法 Case Class和模式匹配
- HTML5中canvas的fillRect、arc用法
- Linux中的likely()和unlikely()
- 调用百度语音合成接口
- 七大排序java实现
- SQL Update 存在则更新,不存在则插入