使用python实现短语查询

来源:互联网 发布:python生成多个随机数 编辑:程序博客网 时间:2024/04/30 12:07

searchPhrase函数实现了短语查询功能:

def word_split(text):
    """Split *text* into normalized words.

    A word is a maximal run of alphanumeric characters; every other
    character acts as a separator. Words are lower-cased.

    Returns a list of ``(position, word)`` tuples, where ``position`` is
    the 1-based ordinal of the word within the text (not a character
    offset).
    """
    word_list = []
    current = []
    for char in text:
        if char.isalnum():
            current.append(char)
        elif current:
            # A separator ends the word being accumulated.
            word_list.append((len(word_list) + 1, u''.join(current).lower()))
            current = []
    if current:
        # Flush the last word when the text does not end with a separator.
        word_list.append((len(word_list) + 1, u''.join(current).lower()))
    return word_list


def inverted_index(text):
    """Create a positional inverted index for a single text document.

    Returns a dict ``{word: [positions]}`` where the positions are the
    1-based word ordinals produced by ``word_split``, in ascending order.
    """
    inverted = {}
    for index, word in word_split(text):
        inverted.setdefault(word, []).append(index)
    return inverted


def inverted_index_add(inverted, doc_id, doc_index):
    """Merge a single-document index into a multi-document index.

    ``inverted`` has the shape ``{word: {doc_id: [positions]}}`` and is
    updated in place; ``doc_index`` is the ``{word: [positions]}`` index of
    the document identified by ``doc_id``. An existing entry for the same
    word and ``doc_id`` is overwritten.

    Returns the updated ``inverted`` dict.
    """
    for word, locations in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = locations
    return inverted


def search(inverted, query):
    """Return the ids of the documents containing the query words.

    Query words that do not appear in the index at all are ignored; the
    result is the intersection of the document sets of the remaining
    words.

    Returns a set of document ids, or an empty list when no query word is
    present in the index.
    """
    words = [word for _, word in word_split(query) if word in inverted]
    results = [set(inverted[word]) for word in words]
    return set.intersection(*results) if results else []


def _phrase_occurs(position_lists):
    """Tell whether the words occur at consecutive positions.

    ``position_lists[i]`` holds the positions of the i-th phrase word in
    one document. Returns True if some start position ``p`` of the first
    word has ``p + i`` among the positions of the i-th word for every i.
    """
    following = [set(positions) for positions in position_lists[1:]]
    return any(
        all(start + offset in positions
            for offset, positions in enumerate(following, 1))
        for start in position_lists[0]
    )


def searchPhrase(inverted, query):
    """Return the ids of the documents containing the query as a phrase.

    The query is normalized with ``word_split``; a document matches when
    all query words occur in it at consecutive word positions, in query
    order. A single-word query matches every document containing that
    word.

    Returns a list of document ids without duplicates, in the iteration
    order of the first word's postings. The list is empty when the query
    has no words or when any query word is missing from the index (the
    phrase cannot occur then).
    """
    words = [word for _, word in word_split(query)]
    if not words or any(word not in inverted for word in words):
        return []
    postings = [inverted[word] for word in words]
    results = []
    for doc_id in postings[0]:
        # Only documents holding every word of the phrase are candidates.
        if all(doc_id in posting for posting in postings[1:]):
            if _phrase_occurs([posting[doc_id] for posting in postings]):
                results.append(doc_id)
    return results


doc1 = """Niners head coach Mike Singletary will let Alex Smith remain his starting quarterback, but his vote of confidence is anything but a long-term mandate.Smith now will work on a week-to-week basis, because Singletary has voided his year-long lease on the job."I think from this point on, you have to do what's best for the football team,"Singletary said Monday, one day after threatening to bench Smith during a 27-24 loss to the visiting Eagles."""
doc2 = """The fifth edition of West Coast Green, a conference focusing on "green" home innovations and products, rolled into San Francisco's Fort Mason last week intent, per usual, on making our living spaces more environmentally friendly - one used-tire house at a time.Zero-rated buildings To that end, there were presentations on topics such as water efficiency and the burgeoning future of Net Zero-rated buildings that consume no energy and produce no carbon emissions.on a job,on the job"""

# Build the multi-document inverted index for the demo documents.
inverted = {}
documents = {'doc1': doc1, 'doc2': doc2}
for doc_id, text in documents.items():
    doc_index = inverted_index(text)
    inverted_index_add(inverted, doc_id, doc_index)

# Search for single words. The parenthesized single-argument print form
# behaves the same on Python 2 and Python 3.
print("*****search common words*****")
queries = ['Week', 'Niners', 'coast']
for query in queries:
    result_docs = search(inverted, query)
    print("Search for '%s': %r" % (query, result_docs))

# Search for phrases.
print("")
print("*****search phrases*****")
newQueries = ['Zero-rated buildings', 'on the job', 'West Coast']
for query in newQueries:
    result_docs = searchPhrase(inverted, query)
    print("Search for '%s': %r" % (query, result_docs))
1 0
原创粉丝点击