Find the top 5 similar issues in a Bugzilla system

来源:互联网 发布:webpack压缩单个js文件 编辑:程序博客网 时间:2024/06/08 00:55
#!/usr/bin/env python


'''
Created on Aug, 2017
@author: menghl
'''


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords




# --------------get the basic data--------------------
def get_data(path='/Users/mhl/Desktop/bug.xlsx'):
    """Load the bug spreadsheet and keep only the columns used downstream.

    Args:
        path: Location of the Excel workbook to read. The default is the
            path this script was originally written against, so a bare
            ``get_data()`` call behaves as before.

    Returns:
        A new DataFrame holding exactly the ``BugID`` and ``Summary``
        columns, in that order.

    Raises:
        KeyError: If the workbook has no ``BugID`` or ``Summary`` column.
    """
    bug_df = pd.read_excel(path)
    # Copy so that later in-place edits never write back into the full sheet.
    return bug_df.loc[:, ['BugID', 'Summary']].copy()




# ------------delete the data in []------------------
def del_data_part1(bug_pre):
    """Strip every ``[...]`` tag from the ``Summary`` column, in place.

    The match is non-greedy, so each bracketed group is removed on its own
    and the text between two groups is kept.

    Args:
        bug_pre: DataFrame with a ``Summary`` column. It is modified in
            place; the row index may be of any kind.

    Returns:
        None.
    """
    bug_re = re.compile(r'\[.*?\]')

    def strip_tags(summary):
        # Empty spreadsheet cells arrive as NaN/None rather than str;
        # pass them through instead of crashing inside re.sub.
        if not isinstance(summary, str):
            return summary
        return bug_re.sub('', summary)

    # Column-wise map instead of .loc[0..n-1] lookups, which only work when
    # the frame still has its default 0..n-1 integer index.
    bug_pre['Summary'] = bug_pre['Summary'].map(strip_tags)




# -------------tokenize the bug_pre-----------------
def tokenize(bug_pre):
    """Split every summary into lowercase tokens.

    Args:
        bug_pre: DataFrame with a ``Summary`` column of strings.

    Returns:
        A list with one entry per row, each entry being the result of
        ``nltk.word_tokenize`` applied to the lowercased summary.
    """
    summaries = bug_pre.loc[:, 'Summary']
    return [nltk.word_tokenize(text.lower()) for text in summaries]




# ------------------delete non-important words--------------------
def del_words(bug_list):
    """Filter every token list in ``bug_list``, in place.

    Each entry is passed through ``del_stopwords``, ``del_spwords`` and
    ``del_puncwords``, in that order, and the result is stored back at the
    same position.

    Args:
        bug_list: List of token lists; modified in place.

    Returns:
        None.
    """
    for position, tokens in enumerate(bug_list):
        without_stopwords = del_stopwords(tokens)
        without_special = del_spwords(without_stopwords)
        bug_list[position] = del_puncwords(without_special)




# ------------delete stopwords------------------------
def del_stopwords(bug_item):
    """Return the tokens of ``bug_item`` that are not English stopwords.

    Args:
        bug_item: Iterable of string tokens.

    Returns:
        A new list with the tokens found in NLTK's English stopword list
        removed; the order of the remaining tokens is unchanged.
    """
    # Fetch the list once and keep it in a set: the previous version
    # evaluated stopwords.words('english') again for every single token.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in bug_item if word not in english_stopwords]




# ------------delete special words for lenovo------------------------
def del_spwords(bug_item):
    """Drop month abbreviations and a few project-specific words.

    Matching ignores case. ``tokenize`` lowercases every token before this
    filter runs, so an exact-case comparison against 'Jan', 'Flex', 'IO',
    ... could never match anything except 'block'.

    Args:
        bug_item: Iterable of string tokens.

    Returns:
        A new list without the tokens that match one of the special words;
        the order of the remaining tokens is unchanged.
    """
    month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    other_words = ['block', 'Flex', 'IO']
    # Normalise once so each token costs a single set lookup.
    spwords = frozenset(word.lower() for word in month + other_words)
    return [word for word in bug_item if word.lower() not in spwords]




# ------------delete punctuation------------------------------
def del_puncwords(bug_item):
    """Remove punctuation-only and whitespace-only tokens.

    Args:
        bug_item: Iterable of tokens.

    Returns:
        A new list holding, in order, every token that is not one of the
        listed punctuation / empty / whitespace strings.
    """
    puncwords = ('', '\n', '\t', ',', '.', ':', ';', '?', '(', ')',
                 '[', ']', '&', '!', '*', '@', '#', '$', '%')
    kept = []
    for token in bug_item:
        if token in puncwords:
            continue
        kept.append(token)
    return kept




# ---------calculate the tf-idf value of this model--------------
def cal_tfidf(bug_list):
    """Build the dictionary, the TF-IDF model and the weighted corpus.

    Args:
        bug_list: List of token lists, one per bug.

    Returns:
        A 3-tuple ``(bug_dict, tfidf, corpus_tfidf)``: the gensim
        ``Dictionary`` built from ``bug_list``, the ``TfidfModel`` built
        from the bag-of-words corpus, and that corpus passed through the
        model.
    """
    dictionary = corpora.Dictionary(bug_list)
    bow_corpus = []
    for tokens in bug_list:
        bow_corpus.append(dictionary.doc2bow(tokens))
    model = models.TfidfModel(bow_corpus)
    return dictionary, model, model[bow_corpus]




# ---------calculate the similarities of the new input-----------
def cal_sim(test_query, tfidf, corpus_tfidf):
    """Rank the corpus against a query and return the five best matches.

    Args:
        test_query: Bag-of-words vector of the query.
        tfidf: TF-IDF model used to weight the query.
        corpus_tfidf: TF-IDF weighted corpus to search.

    Returns:
        Up to five ``(corpus_position, similarity)`` pairs, highest
        similarity first.
    """
    query_vec = tfidf[test_query]
    sim_index = similarities.MatrixSimilarity(corpus_tfidf)
    ranked = list(enumerate(sim_index[query_vec]))
    # Stable descending sort on the similarity score.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:5]




# -------------display data---------------------
def display(top5_sims, bug_pre):
    """Print the ranked matches, then the row behind each match.

    Args:
        top5_sims: Sequence of entries whose first element is a row label
            of ``bug_pre``.
        bug_pre: DataFrame the row labels refer to.

    Returns:
        None.
    """
    print(top5_sims)
    for entry in top5_sims:
        row = bug_pre.loc[entry[0]]
        print(row, '\n')




# -------------test one-------------------------
def test_bug_list(bug_list):
    """Return the second entry of ``bug_list`` (position 1) as a fixed query."""
    second_entry = bug_list[1]
    return second_entry




# -------------test two-------------------------
def test_str_input():
    """Return the fixed query string ``'expander'``."""
    query = 'expander'
    return query




# -------------test three-------------------------
def test3(bug_pre, bug_list):
    """Prompt for a bug ID and return the token list of that bug.

    Args:
        bug_pre: DataFrame with a ``BugID`` column, row-aligned with
            ``bug_list``.
        bug_list: Token lists, one per row of ``bug_pre``.

    Returns:
        The entry of ``bug_list`` at the position of the first row whose
        ``BugID`` matches the ID typed by the user.

    Raises:
        ValueError: If no row carries the requested bug ID.
    """
    bug_num = input('Please enter the bug number you check:')
    print(bug_num)
    # On Python 3 input() returns text, whereas IDs read from the workbook
    # may be numbers (e.g. 41414); compare string forms so both kinds match.
    wanted = str(bug_num).strip()
    bug_id_list = list(bug_pre['BugID'])
    for index, bug_id in enumerate(bug_id_list):
        if str(bug_id).strip() == wanted:
            break
    else:
        raise ValueError('Bug ID %r not found in the BugID column' % (bug_num,))
    bug_test = bug_list[index]
    print('bug_test', bug_test)
    return bug_test




# -------------Main--------------------------------
def main():
    """Run the pipeline: load, clean, tokenize, model, query and display.

    The query is chosen interactively through ``test3`` and the five most
    similar bugs are printed through ``display``.
    """
    bugs = get_data()
    del_data_part1(bugs)

    token_lists = tokenize(bugs)
    del_words(token_lists)

    dictionary, model, weighted_corpus = cal_tfidf(token_lists)

    query_tokens = test3(bugs, token_lists)
    query_bow = dictionary.doc2bow(query_tokens)

    best_matches = cal_sim(query_bow, model, weighted_corpus)
    display(best_matches, bugs)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
原创粉丝点击