基于增量的矩阵聚类

来源:互联网 发布:中华人软件下载 编辑:程序博客网 时间:2024/06/05 14:24
"""Match user questions to the most similar known answer with word2vec.

Every sentence is cleaned, segmented with jieba and turned into the mean of
its word2vec word vectors.  All question vectors and all answer vectors are
L2-normalised and stacked into two matrices, so a single matrix product
yields every question/answer cosine similarity at once.  For each question
the best-scoring answer is appended to the output file.
"""

import math
import re
import time

import jieba
import numpy as np
from gensim import corpora, models, similarities
from gensim.models import word2vec
from scipy import spatial
from zhon.hanzi import punctuation

# In[1]:

# Input / output locations used by the pipeline at the bottom of the file.
class_classpath = './classification_classification.txt'
class_trainpath = './classification_trainData.txt'
intention_classpath = './intention_classification.txt'
intention_trainpath = './intention_trainData.txt'
q_a_path = './user_quse_ans_scrawl.txt'
# ques_ansid_path = './faq_0.9/ques_ansid_v10.txt'
# ques_intid_path = './faq_0.9/ques_intid_v10.txt'
ques_ans_path = './ques_ans_scrawl_from_matrix_v2.txt'
# qq_path = './faq_0.7-0.8/qq_dui_v1.txt'

jieba.load_userdict('/export/user/shizhengxin/tf-idf/new_dict_pro.txt')
w2v = word2vec.Word2Vec.load('/export/user/shizhengxin/word2vec/word2vec_test_v7.model')

# Compiled once; throw_dirty() is called for every input line.
_PUNCTUATION_RE = re.compile("[%s]+" % punctuation)
# Raw string: same character class as before (ASCII letters, digits and
# ASCII punctuation including the backslash) without invalid string escapes.
_ASCII_NOISE_RE = re.compile(
    r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%\-\_]"
)


def new_sent2vec(s):
    """Return the mean word2vec vector of the tokens in *s*.

    Args:
        s: iterable of tokens (already segmented words).

    Returns:
        The element-wise mean of the vectors of all tokens known to the
        model, or an all-zero vector of the model's dimensionality when no
        token is in the vocabulary.
    """
    vectors = []
    for word in s:
        try:
            vectors.append(w2v.wv[word])
        except KeyError:
            # Out-of-vocabulary token: skip it, but let real errors surface.
            continue
    if not vectors:
        # Use the model's own size so the fallback always stacks with real
        # sentence vectors, whatever dimension the model was trained with.
        return np.zeros(w2v.wv.vector_size)
    stacked = np.array(vectors)
    return stacked.sum(axis=0) / stacked.shape[0]


def throw_dirty(sentence):
    """Strip punctuation, ASCII letters/digits/symbols and spaces from *sentence*."""
    content = _PUNCTUATION_RE.sub("", sentence)
    content = _ASCII_NOISE_RE.sub("", content)
    return content.replace(' ', '')


def get_answer_intention_id(class_classpath, intention_classpath):
    """Load the known answers and the intention labels.

    Each line of *class_classpath* is split on ``----``; lines with fewer
    than four fields are skipped.  Field 3 is the answer text and field 1
    is stored as its intention.  Each line of *intention_classpath* is one
    intention label.

    Returns:
        A tuple ``(ans_id, ans_int, int_id, ans_jieba)`` where
        ``ans_id`` maps answer text to a running index over accepted lines,
        ``ans_int`` maps answer text to field 1 of its line,
        ``int_id`` maps each stripped intention line to its line index, and
        ``ans_jieba`` maps answer text to its sentence vector.
    """
    ans_id = {}
    ans_int = {}
    ans_jieba = {}
    int_id = {}
    # Open both files up front so a missing file fails before any work is done.
    with open(class_classpath, 'r', encoding='utf-8') as class_file, \
            open(intention_classpath, 'r', encoding='utf-8') as intention_file:
        accepted = 0
        for line in class_file:
            fields = line.strip().split('----')
            if len(fields) < 4:
                continue
            answer = fields[3]
            ans_id[answer] = accepted
            ans_jieba[answer] = new_sent2vec(list(jieba.cut(throw_dirty(answer))))
            ans_int[answer] = fields[1]
            accepted += 1
        for index, line in enumerate(intention_file):
            int_id[line.strip()] = index
    return ans_id, ans_int, int_id, ans_jieba


ans_id, ans_int, int_id, ans_jieba = get_answer_intention_id(class_classpath, intention_classpath)
print(int_id)


def cos_dist(a, b):
    """Return the cosine similarity of *a* and *b* (0 if either has zero norm).

    The two sequences are paired with ``zip``, so extra trailing elements
    of the longer one are ignored.
    """
    part_up = 0.0
    a_sq = 0.0
    b_sq = 0.0
    for a1, b1 in zip(a, b):
        part_up += a1 * b1
        a_sq += a1 ** 2
        b_sq += b1 ** 2
    part_down = math.sqrt(a_sq * b_sq)
    if part_down == 0.0:
        return 0
    return part_up / part_down


def get_rawdata(q_a_path):
    """Load the raw records to be matched.

    Each line of *q_a_path* is split on ``----`` and must have exactly two
    fields; other lines are skipped.  Field 1 is the text that gets
    vectorised, field 0 is the value associated with it.

    Returns:
        A tuple ``(q_jieba, q_a_dui)`` where ``q_jieba`` maps field 1 to its
        sentence vector and ``q_a_dui`` maps field 1 to the set of all
        field-0 values seen with it.
    """
    q_jieba = {}
    q_a_dui = {}
    with open(q_a_path, 'r', encoding='utf-8') as source:
        for line in source:
            fields = line.strip().split('----')
            if len(fields) != 2:
                continue
            left, text = fields
            q_jieba[text] = new_sent2vec(list(jieba.cut(throw_dirty(text))))
            q_a_dui.setdefault(text, set()).add(left)
    return q_jieba, q_a_dui


q_jieba, q_a_dui = get_rawdata(q_a_path)
print(1)


def _unit_vector(vec):
    """Return *vec* scaled to unit L2 norm; a zero vector stays all zeros."""
    arr = np.asarray(vec)
    norm = np.linalg.norm(arr)
    if norm == 0.0:
        # Dividing would produce NaNs (and a RuntimeWarning); a zero row
        # gives the same similarity of 0 against every other vector.
        return np.zeros_like(arr, dtype=float)
    return arr / norm


def get_matrix(q_jieba, ans_jieba):
    """Stack the normalised question and answer vectors into matrices.

    Args:
        q_jieba: mapping of question text to sentence vector.
        ans_jieba: mapping of answer text to sentence vector.

    Returns:
        A tuple ``(q_mat, ans_mat, q_dict, ans_dict)``: two ``np.matrix``
        objects with one unit-length row per entry (in dict iteration
        order), and two dicts mapping row index back to the original text.
    """
    q_rows = []
    q_dict = {}
    for row, question in enumerate(q_jieba):
        q_rows.append(_unit_vector(q_jieba[question]))
        q_dict[row] = question

    ans_rows = []
    ans_dict = {}
    for row, answer in enumerate(ans_jieba):
        ans_rows.append(_unit_vector(ans_jieba[answer]))
        ans_dict[row] = answer

    # np.asmatrix is what np.mat aliased; np.mat itself is gone in NumPy 2.0.
    return np.asmatrix(q_rows), np.asmatrix(ans_rows), q_dict, ans_dict


def get_user_data(q_jieba, q_a_dui, ans_id, ans_jieba, int_id, ans_int, ques_ans_path):
    """Append the best-matching answer for every question to *ques_ans_path*.

    Computes the full answer-by-question similarity matrix with one matrix
    product of the normalised vectors, picks the highest-scoring answer per
    question, and writes one ``value----answer----score`` line for every
    value stored for that question in *q_a_dui*.

    Args:
        q_jieba: mapping of question text to sentence vector.
        q_a_dui: mapping of question text to the set of values to emit.
        ans_id: unused; kept for call compatibility.
        ans_jieba: mapping of answer text to sentence vector.
        int_id: unused; kept for call compatibility.
        ans_int: unused; kept for call compatibility.
        ques_ans_path: output file, opened in append mode.
    """
    if not q_jieba or not ans_jieba:
        # Nothing to compare; argmax over an empty axis would raise.
        return

    q_mat, ans_mat, q_dict, ans_dict = get_matrix(q_jieba, ans_jieba)
    # Rows are unit vectors, so the dot product is the cosine similarity.
    sta_user_mat = np.dot(ans_mat, q_mat.T)
    sta_user_mat = np.nan_to_num(sta_user_mat)
    print(sta_user_mat)
    # Shape (1, n_questions): row index of the best answer for each column.
    mat_index = np.argmax(sta_user_mat, axis=0)
    print(mat_index.shape)
    print(mat_index)

    with open(ques_ans_path, 'a', encoding='utf-8') as w_qans:
        for column in range(sta_user_mat.shape[1]):
            best_row = mat_index[0, column]
            best_answer = ans_dict[best_row]
            score = str(sta_user_mat[best_row, column])
            for k in q_a_dui[q_dict[column]]:
                w_qans.write(k + '----' + best_answer + '----' + score + '\n')


get_user_data(q_jieba, q_a_dui, ans_id, ans_jieba, int_id, ans_int, ques_ans_path)
原创粉丝点击