基于增量的矩阵聚类
来源:互联网 发布:中华人软件下载 编辑:程序博客网 时间:2024/06/05 14:24
"""Match user questions to the closest known answer by sentence-vector cosine.

Pipeline (runs at import time, as in the original script):
  1. Load answers and the intention list, embed every answer.
  2. Load raw (key, text) pairs, embed every distinct text.
  3. Build L2-normalised matrices, take the answer with the highest cosine
     similarity for each text, and append the matches to ``ques_ans_path``.
"""
import math
import re
import time

import jieba
import numpy as np
from gensim import corpora, models, similarities
from gensim.models import word2vec
from scipy import spatial
from zhon.hanzi import punctuation

class_classpath = './classification_classification.txt'
class_trainpath = './classification_trainData.txt'
intention_classpath = './intention_classification.txt'
intention_trainpath = './intention_trainData.txt'
q_a_path = './user_quse_ans_scrawl.txt'
ques_ans_path = './ques_ans_scrawl_from_matrix_v2.txt'

jieba.load_userdict('/export/user/shizhengxin/tf-idf/new_dict_pro.txt')
w2v = word2vec.Word2Vec.load('/export/user/shizhengxin/word2vec/word2vec_test_v7.model')

# Length of the all-zero fallback vector returned by new_sent2vec.
# NOTE(review): presumably equals the word2vec model's vector size -- confirm.
_FALLBACK_DIM = 300

# Patterns are compiled once; the character sets are the same as in the
# original inline patterns (the second one is now a raw string so that Python
# no longer warns about invalid escape sequences).
_HANZI_PUNCT_RE = re.compile("[%s]+" % punctuation)
_ASCII_NOISE_RE = re.compile(
    r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%\-\_]"
)


def new_sent2vec(s):
    """Return the mean word2vec vector of the tokens in *s*.

    Args:
        s: iterable of tokens (the callers pass ``list(jieba.cut(...))``).

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        model, or a plain list of 300 zeros when no token is found.
    """
    vectors = []
    for token in s:
        try:
            vectors.append(w2v.wv[token])
        except KeyError:
            # Out-of-vocabulary token: skip it.
            continue
    if not vectors:
        return [0] * _FALLBACK_DIM
    stacked = np.array(vectors)
    return stacked.sum(axis=0) / stacked.shape[0]


def throw_dirty(sentence):
    """Strip punctuation, ASCII letters/digits/symbols and spaces from *sentence*.

    Args:
        sentence: raw text.

    Returns:
        *sentence* with every ``zhon.hanzi.punctuation`` character, every
        character matched by ``_ASCII_NOISE_RE`` and every space removed.
    """
    without_punct = _HANZI_PUNCT_RE.sub("", sentence)
    without_ascii = _ASCII_NOISE_RE.sub("", without_punct)
    return without_ascii.replace(' ', '')


def get_answer_intention_id(class_classpath, intention_classpath):
    """Load answers and intentions and embed every answer.

    Each line of *class_classpath* is split on ``'----'``; lines with fewer
    than four fields are skipped. Field 3 is used as the answer text and
    field 1 is stored as its value in ``ans_int`` (presumably the intention
    label -- confirm against the data). Each stripped line of
    *intention_classpath* is mapped to its 0-based line index.

    Args:
        class_classpath: path of the UTF-8 answer file.
        intention_classpath: path of the UTF-8 intention list file.

    Returns:
        ``(ans_id, ans_int, int_id, ans_jieba)`` where
        ``ans_id`` maps answer -> running index over accepted lines,
        ``ans_int`` maps answer -> field 1 of its line,
        ``int_id`` maps stripped intention line -> line index,
        ``ans_jieba`` maps answer -> sentence vector from ``new_sent2vec``.

    Raises:
        OSError: if either file cannot be opened.
    """
    ans_id = {}
    ans_int = {}
    ans_jieba = {}
    with open(class_classpath, 'r', encoding='utf-8') as answers_file, \
            open(intention_classpath, 'r', encoding='utf-8') as intentions_file:
        accepted = 0
        for line in answers_file:
            fields = line.strip().split('----')
            if len(fields) < 4:
                continue
            answer = fields[3]
            # A repeated answer keeps the index of its last occurrence.
            ans_id[answer] = accepted
            ans_jieba[answer] = new_sent2vec(list(jieba.cut(throw_dirty(answer))))
            ans_int[answer] = fields[1]
            accepted += 1
        int_id = {line.strip(): idx for idx, line in enumerate(intentions_file)}
    return ans_id, ans_int, int_id, ans_jieba


ans_id, ans_int, int_id, ans_jieba = get_answer_intention_id(class_classpath, intention_classpath)
print(int_id)


def cos_dist(a, b):
    """Return the cosine similarity of *a* and *b*.

    Args:
        a: iterable of numbers.
        b: iterable of numbers; pairs are formed with ``zip``, so extra
            elements of the longer input are ignored.

    Returns:
        ``dot(a, b) / sqrt(|a|^2 * |b|^2)``, or ``0`` when the denominator
        is zero.
    """
    part_up = 0.0
    a_sq = 0.0
    b_sq = 0.0
    for a1, b1 in zip(a, b):
        part_up += a1 * b1
        a_sq += a1 ** 2
        b_sq += b1 ** 2
    part_down = math.sqrt(a_sq * b_sq)
    if part_down == 0.0:
        return 0
    return part_up / part_down


def get_rawdata(q_a_path):
    """Load ``field0----field1`` pairs and embed every distinct ``field1``.

    Lines that do not split into exactly two fields on ``'----'`` are skipped.

    Args:
        q_a_path: path of the UTF-8 input file.

    Returns:
        ``(q_jieba, q_a_dui)`` where ``q_jieba`` maps field 1 -> sentence
        vector from ``new_sent2vec`` and ``q_a_dui`` maps field 1 -> set of
        all field-0 values seen with it.

    Raises:
        OSError: if the file cannot be opened.
    """
    q_jieba = {}
    q_a_dui = {}
    with open(q_a_path, 'r', encoding='utf-8') as pairs_file:
        for line in pairs_file:
            fields = line.strip().split('----')
            if len(fields) != 2:
                continue
            key, text = fields
            q_jieba[text] = new_sent2vec(list(jieba.cut(throw_dirty(text))))
            q_a_dui.setdefault(text, set()).add(key)
    return q_jieba, q_a_dui


q_jieba, q_a_dui = get_rawdata(q_a_path)
print(1)


def _unit_vector(vec):
    """Return *vec* scaled to unit L2 norm; a zero vector stays all zeros."""
    arr = np.asarray(vec)
    norm = np.linalg.norm(arr)
    if norm == 0:
        # Avoid 0/0: dividing would yield a NaN row and a RuntimeWarning.
        return np.zeros(arr.shape, dtype=float)
    return arr / norm


def get_matrix(q_jieba, ans_jieba):
    """Stack the L2-normalised vectors of both dicts into matrices.

    Args:
        q_jieba: mapping text -> vector.
        ans_jieba: mapping answer -> vector.

    Returns:
        ``(q_mat, ans_mat, q_dict, ans_dict)``: two ``numpy.matrix`` objects
        with one normalised row per key (in dict iteration order) and two
        dicts mapping row index -> key.
    """
    q_rows = []
    q_dict = {}
    for row, question in enumerate(q_jieba):
        q_rows.append(_unit_vector(q_jieba[question]))
        q_dict[row] = question

    ans_rows = []
    ans_dict = {}
    for row, answer in enumerate(ans_jieba):
        ans_rows.append(_unit_vector(ans_jieba[answer]))
        ans_dict[row] = answer

    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0; the
    # returned type (numpy.matrix) is unchanged.
    return np.asmatrix(q_rows), np.asmatrix(ans_rows), q_dict, ans_dict


def get_user_data(q_jieba, q_a_dui, ans_id, ans_jieba, int_id, ans_int, ques_ans_path):
    """Append the best-matching answer for every text to *ques_ans_path*.

    For each key of *q_jieba* the answer with the highest dot product of
    normalised vectors is selected, and one line
    ``<k>----<answer>----<score>`` is appended for every ``k`` in
    ``q_a_dui[text]``. The similarity matrix and the arg-max indices are
    printed, as in the original script.

    Args:
        q_jieba: mapping text -> vector.
        q_a_dui: mapping text -> set of strings to emit for that text.
        ans_id: unused; kept for caller compatibility.
        ans_jieba: mapping answer -> vector.
        int_id: unused; kept for caller compatibility.
        ans_int: unused; kept for caller compatibility.
        ques_ans_path: output file, opened in append mode as UTF-8.

    Raises:
        OSError: if the output file cannot be opened.

    NOTE: an earlier, commented-out variant (cosine threshold >= 0.9 writing
    answer ids and intention ids to separate files) has been removed.
    """
    if not q_jieba or not ans_jieba:
        # Nothing to match; the matrix product below would fail on empty input.
        return

    q_mat, ans_mat, q_dict, ans_dict = get_matrix(q_jieba, ans_jieba)
    # Rows are answers, columns are texts.
    sta_user_mat = np.nan_to_num(np.dot(ans_mat, q_mat.T))
    print(sta_user_mat)
    # numpy.matrix keeps two dimensions, so this has shape (1, n_texts).
    mat_index = np.argmax(sta_user_mat, axis=0)
    print(mat_index.shape)
    print(mat_index)

    with open(ques_ans_path, 'a', encoding='utf-8') as w_qans:
        for column in range(sta_user_mat.shape[1]):
            best_row = mat_index[0, column]
            best_answer = ans_dict[best_row]
            score = str(sta_user_mat[best_row, column])
            for k in q_a_dui[q_dict[column]]:
                w_qans.write(k + '----' + best_answer + '----' + score + '\n')


get_user_data(q_jieba, q_a_dui, ans_id, ans_jieba, int_id, ans_int, ques_ans_path)
阅读全文
0 0
- 基于增量的矩阵聚类
- 基于Heritrix的增量抓取
- 基于Heritrix的增量抓取
- 基于jenkins的增量发布
- 增量矩阵与其转置矩阵的乘积_baidu_2016_09_13-3
- 增量矩阵与其转置矩阵的乘积_baidu_20160913-3
- 基于增量更新的协同过滤
- doc2vec的基于新文本内容的增量训练方案
- 基于CNN的增量学习论文的读后感
- MySQL数据库备份还原(基于binlog的增量备份)
- mysql备份还原-基于binlog的增量备份还原
- Kettle CDC(增量的实现)---基于快照实现
- 【Oracle】基于SCN的增量备份修复DataGuard GAP
- 基于MYSQL的Binlog增量数据同步服务
- 基于增量PID算法的无人机跟踪模块
- MySQL数据库备份还原(基于binlog的增量备份)
- MySQL数据库备份还原(基于binlog的增量备份)
- MySQL数据库备份还原(基于binlog的增量备份)
- 1027. 打印沙漏(20)
- STM32多串口共用printf打印串口数据
- HDOJ 1395 2^x mod n = 1
- 测试Java的静态代码快执行时机
- Leetcode:Triangle
- 基于增量的矩阵聚类
- hdu 5903
- 【Mybatis学习】Mybatis框架中的动态sql
- GitHub 上开源的区块链项目 90% 死亡了
- 【Linux】Shell
- 数据结构-栈--进制转换
- 质因子分解问题
- 打印100~200 之间的素数
- 习题 6 6.6