简单的文本相似性测量（Python实现）

来源：互联网 | 发布：青岛海尔软件地址 | 编辑：程序博客网 | 时间：2024/06/05 04:48

一、数据集与测试集

数据集：

测试集：

imaging databases

二、Python代码

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 09:49:25 2015

@author: Administrator

Find the stored post that is most similar to a short query string.

Every regular file in ``data_dir`` is read as one post.  The posts are turned
into stemmed bag-of-words count vectors (English stop words removed), the
query is projected into the same vocabulary, and the post with the smallest
Euclidean distance between length-normalized vectors is reported.
"""
import os
import sys

import nltk.stem as stm
import numpy
import scipy as sp
import scipy.linalg  # import the submodule explicitly so sp.linalg resolves
from sklearn.feature_extraction.text import CountVectorizer

english_stemmer = stm.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token with the Snowball
    English stemmer, so inflected forms of a word share one feature."""

    def build_analyzer(self):
        """Return a callable that maps a document to a generator of stems.

        The parent analyzer performs the tokenization configured on the
        vectorizer; each token it yields is then passed through
        ``english_stemmer.stem``.
        """
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


def dist_raw(v1, v2):
    """Return the Euclidean norm of ``v1 - v2``.

    Both arguments must support subtraction and ``.toarray()`` (the rows
    returned by the vectorizer do).  Raw counts are compared, so a post that
    repeats the same text several times ends up farther away than a single
    copy of it.
    """
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())


def dist_norm(v1, v2):
    """Return the Euclidean norm of the difference of the two vectors after
    each has been scaled to unit length.

    Scaling removes the effect of document length: a post and the same post
    repeated three times get the same distance to the query.

    NOTE(review): a vector with no non-zero entry has norm 0, so the
    division below divides by zero -- confirm how the installed SciPy
    handles that case if such input is possible.
    """
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())


# Raw string: same path as before, without relying on the invalid "\d" escape.
# Named data_dir rather than dir so the builtin dir() is not shadowed.
data_dir = r"E:\data"

# Read every regular file as one post; sub-directories are skipped because
# open() cannot read them, and each file is closed as soon as it is read.
posts = []
for file_name in os.listdir(data_dir):
    file_path = os.path.join(data_dir, file_name)
    if not os.path.isfile(file_path):
        continue
    with open(file_path) as post_file:
        posts.append(post_file.read())

vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
x_train = vectorizer.fit_transform(posts)

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

# float("inf") replaces sys.maxint, which no longer exists on Python 3.
best_dist = float("inf")
best_i = None
for i, post in enumerate(posts):
    # A post identical to the query would trivially win; skip it.
    if post == new_post:
        continue
    post_vec = x_train.getrow(i)
    # dist_raw(post_vec, new_post_vec) is the un-normalized alternative.
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dis=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i

if best_i is None:
    # Nothing was selected (every post skipped or not comparable); the
    # summary line below would fail formatting None with %i.
    print("No comparable post found")
else:
    print("Best post is %i with dist=%.2f" % (best_i, best_dist))

三、结果
=== Post 0 with dis=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff
=== Post 1 with dis=0.61: Imaging databases can get huge.
=== Post 2 with dis=0.63: Most imaging databases safe images permanently
=== Post 3 with dis=0.52: Imaging databases store images.
=== Post 4 with dis=0.52: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 3 with dist=0.52

0 0