自编卡方检验程序 (python)
来源:互联网 发布:阿里云国际版购买 编辑:程序博客网 时间:2024/06/11 17:32
由于大作业中需要,本来是想找现成的卡方检验程序的,但是没找到,尴尬,所以一怒之下自己花了一个晚上编了出来(编程水平太渣,据说大神只用一小时.....)
这里还是提一下卡方检验的处理步骤吧,虽然我在实验报告里写到了......
老师在上课时提供的ppt里这部分的例子很好,放上来:
这段程序的用处是对一类文章中出现的词进行卡方检验统计,找到每个类别中CHI值较大的一些作为本类的特征词,然后,只保留本类中每个文章中出现的这些特征词,以便后续处理。我觉得我没有说明白,之后会放上整个实验的代码和文档,里面应该对整体流程说的比较清楚.....
#!/usr/bin/python
# coding: utf8
"""Chi-square (CHI) feature-word selection for a ten-class text corpus.

Run as a script, this module:

1. reads, for every category, the number of documents and the per-word
   document frequencies from the ``wordtimes/sougou_all`` directory;
2. scores every word of every category with a chi-square style statistic
   and writes the full ranking plus the top ``KEEP_RATIO`` share of it
   to the ``CHIorder`` directory;
3. rewrites every segmented article so that only the selected feature
   words of its category remain (see ``CHI_control_text``).
"""
from __future__ import division, print_function

import io
import os

# Category names in class-number order: class N is CATEGORIES[N - 1].
CATEGORIES = (
    "aoyun",
    "fangchan",
    "jiankang",
    "jiaoyu",
    "lvyou",
    "qiche",
    "shangye",
    "shishang",
    "tiyu",
    "yule",
)

WORDTIMES_DIR = os.path.join("wordtimes", "sougou_all")
CHI_DIR = "CHIorder"
REMAIN_DIR = "text_remain"
SEGMENTED_ROOT = os.path.join("..", "seg and anno", "results", "sougou_all")

# Share of a category's vocabulary that is kept as feature words.
KEEP_RATIO = 0.015


def _ensure_dir(path):
    """Create directory *path* (and its parents) unless it already exists."""
    if path and not os.path.isdir(path):
        os.makedirs(path)


def _count_documents(directory):
    """Return the number of entries in *directory*, ignoring ``.DS_Store``."""
    return sum(1 for entry in os.listdir(directory) if entry != ".DS_Store")


def _load_doc_freqs(path):
    """Read a ``word<TAB>count`` file into a ``{word: int(count)}`` dict.

    Lines without a tab-separated second field are skipped instead of
    raising ``IndexError``.
    """
    freqs = {}
    with io.open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                continue
            freqs[fields[0]] = int(fields[1])
    return freqs


def chi_scores(category, doc_freqs, doc_totals):
    """Score every word of *category* with the chi-square style statistic.

    For a word, with documents split by "in *category* or not" and
    "contains the word or not":

    * ``a`` - documents in the category that contain the word
    * ``b`` - documents outside the category that contain the word
    * ``c`` - documents in the category that do not contain the word
    * ``d`` - documents outside the category that do not contain the word

    the score is ``(a*d - b*c)**2 / ((a + b) * (c + d))``.  This is the
    shortened form used by the original script; the factors it leaves
    out of the textbook statistic (``N``, ``a + c``, ``b + d``) are the
    same for every word of one category, so the ranking inside a
    category is not affected.

    Args:
        category: key of the category to score.
        doc_freqs: ``{category: {word: number of documents containing it}}``.
        doc_totals: ``{category: number of documents}``.

    Returns:
        ``{word: score}`` for every word in ``doc_freqs[category]``.  A
        word whose denominator is zero (it occurs in every document)
        scores ``0.0``.
    """
    own_freqs = doc_freqs[category]
    own_total = doc_totals[category]
    other_names = [name for name in doc_freqs if name != category]
    other_total = sum(doc_totals[name] for name in other_names)

    scores = {}
    for word, with_in in own_freqs.items():
        without_in = own_total - with_in
        with_out = sum(doc_freqs[name].get(word, 0) for name in other_names)
        # Every outside document that does not contain the word counts,
        # including the documents of categories where the word never occurs.
        without_out = other_total - with_out
        denominator = (with_in + with_out) * (without_in + without_out)
        if denominator == 0:
            scores[word] = 0.0
            continue
        difference = with_in * without_out - with_out * without_in
        scores[word] = difference ** 2 / denominator
    return scores


def CHI_control_text(text, ca_num, textname, categoryName):
    """Keep only the selected feature words of one segmented article.

    The feature words are read from
    ``CHIorder/class<ca_num>_CHIorder_select.txt`` (one word per line).
    Each line of *text* is split on tabs and its first field is kept when
    it is one of those words.  The kept words are written one per line,
    in article order, to ``text_remain/<textname>``.

    An article with fewer than two kept words gets no output file; an
    output file left over from an earlier run is removed in that case.

    Args:
        text: iterable of lines (an open file); ``bytes`` lines are
            decoded as UTF-8.
        ca_num: class number selecting the feature-word file.
        textname: output path relative to ``text_remain``.
        categoryName: category directory created under
            ``text_remain/sougou_all``.

    Returns:
        None.
    """
    select_path = os.path.join(
        CHI_DIR, "class" + str(ca_num) + "_CHIorder_select.txt")
    with io.open(select_path, "r", encoding="utf-8") as select_file:
        stripped = (line.strip() for line in select_file)
        feature_words = set(word for word in stripped if word)

    _ensure_dir(os.path.join(REMAIN_DIR, "sougou_all", categoryName))
    out_path = os.path.join(REMAIN_DIR, textname)
    _ensure_dir(os.path.dirname(out_path))

    kept = []
    for line in text:
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        word = line.strip().split("\t")[0].strip()
        if word in feature_words:
            kept.append(word)

    if len(kept) < 2:
        # Fewer than two feature words: the article is dropped from the
        # filtered corpus.
        if os.path.exists(out_path):
            os.remove(out_path)
        return

    with io.open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write(u"".join(word + u"\n" for word in kept))


def _write_rankings(doc_freqs, doc_totals):
    """Write the full and the truncated CHI ranking of every category."""
    _ensure_dir(CHI_DIR)
    for class_num, name in enumerate(CATEGORIES, 1):
        scores = chi_scores(name, doc_freqs, doc_totals)
        ranked = sorted(scores, key=scores.get, reverse=True)
        keep = int(KEEP_RATIO * len(ranked))
        print("从第%d类中选出%d个关键词" % (class_num, keep))

        prefix = os.path.join(CHI_DIR, "class" + str(class_num))
        # Both files are closed before anything reads them back.
        with io.open(prefix + "_CHIorder.txt", "w",
                     encoding="utf-8") as order_file:
            order_file.write(u"\n".join(ranked))
        with io.open(prefix + "_CHIorder_select.txt", "w",
                     encoding="utf-8") as select_file:
            select_file.write(u"".join(w + u"\n" for w in ranked[:keep]))


def _filter_corpus():
    """Run ``CHI_control_text`` over every article below SEGMENTED_ROOT."""
    corpus_parent = os.path.dirname(SEGMENTED_ROOT)
    for categoryName in sorted(os.listdir(SEGMENTED_ROOT)):
        # The class number comes from the category name, not from the
        # directory listing order.  Entries that are not a known category
        # (e.g. ``.DS_Store``) have no feature-word file and are skipped.
        if categoryName not in CATEGORIES:
            continue
        ca_num = CATEGORIES.index(categoryName) + 1
        categoryPath = os.path.join(SEGMENTED_ROOT, categoryName)
        for filename in sorted(os.listdir(categoryPath)):
            if filename == ".DS_Store":
                continue
            source_path = os.path.join(categoryPath, filename)
            # e.g. sougou_all/<category>/<file>, relative to the corpus parent
            textname = os.path.relpath(source_path, corpus_parent)
            with io.open(source_path, "r", encoding="utf-8") as contents:
                CHI_control_text(contents, ca_num, textname, categoryName)


def main():
    """Compute the CHI rankings, then filter every article with them."""
    doc_totals = {}
    doc_freqs = {}
    for name in CATEGORIES:
        doc_totals[name] = _count_documents(os.path.join(WORDTIMES_DIR, name))
        doc_freqs[name] = _load_doc_freqs(
            os.path.join(WORDTIMES_DIR, name + "_classtimes.txt"))

    _write_rankings(doc_freqs, doc_totals)
    _filter_corpus()
    print("CHI_run is finished!")


if __name__ == "__main__":
    main()
0 0
- 自编卡方检验程序 (python)
- 【课堂程序整理】检验自幂数(由水仙花数扩展)
- python CRC16检验
- Python正态性检验
- python T检验
- python之自练习小程序(循环)
- 【身份证合法性检查程序】(计算最后一位检验码)
- 重庆退料检验PLSQL程序段(第二版)
- 中国居民身份证号码检验程序
- 括号匹配的检验程序
- 实现程序完整性检验方法
- 采购接收检验入库程序
- 利用python进行T检验
- python进行JB正态性检验
- 八卦一下模型检验(二) (转自g9老大的博文)
- python程序(scrapy爬虫)在windows环境下开机定时自启动
- 关于python自增运算(千万不要用++i,不然程序崩掉)
- 单置换检验,轮廓检验及其他程序
- C语言自动关机小程序300s
- Android组件化、模块化开发
- 手把手教你用Spring Cloud和Docker构建微服务
- 基础温习 - MFC _cstatusBar
- 面向协议编程与 Cocoa 的邂逅
- 自编卡方检验程序 (python)
- Java 深度克隆与浅克隆
- gitlab的一键安装包bitnami
- 【t074】上学路线
- loadrunner问题备忘
- c 语言宏定义 #define 的理解与资料整理
- 在Linux里设置环境变量的方法(export PATH)
- weblogic查看访问工程地址
- check the manual that corresponds to your MySQL server version for the right syntax to use near '?,?