A simple 1-gram implementation of the paper "Content Importance Models for Scoring Writing From Sources"
Below is a simple unigram (1-gram) implementation of the paper's content-importance models. Five of the models are implemented; the model that relies on the reading passage is not, because I was unable to obtain the reading material. The corpus this code runs on is not published.
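All of the implemented models share one scoring rule and differ only in the word weight $W(w)$. For an essay $e$, the score computed by `getScore` below is the importance-weighted count of lecture words found in the essay, normalized by the essay's token count $n_e$:

$$\mathrm{score}(e) = \frac{1}{n_e} \sum_{w \in L} W(w)\, C(w, e)$$

where $L$ is the set of stopword-filtered lecture unigrams and $C(w, e)$ is the number of times $w$ occurs in $e$.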
```python
# -*- coding: utf-8 -*-
import sys
import re
import os

# Python 2 encoding workaround for the GBK-encoded corpus files on Windows
reload(sys)
sys.setdefaultencoding('utf8')

# Path to the lecture transcript
lecture_url = 'firstproject/lecture.txt'


class model_1_gram(object):
    # Collect the lecture's unigrams (stopwords removed) into a list
    def lecture_1_gram(self):
        with open(lecture_url, 'r') as lecture:
            # The corpus files are GBK-encoded on Windows; re-encode to UTF-8
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            # Tokenize: keep runs of letters and apostrophes
            lecture_list = re.findall('([A-Za-z\']+)', content)
        # Load the stopword list (one word per ' \n'-terminated line)
        with open('firstproject/stopword.txt', 'r') as stopword:
            stopword_content = stopword.read().strip().decode('gbk').encode('utf-8')
            stopword_content = re.split(' \n', stopword_content)
        lecture_list_new = []
        for word in lecture_list:
            if word != '' and word.lower() not in stopword_content:
                lecture_list_new.append(word)
        return lecture_list_new

    # Number of times `word` occurs in the essay at `url` -- C(w, e)
    def CxE(self, word, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            essay_list = re.findall('([A-Za-z\']+)', results)
            return essay_list.count(word)

    # Total token count of the file at `url` -- the n used for normalization
    def getNum(self, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            return len(re.findall('([A-Za-z\']+)', results))

    # Model 1 (naive): every lecture word gets weight 1
    def naive_getWeight(self):
        return 1

    # Model 2 (probability): weight = the word's relative frequency in the lecture
    def prob_getWeight(self, word):
        return self.CxE(word, lecture_url) * 1.0 / self.getNum(lecture_url)

    # Model 3 (position): weight = relative position of the word's first
    # occurrence in the lecture, so later words weigh more
    def position_getWeight(self, word):
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', results)
        n = 0
        for unit in lecture_list:
            n += 1
            if unit == word:
                break
        return n * 1.0 / self.getNum(lecture_url)

    # (Model 4 relies on the reading passage and is not implemented.)

    # Helper: fraction of the essays under the directories `urls`
    # that contain `word`
    def contain_fraction(self, word, urls):
        num = 0
        total = 0
        for url in urls:
            for single_file in os.listdir(url):
                with open(url + single_file, 'r') as essay:
                    results = essay.read().strip().decode('gbk').encode('utf-8')
                    essay_list = re.findall('([A-Za-z\']+)', results)
                    if word in essay_list:
                        num += 1
                    total += 1
        return num * 1.0 / total

    # Model 5 (good essays): weight = fraction of high-scoring essays
    # (bands 4 and 5) that contain the word
    def Good_getWeight(self, word):
        return self.contain_fraction(word, ['firstproject/essay/4/',
                                            'firstproject/essay/5/'])

    # Model 6 (good vs. bad): the good-essay fraction minus the fraction
    # of low-scoring essays (bands 1 and 2) that contain the word
    def GoodVsBad_getWeight(self, word):
        Good_W = self.contain_fraction(word, ['firstproject/essay/4/',
                                              'firstproject/essay/5/'])
        Bad_W = self.contain_fraction(word, ['firstproject/essay/1/',
                                             'firstproject/essay/2/'])
        return Good_W - Bad_W

    # Look up each lecture word's weight under the chosen model, then
    # accumulate weight * count for every essay and normalize by length
    def getScore(self):
        lecture_list = self.lecture_1_gram()
        n = 0
        while n < 5:
            n += 1
            # Directory holding the essays of score band n
            url = 'firstproject/essay/' + str(n)
            for single_file in os.listdir(url):
                path = url + '/' + single_file
                with open(path, 'r') as essay:
                    content = essay.read()
                    essay_list = re.findall('([A-Za-z\']+)', content)
                score = 0  # each essay is scored independently
                for unit in lecture_list:
                    if unit in essay_list:
                        W = self.naive_getWeight()           # model 1
                        # W = self.prob_getWeight(unit)      # model 2
                        # W = self.position_getWeight(unit)  # model 3
                        # W = self.Good_getWeight(unit)      # model 5
                        # W = self.GoodVsBad_getWeight(unit) # model 6
                        score += W * self.CxE(unit, path) * 1.0
                score /= self.getNum(path)
                print str(score) + ' '


model = model_1_gram()
model.getScore()
```
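To try a different importance model, uncomment exactly one of the `W = ...` lines in `getScore`; the script then prints one normalized score per essay, walking the score-band directories 1 through 5. As a minimal sanity check of an individual weight (a sketch assuming the `firstproject/` corpus layout above exists; `'hypothesis'` is just a hypothetical example word):

```python
# Quick, hypothetical check of single-word weights; assumes the
# firstproject/ corpus files exist and the class above is defined.
m = model_1_gram()
print m.prob_getWeight('hypothesis')      # C(w, lecture) / total lecture tokens
print m.position_getWeight('hypothesis')  # relative position of first occurrence
```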