A simple 1-gram implementation based on the paper "Content Importance Models for Scoring Writing From Sources"


Below is a simple implementation of the paper's models at the 1-gram level. Five of the models are implemented; the model that relies on the reading passage is not, because I could not obtain the reading material. The corpus that this code runs on is not published.
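
All five weighting schemes plug into the same scoring rule; only the per-word weight w(x) changes. For an essay E, the script below computes, roughly,

\[
\operatorname{score}(E) \;=\; \frac{1}{n(E)} \sum_{x \in L,\; C(x,E) > 0} w(x)\, C(x, E)
\]

where L is the list of lecture content words (stopwords removed, repeats kept), C(x, E) is the number of times word x occurs in the essay, and n(E) is the essay's total token count.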




# -*- coding: utf-8 -*-
# Python 2 script; the corpus files are GBK-encoded, hence the decode/encode calls.
import sys
import re
import os
import jieba  # not actually used here: the lecture and essays are English text

reload(sys)
sys.setdefaultencoding('utf8')

# Path to the lecture transcript
lecture_url = 'firstproject/lecture.txt'


class model_1_gram(object):

    # Collect the unigram word list of the lecture (stopwords removed) and return it as a list.
    def lecture_1_gram(self):
        with open('firstproject/lecture.txt', 'r') as lecture:
            # The files are saved with GBK encoding on Windows; convert to UTF-8 here.
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            # Tokenize into runs of letters/apostrophes.
            # (An earlier attempt split on spaces, commas and periods:
            #  lecture_list = re.split(', |\. | "|" |\.|\n| ', content))
            lecture_list = re.findall('([A-Za-z\']+)', content)
            # Remove stopwords.
            with open('firstproject/stopword.txt', 'r') as stopword:
                stopword_content = stopword.read().strip().decode('gbk').encode('utf-8')
                stopword_content = re.split('  \n', stopword_content)
                lecture_list_new = []
                for word in lecture_list:
                    if word.lower() not in stopword_content:
                        if word != '':
                            lecture_list_new.append(word)
        return lecture_list_new

    # Return the number of times `word` occurs in the text at `url` (C(x, E)).
    def CxE(self, word, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            essay_list = re.findall('([A-Za-z\']+)', results)
            num = essay_list.count(word)
            return num

    # Return the total number of tokens in the text at `url`, i.e. n
    # (used for both the lecture and the essays).
    def getNum(self, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', results)
            num = len(lecture_list)
            return num

    # Model 1 weight: every lecture word gets the same weight.
    def naive_getWeight(self):
        return 1

    # Model 2 weight: relative frequency of the word inside the lecture.
    def prob_getWeight(self, word):
        num = self.CxE(word, lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = num * 1.0 / lecture_num
        return W

    # Model 3 weight: position of the word's first occurrence in the lecture,
    # normalized by the lecture length.
    def position_getWeight(self, word):
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', results)
            lecture_list_new = []
            for i in lecture_list:
                if i != '':
                    lecture_list_new.append(i)
            n = 0
            for unit in lecture_list_new:
                n += 1
                if unit == word:
                    break
            lecture_num = self.getNum(lecture_url)
            W = n * 1.0 / lecture_num
            return W

    # Model 5 weight: fraction of "good" essays (scores 4 and 5) that contain the word.
    def Good_getWeight(self, word):
        url4 = 'firstproject/essay/4/'
        url5 = 'firstproject/essay/5/'
        # List the files in the score-4 and score-5 essay directories.
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            with open(url4 + single_file4, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
        for single_file5 in file_name5:
            with open(url5 + single_file5, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
        W = num * 1.0 / total
        return W

    # Model 6 weight: fraction of good essays (scores 4/5) containing the word
    # minus the fraction of bad essays (scores 1/2) containing it.
    def GoodVsBad_getWeight(self, word):
        url4 = 'firstproject/essay/4/'
        url5 = 'firstproject/essay/5/'
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            with open(url4 + single_file4, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
        for single_file5 in file_name5:
            with open(url5 + single_file5, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
        Good_W = num * 1.0 / total
        # Above is the "good" fraction; below is the "bad" fraction.
        url1 = 'firstproject/essay/1/'
        url2 = 'firstproject/essay/2/'
        file_name1 = os.listdir(url1)
        file_name2 = os.listdir(url2)
        Bad_num = 0
        Bad_total = 0
        for single_file1 in file_name1:
            with open(url1 + single_file1, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    Bad_num += 1
            Bad_total += 1
        for single_file2 in file_name2:
            with open(url2 + single_file2, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    Bad_num += 1
            Bad_total += 1
        Bad_W = Bad_num * 1.0 / Bad_total
        W = Good_W - Bad_W
        return W

    # Score every essay: sum weight * count over the lecture words that appear in the
    # essay, then normalize by the essay length.
    def getScore(self):
        n = 0
        while n < 5:
            n += 1
            # Build the path to the folder of essays with human score n.
            url = 'firstproject/essay/' + str(n)
            # List every essay file in the folder and score each one.
            file_name = os.listdir(url)
            for single_file in file_name:
                score = 0  # per-essay score
                with open(url + '/' + single_file, 'r') as essay:
                    content = essay.read()
                    essay_list = re.findall('([A-Za-z\']+)', content)
                    lecture_list = self.lecture_1_gram()
                    for unit in lecture_list:
                        if unit in essay_list:
                            # Pick the weighting model by uncommenting one line below.
                            W = self.naive_getWeight()
                            # W = self.prob_getWeight(unit)
                            # W = self.position_getWeight(unit)
                            # W = self.Good_getWeight(unit)
                            # W = self.GoodVsBad_getWeight(unit)
                            cxe = self.CxE(unit, url + '/' + single_file)
                            score += W * cxe * 1.0
                    num = self.getNum(url + '/' + single_file)
                    score /= num
                    print str(score) + ' '


if __name__ == '__main__':
    model = model_1_gram()
    model.getScore()
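
Since the corpus itself is not published, here is a small self-contained sketch with made-up strings that mirrors the three lecture-only weightings (naive, prob, position) from the class above. The good/bad models are left out because they need the scored essay folders, and stopword removal is skipped for brevity; all names and texts below are illustrative only.

# -*- coding: utf-8 -*-
# Toy illustration of the naive / prob / position weightings; the lecture and
# essay strings are made up, and stopword removal is skipped for brevity.
from __future__ import print_function  # so the same file runs under Python 2 and 3
import re


def tokens(text):
    # Same tokenization as the script above: runs of letters and apostrophes.
    return re.findall(r"[A-Za-z']+", text)


lecture_words = tokens("bats use echolocation to find insects and echolocation works at night")
essay_words = tokens("the speaker says bats find insects with echolocation")

n_lecture = len(lecture_words)
n_essay = len(essay_words)


def naive_weight(word):
    return 1.0  # model 1: every lecture word counts the same


def prob_weight(word):
    return lecture_words.count(word) * 1.0 / n_lecture  # model 2: frequency in the lecture


def position_weight(word):
    return (lecture_words.index(word) + 1.0) / n_lecture  # model 3: first-occurrence position


def score(weight_fn):
    total = 0.0
    for word in lecture_words:  # iterate the lecture token list, as getScore() does above
        c = essay_words.count(word)
        if c > 0:
            total += weight_fn(word) * c
    return total / n_essay


for name, fn in (("naive", naive_weight), ("prob", prob_weight), ("position", position_weight)):
    print(name, round(score(fn), 3))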





