费舍尔方法

来源:互联网 发布:罗尼库尔曼数据深蹲 编辑:程序博客网 时间:2024/06/03 22:57

朴素贝叶斯只能判定特征项属于哪一类的概率最大,无法直接给出可靠的分类概率估算;费舍尔方法弥补了该缺陷,可以直接估算特征项从属于某一类的概率值。下面首先列出基本代码以供参考,后期补充内容:

# -*- coding: utf-8 -*-import reimport mathdef getwords(doc):    splitter=re.compile('\\W*')    words=[s.lower() for s in splitter.split(doc) if len(s)>2 and len(s)<20]    #只返回一组不重复的单词    return dict([(w,1) for w in words])def sampletrain(cl):    cl.train('hello,everybody,welcome to suning','good')    cl.train('hello,everybody,nice to meet you','good')    cl.train('hello,everybody,IT is bad to you','bad')class classifier:    def __init__(self,getfeatures,filename=None):        self.fc={}        self.cc={}        self.getfeatures=getfeatures    #参数类型 单词,文档类型(tezheng/fenlei)    def incf(self,f,cat):        self.fc.setdefault(f,{})        self.fc[f].setdefault(cat,0)        self.fc[f][cat]+=1    def incc(self,cat):        self.cc.setdefault(cat,0)        self.cc[cat]+=1    def fcount(self,f,cat):        if f in self.fc and cat in self.fc[f]:            return float(self.fc[f][cat])        return 0.0    def catcount(self,cat):        if cat in self.cc:            return float(self.cc[cat])        return 0    def totalcount(self):        return sum(self.cc.values())    def categories(self):        return self.cc.keys()    def train(self,item,cat):        features=self.getfeatures(item)        for f in features:            self.incf(f,cat)        self.incc(cat)    #由于已经对单词进行去重处理,故概率值不可能大于1,求P(f|cat)    def fprob(self,f,cat):        if self.catcount(cat)==0:return 0        return self.fcount(f,cat)/self.catcount(cat)    #提升ap的值可能提升特征词的概率值      def weightedprob(self,f,cat,prf,weight=1.0,ap=0.6):        basicprob=prf(f,cat)        #该词在所有类别中出现的次数        totals=sum([self.fcount(f,c) for c in self.categories()])        bp=((weight*ap)+(totals*basicprob))/(weight+totals)        return bpclass naivebayes(classifier):    def __init__(self,getfeatures):        classifier.__init__(self,getfeatures)        self.thresholds={}    def setthreshold(self,cat,t):        self.thresholds[cat]=t    def getthreshold(self,cat):        if cat not in self.thresholds:return 1.0        return 
self.thresholds[cat]         def docprob(self,item,cat):        features=self.getfeatures(item)        p=1        for f in features:            p*=self.weightedprob(f,cat,self.fprob)        return p    def  prob(self,item,cat):        catprob=self.catcount(cat)/self.totalcount()        docprob=self.docprob(item,cat)        return docprob*catprob    def classify(self,item,default=None):            probs={}            max=0.0            for cat in self.categories():                probs[cat]=self.prob(item,cat)                if probs[cat]>max:                    max=probs[cat]                    best=cat            for cat in probs:                if cat==best:continue                if probs[cat]*self.getthreshold(best)>probs[best]:return default            return best   class fisherclassifier(classifier):    def cprob(self,f,cat):        clf=self.fprob(f,cat)        if clf==0:return 0        freqsum=sum([self.fprob(f,c) for c in self.categories()])         p=clf/(freqsum)        return p    def fisherprob(self,item,cat):        p=1        features=self.getfeatures(item)        for f in features:            p*=(self.weightedprob(f,cat,self.cprob))        fscore=-2*math.log(p)        #利用倒置对数卡方函数求得概率        return self.invchi2(fscore,len(features)*2)    def invchi2(self,chi,df):        m=chi/2.0        sum=term=math.exp(-m)        for i in range(1,df/2):            term*=m/i            sum+=term        return min(sum,1.0)   
0 0
原创粉丝点击