机器学习-Python编写FP-growth进行关联性分析

来源:互联网 发布:马龙职业生涯数据 编辑:程序博客网 时间:2024/05/02 01:22

代码及数据集下载:FP-growth
当数据集较大时,Apriori算法需要多次扫描整个数据集,处理较慢,因此需要一个更加高效的算法。FP-growth寻找频繁项集时,只需扫描数据集两次。虽然该算法能够高效地发现频繁项集,但是它并不能直接发现关联规则。

循环一遍数据集:
    计算单个元素项出现的频度,保存到头指针表
根据最小频数筛选单个元素项
循环一遍数据集:
    将每个数据的元素按照头指针表排序
    创建FP树

创建FP树

第一项是否在树的子节点中
    如果不在就创建
    如果在就加一
第一项的头指针是否为none
    如果为none则指向当前节点
    如果不为none则沿指针找到最后一个点,并将最后一个点指向当前节点
import numpy as np


def loadSimpDat():
    """Return the six-transaction toy data set used by the demo below."""
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):
    """Collapse a list of transactions into ``{frozenset(items): count}``.

    Identical transactions share one key and their occurrences are summed.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        retDict[key] = retDict.get(key, 0) + 1
    return retDict


class treeNode:
    """A single node of an FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue      # item stored at this node
        self.count = numOccur      # accumulated count for this node
        self.nodeLink = None       # next node holding the same item, if any
        self.parent = parentNode   # None for the root
        self.children = {}         # item -> child treeNode

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this subtree, indenting one extra space per level."""
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)


def createTree(dataSet, minSup=1):
    """Build an FP-tree from weighted transactions.

    Args:
        dataSet: dict mapping ``frozenset(items)`` to an occurrence count.
        minSup: minimum total count an item needs to be kept.

    Returns:
        ``(root, headerTable)`` where ``headerTable`` maps each kept item to
        ``[total_count, first_node_in_link_chain]``; ``(None, None)`` when no
        item reaches ``minSup``.
    """
    # Pass 1: total count of every item across all transactions.
    totals = {}
    for trans, count in dataSet.items():
        for item in trans:
            totals[item] = totals.get(item, 0) + count

    # Drop infrequent items; each header entry is [count, link-chain head].
    headerTable = {item: [total, None]
                   for item, total in totals.items() if total >= minSup}
    if not headerTable:
        return None, None

    # Fix ONE ordering of the kept items for the whole tree. Sorting each
    # transaction by count alone leaves ties to that transaction's own set
    # iteration order, so equal-count items could appear in different orders
    # on different paths and mining would then miss frequent itemsets.
    # sorted() is stable, so ties keep the header table's order everywhere.
    ranked = sorted(headerTable,
                    key=lambda item: headerTable[item][0], reverse=True)
    rank = {item: pos for pos, item in enumerate(ranked)}

    # Pass 2: insert every transaction, restricted to kept items, in rank order.
    retTree = treeNode('Null Set', 0, None)
    for transSet, count in dataSet.items():
        orderedItems = sorted((item for item in transSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable


def updateTree(items, inTree, headerTable, count):
    """Insert the ordered item list ``items`` as a path below ``inTree``.

    Existing nodes along the path have ``count`` added; missing nodes are
    created and appended to the matching header-table link chain.
    """
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = treeNode(item, count, node)
            node.children[item] = child
            # Register the new node in the header table's chain for this item.
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        node = child


def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of the link chain starting at ``nodeToTest``."""
    while nodeToTest.nodeLink is not None:
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode


def ascendTree(leafNode, prefixPath):
    """Append to ``prefixPath`` the names from ``leafNode`` up to (excluding) the root."""
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent


def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern base reachable from a link chain.

    Args:
        basePat: the item being mined (kept for interface compatibility;
            not read here).
        treeNode: first node of the item's link chain.

    Returns:
        dict mapping ``frozenset(prefix_items)`` to the summed count of the
        chain nodes that have that prefix. Nodes directly under the root
        (empty prefix) contribute nothing.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        prefixPath = []
        ascendTree(node, prefixPath)
        if len(prefixPath) > 1:
            # prefixPath[0] is the node's own item; the rest is its prefix.
            key = frozenset(prefixPath[1:])
            # Accumulate rather than overwrite so equal prefix sets add up.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats


def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively append every frequent itemset found in the tree to ``freqItemList``.

    Args:
        inTree: root of the (conditional) FP-tree; not read directly, the
            header table's link chains are walked instead.
        headerTable: header table belonging to ``inTree``.
        minSup: minimum count passed on to each conditional tree.
        preFix: set of items already fixed on this recursion path.
        freqItemList: output list; each found itemset is appended as a set.
    """
    # Visit items from least to most frequent.
    bigL = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead is not None:
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


# Demo: mine the toy data set with a minimum count of 3.
data = loadSimpDat()
data = createInitSet(data)
tree, header = createTree(data, 3)
freqItems = []
mineTree(tree, header, 3, set(), freqItems)
print(freqItems)
原创粉丝点击