python apriori

来源:互联网 发布:淘宝可以延长几天收货 编辑:程序博客网 时间:2024/06/15 11:16
小修改+注释
"""# Python 2.7# Filename: apriori.py# Author: llhthinker# Email: hangliu56[AT]gmail[DOT]com# Blog: http://www.cnblogs.com/llhthinker/p/6719779.html# Date: 2017-04-16""""""data_set = list[list[]]L = list[set(frozenset())]support_data = dic{frozenset()} = value (support count)C1 = set(frozenset())Lk = set(frozenset())item_count = dic{frozenset()}Lksub1 = set(frozenset())Ck_item = frozenset()Ck = set(frozenset())"""#return a list(list)def load_data_set():    """    Load a sample data set (From Data Mining: Concepts and Techniques, 3th Edition)    Returns:        A data set: A list of transactions. Each transaction contains several items.    """    data_set = [['s1', 's2', 's5'], ['s2', 's4'], ['s2', 's3'],            ['s1', 's2', 's4'], ['s1', 's3'], ['s2', 's3'],            ['s1', 's3'], ['s1', 's2', 's3', 's5'], ['s1', 's2', 's3']]    """    the type of the data_set is list of list-----------------------------------------------    """    return data_set#return a set(frozenset)def create_C1(data_set):    """    Create frequent candidate 1-itemset C1 by scaning data_set.    Args:        data_set: A list of transactions. Each transaction contains several items.    Returns:        C1: A set which contains all frequent candidate 1-itemsets    """    """        The explain of frozenset :http://www.cnblogs.com/panwenbin-logs/p/5519617.html    """    C1 = set()    for t in data_set:        for item in t:            item_set = frozenset([item])            #print(type(item_set),item_set)            C1.add(item_set)    #print(C1)    return C1#return a bool -> just judge  **step of pruning**def is_apriori(Ck_item, Lksub1):    """    Judge whether a frequent candidate k-itemset satisfy Apriori property.    Args:        Ck_item: a frequent candidate k-itemset in Ck which contains all frequent                 candidate k-itemsets.        Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets.    Returns:        True: satisfying Apriori property.        False: Not satisfying Apriori property.    """    for item in Ck_item:  #Ck_item is only frozenset which contains only one element(set).        #print("aaa")        #print(item)     #str        #print('bbb')        #print(Ck_item)    #<class 'frozenset'>        #print(type(Ck_item))        #print("origin")        #print(Ck_item)        sub_Ck = Ck_item - frozenset([item])    #sub_Ck is (k-1)-itemsets        #print("after pruning")        #print(sub_Ck)        if sub_Ck not in Lksub1:            #print("xxx")            #print(sub_Ck)            return False    return True#return a set(frozenset())   **step of connection**def create_Ck(Lksub1, k):    """    Create Ck, a set which contains all all frequent candidate k-itemsets    by Lk-1's own connection operation.    Args:        Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets.        k: the item number of a frequent itemset.    Return:        Ck: a set which contains all all frequent candidate k-itemsets.    """    Ck = set()    len_Lksub1 = len(Lksub1)  #the numbers of the (k-1)-itemsets    #print(len_Lksub1)    list_Lksub1 = list(Lksub1)  #transform (k-1)-itemsets of the set into list    #print(list_Lksub1)    for i in range(len_Lksub1):        for j in range(i+1, len_Lksub1):            l1 = list(list_Lksub1[i])    #list of the list            l2 = list(list_Lksub1[j])            l1.sort()            l2.sort()            #print(l1)            #print(l2)            if l1[0:k-2] == l2[0:k-2]:                Ck_item = list_Lksub1[i] | list_Lksub1[j]   #connecting list( two (k-1)-itemsets )                #print("xxx")                #print(Ck_item)                #print(list_Lksub1) --------------                #print(type(Ck_item))                #print(type(list_Lksub1))   #process -> list_Lk = list_1 | list_2 -> tranform list_LK into Ck_item            #else  pruning                if is_apriori(Ck_item, Lksub1):                    Ck.add(Ck_item)    #print(Ck)    #print(type(Ck))    return Ck#return a set(frozenset)  **scaning the data set**def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):    """    Generate Lk by executing a delete policy from Ck.    Args:        data_set: A list of transactions. Each transaction contains several items.        Ck: A set which contains all all frequent candidate k-itemsets.        min_support: The minimum support.        support_data: A dictionary. The key is frequent itemset and the value is support.    Returns:        Lk: A set which contains all all frequent k-itemsets.    """    Lk = set()    item_count = {}    for t in data_set:  # t represent a transation        for item in Ck:  #item represent a candidate k-itemsets            """            print(type(item))    class->frozenset            print(item)          ->frozenset({'l2'}),which can be the key of the dictionary            print(type(t))        class->list            print(t)              [lx,lx,...lx]            """            if item.issubset(t):  # the set of item is the subset of the list of t                #print("Yes")                if item not in item_count:                    item_count[item] = 1                else:                    item_count[item] += 1           # else:           #     print("No")    t_num = float(len(data_set)) # total numbers of transations    for item in item_count:        if (item_count[item] / t_num) >= min_support:            Lk.add(item)            #print(Lk)            support_data[item] = item_count[item] #/ t_num    return Lk#return L = list(set(frozenset)) , support_data = dic()def generate_L(data_set, k, min_support):    """    Generate all frequent itemsets.    Args:        data_set: A list of transactions. Each transaction contains several items.        k: Maximum number of items for all frequent itemsets.        min_support: The minimum support.    Returns:        L: The list of Lk.        support_data: A dictionary. The key is frequent itemset and the value is support.    """    support_data = {}    C1 = create_C1(data_set)    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)    Lksub1 = L1.copy()    #print(Lksub1)    L = []    L.append(Lksub1)    #print(L)    for i in range(2, k+1):        Ci = create_Ck(Lksub1, i)        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)        Lksub1 = Li.copy()        L.append(Lksub1)      #every time append a set(frozenset) where contain k-itemsets    return L, support_datadef generate_big_rules(L, support_data, min_conf):    """    Generate big rules from frequent itemsets.    Args:        L: The list of Lk.        support_data: A dictionary. The key is frequent itemset and the value is support.        min_conf: Minimal confidence.    Returns:        big_rule_list: A list which contains all big rules. Each big rule is represented                       as a 3-tuple.    """    big_rule_list = []    sub_set_list = []    for i in range(0, len(L)):        for freq_set in L[i]:            for sub_set in sub_set_list:                if sub_set.issubset(freq_set):                    conf = support_data[freq_set] / support_data[freq_set - sub_set]                    big_rule = (freq_set - sub_set, sub_set, conf)                    if conf >= min_conf and big_rule not in big_rule_list:                        # print freq_set-sub_set, " => ", sub_set, "conf: ", conf                        big_rule_list.append(big_rule)            sub_set_list.append(freq_set)    return big_rule_listif __name__ == "__main__":    """        Test    """    data_set = load_data_set()    #load data    L, support_data = generate_L(data_set, k=3, min_support=0.2)    for Lk in L:        print ("="*50)        print ("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport")        print ("="*50)        for freq_set in Lk:            print (freq_set, support_data[freq_set])    print ()    """    big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)    print ("Big Rules")    for item in big_rules_list:        print (item[0], "=>", item[1], "conf: ", item[2])    """