【Python 编程】实现文本分类中的信息增益算法

来源：互联网　发布：淘宝买家秀福利网站　编辑：程序博客网　时间：2024/04/29 22:44

运行结果如下：

word-doc—count矩阵
分辨率 2 2
2 [1, 1]
用户名 1 1
2 [1, 1]
鼠标 0 1
1 [0, 1]
密码技术 1 1
2 [1, 1]
密码 2 2
2 [1, 1]
账号 2 2
2 [1, 1]
电脑 0 1
1 [0, 1]
火狐 1 1
2 [1, 1]
系统 5 3
2 [1, 1]
苹果 1 1
2 [1, 1]
软件 1 1
2 [1, 1]
用户 1 1
2 [1, 1]
宽度 1 1
2 [1, 1]
键盘 0 1
1 [0, 1]
word IG
分辨率 0.333333333333
用户 0.333333333333
鼠标 0.666666666667
密码技术 0.333333333333
系统 0.333333333333
账号 0.333333333333
电脑 0.666666666667
软件 0.333333333333
密码 0.333333333333
苹果 0.333333333333
火狐 0.333333333333
用户名 0.333333333333
宽度 0.333333333333
键盘 0.666666666667
sort word IG
鼠标 0.666666666667
键盘 0.666666666667
电脑 0.666666666667
账号 0.333333333333
用户名 0.333333333333
用户 0.333333333333
系统 0.333333333333
软件 0.333333333333
苹果 0.333333333333
密码技术 0.333333333333
密码 0.333333333333
宽度 0.333333333333
火狐 0.333333333333
分辨率 0.333333333333

文件c1内容为：

1 系统 5
1 账号 2
1 密码 2
1 分辨率 2
1 用户名 1
1 用户 1
1 软件 1
1 苹果 1
1 密码技术 1
1 宽度 1
1 火狐 1

文件c2内容为：

2 系统 3
2 账号 2
2 密码 2
2 分辨率 2
2 用户名 1
2 用户 1
2 软件 1
2 苹果 1
2 密码技术 1
2 宽度 1
2 火狐 1
2 电脑 1
2 鼠标 1
2 键盘 1

def _read_count_rows(path):
    """Return the whitespace-separated fields of every non-blank line in ``path``.

    Each line is expected to look like ``<doc_no> <word> <count>``.
    """
    # ``with`` guarantees the handle is closed even if reading raises.
    with open(path, "r") as handle:
        return [line.split() for line in handle if line.strip()]


def _information_gain(docs_with_word_per_class, class_count, doc_count):
    """Return the information gain of one word.

    ``docs_with_word_per_class[k]`` is the number of documents of class ``k``
    that contain the word.  The result is
    ``H(C) + P(w) * sum(p * log2 p) + (1 - P(w)) * sum(q * log2 q)``
    where ``p`` / ``q`` are the per-class shares among the documents that do /
    do not contain the word, and ``H(C)`` assumes equally likely classes.
    """
    # Local import: no module-level ``import math`` is visible in this file.
    import math

    docs_per_class = doc_count // class_count
    docs_with_word = sum(docs_with_word_per_class)
    docs_without_word = doc_count - docs_with_word

    def plogp(p):
        # By convention 0 * log(0) contributes nothing to an entropy sum.
        return p * math.log(p, 2) if p > 0 else 0.0

    # class_count * (1 / class_count) * log2(class_count) simplifies to this.
    entropy = math.log(class_count, 2)
    p_word = 1.0 * docs_with_word / doc_count

    present = 0.0
    absent = 0.0
    for hits in docs_with_word_per_class:
        # Guard both denominators: a word with no positive count (or one
        # present in every document) must not raise ZeroDivisionError.
        if docs_with_word:
            present += plogp(1.0 * hits / docs_with_word)
        if docs_without_word:
            absent += plogp(1.0 * (docs_per_class - hits) / docs_without_word)

    return entropy + p_word * present + (1.0 - p_word) * absent


def IG_count(c1, c2):
    """Compute, print and return the information gain (IG) of every word.

    ``c1`` and ``c2`` are paths to text files whose lines have the form
    ``<doc_no> <word> <count>`` (1-based document number).  A missing count
    column is treated as 0 and blank lines are ignored.

    The function prints, in order: the header ``word-doc—count矩阵`` followed
    by one ``<docs containing word> <per-class list>`` line per word, the
    header ``word IG`` followed by ``<word> <ig>`` lines, and the header
    ``sort word IG`` followed by the same pairs sorted by IG (then word) in
    descending order.

    Returns:
        dict mapping each word to its information gain.

    Raises:
        ValueError: if a line's document number is outside ``1..doc_count``.
    """
    class_count = 2
    # NOTE(review): only two files are read, yet doc_count is 3; the value is
    # kept because the published sample output depends on it -- confirm intent.
    doc_count = 3
    docs_per_class = doc_count // class_count  # floor division on Python 2 and 3

    rows = _read_count_rows(c1) + _read_count_rows(c2)

    # 1. Build the word -> per-document count matrix, e.g.
    #      分辨率 -> [2, 2, 0]
    #      鼠标   -> [0, 1, 0]
    #    A fixed-width row indexed by document number keeps the result
    #    independent of row order and of repeated words.
    word_counts = {}
    for fields in rows:
        doc_no = int(fields[0])
        if not 1 <= doc_no <= doc_count:
            raise ValueError(
                "document number %d out of range 1..%d" % (doc_no, doc_count)
            )
        word = fields[1]
        count = int(fields[2]) if len(fields) > 2 else 0
        word_counts.setdefault(word, [0] * doc_count)[doc_no - 1] += count

    # 2. Compute the IG of every word and store it in ig_dict.
    #    Single-argument print(...) behaves identically as a Python 2 print
    #    statement and as the Python 3 print function.
    ig_dict = {}
    print("word-doc—count矩阵")
    for word, counts in word_counts.items():
        per_class = [
            sum(
                1
                for count in counts[k * docs_per_class:(k + 1) * docs_per_class]
                if count > 0
            )
            for k in range(class_count)
        ]
        print("%s %s" % (sum(per_class), per_class))
        ig_dict[word] = _information_gain(per_class, class_count, doc_count)

    # 3. Print the IG table, first unsorted and then sorted.
    print("word IG")
    for word, ig in ig_dict.items():
        print("%s %s" % (word, ig))

    print("sort word IG")
    ranked = sorted(((ig, word) for word, ig in ig_dict.items()), reverse=True)
    for ig, word in ranked:
        print("%s %s" % (word, ig))

    return ig_dict

# Run the information-gain computation on the two per-document word-count files.
# NOTE(review): webpage_all_word_count_docno1 / webpage_all_word_count_docno2 are
# not defined in this snippet -- presumably paths to the c1 / c2 files; confirm.
IG_count(webpage_all_word_count_docno1,webpage_all_word_count_docno2)

算法参考：http://www.blogjava.net/zhenandaci/archive/2009/03/24/261701.html

Java实现参考：http://www.cnblogs.com/zhangchaoyang/articles/2165482.html

0 0