聚类算法之层次聚类
来源:互联网 发布:mac炒股软件 编辑:程序博客网 时间:2024/05/17 08:06
分级聚类就是一棵树
假如我们有如下一张图
那么通过聚类之后形成一棵如下的树:
现在就分好了级,而且还能看出距离关系, 很明显ab之间的距离比de之间的距离要短
# coding: utf-8
"""Hierarchical and k-means clustering of a tab-separated count matrix.

The module reads a matrix whose first column holds row labels and whose
first line holds column labels, clusters the rows, and can print the
resulting tree as indented text or draw it as a dendrogram image.

The code runs unchanged on Python 2 and Python 3.
"""
import os
import sys
import random
from math import sqrt

try:
    # Not used by anything in this module; kept so that code relying on the
    # name still finds it when the package is installed.
    import chardet
except ImportError:
    chardet = None

try:
    from PIL import Image, ImageDraw
except ImportError:
    # Pillow is only needed by drawdendrogram(); clustering works without it.
    Image = ImageDraw = None


def readFile(fileName):
    """Read a tab-separated matrix file.

    The first line is a header whose first cell is ignored; every following
    line is ``label<TAB>value<TAB>value...``.  Blank lines are skipped.

    Returns a tuple ``(rowNames, colNames, data)`` where ``data`` is a list
    of lists of floats, one inner list per row.
    """
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(fileName) as handle:
        lines = [line for line in handle]
    colNames = lines[0].strip().split('\t')[1:]
    rowNames = []
    data = []
    for line in lines[1:]:
        stripped = line.strip()
        if not stripped:
            # A trailing blank line would otherwise yield an empty row.
            continue
        p = stripped.split('\t')
        rowNames.append(p[0])
        data.append([float(x) for x in p[1:]])
    return rowNames, colNames, data


def pearsonBeta(v1, v2):
    """Pearson distance (1 - r) computed from raw sums.

    Returns 0 when either vector has no variance or is empty, matching the
    degenerate-case convention of pearson().
    """
    if len(v1) == 0 or len(v2) == 0:
        return 0
    n1 = float(len(v1))  # float() forces true division on Python 2
    n2 = float(len(v2))
    sum1 = sum(v1)
    sum2 = sum(v2)
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    pSum = sum([a * b for a, b in zip(v1, v2)])
    nums = pSum - (sum1 * sum2 / n1)
    denSq = (sum1Sq - pow(sum1, 2) / n1) * (sum2Sq - pow(sum2, 2) / n2)
    # Rounding can push the product slightly below zero; treat that like
    # zero variance instead of letting sqrt() raise ValueError.
    if denSq <= 0:
        return 0
    return 1.0 - nums / sqrt(denSq)


# Distance function
def pearson(v1, v2):
    """Pearson distance (1 - r) computed from deviations about the mean.

    Returns 0 when either vector has no variance or is empty.
    """
    if len(v1) == 0 or len(v2) == 0:
        return 0
    eSum1 = sum(v1) / float(len(v1))
    eSum2 = sum(v2) / float(len(v2))
    dev1 = [v - eSum1 for v in v1]
    dev2 = [v - eSum2 for v in v2]
    pSum = sum([a * b for a, b in zip(dev1, dev2)])
    pTmp1 = sqrt(sum([pow(d, 2) for d in dev1]))
    pTmp2 = sqrt(sum([pow(d, 2) for d in dev2]))
    pSqrtSum = pTmp1 * pTmp2
    if pSqrtSum == 0:
        return 0
    return 1 - pSum / pSqrtSum


# Distance function 2
def tanimoto(v1, v2):
    """Tanimoto distance between two vectors, treating non-zero as present.

    Returns 0.0 when both vectors are entirely zero (the original divided by
    zero in that case).
    """
    c1, c2, shr = 0, 0, 0
    for a, b in zip(v1, v2):
        if a != 0:
            c1 += 1
        if b != 0:
            c2 += 1
        if a != 0 and b != 0:
            shr += 1
    union = c1 + c2 - shr
    if union == 0:
        # Two empty sets: identical, so report zero distance.
        return 0.0
    return 1.0 - float(shr) / float(union)


class bicluster(object):
    """A node of the cluster tree.

    Leaves carry an original row and a non-negative ``id`` (the row index);
    merged nodes built by hCluster() carry a negative ``id``, the averaged
    vector, their two children and the distance between those children.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id

    def vis(self):
        """Print this node's vector."""
        print(self.vec)


# Hierarchical clustering
def hCluster(rows, distanceFunc=pearson):
    """Agglomeratively cluster ``rows`` and return the root bicluster.

    On every pass the closest pair of current clusters is merged into a new
    node whose vector is the element-wise average of the pair.  Merged nodes
    get ids -1, -2, ... in creation order.

    Raises IndexError if ``rows`` is empty.
    """
    distances = {}  # cache: (id, id) -> distance, ids are stable across passes
    currentClustId = -1
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]
    if not clust:
        raise IndexError("hCluster requires at least one row")

    while len(clust) > 1:
        lowestPair = (0, 1)
        closest = None
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # Key on cluster ids, not list positions: positions shift
                # after every merge, ids do not.
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distanceFunc(clust[i].vec, clust[j].vec)
                d = distances[key]
                if closest is None or d < closest:
                    closest = d
                    lowestPair = (i, j)

        first = clust[lowestPair[0]]
        second = clust[lowestPair[1]]
        mergevec = [(a + b) / 2.0 for a, b in zip(first.vec, second.vec)]
        newCluster = bicluster(mergevec, left=first, right=second,
                               distance=closest, id=currentClustId)
        currentClustId -= 1
        # Delete the higher index first so the lower one stays valid.
        del clust[lowestPair[1]]
        del clust[lowestPair[0]]
        clust.append(newCluster)
    return clust[0]


# k-means clustering
def kcluster(rows, distanceFunc=pearson, k=5):
    """Partition ``rows`` into ``k`` groups with k-means.

    Centroids start at random points inside the per-column value range and
    are refined for at most 100 iterations or until assignments stop
    changing.  Progress is printed once per iteration.

    Returns a list of ``k`` lists; each inner list holds the indices of the
    rows assigned to that centroid (it may be empty).
    """
    nCols = len(rows[0])
    ranges = [(min(row[i] for row in rows), max(row[i] for row in rows))
              for i in range(nCols)]
    clusters = [[random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0]
                 for i in range(nCols)]
                for j in range(k)]
    bestMatches = None
    for t in range(100):
        print("iter is: %d" % (t))
        lastMatches = [[] for i in range(k)]
        for i in range(len(rows)):
            row = rows[i]
            lastMatch = 0
            # Compare against the best *centroid* so far; the original
            # compared against a data row (rows[lastMatch]).
            lastDist = distanceFunc(clusters[0], row)
            for j in range(1, k):
                d = distanceFunc(clusters[j], row)
                if d < lastDist:
                    lastMatch = j
                    lastDist = d
            lastMatches[lastMatch].append(i)
        if lastMatches == bestMatches:
            break
        bestMatches = lastMatches
        # Move each centroid to the mean of its members.
        for i in range(k):
            members = bestMatches[i]
            if len(members) > 0:
                count = float(len(members))
                # Index rows by member id; the original indexed rows by the
                # position inside the member list and averaged wrong rows.
                clusters[i] = [sum(rows[m][j] for m in members) / count
                               for j in range(nCols)]
    return bestMatches


# Print the hierarchical cluster tree using indentation
def printClust(clust, labels=None, n=0):
    """Print the tree rooted at ``clust``, two spaces of indent per level.

    Merged nodes print as ``-``; leaves print ``labels[id]`` when ``labels``
    is given, otherwise the numeric id.
    """
    # Two spaces per level reproduces what the Python 2 "print ' '," loop
    # emitted (one space plus the soft-space separator).
    sys.stdout.write('  ' * n)
    if clust.id < 0:
        print('-')
    else:
        if labels is None:
            print(clust.id)
        else:
            print(labels[clust.id])
    if clust.left is not None:
        printClust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printClust(clust.right, labels=labels, n=n + 1)


def getHeight(clust):
    """Return the number of leaves under ``clust``."""
    if clust.left is None and clust.right is None:
        return 1
    return getHeight(clust.left) + getHeight(clust.right)


def getDepth(clust):
    """Return the accumulated merge distance along the deepest branch.

    A leaf counts as 1, so the result is never zero and is safe to divide by
    when computing the drawing scale.
    """
    if clust.left is None and clust.right is None:
        return 1
    return max(getDepth(clust.left), getDepth(clust.right)) + clust.distance


def drawnode(draw, clust, x, y, scaling, labels):
    """Recursively draw ``clust`` onto ``draw`` with its root at (x, y).

    ``draw`` must provide ``line(coords, fill=...)`` and
    ``text(position, string, fill)``.  Merged nodes are drawn as a vertical
    bar with two horizontal arms whose length is ``distance * scaling``;
    leaves are drawn as their label.
    """
    if clust.id < 0:
        h1 = getHeight(clust.left) * 20
        h2 = getHeight(clust.right) * 20
        # '//' keeps the integer pixel arithmetic of the Python 2 original.
        top = y - (h1 + h2) // 2
        bottom = y + (h1 + h2) // 2
        li = clust.distance * scaling
        upper = top + h1 // 2
        lower = bottom - h2 // 2
        draw.line((x, upper, x, lower), fill=(255, 0, 0))
        draw.line((x, upper, x + li, upper), fill=(255, 0, 0))
        draw.line((x, lower, x + li, lower), fill=(255, 0, 0))
        drawnode(draw, clust.left, x + li, upper, scaling, labels)
        drawnode(draw, clust.right, x + li, lower, scaling, labels)
    else:
        draw.text((x + 5, y - 7), labels[clust.id], (0, 0, 0))


# Draw the hierarchical cluster relationships as a tree
def drawdendrogram(clust, labels, jpeg="zebo2.jpg"):
    """Render the tree rooted at ``clust`` to a JPEG file named ``jpeg``.

    The image is 1200 pixels wide and 20 pixels tall per leaf.

    Raises ImportError if Pillow (PIL) is not installed.
    """
    if Image is None or ImageDraw is None:
        raise ImportError("Pillow (PIL) is required by drawdendrogram")
    h = getHeight(clust) * 20
    w = 1200
    depth = getDepth(clust)
    scaling = float(w - 150) / depth
    img = Image.new("RGB", (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h // 2, 10, h // 2), fill=(255, 0, 0))
    drawnode(draw, clust, 10, (h // 2), scaling, labels)
    img.save(jpeg, "JPEG")


if __name__ == "__main__":
    # Guarded so that importing this module does not try to read the
    # author's local data file.
    (wants, people, data) = readFile("F:\\py\\dataFetch\\julei\\data\\blogdata.txt")
    clust = hCluster(data, distanceFunc=pearson)
    printClust(clust, wants)
    # Other things to try with the same data: drawdendrogram(clust, wants),
    # kcluster(data), or hCluster(data, distanceFunc=tanimoto).
那么执行这个程序之后生成如下一张图片:
本程序所需的数据是某某博客出现某某关键字的次数的一个矩阵, 不过现在官网的链接打不开了
附如下链接:http://pan.baidu.com/s/17SqJS
第一列是博客名, 后面的每一列都是单词在该博客出现的次数
- 聚类算法之层次聚类
- 聚类分析之层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类算法
- 层次聚类之AGNES算法和DIANA算法
- 层次聚类之AGNES算法和DIANA算法
- 层次聚类算法--AGENES
- 层次概率聚类算法
- 层次聚类算法总结
- 聚类算法之层次聚类(Java实现)
- 机器学习之聚类算法(层次聚类)
- [DX10游戏教程(C++)]教程1:在Visual Studio 2012中配置DirectX 10
- 【2种方法帮你恢复在w7下U盘隐藏的文件】
- android中常见控件
- C# 基类和派生类
- 编程之美_单链表面试题_结合3.4_3.6 .
- 聚类算法之层次聚类
- 内核的触发和处理流程——The kernel of the trigger and processing process
- poj 1330 Nearest Common Ancestors (LCA)
- 【幸福是什么,幸福的含义到底有多深呢?】
- oracle 11g RAC 停机之后启动rac 时报 CRS-0184: Cannot communicate with the CRS daemon.
- 程序设计”七宗罪“
- 被华为捉弄的面试经历
- C/C++ standard library -- <cstring>(string.h)
- ejb学习笔记