Python聚类分析死囚的最后遗言问题
来源:互联网 发布:洛基权杖淘宝网 编辑:程序博客网 时间:2024/05/04 10:39
聚类是无监督学习的一个例子,具体的定义百度一下吧!直接进入主题,先说明一下数据的问题,该数据是我从一家外国网站收集的关于死囚临刑前的最后遗言,以及死囚的一些个人数据,仅供参考。先说明一下怎样爬取数据吧!该案例使用urllib2、bs4、SGMLParser库中的知识,其中urllib2用于爬取数据,bs4和SGMLParser用于解析数据,并保存到文件中去。具体的直接看代码吧!
# coding=utf-8import urllib2from bs4 import BeautifulSoupfrom sgmllib import SGMLParserclass FirstParser(SGMLParser): def __init__(self): SGMLParser.__init__(self) self.__start_tbody = False self.__start_tr = False self.__start_td = False self.__start_th = False self.__start_a = False self.__td_state = 0 self.__tr_value = [] self.data = [] def start_tbody(self, attr): self.__start_tbody = True def end_tbody(self): self.__start_tbody = False def start_tr(self, attrs): if self.__start_tbody: self.__start_tr = True def end_tr(self): if self.__start_tbody and self.__start_tr: self.data.append(self.__tr_value) self.__tr_value = [] self.__start_tr = False def start_th(self, attrs): if self.__start_tbody and self.__start_tr: self.__start_th = True def end_th(self): if self.__start_tbody and self.__start_tr and self.__start_th: self.__start_th = False def start_td(self, attrs): if self.__start_tbody and self.__start_tr: self.__start_td = True self.__td_state += 1 def end_td(self): if self.__start_tbody and self.__start_tr and self.__start_td: self.__start_td = False self.__td_state = 0 def start_a(self, attrs): if self.__start_tbody and self.__start_tr: self.__tr_value.append(attrs[0][1]) # print attrs self.__start_a = True def end_a(self): if self.__start_tbody and self.__start_tr and self.__start_td: self.__start_a = False def handle_data(self, data): if self.__start_tbody and self.__start_tr and \ (self.__start_td or self.__start_th): if self.__start_th: self.__tr_value.append(data) if self.__start_td: # if self.__td_state != 2 or self.__td_state != 3: self.__tr_value.append(data)def read_first(page): soup = BeautifulSoup(page, 'lxml') value = [] for row in soup.find_all('tbody'): tbody = row.find_all('tr') print len(tbody) for index, r in enumerate(tbody): t = [] if index == 0: for k in r.find_all('th'): t.append(k.string) else: for k in r.find_all('td'): t.append(k.string) value.append(t) return valuedef download_second(url): url = 
'http://www.tdcj.state.tx.us/death_row/' + url page = urllib2.urlopen(url).read() page = page.replace('<br />', '') soup = BeautifulSoup(page, 'lxml') vl = [] v2 = [] for row in soup.find('table').find_all('tr'): td = row.find_all('td') vl.append(fun_replace(td[len(td) - 1].string)) p = soup.find_all('p') for row in p[1:]: temp = [] if len(row.find_all('span')) > 0: # temp.append(fun_replace(str(row.find_all('span')[0].string))) try: temp.append(fun_replace(str(row.text.split('\r\n')[1].strip()))) except: temp.append('') else: # temp.append(row.string) temp.append('') v2.append(temp) return [vl, v2]def download_three(url): url = 'http://www.tdcj.state.tx.us/death_row/' + url page = urllib2.urlopen(url).read() soup = BeautifulSoup(page, 'lxml') p = soup.find_all('p') v1 = [] if len(p) >= 6: for index, row in enumerate(p): if index % 2 == 1: v1.append([fun_replace(p[index].string), fun_replace(p[index + 1].string)]) if index >= 5: break return v1def fun_replace(s): return s.replace(',', '.') if s is not None else ''def down_first(): url = 'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html' # page = urllib2.urlopen(url).read() page = open('first.html').read() first = FirstParser() first.feed(page) value = first.data with open('first.txt', 'a+') as f: for index, row in enumerate(value): print row[0] if index == 0: continue value = 'Execution,Name,TDCJ Number,Date of Birth,Date Received,' \ 'Age (when Received),Education Level (Highest Grade Completed),' \ 'Date of Offense,Age (at the time of Offense),County,Race,Gender,' \ 'Hair Color,Height,Weight,Eye Color,Native County,Native State,' \ 'Prior Occupation,Prior Prison Record,Summary of Incident,Co-Defendants,' \ 'Race and Gender of Victim,Date of Execution,Offender,Last Statement,' \ 'Last Name,First Name,Race,County\n' f.write(value) else: try: se = download_second(row[1]) th = download_three(row[3]) value = row[0] + ',' + se[0][0] + ',' + se[0][1] + ',' + se[0][2] + ',' + se[0][3] \ + ',' + se[0][4] + 
',' + se[0][5] \ + ',' + se[0][6] + ',' + se[0][7] + ',' + se[0][8] + ',' + se[0][9] + ',' + se[0][10] \ + ',' + se[0][11] + ',' + se[0][12] + ',' + se[0][13] + ',' + se[0][14] \ + ',' + se[0][15] + ',' + se[0][16] + ',' + se[1][0][0] + ',' + se[1][1][0] + ',' \ + se[1][2][0] + ',' + se[1][3][0] + ',' + se[1][4][0] + ',' + th[0][1] + ',' + th[1][0] \ + ',' + th[2][0] + ',' + row[3] + ',' + row[4] + ',' + row[-2] + ',' + row[-1] + '\n' f.write(value.encode('utf-8')) except BaseException as e: print e # breakdown_first()
这里不建议直接通过案例去爬取数据:数据中有些坑,好多数据是以图片的形式展现的,没有办法获取到。可以直接下载案例中的数据去使用。
下面开始进行聚类分析
# coding=utf-8
"""Clustering of the death-row dataset produced by the scraper.

Provides two distance measures (Pearson correlation, Euclidean), a
hierarchical clusterer, k-means, and nearest-record lookups.

Fixes relative to the original:
* ``find`` descended the ``None`` child when a node had only one child
  (branches were swapped), crashing with AttributeError;
* ``find_k`` initialised its running minimum to 0 with a ``<=`` test, so
  no point at a positive distance was ever selected and it always
  returned the last record;
* ``pearson`` used ``/ len(v1)``, which truncates on integer vectors
  under Python 2 — now an explicit float division.
"""
import math
import random
import re


def height(s):
    """Convert a height string (``5'10"``, ``5 ft 10 in`` or ``5-10``)
    to centimetres, rounded to 2 decimals.

    Empty or unrecognised strings fall back to 5'11".
    """
    if s.find('\'') != -1:
        parts = s.replace('"', '').split('\'')
    elif s.find('ft') != -1:
        parts = s.replace('.', '').replace('in', '').split('ft')
    elif s.find('-') != -1:
        parts = s.split('-')
    else:
        # Covers both the empty string and any other unparseable form.
        parts = ['5', '11']
    feet = float(parts[0].strip())
    inches = float(parts[1].strip() if len(parts[1].strip()) != 0 else '0')
    return round((12 * feet + inches) * 30.48 / 12, 2)


def grade(s):
    """Parse the leading integer of an education-level string.

    Strings without a leading number (e.g. 'GED') default to 12.
    """
    m = re.match(r'\d+', s)
    if m is None:
        return 12
    return int(m.group(0))


def load_dataset():
    """Load ``first.txt`` and return ``(dataSet, labels, titles)``.

    dataSet rows are the numeric features
    [age received, education grade, height cm, weight lbs];
    labels holds the remaining columns of each record;
    titles is the header row.
    """
    dataSet = []
    labels = []
    titles = []
    with open('first.txt', 'r+') as f:
        for index, line in enumerate(f.readlines()):
            fields = line.strip().split(',')
            if index == 0:
                titles = fields
            else:
                dataSet.append([int(fields[5]), grade(fields[6]),
                                height(fields[13]),
                                float(fields[14].replace('lbs.', ''))])
                # Everything except the four numeric feature columns.
                labels.append([fld.strip() for i, fld in enumerate(fields)
                               if i not in (14, 13, 6, 5)])
    return dataSet, labels, titles


def pearson(v1, v2):
    """Pearson-correlation distance between two equal-length vectors.

    Returns 1 - r, so 0.0 means perfectly correlated and 2.0 means
    perfectly inversely correlated; 0 is returned for degenerate
    (zero-variance) input.
    """
    n = len(v1)
    sum1 = sum(v1)
    sum2 = sum(v2)
    sum1_sq = sum(x ** 2 for x in v1)
    sum2_sq = sum(x ** 2 for x in v2)
    psum = sum(a * b for a, b in zip(v1, v2))
    # float(n): avoid integer truncation on int vectors (Python 2).
    num = psum - (sum1 * sum2 / float(n))
    den = math.sqrt((sum1_sq - sum1 ** 2 / float(n)) *
                    (sum2_sq - sum2 ** 2 / float(n)))
    if den == 0:
        return 0
    return 1.0 - num / den


def euclidean(v1, v2):
    """Euclidean distance between two equal-length vectors."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(v1, v2)))


class bicluster:
    """Node of the hierarchical-clustering tree.

    Leaves carry a dataset row index as ``id``; merged nodes carry a
    negative ``id``, the averaged vector, and the merge distance.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance


def hcluster(rows, distance=pearson):
    """Agglomerative hierarchical clustering; returns the root bicluster.

    Repeatedly merges the closest pair (distances cached by node id);
    the merged node's vector is the element-wise average of the pair.
    """
    distances = {}
    currentclustid = -1
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]
    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)
        mergevec = [(clust[lowestpair[0]].vec[i] +
                     clust[lowestpair[1]].vec[i]) / 2.0
                    for i in range(len(clust[0].vec))]
        newcluster = bicluster(mergevec,
                               left=clust[lowestpair[0]],
                               right=clust[lowestpair[1]],
                               distance=closest,
                               id=currentclustid)
        currentclustid -= 1
        # Delete the higher index first so the lower one stays valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]


def find(clust, labels, data, distance=pearson):
    """Greedy descent of the cluster tree toward *data*.

    At each node follow the child whose vector is nearer to *data*
    (ties go left); return the label of the leaf reached.
    """
    while True:
        left = clust.left
        right = clust.right
        if left is None and right is None:
            return labels[clust.id]
        if left is None:
            # Only a right child exists: descend it.  (The original
            # descended the None side here — swapped-branch bug.)
            clust = right
        elif right is None:
            clust = left
        elif distance(left.vec, data) <= distance(right.vec, data):
            clust = left
        else:
            clust = right


def kcluster(rows, distance=pearson, k=4):
    """K-means clustering with randomly initialised centroids.

    Runs at most 100 iterations, stopping early when assignments no
    longer change.  Returns (bestmatches, clusters): per-cluster lists
    of row indices, and the centroid vectors.
    """
    ranges = [(min(row[i] for row in rows), max(row[i] for row in rows))
              for i in range(len(rows[0]))]
    # Random starting centroid inside each feature's observed range.
    clusters = [[random.random() * (ranges[i][1] - ranges[i][0]) + ranges[i][0]
                 for i in range(len(rows[0]))]
                for _ in range(k)]
    lastmatches = None
    for t in range(100):
        bestmatches = [[] for _ in range(k)]
        for j, row in enumerate(rows):
            bestmatch = 0
            for i in range(k):
                if distance(clusters[i], row) < distance(clusters[bestmatch], row):
                    bestmatch = i
            bestmatches[bestmatch].append(j)
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches
        # Move each non-empty cluster's centroid to the member average.
        for i in range(k):
            if len(bestmatches[i]) > 0:
                avgs = [0.0] * len(rows[0])
                for rowid in bestmatches[i]:
                    for m in range(len(rows[rowid])):
                        avgs[m] += rows[rowid][m]
                for m in range(len(avgs)):
                    avgs[m] /= len(bestmatches[i])
                clusters[i] = avgs
    return bestmatches, clusters


def find_k(bestmatches, clusters, dataSet, labels, data, distance=pearson):
    """Return (label, row) of the dataset point nearest to *data* within
    the nearest k-means cluster.

    Bug fix: the original initialised its running minimum to 0 with a
    ``<=`` comparison, so nothing at a positive distance could ever win
    and the function always returned the last record.
    """
    best = 0
    best_value = distance(clusters[0], data)
    for i in range(1, len(clusters)):
        d = distance(clusters[i], data)
        if d < best_value:
            best = i
            best_value = d
    # NOTE(review): assumes the chosen cluster is non-empty — an empty
    # bestmatches[best] would raise IndexError here.
    members = bestmatches[best]
    best1 = members[0]
    best_value1 = distance(dataSet[best1], data)
    for rowid in members[1:]:
        d = distance(dataSet[rowid], data)
        if d < best_value1:
            best1 = rowid
            best_value1 = d
    return labels[best1], dataSet[best1]
具体不太了解的看注释,其中有两套方法:距离度量有皮尔逊相关系数和欧几里得距离,聚类方法有分级聚类和k-均值聚类等,距离函数以参数的形式进行传递,有利于以后的扩展。
下面是测试程序
# coding=utf-8from analysis import *from show import *dataSet, labels, titles = load_dataset()#测试分级聚类,使用皮尔逊相关系数clust = hcluster(dataSet, distance=pearson)result = find(clust, labels, [26, 16, 176, 160], distance=pearson)print result#测试分级聚类,使用欧几里得距离# clust = hcluster(dataSet, distance=euclidean)# result = find(clust, labels, [26, 16, 176, 160], distance=euclidean)# print result#把分局结果进行分级显示# printclust(clust)#使用k-均值聚类,皮尔逊欧几里得系数# bestmatches, clusters = kcluster(dataSet, pearson, 4)# result = find_k(bestmatches, clusters, dataSet, labels, [26, 16, 176, 160], pearson)# print result
本人菜鸟一枚,仅供共同学习使用,还请大神多多指导。接下来会写用KNN进行分类的方法。
下载地址
1 0
- Python聚类分析死囚的最后遗言问题
- 惊人的预见:闻红岩烈士最后对党的泣血遗言有感
- 鲁迅的遗言
- 大水的遗言
- 遗言
- 聚类分析应注意的问题
- 使用python sklearn下的k_means聚类分析算法时遇到的问题
- 奥黛丽 赫本的遗言
- Python 对Facebook用户Friendship的聚类分析
- 聚类分析的K均值算法(Python实现)
- [收藏]三国时代的十大遗言
- 免死的遗言——悖论
- 奥黛丽·赫本的遗言
- 1000个人临终前的遗言
- 【Python-Opencv】KNN聚类分析
- 使用Python进行聚类分析
- Python KMeans聚类分析
- 中国最有名的10大遗言——民间的遗言是精华,哈哈
- 函数和方程的区别
- JAVA JTable 关于表格初始化设置、表格事件监听及数据更新问题
- MFCC整理
- 二级缓存具体应用
- JSP 过滤器
- Python聚类分析死囚的最后遗言问题
- Mac下Apache Tomcat安装配置
- TCP-客户端连接服务端
- Sphinx4语音识别的框架
- RadioGroup+ViewPager+Fragment的框架布局
- TextView 下划线
- VM VirtualBox安装Fedora24之后安装增强工具
- 打印最长公共子串【LCS】【回溯】
- HDU4435