Python解析HTML实例
来源:互联网 发布:gnuradio python模块 编辑:程序博客网 时间:2024/06/05 03:10
# coding:utf-8
"""Scrape a store-locator site province by province and export to Excel.

The script downloads one listing page per area code, extracts store scores,
store names and address/contact text with ``myHtmlParser``, and writes them
into an ``.xls`` workbook (via xlwt) under ``inforst``.

NOTE(review): the published listing had lost all line breaks and indentation;
the block structure below was reconstructed from the statement order. The
spots where more than one reading was possible are marked with NOTE(review).
"""
import re
import time
import urllib.request
from html.parser import HTMLParser
from tempfile import TemporaryFile

info = ''
# Matches any run of CJK ideographs; used to drop names without Chinese text.
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
# Output directory for the generated workbook.
inforst = 'C:/Users/玲玲/PycharmProjects/untitled/***/'
# Timestamp format used in the output file name.
ISOTIMEFORMAT = '%Y-%m-%d %X'

# HTMLParser callback methods, for reference:
#   handle_startendtag  handles self-closing tags
#   handle_starttag     handles start tags, e.g. <xx>
#   handle_endtag       handles end tags, e.g. </xx>
#   handle_charref      handles character references starting with &#
#   handle_entityref    handles entity references starting with &
#   handle_data         handles the text between <xx> and </xx>
#   handle_comment      handles comments
#   handle_decl         handles declarations starting with <!, e.g. <!DOCTYPE ...>
#   handle_pi           handles processing instructions such as <?instruction>


class myHtmlParser(HTMLParser):
    """Collect store scores, names, address text and area codes from a page.

    Results accumulate in four lists (``scores``, ``names``, ``addresses``,
    ``areacodes``) while the document is fed to the parser.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.pflag = 0      # becomes 1 once the first <p>/<br> was seen
        self.showflag = 1   # 1 while text should be recorded as address data
        self.scores = []
        self.names = []
        self.addresses = []
        self.areacodes = []

    @staticmethod
    def _values_matching(attrs, key):
        """Yield the value of every attribute whose name or value contains key.

        Valueless attributes (``<option selected>``) arrive with a value of
        ``None``; they are skipped because there is nothing to extract and a
        substring test on ``None`` would raise ``TypeError``.
        """
        for name, value in attrs:
            if value is None:
                continue
            if key in name or key in value:
                yield value

    def handle_starttag(self, tag, attrs):
        """Update the flags and harvest attribute data from a start tag."""
        if tag == 'p' or tag == 'br':
            self.pflag = 1
            self.showflag = 1
        elif tag == 'img':
            # Store rating, carried in the image's title attribute.
            for value in self._values_matching(attrs, 'title'):
                if '店面' in value:
                    self.scores.append(value)
                    # NOTE(review): reconstructed as part of this branch.
                    self.showflag = 0
        elif tag == 'a':
            # Full store name, embedded in the onclick handler; the slice
            # drops 6 leading and 3 trailing characters of the handler text.
            for value in self._values_matching(attrs, 'onclick'):
                name = value[6:-3]
                # De-duplicate and drop entries without Chinese characters.
                if name not in self.names and zhPattern.search(name):
                    self.names.append(name)
                    # NOTE(review): reconstructed as part of this branch.
                    self.showflag = 0
        elif tag == 'option':
            # Area codes; values containing '-1' are skipped.
            for value in self._values_matching(attrs, 'value'):
                if '-1' not in value:
                    self.areacodes.append(value)
        else:
            # NOTE(review): reconstructed as the fallback of the tag chain.
            self.showflag = 0

    def handle_data(self, data):
        """Record non-blank text as address/contact data while enabled."""
        text = str(data).strip()
        if self.pflag == 1 and text != '' and self.showflag == 1:
            self.addresses.append(text)

    def get_scores(self):
        """Return the collected score strings."""
        return self.scores

    def get_names(self):
        """Return the collected, de-duplicated store names."""
        return self.names

    def get_addresses(self):
        """Return the collected raw address/contact text fragments."""
        return self.addresses

    def get_areacodes(self):
        """Return the collected area codes."""
        return self.areacodes


def split_region(name):
    """Split a store name at its first and last '-' into (province, city).

    The province is the text before the first '-', the city is the text
    between the first and the last '-'. A name without any '-' yields two
    empty strings instead of raising.
    """
    first = name.find('-')
    if first == -1:
        return '', ''
    last = name.rfind('-')
    return name[:first], name[first + 1:last]


def group_addresses(fragments):
    """Merge raw text fragments into one string per store.

    A fragment containing '>' starts a new entry; a following fragment
    without '>' is appended to it (space separated) and completes the entry.
    Whatever is still pending at the end is appended as well, because the
    last store may lack a follow-up fragment; when nothing is pending this
    adds an empty string, exactly as the original loop did.
    """
    grouped = []
    pending = ''
    for fragment in fragments:
        if '>' in fragment:
            if '>' in pending:
                # Previous entry never got a follow-up fragment; flush it.
                grouped.append(pending)
                pending = ''
            pending = pending + fragment
        else:
            pending = pending + ' ' + fragment
            grouped.append(pending)
            pending = ''
    grouped.append(pending)
    return grouped


def fetch_parsed(url):
    """Download url, parse it, and return the populated myHtmlParser."""
    req = urllib.request.Request(url)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req) as fd:
        html = fd.read().decode('utf-8')
    parser = myHtmlParser()
    parser.feed(html)
    parser.close()  # flush text still buffered by the parser
    return parser


def main():
    """Scrape every area and write the results to an .xls workbook."""
    # Third-party dependency, imported here so the parser and helpers above
    # remain importable on machines without xlwt.
    import xlwt

    book = xlwt.Workbook()
    style = xlwt.easyxf('align: wrap on')  # wrap long cell text

    # Header row and column widths.
    sheet1 = book.add_sheet('数据')
    header = sheet1.row(0)
    header.write(0, '所属省份')
    header.write(1, '所属城市')
    header.write(2, '店铺名称')
    header.write(3, '联系方式')
    header.write(4, '店面综合评价')
    sheet1.col(0).width = 3000
    sheet1.col(1).width = 3000
    sheet1.col(2).width = 15000
    sheet1.col(3).width = 20000
    sheet1.col(4).width = 10000

    # The area codes are read from one fixed listing page.
    url = 'http://www.***.com.cn/Find_***_store/index.html?province=360000&city=-1'
    areacodes = fetch_parsed(url).get_areacodes()

    next_row = 1  # first free sheet row; row 0 holds the header
    # The first collected area code is skipped, as in the original loop.
    for areacode in areacodes[1:]:
        url = 'http://www.***.com.cn/Find_***_store/index.html?province='+areacode+'&city=-1'
        page = fetch_parsed(url)

        # Each column is filled independently, starting at the same row.
        row = next_row
        for score in page.get_scores():
            sheet1.row(row).write(4, score[6:], style)
            row += 1

        row = next_row
        for name in page.get_names():
            province, city = split_region(name)
            cells = sheet1.row(row)
            cells.write(0, province, style)
            cells.write(1, city, style)
            cells.write(2, name, style)
            row += 1

        row = next_row
        for address in group_addresses(page.get_addresses()):
            # Drop the first character (the '>' marker or a leading space).
            sheet1.row(row).write(3, address[1:], style)
            row += 1

        # The next area continues below the address column of this one.
        next_row = row

    # ':' is removed from the timestamp before it is used in the file name.
    systime = str(time.strftime(ISOTIMEFORMAT, time.localtime())).replace(':', '')
    book.save(inforst + '数据'+systime+'.xls')
    with TemporaryFile() as scratch:
        book.save(scratch)


if __name__ == '__main__':
    main()
0 0
- Python解析HTML实例
- python 解析html基础 HTMLParser库,方法,及代码实例
- 用python解析html
- 用python解析html
- 用python解析html
- python解析html/xml
- python HTML解析器
- python html解析
- python 解析html
- python 解析HTML
- Python 抓取解析HTML
- Python+lxml解析html
- python解析html tag
- 【Python】 html解析BeautifulSoup
- 用python解析html
- python:html元素解析
- Jsoup 解析Html源码实例
- java jsoup解析html实例
- 03 JavaScript基础之--简单数据类型转换
- Bitmap too large to be uploaded into a texture 解决方案
- 如何在CSDN博客中生成“我喜欢的音乐”页面
- 防止头文件被重复包含、extern、变量定义与声明的区别
- Android创建子线程和回调主线程的几种方式
- Python解析HTML实例
- 17个CSS知识点总结
- C语言之按行读取文件
- Struts2下拉按钮标签:select的使用
- UVA 1146 Now or later(2-SAT)
- SQL基础学习
- 【Servlet】Filter过滤器
- 经典题
- 不用Pandas包和用pandas包处理数据集