Python解析HTML实例

来源:互联网 发布:gnuradio python模块 编辑:程序博客网 时间:2024/06/05 03:10
# coding:utf-8
# Scrape a store-locator site province by province and export the store
# names, contact details and ratings to an Excel (.xls) workbook.
import re
import time
import urllib.request
from html.parser import HTMLParser
from tempfile import TemporaryFile

import xlwt
from xlwt import Workbook

info = ''
# Matches any run of CJK characters; used to drop store names without Chinese.
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
# Output directory for the generated workbook.
inforst = 'C:/Users/玲玲/PycharmProjects/untitled/***/'
# Workbook that receives the scraped rows.
book = Workbook()
# Cell style: wrap long text.
style = xlwt.easyxf('align: wrap on')
# Timestamp format used in the output file name.
ISOTIMEFORMAT = '%Y-%m-%d %X'

# HTMLParser callbacks, for reference:
#   handle_startendtag  handles self-closing tags
#   handle_starttag     handles start tags, e.g. <xx>
#   handle_endtag       handles end tags, e.g. </xx>
#   handle_charref      handles numeric character references (starting with &#)
#   handle_entityref    handles named entities (starting with &, e.g. &nbsp;)
#   handle_data         handles the text between tags, i.e. <xx>data</xx>
#   handle_comment      handles comments
#   handle_decl         handles <! declarations, e.g. <!DOCTYPE html ...>
#   handle_pi           handles processing instructions such as <?instruction>


class myHtmlParser(HTMLParser):
    """Collect store scores, names, address lines and area codes from a page.

    Results accumulate in ``scores``, ``names``, ``addresses`` and
    ``areacodes`` while the document is fed to the parser.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.pflag = 0      # becomes 1 once a <p> or <br> has been seen
        self.showflag = 1   # 1 while text should be captured as address data
        self.scores = []
        self.names = []
        self.addresses = []
        self.areacodes = []

    def handle_starttag(self, tag, attrs):
        """Extract data from the attributes of img / a / option start tags.

        Attributes without a value (HTMLParser reports them with ``None``,
        e.g. ``<option selected>``) are skipped; testing them with ``in``
        used to raise ``TypeError``.
        """
        if tag == 'p' or tag == 'br':
            self.pflag = 1
            self.showflag = 1
        elif tag == 'img':  # store rating
            for name, value in attrs:
                if value is None:
                    continue
                # The keyword is looked up in both the attribute name and
                # its value, exactly as the original code did.
                for part in (name, value):
                    if 'title' in part:
                        if '店面' in value:
                            self.scores.append(value)
                        self.showflag = 0
        elif tag == 'a':  # full store name
            for name, value in attrs:
                if value is None:
                    continue
                for part in (name, value):
                    if 'onclick' in part:
                        # Drops 6 leading and 3 trailing characters --
                        # presumably the JS call wrapped around the name.
                        shop = value[6:-3]
                        # De-duplicate and drop names without Chinese text.
                        if shop not in self.names and zhPattern.search(shop):
                            self.names.append(shop)
                        self.showflag = 0
        elif tag == 'option':  # area (province) codes
            for name, value in attrs:
                if value is None:
                    continue
                for part in (name, value):
                    if 'value' in part:
                        if '-1' not in value:
                            self.areacodes.append(value)
        else:
            self.showflag = 0

    def handle_data(self, data):
        """Record non-blank text as a contact/address line while capturing."""
        text = str(data).strip()
        if self.pflag == 1 and text != '' and self.showflag == 1:
            self.addresses.append(text)

    def get_scores(self):
        """Return the collected rating strings."""
        return self.scores

    def get_names(self):
        """Return the collected, de-duplicated store names."""
        return self.names

    def get_addresses(self):
        """Return the collected raw address/contact text lines."""
        return self.addresses

    def get_areacodes(self):
        """Return the collected area codes."""
        return self.areacodes


def _fetch_html(url):
    """Download *url* and return its body decoded as UTF-8."""
    req = urllib.request.Request(url)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')


def _split_store_name(name):
    """Return ``(province, city)`` taken from a dash-separated store name.

    The province is the text before the first '-', the city is the text
    between the first and the last '-'. A name without any dash yields two
    empty strings instead of raising ``ValueError``.
    """
    first = name.find('-')
    if first == -1:
        return '', ''
    last = name.rfind('-')
    return name[:first], name[first + 1:last]


def _merge_address_lines(lines):
    """Join raw text lines into one entry per store.

    A line containing '>' starts a new entry; the following line without
    '>' is appended to it (separated by a space) and completes the entry.
    An entry that is still pending at the end is flushed, because the last
    store may have no contact line.
    """
    merged = []
    pending = ''
    for line in lines:
        if '>' in line:
            if '>' in pending:
                # Previous store had no contact line; emit it as it is.
                merged.append(pending)
                pending = ''
            pending = pending + line
        else:
            merged.append(pending + ' ' + line)
            pending = ''
    if pending:
        merged.append(pending)
    return merged


def main():
    """Scrape every province page and save the results to an .xls file."""
    # Header row and column widths.
    sheet1 = book.add_sheet('数据')
    header = sheet1.row(0)
    header.write(0, '所属省份')
    header.write(1, '所属城市')
    header.write(2, '店铺名称')
    header.write(3, '联系方式')
    header.write(4, '店面综合评价')
    sheet1.col(0).width = 3000
    sheet1.col(1).width = 3000
    sheet1.col(2).width = 15000
    sheet1.col(3).width = 20000
    sheet1.col(4).width = 10000

    # The index page is fetched once to discover the area codes.
    m = myHtmlParser()
    m.feed(_fetch_html('http://www.***.com.cn/Find_***_store/index.html?province=360000&city=-1'))
    areacodes = m.get_areacodes()

    next_row = 1  # first free row for the next province
    # NOTE(review): the first area code is skipped, as in the original loop
    # (range(1, len(areacodes))) -- confirm that this is intended.
    for areacode in areacodes[1:]:
        url = 'http://www.***.com.cn/Find_***_store/index.html?province='+areacode+'&city=-1'
        m = myHtmlParser()
        m.feed(_fetch_html(url))
        scores = m.get_scores()
        names = m.get_names()
        contacts = _merge_address_lines(m.get_addresses())

        # Ratings; the first six characters are dropped (presumably a fixed
        # label prefix of the title text).
        for offset, score in enumerate(scores):
            sheet1.row(next_row + offset).write(4, score[6:], style)

        # Province, city and full store name.
        for offset, name in enumerate(names):
            province, city = _split_store_name(str(name))
            row = sheet1.row(next_row + offset)
            row.write(0, province, style)
            row.write(1, city, style)
            row.write(2, name, style)

        # Contact details; the leading marker character is dropped.
        for offset, contact in enumerate(contacts):
            sheet1.row(next_row + offset).write(3, contact[1:], style)

        # Advance past the longest column so that the next province can
        # never overwrite cells that were already written.
        next_row += max(len(scores), len(names), len(contacts))

    systime = str(time.strftime(ISOTIMEFORMAT, time.localtime())).replace(':', '')
    book.save(inforst + '数据'+systime+'.xls')
    # Kept from the original script: a second save into a temporary file,
    # which is now closed deterministically.
    with TemporaryFile() as tmp:
        book.save(tmp)
    m.close()


if __name__ == '__main__':
    main()
0 0
原创粉丝点击