Parsing HTML with Python to extract data and generate a Word document


Today I tried building a small Python tool that scrapes web page content and generates a Word document from it. The functionality is simple; I am writing it down here for future reference.

Generating the Word document uses the third-party package python-docx, so that has to be installed first. On Windows, the default Python installation does not ship with the setuptools module, so install setuptools before anything else:

1. Get https://bootstrap.pypa.io/ez_setup.py from the Python site, save it locally, and run it: python ez_setup.py

2. Download python-docx (https://pypi.python.org/pypi/python-docx/0.7.4), unpack it, change into XXX\python-docx-0.7.4, and install it: python setup.py install

With that, python-docx is installed and can be used to work with Word documents. Document generation follows the examples here: https://python-docx.readthedocs.org/en/latest/index.html
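As a quick orientation before the full script, here is a minimal python-docx sketch covering exactly the calls used below (Document, add_heading, add_paragraph, add_picture, save); the file names are placeholders for illustration only:

# Minimal python-docx sketch; "demo.docx" and "photo.jpg" are
# placeholder file names for illustration only.
from docx import Document
from docx.shared import Inches

doc = Document()
doc.add_heading("Document Title", 0)                 # level 0 renders as the title style
doc.add_heading("A Section", level=1)                # level-1 heading
doc.add_paragraph("Plain body text.")
doc.add_paragraph("A bullet item", style='ListBullet')
doc.add_picture("photo.jpg", width=Inches(1.25))     # image scaled to 1.25 inches wide
doc.save("demo.docx")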


HTML parsing uses SGMLParser from the sgmllib module; fetching URL content uses urllib and urllib2.
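To illustrate the mechanism with a minimal sketch (Python 2, where both modules live in the standard library): SGMLParser fires a start_<tag> callback for each tag it recognizes while feed() runs through the markup, and urllib2.urlopen fetches the raw page. The URL is a placeholder; the full script below follows the same pattern with page-specific tag handlers.

# -*- coding: utf-8 -*-
# Minimal Python 2 sketch: fetch a page with urllib2 and collect every
# <a> tag's href with SGMLParser. "http://www.XXX" is a placeholder URL.
import urllib2
from sgmllib import SGMLParser

class LinkCollector(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.links = []

    def start_a(self, attrs):          # called once per <a ...> tag
        for name, value in attrs:
            if name == "href":
                self.links.append(value)

html = urllib2.urlopen("http://www.XXX").read()
parser = LinkCollector()
parser.feed(html)
parser.close()
print parser.links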

The complete code is as follows:

# -*- coding: cp936 -*-
from sgmllib import SGMLParser
import os
import sys
import urllib
import urllib2
from docx import Document
from docx.shared import Inches
import time


## Collect the URLs to parse from the index page
class GetUrl(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.start = False
        self.urlArr = []

    def start_div(self, attr):
        for name, value in attr:
            if value == "ChairmanCont Bureau":  # fixed marker in the page markup
                self.start = True

    def end_div(self):
        self.start = False

    def start_a(self, attr):
        if self.start:
            for name, value in attr:
                self.urlArr.append(value)

    def getUrlArr(self):
        return self.urlArr


## Parse each URL collected above and extract the useful data
class getManInfo(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.start = False
        self.p = False
        self.dl = False
        self.manInfo = []
        self.subInfo = []

    def start_div(self, attr):
        for name, value in attr:
            if value == "SpeakerInfo":  # fixed marker in the page markup
                self.start = True

    def end_div(self):
        self.start = False

    def start_p(self, attr):
        if self.dl:
            self.p = True

    def end_p(self):
        self.p = False

    def start_img(self, attr):
        if self.dl:
            for name, value in attr:
                self.subInfo.append(value)

    def handle_data(self, data):
        if self.p:
            self.subInfo.append(data.decode('utf-8'))

    def start_dl(self, attr):
        if self.start:
            self.dl = True

    def end_dl(self):
        self.manInfo.append(self.subInfo)
        self.subInfo = []
        self.dl = False

    def getManInfo(self):
        return self.manInfo


urlSource = "http://www.XXX"
sourceData = urllib2.urlopen(urlSource).read()

startTime = time.clock()
## get urls
getUrl = GetUrl()
getUrl.feed(sourceData)
urlArr = getUrl.getUrlArr()
getUrl.close()
print "get url use:" + str(time.clock() - startTime)

startTime = time.clock()
## get maninfos
manInfos = getManInfo()
for url in urlArr:  # one url per person
    data = urllib2.urlopen(url).read()
    manInfos.feed(data)
infos = manInfos.getManInfo()
manInfos.close()
print "get maninfos use:" + str(time.clock() - startTime)

startTime = time.clock()
# word
saveFile = os.getcwd() + "\\xxx.docx"
doc = Document()

## word title
doc.add_heading("HEAD".decode('gbk'), 0)
p = doc.add_paragraph("HEADCONTENT:".decode('gbk'))

## write the extracted info into the document
for infoArr in infos:
    for i, info in enumerate(infoArr):
        if i == 0:  # first entry is the image url
            arr1 = info.split('.')
            suffix = arr1[-1]               # file extension
            arr2 = info.split('/')
            prefix = arr2[-2]               # parent path segment doubles as the file name
            imgDir = os.getcwd() + "\\imgs"
            if not os.path.exists(imgDir):
                os.mkdir(imgDir)
            imgFile = imgDir + "\\" + prefix + "." + suffix
            imgData = urllib2.urlopen(info).read()
            try:
                f = open(imgFile, 'wb')
                f.write(imgData)
                f.close()
                doc.add_picture(imgFile, width=Inches(1.25))
                os.remove(imgFile)          # the image is embedded, so the temp file can go
            except Exception as err:
                print err
        elif i == 1:  # second entry becomes a level-1 heading
            doc.add_heading(info + ":", level=1)
        else:         # remaining entries are detail lines
            doc.add_paragraph(info, style='ListBullet')
doc.save(saveFile)  # save once at the end instead of once per person
print "word use:" + str(time.clock() - startTime)
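One caveat worth noting: sgmllib and urllib2 were removed in Python 3, so the script above is Python 2 only. If porting, the standard-library replacements are html.parser.HTMLParser and urllib.request; here is a rough sketch of the URL-collection step under that assumption, reusing the same page-specific class marker as above:

# Rough Python 3 port of the GetUrl step, assuming html.parser and
# urllib.request as replacements for sgmllib and urllib2.
from html.parser import HTMLParser
from urllib.request import urlopen

class GetUrl(HTMLParser):
    def __init__(self):
        super().__init__()
        self.start = False
        self.urlArr = []

    def handle_starttag(self, tag, attrs):
        if tag == "div":
            for name, value in attrs:
                if name == "class" and value == "ChairmanCont Bureau":
                    self.start = True
        elif tag == "a" and self.start:
            for name, value in attrs:
                if name == "href":
                    self.urlArr.append(value)

    def handle_endtag(self, tag):
        if tag == "div":
            self.start = False

parser = GetUrl()
parser.feed(urlopen("http://www.XXX").read().decode("utf-8"))
print(parser.urlArr)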

