无比强大!Python抓取cssmoban网站的模版并下载

来源:互联网 发布:淘宝怎么设置上新提醒 编辑:程序博客网 时间:2024/06/06 00:19

Python实现抓取http://www.cssmoban.com/cssthemes网站的模版并下载


实现代码

# -*- coding: utf-8 -*-
"""Crawl the cssmoban.com template listing page by page and download every archive.

The script was written for Python 2 (urllib2). The import shim and the
bytes/str checks below let the same file run on Python 2.6+ and Python 3.
"""
import os
import os.path
import re
import socket

try:  # Python 2
    import urllib2
    import urlparse  # unused here; kept because the original script imports it
except ImportError:  # Python 3
    import urllib.request as urllib2
    import urllib.parse as urlparse  # unused here; kept for parity with Python 2

SITE_ROOT = 'http://www.cssmoban.com'
URL = SITE_ROOT + '/cssthemes'

# Global socket timeout (seconds) applied to every urlopen() call below.
socket.setdefaulttimeout(500)


def getUrlContent(url):
    """Fetch *url* and return the response body.

    On Python 2 the body is returned as the raw byte string, exactly as
    before. On Python 3 it is decoded as UTF-8 (undecodable bytes are
    replaced) so the str regexes below can be applied to it.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    if not isinstance(html, str):
        html = html.decode('utf-8', 'replace')
    return html


def getAllUrl(html):
    """Return every ``<a href="/cssthemes/<digits>.shtml">...</a>`` anchor in *html*.

    ``.`` does not match newlines here, so an anchor must sit on one line.
    """
    return re.findall(r'<a[\s]+href="/cssthemes/\d+\.shtml">.*?\/a>', html)


def getDownTitle(html):
    """Return the inner text of every ``<h1>`` element (used as the file title)."""
    return re.findall(r'\<h1>(.*?)\</h1>', html)


def getDownUrl(html):
    """Return every anchor carrying ``class="button btn-down"``.

    Example of a matching anchor:
    ``<a href="http://down.cssmoban.com/cssthemes1/cctp_17_jeans.zip"
    target="_blank" class="button btn-down" ...>...</a>``
    """
    return re.findall(r'<a.*?class="button btn-down".*?\/a>', html)


def getNextUrl(html):
    """Return the text spans that run from an ``<a`` up to the "next page" link text.

    NOTE(review): the lazy match starts at the first ``<a`` on the line, so if
    several pager links share one line the span (and the href taken from it)
    begins at the first of them - confirm against the live page markup.
    """
    return re.findall('<a.*?下一页</a>', html)


def download(title, url):
    """Download *url* and save it as ``template/<title>.<extension of url>``.

    The extension is whatever follows the last ``.`` in *url*. The
    ``template/`` directory is created on first use.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()
    if not os.path.exists("template/"):
        os.makedirs("template/")
    if isinstance(title, bytes):
        # Python 2 byte string scraped from the page: decode it for the file name.
        title = title.decode('utf-8')
    # The title comes from remote HTML; keep path separators from escaping template/.
    title = re.sub(r'[\\/]', '_', title)
    newname = "template/" + title + '.' + url[url.rfind('.') + 1:]
    with open(newname, "wb") as out:
        out.write(result)


def _log(path, msg):
    """Append *msg* as one line to the file at *path* and echo it to stdout."""
    with open(path, 'a') as fileobj:
        fileobj.write(msg + '\n')
    print(msg)


def i(msg):
    """Record an informational message in info.log."""
    _log('info.log', msg)


def e(msg):
    """Record an error message in error.log."""
    _log('error.log', msg)


def _extract_href(anchor, end_marker):
    """Return the text between ``href="`` and the next *end_marker* in *anchor*.

    Raises ValueError (like the ``str.index`` calls this replaces) when either
    delimiter is missing, but with a message that says which one.
    """
    start = anchor.find('href="')
    if start < 0:
        raise ValueError('no href="..." attribute in: %s' % (anchor))
    start += len('href="')
    end = anchor.find(end_marker, start)
    if end < 0:
        raise ValueError('marker %r not found after href in: %s' % (end_marker, anchor))
    return anchor[start:end]


def _download_from_anchor(anchor):
    """Open the detail page linked by *anchor* and download its template archive.

    Any failure is logged to error.log and swallowed so that one broken
    template does not stop the crawl.
    """
    detail_url = ''
    try:
        detail_url = SITE_ROOT + _extract_href(anchor, '">')
        detail_html = getUrlContent(detail_url)
        titles = getDownTitle(detail_html)
        title = titles[0] if titles else ''
        buttons = getDownUrl(detail_html)
        if not buttons:
            raise ValueError('download button not found on page')
        file_url = _extract_href(buttons[0], '" target')
        i('开始下载:%s,文件名:%s' % (file_url, title))
        download(title, file_url)
        i('%s下载完成,保存文件名:%s' % (file_url, title))
    except Exception as err:
        # The variable must not be named "e": that would shadow the error
        # logger and make the handler itself raise TypeError.
        e('地址:%s下载失败,失败信息:' % (detail_url))
        e(str(err))


def main():
    """Walk the listing pages starting at URL, downloading every template found."""
    page_url = URL
    html = getUrlContent(page_url)
    i('开始下载:%s' % (URL))
    while True:
        # Process the current page first so the last page is downloaded too.
        for anchor in getAllUrl(html):
            _download_from_anchor(anchor)

        next_links = getNextUrl(html)
        if not next_links:
            e('地址:%s,未找到下一页,程序退出' % (page_url))
            break

        page_url = URL + '/' + _extract_href(next_links[0], '" target')
        i('-----------------------------------------')
        i('执行下一页:%s' % (page_url))
        html = getUrlContent(page_url)


if __name__ == '__main__':
    main()


1 0
原创粉丝点击