python实现下载韩寒博客中的所有文章,在本地存储

来源:互联网 发布:jquery.tooltips.js 编辑:程序博客网 时间:2024/06/04 17:59
# -*- coding:utf-8 -*-
# Download every article from Han Han's Sina blog (http://blog.sina.com.cn/twocold)
# and save each one locally as an HTML file named after the post's <title>.
import os
import re
import sys
import urllib.request  # `import urllib` alone does not reliably expose urllib.request

import bs4
import requests

BLOG_INDEX_URL = 'http://blog.sina.com.cn/twocold'
# Hard-coded output directory from the original script; change as needed.
OUTPUT_DIR = 'C:\\Users\\Nick\\Desktop\\python\\drawing\\2\\quiz\\'
# Non-greedy match so one hit per post URL instead of swallowing to the last ".html".
POST_URL_RE = re.compile(r'http://blog\.sina\.com\.cn/s/blog.+?\.html')
# Characters Windows forbids in filenames; stripped from post titles.
INVALID_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')

# --- fetch the blog index page ---
res = requests.get(BLOG_INDEX_URL)
try:
    res.raise_for_status()
except Exception as exc:
    # Without the index page there is nothing to scrape; stop instead of
    # parsing an error page (the original continued and produced garbage).
    print('There was a problem:%s' % (exc))
    sys.exit(1)

# --- extract post URLs, de-duplicated while preserving first-seen order ---
# dict.fromkeys keeps insertion order (Python 3.7+); this is correct no matter
# how many times each URL repeats (the original assumed exactly 4 repeats).
post_urls = list(dict.fromkeys(POST_URL_RE.findall(res.text)))

# --- download each post and save it as <title>.html ---
for post_url in post_urls:
    with urllib.request.urlopen(post_url) as resp:
        html = resp.read().decode('utf-8')
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # behavior consistent across machines.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    title_tag = soup.select('title')
    # Fall back to a name derived from the URL if the page has no <title>.
    raw_title = title_tag[0].getText() if title_tag else os.path.basename(post_url)
    safe_title = INVALID_FILENAME_CHARS_RE.sub('_', raw_title).strip()
    out_path = os.path.join(OUTPUT_DIR, safe_title + '.html')
    # 'wt' + explicit utf-8: append mode duplicated content on re-runs, and the
    # platform default codec (gbk on Chinese Windows) cannot encode all page text.
    with open(out_path, 'wt', encoding='utf-8') as out_file:
        out_file.write(html)

0 0