利用Python网络爬虫抓取豆瓣首页图片代码分享

来源：互联网　发布：淘宝api查询商品　编辑：程序博客网　时间：2024/05/12 10:13
# Scrape the images referenced on the Douban home page.
#
# Author's note (from the original post): this is code I wrote myself, and
# there are two points worth sharing: (1) how to get around SSL certificate
# errors when fetching the HTML page, and (2) the different ways of naming
# the images when saving them.
import os
import re
import ssl
import urllib.request

# Captures the first .jpg URL found inside an <img ...> tag.  "[^>]*?" keeps
# the search inside the tag itself (it cannot run past the closing ">" into
# a later tag) and, unlike ".", also matches tags that span several lines.
_IMG_RE = re.compile(r'<img[^>]*?(http[\w/:\.-]+?\.jpg)')

# Captures the file stem that directly precedes ".jpg" in an image URL.
_IMG_NAME_RE = re.compile(r'([\w-]+)\.jpg')

# NOTE(security): certificate verification is deliberately disabled so that
# pages with certificate problems can still be fetched.  This relies on a
# private ssl helper and must not be used where authenticity matters.
context = ssl._create_unverified_context()

# Page whose images are scraped (another site, e.g. "https://www.sina.com.cn/",
# can be substituted here).
weburl = "https://www.douban.com/"

# Browser-like request headers sent with every request.
webheader = {
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'zh-CN',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'DNT': '1',
    'Connection': 'Keep-Alive',
}

# Directory the downloaded images are written to.
localStorePath = "D:/Python学习"


def getImgLinks(html):
    """Return the .jpg URLs found inside the <img> tags of *html*.

    Args:
        html: The page source as text.

    Returns:
        A list of URL strings in document order (duplicates are kept);
        an empty list when no image is found.
    """
    imgLinks = _IMG_RE.findall(html)
    print(imgLinks)
    return imgLinks


def _imgName(imgLink, count):
    """Return the file stem for *imgLink*, or *count* as text if it has none."""
    found = _IMG_NAME_RE.search(imgLink)
    # Naming by sequence number is the alternative scheme; it is used here
    # only when the URL offers no usable name of its own.
    return found.group(1) if found else str(count)


def _saveImg(imgLink, target):
    """Download *imgLink* to the path *target*.

    The request reuses the same headers and SSL context as the page fetch,
    so an image served over HTTPS does not fail on the certificate error the
    page fetch was built to avoid.
    """
    req = urllib.request.Request(url=imgLink, headers=webheader)
    with urllib.request.urlopen(req, context=context) as resp:
        content = resp.read()
    # Write only after the whole body arrived, so a failed download does not
    # leave an empty or truncated file behind.
    with open(target, 'wb') as out:
        out.write(content)


def main():
    """Fetch the page at ``weburl`` and save every .jpg image it references."""
    req = urllib.request.Request(url=weburl, headers=webheader)
    with urllib.request.urlopen(req, context=context) as webPage:
        data = webPage.read().decode('utf-8')
    print("抓取网页成功")

    imgLinks = getImgLinks(data)
    print(len(imgLinks))

    # Without the target directory every single save would fail.
    os.makedirs(localStorePath, exist_ok=True)

    for count, imgLink in enumerate(imgLinks, start=1):
        print('现在开始抓取第{}个图片，其地址是{}'.format(count, imgLink))
        imgName = _imgName(imgLink, count)
        print(imgName)
        try:
            _saveImg(imgLink, localStorePath + '/{}.jpg'.format(imgName))
        except Exception as err:
            # Best effort: one broken image must not stop the others, but
            # the cause is reported and Ctrl-C is no longer swallowed.
            print('抓取第{}个时出现问题'.format(count), err)


if __name__ == "__main__":
    main()
阅读全文
0 0