使用Scrapy爬取大众点评图片

来源:互联网 发布:淘宝砍价话术 编辑:程序博客网 时间:2024/04/27 16:59

O2O网站上商户的页面中,有大量来自用户的图片,是研究机器学习的好资源。对这些资源的利用首先需要用爬虫将这些资源爬取到本地。下文程序段为利用scrapy从大众点评上爬餐馆菜品图片的脚本示例。(注:考虑到很多网站的前端页面经常变化,对这些网站的爬取首先需要对web page进行源码分析,确定解析html文件时所需要的一些关键词。)

Scrapy是一个基于python的爬虫框架,可以自动实现并行爬取。详见Scrapy主页:http://scrapy.org/


DishRecSpider.py

# coding:utf-8
import os
import sys
import time
import re
import urllib
import scrapy
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from dishRec.items import ShopItem, DishItem

# Force the process-wide default encoding so the Chinese shop/dish names
# extracted below can be used in file-system paths.
# NOTE(review): reload(sys)/setdefaultencoding is a Python-2-only hack, and
# items.py sets 'utf8' while this file sets 'gbk' -- confirm which one the
# crawl actually needs; mixing the two is a likely source of mojibake.
reload(sys)
sys.setdefaultencoding('gbk')


class DishRecSpider(CrawlSpider):
    """Crawl dianping.com restaurant listings and download dish photos.

    Request flow:
        parse      -- restaurant-list pages (paginated up to MAX_PAGE_IDX)
        parse_dish -- one restaurant's home page: shop info + dish list
        parse_img  -- one dish's photo album (paginated via ?pg=N)
    """

    MAX_PAGE_IDX = 10                    # max number of restaurant-list pages to crawl
    IMG_ROOT_DIR = 'E:\\dishRec\\image'  # local root folder for downloaded images
    SMALL_IMG_SIZE_STR = '240c180'       # size token found in thumbnail image URLs
    LARGE_IMG_SIZE_STR = '700x700'       # size token substituted in to get the large image
    DISH_LOG_DIR = '../dish.txt'         # log file path (logging is currently disabled)

    name = 'dishRecSpider'               # spider name used by `scrapy crawl`
    allowed_domains = ['dianping.com']
    rootUrl = 'http://www.dianping.com'  # site root, prepended to relative shop links
    download_delay = 1.5                 # per-request delay in seconds (politeness)
    # NOTE(review): pageIdx is mutable class-level state shared by all concurrent
    # callbacks of this spider; with Scrapy's parallel scheduling the increments
    # in parse() can interleave. Works in practice for a single slow crawl, but
    # passing the page index through Request.meta (as parse_img does) is safer.
    pageIdx = 1
    start_urls = ['http://www.dianping.com/search/category/2/10/r6979d1o10']

    def parse(self, response):
        """Parse one restaurant-list page.

        Yields one Request per restaurant (handled by parse_dish) and, while
        pageIdx < MAX_PAGE_IDX, a Request for the next list page ('pN' suffix).
        """
        hxs = HtmlXPathSelector(response)
        shopInfoList = hxs.select('//a[@data-hippo-type="shop"]')
        for shopInfo in shopInfoList:
            # hrefs are site-relative; prepend the site root.
            shopUrl = self.rootUrl + shopInfo.select('@href').extract()[0]
            yield Request(url=shopUrl, callback=self.parse_dish)
        if self.pageIdx < self.MAX_PAGE_IDX:
            self.pageIdx = self.pageIdx + 1
            # Next list page: the site paginates by appending 'p<N>' to the URL.
            yield Request(url=self.start_urls[0] + 'p%d' % self.pageIdx,
                          callback=self.parse)

    def parse_dish(self, response):
        """Parse a restaurant home page: shop name/coords and its dish list.

        Yields one Request per dish photo album (handled by parse_img).
        Returns early if the page has no dish panel.
        """
        # ------ parse the name and coordinates of the restaurant
        hxs = HtmlXPathSelector(response)
        shopName = hxs.select('//h1[@class="shop-name"]/text()').extract()[0]
        coordScript = hxs.select('//div[@id="aside"]/script[1]').extract()[0]
        # Fix: the dots must be escaped (`\.`); a bare `.` matches any character,
        # so the original pattern could match past the decimal point.
        coordTupleText = re.findall(r'lng:\w+\.\w+,lat:\w+\.\w+', coordScript)[0]
        lngPre = re.search(r'lng:', coordTupleText)
        latPre = re.search(r'lat:', coordTupleText)

        sItem = ShopItem()
        sItem['name'] = shopName.strip()
        sItem['link'] = response.url
        # Slice out the numeric values; latPre.start()-1 drops the ',' separator.
        sItem['lng'] = coordTupleText[lngPre.end():latPre.start() - 1]
        sItem['lat'] = coordTupleText[latPre.end():]

        # ------ parse the name and link of each dish
        dishInfoScriptList = hxs.select(
            '//div[@id="shop-tabs"]//script[@type="text/panel"]').extract()
        if not dishInfoScriptList:
            return
        dishItems = []
        # Dish anchors live inside an inline script panel; \W+ captures the
        # (non-ASCII) dish name that follows 'dish-'.
        dishInfoList = re.findall(r'class="item" href="/shop/\w+/dish-\W+"',
                                  dishInfoScriptList[0])
        for dishInfo in dishInfoList:
            item = DishItem()
            dishTitlePre = re.search(r'dish-', dishInfo)
            # Strip everything up to 'dish-' and the trailing quote.
            item['name'] = dishInfo[dishTitlePre.end():len(dishInfo) - 1]
            # NOTE(review): '²Ë' looks like GBK mojibake of the tag character
            # for "dish" (菜) -- kept byte-for-byte; confirm against the live
            # site before changing it.
            item['link'] = response.url + '/photos/tag-²Ë-' + item['name']
            dishItems.append(item)

        for item in dishItems:
            pageIdx = 1
            imgIdx = 1
            yield Request(url=item['link'],
                          meta={'rootUrl': item['link'],
                                'dishName': item['name'],
                                'pageIdx': pageIdx,
                                'imgIdx': imgIdx},
                          callback=self.parse_img)

    def parse_img(self, response):
        """Download every photo on one album page, then request the next page.

        Page/image counters travel in Request.meta. Recursion terminates when
        a page yields no image URLs.
        """
        hxs = HtmlXPathSelector(response)
        imgUrlList = hxs.select('//div[@class="picture-list"]//img/@src').extract()
        restName = hxs.select('//div[@class="crumb"]//strong/a/text()').extract()[0]
        dishName = hxs.select('//div[@class="dish-name"]/h1/text()').extract()[0]

        # An empty page means we have walked past the last album page.
        if not imgUrlList:
            print(restName + '-' + dishName + ': crawling finished')
            return

        # ------ create the destination folder <root>\<restaurant>\<dish>
        dstFolderDir = self.IMG_ROOT_DIR + '\\' + restName + '\\' + dishName
        if not os.path.isdir(dstFolderDir):
            os.makedirs(dstFolderDir)

        # ------ print the index of the current album page
        pageIdx = response.meta['pageIdx']
        print(restName + '-' + dishName + '-pageIdx: ' + str(pageIdx))

        # ------ retrieve the large version of each image
        imgIdx = response.meta['imgIdx']
        for imgUrl in imgUrlList:
            # Thumbnails and full-size images share a URL that differs only in
            # the size token.
            largeImgUrl = imgUrl.replace(self.SMALL_IMG_SIZE_STR,
                                         self.LARGE_IMG_SIZE_STR)
            print(largeImgUrl)
            dstPath = dstFolderDir + '\\' + str(imgIdx) + '.jpg'
            # Skip files that were already downloaded on a previous run.
            # NOTE(review): urlretrieve raises IOError on network failure and
            # will abort this album -- consider a try/except if partial albums
            # are acceptable.
            if not os.path.exists(dstPath):
                urllib.urlretrieve(largeImgUrl, dstPath)
            imgIdx += 1

        # ------ request the next album page, carrying the counters forward
        pageIdx = response.meta['pageIdx'] + 1
        dishImgPageUrl = response.meta['rootUrl'] + '?pg=%d' % pageIdx
        yield Request(url=dishImgPageUrl,
                      meta={'rootUrl': response.meta['rootUrl'],
                            'pageIdx': pageIdx,
                            'imgIdx': imgIdx},
                      callback=self.parse_img)

items.py

# -*- coding: utf-8 -*-
#
# Item models for the dishRec scraper.
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import sys

# Python-2 default-encoding override so item fields holding Chinese text
# can be handled without explicit encode/decode calls.
reload(sys)
sys.setdefaultencoding('utf8')

from scrapy.item import Item, Field


class ShopItem(Item):
    # One restaurant: display name, page URL, and longitude/latitude.
    name = Field()
    link = Field()
    lng = Field()
    lat = Field()


class DishItem(Item):
    # One dish: display name and the URL of its photo album.
    name = Field()
    link = Field()
1 0
原创粉丝点击