A Summary of Three Commonly Used Crawler Templates


It has been a while since I last wrote a crawler, but recently I was assigned another crawling task. In practice, once you have written something once, most later work is copy-pasting that code and tweaking it. So this post collects the reusable parts, so that next time I write a crawler I can pick them up and use them directly.


1. Using the urllib2 library

For simple sites where the information comes back in JSON, I usually just write the crawler directly with urllib2.

Code template:

import urllib2
import urllib
import json

requrl = "http://www.baidu.com"  # the site you want to crawl

# If there is data to POST to requrl, put it in a dict
post_data = {'pageIndex': 1, 'pagesize': 12}
post_data_urlencode = urllib.urlencode(post_data)

req = urllib2.Request(url=requrl, data=post_data_urlencode)
res_data = urllib2.urlopen(req)
res = res_data.read()   # the response body is a JSON string; res is of type str

# json.dumps: dict -> str
# json.loads: str -> dict
# Convert the data into a dict so the fields are easy to pull out
json_data = json.loads(res)
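For reference, here is a minimal sketch of the GET variant with a custom request header, plus reading fields out of the parsed dict. The URL, header value, and field names are made up for illustration and are not from any real API:

# Hypothetical GET request with a User-Agent header; endpoint and field
# names are placeholders, not a real API.
import urllib2
import json

requrl = "http://example.com/api/list?pageIndex=1"   # hypothetical endpoint
headers = {'User-Agent': 'Mozilla/5.0'}              # some sites reject the default urllib2 UA

req = urllib2.Request(url=requrl, headers=headers)   # no data argument, so this is a GET
res = urllib2.urlopen(req).read()
json_data = json.loads(res)

# Assuming the response looks like {"items": [{"name": ..., "url": ...}, ...]}
for entry in json_data.get('items', []):
    print entry.get('name'), entry.get('url')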

2. Using selenium

from selenium import webdriver

# With PhantomJS, a browser window does not pop up on every start
driver = webdriver.PhantomJS()

requrl = "http://www.baidu.com"
driver.get(requrl)

# Grab the elements you want via XPath
elements = driver.find_elements_by_xpath('//div[@class="wd"]')
for element in elements:
    next_url = element.get_attribute("href")
    # Once the next URL to crawl is obtained here, move on to the next function

driver.quit()
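On pages rendered by JavaScript, the elements may not exist yet when the template above runs. A small sketch of waiting for them with an explicit wait; the XPath and the 10-second timeout are illustrative, not taken from a real page:

# Sketch: wait for dynamically rendered elements before reading them.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()
driver.get("http://www.baidu.com")

try:
    # Block for up to 10 seconds until at least one matching element appears
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[@class="wd"]//a')))
    for element in elements:
        print element.get_attribute("href")
finally:
    driver.quit()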

3. Using scrapy

Create a .py file of your own under the spiders folder; the code is as follows:

from scrapy.spiders import Spider
from hospital.items import hospital_301
# hospital is the project name, hospital_301 is the item class you want to pass around
from scrapy.selector import Selector
import scrapy


# The class name hospital_spider can be changed
class hospital_spider(Spider):
    # The spider's name, used when launching the crawl
    # Launch command: scrapy crawl <spider name>
    name = "301hospital"
    # allowed_domains = ['http://www.301hospital.com.cn']
    start_urls = ["http://www.301hospital.com.cn/web/expert/myhc/yyzj.html"]

    def parse(self, response):
        sel = Selector(response)

        # Extract the links
        elements = sel.xpath('//div[@class="keshiMenu"]//a/@href').extract()
        for element in elements:
            if element == '?':
                pass
            else:
                # Join into an absolute URL
                next_url = "http://www.301hospital.com.cn/" + element
                # Hand off to the next function
                yield scrapy.Request(next_url, callback=self.parse_detail)

        # If an item needs to be passed along, use this form instead
        # yield scrapy.Request(next_url, meta={'item': item1}, callback=self.parse_detail)

    def parse_detail(self, response):
        # .....
        # When done, simply return the item and the pipeline will receive it
        item1 = response.meta['item']
        return item1
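The body of parse_detail is elided above. As a rough, hypothetical sketch of what it usually looks like (the XPaths below are placeholders, not the real structure of the 301 hospital pages), the callback builds the item from the detail page and returns it; it would replace the elided method inside hospital_spider:

# Hypothetical detail-page callback; XPaths are placeholders.
from scrapy.selector import Selector
from hospital.items import hospital_301

def parse_detail(self, response):
    sel = Selector(response)
    item1 = hospital_301()
    item1['name'] = sel.xpath('//div[@class="expert-name"]/text()').extract()    # placeholder XPath
    item1['title'] = sel.xpath('//div[@class="expert-title"]/text()').extract()  # placeholder XPath
    item1['link'] = response.url
    # Returning the item hands it to the pipeline configured in ITEM_PIPELINES
    return item1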

The code for items.py is as follows:

from scrapy import Item, Field


class HospitalItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class hospital_301(Item):
    name = Field()          # expert's name
    title = Field()         # professional title
    department = Field()    # department
    introduction = Field()  # detailed introduction
    specialty = Field()     # physician's specialty
    visit_info = Field()    # outpatient schedule
    photo = Field()         # photo
    link = Field()

The code for pipelines.py is as follows:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb.cursors
from twisted.enterprise import adbapi
from hospital.items import hospital_301  # hospital is the project name, hospital_301 the item class
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log
import chardet

SETTINGS = get_project_settings()


class HospitalPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        # Instantiate the DB connection pool
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            host=SETTINGS['DB_HOST'],
                                            user=SETTINGS['DB_USER'],
                                            passwd=SETTINGS['DB_PASSWD'],
                                            port=SETTINGS['DB_PORT'],
                                            db=SETTINGS['DB_DB'],
                                            charset='utf8',
                                            use_unicode=True,
                                            cursorclass=MySQLdb.cursors.DictCursor)
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Cleanup function, called after crawling has finished.
           Closes the ConnectionPool."""
        self.dbpool.close()

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        name = ""
        title = ""
        department = ""
        introduction = ""
        specialty = ""
        visit_info = ""
        photo = ""
        # Wrap every field access in try/except: if one field is empty, an
        # unguarded access would raise and abort the whole item.
        try:
            name = str(item['name']).decode('raw_unicode_escape').replace("[u'姓名:", "").replace("']", "")
        except:
            pass
        try:
            title = item['title'][0]
        except:
            pass
        try:
            department = item['department'][0].replace("科室:", "")
        except:
            pass
        try:
            introduction = item['introduction'][0]
        except:
            pass
        try:
            specialty = item['specialty'][0]
        except:
            pass
        try:
            visit_info = ''.join(item['visit_info'])
        except:
            pass
        try:
            photo = str(item['photo']).decode('raw_unicode_escape')
        except:
            pass

        # This step can sometimes be removed; it is needed because Chinese text
        # can otherwise end up garbled in the database.
        name = name.encode('utf-8')
        title = title.encode('utf-8')
        department = department.encode('utf-8')
        introduction = introduction.encode('utf-8')
        specialty = specialty.encode('utf-8')
        visit_info = visit_info.encode('utf-8')
        photo = photo.encode('utf-8')

        # For debugging
        # print "name--", name
        # print "title--", title
        # print "department--", department
        # print "introduction--", introduction
        # print "specialty--", specialty
        # print "visit_info--", visit_info
        # print "photo--", photo

        sql = "INSERT INTO hospital_301 VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s')" % \
              (name, title, department, introduction, specialty, visit_info, photo)

        # print sql
        tx.execute(sql)
        print "yes"

    def _handle_error(self, e):
        log.err(e)
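One note on the INSERT above: building SQL by string formatting breaks as soon as a field contains a single quote. A drop-in sketch of the safer, parameterized form (same table and column order assumed as in the template) lets MySQLdb do the escaping:

# Parameterized variant of the insert inside _insert_record;
# MySQLdb substitutes and escapes the values itself.
sql = ("INSERT INTO hospital_301 VALUES "
       "(%s, %s, %s, %s, %s, %s, %s)")
tx.execute(sql, (name, title, department, introduction,
                 specialty, visit_info, photo))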

The code for settings.py is as follows:

# -*- coding: utf-8 -*-

BOT_NAME = 'hospital'

SPIDER_MODULES = ['hospital.spiders']
NEWSPIDER_MODULE = 'hospital.spiders'

COOKIES_ENABLED = False

DOWNLOAD_DELAY = 7  # download delay in seconds

LOG_LEVEL = 'INFO'

# Database parameters
DB_HOST = 'localhost'
DB_PORT = 3306       # port
DB_USER = 'root'     # account
DB_PASSWD = 'xxx'    # password
DB_DB = 'hospitals'  # database name

ITEM_PIPELINES = {
    'hospital.pipelines.HospitalPipeline': 300,  # replace hospital with your project name
}
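The pipeline assumes the hospital_301 table already exists in the hospitals database. A one-off sketch of creating it with MySQLdb, using the connection parameters from settings.py; the column names and types are my own guesses based on the item fields, not a schema from the original project:

# Run once before crawling; column names/types are assumptions.
import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='xxx', db='hospitals', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS hospital_301 (
        name VARCHAR(64),
        title VARCHAR(64),
        department VARCHAR(64),
        introduction TEXT,
        specialty TEXT,
        visit_info TEXT,
        photo VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()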

If you need to download images, the code is as follows:

import urllib

photo_url = "http://www.hfyy.cn/bbyy/upload/2015-5/2015052660742901.jpg"
# Name under which to store the image (you may want to check whether the file already exists)
filename = 'd:\\photos\\' + name + '.jpg'   # name comes from the data parsed earlier
try:
    urllib.urlretrieve(photo_url, filename)
    print "finished"
except Exception, e:
    print e
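The snippet above assumes the d:\photos directory already exists and that name was parsed earlier. A small self-contained sketch that creates the directory if it is missing and skips files that were already downloaded; the literal 'expert' below just stands in for the parsed name:

# Sketch: ensure the target directory exists and avoid re-downloading.
import os
import urllib

photo_url = "http://www.hfyy.cn/bbyy/upload/2015-5/2015052660742901.jpg"
save_dir = 'd:\\photos'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

filename = os.path.join(save_dir, 'expert' + '.jpg')  # 'expert' stands in for the parsed name
if not os.path.exists(filename):
    urllib.urlretrieve(photo_url, filename)
    print "finished"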


