基于scrapy的智联职位爬取
来源:互联网 发布:爬虫为什么用python 编辑:程序博客网 时间:2024/06/05 13:21
一、创建项目
cmd命令行界面,进入要保存项目的路径E:\python\project,执行创建scrapy项目命令:scrapy startproject zhaopin_zhilian
结果:会在E:\python\project目录中创建一个名叫zhaopin_zhilian的项目。
注:语法scrapy startproject 项目名
二、执行项目
cmd命令行界面,进入到项目目录E:\python\project\zhaopin_zhilian,执行运行scrapy项目的命令:scrapy crawl zhilianspider
结果:执行这个项目中名为zhilianspider的爬虫程序
注:语法scrapy crawl 爬虫名
以下是项目中各个部分的编写:
1、项目截图
2、setting.py项目配置文件
# -*- coding: utf-8 -*-
# Scrapy settings for the zhaopin_zhilian project.
#
# Only the settings this crawler actually relies on are kept active; the
# full catalogue is documented at
# http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'zhaopin_zhilian'

SPIDER_MODULES = ['zhaopin_zhilian.spiders']
NEWSPIDER_MODULE = 'zhaopin_zhilian.spiders'

# Send crawl logs to a file rather than stdout.
LOG_FILE = "thelog_zhilian.log"
LOG_ENABLED = True

# Ignore robots.txt rules for this crawl.
ROBOTSTXT_OBEY = False

# Wait one second between requests to the same site.
DOWNLOAD_DELAY = 1

# Every scraped item is handed to the MySQL persistence pipeline.
ITEM_PIPELINES = {
    'zhaopin_zhilian.pipelines.ZhaopinZhilianPipeline': 1,
}
3、zhilianitems.py组件编写
# -*- coding: utf-8 -*-# Define here the models for your scraped items## See documentation in:# http://doc.scrapy.org/en/latest/topics/items.htmlimport scrapyclass ZhiLianItem(scrapy.Item): zwmc = scrapy.Field()#职位名称 fkl = scrapy.Field()#反馈率 gsmc = scrapy.Field()#公司名称 zwyx = scrapy.Field()#职位月薪 #zwyx_least = scrapy.Field()#职位月薪最低 #zwyx_most = scrapy.Field()#职位月薪最高 gzdd = scrapy.Field()#工作地点 #gxsj = scrapy.Field()#更新时间 zwmcurl = scrapy.Field()#职位地址 url = scrapy.Field()#url fl = scrapy.Field()#福利 fbrq = scrapy.Field()#发布日期 gzxz = scrapy.Field()#工作性质 gzjy = scrapy.Field()#工作经验 zdxl = scrapy.Field()#最低学历 zprs = scrapy.Field()#招聘人数 zwlb = scrapy.Field()#职位类别 #gwzz = scrapy.Field()#岗位职责 #rzyq = scrapy.Field()#任职要求 zwms = scrapy.Field()#职位描述 gzdz = scrapy.Field()#工作地址 #lxdh = scrapy.Field()#联系电话 #email = scrapy.Field()#邮箱4、spider组件zhilianspider.py编写
# -*- coding:UTF-8 -*-
import scrapy
import logging
import re
import traceback
import urllib.request
from scrapy.http import Request
from zhaopin_zhilian.zhilianitems import ZhiLianItem
from zhaopin_zhilian.commoncode import *

# Job-site priority: zhilian, 51job, haitou, yingjiesheng, liepin, chinahr.
# This spider covers zhilian (zhaopin.com).


class ZhiLianSpider(scrapy.Spider):
    logger = logging.getLogger()
    name = "zhilianspider"
    allowed_domains = ['zhaopin.com']

    # Shared progress counters (also read by other components).
    CommonCode.DEALNUMBER = 0
    CommonCode.ALLNUMBER = 0

    # URL-encoded "jl" (location) query values, five regions per search so
    # the whole country is covered by six requests per company:
    #   1. Guangdong+Jiangsu+Shanxi+Hunan+Qinghai
    #   2. Hubei+Shandong+Inner Mongolia+Hainan+Ningxia
    #   3. Shaanxi+Zhejiang+Heilongjiang+Guizhou+Xinjiang
    #   4. Sichuan+Guangxi+Fujian+Yunnan+Hong Kong
    #      (BUGFIX: this entry previously duplicated group 1's URL, so these
    #      five regions were never actually searched.)
    #   5. Liaoning+Anhui+Jiangxi+Tibet+Macao
    #   6. Jilin+Hebei+Henan+Gansu+Taiwan
    REGION_GROUPS = [
        '%E5%B9%BF%E4%B8%9C%2B%E6%B1%9F%E8%8B%8F%2B%E5%B1%B1%E8%A5%BF%2B%E6%B9%96%E5%8D%97%2B%E9%9D%92%E6%B5%B7',
        '%E6%B9%96%E5%8C%97%2B%E5%B1%B1%E4%B8%9C%2B%E5%86%85%E8%92%99%E5%8F%A4%2B%E6%B5%B7%E5%8D%97%2B%E5%AE%81%E5%A4%8F',
        '%E9%99%95%E8%A5%BF%2B%E6%B5%99%E6%B1%9F%2B%E9%BB%91%E9%BE%99%E6%B1%9F%2B%E8%B4%B5%E5%B7%9E%2B%E6%96%B0%E7%96%86',
        '%E5%9B%9B%E5%B7%9D%2B%E5%B9%BF%E8%A5%BF%2B%E7%A6%8F%E5%BB%BA%2B%E4%BA%91%E5%8D%97%2B%E9%A6%99%E6%B8%AF',
        '%E8%BE%BD%E5%AE%81%2B%E5%AE%89%E5%BE%BD%2B%E6%B1%9F%E8%A5%BF%2B%E8%A5%BF%E8%97%8F%2B%E6%BE%B3%E9%97%A8',
        '%E5%90%89%E6%9E%97%2B%E6%B2%B3%E5%8C%97%2B%E6%B2%B3%E5%8D%97%2B%E7%94%98%E8%82%83%2B%E5%8F%B0%E6%B9%BE%E7%9C%81',
    ]

    # Build one search URL per (company, region group).  The customer file
    # holds one company name per line; blank lines are skipped.
    # BUGFIX: the file handle was previously never closed — use `with`.
    start_urls = []
    with open("/opt/customer_zhilian.txt") as _customer_file:
        for _line in _customer_file:
            _company = _line.strip()
            if _company != '':
                _kw = urllib.request.quote(_company)
                for _jl in REGION_GROUPS:
                    start_urls.append(
                        'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=' + _jl +
                        '&kw=' + _kw + '&p=1&kt=2&isadv=0')
                CommonCode.ALLNUMBER = CommonCode.ALLNUMBER + 6
    print("一共客户请求数:" + str(len(start_urls)))

    def parse(self, response):
        """Handle page 1 of a search: yield detail requests, then paginate.

        Yields Requests to parse_item_info (detail pages, exact company
        matches only) and to parse_item (result pages 2..N).
        """
        CommonCode.DEALNUMBER = CommonCode.DEALNUMBER + 1
        progress = "处理进度:" + str(CommonCode.DEALNUMBER) + "/" + str(CommonCode.ALLNUMBER)
        print(progress)
        self.logger.info(progress)
        try:
            # Each posting is one <table class="newlist">; index 0 is the
            # column-header table, so skip it.
            zw_table = response.xpath('//table[@class="newlist"]')
            for i in range(len(zw_table)):
                if i > 0:
                    row = zw_table[i]
                    zwmc = row.xpath('.//td[@class="zwmc"]//div/a[1]/text()').extract()
                    fkl = row.xpath('.//td[@class="fk_lv"]//span/text()').extract()
                    fkl = fkl[0] if len(fkl) > 0 else ''
                    zwmcurl = row.xpath('.//td[@class="zwmc"]//div/a[1]/@href').extract()
                    gsmc = row.xpath('.//td[@class="gsmc"]//a[1]')[0].xpath('string(.)').extract()
                    zwyx = row.xpath('.//td[@class="zwyx"]//text()').extract()
                    gzdd = row.xpath('.//td[@class="gzdd"]//text()').extract()
                    item = ZhiLianItem()
                    item['zwmc'] = zwmc[0]
                    item['fkl'] = fkl
                    item['gsmc'] = gsmc[0]
                    item['zwyx'] = zwyx[0]
                    item['gzdd'] = gzdd[0]
                    item['url'] = response.url
                    item['zwmcurl'] = zwmcurl[0]
                    # The site search is fuzzy; only follow postings whose
                    # company name matches the searched keyword exactly.
                    theGsmc = urllib.request.unquote(response.url.split('&kw=')[1].split('&p=')[0])
                    if theGsmc == item['gsmc']:
                        yield Request(item['zwmcurl'], meta={'item': item},
                                      callback=self.parse_item_info)
        except Exception as err:
            print(err)
            self.logger.info("处理第一页职位列表异常:" + response.url + str(err))
            CommonCode.insertErrorLog("处理第一页职位列表出错:" + response.url, str(err))
        # Pagination: read the total hit count and request pages 2..N
        # (60 results per page).
        try:
            countNumber = int(response.xpath('//span[@class="search_yx_tj"]//em/text()').extract()[0])
            if countNumber > 0:
                theUrl = response.url
                perPageNumber = 60
                # Ceiling division: a partial last page still counts.
                countPage = countNumber // perPageNumber + (1 if countNumber % perPageNumber > 0 else 0)
                for m in range(countPage):
                    if m > 0:
                        nextUrl = (theUrl.split('&p=')[0] + '&p=' + str(m + 1) +
                                   '&kt=' + theUrl.split('&kt=')[1])
                        yield Request(nextUrl, meta={}, callback=self.parse_item)
        except Exception as err:
            print(err)
            traceback.print_exc()
            self.logger.info("获取下一页异常:" + response.url + str(err))
            CommonCode.insertErrorLog("获取下一页出错:" + response.url, str(err))

    def parse_item(self, response):
        """Handle result pages 2..N (no company filtering on these pages)."""
        try:
            zw_table = response.xpath('//table[@class="newlist"]')
            for i in range(len(zw_table)):
                if i > 0:  # index 0 is the header table
                    row = zw_table[i]
                    zwmc = row.xpath('.//td[@class="zwmc"]//div/a[1]/text()').extract()
                    fkl = row.xpath('.//td[@class="fk_lv"]//span/text()').extract()
                    fkl = fkl[0] if len(fkl) > 0 else ''
                    zwmcurl = row.xpath('.//td[@class="zwmc"]//div/a[1]/@href').extract()
                    # Company name may be wrapped in <b> (keyword highlight);
                    # fall back to the plain link text.
                    gsmc = row.xpath('.//td[@class="gsmc"]//a[1]/b/text()').extract()
                    if len(gsmc) == 0:
                        gsmc = row.xpath('.//td[@class="gsmc"]//a[1]/text()').extract()
                    zwyx = row.xpath('.//td[@class="zwyx"]//text()').extract()
                    gzdd = row.xpath('.//td[@class="gzdd"]//text()').extract()
                    item = ZhiLianItem()
                    item['zwmc'] = zwmc[0]
                    item['fkl'] = fkl
                    item['gsmc'] = gsmc[0]
                    item['zwyx'] = zwyx[0]
                    item['gzdd'] = gzdd[0]
                    item['url'] = response.url
                    item['zwmcurl'] = zwmcurl[0]
                    yield Request(item['zwmcurl'], meta={'item': item},
                                  callback=self.parse_item_info)
        except Exception as err:
            print(err)
            traceback.print_exc()
            self.logger.info("处理下一页职位列表异常:" + response.url + str(err))
            CommonCode.insertErrorLog("处理下一页职位列表出错:" + response.url, str(err))

    def parse_item_info(self, response):
        """Parse one position's detail page and yield the completed item."""
        try:
            item = response.meta['item']
            # Benefits: comma-joined tag list.
            flarray = response.xpath('//div[@class="welfare-tab-box"]//span/text()').extract()
            fl = ','.join(flarray)
            # Publish date: rendered inside a <span> for recent dates,
            # otherwise as plain <strong> text.
            fbrqs = response.xpath('//ul[@class="terminal-ul clearfix"]//li[3]/strong/span/text()').extract()
            if len(fbrqs) == 0:
                fbrq = response.xpath('//ul[@class="terminal-ul clearfix"]//li[3]/strong/text()').extract()[0]
            else:
                fbrq = fbrqs[0]
            gzxz = response.xpath('//ul[@class="terminal-ul clearfix"]//li[4]/strong/text()').extract()[0]
            gzjy = response.xpath('//ul[@class="terminal-ul clearfix"]//li[5]/strong/text()').extract()[0]
            zdxl = response.xpath('//ul[@class="terminal-ul clearfix"]//li[6]/strong/text()').extract()[0]
            zprs = response.xpath('//ul[@class="terminal-ul clearfix"]//li[7]/strong/text()').extract()[0]
            zwlb = response.xpath('//ul[@class="terminal-ul clearfix"]//li[8]/strong/a[1]/text()').extract()[0]
            # Job description: full text of the first tab panel, whitespace
            # stripped; cut off the trailing "工作地址:" (address) section.
            zwmss = response.xpath('//div[@class="terminalpage-main clearfix"]//div[@class="tab-cont-box"]/div[1]')
            zwms = ''
            if len(zwmss) > 0:
                zwms = zwmss[0].xpath('string(.)').extract()[0].strip().replace(' ', '')
                if '工作地址:' in zwms:
                    zwms = zwms[0:int(zwms.index('工作地址:'))]
            # Work address: last <h2> inside the tab body, if any.
            gzdzs = response.xpath('//div[@class="tab-inner-cont"]//h2/text()').extract()
            gzdz = gzdzs[len(gzdzs) - 1] if len(gzdzs) > 0 else ''
            item['fl'] = fl
            item['fbrq'] = fbrq
            item['gzxz'] = gzxz
            item['gzjy'] = gzjy
            item['zdxl'] = zdxl
            item['zprs'] = zprs
            item['zwlb'] = zwlb
            item['zwms'] = zwms
            item['gzdz'] = gzdz.strip()
            self.logger.info("解析成功:" + item['gsmc'] + "-" + item['zwmc'])
            yield item
        except Exception as err:
            print(err)
            traceback.print_exc()
            self.logger.info("处理职位详情异常:" + response.url + str(err))
            CommonCode.insertErrorLog("处理职位详情出错:" + response.url, str(err))
5、pipelines.py
# -*- coding: utf-8 -*-# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.htmlimport scrapyimport loggingimport pymysql #python3连接数据库的模块pymysql from zhaopin_zhilian.commoncode import *class ZhaopinZhilianPipeline(object): logger = logging.getLogger() def database(self, item): try: cxn = pymysql.Connect(host = '192.168.72.164', user = 'root', passwd = 'newcapec',db="zhaopin",charset = "utf8") #游标 cur = cxn.cursor() sql = "insert into source_zwxx_zhilian (zwmc,fkl,gsmc,zwyx,gzdd,url,zwmcurl,fl,fbrq,gzxz,gzjy,zdxl,zprs,zwlb,zwms,gzdz,cjsj,source,level) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),'智联招聘','1')" sql_delete = "delete from source_zwxx_zhilian where zwmc=%s and gsmc=%s and gzdd=%s" self.logger.info("删除语句:"+sql_delete) self.logger.info("添加语句:"+sql) cur.execute(sql_delete,[item['zwmc'],item['gsmc'],item['gzdd']]) cur.execute(sql,[item['zwmc'],item['fkl'],item['gsmc'],item['zwyx'],item['gzdd'],item['url'],item['zwmcurl'],item['fl'],item['fbrq'],item['gzxz'],item['gzjy'],item['zdxl'],item['zprs'],item['zwlb'],item['zwms'],item['gzdz']]) #关闭 cur.close() cxn.commit() cxn.close() except Exception as err: self.logger.info("保存Item异常"+str(err)+":"+item['gsmc']+"-"+item['zwmc']) CommonCode.insertErrorLog("保存Item出错:",str(err)) print("插入采集职位表出错啦。。。") print(err) def process_item(self, item, spider): self.database(item) return item
6、commoncode.py
import scrapyimport pymysqlimport loggingclass CommonCode(object): ALLNUMBER=0 DEALNUMBER=0 EXCEPTIONNUMBER=0 #收集错误日志,采集中错误的信息都放到此表 def insertErrorLog(msg,errorStr): logger = logging.getLogger() try: #cxn = pymysql.Connect(host = '127.0.0.1', user = 'root', passwd = 'root',db="zhaopin",charset = "utf8") cxn = pymysql.Connect(host = '192.168.72.164', user = 'root', passwd = 'newcapec',db="zhaopin",charset = "utf8") #游标 cur = cxn.cursor() sql = "insert into source_zwxx_zhilian_errlog (msg,err,sj) values (%s,%s,now())" cur.execute(sql,[msg,errorStr]) #关闭 cur.close() cxn.commit() cxn.close() except Exception as err: self.logger.info("插入错误日志表异常:"+errorStr) print("插入错误日志表出错啦。。。") print(err)
阅读全文
0 0
- 基于scrapy的智联职位爬取
- Scrapy框架爬取腾讯招聘所有职位
- 基于Scrapy爬取网页文章
- 爬取51job的职位信息
- 基于scrapy框架的关于58同城招聘网站信息的爬取
- 基于Python2.7和Scrapy, 爬取豆瓣9分榜单
- Scrapy 爬取 豆瓣电影的短评
- scrapy爬取post的数据
- scrapy捕获爬取失败的url
- 用Python 的 Scrapy 爬取 网站
- 爬取招聘职位一
- scrapy实战-爬取
- Scrapy爬取图片
- scrapy爬取图片
- Scrapy爬取1
- scrapy 爬取漫画
- scrapy爬取图片
- scrapy爬取链接
- Android View的绘制流程
- lintcode-最后一个单词的长度
- .net中的托管和非托管
- CentOS7 搭建LVS
- gc日志打印
- 基于scrapy的智联职位爬取
- (2)selenium常见问题-兼容性与键盘事件等
- jasig cas笔记(一):基础(非代理)认证流程
- Android网络编程Okhttp3用法详解
- 【canvas学习笔记六】状态保存和变换
- Java-String类的常用方法总结
- java web 开发所需要的依赖包maven
- retrofit网络框架源码解析
- python里怎么样通过函数名称来获取函数地址