Scrapy 框架——POST 请求和页面的二次跳转
来源:互联网 发布:怎么在淘宝上直播 编辑:程序博客网 时间:2024/06/15 01:11
# -*- coding:utf-8 -*-
"""Scrapy spider: request a JSON listing, then follow each record to its detail page."""
import json

import lxml  # noqa: F401 -- not referenced directly; BeautifulSoup is asked for the 'lxml' parser
import scrapy
from bs4 import BeautifulSoup

from p1.items import P1Item

# Request headers shared by every request the spider issues.  The original
# listing referenced a ``headers`` name that was never defined; fill this in
# (User-Agent, cookies, ...) as the target site requires.
HEADERS = {}


class XiaoHuarSpider(scrapy.Spider):
    """Two-step crawl: a POST to a JSON listing, then a GET per listed record.

    Flow:
      1. ``start_requests`` POSTs to the listing URL(s).
      2. ``parse`` decodes the JSON response and schedules one GET request
         per record, using the record's ``cBh`` value to build the URL.
      3. ``two_parse`` scrapes the detail page and yields a ``P1Item``.

    The URL prefixes (``'****'`` and ``'***'``) appear to be placeholders and
    must be replaced with the real endpoints.
    """

    name = "keche"

    def start_requests(self):
        """Yield the initial listing request(s).

        Pattern reminder:
          * POST: ``scrapy.FormRequest(url=..., headers=..., callback=...)``
            (the callback defaults to ``parse`` when omitted).
          * GET:  ``scrapy.Request(url=..., headers=..., callback=self.<handler>)``.
        """
        for page in range(1, 2):
            url = '****' + str(page)
            # FormRequest only switches to POST by itself when ``formdata`` is
            # supplied, so the method is set explicitly here.
            yield scrapy.FormRequest(
                url=url,
                method='POST',
                headers=HEADERS,
                callback=self.parse,
            )

    def parse(self, response):
        """Parse the listing response (page one) and follow each record.

        The response body is JSON with a ``data`` list.  Each record also
        carries ``cah``, ``cajlb``, ``cbt``, ``cslfyMc`` and ``cygMc``; the
        original read them into locals but never used them, so only ``cBh``
        (needed for the detail URL) is read here.
        """
        records = json.loads(response.text)['data']
        for record in records:
            # Link to the detail page, derived from the first page's data.
            detail_url = '***' + record['cBh']
            yield scrapy.Request(
                url=detail_url,
                headers=HEADERS,
                callback=self.two_parse,
            )

    def two_parse(self, response):
        """Parse a detail page (page two) and yield the scraped item.

        ``a1``..``a3`` come from the ``div.fd-fix`` container; ``a4``..``a6``
        are emitted as empty strings, as in the original.  A page without the
        container is logged and skipped instead of raising ``AttributeError``.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        container = soup.find('div', class_='fd-fix')
        if container is None:
            self.logger.warning(
                "No div.fd-fix container found in %s; skipping", response.url
            )
            return

        yield P1Item(
            a1=self._node_text(container.find('h2')),
            a2=self._node_text(container.find('h5')),
            a3=self._node_text(container.find('div', class_='fd-alt-all')),
            a4='',
            a5='',
            a6='',
        )

    @staticmethod
    def _node_text(node):
        """Return ``node.text``, or an empty string when the tag was not found."""
        return node.text if node is not None else ''
阅读全文