pyspider抓取数据信息

来源:互联网 发布:淘宝连衣裙店铺 编辑:程序博客网 时间:2024/06/05 03:07

最近需要抓取一些信息,于是了解了一下 pyspider。话不多说,直接上代码:代码包括数据抓取、解析以及存入 MySQL 数据库。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-09-06 09:53:44
# Project: dmp - log in, then scrape data

from pyspider.libs.base_handler import *
import re


class Handler(BaseHandler):
    """pyspider handler that logs in through the site's form and scrapes goods.

    Flow: ``on_start`` fetches the entry page, ``login`` submits the form
    found on it, ``login_ok`` requests the goods list, ``index_page`` follows
    each matching link and ``detail_page`` extracts the fields of one page.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko)',
        }
    }

    @every(minutes=0)
    def on_start(self):
        """Fetch the entry page and hand the response to ``login``."""
        # The scheme used to be doubled ("http://http://..."), which is not a
        # resolvable URL.
        self.crawl('http://xxxx.com/', callback=self.login)

    @config(age=0)
    def login(self, response):
        """Fill in the form found on the page and POST it.

        Every ``form input`` is copied into the payload by name; the
        ``username`` field and the field of type ``password`` are then
        overwritten with the account credentials.
        """
        cookies = response.cookies
        url = response.doc("form").attr("action")

        data = {}
        for each in response.doc("form input"):
            # Copy the field as-is first so the other inputs of the form are
            # sent back with whatever value the page gave them.
            data[each.name] = each.value
            if each.name == "username":
                data["username"] = "test"
            if each.type == "password":
                data["password"] = "123456"

        headers = {}
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        headers["Referer"] = "http://xxx.com/login"
        headers["Origin"] = "http://xxx.com"

        self.crawl(url, callback=self.login_ok, data=data, cookies=cookies,
                   headers=headers, method="POST")

    @config(priority=2)
    def login_ok(self, response):
        """Request the goods list, forwarding the cookies of the login response."""
        self.crawl("http://xxx.com/distributionGoods/list?pageSize=100&status=1",
                   cookies=response.cookies, callback=self.index_page)

    @config(age=0)
    def index_page(self, response):
        """Queue a ``detail_page`` crawl for every ``.active p > a`` link."""
        for each in response.doc(".active p > a").items():
            self.crawl(each.attr.href, cookies=response.cookies,
                       callback=self.detail_page)

    @config(age=0)
    def detail_page(self, response):
        """Return the URL, title, product name and price text of one page."""
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "productName": response.doc('.separate-line > .column-col1 > div > div').eq(0).text(),
            "price": response.doc('.column-col3 > div > div').text(),
        }

抓取数据入库

# First, import pymysql:
import pymysql


# Then collect the scraped rows. This variant takes `self` and has the same
# name as the handler's detail_page, so it is presumably meant to replace that
# method - confirm before pasting it into the class.
@config(priority=2)
def detail_page(self, response):
    """Return one dict per ``tr`` element of the response document.

    Each dict maps the eleven column labels below, in order, to the text of
    the ``td`` cell at the same position inside that row.
    """
    column_labels = (
        "订单号",
        "订单ID",
        "订单号22",
        "订单日期",
        "订单状态",
        "省分",
        "地市",
        "商品类型",
        "商品名称",
        "套餐名称",
        "终端品牌",
    )

    records = []
    for row in response.doc("tr").items():
        cells = row('td')
        records.append({
            label: cells.eq(position).text()
            for position, label in enumerate(column_labels)
        })
    return records

重写on_result方法

def on_result(self, result):
    """Normalise the scraped order rows and insert them into ``ORDER_TMP``.

    Args:
        result: ``None`` or an iterable of dicts keyed by the column labels
            read below (as produced by the list-returning ``detail_page``).

    Rows whose order number is empty are skipped. The order date is converted
    from ``YYYYMMDD`` text to a ``datetime`` (``None`` when empty), and a
    package name or terminal brand of ``"-"`` is stored as ``None``. Database
    errors are printed and the transaction is rolled back; they are not
    re-raised.
    """
    # Function-scope import: the block needs datetime and the visible file
    # does not import it at the top.
    import datetime

    print("result: ", result)
    if result is None:
        return

    rows = []
    for order in result:
        # Rows without an order number are not inserted.
        if order["订单号"] == "":
            continue

        # Reformat the date: "YYYYMMDD" -> "YYYY-MM-DD" -> datetime.
        raw_date = order["订单日期"]
        if raw_date != "":
            dashed = raw_date[0:4] + "-" + raw_date[4:6] + "-" + raw_date[6:]
            trade_date = datetime.datetime.strptime(dashed, '%Y-%m-%d')
        else:
            trade_date = None

        # A package name / terminal brand of "-" means there is none.
        package_name = None if order["套餐名称"] == "-" else order["套餐名称"]
        terminal_brand = None if order["终端品牌"] == "-" else order["终端品牌"]

        rows.append((
            order["订单号"],
            order["订单ID"],
            order["订单号22"],
            trade_date,
            order["订单状态"],
            order["省分"],
            order["地市"],
            order["商品类型"],
            order["商品名称"],
            package_name,
            terminal_brand,
        ))

    if not rows:
        # Nothing to insert, so do not open a connection at all.
        return

    # The statement used to name 12 columns (including NET_TYPE) with 12
    # placeholders while each row carries 11 values, so every executemany
    # call failed. NET_TYPE has no scraped counterpart and is left to the
    # table's default.
    sql = """INSERT INTO ORDER_TMP(ID, TRADE_ID, ESS_TRADE_ID,
             TRADE_DATE, TRADE_STATUS, PROVINCE, CITY, GOODS_TYPE,
             GOODS_NAME, PACKAGE_NAME, TERMINAL_BRAND) VALUES
             (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    print("sql: ", sql)

    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider moving them to configuration.
    db_config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'admin',
        'password': '123456',
        'db': 'sales',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    db = pymysql.connect(**db_config)
    try:
        with db.cursor() as cursor:
            cursor.executemany(sql, rows)
        db.commit()
    except Exception as e:
        # Best effort: report the failure and undo the partial insert.
        print("exception have occur: ", e)
        db.rollback()
    finally:
        # Always release the connection, including when rollback itself fails.
        db.close()
原创粉丝点击