JD Phone Info Crawler


I've recently been learning Python and studying web crawlers. I built a simple spider with Scrapy that scrapes JD phone listings; so far it only crawls the first page's phone names, prices, shop names, sales counts, and so on. It's not very complete yet; I'll keep studying and improve it step by step.

1. The IDE is PyCharm; getting the environment set up was a bit of a hassle.

2. Because the page is rendered dynamically, I couldn't extract the price and other fields at first, so I use PyQt5 to load the page and only then scrape the information (a minimal sketch of this trick follows the list).

3. The scraped information is exported to an xlsx file.
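Stripped down, the rendering trick from item 2 looks roughly like this. This is my own sketch, not code from the project, and the helper name render_page is mine:

import sys
from PyQt5 import QtWidgets
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView

def render_page(url):
    # Spin up an off-screen browser, load the URL, and block until the
    # JavaScript-rendered HTML is handed back by the toHtml() callback.
    app = QtWidgets.QApplication(sys.argv)
    view = QWebEngineView()
    result = []

    def on_load(ok):
        # toHtml() is asynchronous: it delivers the page source to a callback.
        view.page().toHtml(lambda html: (result.append(html), app.quit()))

    view.loadFinished.connect(on_load)
    view.load(QUrl(url))
    app.exec_()  # returns once app.quit() fires inside the callback
    return result[0]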


Jdspider.py

import sys
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from PyQt5 import QtWidgets
from PyQt5.QtCore import QUrl, QTimer
from PyQt5.QtWebEngineWidgets import QWebEngineView
from jdcrawl.items import JdcrawlItem


class JdSpider(CrawlSpider):
    name = "jdSpider"
    view = None
    app = None
    html = None
    isloadFinish = False
    allowed_domains = ["jd.com"]
    start_urls = [
        "https://list.jd.com/list.html?cat=9987,653,655&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=6#J_main"
    ]

    def _callable(self, html):
        # Receives the rendered page source from toHtml(), stores it, and
        # quits the Qt event loop so parse() can resume.
        self.html = html
        print("app quit")
        self.app.quit()
        self.isloadFinish = True
        print("_callable")
        # Dump the rendered HTML to a file for debugging.
        filename = 'response.html'
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(html)

    def parserHtml(self):
        # Earlier parsing experiment; unused now that parse() does this inline.
        print("parserHtml..")
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            name = sel.xpath('div/div[@class="p-name"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            print(name)
            # yield item

    def _timer_for_html(self):
        print("_timer_for_html")
        # toHtml() is asynchronous; it hands the page source to the callback.
        self.view.page().toHtml(self._callable)

    def _loadFinished(self, result):
        print("load finish.....")
        # Give the page's JavaScript two more seconds to fill in prices etc.
        QTimer.singleShot(2 * 1000, self._timer_for_html)

    def parse(self, response):
        print("parse")
        # Render the dynamic page in a QWebEngineView, then scrape the result.
        self.app = QtWidgets.QApplication(sys.argv)
        self.view = QWebEngineView()
        self.view.loadFinished.connect(self._loadFinished)
        self.view.load(QUrl(response.url))
        self.app.exec_()  # blocks until _callable() calls app.quit()
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            name = sel.xpath('div/div[@class="p-name p-name-type3"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            if len(name) > 0:
                name = name[0].strip()
            if len(shop) > 0:
                shop = shop[0].strip()
            if len(price) > 0:
                price = price[0].strip()
            if len(comment) > 0:
                comment = comment[0].strip()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            yield item
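With the project files in place, the spider can be run with the standard Scrapy command from the project root (assuming the project is named jdcrawl, as the imports suggest):

scrapy crawl jdSpider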

items.py

import scrapy


class JdcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    phoneName = scrapy.Field()
    phoneShop = scrapy.Field()
    price = scrapy.Field()
    comments = scrapy.Field()


pipelines.py

from openpyxl import Workbook


class JdcrawlPipeline(object):
    wb = Workbook()
    ws = wb.active
    ws.append(['手机名称', '店名', '价格', '成交量'])  # header row: phone name, shop, price, sales count

    def process_item(self, item, spider):
        print("process item")
        line = [item['phoneName'], item['phoneShop'], item['price'], item['comments']]  # pull each field out of the item
        print(line)
        self.ws.append(line)  # append the row to the worksheet
        self.wb.save('phoneinfo.xlsx')  # save the xlsx file
        return item
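One thing worth noting: process_item() rewrites phoneinfo.xlsx for every scraped item. A common refinement is to use Scrapy's open_spider/close_spider pipeline hooks so the file is written once, when the crawl finishes. This is my sketch, not part of the original post, and the class name XlsxExportPipeline is mine:

from openpyxl import Workbook


class XlsxExportPipeline(object):
    def open_spider(self, spider):
        # Create the workbook and header row once, when the crawl starts.
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['手机名称', '店名', '价格', '成交量'])

    def process_item(self, item, spider):
        self.ws.append([item['phoneName'], item['phoneShop'],
                        item['price'], item['comments']])
        return item

    def close_spider(self, spider):
        self.wb.save('phoneinfo.xlsx')  # write the file once, at the end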


Add the following to settings.py:

ITEM_PIPELINES = {
    'jdcrawl.pipelines.JdcrawlPipeline': 300,
}


Although I had used Qt before, I'm not familiar with PyQt5, so an error dialog pops up while the spider runs. I'll sort that out later.
