电商评价质量评分模型(二)

来源:互联网 发布:淘宝关键词卡首页 编辑:程序博客网 时间:2024/06/05 16:01

爬取某个商品的评价信息

爬虫工具:Scrapy

spider1/spiders/spider1.py

# -*- coding: utf-8 -*-import loggingimport refrom scrapy.selector import Selectorfrom scrapy.spiders import Spiderfrom scrapy.utils.response import get_base_urlfrom spider1.items import FeedbackItemlogger = logging.getLogger('xxx_spider_logger')class XxxSpider(Spider)    name = "xxx"    allowed_domains = ["xxx.com"]    start_urls = [        "http://feedback.xxx.com/display/productEvaluation.htm?productId=32349005878&ownerMemberId=202654736&companyId=215114749&memberType=seller&startValidDate=&i18n=true"    ]    def parse(self, response):        base_url = get_base_url(response)        print('base_url: ' + base_url)        items = []        sel = Selector(response)        feedbacks = sel.xpath('//div[@class="feedback-item clearfix"]')        logger.info('feedbacks length: %i', len(feedbacks))        for feedback in feedbacks:            item = FeedbackItem()            #user_country            fb_user_info = feedback.xpath('div[@class="fb-user-info"]')            item['user_country'] = fb_user_info.xpath('div[@class="user-country"]/b/text()').extract()[0]            # logger.info('user_country: %s', item['user_country'])            #fb_rate_info            fb_main = feedback.xpath('div[@class="fb-main"]')            item['fb_rate_info'] = fb_main.xpath('div[@class="f-rate-info"]/span[@class="star-view"]/span/@style').re('width:(\d+)%')[0]            # logger.info('fb_rate_info: %s', item['fb_rate_info'])            #fb_content            item['fb_content'] = fb_main.xpath('div[@class="f-content"]/dl/dt/span/text()').extract()[0]            # logger.info('fb_content: %s', item['fb_content'])            items.append(item)        return items

spider1/items.py

# -*- coding: utf-8 -*-# Define here the models for your scraped items## See documentation in:# http://doc.scrapy.org/en/latest/topics/items.htmlfrom scrapy.item import Item, Fieldclass FeedbackItem(Item):    user_country = Field()    fb_rate_info = Field()    fb_content = Field()

spider1/pipelines.py

# -*- coding: utf-8 -*-# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.htmlimport  jsonclass XxxPipeline(object):    def __init__(self):        self.file = open('item.json', 'wb')    def process_item(self, item, spider):        line = json.dumps(dict(item)) + "\r\n"        self.file.write(line)        return item

spider1/settings.py

# -*- coding: utf-8 -*-
"""Scrapy settings for the spider1 project.

For simplicity, this file contains only settings considered important or
commonly used. Full reference:
http://doc.scrapy.org/en/latest/topics/settings.html
"""

BOT_NAME = 'xxx'

SPIDER_MODULES = ['spider1.spiders']
NEWSPIDER_MODULE = 'spider1.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'spider1 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'spider1.pipelines.XxxPipeline': 300,
}

# Other tunables (concurrency, delays, cookies, AutoThrottle, HTTP cache, ...)
# are left at Scrapy's defaults; see the settings reference above to enable them.

相关文章:
电商评价质量评分模型(一)

0 0