(Hands-on Project 2) 阳光热线问政平台 (Sunshine Hotline Q&A Platform)


Target site: 阳光热线问政平台 (Sunshine Hotline Q&A Platform)

http://wz.sun0769.com/index.php/question/questionType?type=4

Crawl each complaint post's number, URL, title, and body text.

items.py


import scrapy


class DongguanItem(scrapy.Item):
    # Title of each post
    title = scrapy.Field()
    # Number of each post
    number = scrapy.Field()
    # Text content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
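As an aside (this is not one of the project files), a scrapy.Item behaves like a dictionary, so the fields defined above can be assigned and read with normal key syntax. A minimal sketch with placeholder values:

from dongguan.items import DongguanItem

item = DongguanItem()
item['title'] = u'example title'          # placeholder value, for illustration only
item['url'] = u'http://example.com/post'  # placeholder value, for illustration only
print(dict(item))                         # an item converts cleanly to a plain dict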

spiders/sunwz.py

Spider version

# -*- coding: utf-8 -*-
import scrapy
from dongguan.items import DongguanItem


class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # Extract the list of post links on the current listing page
        links = response.xpath("//div[@class='greyframe']/table//td/a[@class='news14']/@href").extract()
        # Send a request for each post; parse_item handles the response
        for link in links:
            yield scrapy.Request(link, callback=self.parse_item)
        # Stop paginating at the last page; each new listing-page request is handled by parse again
        if self.offset <= 71130:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    # Handle each individual post page
    def parse_item(self, response):
        item = DongguanItem()
        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Text content: first try the layout used when the post contains images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If that is empty, fall back to the layout used when there are no images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list; join it into a string and strip leading/trailing whitespace
        item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url
        yield item
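The XPath expressions are the fragile part of this spider, so before running it you may want to try them interactively in the Scrapy shell. A minimal sketch, using the first listing page from start_urls:

# Launched from the command line:
#   scrapy shell "http://wz.sun0769.com/index.php/question/questionType?type=4&page=0"
# Then, at the shell prompt, test the listing-page selector:
response.xpath("//div[@class='greyframe']/table//td/a[@class='news14']/@href").extract()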


CrawlSpider version
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Matching rule for the pagination links
    pagelink = LinkExtractor(allow=('type=4'))
    # Matching rule for each post's link
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+.shtml')

    rules = [
        # Special case for this site: the pagination links extracted from each
        # page must be fixed up by deal_links before they can be followed
        Rule(pagelink, process_links="deal_links", follow=True),
        Rule(contentlink, callback='parse_item')
    ]

    # The pagination links returned by the site are malformed: a link of the form
    # 'Type&type=4?page=xxx' must be rewritten as 'Type?type=4&page=xxx'
    # (or 'Type&page=xxx?type=4' as 'Type?page=xxx&type=4'), otherwise the
    # request cannot be sent
    def deal_links(self, links):
        for link in links:
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
            print(link.url)
        return links

    def parse_item(self, response):
        print(response.url)
        item = DongguanItem()
        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Text content: first try the layout used when the post contains images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If that is empty, fall back to the layout used when there are no images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list; join it into a string and strip leading/trailing whitespace
        item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url
        yield item

pipelines.py

# -*- coding: utf-8 -*-
# codecs lets us open the output file with an explicit encoding
import codecs
import json


class JsonWriterPipeline(object):
    def __init__(self):
        # Create a write-only file with UTF-8 text encoding
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        self.filename.close()

settings.py

ITEM_PIPELINES = {
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
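To make the deal_links fix concrete, here is a tiny standalone sketch (the sample URL string is only illustrative) showing what the two chained replace() calls do to a malformed pagination link:

# Hypothetical malformed link of the shape described above
bad = "http://wz.sun0769.com/index.php/question/questionType&type=4?page=30"
# Same rewrite as in deal_links: turn the stray '?' into '&',
# then restore the real query separator after 'Type'
fixed = bad.replace("?", "&").replace("Type&", "Type?")
print(fixed)  # http://wz.sun0769.com/index.php/question/questionType?type=4&page=30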

Create a main.py file in the project root directory for debugging:

from scrapy import cmdline

# The spider is registered as 'sun' (see name = 'sun' above)
cmdline.execute('scrapy crawl sun'.split())
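An alternative way to run the spider from a script is Scrapy's CrawlerProcess API. A minimal sketch, assuming the spiders/sunwz.py module path used above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from dongguan.spiders.sunwz import SunSpider

# Load the project's settings.py so the JSON pipeline and logging apply
process = CrawlerProcess(get_project_settings())
process.crawl(SunSpider)
process.start()  # blocks until the crawl finishes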

Run the program

python2 main.py