用Scrapy爬取教务处通知公告

来源:互联网 发布:mac book可以安装vs么 编辑:程序博客网 时间:2024/05/17 09:06

1.准备工作

  python2.7.11 win32, scrapy 1.1.0rc1

  scrapy入门教程 http://scrapy-chs.readthedocs.org/zh_CN/latest/intro/tutorial.html

  xpath基础语法 http://www.cnblogs.com/zhaozhan/archive/2009/09/09/1563617.html


2.创建项目

scrapy startproject jwc

3.修改items.py
# -*- coding: utf-8 -*-
# __author__ = 'Maximus'
#
# Item models for the jwc project.
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JwcItem(scrapy.Item):
    """One scraped notice: its title, URL, body content and post date."""
    title = scrapy.Field()    # notice title text (list of encoded strings)
    url = scrapy.Field()      # absolute URL of the notice page
    content = scrapy.Field()  # extracted body HTML of the notice
    date = scrapy.Field()     # posting date as shown in the list column

4.spiders目录下创建jwc_spider.py

# -*- coding: utf-8 -*-
# __author__ = 'Maximus'
"""Spider for the NJUPT Academic Affairs Office notice board.

Walks the paginated notice list at jwc.njupt.edu.cn, follows every
notice link, and yields JwcItem objects carrying title, URL, post date
and body content.
"""
from scrapy.http import Request
import scrapy
from jwc.items import JwcItem


class JwcSpider(scrapy.Spider):
    name = "jwc"
    start_urls = [
        "http://jwc.njupt.edu.cn/s/24/t/923/p/21/i/1/list.htm"
    ]

    # Site root used to absolutize the relative hrefs found in the list.
    BASE = "http://jwc.njupt.edu.cn"

    def _notice_requests(self, response):
        """Yield one content-page Request per notice row on a list page.

        This logic was previously duplicated verbatim in both parse()
        and parse_from_second(); it now lives in one place.
        """
        for sel in response.xpath('//tr/td'):
            item = JwcItem()
            item['title'] = [n.encode('utf-8') for n in sel.xpath('a/font/text()').extract()]
            item['url'] = self.BASE + "".join(sel.xpath('a/@href').extract())
            item['date'] = sel.xpath("../td[@class='postTime']/text()").extract()
            # Cells without a title are layout cells, not notices -- skip.
            if item['title']:
                yield Request(item['url'], callback=self.parse_content, meta={'item': item})

    def _next_page_url(self, response, index):
        """Return the absolute URL of the pagination link at *index*,
        or None when it is missing (i.e. on the last page).

        The original code indexed extract() unguarded, so the crawl
        died with IndexError once the final list page was reached.
        """
        links = response.xpath("//table/tr/td/a[@title]/@href").extract()
        if index < len(links):
            return self.BASE + links[index]
        return None

    def parse(self, response):
        """Handle the first list page (its pagination layout differs:
        the 'next' link sits at index 2)."""
        for request in self._notice_requests(response):
            yield request
        url = self._next_page_url(response, 2)
        if url:
            yield Request(url, callback=self.parse_from_second)

    def parse_from_second(self, response):
        """Handle list pages 2..N, where the 'next' link sits at index 4."""
        for request in self._notice_requests(response):
            yield request
        url = self._next_page_url(response, 4)
        if url:
            yield Request(url, callback=self.parse_from_second)

    def parse_content(self, response):
        """Fill in the body content on the item carried via request meta."""
        item = response.meta['item']
        item['content'] = [n.encode('utf-8') for n in response.xpath('//div[@id="container_content"]').extract()]
        return item

5.修改pipelines.py

# -*- coding: utf-8 -*-
# __author__ = 'Maximus'
#
# Item pipeline: serialize every scraped item as one JSON object per line.
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs


class JwcPipeline(object):
    """Write each item to items.json as one UTF-8 JSON line."""

    def __init__(self):
        # codecs.open handles the UTF-8 encoding of everything written.
        self.file = codecs.open('items.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line and return it unchanged.

        ensure_ascii=False keeps non-ASCII (Chinese) text readable in the
        output file directly, replacing the original fragile
        line.decode("unicode_escape") hack (which also breaks on Python 3,
        where str has no .decode method).
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # The original never closed the file, so buffered output could be
        # lost; Scrapy calls this hook automatically when the spider ends.
        self.file.close()

6.settings.py中添加pipelines

# Register the JSON-export pipeline; 300 is its run-order priority (0-1000).
ITEM_PIPELINES = {
    'jwc.pipelines.JwcPipeline': 300,
}

7.运行并将结果保存

scrapy crawl jwc -o items.json


1 0
原创粉丝点击