scrapy framework

来源:互联网 发布:热血江湖源码 编辑:程序博客网 时间:2024/06/14 04:01

I have been learning Scrapy these days, and I built something with it.

The goal is to scrape course info. (In fact, I first tried to scrape something else, but I found that lots of its pages are rendered with JavaScript.)

It is not a complete experiment, though; I still haven't finished the scheduler...

Here is the code:

import codecs

import scrapy
from bs4 import BeautifulSoup

from Imooc.items import ImoocItem


def PrintSoup(soup):
    """Write the prettified parse tree to ``soup.txt`` (debugging aid).

    Args:
        soup: A BeautifulSoup object; its ``prettify()`` output is written
            UTF-8 encoded.
    """
    # The context manager closes the file even if prettify()/write() raises.
    with codecs.open('soup.txt', 'w+', 'utf-8') as out:
        out.write(soup.prettify())


class Imooc(scrapy.Spider):
    """Spider that yields one ``ImoocItem`` per course card on the page."""

    #---Spider info---
    name = 'Imooc'
    # NOTE(review): the domain and start URL are empty strings in this
    # listing -- presumably stripped when the post was published. Fill them
    # in before running the spider.
    allowed_domains = ['']
    start_urls = ['']

    #---parsing part---
    def parse(self, response):
        """Extract the course items from a downloaded page.

        Args:
            response: The Scrapy response whose ``body`` holds the HTML.

        Yields:
            ImoocItem: with ``index`` (the card's label texts joined by a
            single space), ``name`` (text of the card's ``<h3>``) and
            ``number`` (text of the card's second ``<span>``).
        """
        soup = BeautifulSoup(response.body, 'html.parser')
        # Call PrintSoup(soup) here to dump the parsed page when debugging.
        tot = 0
        for div in soup.find_all(attrs={'class': 'course-card-container'}):
            item = ImoocItem()

            labels = div.find(attrs={'class': 'course-label'}).find_all('label')
            item['index'] = ' '.join(label.string for label in labels)

            item['name'] = div.find('h3').string

            # Index 1 assumes the second <span> of the card carries the
            # number -- TODO confirm against the live markup.
            item['number'] = div.find_all('span')[1].get_text()

            tot += 1
            yield item

        # Report how many course cards were parsed from this page.
        print(tot)

# This works perfect on scrapy
# Make something out finally...

# Scrapy setting: maps each item pipeline's import path to the integer
# that determines its order; only ImoocPipeline is enabled here.
ITEM_PIPELINES = {
    'Imooc.pipelines.ImoocPipeline': 1,
}

import codecs
import json

from pymongo import MongoClient


class ImoocPipeline(object):
    """Item pipeline that stores every item in ``data.json`` and MongoDB.

    Each item is written as one JSON document per line to ``data.json`` and
    inserted into the ``Mooc`` collection of the ``Mooc`` database on the
    MongoDB server at ``localhost:27017``.
    """

    def __init__(self):
        """Open the JSON-lines output file and the MongoDB collection."""
        self.file = codecs.open('data.json', 'w', encoding='utf-8')
        # Keep a handle on the client so it can be closed in close_spider.
        self.client = MongoClient('localhost', 27017)
        self.col = self.client.Mooc.Mooc

    def process_item(self, item, spider):
        """Persist one scraped item and pass it on unchanged.

        Args:
            item: The scraped item; converted with ``dict()`` before being
                serialized or stored.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # Caution: the item must be converted with dict() before json.dumps.
        # Caution: ensure_ascii=False keeps non-ASCII text readable instead
        # of emitting \uXXXX escapes.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4. A fresh dict is passed because
        # the driver adds an "_id" key to the document it is given.
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the output file and the MongoDB connection.

        Args:
            spider: The spider that is being closed (unused).
        """
        self.file.close()
        self.client.close()

The results in MongoDB: