Scrapy Spider Example: Southern Metropolis Daily (南方都市报)


1. Directory Structure

Under the project directory, ignore the __pycache__ directory (a cache directory); it is presumably similar to Java's compiled .class files. The directory structure looks like this:

└─ SouthCity                  # project name
    │  scrapy.cfg             # Scrapy deployment configuration file (generated automatically when the project is created)
    │
    └─ SouthCity              # project package (same name as the project)
        │  items.py           # Items module
        │  middlewares.py     # Middlewares module
        │  pipelines.py       # Pipelines module
        │  settings.py        # Scrapy configuration file (where custom Scrapy components are configured)
        │  __init__.py
        │
        └─ spiders            # spider module directory
            │  mpage.py       # spider code
            └─ __init__.py

2. Spider Overview

The spider consists of three main parts (a small runner sketch follows this list):
Custom item (items.py): Scrapy's built-in data structure
Crawling part (mpage.py): parses the links and stores the results in the item
Storage part (pipelines.py): takes the data out of the item and saves it to the database
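Not part of the original write-up: once these three parts are in place, the crawl is normally started with `scrapy crawl mpage` from the project root. The sketch below shows an equivalent programmatic launch; it assumes a hypothetical run.py placed next to scrapy.cfg so that get_project_settings() can find settings.py.

# run.py -- minimal runner sketch (assumed filename), executed from the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from SouthCity.spiders.mpage import MpageSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py, including ITEM_PIPELINES
process.crawl(MpageSpider)
process.start()  # blocks until the crawl finishes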

3. Custom Item

# -*- coding: utf-8 -*-
import scrapy

# Inherit from scrapy.Item and
# define our own data structure
class ArticalItem(scrapy.Item):
    leading_title = scrapy.Field()  # lead-in headline
    title = scrapy.Field()          # headline
    subtitle = scrapy.Field()       # subtitle
    link = scrapy.Field()           # article URL
    source = scrapy.Field()         # news source
    writeTime = scrapy.Field()      # time of writing
    section = scrapy.Field()        # section of the paper
    author = scrapy.Field()         # author
    news = scrapy.Field()           # article body

Then modify settings.py to enable the pipeline:

ITEM_PIPELINES = {
    'SouthCity.pipelines.MySQLStoreCnblogsPipeline': 301,
}
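The number 301 is only the pipeline's order value; Scrapy runs enabled pipelines in ascending order of this number (0-1000). An Item itself behaves like a dictionary restricted to its declared fields, which is exactly how the spider and the pipeline use it below. A tiny sketch with made-up values:

from SouthCity.items import ArticalItem

item = ArticalItem(title='Sample headline')   # hypothetical value, for illustration only
item['author'] = 'Sample reporter'            # fields are assigned dict-style
print(item['title'], dict(item))              # ...and read back like a dict
# item['foo'] = 1 would raise KeyError, because 'foo' is not a declared Field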

4. Crawling Part

# -*- coding: utf-8 -*-
# mpage.py
# Three parts, i.e. the three steps of the crawl:
# parse             gets the URLs of all sections of the paper
# parse_section     gets the URLs of all articles in the current section
# parse_page        extracts the article information and puts it into the item defined above
import scrapy
from bs4 import BeautifulSoup
from datetime import datetime
from SouthCity.items import ArticalItem

nav = {}

class MpageSpider(scrapy.Spider):
    name = 'mpage'
    # allowed_domains = ['http://epaper.oeeee.com/epaper/A/html/']
    start_urls = ['http://epaper.oeeee.com/epaper']

    def parse(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        paper_div = soup.find('div', 'shortcutbox')
        a = paper_div.find_all('a')
        for i in a:
            href = i.get('href')
            link = response.urljoin(href)  # joins the relative URL automatically
            # link = 'http://epaper.oeeee.com/epaper/' + href[href.find('A'):]  # manual join of the relative URL
            nav[i.text] = link
            try:
                # yield submits one request at a time and then resumes execution,
                # instead of returning a whole iterable at once and using too much memory.
                yield scrapy.Request(link, callback=self.parse_section)
            except:
                continue
        # print(nav)

    def parse_section(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        paper_div = soup.find('div', 'main-list')
        a = paper_div.find_all('a')
        nav = {}
        for i in a:
            href = i.get('href')
            link = response.urljoin(href)
            nav[i.text] = link
            try:
                yield scrapy.Request(link, callback=self.parse_page)
            except:
                continue
        # print(nav)

    def parse_page(self, response):
        detailbox = []
        artical = '  '
        html = response.body
        soup = BeautifulSoup(html, "html.parser")
        info = soup.find('div', "main-600 fl")
        # metadata spans: source, time, section, author, ...
        detail = info.find_all('span')
        for dt in detail:
            try:
                dts = dt.text
                dts = dts[dts.find(':') + 1:].strip()
                detailbox.append(dts)
            except:
                detailbox.append(dt.text)
        # article body paragraphs
        news = info.find('div', 'text')
        pp = news.find_all('p')
        for p in pp:
            pt = p.text
            pt = pt.strip().replace("\xa0", "")
            artical += pt
        # headlines
        try:
            head1 = info.find('h1').text
            head2 = info.find_all('h2')
        except:
            pass
        item = ArticalItem()
        item['leading_title'] = head2[0].text
        item['title'] = head1
        item['subtitle'] = head2[1].text
        item['link'] = response.url
        item['writeTime'] = detailbox[1]
        item['source'] = detailbox[0]
        item['section'] = detailbox[3]
        item['author'] = detailbox[4]
        item['news'] = artical
        yield item
        # print(item)
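BeautifulSoup does the job, but a Scrapy response already carries CSS/XPath selectors, so the link extraction could be written without the extra dependency. A sketch of parse() only, assuming the same div.shortcutbox markup (.getall() is the newer Scrapy spelling; older versions use .extract()):

    def parse(self, response):
        # same section-link extraction as above, using Scrapy's built-in CSS selectors
        for href in response.css('div.shortcutbox a::attr(href)').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_section)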

5. Storage Part

# pipelines.py
# Two parts:
# __init__          sets up the database connection
# process_item      uses that connection to store each item
import pymysql
import logging
import datetime

class MySQLStoreCnblogsPipeline(object):
    # The connection is stored on the instance so every method of the class can reach it;
    # in that respect it works like a Java member variable.
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            db='TESTDB',
            user='pymysql',
            passwd='123123',
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        date = str(now.date())
        date_s = date[:4] + date[5:7] + date[8:]
        print(date_s)                 # date string, e.g. 20170912
        table_name = 'sc_' + date_s   # table name

        # SQL for creating the day's table
        sql = ('CREATE TABLE SC_%s (leading_title varchar(255), title varchar(255), '
               'subtitle varchar(255), link varchar(250) NOT NULL primary key, '
               'writeTime varchar(20), source varchar(100), section varchar(50), '
               'author varchar(100), news text, updated datetime, img varchar(100))' % date_s)
        # SQL for checking whether this link is already stored
        sql_query = "SELECT 1 from SC_%s where link = '%s'" % (date_s, item['link'])
        # SQL for updating an existing row
        sql_update = """UPDATE sc_%s set leading_title = '%s',
                                         title = '%s',
                                         subtitle = '%s',
                                         link = '%s',
                                         writetime = '%s',
                                         source = '%s',
                                         section = '%s',
                                         author = '%s',
                                         news = '%s',
                                         updated = '%s'
                        where link = '%s'
                     """ % (date_s,
                            item['leading_title'],
                            item['title'],
                            item['subtitle'],
                            item['link'],
                            item['writeTime'],
                            item['source'],
                            item['section'],
                            item['author'],
                            item['news'],
                            now,
                            item['link'])
        # SQL for inserting a new row
        sql_insert = """insert into sc_%s(leading_title, title, subtitle, link, writeTime, source, section, author, news, updated)
                        values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
                     """ % (date_s,
                            item['leading_title'],
                            item['title'],
                            item['subtitle'],
                            item['link'],
                            item['writeTime'],
                            item['source'],
                            item['section'],
                            item['author'],
                            item['news'],
                            now)

        # Check whether today's table already exists
        self.cursor.execute('show tables')
        tables = self.cursor.fetchall()
        if (table_name,) not in tables:
            try:
                # It does not exist yet, so create it
                self.cursor.execute(sql)
            except Exception as e:
                raise e

        try:
            # Check whether the current link item['link'] is already stored
            self.cursor.execute(sql_query)
            ret = self.cursor.fetchone()
            if ret:
                self.cursor.execute(sql_update)
                print("Updated one row.")
            else:
                # item['link'] does not exist yet
                self.cursor.execute(sql_insert)
                print("Inserted one row.")
            self.connect.commit()
            # self.cursor.close()  # closing the cursor here would not close the connection
        except Exception as error:
            logging.warning(error)
        return item
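One caveat with the pipeline above: the SQL is built with plain string interpolation, so an article containing a single quote breaks the statement and the values are never escaped. A sketch (not from the original) of the insert rewritten with pymysql's parameter binding; only the table name still has to be formatted into the string, since identifiers cannot be passed as parameters:

        # parameterized insert -- pymysql escapes the bound values itself
        table = 'sc_' + date_s
        sql_insert = ("INSERT INTO " + table +
                      " (leading_title, title, subtitle, link, writeTime,"
                      " source, section, author, news, updated)"
                      " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql_insert, (
            item['leading_title'], item['title'], item['subtitle'], item['link'],
            item['writeTime'], item['source'], item['section'], item['author'],
            item['news'], now,
        ))
        self.connect.commit()

The query and update statements can be converted the same way, and a close_spider(self, spider) method would be the natural place to close the cursor and connection once the crawl ends.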