python 将爬虫内容保存到mysql中

来源:互联网 发布:2017年8月份经济数据 编辑:程序博客网 时间:2024/05/29 15:29

二十五、python将爬取的内容保存到mysql

--------------------------------------------------------------

#sqlAlchemy的介绍

from sqlalchemy import create_engine

from sqlalchemy.ext.declarative import declarative_base

from sqlalchemy import Column, Integer, String, DateTime, ForeignKey

from sqlalchemy.orm import sessionmaker, relationship, backref

from sqlalchemy.dialects.mysql import LONGTEXT

import requests

from lxml import etree

 

# 获取段子内容、赞、分享、收藏数

def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}'):

    nextPage = True

    pageNum = 1

    while nextPage:

        url = basurl.format(pageNum)

        response = requests.get(url)

        selector = etree.HTML(response.text)

        all = selector.xpath('//*/div[@class="j-r-list"]/ul/li')

        for a in all:

            joke = a.xpath('div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()')[0]

            like = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-up"]/span/text()')[0]

            down = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-down "]/span/text()')[0]

            share = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]/div[@class="j-r-list-tool-ct-share-c"]/span/text()')[0]

            comment = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-r j-r-list-tool-cc"]/ul/li[@class=" f-tac j-comment j-comment-width  j-comment-down-width"]/a/span[@class="comment-counts"]/text()')[0]

            # print joke, '=====', like, '====', down, '====', share.replace(u"分享??",""), '=====', comment, '====='

            yield joke, like, down, share.replace(u"分享??", ""), comment

        hasNext = selector.xpath('//a[@class="pagenxt"]')

        if hasNext:

            pageNum += 1

        else:

            nextPage = False

        # print pageNum

 

 

#大概步骤如下:

#1、创建连接

#mysql+mysqlconnector 或者 mysql+mysqldb    mysql代表数据库,mysqlconnector代表连接驱动

engine = create_engine('mysql+mysqlconnector://root:root@192.168.216.7:3306/testdb?charset=utf8',echo=True) #echo是否打印sql语句

#2、定义映射

Base = declarative_base()  #获取公共类,模板的顶级父类

#定义自己的实体类

class Joke(Base):

    __tablename__ = 'jokes'

    id = Column(Integer, nullable=True, primary_key=True, index=True)

    content = Column(LONGTEXT, nullable=True)

    like = Column(Integer, nullable=False)

    down = Column(Integer, nullable=False)

    share = Column(Integer, nullable=False)

    comment = Column(Integer, nullable=False)

 

 

#3、映射实例化,创建数据库表

Base.metadata.create_all(engine)

#4、创建会话

Session = sessionmaker(engine)

session = Session()

 

#5、对象实例持久化

if __name__ == '__main__':

    for i,(joke, like, down, share, comment) in enumerate(getJokeOfAllList()):

        #print i,joke, like, down, share, comment

        #print share.encode("utf-8").replace('??',"")

        joke = Joke(content=joke.encode("utf-8"), like=int(like), down=int(down), share=int(share.encode("utf-8").replace('??',"")), comment=int(comment))

        print joke.content, joke.like, joke.down, joke.share, joke.comment

        session.add(joke)

        if i % 10 == 0:

            print i

            session.commit()

 

--------------------------------------------------------------

阅读全文
0 0
原创粉丝点击