python 将爬虫内容保存到mysql中

来源：互联网发布：2017年8月份经济数据编辑：程序博客网时间：2024/05/29 15:29

二十五、python将爬取的内容保存到mysql

--------------------------------------------------------------

#sqlAlchemy的介绍

from sqlalchemy import create_engine

from sqlalchemy.ext.declarative import declarative_base

from sqlalchemy import Column, Integer, String, DateTime, ForeignKey

from sqlalchemy.orm import sessionmaker, relationship, backref

from sqlalchemy.dialects.mysql import LONGTEXT

import requests

from lxml import etree

# 获取段子内容、赞、分享、收藏数

def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}'):

nextPage = True

pageNum = 1

while nextPage:

url = basurl.format(pageNum)

response = requests.get(url)

selector = etree.HTML(response.text)

all = selector.xpath('//*/div[@class="j-r-list"]/ul/li')

for a in all:

joke = a.xpath('div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()')[0]

like = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-up"]/span/text()')[0]

down = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-down "]/span/text()')[0]

share = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]/div[@class="j-r-list-tool-ct-share-c"]/span/text()')[0]

comment = a.xpath('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-r j-r-list-tool-cc"]/ul/li[@class=" f-tac j-comment j-comment-width j-comment-down-width"]/a/span[@class="comment-counts"]/text()')[0]

# print joke, '=====', like, '====', down, '====', share.replace(u"分享??",""), '=====', comment, '====='

yield joke, like, down, share.replace(u"分享??", ""), comment

hasNext = selector.xpath('//a[@class="pagenxt"]')

if hasNext:

pageNum += 1

else:

nextPage = False

# print pageNum

#大概步骤如下：

#1、创建连接

#mysql+mysqlconnector 或者 mysql+mysqldb mysql代表数据库，mysqlconnector代表连接驱动

engine = create_engine('mysql+mysqlconnector://root:root@192.168.216.7:3306/testdb?charset=utf8',echo=True) #echo是否打印sql语句

#2、定义映射

Base = declarative_base() #获取公共类，模板的顶级父类

#定义自己的实体类

class Joke(Base):

__tablename__ = 'jokes'

id = Column(Integer, nullable=True, primary_key=True, index=True)

content = Column(LONGTEXT, nullable=True)

like = Column(Integer, nullable=False)

down = Column(Integer, nullable=False)

share = Column(Integer, nullable=False)

comment = Column(Integer, nullable=False)

#3、映射实例化，创建数据库表

Base.metadata.create_all(engine)

#4、创建会话

Session = sessionmaker(engine)

session = Session()

#5、对象实例持久化

if __name__ == '__main__':

for i,(joke, like, down, share, comment) in enumerate(getJokeOfAllList()):

#print i,joke, like, down, share, comment

#print share.encode("utf-8").replace('??',"")

joke = Joke(content=joke.encode("utf-8"), like=int(like), down=int(down), share=int(share.encode("utf-8").replace('??',"")), comment=int(comment))

print joke.content, joke.like, joke.down, joke.share, joke.comment

session.add(joke)

if i % 10 == 0:

print i

session.commit()

--------------------------------------------------------------

阅读全文

0 0