简单爬虫,爬取CSDN博客阅读数量并存入数据库

来源:互联网 发布:足球经理ol停运 知乎 编辑:程序博客网 时间:2024/05/16 13:48

正在学习 Python,所以写了一个简陋的小爬虫:主要利用 XPath 来解析 HTML(在谷歌浏览器的开发者工具中可以直接复制元素的 XPath),代码基于 Python 3.5:

# -*- coding: utf-8 -*-
"""Crawl the read counts of a CSDN blog and store them in MySQL.

The script walks the paginated article list, extracts every post's title
and read count with XPath, and bulk-inserts one row per post into the
``t_read_num`` table.
"""
import datetime
import urllib.request as req

import pymysql
from lxml import etree

# Blog list URL; {page_num} is filled in with the page number to fetch.
base_url = "http://blog.csdn.net/i_am_kop/article/list/{page_num}"

# Parameterized statement: the driver escapes every value, so titles that
# contain quotes or other SQL metacharacters can neither break the
# statement nor inject SQL.
_INSERT_SQL = (
    "INSERT INTO t_read_num(title,read_count,create_date) "
    "VALUES (%s,%s,%s)"
)


def _parse_page(html):
    """Return ``(title, read_count, timestamp)`` tuples for one list page.

    An empty list means the page contains no posts.
    """
    selector = etree.HTML(html)
    # Every direct child <div> of #article_list is one blog post.
    blog_divs = selector.xpath("//*[@id=\"article_list\"]/div[*]")

    rows = []
    for blog_div in blog_divs:
        # Title: flatten the link text and drop spaces / CRLF line breaks.
        title_ele = blog_div.xpath("div[1]/h1/span/a")[0]
        title = title_ele.xpath("string(.)").replace(" ", "").replace("\r\n", "")

        # Read count: strip the "阅读(" prefix and ")" suffix around the number.
        count_ele = blog_div.xpath("div[3]/span[2]")[0]
        count = count_ele.xpath("string(.)").replace("阅读(", "").replace(")", "")

        # Timestamp of when this row was scraped.
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # int() validates the scraped text instead of trusting it as SQL.
        rows.append((title, int(count), now))
    return rows


def _store_rows(rows):
    """Insert the scraped rows into ``t_read_num`` in a single batch."""
    conn = pymysql.connect(host='192.168.1.2', port=3306, user='root',
                           passwd='password', db='blog_log', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.executemany(_INSERT_SQL, rows)
        conn.commit()
    finally:
        # Release the connection even when the insert fails.
        conn.close()


def get_html(url):
    """Scrape every page of the blog list and save the read counts.

    Pages are requested starting from 1 until a page without any post is
    returned; the collected rows are then written to MySQL in one batch.

    Args:
        url: URL template containing a ``{page_num}`` placeholder.

    Raises:
        IndexError: a post lacks the expected title or count element.
        ValueError: a read count is not a valid integer.
    """
    page_num = 1
    rows = []
    while True:
        print(100 * "-")
        print("第", page_num, "页")
        print(100 * "-")

        # Fetch the page; the context manager closes the HTTP response.
        with req.urlopen(url.format(page_num=page_num)) as page:
            html = page.read().decode("utf-8")

        page_rows = _parse_page(html)
        # A page without posts means everything has been crawled.
        if not page_rows:
            break
        rows.extend(page_rows)
        page_num += 1

    # Nothing scraped: "INSERT ... VALUES" with no tuples is invalid SQL.
    if not rows:
        return
    _store_rows(rows)


if __name__ == "__main__":
    get_html(base_url)

SQL 脚本:

-- Read-count snapshots: one row per blog post per crawl.
CREATE TABLE `t_read_num` (
  -- Surrogate primary key.
  `id` bigint(10) NOT NULL AUTO_INCREMENT,
  -- Post title as scraped from the list page.
  `title` varchar(128) DEFAULT NULL,
  -- Number of reads shown for the post at crawl time.
  `read_count` int(5) DEFAULT NULL,
  -- When the row was scraped.
  `create_date` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=138 DEFAULT CHARSET=utf8
阅读全文
0 0
原创粉丝点击