使用python抓取csdn博客访问量并保存在sqlite3数据库中
来源:互联网 发布:javaweb js 跨域 编辑:程序博客网 时间:2024/05/11 08:34
转载请注明来源:http://blog.csdn.net/imred
解析网页使用的是BeautifulSoup,具体不解释了,代码里有注释,不懂欢迎提问。
# -*- coding: utf-8 -*-
"""Scrape the view counts of a CSDN blog's articles and store them in SQLite.

For every listing page of the blog, the script collects (title, link,
view-count) triples, prints them, and records one snapshot per run in a
local SQLite database (one row in t_time per run, one row in t_view per
article per run).
"""
import os
import sqlite3
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

# PREFIX + page number + SUFFIX forms a listing-page URL; HOST converts the
# relative article hrefs found in the page into absolute URLs.
PREFIX = "http://blog.csdn.net/imred/article/list/"
SUFFIX = "?viewmode=contents"
HOST = "http://blog.csdn.net"

# t_article holds article metadata, t_time one timestamp per scrape run,
# t_view one view-count sample per (run, article) pair.
SQL_CREATE_T_ARTICLE = "CREATE TABLE IF NOT EXISTS t_article ( \
    id INTEGER PRIMARY KEY AUTOINCREMENT, \
    title TEXT NOT NULL, \
    link TEXT NOT NULL)"
SQL_CREATE_T_TIME = "CREATE TABLE IF NOT EXISTS t_time ( \
    id INTEGER PRIMARY KEY AUTOINCREMENT, \
    time TEXT DEFAULT (datetime('now', 'localtime')))"
SQL_CREATE_T_VIEW = "CREATE TABLE IF NOT EXISTS t_view ( \
    id INTEGER PRIMARY KEY AUTOINCREMENT, \
    tid INTEGER, \
    aid INTEGER, \
    view INTEGER NOT NULL, \
    FOREIGN KEY(tid) REFERENCES t_time(id) ON DELETE CASCADE, \
    FOREIGN KEY(aid) REFERENCES t_article(id) ON DELETE CASCADE)"
# Id of the most recently inserted scrape timestamp.
SQL_QUERY_MAX_TID = "SELECT MAX(id) max_tid FROM t_time"
SQL_INSERT_TIME = "INSERT INTO t_time(time) VALUES(datetime('now', 'localtime'))"
# Look an article up by its (unique) link.
SQL_QUERY_ARTICLE = "SELECT id FROM t_article WHERE link=?"
SQL_INSERT_ARTICLE = "INSERT INTO t_article(title, link) VALUES(?, ?)"
SQL_INSERT_VIEW = "INSERT INTO t_view(tid, aid, view) VALUES(?, ?, ?)"


class MyError(Exception):
    """Application-level error wrapping a human-readable message."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


def getHtml(url):
    """Fetch *url* and return its body decoded as text.

    Raises MyError when the request fails.  A browser User-Agent is sent
    because CSDN rejects the default urllib one.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                             'Gecko/20100101 Firefox/23.0'}
    try:
        req = urllib.request.Request(url, headers=HEADERS)
        # BUG FIX: close the response; the original leaked the connection.
        # BUG FIX: catch URLError (the base class), not just HTTPError, so
        # DNS/connection failures are also wrapped in MyError.
        with urllib.request.urlopen(req) as response:
            return response.read().decode()
    except urllib.error.URLError as e:
        raise MyError("Request for " + url + " failed: " + str(e)) from e


def getPageNum(soup):
    """Return the total number of listing pages parsed from *soup*.

    The pager div contains a span like "共3页" ("3 pages in total");
    the digits between those two characters are the page count.
    Raises MyError when the blog has no articles (no pager div).
    """
    soupPageListDiv = soup.find("div", class_="pagelist")
    if soupPageListDiv is None:
        raise MyError("No any articles")
    strSpan = soupPageListDiv.span.string
    left = strSpan.find("共")
    right = strSpan.find("页")
    return int(strSpan[left + 1:right])


def main():
    """Scrape every listing page, print the stats and persist a snapshot."""
    html = getHtml(PREFIX + "1" + SUFFIX)
    soup = BeautifulSoup(html, "lxml")
    iPageNum = getPageNum(soup)

    titleListHTML = soup.find_all("span", class_="link_title")
    viewListHTML = soup.find_all("span", class_="link_view")

    # Fetch the remaining listing pages and accumulate their spans.
    # (range is empty when there is only one page.)
    for i in range(2, iPageNum + 1):
        tempSoup = BeautifulSoup(getHtml(PREFIX + str(i) + SUFFIX), "lxml")
        titleListHTML += tempSoup.find_all("span", class_="link_title")
        viewListHTML += tempSoup.find_all("span", class_="link_view")

    titleList = [title.a.string.strip() for title in titleListHTML]
    # A link_view span looks like <a href="...">...</a>(123): contents[0]
    # is the anchor, contents[1] the "(123)" view-count text.
    viewList = [view.contents[1].strip("()") for view in viewListHTML]
    linkList = [HOST + view.contents[0]['href'] for view in viewListHTML]

    for title, view, link in zip(titleList, viewList, linkList):
        print(title + " " + view + " " + link)

    strDbPath = os.path.join(os.path.dirname(__file__), 'blog_stat.db')
    conn = sqlite3.connect(strDbPath)
    try:
        # BUG FIX: sqlite3 ships with foreign_keys OFF, so the schema's
        # ON DELETE CASCADE clauses were silently inactive.
        conn.execute("PRAGMA foreign_keys = ON")
        cursor = conn.cursor()
        try:
            cursor.execute(SQL_CREATE_T_ARTICLE)
            cursor.execute(SQL_CREATE_T_TIME)
            cursor.execute(SQL_CREATE_T_VIEW)
            cursor.execute(SQL_INSERT_TIME)
            # Register any article not seen in a previous run.
            for title, link in zip(titleList, linkList):
                cursor.execute(SQL_QUERY_ARTICLE, (link,))
                if not cursor.fetchall():
                    cursor.execute(SQL_INSERT_ARTICLE, (title, link))
            # Id of the timestamp row inserted above.
            cursor.execute(SQL_QUERY_MAX_TID)
            max_tid = cursor.fetchone()[0]
            # One view-count sample per article for this run.
            for link, view in zip(linkList, viewList):
                cursor.execute(SQL_QUERY_ARTICLE, (link,))
                aid = cursor.fetchone()[0]
                cursor.execute(SQL_INSERT_VIEW, (max_tid, aid, view))
        finally:
            cursor.close()
        # BUG FIX: the original committed inside the finally clause, which
        # persisted half-written data even when an exception aborted the
        # loop.  Commit only after everything succeeded.
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()
本文采用 CC-BY 协议进行授权
阅读全文
0 0
- 使用python抓取csdn博客访问量并保存在sqlite3数据库中
- 使用url读取csdn的博客访问量并将记录保存到本地
- 使用python爬取csdn博客访问量
- 使用python爬取csdn博客访问量
- 使用python爬取csdn博客访问量
- 使用python爬取csdn博客访问量
- 使用python爬取csdn博客访问量
- WebMagic(三)----抓取CSDN博客通过JDBC保存到数据库中去
- WebMagic(三)----抓取CSDN博客通过JDBC保存到数据库中去
- 使用python统计csdn博客一段时间内的访问量
- python 中使用sqlite3数据库
- Python获取数据库数据并保存在excel表格中
- python爬虫之csdn刷博客访问量
- Python爬虫抓取csdn博客
- python 爬取csdn网页并保存博客到本地
- python抓取网页中图片并保存到本地
- python抓取网页中图片并保存到本地
- python抓取省市区的数据并保存到mysql中
- solution_101
- bzoj 4952: [Wf2017]Need for Speed(二分)
- SpringMVC:前台jsp页面和后台传值
- centos7安装扩展配置(redis,memcached,workerman,远程访问数据库)
- 数据库索引
- 使用python抓取csdn博客访问量并保存在sqlite3数据库中
- <C++>2.命名空间
- PYTHON基础笔记(1)
- 【项目管理和构建】——Maven下载、安装和配置
- bzoj 1053 反素数
- 腾讯-算法工程师电话面试
- resource fork, Finder information, or similar detritus not al site:forums.developer.apple.com..
- 项目管理和构建】十分钟教程,eclipse配置maven + 创建maven项目
- 【bzoj 2456】mode(乱搞)