知乎标签详情页爬虫
来源:互联网 发布:手机模拟器连发软件 编辑:程序博客网 时间:2024/06/06 02:18
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Zhihu tag (topic) detail-page spider.

For every topic token stored in the ``zhihu_tag`` table, fetch the topic's
"/hot" page and write its follower count (first <strong> on the page) and
description (div.zm-editable-content) back into the row.  Topics that
already have a non-empty description are skipped.

NOTE(review): written for Python 2 (MySQLdb, str/unicode mixing); syntax
below is kept compatible with 2.6+ and 3.x where practical.
"""
import threading
import time

import requests
from bs4 import BeautifulSoup
import MySQLdb

CHUNK_SIZE = 800          # rows handled per worker thread
CRAWL_DELAY_SECONDS = 3   # polite delay between page fetches

# Minimal headers: Zhihu rejects requests without a browser-like User-Agent.
headers = {
    'Host': 'www.zhihu.com',
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'),
}


def run(res, header, cursor):
    """Crawl the "/hot" page of each topic row and update the DB.

    res    -- slice of ``zhihu_tag`` rows; ``row[2]`` is the topic token.
    header -- HTTP headers for every request.  (The original ignored this
              parameter and read the global ``headers`` instead; the caller
              passes the same dict, so using it is behavior-identical.)
    cursor -- shared MySQL cursor for the UPDATE statements.
              NOTE(review): MySQLdb cursors are not thread-safe; with
              several worker threads, statements can interleave.  Consider
              one connection per thread.
    """
    print("process start!")
    try:
        for row in res:
            token = row[2]
            if token in spider_map:
                continue  # description already stored; skip this topic
            url = 'https://www.zhihu.com/topic/' + token + '/hot'
            response = requests.get(url, headers=header)
            soup = BeautifulSoup(response.text.encode('utf-8'), 'lxml')
            followed = soup.find('strong')
            description = soup.find('div', 'zm-editable-content')
            # Extract text (the original interpolated the Tag objects,
            # which embeds raw HTML into the table) and guard against a
            # missing element on restricted/empty topic pages.
            followed_text = followed.get_text() if followed else ''
            description_text = description.get_text() if description else ''
            # Parameterized query: scraped text is untrusted and may
            # contain quotes that would break a %-formatted statement.
            cursor.execute(
                "update zhihu_tag set description=%s, followed=%s "
                "where token=%s",
                (description_text, followed_text, token))
            print("updated " + token)
            time.sleep(CRAWL_DELAY_SECONDS)  # throttle to avoid a ban
    except Exception as e:
        print(e)


threads = []
spider_map = []  # tokens that already have a description (skip list)

conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
cursor.execute('set names utf8')
cursor.execute('use test')

cursor.execute('select * from zhihu_tag')
res = cursor.fetchall()

cursor.execute("select * from zhihu_tag where description != ''")
for r in cursor.fetchall():
    spider_map.append(r[2])

# One worker per CHUNK_SIZE-row slice.  (The original iterated up to
# len(res) + 800 and so spawned one extra thread over an empty slice;
# ranging over len(res) with step CHUNK_SIZE covers every row exactly once.)
for start in range(0, len(res), CHUNK_SIZE):
    t = threading.Thread(
        target=run, args=(res[start:start + CHUNK_SIZE], headers, cursor))
    threads.append(t)

for t in threads:
    t.start()
for t in threads:
    t.join()

print("down")
# 例子2 (Example 2: breadth-first crawl of the Zhihu topic tree)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# (Original shebang was "#!/bin/bash/env python", which is not a valid
# interpreter path -- fixed.)
"""Zhihu topic-tree spider.

Starts from one root topic's "organize/entire" page, extracts every child
topic link (``a[data-za-element-name]``), stores (title, token) pairs in the
``zhihu_tag_py`` table, and appends each newly discovered topic page to the
work list while it is being iterated -- a breadth-first crawl of the whole
topic tree.
"""
import time

import requests
from bs4 import BeautifulSoup
import MySQLdb


def run(urls):
    """Breadth-first crawl of topic "organize/entire" pages.

    ``urls`` is deliberately mutated while being iterated: child topic
    pages are appended, so the loop continues until no new topics appear.
    """
    # Declare the connection globals we may rebind.  Without this the
    # original made ``conn`` function-local, so ``conn.ping()`` always
    # raised UnboundLocalError and reconnected on every call.
    global conn, cursor
    try:
        conn.ping()
    except Exception:
        conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
        conn.autocommit(1)
        cursor = conn.cursor()
        cursor.execute('set names utf8')
        cursor.execute('use test')
    for url in urls:
        print(url)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text.encode('utf-8'), 'lxml')
        for link in soup.select('a[data-za-element-name]'):
            tag = link.get_text().encode('utf-8')
            token = link['data-token'].encode('utf-8')
            if token in tags_map:
                # Already visited: the topic graph has shared children, so
                # without this check the work list grows without bound.
                continue
            tags_map[token] = tag
            try:
                # Parameterized insert: scraped titles are untrusted text.
                # Duplicate-key errors from the table are logged and ignored.
                cursor.execute(
                    "insert into zhihu_tag_py(title,token) values(%s,%s)",
                    (tag, token))
            except Exception as e:
                print(e)
            child = "https://www.zhihu.com/topic/%s/organize/entire" % token
            urls.append(child)
            print(child)


tags_map = {}  # token -> title for every topic seen so far (visited set)
urls = []      # crawl work list, grown while run() iterates it

conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
cursor.execute('set names utf8')
cursor.execute('use test')

# Full browser-captured headers; the (long-expired) session Cookie was
# required for Zhihu to serve the organize/entire page at capture time.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cookie': 'd_c0="AABCKio7GwuPTg_zEKo2OENBOST2OEj7t3M=|1483586009"; aliyungf_tc=AQAAAPcIxkOnfwYAAqDsdHeqF0qZN/4B; r_cap_id="NDI0OTAyMTBjOTNkNDZkZDg4MjMzZWM4MDVlZWJkYWI=|1484017503|eb9a8f0838c44b8987b867f3b8b1d25b9bcf8de4"; _xsrf=1ebb4fc81969fe6fad33c2453d185a3c; _zap=220c7b1f-b56f-4281-9c9f-64af404464f3; _ga=GA1.2.19806769.1484042429; s-q=%E7%9C%8B%E5%BE%85; s-i=7; sid=tb9hcp0o; s-t=autocomplete; q_c1=09e4a486f51a4f3cb3285e36005f5a49|1485152343000|1485152343000; l_cap_id="OGMzODA3ZGJkMDg2NGY2NmExOGE2YzFkZWYzNzcyYzA=|1486088492|97402dd9f896aead9a3eed24afb33172233d374f"; cap_id="Y2I1NDMxMDczYWM2NDRjNWEzODVhMDdkYjlhMWE1ZDk=|1486088492|6c461c19070b5b9a5327659d4eea4cdc8537d13a"; login="OWYyM2FkMTlkNzI2NDI3Nzk5OGVmMWFjMjk2NDA5NDc=|1486088622|0126ec417721560f8e5eb13d27745353b6cde26a"; n_c=1; __utma=51854390.19806769.1484042429.1486094665.1486098722.5; __utmb=51854390.0.10.1486098722; __utmc=51854390; __utmz=51854390.1486087368.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20161216=1^3=entry_date=20161216=1; z_c0=Mi4wQUVBQ0RlNkhBUXNBQUVJcUtqc2JDeGNBQUFCaEFsVk40WGE3V0FCdWsxaGpkTkY1YXJnQTRTd1dHXzVBc0t6ZDRR|1486105848|7ba9e77adc2be0c1966b1f890bb34f553288bd72',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/topic/19776749/organize/entire',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'),
}

# Seed the crawl with the children of the root topic (19776749).
rootUrl = 'https://www.zhihu.com/topic/19776749/organize/entire'
response = requests.get(rootUrl, headers=headers)
soup = BeautifulSoup(response.text.encode('utf-8'), 'lxml')
for link in soup.select('a[data-za-element-name]'):
    tag = link.get_text()
    token = link['data-token']
    tags_map[token] = tag
    urls.append("https://www.zhihu.com/topic/%s/organize/entire" % token)

run(urls)
0 0
- 知乎标签详情页爬虫
- 知乎日报详情页的实现(集成webView)
- 知乎爬虫
- 知乎爬虫
- python 爬虫 知乎
- 知乎爬虫
- python爬虫知乎
- 知乎图片爬虫
- JAVA知乎爬虫
- 知乎爬虫
- 爬虫登录知乎
- 都说“知乎”逼格高,我们来实现“知乎”回答详情页动画效果
- 知乎爬虫(一)
- 知乎爬虫(二)
- 登录知乎的爬虫
- [Python] 知乎多线程爬虫
- 知乎爬虫web系统
- 第二次爬虫实战--知乎
- 《机器学习》——读书笔记2
- 令人振奋的Class(上)
- 保存服务器的运行状态
- linux debian下安装中国农历
- 最短路径(Floyd、Dijstra、BellmanFord)
- 知乎标签详情页爬虫
- [HDU 1698]Just a Hook(线段树)
- server研发 交流思考
- Android上传图片文件工具类
- 令人振奋的Class(下):继承和实战代码示例
- 亚像素级点定位及边缘定位算法
- P1220 关路灯
- 《posix多线程编程》笔记(一)
- 无线路由连接(不用连接WAN照样可以上网)