Zhihu Tag Detail Page Crawler

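Both scripts assume a local MySQL database named test, but the post never shows the schema. The sketch below is a guess reconstructed from how the code reads rows: token must be the third column (both scripts index it as r[2]), example 1 updates description and followed, and example 2 inserts into a separate zhihu_tag_py table. All names and sizes here are hypothetical.

import MySQLdb

# Hypothetical schema, inferred from the column accesses in the two
# scripts below; adjust names and sizes to match your own data.
conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
cursor = conn.cursor()
cursor.execute("create database if not exists test character set utf8")
cursor.execute("use test")
cursor.execute("""
    create table if not exists zhihu_tag (
        id          int auto_increment primary key,
        title       varchar(255),
        token       varchar(32),      -- third column, so r[2] is the token
        description text,
        followed    varchar(32)
    ) character set utf8
""")
cursor.execute("""
    create table if not exists zhihu_tag_py (
        id    int auto_increment primary key,
        title varchar(255),
        token varchar(32) unique      -- unique key makes duplicate inserts fail fast
    ) character set utf8
""")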
# Example 1

The first script reads every topic token out of the zhihu_tag table, fetches each topic's /hot page, scrapes the description and follower count, and writes them back, 800 rows per worker thread.

#!/usr/bin/env python
import requests
import MySQLdb
import threading
import time
from bs4 import BeautifulSoup

def run(rows, headers, cursor):
    print "process start!\n"
    try:
        for r in rows:
            if r[2] not in spider_map:  # skip tokens that already have a description
                url = 'https://www.zhihu.com/topic/' + r[2] + '/hot'
                response = requests.get(url, headers=headers)
                soup = BeautifulSoup(response.text, 'lxml')
                followed_el = soup.find('strong')                  # follower count
                desc_el = soup.find('div', 'zm-editable-content')  # topic description
                followed = followed_el.get_text() if followed_el else ''
                description = desc_el.get_text().strip() if desc_el else ''
                # parameterized query: the driver escapes quotes in the scraped text
                cursor.execute(
                    "update zhihu_tag set description=%s, followed=%s where token=%s",
                    (description, followed, r[2]))
                print r[2], followed
                time.sleep(3)  # throttle: one request every three seconds
    except Exception as e:
        print e

headers = {
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
threads = []
spider_map = set()

conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
cursor.execute('set names utf8')
cursor.execute('use test')
cursor.execute('select * from zhihu_tag')
res = cursor.fetchall()

# tokens that already have a description were crawled before; remember them
cursor.execute("select * from zhihu_tag where description != ''")
for r in cursor.fetchall():
    spider_map.add(r[2])

# one thread per batch of 800 rows; note that every thread shares the one
# cursor above, which MySQLdb does not promise to be safe. The sketch
# after this script gives each worker its own connection.
for i in range(0, len(res), 800):
    t = threading.Thread(target=run, args=(res[i:i + 800], headers, cursor))
    threads.append(t)
for t in threads:
    t.start()
for t in threads:
    t.join()
print "done\n"
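MySQLdb connections are not thread-safe, yet the script above hands the same cursor to every thread, so concurrent updates can interleave on the wire. A minimal sketch of the per-thread alternative, assuming the same credentials and zhihu_tag schema as above (run_batch and the sample rows are hypothetical, not from the original post):

import threading
import MySQLdb

def run_batch(rows):
    # each worker owns its connection and cursor for its whole lifetime
    conn = MySQLdb.connect('127.0.0.1', 'root', '123456789', 'test', charset='utf8')
    conn.autocommit(1)
    cursor = conn.cursor()
    try:
        for token, description, followed in rows:
            cursor.execute(
                "update zhihu_tag set description=%s, followed=%s where token=%s",
                (description, followed, token))
    finally:
        cursor.close()
        conn.close()

# rows would come from the select in the script above, reshaped into
# (token, description, followed) tuples; one sample row for illustration:
rows = [('19776749', 'sample description', '42')]
threads = [threading.Thread(target=run_batch, args=(rows[i:i + 800],))
           for i in range(0, len(rows), 800)]
for t in threads:
    t.start()
for t in threads:
    t.join()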


# Example 2

The second script walks the topic tree. Starting from the root topic's /organize/entire page, it extracts every child-topic link (a[data-za-element-name]), inserts each (title, token) pair into zhihu_tag_py, and appends the child's own /organize/entire URL to the work list so the crawl keeps descending.

#!/usr/bin/env python
import requests
import time
import MySQLdb
from bs4 import BeautifulSoup

def run(urls):
    global conn, cursor  # reuse the module-level connection; reconnect if it dropped
    try:
        conn.ping()
    except Exception:
        conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
        conn.autocommit(1)
        cursor = conn.cursor()
        cursor.execute('set names utf8')
        cursor.execute('use test')
    # `urls` doubles as a work queue: the for loop picks up items
    # appended during iteration, so the crawl keeps descending
    for url in urls:
        print url
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        for a in soup.select('a[data-za-element-name]'):  # child-topic links
            title = a.get_text().encode('utf-8')
            token = a['data-token'].encode('utf-8')
            tags_map[token] = title
            try:
                # parameterized insert; duplicates just raise and are skipped
                cursor.execute("insert into zhihu_tag_py(title,token) values(%s,%s)",
                               (title, token))
                print title, token
            except Exception as e:
                print e
            # no visited-set here, so topics reachable from several parents
            # are fetched repeatedly; see the deduplicated sketch below
            child_url = "https://www.zhihu.com/topic/%s/organize/entire" % token
            urls.append(child_url)
            print child_url

tags_map = {}
urls = []
conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
cursor.execute('set names utf8')
cursor.execute('use test')
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    # session cookie copied from a logged-in browser session
    'Cookie': 'd_c0="AABCKio7GwuPTg_zEKo2OENBOST2OEj7t3M=|1483586009"; aliyungf_tc=AQAAAPcIxkOnfwYAAqDsdHeqF0qZN/4B; r_cap_id="NDI0OTAyMTBjOTNkNDZkZDg4MjMzZWM4MDVlZWJkYWI=|1484017503|eb9a8f0838c44b8987b867f3b8b1d25b9bcf8de4"; _xsrf=1ebb4fc81969fe6fad33c2453d185a3c; _zap=220c7b1f-b56f-4281-9c9f-64af404464f3; _ga=GA1.2.19806769.1484042429; s-q=%E7%9C%8B%E5%BE%85; s-i=7; sid=tb9hcp0o; s-t=autocomplete; q_c1=09e4a486f51a4f3cb3285e36005f5a49|1485152343000|1485152343000; l_cap_id="OGMzODA3ZGJkMDg2NGY2NmExOGE2YzFkZWYzNzcyYzA=|1486088492|97402dd9f896aead9a3eed24afb33172233d374f"; cap_id="Y2I1NDMxMDczYWM2NDRjNWEzODVhMDdkYjlhMWE1ZDk=|1486088492|6c461c19070b5b9a5327659d4eea4cdc8537d13a"; login="OWYyM2FkMTlkNzI2NDI3Nzk5OGVmMWFjMjk2NDA5NDc=|1486088622|0126ec417721560f8e5eb13d27745353b6cde26a"; n_c=1; __utma=51854390.19806769.1484042429.1486094665.1486098722.5; __utmb=51854390.0.10.1486098722; __utmc=51854390; __utmz=51854390.1486087368.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20161216=1^3=entry_date=20161216=1; z_c0=Mi4wQUVBQ0RlNkhBUXNBQUVJcUtqc2JDeGNBQUFCaEFsVk40WGE3V0FCdWsxaGpkTkY1YXJnQTRTd1dHXzVBc0t6ZDRR|1486105848|7ba9e77adc2be0c1966b1f890bb34f553288bd72',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/topic/19776749/organize/entire',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# seed the work list with the root topic's direct children
rootUrl = 'https://www.zhihu.com/topic/19776749/organize/entire'
response = requests.get(rootUrl, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
for a in soup.select('a[data-za-element-name]'):
    token = a['data-token']
    tags_map[token] = a.get_text()
    urls.append("https://www.zhihu.com/topic/%s/organize/entire" % token)

run(urls)
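Because Zhihu's topic graph is not a tree (a topic can sit under several parents), the work list above revisits the same token again and again. A deduplicated breadth-first walk is a small change; fetch_children below is a hypothetical stand-in for the requests + BeautifulSoup parsing in the script:

from collections import deque

def crawl(root_token, fetch_children):
    # fetch_children(token) should return (title, token) pairs parsed
    # from https://www.zhihu.com/topic/<token>/organize/entire
    seen = set([root_token])
    queue = deque([root_token])
    while queue:
        token = queue.popleft()
        for title, child in fetch_children(token):
            if child not in seen:  # visit each topic exactly once
                seen.add(child)
                queue.append(child)

With this shape, crawl('19776749', fetch_children) replaces both the seed loop and the run(urls) call above, and the seen set bounds the crawl to one fetch per topic.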