简书

来源：互联网 发布：大数据培训学什么课程 编辑：程序博客网 时间：2024/05/16 15:52

from selenium import webdriver
from bs4 import BeautifulSoup
import requests,re,os,time
# Crawl Jianshu search results for "Python selenium PhantomJS" and save the
# text of every article found to a local file, one file per article.

BASE_URL = 'http://www.jianshu.com'
SEARCH_URL = 'http://www.jianshu.com/search?q=Python+selenium+PhantomJS&page=%d&type=notes'
# Directory the article files are written into (created on start-up if missing).
OUTPUT_DIR = 'f://简书'
# Seconds to wait for an article download before giving up on it.
REQUEST_TIMEOUT = 30
# Pause before each article so the site is not hammered with requests.
ARTICLE_DELAY = 4
HEADERS = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"}
# Characters that may not appear in a Windows file name, plus line breaks/tabs.
ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def sanitize_title(title):
    """Return *title* with characters that are illegal in file names removed.

    Falls back to ``'untitled'`` when nothing usable is left, so the caller
    always gets a non-empty file name stem.
    """
    cleaned = ILLEGAL_FILENAME_CHARS.sub('', title).strip()
    return cleaned or 'untitled'


def build_link(href):
    """Turn the ``href`` of a search result into an absolute article URL.

    Raises:
        ValueError: if *href* is empty or ``None``.
    """
    if not href:
        raise ValueError('search result has no href')
    if href.startswith(('http://', 'https://')):
        return href
    # Strip leading slashes so the joined URL never contains "//" in its path.
    return BASE_URL + '/' + href.lstrip('/')


def save_article(entry):
    """Download the article behind one search-result ``<li>`` and write it to disk.

    The first four links of *entry* are read as title, name, read and comment
    (the names used by the original script); their texts are written as a
    header line, followed by the article body with a line break after every
    Chinese full stop.

    Raises:
        ValueError: if the entry or the article page lacks the expected markup.
        requests.RequestException: if the article cannot be downloaded.
        OSError: if the output file cannot be written.
    """
    anchors = entry('a')  # calling a Tag is BeautifulSoup shorthand for find_all()
    if len(anchors) < 4:
        raise ValueError('search result does not contain the expected four links')

    title = anchors[0].text.strip()
    link = build_link(anchors[0].get('href'))
    print(link)
    name, read, comment = (anchor.text for anchor in anchors[1:4])
    info = title + link + name + read + comment
    print(info)

    response = requests.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    body = BeautifulSoup(response.text, 'lxml').find("div", "show-content")
    if body is None:
        raise ValueError('article body (div.show-content) not found at %s' % link)
    text = body.text.replace('。', '。\n')

    path = os.path.join(OUTPUT_DIR, '%s.text' % sanitize_title(title))
    # Explicit UTF-8 so Chinese text is not degraded to '?' by the locale codec.
    with open(path, 'w', encoding='utf-8', errors='replace') as f:
        f.write(info + '\n')
        print('下载中')
        f.write(text)


def scrape_page(driver, url):
    """Load one search-result page in *driver* and save every article listed on it.

    A failure on a single article is reported and skipped so that the
    remaining results on the page are still processed.
    """
    driver.get(url)
    print(driver.title)
    results = BeautifulSoup(driver.page_source, 'lxml').find("ul", "unstyled list")
    if results is None:
        print('no result list found on %s' % url)
        return
    for entry in results.find_all("li"):
        time.sleep(ARTICLE_DELAY)
        try:
            save_article(entry)
        except Exception as exc:  # best effort: report and move on to the next article
            print('skipped an article: %r' % (exc,))


def main():
    """Walk search-result pages 1..99 and save all articles they list."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # NOTE(review): PhantomJS support is deprecated in newer Selenium releases;
    # confirm the installed Selenium version still provides this driver.
    driver = webdriver.PhantomJS()
    try:
        for page in range(1, 100):
            url = SEARCH_URL % page
            print(url)
            try:
                scrape_page(driver, url)
            except Exception as exc:  # a broken page must not abort the whole crawl
                print('failed to process %s: %r' % (url, exc))
    finally:
        # Always shut the browser process down, even on Ctrl-C or a crash.
        driver.quit()


if __name__ == '__main__':
    main()

0 0