urllib 结合bs4通过调用selenium-webdriver来爬取jianshu首页文章,并依次阅读前5篇

来源:互联网 发布:linux kafka 编辑:程序博客网 时间:2024/06/08 04:22
开发环境:Win10+Python 3.6.1 64bit+PyCharm
"""Scrape the jianshu.com front page with urllib + BeautifulSoup, save the
article titles and links to a text file, then open the first 5 articles in
Firefox via selenium-webdriver, "reading" each one for 60 seconds.

Dev environment (per the article): Win10 + Python 3.6.1 64bit + PyCharm.
"""
from urllib import request
from bs4 import BeautifulSoup
from selenium import webdriver
import random  # kept from the original script (unused here)
import time

# Launch a Firefox browser session.
driver = webdriver.Firefox()
url = "https://www.jianshu.com"

# Build request headers to mimic a real browser.
# FIX: the HTTP header name is "User-Agent" (hyphen). The original used
# "User_Agent", which is sent as an unrecognized header, so the UA spoof
# silently did nothing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0)"
                  " Gecko/20100101 Firefox/58.0"
}

# Fetch the front-page HTML and decode it as UTF-8.
req = request.Request(url, headers=headers)
html = request.urlopen(req).read().decode("utf-8")

# Parse with the stdlib html.parser backend and collect every
# <a class="title"> element (article title links).
soup = BeautifulSoup(html, "html.parser")
titles = soup.find_all("a", "title")

# Persist title + absolute link, one pair per article.
# Raw string avoids backslash-escape surprises in the Windows path.
# FIX: write full "http://" URLs (the original wrote a scheme-less
# "www.jianshu.com" prefix, inconsistent with the browsing loop below),
# and convert href to str BEFORE concatenating so a missing href (None)
# cannot raise TypeError.
with open(r"D:\python\爬虫\jianshu.txt", "w", encoding="utf-8") as file:
    for title in titles:
        # FIX: title.string is None for tags with nested markup;
        # get_text() always returns a str.
        file.write(title.get_text(strip=True) + "\n")
        file.write("http://www.jianshu.com" + str(title.get("href")) + "\n")

# Open the first 5 articles in the browser (fewer if the page yielded
# fewer links — FIX: the original indexed titles[0..4] unconditionally
# and would raise IndexError on a short result list).
for i in range(min(5, len(titles))):
    article_url = "http://www.jianshu.com" + str(titles[i].get("href"))
    print(i)
    driver.get(article_url)
    # 60 seconds of "reading" time per article.
    time.sleep(60)

# FIX: release the browser session when done (the original leaked it).
driver.quit()

原创粉丝点击