urllib 结合bs4通过调用selenium-webdriver来爬取jianshu首页文章,并依次阅读前5篇

来源:互联网 发布:linux kafka 编辑:程序博客网 时间:2024/06/08 04:22
开发环境:Win10+Python 3.6.1 64bit+PyCharm
"""Scrape the jianshu.com front page with urllib + BeautifulSoup, save the
article titles and links to a text file, then open the first 5 articles in
Firefox via selenium-webdriver, "reading" each one for 60 seconds.

Dev environment (per the article): Win10 + Python 3.6.1 64bit + PyCharm.
"""
from urllib import request
from bs4 import BeautifulSoup
from selenium import webdriver
import random  # kept from the original script (unused here)
import time

# Launch a Firefox browser session.
driver = webdriver.Firefox()
url = "https://www.jianshu.com"

# Build request headers to mimic a real browser.
# FIX: the HTTP header name is "User-Agent" (hyphen). The original used
# "User_Agent", which is sent as an unrecognized header, so the UA spoof
# silently did nothing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0)"
                  " Gecko/20100101 Firefox/58.0"
}

# Fetch the front-page HTML and decode it as UTF-8.
req = request.Request(url, headers=headers)
html = request.urlopen(req).read().decode("utf-8")

# Parse with the stdlib html.parser backend and collect every
# <a class="title"> element (article title links).
soup = BeautifulSoup(html, "html.parser")
titles = soup.find_all("a", "title")

# Persist title + absolute link, one pair per article.
# Raw string avoids backslash-escape surprises in the Windows path.
# FIX: write full "http://" URLs (the original wrote a scheme-less
# "www.jianshu.com" prefix, inconsistent with the browsing loop below),
# and convert href to str BEFORE concatenating so a missing href (None)
# cannot raise TypeError.
with open(r"D:\python\爬虫\jianshu.txt", "w", encoding="utf-8") as file:
    for title in titles:
        # FIX: title.string is None for tags with nested markup;
        # get_text() always returns a str.
        file.write(title.get_text(strip=True) + "\n")
        file.write("http://www.jianshu.com" + str(title.get("href")) + "\n")

# Open the first 5 articles in the browser (fewer if the page yielded
# fewer links — FIX: the original indexed titles[0..4] unconditionally
# and would raise IndexError on a short result list).
for i in range(min(5, len(titles))):
    article_url = "http://www.jianshu.com" + str(titles[i].get("href"))
    print(i)
    driver.get(article_url)
    # 60 seconds of "reading" time per article.
    time.sleep(60)

# FIX: release the browser session when done (the original leaked it).
driver.quit()

原创粉丝点击