selenium+python+BeautifulSoup爬取知乎文章信息
来源:互联网 发布:网上开店软件 编辑:程序博客网 时间:2024/05/16 01:05
本文通过selenium+python+BeautifulSoup来爬取知乎文章信息。
# Zhihu recommended-article scraper (2017/8/6)
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import csv
import os
import time
import re

driver = webdriver.Chrome()


def putcookies(account, password, retries=3):
    """Log in to Zhihu with the given credentials.

    Switches the QR-code panel to the account/password form, submits,
    and waits for the user to type a captcha manually if one appears.
    Retries at most `retries` times instead of recursing forever
    (the original retried unboundedly on any error).
    """
    try:
        driver.get('https://www.zhihu.com/#signin')
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.index-tab-navs > div > a.active")))
        # Switch from the QR-code panel to the password login form.
        botton = driver.find_element_by_css_selector(
            'body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > div.qrcode-signin-container > div.qrcode-signin-step1 > div.qrcode-signin-cut-button > span')
        botton.click()
        form = driver.find_element_by_css_selector(
            'body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > form > div.group-inputs > div.account.input-wrapper > input[type="text"]')
        pas = driver.find_element_by_css_selector(
            'body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > form > div.group-inputs > div.verification.input-wrapper > input[type="password"]')
        sub = driver.find_element_by_css_selector(
            'body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > form > div.button-wrapper.command > button')
        form.send_keys(account)
        pas.send_keys(password)
        sub.click()
        try:
            # A captcha may block the login; give the user time to enter it,
            # then probe for the logged-in search bar to confirm success.
            print('请手动输入验证码')
            driver.implicitly_wait(10)
            driver.find_element_by_css_selector(
                '#root > div > div:nth-child(2) > header > div > div.SearchBar > button')
        except NoSuchElementException:
            # Search bar not found yet — resubmit once after the captcha.
            sub.click()
    except (TimeoutException, NoSuchElementException, WebDriverException):
        # Narrowed from a bare `except:` that recursed without limit.
        if retries > 0:
            putcookies(account, password, retries - 1)
        else:
            raise


def change_page(num):
    """Scroll to the bottom of the feed `num` times to trigger lazy loading."""
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#root > div > div:nth-child(2) > header > div > div.SearchBar > button')))
    for _ in range(num):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(3)  # give the newly-loaded cards time to render


def findinf(html):
    """Parse the feed page HTML and yield one dict per article card.

    Each dict has keys: maininf (topic), writer (author), intd (author bio),
    title, support (up-votes), talking (comment count), content (summary),
    href (article URL). Cards missing the topic/author nodes are skipped.
    """
    soup = BeautifulSoup(html, 'lxml')
    r = re.compile(r'(\d+)')  # raw string: extract leading digits of the comment count
    links = soup.find_all('div', class_='Card TopstoryItem')
    for link in links:
        try:
            maininf = link.find(class_='Feed-meta-item').get_text()[-3:]  # topic
            writer = link.find(class_='AuthorInfo-head').get_text()       # author
        except AttributeError:
            # Ad / placeholder card without the expected nodes — skip it.
            continue
        try:
            intd = link.find('div', class_='RichText AuthorInfo-badgeText').string  # author bio
        except AttributeError:
            intd = ''
        title = link.find('h2', class_='ContentItem-title').get_text()  # title
        href = 'https://www.zhihu.com' + link.find('h2', class_='ContentItem-title').a['href']  # article link
        try:
            support = link.find(class_='Button VoteButton VoteButton--up').get_text()  # up-votes
        except AttributeError:
            # Articles vs. answers use different vote-button markup.
            support = link.find(class_='Button LikeButton ContentItem-action').get_text()
        try:
            talking = r.match(
                link.find('button', class_='Button ContentItem-action Button--plain').get_text()[:-3]).group()  # comment count
        except AttributeError:
            talking = ''
        content = link.find('span', class_='RichText CopyrightRichText-richText').get_text()  # summary
        yield {
            'maininf': maininf,
            'writer': writer,
            'intd': intd,
            'title': title,
            'support': support,
            'talking': talking,
            'content': content,
            'href': href,
        }


def make(path):
    """Create the output directory if it does not already exist."""
    os.makedirs(path, exist_ok=True)


def save_to_csv(inf, path):
    """Append the article dicts in `inf` to the summary CSV under `path`.

    Writes the header row only when the file is new/empty (the original
    re-wrote the header on every run). Opens with utf-8-sig + newline=''
    so Excel reads the Chinese text and no blank rows appear on Windows.
    """
    filename = path + '知乎文章信息概要采集.csv'
    new_file = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(['标题', '作者', '话题', '作者个人介绍', '点赞数', '评论数', '文章链接', '摘要'])
        for i in inf:
            try:
                writer.writerow([i['title'], i['writer'], i['maininf'], i['intd'],
                                 i['support'], i['talking'], i['href'], i['content']])
            except (KeyError, UnicodeEncodeError) as e:
                # Best-effort: skip the one bad row instead of silently
                # aborting the whole save (original used bare except: pass).
                print('跳过一条记录:', e)


def main(account, password, num):
    """Log in, scroll `num` pages, then print and save the parsed articles."""
    path = 'D:/数据/知乎文章/'
    putcookies(account, password)
    change_page(num)
    # Parse the page once and reuse the result for printing and saving
    # (the original parsed the same page source twice).
    articles = list(findinf(driver.page_source))
    make(path)
    print('---' * 43)
    print('{:^60}'.format('知乎文章概要'))
    print("***" * 43)
    for i in articles:
        print('标题:{:<10s}'.format(i['title']))
        print('作者:{:>3s}'.format(i['writer']), end=' ' * 5)
        print("话题:{:>3s}".format(i['maininf']))
        print('作者个人介绍:')
        print('{:<5s}'.format(i['intd']))
        print('点赞数:{:<2s}'.format(i['support']), end=' ' * 5)
        print("评论数:{:3s}".format(i['talking']))
        print("文章链接:" + i['href'])
        print("摘要:")
        print('{:<5s}'.format(i['content']))
        print('---' * 43)
    save_to_csv(articles, path)


if __name__ == '__main__':
    num = int(input('请输入要爬取的页面数:'))
    account = input("请输入知乎账号:")
    password = input("请输入知乎密码:")
    time_start = time.time()
    main(account, password, num)
    print("^^^" * 43)
    print("共耗时{}秒".format(time.time() - time_start))
    driver.quit()
阅读全文
1 0
- selenium+python+BeautifulSoup爬取知乎文章信息
- 使用Python+selenium+BeautifulSoup抓取动态网页的关键信息
- [python爬虫] BeautifulSoup和Selenium对比爬取豆瓣Top250电影信息
- [python爬虫] BeautifulSoup和Selenium简单爬取知网信息测试
- BeautifulSoup和Selenium对比爬取豆瓣Top250电影信息
- 【python系列】使用 BeautifulSoup 获取 meta 信息
- Python爬取淘宝搜索页,使用Selenium+BeautifulSoup
- python selenium+beautifulSoup爬取彩票网数据
- BeautifulSoup和Selenium对比
- python爬虫,抓取新浪科技的文章(beautifulsoup+mysql)
- 使用selenium+BeautifulSoup+正则表达式下载公众号我要whatyouneed文章里的音乐
- python beautifulsoup
- Python BeautifulSoup
- Python BeautifulSoup
- python中用Beautifulsoup提取集搜客网站的信息
- Python爬取百度百科,BeautifulSoup提取关键信息
- Python网络爬虫与信息提取(二) BeautifulSoup库
- python eclipse 插件安装 及BeautifulSoup requests selenium在线安装 PhantomJS 安装 环境配置
- dwarf
- 笔试题(1)
- 链表
- linux内核编程4部曲之一:linux内核编译(2.6.12版本)图文解说
- 劝学
- selenium+python+BeautifulSoup爬取知乎文章信息
- nyoj448 寻找最大数
- A cat
- HDU-3440 House Man(差分约束系统)
- 应用程序无法正常启动(0xc000007b)
- nyoj 586 疯牛
- MapReduce基础:HDFS的数据完整性
- Spring中集成Mybatis分页插件PageHelper
- jquery validate实现表单验证 (正则表达式)