python爬虫进阶(三):微博的抓取
来源:互联网 发布:申请域名后 编辑:程序博客网 时间:2024/05/17 20:31
说明:仅做学习之用
下面用常用的两种方法来爬取微博:使用selenium+phantomjs和API解析
一、使用selenium+phantomjs
最重要的是设置user_agent,否则无法跳转链接
当然,还可以设置其它更多参数
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# A desktop browser user agent must be set, otherwise weibo.com will not
# let the headless browser follow links past the landing page.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)

# Copy the default PhantomJS capabilities and override the user agent;
# more page settings can be injected through the same dict if needed.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)
一般的,我们现在浏览器console控制台找到我们想要的东西
// Run in the browser dev-tools console to fill the Weibo login form:
// the account field has id "loginname", the password field is located
// by its name attribute (first match).
document.getElementById('loginname').value = '123'
document.getElementsByName('password')[0].value = '123'
下面是两种登录方式,但我在登录过程中遇到了验证码问题,后来在输入用户名和密码后各自休息了几秒钟,居然登录成功了,难道是程序输入太快而被阻止了吗???
# Option 1: drive the form fields directly by id / name.
# driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver = webdriver.Chrome()
# driver.set_window_size(1280, 2400)
driver.get('https://www.weibo.com/')
time.sleep(10)  # let the login page finish rendering
driver.find_element_by_id('loginname').send_keys(username)
time.sleep(5)   # pausing between inputs seems to avoid the captcha
driver.find_element_by_name('password').send_keys(password)
time.sleep(2)
driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()

# Option 2: explicit waits with CSS selectors copied from dev tools
# (right-click the node > Copy > Copy selector) — no fixed sleeps needed
# for element lookup, WebDriverWait polls until the element appears.
# wait = WebDriverWait(driver, 10)
# u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#loginname')))
# p_word = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
# login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))
# u_id.send_keys(username)
# time.sleep(5)
# p_word.send_keys(password)
# time.sleep(2)
# login.click()

time.sleep(15)  # wait for the post-login redirect to complete
html = driver.page_source
print(html)
第一种方法代码简单，但需要自己分析网页源码中元素的位置，而且要设置一个sleep等待页面加载成功：如果等待时间太长，效率低；如果等待时间太短，则有可能达不到效果。
第二种方法代码复杂一些，需要导入一些库，但它不用人工分析网页源码，可以自动等待页面加载，只需要在chrome浏览器开发者工具中找到目标元素，然后右键 > Copy > Copy selector 即可。
就这样登录成功了!!!!
当我们在翻阅别人的微博时,只能翻一页就需要登录
剩下的就是结合需求,分析源码,找到你想要的内容了!!!!
代码:
import hashlib
import threading
import time
import re
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from pybloom import BloomFilter
from collections import deque

# A desktop user agent is required, otherwise weibo.com blocks navigation.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)
username = 'your weibo ID'
password = 'your password'

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent

# Crawler for the personal home page (feed / posts).
# feeds_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
feeds_crawler = webdriver.Chrome()
feeds_crawler.set_window_size(1280, 2400)

# Crawler for the profile pages: follows, fans, micro-blog counts.
# user_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
user_crawler = webdriver.Chrome()
# BUGFIX: the original resized feeds_crawler a second time here,
# leaving user_crawler at the default window size.
user_crawler.set_window_size(1280, 2400)

domain = 'weibo.com'
url_home = 'http://' + domain
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)  # dedup of already-queued URLs
cur_queue = deque()                                # frontier of user pages to crawl
seed_user = 'http://weibo.com/yaochen'

# Thresholds that decide whether a discovered user is worth crawling.
min_mblogs_allowed = 100           # minimum number of posts the user must have
max_follow_fans_ratio_allowed = 3  # follows/fans ratio must not exceed this


def extract_user(users):
    """Walk the follow list shown in user_crawler.

    Appends a dict per qualifying user to *users*, enqueues each user's
    page for later crawling, and pages through up to 20 follow-list pages.
    Returns *users* once no further page exists.
    """
    print('extract user')
    for _ in range(0, 20):
        for user_element in user_crawler.find_elements_by_xpath('//*[contains(@class, "follow_item")]'):
            tried = 0
            while tried < 3:  # retry: the page may still be rendering
                try:
                    user = {}
                    # The three info spans are, in order: follows, fans, posts.
                    spans = user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')
                    links = user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span//a')
                    user['follows'] = re.findall(r'(\d+)', spans[0].text)[0]
                    user['follows_link'] = links[0].get_attribute('href')
                    user['fans'] = re.findall(r'(\d+)', spans[1].text)[0]
                    user['fans_link'] = links[1].get_attribute('href')
                    user['mblogs'] = re.findall(r'(\d+)', spans[2].text)[0]
                    user_link = user_element.find_element_by_xpath('.//div[contains(@class,"info_name")]/a')
                    # Strip the query string; prepend the domain for relative links.
                    user['link'] = re.findall(r'(.+)\?', user_link.get_attribute('href'))[0]
                    if user['link'][:4] != 'http':
                        user['link'] = domain + user['link']
                    user['name'] = user_link.text
                    # Store only the avatar file name: the host/resolution parts
                    # of the image URL change, the file name does not.
                    user['icon'] = re.findall(
                        r'/([^/]+)$',
                        user_element.find_element_by_xpath('.//dt[@class="mod_pic"]/a/img').get_attribute('src'))[0]
                    print('--------------------')
                    print(user['name'] + ' follows: ' + user['follows'] + ' blogs:' + user['mblogs'])
                    print(user['link'])
                    # Skip low-value users: too few posts, or the user follows
                    # far more accounts than follow them back.
                    if int(user['mblogs']) < min_mblogs_allowed or int(user['follows']) / int(user['fans']) > max_follow_fans_ratio_allowed:
                        break
                    enqueueUrl(user['link'])
                    users.append(user)
                    break
                except Exception:
                    time.sleep(1)
                    tried += 1
        if go_next_page(user_crawler) is False:
            return users


def scroll_to_bottom():
    """Scroll the feed page until the "next page" link appears.

    Weibo lazy-loads the feed; scrolling triggers loading.  Returns the
    next-page href when found, or None after 50 attempts.  If loading
    stalls, clicks the reload link (Chinese text: "click to reload").
    """
    print('scroll down !!')
    for _ in range(50):
        feeds_crawler.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        html = feeds_crawler.page_source
        res = etree.HTML(html)
        next_page_url = res.xpath('//a[contains(@class,"page next")]')
        if len(next_page_url) > 0:
            return next_page_url[0].get('href')
        if len(re.findall('点击重新载入', html)) > 0:
            print('scrolling failed, reload it')
            feeds_crawler.find_element_by_link_text('点击重新载入').click()
        time.sleep(1)


def go_next_page(cur_driver):
    """Navigate *cur_driver* to its "next page" link.

    Returns True on success, False when no next-page link exists
    (i.e. the last page has been reached).
    """
    try:
        next_page = cur_driver.find_element_by_xpath('//a[contains(@class, "page next")]').get_attribute('href')
        print('next page is ' + next_page)
        cur_driver.get(next_page)
        time.sleep(3)
        return True
    except Exception:
        print('next page is not found')
        return False


def extract_feed(feeds):
    """Extract posts (time, text, image file names) from feeds_crawler.

    Appends one dict per post to *feeds*, paging through up to 20 pages.
    Returns *feeds* once no further page exists.
    """
    for _ in range(20):
        scroll_to_bottom()
        for element in feeds_crawler.find_elements_by_class_name('WB_detail'):
            tried = 0
            while tried < 3:  # retry: the element may still be rendering
                try:
                    feed = {}
                    feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
                    feed['content'] = element.find_element_by_class_name('WB_text').text
                    feed['image_names'] = []
                    # Only the image file name is stored (hosts/resolutions vary).
                    for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
                        feed['image_names'].append(re.findall(r'/([^/]+)$', image.get_attribute('src')))
                    feeds.append(feed)
                    print('--------------------')
                    print(feed['time'])
                    print(feed['content'])
                    break
                except Exception:
                    tried += 1
                    time.sleep(1)
        if go_next_page(feeds_crawler) is False:
            return feeds


def enqueueUrl(url):
    """Add *url* to the crawl queue unless its MD5 is already in the Bloom filter.

    Python 3 requires bytes for hashing; UnicodeEncodeError (raised when the
    URL cannot be encoded as gb2312) is a subclass of ValueError, so the
    except clause covers it.
    """
    try:
        md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
        if md5v not in download_bf:
            print(url + ' is added to queue')
            cur_queue.append(url)
            download_bf.add(md5v)
    except ValueError:
        print('enqueueUrl err !!!!!')


def Login(username, password):
    """Log both crawlers into weibo.com.

    :param username: your weibo id
    :param password: your password
    :return: None
    """
    feeds_crawler.get(url=url_home)
    user_crawler.get(url=url_home)
    time.sleep(8)
    print('find click button to login')
    feeds_crawler.find_element_by_id('loginname').send_keys(username)
    feeds_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # Click the login button.
    feeds_crawler.find_element_by_xpath('//div[contains(@class, "login_btn")][1]/a').click()
    # Alternatively, execute a snippet of javascript:
    # feeds_crawler.execute_script('document.getElementsByClassName("W_btn_a btn_32px")[0].click()')
    # The second crawler must log in as well.
    user_crawler.find_element_by_id('loginname').send_keys(username)
    user_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    user_crawler.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()


def dequeUrl():
    """Pop and return the next URL from the crawl queue (FIFO order)."""
    return cur_queue.popleft()


def get_element_by_xpath(cur_driver, path):
    """Poll *cur_driver*'s page source until *path* matches.

    Returns the matched lxml elements, or [] after six one-second attempts.
    BUGFIX: the original never incremented `tried`, so a missing element
    spun this loop forever.
    """
    tried = 0
    while tried < 6:
        res = etree.HTML(cur_driver.page_source)
        elements = res.xpath(path)
        if elements:
            return elements
        tried += 1
        time.sleep(1)
    return []


def fetch_user(url):
    """Download one user's page: extract profile info and spawn feed extraction."""
    print('Downloading ' + url)
    feeds_crawler.get(url)
    time.sleep(5)
    # Extract the account name.
    account_name = get_element_by_xpath(feeds_crawler, '//h1')[0].text
    photo = get_element_by_xpath(feeds_crawler, '//p[@class="photo_wrap"]/img')[0].get('src')
    account_photo = re.findall(r'/([^/]+)$', photo)
    # Extract the link to the follow list and open it in user_crawler.
    follows_link = get_element_by_xpath(feeds_crawler, '//a[@class="t_link S_txt1"]')[0].get('href')
    print('account: ' + account_name)
    print('follows link is ' + follows_link)
    follows_link = 'http:' + follows_link
    user_crawler.get(follows_link)
    feeds = []
    users = []
    # Two threads can extract feeds and follows concurrently; the user
    # thread is currently disabled.
    t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
    # t_users = threading.Thread(target=extract_user, name=None, args=(users,))
    t_feeds.daemon = True  # setDaemon() is deprecated since Python 3.10
    # t_users.daemon = True
    t_feeds.start()
    # t_users.start()
    t_feeds.join()
    # t_users.join()


def crawl():
    """Drain the crawl queue, fetching each queued user page.

    BUGFIX: the original `while True` raised IndexError from popleft()
    once the queue emptied; now the loop exits cleanly.
    """
    while cur_queue:
        fetch_user(dequeUrl())


def main():
    enqueueUrl(seed_user)
    Login(username, password)
    crawl()


if __name__ == '__main__':
    main()
注意:在python3中用hashlib库进行加密时,要将加密内容转换成byte类型:
用于判断是否已经抓取,也可用前面讲的mmh库
>>> import hashlib
>>> url = 'http://www.weibo.com'
>>> md5v = hashlib.md5(url).hexdigest()
Traceback (most recent call last):
  File "<pyshell#4>", line 1, in <module>
    md5v = hashlib.md5(url).hexdigest()
TypeError: Unicode-objects must be encoded before hashing
>>> md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v2 = hashlib.md5(b'www.baidu.com').hexdigest()
>>> md5v
'17d7b29a31328702848d2d42ae79a240'
>>> md5v2
'dab19e82e1f9a681ee73346d3e7a575e'
>>> md5v3 = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v3
'17d7b29a31328702848d2d42ae79a240'
>>>
关于微博图片:
实际上,微博中只有图片文件名没有改变,而存储域名、分辨率可能会改变,因此在储存时,只需要储存图片文件名即可,当我们要用时补充完整即可。
二、API接口分析
之前已经掌握,这里不再赘述!!
网上也有很多
阅读全文
0 0
- python爬虫进阶(三):微博的抓取
- python写爬虫2-数据抓取的三种方式
- Python爬虫实战之抓取淘宝MM照片(三)
- Python抓取段子的爬虫
- python爬虫进阶(七):应对反爬虫的策略
- python利用新浪API实现数据的抓取\python微博数据爬虫
- python爬虫,抓取新浪科技的文章(beautifulsoup+mysql)
- python爬虫抓取的一些难点
- Python爬虫抓取框架:Scrapy的架构
- 简单的python爬虫抓取图片实例
- python爬虫:抓取页面上的超链接
- python 爬虫(三)
- python爬虫(三)
- python爬虫(三)
- Python爬虫之三种网页抓取方法性能比较
- Python爬虫实战三 | 蓝奏网盘抓取网盘链接信息
- python爬虫(一)抓取 色影无忌图片
- python爬虫(抓取百度新闻列表)
- ResultSet结果集
- LRU策略
- leetcode --5. Longest Palindromic Substring
- 正则表达式后向引用
- css根据屏幕大小切换样式
- python爬虫进阶(三):微博的抓取
- mha配置高可用mysql
- opencv 视频操作
- JFinal之旅——find方法及分页查询
- HDU
- 统计0~n中0~9的个数
- Jzzhu and Cities ----CodeForces
- pip和deb的离线打包安装(备份恢复)
- SpringBoot框架中URL参数如何进行Base64加密解密