Python Crawler Advanced (3): Scraping Weibo


Note: for learning purposes only.


Below, Weibo is scraped with two common methods: selenium + PhantomJS, and analyzing the API.


1. Using selenium + PhantomJS


The most important thing is to set the user agent; otherwise the crawler cannot follow links properly.

Of course, more parameters can be set as well (a sketch of a few extra options follows the snippet below).

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE')
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)
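
For example, other PhantomJS page settings and command-line switches can be passed in the same way. This is only a rough sketch under my own assumptions: the loadImages and resourceTimeout settings and the --ignore-ssl-errors / --disk-cache switches do not appear in the original script, they are just commonly used options.

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent
dcap['phantomjs.page.settings.loadImages'] = False        # assumed option: skip images for faster page loads
dcap['phantomjs.page.settings.resourceTimeout'] = 10000   # assumed option: give up on slow resources after 10 s (in ms)
driver = webdriver.PhantomJS(
    desired_capabilities=dcap,
    service_args=['--ignore-ssl-errors=true', '--disk-cache=true'])  # assumed PhantomJS switches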


Logging in requires entering the username and password and then clicking the login button; fortunately, no captcha is needed here.




Generally, we first locate the elements we need from the browser's console, for example:

document.getElementById('loginname').value = '123'

document.getElementsByName('password')[0].value = '123'


Below are two ways to log in. During login I ran into a captcha at first, but after pausing for a few seconds after entering the username and again after the password, the login succeeded. Presumably typing too quickly is what triggered the verification.

import time
from selenium import webdriver
# imports needed for the commented-out explicit-wait variant below
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

#driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver = webdriver.Chrome()
#driver.set_window_size(1280, 2400)
driver.get('https://www.weibo.com/')
time.sleep(10)
driver.find_element_by_id('loginname').send_keys(username)
time.sleep(5)
driver.find_element_by_name('password').send_keys(password)
time.sleep(2)
driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()

# wait = WebDriverWait(driver, 10)
# u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#loginname')))
# p_word = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
# login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))
# u_id.send_keys(username)
# time.sleep(5)
# p_word.send_keys(password)
# time.sleep(2)
# login.click()

time.sleep(15)
html = driver.page_source
print(html)



The first method is simple, but you have to work out the element locations yourself and insert a sleep to wait for the page to load: if the wait is too long, it is inefficient; if it is too short, the elements may not be ready yet.

The second method is a bit more involved and needs a few extra imports, but it does not require reading the page source by hand and waits for the page automatically: just find the element in the Chrome DevTools source, then right-click > Copy > Copy selector. A cleaned-up sketch of this approach is shown below.
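
For reference, here is a minimal sketch of that second approach; the CSS selectors are the same ones pasted into the commented-out part of the snippet above (obtained via Copy selector, so they may change whenever Weibo updates its page), and username / password are assumed to hold your credentials.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.weibo.com/')

wait = WebDriverWait(driver, 10)   # poll for up to 10 seconds instead of a fixed sleep
u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#loginname')))
p_word = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
login = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))

u_id.send_keys(username)   # username / password defined elsewhere
time.sleep(5)              # short pauses that seemed to avoid the captcha
p_word.send_keys(password)
time.sleep(2)
login.click()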




And with that, the login succeeds!

When browsing someone else's Weibo without logging in, you can only view one page before being asked to log in.


All that is left is to analyze the page source according to your needs and extract the content you want.


Code:


import hashlib
import threading
import time
import re

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from pybloom import BloomFilter
from collections import deque

user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE')
username = 'your weibo ID'
password = 'your password'

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent

# crawler for personal home pages (posts)
#feeds_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
feeds_crawler = webdriver.Chrome()
feeds_crawler.set_window_size(1280, 2400)

# crawler for the profile pages: follows, fans, posts
#user_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
user_crawler = webdriver.Chrome()
user_crawler.set_window_size(1280, 2400)

domain = 'weibo.com'
url_home = 'http://' + domain
download_bf = BloomFilter(1024*1024*16, 0.01)
cur_queue = deque()
seed_user = 'http://weibo.com/yaochen'

# thresholds for deciding whether a user is worth crawling
min_mblogs_allowed = 100             # minimum number of posts a user must have
max_follow_fans_ratio_allowed = 3    # the follows/fans ratio must not exceed 3


def extract_user(users):
    print('extract user')
    for i in range(0, 20):
        for user_element in user_crawler.find_elements_by_xpath('//*[contains(@class, "follow_item")]'):
            tried = 0
            while tried < 3:
                try:
                    user = {}
                    user['follows'] = re.findall('(\d+)', user_element.find_element_by_xpath('.//div[@class="info_connect"]/span').text)[0]
                    user['follows_link'] = user_element.find_element_by_xpath('.//div[@class="info_connect"]/span//a').get_attribute('href')
                    user['fans'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[1].text)[0]
                    user['fans_link'] = user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span//a')[1].get_attribute('href')
                    user['mblogs'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[2].text)[0]
                    user_link = user_element.find_element_by_xpath('.//div[contains(@class,"info_name")]/a')
                    user['link'] = re.findall('(.+)\?', user_link.get_attribute('href'))[0]
                    if user['link'][:4] != 'http':
                        user['link'] = domain + user['link']
                    user['name'] = user_link.text
                    user['icon'] = re.findall('/([^/]+)$', user_element.find_element_by_xpath('.//dt[@class="mod_pic"]/a/img').get_attribute('src'))[0]
                    # name = user_element.find_element_by_xpath('.//a[@class="S_txt1"]')
                    print('--------------------')
                    print(user['name'] + ' follows: ' + user['follows'] + ' blogs:' + user['mblogs'])
                    print(user['link'])
                    # skip the user if the post count is below the threshold
                    # or the follows/fans ratio is above the threshold
                    if int(user['mblogs']) < min_mblogs_allowed or int(user['follows'])/int(user['fans']) > max_follow_fans_ratio_allowed:
                        break
                    enqueueUrl(user['link'])
                    users.append(user)
                    break
                except Exception:
                    time.sleep(1)
                    tried += 1
        if go_next_page(user_crawler) is False:
            return users


def scroll_to_bottom():
    print('scroll down !!')
    for i in range(50):
        feeds_crawler.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        html = feeds_crawler.page_source
        res = etree.HTML(html)
        next_page_url = res.xpath('//a[contains(@class,"page next")]')
        if len(next_page_url) > 0:
            return next_page_url[0].get('href')
        if len(re.findall('点击重新载入', html)) > 0:
            print('scrolling failed, reload it')
            feeds_crawler.find_element_by_link_text('点击重新载入').click()
        time.sleep(1)


def go_next_page(cur_driver):
    try:
        next_page = cur_driver.find_element_by_xpath('//a[contains(@class, "page next")]').get_attribute('href')
        print('next page is ' + next_page)
        cur_driver.get(next_page)
        time.sleep(3)
        return True
    except Exception:
        print('next page is not found')
        return False


def extract_feed(feeds):
    for i in range(20):
        scroll_to_bottom()
        # extract the post content
        for element in feeds_crawler.find_elements_by_class_name('WB_detail'):
            tried = 0
            while tried < 3:
                try:
                    feed = {}
                    feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
                    feed['content'] = element.find_element_by_class_name('WB_text').text
                    feed['image_names'] = []
                    for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
                        feed['image_names'].append(re.findall('/([^/]+)$', image.get_attribute('src')))
                    feeds.append(feed)
                    print('--------------------')
                    print(feed['time'])
                    print(feed['content'])
                    break
                except Exception:
                    tried += 1
                    time.sleep(1)
        if go_next_page(feeds_crawler) is False:
            return feeds


def enqueueUrl(url):
    # add a URL to the crawl queue if it has not been seen before
    try:
        # in Python 3 the string must be encoded to bytes before hashing
        md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
        if md5v not in download_bf:
            print(url + ' is added to queue')
            cur_queue.append(url)
            download_bf.add(md5v)
    except ValueError:
        print('enqueueUrl err !!!!!')


def Login(username, password):
    # log in on the login page
    '''
    :param username: your weibo id
    :param password: your password
    :return:
    '''
    feeds_crawler.get(url=url_home)
    user_crawler.get(url=url_home)
    time.sleep(8)
    print('find click button to login')
    feeds_crawler.find_element_by_id('loginname').send_keys(username)
    feeds_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # click the login button
    feeds_crawler.find_element_by_xpath('//div[contains(@class, "login_btn")][1]/a').click()
    # execute_script could also be used to run a snippet of JavaScript:
    # feeds_crawler.execute_script('document.getElementsByClassName("W_btn_a btn_32px")[0].click()')

    # the second crawler needs to log in as well
    user_crawler.find_element_by_id('loginname').send_keys(username)
    user_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # perform click()
    user_crawler.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()


def dequeUrl():
    return cur_queue.popleft()      # return the queued URLs one at a time


def get_element_by_xpath(cur_driver, path):
    tried = 0
    while tried < 6:
        html = cur_driver.page_source
        res = etree.HTML(html)
        elements = res.xpath(path)
        if len(elements) == 0:
            tried += 1       # count the retry so the loop cannot spin forever
            time.sleep(1)
            continue
        return elements


def fetch_user(url):
    print('Downloading ' + url)
    feeds_crawler.get(url)
    time.sleep(5)
    # extract the account name
    account_name = get_element_by_xpath(feeds_crawler, '//h1')[0].text
    photo = get_element_by_xpath(feeds_crawler, '//p[@class="photo_wrap"]/img')[0].get('src')
    account_photo = re.findall('/([^/]+)$', photo)
    # extract the link to the user's follow list
    follows_link = get_element_by_xpath(feeds_crawler, '//a[@class="t_link S_txt1"]')[0].get('href')
    print('account: ' + account_name)
    print('follows link is ' + follows_link)
    follows_link = 'http:' + follows_link
    user_crawler.get(follows_link)

    feeds = []
    users = []
    # run the two extraction jobs in separate threads
    t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
    #t_users = threading.Thread(target=extract_user, name=None, args=(users,))
    t_feeds.setDaemon(True)
    #t_users.setDaemon(True)
    t_feeds.start()
    #t_users.start()
    t_feeds.join()
    #t_users.join()


def crawl():
    while True:
        url = dequeUrl()
        fetch_user(url)


def main():
    enqueueUrl(seed_user)
    Login(username, password)
    crawl()


if __name__ == '__main__':
    main()

Note: in Python 3, when hashing with hashlib, the content must first be encoded to bytes:

The hash is used to check whether a URL has already been crawled; the mmh library mentioned earlier could also be used (a sketch follows the transcript below).

>>> import hashlib
>>> url = 'http://www.weibo.com'
>>> md5v = hashlib.md5(url).hexdigest()
Traceback (most recent call last):
  File "<pyshell#4>", line 1, in <module>
    md5v = hashlib.md5(url).hexdigest()
TypeError: Unicode-objects must be encoded before hashing
>>> md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v2 = hashlib.md5(b'www.baidu.com').hexdigest()
>>> md5v
'17d7b29a31328702848d2d42ae79a240'
>>> md5v2
'dab19e82e1f9a681ee73346d3e7a575e'
>>> md5v3 = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v3
'17d7b29a31328702848d2d42ae79a240'
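
If the mmh library is preferred instead (assuming it refers to the mmh3 MurmurHash binding), the deduplication check could look like this sketch; seen_before is a hypothetical helper, not part of the script above.

import mmh3
from pybloom import BloomFilter

download_bf = BloomFilter(1024 * 1024 * 16, 0.01)

def seen_before(url):
    # mmh3.hash accepts a str directly, so no explicit encode() is needed
    h = mmh3.hash(url)
    if h in download_bf:
        return True
    download_bf.add(h)
    return False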

About Weibo images:



In practice, only the image file name stays constant on Weibo, while the hosting domain and the resolution segment of the URL may change. So when storing, it is enough to keep just the file name and reassemble the full URL when it is needed.
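
A minimal sketch of that idea; the host and size segment used below (wx1.sinaimg.cn, mw690) are placeholders assumed for illustration, while the file-name regex is the same one used in the scraper above.

import re

def image_name(src):
    # keep only the trailing file name, which is the stable part of the URL
    return re.findall('/([^/]+)$', src)[0]

def image_url(name, host='wx1.sinaimg.cn', size='mw690'):
    # rebuild a full URL from the stored file name when it is needed
    return 'https://{}/{}/{}'.format(host, size, name)

name = image_name('https://wx1.sinaimg.cn/mw690/abc123.jpg')
print(image_url(name))    # https://wx1.sinaimg.cn/mw690/abc123.jpg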


2. Analyzing the API


This was covered in an earlier post, so it will not be repeated here.

There are also plenty of examples online.
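
For completeness, a rough sketch of the API route through the mobile site; the m.weibo.cn getIndex endpoint, its containerid convention and the JSON layout are assumptions based on common usage rather than anything verified in this post.

import requests

def fetch_mblogs(uid, page=1):
    # assumed mobile-API endpoint and parameters; adjust if Weibo changes them
    url = 'https://m.weibo.cn/api/container/getIndex'
    params = {'type': 'uid', 'value': uid,
              'containerid': '107603' + str(uid), 'page': page}
    resp = requests.get(url, params=params, timeout=10)
    cards = resp.json().get('data', {}).get('cards', [])
    # each card with an 'mblog' entry is one post
    return [card['mblog']['text'] for card in cards if 'mblog' in card]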




