[Crawler] Get the real file link of BaiduYun shared by user with Chrome

来源：互联网发布：知乎回答被删除编辑：程序博客网时间：2024/06/11 05:00

Get the real file link of BaiduYun shared by user.

with Python 2.7 + Selenium + Chrome driver

We finally arrived at a viable approach after several unsatisfactory attempts, one of which is described here:

http://www.cnblogs.com/ghostr/p/5823191.html

There is still a lot of work left to improve the performance, for example by implementing threading.
Chrome's History file is an SQLite database that can easily be parsed with the sqlite3 module. You can browse its data with DB Browser for SQLite.

# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-10
# Latest edit: 2017-03-13
# Email: lancelotdev@163.com
#=============================
# Read baiduyun file links from chrome history file
"""
### Solution:
    1. Point Chrome at a dedicated user-data directory and drive it with
       selenium so that it creates download tasks without finishing them.
    2. Parse the History file inside that user-data directory to obtain the
       real resource links.

### Note:
1. Resource links are not de-duplicated.
2. After repeated visits a verification step shows up; still to be studied.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import pprint
import sqlite3
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from FileItem import FileItem

user_data_dir_path = "d://userData"
options = webdriver.ChromeOptions()
options.add_argument("user-data-dir=%s" % user_data_dir_path)

# XPath of the link that is clicked on every share page to start a download.
_DOWNLOAD_LINK_XPATH = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'

# Script run on every share page: it overrides navigator.platform so that it
# reports 'sb_baidu'.
_PLATFORM_OVERRIDE_JS = "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',{get:function(){return 'sb_baidu';}})"

# Suffix Chrome appends to the on-disk name of an unfinished download.
_PARTIAL_DOWNLOAD_SUFFIX = '.crdownload'

# Seconds between 1601-01-01 (the epoch of Chrome's internal timestamps) and
# 1970-01-01 (the Unix epoch).
_WEBKIT_EPOCH_OFFSET_SECONDS = 11644473600


# Travel all share url to get history.
def baiduyun_url_travel(share_url_list=None):
    """Open every share URL in Chrome and click its download link.

    The downloads only need to be *started*: Chrome then records the real
    file URL in the History database of ``user_data_dir_path``.

    :param share_url_list: list of BaiduYun share URLs; ``None`` or an empty
        list makes the function return immediately without starting Chrome.
    :raises selenium.common.exceptions.WebDriverException: if a page cannot
        be driven, e.g. the download link is not found. The browser is shut
        down before the exception propagates.
    """
    if not share_url_list:
        # Nothing to visit, so do not launch a browser that nobody would quit.
        return

    driver = webdriver.Chrome(chrome_options=options)
    try:
        # Init the user data such as cookie so you won't need to request a
        # url twice.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            driver.execute_script(_PLATFORM_OVERRIDE_JS)
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, _DOWNLOAD_LINK_XPATH))
                )
            except TimeoutException:
                # Last immediate attempt. If this fails too, the genuine
                # selenium error propagates instead of an unbound-name error.
                element = driver.find_element_by_xpath(_DOWNLOAD_LINK_XPATH)
            element.click()
            # Give Chrome time to register the download before moving on.
            time.sleep(5)
    finally:
        # Always release the browser, also when a page fails half-way.
        driver.quit()


def _download_file_name(current_path):
    """Return the file name of a Windows download path without '.crdownload'."""
    # C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
    file_name = current_path.split('\\')[-1]
    if file_name.endswith(_PARTIAL_DOWNLOAD_SUFFIX):
        file_name = file_name[:-len(_PARTIAL_DOWNLOAD_SUFFIX)]
    return file_name


def _chrome_time_to_local_string(chrome_time):
    """Format a ``downloads.start_time`` value as local 'YYYY-mm-dd HH:MM:SS'.

    Chrome stores the value as microseconds since 1601-01-01 UTC. A value too
    small to be such a timestamp is taken to be plain Unix seconds.
    """
    seconds = int(chrome_time)
    if seconds >= _WEBKIT_EPOCH_OFFSET_SECONDS * 10 ** 6:
        seconds = seconds // 10 ** 6 - _WEBKIT_EPOCH_OFFSET_SECONDS
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(seconds))


# 2017-03-13  Liu Kun
# The 'History' file is a sqlite database.
# Some download links may jump to other urls which is clearly marked by Chrome
# and here I use the direct link without jumping.
def get_source_link_from_history(History_path):
    """Read the download links recorded in a Chrome History database.

    Only the first URL of every download chain (``chain_index = 0``) is used.
    Chain entries without a matching row in ``downloads`` are skipped.

    :param History_path: path of the Chrome 'History' sqlite file.
    :return: list of dicts as produced by ``FileItem.make_dic()``.
    :raises sqlite3.Error: if the file cannot be read as the expected database.
    """
    conn = sqlite3.connect(History_path)
    try:
        cursor = conn.cursor()
        chain_rows = cursor.execute(
            "select id, chain_index, url from downloads_url_chains where chain_index=0"
        ).fetchall()

        items = []
        for download_id, _, file_link in chain_rows:
            # Bound parameter instead of string formatting.
            file_info = cursor.execute(
                "select current_path, start_time from downloads where id=?",
                (int(download_id),),
            ).fetchone()
            if not file_info:
                continue
            current_path, time_stamp = file_info
            item = FileItem(
                _download_file_name(current_path),
                file_link,
                _chrome_time_to_local_string(time_stamp),
            )
            items.append(item.make_dic())
        return items
    finally:
        conn.close()


if __name__ == "__main__":
    # Movie:https://pan.baidu.com/s/1sl8litZ
    # App:https://pan.baidu.com/s/1o8K255K
    share_url = ["https://pan.baidu.com/s/1sl8litZ", "https://pan.baidu.com/s/1dFBr37F", "https://pan.baidu.com/s/1o8K255K"]
    baiduyun_url_travel(share_url)
    History_path = os.path.join(user_data_dir_path, "Default", "History")
    items = get_source_link_from_history(History_path)
    pprint.pprint(items)

FileItem.py:

# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-13
# Latest edit: 2017-03-13
#=============================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import pprint


class FileItem(object):
    """Plain record describing one file link.

    Derives from ``object`` so that it is a new-style class on Python 2 too.
    """

    def __init__(self, file_name="", file_link="", catch_time=""):
        """Store the three fields.

        :param file_name: name of the file.
        :param file_link: link of the file.
        :param catch_time: time string; kept in the ``file_time`` attribute.
        """
        self.file_name = file_name
        self.file_link = file_link
        self.file_time = catch_time

    def __repr__(self):
        return "FileItem(file_name=%r, file_link=%r, file_time=%r)" % (
            self.file_name,
            self.file_link,
            self.file_time,
        )

    def make_dic(self):
        """Return the record as a dict with keys 'file_name', 'link', 'time'."""
        return {
            "file_name": self.file_name,
            "link": self.file_link,
            "time": self.file_time,
        }


if __name__ == "__main__":
    pass

Document links：

Selenium-Python

0 0