http://www.cnblogs.com/ghostr/p/5823191.html
There is still a lot of work to do to improve the performance, for example by implementing threading.
History is a SQLite database file that can easily be parsed with the sqlite3 module. You can browse the data with DB Browser for SQLite.
# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-10
# Latest edit: 2017-03-13
# Email: lancelotdev@163.com
#=============================
# Read baiduyun file links from chrome history file
"""
### Solution:
1. Point Chrome at a dedicated user-data directory and drive it with selenium
   so that it creates download tasks without completing the downloads.
2. Parse the History file inside that user-data directory to obtain the real
   resource links.

### Note:
1. Resource links are not de-duplicated.
2. A verification prompt appears after repeated visits; still to be investigated.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import time
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert

from FileItem import FileItem

user_data_dir_path = "d://userData"
options = webdriver.ChromeOptions()
options.add_argument("user-data-dir=%s" % user_data_dir_path)

# Link that is clicked on every share page.
# NOTE(review): presumably the page's download button -- confirm against the live page.
_TARGET_LINK_XPATH = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'

# Makes navigator.platform report 'sb_baidu' inside the page.
_PLATFORM_OVERRIDE_JS = (
    "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',"
    "{get:function(){return 'sb_baidu';}})"
)


# Travel all share url to get history.
def baiduyun_url_travel(share_url_list=None):
    """Open every share URL in Chrome and click the target link on each page.

    The browser runs with the profile in ``user_data_dir_path`` so that the
    visits end up in that profile's History file.

    Args:
        share_url_list: iterable of share-page URLs. ``None`` or an empty
            list makes the function return immediately without starting
            a browser.

    Raises:
        selenium.common.exceptions.NoSuchElementException: when the target
            link cannot be found on a page, even after the wait timed out.
    """
    # Check before launching Chrome: otherwise an empty list would start a
    # browser process that is never shut down.
    if not share_url_list:
        return

    driver = webdriver.Chrome(chrome_options=options)
    try:
        # Init the user data such as cookie so you won't need to request a url twice.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            driver.execute_script(_PLATFORM_OVERRIDE_JS)
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, _TARGET_LINK_XPATH))
                )
            except TimeoutException:
                # One last direct lookup; if this fails too, the real selenium
                # error propagates instead of an UnboundLocalError.
                element = driver.find_element(By.XPATH, _TARGET_LINK_XPATH)
            element.click()
            time.sleep(5)
    finally:
        # Always release the browser, including when a page fails.
        driver.quit()


# 2017-03-13 Liu Kun
# The 'History' file is a sqlite database.
# Some download links may jump to other urls which is clearly marked by Chrome
# and here I use the direct link without jumping.
# Seconds between the 1601-01-01 epoch used by Chrome timestamps and the
# Unix epoch (1970-01-01).
_WEBKIT_TO_UNIX_EPOCH_SECONDS = 11644473600


def _format_start_time(raw_start_time):
    """Return a download start time as a local 'YYYY-MM-DD HH:MM:SS' string."""
    value = int(raw_start_time)
    if value > _WEBKIT_TO_UNIX_EPOCH_SECONDS * 1000000:
        # Chrome stores times as microseconds since 1601-01-01; reading the
        # first ten digits as Unix seconds (the previous behaviour) yields a
        # date several years in the past.
        unix_seconds = value // 1000000 - _WEBKIT_TO_UNIX_EPOCH_SECONDS
    else:
        # Small values keep the legacy reading: first ten digits are Unix seconds.
        unix_seconds = int(str(value)[0:10])
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(unix_seconds))


def get_source_link_from_history(History_path):
    """Read the direct download links recorded in a Chrome History database.

    Only the first entry of each URL chain (``chain_index = 0``) is used,
    i.e. the link before any redirect. Chain entries without a matching row
    in ``downloads`` are skipped.

    Args:
        History_path: path to the Chrome ``History`` sqlite file.

    Returns:
        A list with one ``FileItem(...).make_dic()`` result per download,
        ordered by download id. Empty when nothing matches.
    """
    import sqlite3 as db
    from contextlib import closing

    # One JOIN replaces the per-row lookup and the string-formatted SQL.
    sql = (
        "select c.url, d.current_path, d.start_time "
        "from downloads_url_chains as c "
        "join downloads as d on d.id = c.id "
        "where c.chain_index = 0 "
        "order by c.id"
    )
    # closing() guarantees the connection is released even if the query fails.
    with closing(db.connect(History_path)) as conn:
        rows = conn.execute(sql).fetchall()

    items = []
    for file_link, current_path, start_time in rows:
        # e.g. C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
        file_name = current_path.split('\\')[-1].replace('.crdownload', '')
        item = FileItem(file_name, file_link, _format_start_time(start_time))
        items.append(item.make_dic())
    return items


if __name__ == "__main__":
    # Movie: https://pan.baidu.com/s/1sl8litZ
    # App: https://pan.baidu.com/s/1o8K255K
    share_url = [
        "https://pan.baidu.com/s/1sl8litZ",
        "https://pan.baidu.com/s/1dFBr37F",
        "https://pan.baidu.com/s/1o8K255K",
    ]
    baiduyun_url_travel(share_url)
    History_path = os.path.join(user_data_dir_path, "Default", "History")
    items = get_source_link_from_history(History_path)
    import pprint
    pprint.pprint(items)