[Crawler] Get the real file link of BaiduYun shared by user with Chrome

xiaoxiao2021-03-25 96

Get the real file link of BaiduYun shared by user.

with Python 2.7 + Selenium + Chrome driver

We finally arrived at a viable approach after several unsatisfactory attempts, one of which is:

http://www.cnblogs.com/ghostr/p/5823191.html

There is still a lot of work to do to improve performance, for example implementing threading.

Chrome's History file is a SQLite database that can easily be parsed with the sqlite3 module. You can browse the data with DB Browser for SQLite.

# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-10
# Latest edit: 2017-03-13
# Email: lancelotdev@163.com
#=============================
# Read baiduyun file links from chrome history file
"""
### Approach:
1. Point Chrome at a dedicated user-data directory and drive it with selenium
   so that it creates download tasks, without letting the downloads finish.
2. Parse the History file inside that user-data directory to obtain the real
   resource links.

### Note:
1. Resource links are not de-duplicated.
2. A verification challenge shows up after repeated visits; still to be
   investigated.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import time
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert

from FileItem import FileItem

# Chrome profile directory; the History database is read from here afterwards.
user_data_dir_path = "d://userData"
options = webdriver.ChromeOptions()
options.add_argument("user-data-dir=%s" % user_data_dir_path)


# Travel all share url to get history.
def baiduyun_url_travel(share_url_list=None):
    """Open every share URL in Chrome and click its download link.

    Each click makes Chrome record a download in the profile's History
    database, which is what the rest of this script reads.

    Args:
        share_url_list: Iterable of BaiduYun share page URLs. ``None`` or an
            empty list is a no-op and no browser is started.

    Raises:
        selenium.common.exceptions.WebDriverException: If a page cannot be
            driven, e.g. the download link is not found. The browser is
            closed before the exception propagates.
    """
    if not share_url_list:
        # Checked before launching Chrome so an empty call leaks no browser.
        return

    download_link_xpath = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'
    # Overrides the navigator.platform getter to return 'sb_baidu'.
    # NOTE(review): presumably this makes the page offer a plain browser
    # download -- confirm against the site's current behaviour.
    js_str = "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',{get:function(){return 'sb_baidu';}})"

    # NOTE(review): `chrome_options=` is kept as in the original; confirm the
    # keyword against the installed selenium version.
    driver = webdriver.Chrome(chrome_options=options)
    try:
        # Init the user data such as cookie so you won't need to request a url twice.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            driver.execute_script(js_str)
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_link_xpath))
                )
            except TimeoutException:
                # One last direct lookup; if this fails too, the lookup error
                # propagates instead of clicking an unbound or stale element.
                element = driver.find_element(By.XPATH, download_link_xpath)
            element.click()
            # Give Chrome time to register the download in History.
            time.sleep(5)
    finally:
        # Always close the browser, even when a page could not be processed.
        driver.quit()


# 2017-03-13 Liu Kun
# The 'History' file is a sqlite database.
# Some download links may jump to other urls which is clearly marked by Chrome
# and here I use the direct link without jumping.
# Seconds between 1601-01-01 (the epoch Chrome uses for its timestamps) and
# 1970-01-01 (the Unix epoch).
_WEBKIT_TO_UNIX_EPOCH_SECONDS = 11644473600
# Anything larger than this cannot be a plausible Unix timestamp in seconds,
# so it is treated as Chrome's microsecond format.
_MICROSECOND_THRESHOLD = 10 ** 11
_PARTIAL_DOWNLOAD_SUFFIX = ".crdownload"


def _format_start_time(raw_start_time):
    """Return a download start time as a local 'YYYY-MM-DD HH:MM:SS' string."""
    seconds = int(raw_start_time)
    if seconds > _MICROSECOND_THRESHOLD:
        # Microseconds since 1601-01-01 -> seconds since 1970-01-01.
        seconds = seconds // 1000000 - _WEBKIT_TO_UNIX_EPOCH_SECONDS
    # Clamp so a bogus pre-1970 value cannot make time.localtime() fail.
    seconds = max(seconds, 0)
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(seconds))


def _download_file_name(current_path):
    """Return the file name of a download path without a '.crdownload' suffix."""
    import ntpath

    # ntpath splits on both '\\' and '/', so paths from any platform work.
    # C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
    file_name = ntpath.basename(current_path or "")
    if file_name.endswith(_PARTIAL_DOWNLOAD_SUFFIX):
        file_name = file_name[:-len(_PARTIAL_DOWNLOAD_SUFFIX)]
    return file_name


def get_source_link_from_history(History_path):
    """Read the download links recorded in a Chrome History database.

    Only the first URL of every download chain (``chain_index = 0``) is used,
    i.e. the link before any redirect.

    Args:
        History_path: Path to the History SQLite file.

    Returns:
        A list with one ``FileItem.make_dic()`` dict per download that has a
        matching row in the ``downloads`` table; empty if there are none.

    Raises:
        sqlite3.Error: If the file cannot be opened or queried.
    """
    import sqlite3
    from contextlib import closing

    # A single join replaces one extra query per download; chain rows without
    # a matching download are dropped, as before.
    sql = (
        "select c.url, d.current_path, d.start_time "
        "from downloads_url_chains as c "
        "join downloads as d on d.id = c.id "
        "where c.chain_index = 0 "
        "order by c.id"
    )

    # closing() guarantees the connection is released even if a query fails.
    with closing(sqlite3.connect(History_path)) as conn:
        rows = conn.execute(sql).fetchall()

    items = []
    for file_link, current_path, raw_start_time in rows:
        file_name = _download_file_name(current_path)
        start_time = _format_start_time(raw_start_time)
        item = FileItem(file_name, file_link, start_time)
        items.append(item.make_dic())
    return items


if __name__ == "__main__":
    # Movie:https://pan.baidu.com/s/1sl8litZ
    # App:https://pan.baidu.com/s/1o8K255K
    share_url = ["https://pan.baidu.com/s/1sl8litZ",
                 "https://pan.baidu.com/s/1dFBr37F",
                 "https://pan.baidu.com/s/1o8K255K"]
    baiduyun_url_travel(share_url)

    History_path = os.path.join(user_data_dir_path, "Default", "History")
    items = get_source_link_from_history(History_path)

    import pprint
    pprint.pprint(items)

FileItem.py:

# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-13
# Latest edit: 2017-03-13
#=============================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import pprint


class FileItem(object):
    """Plain record describing one downloaded file.

    Attributes are set directly from the constructor arguments; nothing is
    validated or converted.
    """

    def __init__(self, file_name="", file_link="", catch_time=""):
        """Store the file name, its link and the time it was captured.

        Args:
            file_name: Name of the file.
            file_link: Link the file was downloaded from.
            catch_time: Time associated with the item; stored as
                ``file_time``.
        """
        self.file_name = file_name
        self.file_link = file_link
        self.file_time = catch_time

    def make_dic(self):
        """Return the item as a dict with keys 'file_name', 'link', 'time'."""
        return {
            "file_name": self.file_name,
            "link": self.file_link,
            "time": self.file_time,
        }


if __name__ == "__main__":
    pass

Document links：

Selenium-Python

转载请注明原文地址: https://ju.6miu.com/read-22926.html

技术

最新回复(0)