1. Code
'''
###########
Usage:
python download.py site.txt(containing https://...)
'''
import time

from pymouse import PyMouse
from selenium import webdriver

# Module-level mouse controller shared by every download() call; it is used
# to click at absolute screen coordinates, outside of what the WebDriver
# session itself is asked to do.
m = PyMouse()
def pause(length=1):
    """Block the current thread for ``length`` seconds (default: 1).

    Thin wrapper around ``time.sleep`` used to give the browser time to
    react between automation steps.
    """
    time.sleep(length)
def download(url, dst_type="Computer Vision", loading_time=60,
             bias=(254, 171)):
    """Open ``url`` in Firefox and trigger a download for each matching entry.

    The page is expected to contain parallel ``<dt>``/``<dd>`` lists (one
    pair per entry).  For every entry whose ``primary-subject`` text contains
    ``dst_type``, the entry's ``pdf`` link is opened, the element with id
    ``download`` is clicked, and then a real mouse click is sent to a fixed
    screen position before navigating back to the listing.

    Args:
        url: Address of the listing page to process.
        dst_type: Substring that must appear in an entry's primary subject
            for it to be downloaded.
        loading_time: Seconds to wait after opening a PDF link before
            clicking the ``download`` element.
        bias: ``(x, y)`` pixel offset added to the window centre to obtain
            the screen position of the mouse click.
            NOTE(review): presumably the location of the confirm button of
            the browser's save dialog on the author's screen -- confirm for
            your own setup.

    Raises:
        AssertionError: If the page has a different number of ``<dt>`` and
            ``<dd>`` elements.
    """
    b = webdriver.Firefox()
    try:
        b.maximize_window()
        pause(1)
        b.get(url)
        pause(2)

        dt = b.find_elements_by_tag_name('dt')
        dd = b.find_elements_by_tag_name('dd')
        assert len(dt) == len(dd), (
            "expected one <dd> per <dt>, got %d and %d" % (len(dt), len(dd)))

        window = b.get_window_size()
        print(window)

        # Placeholder switch: only the horizontal-screen layout is handled.
        screenIsVertical = False
        if screenIsVertical:
            print("No implement when screen is vertical")
            return

        # Floor division keeps the coordinates integral on Python 2 and 3.
        pos = [window['width'] // 2 + bias[0],
               window['height'] // 2 + bias[1]]

        # NOTE(review): the first four entries are skipped, as in the
        # original script; the reason is not visible here -- confirm.
        for i in range(4, len(dt)):
            subject = dd[i].find_element_by_class_name('primary-subject').text
            if dst_type not in subject:
                continue

            try:
                dt[i].find_element_by_link_text('pdf').click()
            except Exception:
                # Best effort: entries whose pdf link cannot be found or
                # clicked are skipped.
                continue

            pause(loading_time)
            b.find_element_by_id('download').click()
            pause(2)
            # Mouse click at the computed screen position (button 1, once).
            m.click(pos[0], pos[1], 1, 1)
            pause(1)
            b.back()
            pause(1)

            # Elements located before the navigation belong to the previous
            # page load, so look them up again after going back.
            dt = b.find_elements_by_tag_name('dt')
            dd = b.find_elements_by_tag_name('dd')
    finally:
        # Always end the browser session, including on the early return
        # above and on unexpected errors.
        b.quit()
def _read_urls(path):
    """Return the usable URL lines of the text file at ``path``.

    Surrounding whitespace is stripped from every line; blank lines and
    lines starting with ``#`` are dropped.
    """
    with open(path, 'r') as fid:
        stripped = (line.strip() for line in fid)
        return [line for line in stripped
                if line and not line.startswith('#')]


def main():
    """Command-line entry point: download from every URL in a list file.

    Expects exactly one command-line argument, the path of a text file with
    one URL per line.  With any other argument count the module docstring
    is printed as usage help and nothing else happens.  Blank lines and
    lines beginning with ``#`` are ignored; every remaining line is passed
    to ``download``.
    """
    import sys

    if len(sys.argv) != 2:
        print(__doc__)
        return

    # Read the whole list first so the file is closed before the (slow)
    # browser automation starts.
    for url in _read_urls(sys.argv[1]):
        download(url)
# Run the command-line entry point only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
2. Usage
python download.py site.txt
site.txt (example)
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=25&query_id=a6b6ed358647ff57
#https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=50&query_id=a6b6ed358647ff57
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=75&query_id=a6b6ed358647ff57
You can put # at the start of a line to make the script skip that URL.
Refer to this post for instructions on installing the requirements.
转载请注明原文地址: https://ju.6miu.com/read-660718.html