因为找到的大部分教程都是python2下的,所以自己撸了个python3的。
Python 版本为 3.5,编辑器为 PyCharm,使用 urllib 和正则表达式。代码地址:https://github.com/BladeXunGe/test
简单的框架如下:
# -*- coding: utf-8 -*-
import re
import urllib
import urllib.request

# Image links in the page source look like src="https://img....jpg".
_IMAGE_URL_PATTERN = re.compile(r'src="(https://img.*?\.jpg)"')


def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Errors raised by ``urllib.request.urlopen`` are propagated unchanged.
    """
    request = urllib.request.Request(url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_image(html):
    """Save every image linked from *html* into the current directory.

    Args:
        html: Page source, either the bytes returned by ``download_page``
            or an already-decoded ``str``.

    The images are written as ``1.jpg``, ``2.jpg``, ... in the order their
    links appear in the page. Returns ``None``.
    """
    if isinstance(html, bytes):
        # Match against the decoded text rather than repr(bytes), which
        # would leave escape sequences in the captured links.
        html = html.decode('utf-8', errors='replace')
    for num, img in enumerate(_IMAGE_URL_PATTERN.findall(html), start=1):
        # Report the number of the picture actually being saved.
        print('downloding pic%s' % num)
        image = download_page(img)
        # No trailing space in the file name, so the extension stays valid.
        with open('%s.jpg' % num, 'wb') as fp:
            fp.write(image)


if __name__ == '__main__':
    url = 'https://tieba.baidu.com/p/1181591427'
    html = download_page(url)
    get_image(html)

# Limitation: this version only crawls a single page of the thread.
然后做了调整,使其可以爬取多页:
# -*- coding: utf-8 -*-
import re
import urllib
import urllib.request

# Matches the quoted .jpg link that directly precedes pic_ext="jpeg".
# Excluding quotes and whitespace keeps the capture inside one attribute.
_IMAGE_URL_PATTERN = re.compile(r'"([^"\s]*\.jpg)" pic_ext="jpeg"')


def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Errors raised by ``urllib.request.urlopen`` are propagated unchanged.
    """
    request = urllib.request.Request(url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_image(html, x):
    """Download every image linked from *html*, numbering files from *x*.

    Args:
        html: Page source, either the bytes returned by ``download_page``
            or an already-decoded ``str``.
        x: Number used for the first saved file; each image is written to
            ``<number>.jpg`` in the current directory.

    Returns:
        The number to use for the next image, i.e. *x* plus the number of
        images saved, so numbering can continue across pages.
    """
    if isinstance(html, bytes):
        # Match against the decoded text rather than repr(bytes), which
        # would leave escape sequences in the captured links.
        html = html.decode('utf-8', errors='replace')
    imlist = _IMAGE_URL_PATTERN.findall(html)
    print(imlist)
    for i in imlist:
        print(i)
        print(x)
        urllib.request.urlretrieve(i, '%s.jpg' % x)
        x += 1
    return x


if __name__ == '__main__':
    x = 1
    url = 'https://tieba.baidu.com/p/1181591427?pn='
    # The page range 1..27 is hard-coded for this particular thread.
    for k in range(1, 28):
        ul = url + str(k)
        print(ul)
        # Fetch the numbered page (ul), not the bare base URL.
        html = download_page(ul)
        # Call get_image once per page so each picture is downloaded once.
        x = get_image(html, x)

# Note: two different ways of saving to disk have been used so far
# (open()/write() and urlretrieve()). To add features such as a
# configurable save path and interactive input, the storage code was
# rewritten afterwards to make saving easier.
# -*- coding: utf-8 -*-
import os
import re
import urllib
import urllib.request

# Image links in the page source look like src="https://img....jpg".
_IMAGE_URL_PATTERN = re.compile(r'src="(https://img.*?\.jpg)"')


def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Errors raised by ``urllib.request.urlopen`` are propagated unchanged.
    """
    request = urllib.request.Request(url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_image(html, x, path='.'):
    """Download every image linked from *html* into *path*.

    Args:
        html: Page source, either the bytes returned by ``download_page``
            or an already-decoded ``str``.
        x: Number used for the first saved file; each image is written to
            ``<number>.jpg``.
        path: Directory the images are written to. Defaults to the
            current directory.

    Returns:
        The number to use for the next image, i.e. *x* plus the number of
        images saved, so numbering can continue across pages.
    """
    if isinstance(html, bytes):
        # Match against the decoded text rather than repr(bytes), which
        # would leave escape sequences in the captured links.
        html = html.decode('utf-8', errors='replace')
    imlist = _IMAGE_URL_PATTERN.findall(html)
    print(imlist)
    for img in imlist:
        # Report the number of the picture actually being saved.
        print('downloding pic%s' % x)
        image = download_page(img)
        # os.path.join works whether or not the directory ends with a
        # separator; the file name carries no trailing space.
        with open(os.path.join(path, '%s.jpg' % x), 'wb') as fp:
            fp.write(image)
        x += 1
    return x


if __name__ == '__main__':
    # An empty answer falls back to the current directory.
    path = input('please enter your place like D:/imags/') or '.'
    # Creates missing parent directories and accepts an existing one.
    os.makedirs(path, exist_ok=True)
    url = input('please enter your url like https://tieba.baidu.com/p/1181591427?pn=')
    x = 1
    # The page range 1..27 is hard-coded.
    for k in range(1, 28):
        ul = url + str(k)
        print(ul)
        # Fetch the numbered page (ul), not the bare base URL.
        html = download_page(ul)
        # Call get_image once per page so each picture is downloaded once.
        x = get_image(html, x, path)

# TODO: multithreading still needs to be added; left for later.
# The script that follows is a Zhihu image crawler; it is incomplete but
# barely usable.
# -*- coding: utf-8 -*-
import os
import re
import urllib
import urllib.request

# Captures the link of every <img src="http..."> tag in the page source.
_IMAGE_URL_PATTERN = re.compile(r'img src="(http.*?)"')


def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Errors raised by ``urllib.request.urlopen`` are propagated unchanged.
    """
    request = urllib.request.Request(url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_image(html, path='.'):
    """Download every image linked from *html* into *path*.

    Args:
        html: Page source, either the bytes returned by ``download_page``
            or an already-decoded ``str``.
        path: Directory the images are written to. Defaults to the
            current directory.

    The images are written as ``1.jpg``, ``2.jpg``, ... in the order their
    links appear in the page. Returns ``None``.
    """
    if isinstance(html, bytes):
        # Match against the decoded text rather than repr(bytes), which
        # would leave escape sequences in the captured links.
        html = html.decode('utf-8', errors='replace')
    for num, img in enumerate(_IMAGE_URL_PATTERN.findall(html), start=1):
        # Report the number of the picture actually being saved.
        print('downloding pic%s' % num)
        image = download_page(img)
        # NOTE(review): every file gets a .jpg extension even though the
        # pattern accepts any http link -- confirm whether other image
        # formats need their own extension.
        with open(os.path.join(path, '%s.jpg' % num), 'wb') as fp:
            fp.write(image)


if __name__ == '__main__':
    # An empty answer falls back to the current directory.
    path = input('please enter your place like D:/imags/') or '.'
    # Creates missing parent directories and accepts an existing one.
    os.makedirs(path, exist_ok=True)
    url = 'https://www.zhihu.com/question/34378366'
    html = download_page(url)
    get_image(html, path)

# TODO: the question link is hard-coded and cannot be entered
# interactively yet; to be solved later.