本文介绍如何利用 Python 爬虫抓取日本女演员照片。遇到的最大问题是该网站使用了 Cloudflare 以及其他反爬策略来禁止爬虫抓取信息,导致 urllib 自带的 urlretrieve 函数无法使用;解决办法是改用 urllib2.Request 并附带浏览器的 User-Agent 请求头来获取页面和图片。其余部分都较为常规,故直接贴出代码。
# Scrape actress portrait thumbnails from the avmo.pw listing pages.
#
# urllib's urlretrieve cannot be used against this site, so every request is
# built by hand with a browser User-Agent header and the response body is
# written to disk manually.

import os
import re
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen API
    import urllib.request as urllib2

# Browser-like header sent with every request (page and image alike).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
# Seconds to wait on any single HTTP request.
TIMEOUT = 50
# Directory the portraits are written to.
SAVE_DIR = 'D:\\ImageDownload'

# <img> tags of interest carry an empty title attribute.
IMG_TAG_RE = re.compile(r'<img src="(.*?)" title="">')
# Portrait URL; the dots are escaped and the captured name may not contain a
# path separator, so a crafted URL cannot steer the output file elsewhere.
PORTRAIT_RE = re.compile(r'https://jp\.netcdn\.space/mono/actjpgs/([^/\\]+?)\.jpg')

BANNER = u"""
---------------------------------------
name : avmo_img
edition : 0.1
author : ultrakin
time : 2016-09-27
---------------------------------------
"""


def _fetch(url, timeout=TIMEOUT):
    """Return the raw response body of *url*, requested with HEADERS."""
    request = urllib2.Request(url, None, HEADERS)
    # closing() releases the connection even if read() raises.
    with closing(urllib2.urlopen(request, None, timeout)) as response:
        return response.read()


def getHtml(url1):
    """Download the page at *url1* and return its raw body.

    Args:
        url1: Absolute URL of the listing page.

    Returns:
        The response body exactly as read from the socket (bytes).

    Raises:
        urllib2.URLError (urllib.error.URLError on Python 3) or a socket
        timeout if the request fails or exceeds TIMEOUT seconds.
    """
    return _fetch(url1)


def getImg(html, save_dir=SAVE_DIR):
    """Save every portrait image referenced in *html* into *save_dir*.

    Each ``<img src="..." title="">`` tag is inspected; URLs that are not
    portrait URLs (see PORTRAIT_RE) are reported and skipped instead of
    aborting the whole run.

    Args:
        html: Page source, as text or as the bytes returned by getHtml.
        save_dir: Target directory; created on first use if missing.

    Returns:
        None.

    Raises:
        urllib2.URLError / socket timeout if an image download fails,
        IOError/OSError if the file cannot be written.
    """
    # On Python 3 getHtml yields bytes, which a text pattern cannot search.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    for imgurl in IMG_TAG_RE.findall(html):
        print(imgurl)
        match = PORTRAIT_RE.search(imgurl)
        if match is None:
            print('skipped (not a portrait URL)')
            continue

        name = match.group(1)
        print(name)
        picpath = os.path.join(save_dir, name + '.jpg')
        print(picpath)

        data = _fetch(imgurl)
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        with open(picpath, "wb") as out:
            out.write(data)


def main(start=1, end=2):
    """Scrape listing pages ``start`` .. ``end - 1`` and print the banner."""
    for page in range(start, end):
        url = "https://avmo.pw/cn/actresses/page/" + str(page)
        getImg(getHtml(url))
    print(BANNER)


if __name__ == "__main__":
    main()

# Result of running the program: