自定义一个pipeline
# Image download pipeline
class ImageDownloadPipeline(object):
    """Scrapy item pipeline that saves the images referenced by an item.

    Every URL in ``item["image_urls"]`` is fetched and written to a fixed
    directory as ``<img_index>.jpg``. ``img_index`` is a module-level counter
    that must be defined elsewhere in this file.
    """

    def process_item(self, item, spider):
        """Download each image URL of *item* and return the item unchanged.

        A URL that cannot be fetched, or whose response status is not 200,
        is reported and skipped; it never aborts the remaining downloads.

        Args:
            item: Mapping-like scraped item; ``image_urls`` holds the URLs.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` so later pipelines can keep processing it.
        """
        global img_index

        imgPath = "/home/abc/image"  # directory the downloaded images are saved to
        if not os.path.isdir(imgPath):
            # makedirs also creates missing parent directories.
            os.makedirs(imgPath)

        # Items without any image URLs simply pass through untouched.
        for url in item.get("image_urls") or ():
            print("下载:", url)

            # Failing to open the URL must skip it; otherwise `res` would be
            # undefined or still refer to the previous URL's response.
            try:
                res = urllib2.urlopen(url)
            except Exception:
                print('未下载成功:', url)
                continue

            try:
                # getcode() exists on both Python 2 and Python 3 responses.
                if res.getcode() != 200:
                    print('未下载成功:', url)
                    continue
                data = res.read()
            except Exception:
                print('未下载成功:', url)
                continue
            finally:
                # Always release the connection, even for skipped URLs.
                res.close()

            filename = os.path.join(imgPath, str(img_index) + '.jpg')
            with open(filename, 'wb') as f:
                f.write(data)
            print('下载完成\n')
            # Only successful downloads consume an index.
            img_index += 1

        return item
修改 settings.py 配置(注意:自定义的 pipeline 还需要在 ITEM_PIPELINES 中注册才会生效):
# Downloader middleware configuration.
# Each key is the import path of a middleware class; each value is its order.
DOWNLOADER_MIDDLEWARES = {
    # A value of None turns off the middleware registered under this path
    # (here: the built-in user-agent middleware).
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    # Custom middleware, placed at order 543.
    'tutorial.js_middleware.JavaScriptMiddleware': 543,
}