Now let's talk about the code itself.
1. The script hits the mobile Taobao search page, and what comes back is actually JSON.
2. The core of the code is downloading pages from the web. Since Taobao may deploy anti-scraping measures, carrying cookies and building request headers is essential: disguise yourself as a real browser as much as possible (a short standalone sketch follows this list).
3. Write the content to an Excel file. How it gets written matters less; there is nothing remarkable about it. A practice project like this usually amounts to just downloading data.
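Before the full script, here is a minimal standalone sketch of the cookie-plus-headers setup from point 2. The User-Agent and Referer values are the ones the script below uses; the query URL is only a placeholder for illustration.

import http.cookiejar
import urllib.request

# Persist cookies across runs and send browser-like headers.
cj = http.cookiejar.MozillaCookieJar('cookie.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) '
                   'AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 '
                   'Mobile/8J2 Safari/6533.18.5'),
    ('Referer', 'http://s.m.taobao.com'),
]
# Placeholder request, just to show the opener in action.
html_bytes = opener.open('http://s.m.taobao.com/search?q=test').read()
cj.save(ignore_discard=True, ignore_expires=True)  # keep cookies for the next run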
But there is still one step from data to information, and that step is a trade secret. However much data you collect, if you cannot turn it into information, it remains nothing more than practice.
All right, on to the code.
# -*- coding:utf-8 -*-
import urllib.request, urllib.parse, http.cookiejar
import os, time, re
from PIL import Image
import json
from openpyxl import Workbook

# Collect all files under rootdir whose names end with the given suffix
def listfiles(rootdir, prefix='.xml'):
    file = []
    for parent, _, filenames in os.walk(rootdir):
        if parent == rootdir:
            for filename in filenames:
                if filename.endswith(prefix):
                    file.append(rootdir + '/' + filename)
            return file

def writeexcel(path, dealcontent):
    workbook = Workbook()                      # build a Workbook object
    worksheet = workbook.create_sheet('1', 0)  # build a sheet; cell coordinates are 1-based
    for i in range(0, len(dealcontent)):
        for j in range(0, len(dealcontent[i])):
            if i != 0 and j == len(dealcontent[i]) - 1:
                if dealcontent[i][j] != '':
                    try:
                        worksheet.cell(row=i+1, column=j+1).value = dealcontent[i][j]  # write into the sheet
                    except:
                        pass
            else:
                if dealcontent[i][j]:
                    worksheet.cell(row=i+1, column=j+1).value = dealcontent[i][j].replace(' ', '')
    workbook.save(path)

# This is the core of the code
def getHtml(url, myProxy='', postdata={}):
    """
    Fetch a page with cookie support.
    url is the address; postdata holds the POST fields.
    """
    # Path of the cookie file
    filename = 'cookie.txt'
    # Declare a MozillaCookieJar instance that persists to the file
    cj = http.cookiejar.MozillaCookieJar(filename)
    # Load cookies from the file if it exists.
    # ignore_discard: keep cookies even if they are marked to be discarded
    # ignore_expires: keep cookies even if they have expired
    if os.path.exists(filename):
        cj.load(filename, ignore_discard=True, ignore_expires=True)
    # Build a handler that carries the cookies
    cookieHandler = urllib.request.HTTPCookieProcessor(cj)
    if myProxy:  # enable proxy support
        # Using a proxy requires a ProxyHandler
        proxyHandler = urllib.request.ProxyHandler({'http': 'http://' + myProxy})
        print('Proxy ' + myProxy + ' enabled')
        opener = urllib.request.build_opener(proxyHandler, cookieHandler)
    else:
        opener = urllib.request.build_opener(cookieHandler)
    # Attach headers to the opener
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'),
                         ('Referer', 'http://s.m.taobao.com'),
                         ('Host', 'h5.m.taobao.com')]
    # Install the opener
    urllib.request.install_opener(opener)
    # If there is data to POST
    if postdata:
        # URL-encode the data
        postdata = urllib.parse.urlencode(postdata)
        html_bytes = urllib.request.urlopen(url, postdata.encode()).read()
    else:
        html_bytes = urllib.request.urlopen(url).read()
    # Save cookies back to the file
    cj.save(ignore_discard=True, ignore_expires=True)
    return html_bytes

# Strip characters that are illegal in Windows file names
def validateTitle(title):
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(rstr, "", title)
    return new_title

# Create a folder, including its parents
def makeFolder(path):
    try:
        os.makedirs(path)
    except:
        print('Directory already exists: ' + path)

if __name__ == '__main__':
    # Working directories
    dataDir = './data'
    imageDir = './image'
    makeFolder(dataDir)

    # Form parameters
    keyword = r'卡包'
    orderType = 1    # 1: by sales, 2: price low to high, 3: price high to low, 4: by seller rating, 5: default ranking
    pageNum = 10     # number of pages to fetch
    waitSeconds = 4  # pause after each fetch
    isGetImage = 1   # 1: download images, 2: skip images

    # Build the form
    postdata = {}
    postdata['event_submit_do_new_search_auction'] = 1
    postdata['search'] = '提交查询'
    postdata['_input_charset'] = 'utf-8'
    postdata['topSearch'] = 1
    postdata['atype'] = 'b'
    postdata['searchfrom'] = 1
    postdata['action'] = 'home:redirect_app_action'
    postdata['from'] = 1
    postdata['q'] = keyword
    postdata['sst'] = 1
    postdata['n'] = 20
    postdata['buying'] = 'buyitnow'
    postdata['m'] = 'api4h5'
    postdata['abtest'] = 16
    postdata['wlsort'] = 16
    postdata['style'] = 'list'
    postdata['closeModues'] = 'nav,selecthot,onesearch'
    if orderType == 1:
        postdata['sort'] = '_sale'
    elif orderType == 2:
        postdata['sort'] = 'bid'
    elif orderType == 3:  # the original repeated "orderType == 2" here; 3 matches the comment above
        postdata['sort'] = '_bid'
    elif orderType == 4:
        postdata['sort'] = '_ratesum'

    # Fetch each page
    for page in range(0, pageNum):
        postdata['page'] = page
        taobaoUrl = "http://s.m.taobao.com/search?"
        try:
            content1 = getHtml(taobaoUrl, '', postdata)
            # This is mobile Taobao, so the response is a JSON document
            file = open(dataDir + '/' + str(page) + '.json', 'wb')
            file.write(content1)
            file.close()  # the original never closed the file
        except Exception as e:
            if hasattr(e, 'code'):
                print('Page does not exist or the request took too long.')
                print('Error code:', e.code)
            elif hasattr(e, 'reason'):
                print('Could not reach the host.')
                print('Reason: ', e.reason)
            else:
                print(e)
        time.sleep(waitSeconds)
        print('Pausing for ' + str(waitSeconds) + ' seconds')

    files = listfiles(dataDir, '.json')
    total = [['Page', 'Shop', 'Item title', 'Discounted price', 'Ships from', 'Comments',
              'Original price', 'Units sold', 'Promotion type', 'Buyers', 'Coin discount',
              'URL', 'Image URL', 'Image'], ]
    for filename in files:
        try:
            doc = open(filename, 'rb')
            doccontent = doc.read().decode('utf-8', 'ignore')
            product = doccontent.replace(' ', '').replace('\n', '')
            product = json.loads(product)
            onefile = product['listItem']
        except:
            print('Could not parse ' + filename)
            continue
        for item in onefile:
            itemlist = [filename, item['nick'], item['title'], item['price'],
                        item['location'], item['commentCount']]
            itemlist.append(item['originalPrice'])
            itemlist.append(item['sold'])
            itemlist.append(item['zkType'])
            itemlist.append(item['act'])
            itemlist.append(item['coinLimit'])
            itemlist.append('http:' + item['url'])
            picpath = item['pic_path'].replace('60x60', '720x720')  # ask for the large image
            itemlist.append(picpath)
            if isGetImage == 1:
                if not os.path.exists(imageDir):
                    makeFolder(imageDir)
                # quote() escapes the colon as %3A; put it back
                url = urllib.parse.quote(picpath).replace('%3A', ':')
                urllib.request.urlcleanup()
                try:
                    pic = urllib.request.urlopen(url)
                    picno = time.strftime('%H%M%S', time.localtime())
                    filenamep = imageDir + '/' + picno + validateTitle(item['nick'] + '-' + item['title'])
                    filenamepp = filenamep + '.jpeg'
                    sfilename = filenamep + 's.jpeg'
                    filess = open(filenamepp, 'wb')  # save the downloaded image
                    filess.write(pic.read())
                    filess.close()
                    img = Image.open(filenamepp)  # reopen it as an image
                    w, h = img.size
                    size = w // 6, h // 6  # integer division: thumbnail() wants int sizes
                    img.thumbnail(size, Image.ANTIALIAS)
                    img.save(sfilename, 'jpeg')
                    itemlist.append(sfilename)
                    print('Fetched image: ' + sfilename)
                except Exception as e:
                    if hasattr(e, 'code'):
                        print('Page does not exist or the request took too long.')
                        print('Error code:', e.code)
                    elif hasattr(e, 'reason'):
                        print('Could not reach the host.')
                        print('Reason: ', e.reason)
                    else:
                        print(e)
                    itemlist.append('')
            else:
                itemlist.append('')
            total.append(itemlist)

    if len(total) > 1:
        writeexcel(keyword + '淘宝手机商品.xlsx', total)
    else:
        print('Nothing was fetched')
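A quick way to check that the crawl worked is to load one of the saved pages and print a few fields. A minimal sketch, assuming the script above has already written ./data/0.json; the field names are the same ones the script reads.

import json

# Load one saved page of mobile Taobao search results.
with open('./data/0.json', 'rb') as f:
    page = json.loads(f.read().decode('utf-8', 'ignore'))

# Print shop, title, and price for the first three products.
for item in page['listItem'][:3]:
    print(item['nick'], item['title'], item['price'])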
