Now let's talk about the code itself.
1. The script hits the mobile Taobao search page, and what comes back is actually JSON.
2. The core of the code is downloading pages from the web. Since Taobao may deploy anti-scraping measures, carrying cookies and building request headers is essential: disguise yourself as a real browser as much as possible (a short standalone sketch follows this list).
3. Write the content to an Excel file. How it gets written matters less; there is nothing remarkable about it. A practice project like this usually amounts to just downloading data.
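Before the full script, here is a minimal standalone sketch of the cookie-plus-headers setup from point 2. The User-Agent and Referer values are the ones the script below uses; the query URL is only a placeholder for illustration.

import http.cookiejar
import urllib.request

# Persist cookies across runs and send browser-like headers.
cj = http.cookiejar.MozillaCookieJar('cookie.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) '
                   'AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 '
                   'Mobile/8J2 Safari/6533.18.5'),
    ('Referer', 'http://s.m.taobao.com'),
]
# Placeholder request, just to show the opener in action.
html_bytes = opener.open('http://s.m.taobao.com/search?q=test').read()
cj.save(ignore_discard=True, ignore_expires=True)  # keep cookies for the next run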
But there is still one step from data to information, and that step is a trade secret. However much data you collect, if you cannot turn it into information, it remains nothing more than practice.
All right, on to the code.
# -*- coding:utf-8 -*-
import urllib.request, urllib.parse, http.cookiejar
import os, time, re
from PIL import Image
import json
from openpyxl import Workbook

# Collect all files under rootdir whose names end with the given suffix
def listfiles(rootdir, prefix='.xml'):
    file = []
    for parent, _, filenames in os.walk(rootdir):
        if parent == rootdir:
            for filename in filenames:
                if filename.endswith(prefix):
                    file.append(rootdir + '/' + filename)
            return file

def writeexcel(path, dealcontent):
    workbook = Workbook()                      # build a Workbook object
    worksheet = workbook.create_sheet('1', 0)  # build a sheet; cell coordinates are 1-based
    for i in range(0, len(dealcontent)):
        for j in range(0, len(dealcontent[i])):
            if i != 0 and j == len(dealcontent[i]) - 1:
                if dealcontent[i][j] != '':
                    try:
                        worksheet.cell(row=i+1, column=j+1).value = dealcontent[i][j]  # write into the sheet
                    except:
                        pass
            else:
                if dealcontent[i][j]:
                    worksheet.cell(row=i+1, column=j+1).value = dealcontent[i][j].replace(' ', '')
    workbook.save(path)

# This is the core of the code
def getHtml(url, myProxy='', postdata={}):
    """
    Fetch a page with cookie support.
    url is the address; postdata holds the POST fields.
    """
    # Path of the cookie file
    filename = 'cookie.txt'
    # Declare a MozillaCookieJar instance that persists to the file
    cj = http.cookiejar.MozillaCookieJar(filename)
    # Load cookies from the file if it exists.
    # ignore_discard: keep cookies even if they are marked to be discarded
    # ignore_expires: keep cookies even if they have expired
    if os.path.exists(filename):
        cj.load(filename, ignore_discard=True, ignore_expires=True)
    # Build a handler that carries the cookies
    cookieHandler = urllib.request.HTTPCookieProcessor(cj)
    if myProxy:  # enable proxy support
        # Using a proxy requires a ProxyHandler
        proxyHandler = urllib.request.ProxyHandler({'http': 'http://' + myProxy})
        print('Proxy ' + myProxy + ' enabled')
        opener = urllib.request.build_opener(proxyHandler, cookieHandler)
    else:
        opener = urllib.request.build_opener(cookieHandler)
    # Attach headers to the opener
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'),
                         ('Referer', 'http://s.m.taobao.com'),
                         ('Host', 'h5.m.taobao.com')]
    # Install the opener
    urllib.request.install_opener(opener)
    # If there is data to POST
    if postdata:
        # URL-encode the data
        postdata = urllib.parse.urlencode(postdata)
        html_bytes = urllib.request.urlopen(url, postdata.encode()).read()
    else:
        html_bytes = urllib.request.urlopen(url).read()
    # Save cookies back to the file
    cj.save(ignore_discard=True, ignore_expires=True)
    return html_bytes

# Strip characters that are illegal in Windows file names
def validateTitle(title):
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(rstr, "", title)
    return new_title

# Create a folder, including its parents
def makeFolder(path):
    try:
        os.makedirs(path)
    except:
        print('Directory already exists: ' + path)

if __name__ == '__main__':
    # Working directories
    dataDir = './data'
    imageDir = './image'
    makeFolder(dataDir)

    # Form parameters
    keyword = r'卡包'
    orderType = 1    # 1: by sales, 2: price low to high, 3: price high to low, 4: by seller rating, 5: default ranking
    pageNum = 10     # number of pages to fetch
    waitSeconds = 4  # pause after each fetch
    isGetImage = 1   # 1: download images, 2: skip images

    # Build the form
    postdata = {}
    postdata['event_submit_do_new_search_auction'] = 1
    postdata['search'] = '提交查询'
    postdata['_input_charset'] = 'utf-8'
    postdata['topSearch'] = 1
    postdata['atype'] = 'b'
    postdata['searchfrom'] = 1
    postdata['action'] = 'home:redirect_app_action'
    postdata['from'] = 1
    postdata['q'] = keyword
    postdata['sst'] = 1
    postdata['n'] = 20
    postdata['buying'] = 'buyitnow'
    postdata['m'] = 'api4h5'
    postdata['abtest'] = 16
    postdata['wlsort'] = 16
    postdata['style'] = 'list'
    postdata['closeModues'] = 'nav,selecthot,onesearch'
    if orderType == 1:
        postdata['sort'] = '_sale'
    elif orderType == 2:
        postdata['sort'] = 'bid'
    elif orderType == 3:  # the original repeated "orderType == 2" here; 3 matches the comment above
        postdata['sort'] = '_bid'
    elif orderType == 4:
        postdata['sort'] = '_ratesum'

    # Fetch each page
    for page in range(0, pageNum):
        postdata['page'] = page
        taobaoUrl = "http://s.m.taobao.com/search?"
        try:
            content1 = getHtml(taobaoUrl, '', postdata)
            # This is mobile Taobao, so the response is a JSON document
            file = open(dataDir + '/' + str(page) + '.json', 'wb')
            file.write(content1)
            file.close()  # the original never closed the file
        except Exception as e:
            if hasattr(e, 'code'):
                print('Page does not exist or the request took too long.')
                print('Error code:', e.code)
            elif hasattr(e, 'reason'):
                print('Could not reach the host.')
                print('Reason: ', e.reason)
            else:
                print(e)
        time.sleep(waitSeconds)
        print('Pausing for ' + str(waitSeconds) + ' seconds')

    files = listfiles(dataDir, '.json')
    total = [['Page', 'Shop', 'Item title', 'Discounted price', 'Ships from', 'Comments',
              'Original price', 'Units sold', 'Promotion type', 'Buyers', 'Coin discount',
              'URL', 'Image URL', 'Image'], ]
    for filename in files:
        try:
            doc = open(filename, 'rb')
            doccontent = doc.read().decode('utf-8', 'ignore')
            product = doccontent.replace(' ', '').replace('\n', '')
            product = json.loads(product)
            onefile = product['listItem']
        except:
            print('Could not parse ' + filename)
            continue
        for item in onefile:
            itemlist = [filename, item['nick'], item['title'], item['price'],
                        item['location'], item['commentCount']]
            itemlist.append(item['originalPrice'])
            itemlist.append(item['sold'])
            itemlist.append(item['zkType'])
            itemlist.append(item['act'])
            itemlist.append(item['coinLimit'])
            itemlist.append('http:' + item['url'])
            picpath = item['pic_path'].replace('60x60', '720x720')  # ask for the large image
            itemlist.append(picpath)
            if isGetImage == 1:
                if not os.path.exists(imageDir):
                    makeFolder(imageDir)
                # quote() escapes the colon as %3A; put it back
                url = urllib.parse.quote(picpath).replace('%3A', ':')
                urllib.request.urlcleanup()
                try:
                    pic = urllib.request.urlopen(url)
                    picno = time.strftime('%H%M%S', time.localtime())
                    filenamep = imageDir + '/' + picno + validateTitle(item['nick'] + '-' + item['title'])
                    filenamepp = filenamep + '.jpeg'
                    sfilename = filenamep + 's.jpeg'
                    filess = open(filenamepp, 'wb')  # save the downloaded image
                    filess.write(pic.read())
                    filess.close()
                    img = Image.open(filenamepp)  # reopen it as an image
                    w, h = img.size
                    size = w // 6, h // 6  # integer division: thumbnail() wants int sizes
                    img.thumbnail(size, Image.ANTIALIAS)
                    img.save(sfilename, 'jpeg')
                    itemlist.append(sfilename)
                    print('Fetched image: ' + sfilename)
                except Exception as e:
                    if hasattr(e, 'code'):
                        print('Page does not exist or the request took too long.')
                        print('Error code:', e.code)
                    elif hasattr(e, 'reason'):
                        print('Could not reach the host.')
                        print('Reason: ', e.reason)
                    else:
                        print(e)
                    itemlist.append('')
            else:
                itemlist.append('')
            total.append(itemlist)

    if len(total) > 1:
        writeexcel(keyword + '淘宝手机商品.xlsx', total)
    else:
        print('Nothing was fetched')
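A quick way to check that the crawl worked is to load one of the saved pages and print a few fields. A minimal sketch, assuming the script above has already written ./data/0.json; the field names are the same ones the script reads.

import json

# Load one saved page of mobile Taobao search results.
with open('./data/0.json', 'rb') as f:
    page = json.loads(f.read().decode('utf-8', 'ignore'))

# Print shop, title, and price for the first three products.
for item in page['listItem'][:3]:
    print(item['nick'], item['title'], item['price'])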
