Python爬虫实例

    xiaoxiao2021-03-25  228

    # -*- coding: utf-8 -*-
"""Crawl a Baidu Tieba forum and save selected threads as local HTML files.

Starting from the thread-list page ``url_mian``, every listed thread whose
link markup mentions one of a few keywords is downloaded in "original poster
only" mode (``?see_lz=1``), followed through all of its pages, and written to
``<out_dir>/<title>.html`` as a one-column table of posts.

NOTE(review): every pattern below mirrors the Tieba markup the original
script was written against, and the copy of the script this was restored from
had its whitespace collapsed.  Confirm the patterns against a live page
before relying on them.
"""
import os
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from html import escape as _html_escape
from html import unescape as _html_unescape

# Entry URL: first page of the forum's thread listing.
url_mian = 'http://tieba.baidu.com/f?kw=剑网3&fr=index&fp=0&ie=utf-8&red_tag=q3464037905'

_SITE_ROOT = 'https://tieba.baidu.com'
_BROWSER_UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
# Characters left untouched when percent-encoding a URL ('%' keeps existing
# escapes from being encoded twice).
_URL_SAFE_CHARS = "%/:=&?~#+!$,;'@()*[]"
# A thread is crawled only when its link markup contains one of these.
_TITLE_KEYWORDS = ('818', '八一八', '树洞', '回忆')
# Pause after each saved thread so the site is not hammered.
_CRAWL_DELAY_SECONDS = 2

_THREAD_LINK_RE = re.compile(r'a href=(.*?) class="j_th_tit "')
_NEXT_LISTING_RE = re.compile(
    r'a href="([^"]*)"[^<>]* class="next pagination-item "')
_PREV_PAGE_RE = re.compile(r'a href="[^"]*">上一页')
_NEXT_PAGE_RE = re.compile(r'a href="([^"]*)">下一页')
_TITLE_RE = re.compile(r'h3 class="core_title_txt[^"]*"\s*title="([^"]*)"')
_POST_RE = re.compile(
    r'<div id="[^"]*"\s+class="d_post_content\s+j_d_post_content\s*">'
    r'(.*?)</div>',
    re.S)
_UNSAFE_FILE_CHARS_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')

# Thread URLs that have been crawled so far.
Urls = []
# Posts of the thread currently being crawled: one list per page.
Contents = []
# Title of the thread currently being crawled.
title = ""


def _build_request(url):
    """Return a Request for *url* that carries a browser User-Agent.

    Non-ASCII characters are percent-encoded as UTF-8 first, because
    ``urlopen`` rejects URLs that are not pure ASCII.
    """
    safe_url = urllib.parse.quote(url, safe=_URL_SAFE_CHARS)
    return urllib.request.Request(safe_url,
                                  headers={'User-agent': _BROWSER_UA})


def _next_listing_url(page, base_url):
    """Return the absolute URL of the next thread-list page, or None."""
    match = _NEXT_LISTING_RE.search(page)
    if match is None:
        return None
    # The href may be scheme-relative ("//host/..."); resolve it.
    return urllib.parse.urljoin(base_url, match.group(1))


def _next_thread_url(page):
    """Return the absolute URL behind a thread page's next link, or None."""
    match = _NEXT_PAGE_RE.search(page)
    if match is None:
        return None
    return urllib.parse.urljoin(_SITE_ROOT, match.group(1))


def _thread_title(page, page_url):
    """Return the thread title found in *page*.

    Falls back to the last path segment of *page_url* (or ``'untitled'``)
    when the title heading is not found.
    """
    match = _TITLE_RE.search(page)
    if match is not None:
        return _html_unescape(match.group(1))
    fallback = urllib.parse.urlsplit(page_url).path.rsplit('/', 1)[-1]
    return fallback or 'untitled'


def _safe_file_name(name):
    """Return *name* with characters that are invalid in file names replaced."""
    cleaned = _UNSAFE_FILE_CHARS_RE.sub('_', name).strip(' .')
    return cleaned or 'untitled'


def download(url, num_retries=2, user_agent='wswp'):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address to fetch; may contain non-ASCII characters.
        num_retries: How many more times to retry after an HTTP 5xx answer.
        user_agent: Accepted for backward compatibility only; the request
            always carries the browser User-Agent defined in this module.

    Returns:
        The page text, or None when the page cannot be fetched or is not
        valid UTF-8.
    """
    print("Downloading:", url)
    try:
        with urllib.request.urlopen(_build_request(url)) as response:
            html = response.read().decode('utf-8')
    except UnicodeDecodeError:
        return None
    except urllib.error.URLError as e:
        print('Downloading error', e.reason)
        # Only HTTPError carries a status code; retry server-side failures.
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return download(url, num_retries - 1, user_agent)
        return None
    except Exception as e:
        # Best effort: one bad page (malformed URL, dropped connection)
        # must not abort the whole crawl.
        print('Downloading error', e)
        return None
    print('网页下载完成')
    return html


def getUrl(url):
    """Crawl the thread list starting at *url* and follow its pagination.

    Every thread link found on a page is handed to ``splicContent``.  Each
    listing page is downloaded once, and an already visited page ends the
    walk so a self-referencing "next" link cannot loop forever.
    """
    visited = set()
    while url is not None and url not in visited:
        visited.add(url)
        content = download(url)
        if content is None:
            if url in Urls:
                Urls.remove(url)
            return
        print('Download is success')
        for link in _THREAD_LINK_RE.findall(content):
            splicContent(link)
        print(Urls)
        url = _next_listing_url(content, url)
        if url is None:
            print('已经没有下一页')


def splicContent(i):
    """Crawl the thread described by one captured link, if it matches.

    Args:
        i: Text captured between ``a href=`` and the thread-link class, i.e.
            the quoted href followed by the remaining attributes.

    Returns:
        The module-level ``Urls`` list.
    """
    print('开始拆分......')
    parts = i.split(' ')
    if len(parts) < 2:
        return Urls
    titles = parts[1]
    # NOTE(review): the keyword filter is assumed to guard the whole crawl
    # step; the recovered source had lost its indentation.
    if any(keyword in titles for keyword in _TITLE_KEYWORDS):
        # parts[0] is the quoted href; [1:-1] drops the quotes.
        url = _SITE_ROOT + parts[0][1:-1] + '?see_lz=1'
        print('开始创建文件.....')
        lookInUrl(url)
        if url not in Urls:
            Urls.append(url)
    return Urls


def findNext(url):
    """Download the listing page *url* and return its next-page URL.

    Returns:
        The absolute URL of the next listing page, or None when the page
        cannot be downloaded or has no next page.
    """
    page = download(url)
    if page is None:
        return None
    next_url = _next_listing_url(page, url)
    if next_url is None:
        print('已经没有下一页')
    return next_url


def lookInUrl(url):
    """Crawl one thread from *url* through its last page and save it.

    A page without a previous-page link is treated as the start of a thread:
    ``Contents`` is cleared and the global ``title`` is refreshed.  The
    thread is written once, after the last reachable page.

    Returns:
        None.
    """
    global title
    visited = set()
    fetched_any = False
    page_url = url
    while page_url is not None and page_url not in visited:
        visited.add(page_url)
        page = download(page_url)
        if page is None:
            break
        fetched_any = True
        if _PREV_PAGE_RE.search(page) is None:
            # First page: drop the posts left over from the previous thread.
            Contents.clear()
            title = _thread_title(page, page_url)
            print(title)
        posts = _POST_RE.findall(page)
        print(posts)
        Contents.append(posts)
        page_url = _next_thread_url(page)
    if fetched_any:
        writer(title, Contents)
        time.sleep(_CRAWL_DELAY_SECONDS)
    return None


def Excepts(url):
    """Collect the posts of *url* with BeautifulSoup instead of regexes.

    BeautifulSoup detects the page encoding itself, so this also works for
    pages ``download`` rejects as non-UTF-8.  The posts are appended to
    ``Contents`` and the following pages are crawled through ``getContent``.
    Requires the third-party ``bs4`` package.
    """
    import bs4  # imported here so the regex path does not need bs4

    print(url)
    with urllib.request.urlopen(_build_request(url)) as response:
        soup = bs4.BeautifulSoup(response, 'html.parser')
    # Match on a single class: bs4 compares a multi-class string against the
    # normalised attribute value, so a trailing space would never match.
    content = soup.find_all('div', class_='d_post_content')
    print(content)
    Contents.append(content)
    getContent(str(soup))


def getAllExcepts(url):
    """Print every pager element of *url* that offers a next-page link.

    Requires the third-party ``bs4`` package.
    """
    import bs4  # imported here so the regex path does not need bs4

    with urllib.request.urlopen(_build_request(url)) as response:
        soup = bs4.BeautifulSoup(response, 'html.parser')
    pagers = soup.find_all(
        'li', {'class': 'l_pager pager_theme_5 pb_list_pager'})
    for pager in pagers:
        if '下一页' in pager.get_text():
            print(pager)


def getContent(html):
    """Crawl the pages that follow the thread page *html*.

    Args:
        html: Markup of a thread page, as text.

    Returns:
        The module-level ``Contents`` list.
    """
    next_url = _next_thread_url(html)
    if next_url is not None:
        lookInUrl(next_url)
    return Contents


def writer(title, content, out_dir='E:/J3/'):
    """Write the collected posts to ``<out_dir>/<title>.html``.

    Args:
        title: Thread title; used for the ``<title>`` element and, with
            characters invalid in file names replaced, for the file name.
        content: Iterable of pages, each an iterable of posts.
        out_dir: Target directory; created when missing.

    Failures to create or write the file are reported and otherwise ignored
    so one bad thread does not stop the crawl.
    """
    path = os.path.join(out_dir, _safe_file_name(title) + '.html')
    # NOTE: posts are markup taken verbatim from the remote page and are
    # written unescaped on purpose; open the result as untrusted HTML.
    rows = []
    for page in content:
        for post in page:
            rows.append('<tr><td>%s</td></tr>' % post)
    document = (
        '<!DOCTYPE html><html><head><meta charset="UTF-8">'
        '<title>%s</title></head><body><table>%s</table></body></html>'
        % (_html_escape(title), ''.join(rows))
    )
    try:
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        print('开始写入表格')
        with open(path, 'w', encoding='utf-8') as f:
            f.write(document)
    except OSError as e:
        print('Write failed:', path, e)
        return
    print('写入完成')


if __name__ == '__main__':
    # A single thread can be crawled with, e.g.:
    #   lookInUrl('https://tieba.baidu.com/p/4509181593?see_lz=1')
    getUrl(url_mian)

# Author's note from the original post: I have only been learning Python for
# a few days and wrote this as soon as I got to crawlers; everything lives in
# one file for now (QwQ) and will be refactored later.

    转载请注明原文地址: https://ju.6miu.com/read-20715.html

    最新回复(0)