1. Install Python 3.3.
2. Install pip.
3. Install the bs4 and lxml packages.
Install bs4: pip install bs4, or run the bs4 installer executable.
Install lxml: see http://blog.csdn.net/qq_23438131/article/details/52222489
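A quick way to confirm both packages are importable before moving on (a minimal check, not part of the original post):

import bs4
from bs4 import BeautifulSoup
from lxml import etree

print(bs4.__version__)     # prints the bs4 version if the install succeeded
print(etree.LXML_VERSION)  # lxml version tuple
print(BeautifulSoup("<p>ok</p>", "lxml").p.text)  # "ok" means bs4 can drive the lxml parser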
4. Control the encoding. Note that the snippet below is the Python 2 idiom: reload(sys) and sys.setdefaultencoding() no longer exist in Python 3, where str is already Unicode and UTF-8 is the default source encoding, so on Python 3.3 the # coding declaration alone is enough:

#coding:utf-8
import sys
reload(sys)                      # Python 2 only
sys.setdefaultencoding('utf-8')  # removed in Python 3
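On Python 3, which the rest of this walkthrough targets, encoding is instead handled explicitly at the I/O boundaries. A minimal sketch of the two places it matters in this spider, decoding HTTP responses and opening output files:

import urllib.request

# Decode the raw response bytes explicitly; errors="replace" avoids crashes on bad bytes.
raw = urllib.request.urlopen("http://www.baidu.com").read()
html = raw.decode("utf-8", errors="replace")

# Declare the encoding when opening output files, as the script below does for links2.txt.
with open("links2.txt", "wt", encoding="utf-8") as f:
    f.write(html[:100])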
5. Import bs4:

import bs4
from bs4 import BeautifulSoup as bs

6. Search Baidu News by keyword:

#coding:utf-8
import bs4
from bs4 import BeautifulSoup as bs
import urllib.parse
import urllib.request
import functools
import re
import time
from time import sleep
#import socket
#socket.setdefaulttimeout(3)


class BaiduSpider(object):
    def __init__(self, word, max_link):
        self._word = word
        self._max_link = max_link
        p = {"word": word}
        self._start_url = "http://www.news.baidu.com/ns?" + urllib.parse.urlencode(p)

    def _get_links(self):
        links = []
        links.append(self._start_url)
        try:
            soup = bs(self._get_html(self._start_url), "lxml")
            links_tag = soup.select("#page")
        except AttributeError as e_Att:
            print(e_Att)
            time.sleep(10)
            return self._get_links()
        if 0 != len(links_tag):
            links_tag = links_tag[0]
            # get the second-page link
            for child in links_tag.children:
                attr = child.attrs
                if attr:
                    links.append("http://www.news.baidu.com" + attr["href"])
                    break
            # build the links for results 20 to 800 by rewriting the pn offset
            for i in range(20, 810, 10):
                link_temp = links[1].__str__()
                PatternObj = re.compile('&pn=(\\d)+?&')
                newLink = PatternObj.subn('&pn=' + str(i) + '&', link_temp)
                links.append(str(newLink[0]))
        end = self._max_link if self._max_link < len(links) else len(links)
        return links[:end]

    def _rightTime(self, summary):
        '''
        Check whether the time in the summary falls between 2016-06-01 and now.
        Example summary formats:
            中国基金网 14小时前
            网易新闻 2016年08月12日 16:35
        '''
        # convert 2016-06-01 to a timestamp
        try:
            startDate_str = '2016-06-01'
            startTime = time.mktime(time.strptime(startDate_str, '%Y-%m-%d'))
            a = summary.split()
            time_in_text = a[1]
            if '年' in time_in_text:
                time_in_text = time_in_text.split(" ")[0]
                time_in_text = time_in_text.replace("年", '-').replace("月", '-').replace("日", '')
                textTime = time.mktime(time.strptime(time_in_text, '%Y-%m-%d'))
                if (float(textTime)) <= (float(startTime)):
                    return False
            return True
        except ValueError:
            print(time_in_text)

    def _get_html(self, link):
        res = urllib.request.urlopen(link)
        return res.read().decode("utf-8")

    def _get_html_Content_post(self, link, f_error, retries):
        print(link, 'open the link using the post method:', time.time())
        html_content = ''
        try:
            request = urllib.request.Request(link)
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:
            # the spider is stuck or hit another exception: try again, up to 3 times
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries - 1)
        print('close:', time.time())
        return html_content

    def _get_html_Content(self, link, f_error, retries=2):
        print(link, '\n', 'open the link:', time.time())
        html_content = ''
        try:
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            request = urllib.request.Request(link)
            request.add_header('User-Agent', user_agent)
            # timeout=2
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:
            # the spider is stuck or hit another exception: fall back to the "post" opener
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries=3)
        print('close:', time.time())
        return html_content

    def _get_content(self, content):
        # first convert the bs4.element.NavigableString items to plain strings
        return functools.reduce(lambda x, y: x + y,
                                map(lambda x: x.replace("<em>", "").replace("</em>", ""),
                                    map(lambda x: x.string, content)))

    def _spiderDetail(self, link, f_error, Verbdic):
        '''
        input: link, f_error
        output: the paragraphs of the page that contain one of the target verbs
        The URLs gathered in the first step lead to news pages on many different
        sites, so there is no single common structure to target. Most news sites
        put the article text inside <p> tags, so the content is extracted as follows.
        '''
        html_content = self._get_html_Content(link, f_error, retries=2)
        contents = ''
        if html_content != '':
            soup = bs(html_content, "lxml")
            #reg = u".+?带领"
            #Res = re.compile(reg)
            #contents = soup.findAll(name="p", text=Res)
            contents = '<p>'
            iter = []
            nodes_p = soup.find_all(name='p')
            for n in nodes_p:
                p_cont = n.get_text(strip=True)
                for ver in Verbdic:
                    if ver in p_cont:
                        iter.append(p_cont)
                        break
            contents = contents.join(iter)
        return contents

    def _spider(self, f, f_error, Verbdic):
        '''
        On the Baidu news list pages, search for news by keyword and collect each
        item's title, source and time, link, and the text of the linked page.
        '''
        total_links = self._get_links()
        print(total_links)
        for i, l in enumerate(total_links):
            print("Page {0}".format(i + 1))
            soup = bs(self._get_html(l), "lxml")
            # find the root node of the left-hand content column
            left_div = soup.select("#content_left")[0]
            # each matching child div is one item in the news result list
            for child_div in left_div.children:
                if isinstance(child_div, bs4.element.Tag) and child_div.div \
                        and child_div.div.get('class') and 'result' in child_div.div['class']:
                    base_div = child_div
                    childs = base_div.children
                    for child in childs:
                        title = child.select(".c-title")[0]
                        summary = ""
                        summary = summary.join(self._get_content(child.select(".c-summary")[0].p.contents))
                        a_link = title.a["href"]
                        titlename = ""
                        titlename = titlename.join(self._get_content(title.a.contents))
                        # crawl the news content page itself
                        content = ''
                        if self._rightTime(summary):
                            content = self._spiderDetail(a_link, f_error, Verbdic)
                        f.write('标题:' + titlename + '\t来源及时间:' + summary +
                                '\t链接:' + a_link + '\t新闻内容:' + content + "\n")

    def start(self, f, f_error, Verbdic):
        self._spider(f, f_error, Verbdic)


if '__main__' == __name__:
    '''
    f stores the crawl results
    f_error stores the links whose news content could not be read
    '''
    Verbdic = ['协同', '协助']
    with open("links2.txt", 'wt', encoding='utf-8') as f, \
            open("logError2.txt", 'wt') as f_error, \
            open("overVerb.txt", 'wt') as f_over:
        for keyword in Verbdic:
            baidu_spider = BaiduSpider(keyword, 800)
            baidu_spider.start(f, f_error, Verbdic)
            f_over.write(keyword + '\n')
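The least obvious step in _get_links is how pages beyond the second are produced: the second-page URL is copied and its &pn= result offset is rewritten with re.subn. A standalone sketch of just that rewrite (the sample URL here is made up for illustration):

import re

second_page = "http://www.news.baidu.com/ns?word=test&pn=10&cl=2"  # hypothetical page-2 link

pattern = re.compile('&pn=(\\d)+?&')  # the same pattern the spider compiles
for offset in range(20, 60, 10):      # the spider itself goes up to 800 in steps of 10
    rewritten, count = pattern.subn('&pn=' + str(offset) + '&', second_page)
    print(rewritten)                  # ...&pn=20&..., ...&pn=30&..., ...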
7. Spider problems:
1. The Python program hangs. Likely causes: the linked site blocks crawlers, the wrong GET/POST method is used, network trouble, and so on.
Solution 1: pose as a browser by sending a User-Agent header:
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
request = urllib.request.Request(link)
request.add_header('User-Agent', user_agent)
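Note that the headers dict above is built but never actually used; urllib.request.Request also accepts it directly, so building and attaching the header can be collapsed into one call (a small self-contained sketch; the URL is a placeholder):

import urllib.request

link = "http://www.news.baidu.com"  # placeholder URL
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# Passing headers= to the constructor is equivalent to calling add_header() afterwards.
request = urllib.request.Request(link, headers={'User-Agent': user_agent})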
Solution 2: retry after a timeout:

try:
    request = urllib.request.Request(link)
    res = urllib.request.urlopen(request, timeout=3)
    html_content = res.read()
except Exception as e:
    # the spider is stuck or hit another exception: fall back to the "post" opener
    print(link + '\n')
    print(e)
    f_error.write(link + '\n')
    if retries:
        return self._get_html_Content_post(link, f_error, retries=3)

Solution 3: if the page still cannot be opened while posing as a browser (i.e., the GET request above fails), fall back to what the script calls the "post" method. (Strictly speaking, urllib.request.Request without a data argument still sends a GET; this fallback simply retries without the custom User-Agent.)
def _get_html_Content_post(self, link, f_error, retries):
    print(link, 'open the link using the post method:', time.time())
    html_content = ''
    try:
        request = urllib.request.Request(link)
        res = urllib.request.urlopen(request, timeout=3)
        html_content = res.read()
    except Exception as e:
        # the spider is stuck or hit another exception: try again, up to 3 times
        print(link + '\n')
        print(e)
        f_error.write(link + '\n')
        if retries:
            return self._get_html_Content_post(link, f_error, retries - 1)
    print('close:', time.time())
    return html_content
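Besides the per-request timeout=3, the commented-out socket lines at the top of the script hint at one more guard worth knowing: a process-wide default timeout, which also covers any library code that never exposes a timeout parameter. A minimal sketch:

import socket
import urllib.error
import urllib.request

socket.setdefaulttimeout(3)  # every socket created from now on times out after 3 seconds

try:
    html = urllib.request.urlopen("http://www.news.baidu.com").read()
except (socket.timeout, urllib.error.URLError) as e:
    print("request timed out or failed; log the link and retry, as the spider does:", e)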
