How to crawl with IP proxies

    import socket
    import time
    import urllib2
    from bs4 import BeautifulSoup

    url = 'http://www.xicidaili.com/nn/'
    target = "https://msdn.microsoft.com"  # not used here; see the validation sketch below
    dirt = {}  # collected proxies: IP -> port

    # Route requests through a proxy (replace with a proxy you control).
    proxy = {'http': '223.15.151.149:8888'}
    proxy_support = urllib2.ProxyHandler(proxy)
    # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)

    # Add a User-Agent header to mimic a browser and avoid 403 Forbidden responses.
    # i_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}

    # Give up on slow connections instead of hanging.
    socket.setdefaulttimeout(3)

    f = open("proxy.txt", "w")
    for i in range(1, 1504):
        new_url = url + str(i)
        print new_url
        time.sleep(3)  # throttle requests to avoid getting banned
        req = urllib2.Request(new_url, headers=i_headers)
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html, "html.parser")
        ips = soup.find_all('tr')
        for x in range(1, len(ips)):  # skip the table header row
            tds = ips[x].find_all("td")
            dirt[tds[1].text] = tds[2].text  # column 1: IP, column 2: port
            f.write(tds[1].text + ":" + tds[2].text + "\n")
        print len(dirt)
    f.close()
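The script above only harvests candidate proxies into proxy.txt; it defines `target` but never checks whether any of the collected addresses actually work. A minimal follow-up sketch (not part of the original post) that reads proxy.txt back and tries each entry against `target` through urllib2 could look like this; the `check_proxy` helper name and the 5-second timeout are assumptions for illustration:

    import urllib2

    target = "https://msdn.microsoft.com"
    i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}

    def check_proxy(address):
        # Hypothetical helper: True if `target` is reachable through the candidate proxy.
        # Register it for both schemes, since `target` is an https URL.
        handler = urllib2.ProxyHandler({'http': address, 'https': address})
        opener = urllib2.build_opener(handler)
        req = urllib2.Request(target, headers=i_headers)
        try:
            # A short timeout (5 s, arbitrary) skips dead proxies quickly.
            opener.open(req, timeout=5)
            return True
        except Exception:
            return False

    working = []
    with open("proxy.txt") as f:
        for line in f:
            address = line.strip()  # lines were written as "ip:port"
            if address and check_proxy(address):
                working.append(address)
                print address

    print "%d usable proxies" % len(working)

Free proxies from lists like xicidaili go stale quickly, so it is usually worth re-running a filter like this shortly before the actual crawl.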
