Java爬虫

    xiaoxiao2021-03-27  37

    一、原理

        首先创建HttpClient对象;如需发送GET请求,创建HttpGet对象并在构造时指定url,如需发送POST请求,则创建HttpPost对象。然后调用HttpClient的execute方法发送请求,并从返回的响应(response)中读取页面内容。

    二、小例子

    package com.xiang;

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;

    import org.apache.http.HttpEntity;
    import org.apache.http.client.config.CookieSpecs;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    /**
     * Minimal crawler example: downloads one page with Apache HttpClient and
     * hands the HTML text to a {@link BaiduParser} running on its own thread.
     */
    public class Spider {

        /** Timeout, in milliseconds, applied to every phase of the request. */
        private static final int TIMEOUT_MILLIS = 6000;

        /**
         * Fetches the configured URL once and starts a parser thread for the body.
         * Failures are reported on standard error instead of being silently dropped.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args) {
            // HttpClient timeout configuration. The socket timeout is set as well,
            // otherwise a server that stops sending data would block the read forever.
            RequestConfig requestConfig = RequestConfig.custom()
                    .setCookieSpec(CookieSpecs.STANDARD)
                    .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                    .setConnectTimeout(TIMEOUT_MILLIS)
                    .setSocketTimeout(TIMEOUT_MILLIS)
                    .build();

            // Put the address to crawl here. To crawl a site with numbered pages,
            // wrap the request below in a loop over the page numbers, and pause
            // between requests (e.g. Thread.sleep) so the site does not ban this IP.
            String url = "http://www.baidu.com";
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader(
                    "User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");

            // try-with-resources closes the response first and then the client,
            // releasing the pooled connections even when the request fails.
            try (CloseableHttpClient httpClient = HttpClients.custom()
                         .setDefaultRequestConfig(requestConfig)
                         .build();
                 CloseableHttpResponse response = httpClient.execute(httpGet)) {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    System.err.println("No response body received from " + url);
                    return;
                }
                String html = convertStreamToString(entity.getContent());
                new Thread(new BaiduParser(html)).start();
            } catch (IOException e) {
                System.err.println("Failed to fetch " + url + ": " + e);
            }
        }

        /**
         * Reads the whole stream as text and closes it.
         *
         * <p>The bytes are decoded as UTF-8 rather than with the platform default
         * charset, so the result does not depend on the machine running the crawler.
         * NOTE(review): pages served in another encoding (e.g. GBK) would need the
         * charset taken from the response's Content-Type header instead.
         *
         * @param in the stream to read; it is closed before this method returns
         * @return the stream content, with every line terminated by {@code "\n"}
         * @throws IOException if reading from the stream fails
         */
        private static String convertStreamToString(InputStream in) throws IOException {
            StringBuilder sb = new StringBuilder();
            // Closing the reader also closes the wrapped input stream.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(in, StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            }
            return sb.toString();
        }
    }

    package com.xiang; public class BaiduParser implements Runnable{ String html; public BaiduParser(String html) { this.html = html; } public void run() { System.out.println(html); //通过正则表达式或截取取得自己想要的内容 } }

    转载请注明原文地址: https://ju.6miu.com/read-664454.html

    最新回复(0)