Java Web Crawler

    xiaoxiao · 2025-11-05

    The first approach crawls a local copy of a web page: save the page to the local disk first, then run the extraction over that file.

    Core code:

    BufferedReader buf = new BufferedReader(new FileReader("page.html"));
    Pattern p = Pattern.compile(regex);
    String st;
    while ((st = buf.readLine()) != null) {      // read the saved page line by line
        Matcher ma = p.matcher(st);
        while (ma.find()) {
            System.out.println(ma.group());      // print each match on the current line
        }
    }

    package com.wanhao;

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class RegexDemo {

        public static void main(String[] args) {
            try {
                regex_test();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Read the locally saved HTML file line by line and collect every e-mail address.
        public static void regex_test() throws IOException {
            BufferedReader bufIn = new BufferedReader(new FileReader("f:\\mail.html"));
            String regex = "\\w+@\\w+(\\.\\w+)+";       // simple e-mail pattern
            List<String> lis = new ArrayList<String>();
            Pattern p = Pattern.compile(regex);
            String mail = null;
            while ((mail = bufIn.readLine()) != null) { // one line of HTML at a time
                Matcher ma = p.matcher(mail);
                while (ma.find()) {                     // every match on the current line
                    lis.add(ma.group());
                }
            }
            bufIn.close();
            for (String s : lis) {
                System.out.println(s);
            }
        }
    }
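    As a quick sanity check of the pattern, the fragment below (a minimal sketch reusing the Pattern/Matcher imports above; the sample line and addresses are made up purely for illustration) runs the same regex against a hand-written string. The file and URL versions differ only in where the lines come from.

    // Minimal sketch: the input line and addresses are invented for illustration only.
    String regex = "\\w+@\\w+(\\.\\w+)+";
    String line = "<p>contact: alice@example.com or bob@mail.example.org</p>";
    Matcher ma = Pattern.compile(regex).matcher(line);
    while (ma.find()) {
        System.out.println(ma.group()); // prints alice@example.com, then bob@mail.example.org
    }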

    The second approach extracts e-mail addresses directly from a live web page:

    The basic idea is the same; the only difference is how the BufferedReader is obtained.

    URL url = new URL("the URL to crawl");
    BufferedReader buf = new BufferedReader(new InputStreamReader(url.openStream()));

    package com.wanhao;

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class RegexDemo {

        public static void main(String[] args) {
            try {
                regex_test();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Open the page over HTTP and collect every e-mail address found in its source.
        public static void regex_test() throws IOException {
            URL url = new URL("http://tieba.baidu.com/p/4221284005?fr=ala0&pstaala=2&tpl=5");
            BufferedReader bufIn = new BufferedReader(new InputStreamReader(url.openStream()));
            String regex = "\\w+@\\w+(\\.\\w+)+";       // simple e-mail pattern
            List<String> lis = new ArrayList<String>();
            Pattern p = Pattern.compile(regex);
            String mail = null;
            while ((mail = bufIn.readLine()) != null) { // one line of HTML at a time
                Matcher ma = p.matcher(mail);
                while (ma.find()) {
                    lis.add(ma.group());
                }
            }
            bufIn.close();
            for (String s : lis) {
                System.out.println(s);
            }
        }
    }
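    In practice, url.openStream() sends no User-Agent header and the reader above decodes the response with the platform default charset, so some servers refuse the request and Chinese pages may come back garbled. The fragment below is a hedged sketch of one way to harden just the connection step; the User-Agent string, timeouts, and UTF-8 charset are assumptions rather than part of the original code (the page could equally be GBK-encoded), and it additionally needs the java.net.URLConnection and java.nio.charset.StandardCharsets imports.

    // Sketch of a more defensive way to obtain the BufferedReader (header, timeouts,
    // and charset are assumptions; adjust them to the target site).
    URL url = new URL("http://tieba.baidu.com/p/4221284005?fr=ala0&pstaala=2&tpl=5");
    URLConnection conn = url.openConnection();
    conn.setRequestProperty("User-Agent", "Mozilla/5.0"); // some servers reject Java's default agent
    conn.setConnectTimeout(5000);                         // fail fast instead of hanging
    conn.setReadTimeout(5000);
    BufferedReader bufIn = new BufferedReader(
            new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));
    // The regex loop is unchanged from the program above.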

    Please credit the original source when reposting: https://ju.6miu.com/read-1303879.html