java网络爬虫

xiaoxiao2025-11-05 2

第一种方式是爬取本地的网页：先把网页保存在本地，然后再进行爬取。

核心代码：

// Core idea (sketch): scan the locally saved page line by line and print
// every substring that matches the pattern.
// `Regex` is a placeholder for the pattern string; it is not defined in this snippet.
// BufferedReader wraps a Reader, so the file is opened through FileReader.
BufferedReader buf = new BufferedReader(new FileReader("f:\\mail.html"));
try {
    Pattern p = Pattern.compile(Regex);
    String st;
    // readLine() returns null at end of input.
    while ((st = buf.readLine()) != null) {
        Matcher ma = p.matcher(st);
        // find() advances to the next match within the current line.
        while (ma.find()) {
            System.out.println(ma.group());
        }
    }
} finally {
    // Release the file handle even if reading fails.
    buf.close();
}

package com.wanhao;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demo: collects e-mail-like strings from a web page saved on the local disk
 * and prints them, one per line, to standard output.
 */
public class RegexDemo {

    /** Path of the locally saved page that is scanned. */
    private static final String MAIL_PAGE = "f:\\mail.html";

    /**
     * Word characters, an {@code @}, word characters, then one or more
     * {@code .word} groups. Compiled once and reused for every line.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Runs the demo and prints the stack trace if reading the page fails.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        try {
            regex_test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the saved page line by line, gathers every match of the e-mail
     * pattern, then prints the matches in the order they were found.
     * Matching is done per line, so a match cannot span a line break.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void regex_test() throws IOException {
        List<String> lis = new ArrayList<String>();

        // FileReader decodes with the platform default charset, as in the original listing.
        BufferedReader bufIn = new BufferedReader(new FileReader(MAIL_PAGE));
        try {
            String mail;
            // readLine() returns null at end of file.
            while ((mail = bufIn.readLine()) != null) {
                Matcher ma = MAIL_PATTERN.matcher(mail);
                while (ma.find()) {
                    lis.add(ma.group());
                }
            }
        } finally {
            // Release the file handle even if reading fails part-way.
            bufIn.close();
        }

        for (String s : lis) {
            System.out.println(s);
        }
    }
}

第二种方式是直接爬取网站上的邮箱信息：

基本思路是相同的，不同之处仅在于BufferedReader的获取方式。

// Sketch: obtain the reader from a live URL instead of a local file.
URL url = new URL("要爬的网址");
// openStream() yields the response bytes; InputStreamReader turns them into characters.
InputStreamReader pageReader = new InputStreamReader(url.openStream());
BufferedReader buf = new BufferedReader(pageReader);

package com.wanhao;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demo: downloads a web page and prints every e-mail-like string found in it,
 * one per line, to standard output.
 */
public class RegexDemo {

    /** Address of the page that is downloaded and scanned. */
    private static final String PAGE_URL =
            "http://tieba.baidu.com/p/4221284005?fr=ala0&pstaala=2&tpl=5";

    /**
     * Word characters, an {@code @}, word characters, then one or more
     * {@code .word} groups. Compiled once and reused for every line.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Runs the demo and prints the stack trace if the download fails.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        try {
            regex_test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens the page, reads it line by line, gathers every match of the
     * e-mail pattern, then prints the matches in the order they were found.
     * Matching is done per line, so a match cannot span a line break.
     *
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static void regex_test() throws IOException {
        List<String> lis = new ArrayList<String>();

        URL url = new URL(PAGE_URL);
        // InputStreamReader decodes with the platform default charset, as in the
        // original listing. NOTE(review): confirm this matches the page's encoding.
        BufferedReader bufIn = new BufferedReader(new InputStreamReader(url.openStream()));
        try {
            String mail;
            // readLine() returns null once the response has been fully read.
            while ((mail = bufIn.readLine()) != null) {
                Matcher ma = MAIL_PATTERN.matcher(mail);
                while (ma.find()) {
                    lis.add(ma.group());
                }
            }
        } finally {
            // Closing the reader also closes the underlying network stream.
            bufIn.close();
        }

        for (String s : lis) {
            System.out.println(s);
        }
    }
}

转载请注明原文地址: https://ju.6miu.com/read-1303879.html

最新回复(0)