第一种方式是爬取本地的网页:先把网页保存在本地,然后再爬取。
核心代码:
// Core idea (sketch): scan the saved page line by line and print every regex match.
// `Regex` and `print` are placeholders for the pattern string and the output call.
// BufferedReader wraps a Reader, so the file is opened through a FileReader.
BufferedReader buf = new BufferedReader(new FileReader("f:\\mail.html"));
Pattern p = Pattern.compile(Regex);
String st;
// readLine() returns null at end of input, which also covers an empty file.
while ((st = buf.readLine()) != null) {
    Matcher ma = p.matcher(st);
    while (ma.find()) {
        print(ma.group());
    }
}
buf.close();
package com.wanhao;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demo that prints every e-mail-like token found in a locally saved HTML page.
 */
public class RegexDemo {

    /** Loose e-mail shape: word characters, '@', word characters, then one or more ".word" groups. */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Runs the demo and prints the stack trace if the page cannot be read.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            regex_test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the saved page line by line, collects every match of the mail
     * pattern, and then prints each match on its own line in order of appearance.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void regex_test() throws IOException {
        List<String> lis = new ArrayList<String>();
        // try-with-resources closes the reader even when readLine() throws.
        // NOTE(review): FileReader decodes with the default charset - confirm that suits the saved page.
        try (BufferedReader bufIn = new BufferedReader(new FileReader("f:\\mail.html"))) {
            String line;
            while ((line = bufIn.readLine()) != null) {
                Matcher ma = MAIL_PATTERN.matcher(line);
                while (ma.find()) {
                    lis.add(ma.group());
                }
            }
        }
        for (String s : lis) {
            System.out.println(s);
        }
    }
}
第二种直接爬取网站上的邮箱信息:
基本思路相同,区别只在于BufferedReader的获取方式。
// Open the target page over the network and wrap its byte stream in a buffered character reader.
URL pageUrl = new URL("要爬的网址");
InputStreamReader pageStream = new InputStreamReader(pageUrl.openStream());
BufferedReader buf = new BufferedReader(pageStream);
package com.wanhao;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demo that downloads a web page and prints every e-mail-like token found in it.
 */
public class RegexDemo {

    /** Loose e-mail shape: word characters, '@', word characters, then one or more ".word" groups. */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Runs the demo and prints the stack trace if the page cannot be fetched or read.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            regex_test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens the hard-coded page URL, reads the response line by line, collects
     * every match of the mail pattern, and then prints each match on its own
     * line in order of appearance.
     *
     * @throws IOException if the URL is malformed, the connection fails, or reading fails
     */
    public static void regex_test() throws IOException {
        URL url = new URL("http://tieba.baidu.com/p/4221284005?fr=ala0&pstaala=2&tpl=5");
        List<String> lis = new ArrayList<String>();
        // try-with-resources closes the reader, and with it the network stream, even when readLine() throws.
        // NOTE(review): InputStreamReader decodes with the default charset - confirm that matches the page encoding.
        try (BufferedReader bufIn = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = bufIn.readLine()) != null) {
                Matcher ma = MAIL_PATTERN.matcher(line);
                while (ma.find()) {
                    lis.add(ma.group());
                }
            }
        }
        for (String s : lis) {
            System.out.println(s);
        }
    }
}
