数据库使用的是MySQL,新建test数据库,并创建csdnblog表:
-- One row per crawled CSDN blog article.
CREATE TABLE `csdnblog` (
  `id` int(11) unsigned NOT NULL auto_increment,
  `keyes` int(11) unsigned NOT NULL,
  `titles` varchar(255) NOT NULL,
  `content` varchar(10240) NOT NULL,
  `dates` varchar(16) default NULL,
  `tags` varchar(255) default NULL,
  `category` varchar(255) default NULL,
  `views` int(11) unsigned default NULL,
  `comments` int(11) unsigned default NULL,
  `copyright` int(1) unsigned default NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
接着新建实体类对应博客文章:
CsdnBlog.java
/**
 * Value object holding one crawled CSDN blog article; mapped 1:1 onto the
 * columns of the {@code csdnblog} table by {@code CsdnBlogDao}.
 */
public class CsdnBlog {

    private int key;          // article id parsed from the detail-page URL
    private String title;     // article title
    private String content;   // article body text
    private String date;      // publish date
    private String tags;      // comma-joined tag list
    private String category;  // comma-joined category list
    private int view;         // read count
    private int comments;     // comment count
    private int copyright;    // 1 = original article, 0 = repost

    public int getKey() {
        return key;
    }

    public void setKey(int key) {
        this.key = key;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public int getView() {
        return view;
    }

    public void setView(int view) {
        this.view = view;
    }

    public int getComments() {
        return comments;
    }

    public void setComments(int comments) {
        this.comments = comments;
    }

    public int getCopyright() {
        return copyright;
    }

    public void setCopyright(int copyright) {
        this.copyright = copyright;
    }

    @Override
    public String toString() {
        // Field order (and the missing space before "date=") is kept exactly
        // as in the original output format.
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content
                + ",date=" + date + ", tags=" + tags + ", category=" + category
                + ", view=" + view + ", comments=" + comments + ", copyright=" + copyright + "]";
    }
}
Dao类
CsdnBlogDao.java:
[html] view plain copy public class CsdnBlogDao { private Connection conn = null; private Statement stmt = null; public CsdnBlogDao() { try { Class.forName("com.mysql.jdbc.Driver"); String url = "jdbc:mysql://localhost:3306/test?" + "user=root&password=123&useUnicode=true&characterEncoding=UTF8"; conn = DriverManager.getConnection(url); stmt = conn.createStatement(); } catch (ClassNotFoundException e) { e.printStackTrace(); } catch (SQLException e) { e.printStackTrace(); } } public int add(CsdnBlog csdnBlog) { try { String sql = "INSERT INTO `test`.`csdnblog` (`keyes`, `titles`, `content` , `dates`, `tags`, `category`, `views`, `comments`, `copyright`) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?);"; PreparedStatement ps = conn.prepareStatement(sql); ps.setInt(1, csdnBlog.getKey()); ps.setString(2, csdnBlog.getTitle()); ps.setString(3,csdnBlog.getContent()); ps.setString(4, csdnBlog.getDate()); ps.setString(5, csdnBlog.getTags()); ps.setString(6, csdnBlog.getCategory()); ps.setInt(7, csdnBlog.getView()); ps.setInt(8, csdnBlog.getComments()); ps.setInt(9, csdnBlog.getCopyright()); return ps.executeUpdate(); } catch (SQLException e) { e.printStackTrace(); } return -1; } }
测试类:
CsdnBlogPageProcessor.java
[html] view plain copy public class CsdnBlogPageProcessor implements PageProcessor { private static String username="CHENYUFENG1991"; // 设置csdn用户名 private static int size = 0;// 共抓取到的文章数量 // 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); public Site getSite() { return site; } // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { // 列表页 if (!page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/\\d+").match()) { // 添加所有文章页 page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()// 限定文章列表获取区域 .regex("/" + username + "/article/details/\\d+") .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")// 巧用替换给把相对url转换成绝对url .all()); // 添加其他列表页 page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()// 限定其他列表页获取区域 .regex("/" + username + "/article/list/\\d+") .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")// 巧用替换给把相对url转换成绝对url .all()); // 文章页 } else { size++;// 文章数量加1 // 用CsdnBlog类来存抓取到的数据,方便存入数据库 CsdnBlog csdnBlog = new CsdnBlog(); // 设置编号 csdnBlog.setKey(Integer.parseInt( page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/(\\d+)").get())); // 设置标题 csdnBlog.setTitle( page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get()); //设置内容 csdnBlog.setContent( page.getHtml().xpath("//div[@class='article_content']/allText()").get()); // 设置日期 csdnBlog.setDate( page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get()); // 设置标签(可以有多个,用,来分割) csdnBlog.setTags(listToString(page.getHtml() .xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all())); // 设置类别(可以有多个,用,来分割) csdnBlog.setCategory( listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all())); // 设置阅读人数 
csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']") .regex("(\\d+)人阅读").get())); // 设置评论人数 csdnBlog.setComments(Integer.parseInt(page.getHtml() .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get())); // 设置是否原创 csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0); // 把对象存入数据库 new CsdnBlogDao().add(csdnBlog); // 把对象输出控制台 System.out.println(csdnBlog); } } // 把list转换为string,用,分割 public static String listToString(List<String> stringList) { if (stringList == null) { return null; } StringBuilder result = new StringBuilder(); boolean flag = false; for (String string : stringList) { if (flag) { result.append(","); } else { flag = true; } result.append(string); } return result.toString(); } public static void main(String[] args) { long startTime, endTime; System.out.println("【爬虫开始】..."); startTime = System.currentTimeMillis(); // 从用户博客首页开始抓,开启5个线程,启动爬虫 Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run(); endTime = System.currentTimeMillis(); System.out.println("【爬虫结束】共抓取" + size + "篇文章,耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库,请查收!"); } }
运行main方法数据就保存到数据库中去了。
完整的demo: http://download.csdn.net/detail/u011781521/9672531