使用Java爬虫得到CSDN博客信息并保存(一)

来源：互联网 | 发布：json查看 | 编辑：程序博客网 | 时间：2024/04/24 12:22

需求：

使用Java爬虫获取blog.csdn.net首页上出现的所有博客地址，然后逐个访问这些博客，并把博客信息保存到本地。

思路：

（1）.通过URL类的方法得到首页的HTML源码，使用正则把博客的url都放到一个String数组中

（2）.再逐个访问个人博客的首页得到HTML源码，使用正则提取到需要的信息

（3）.使用IO把得到的信息保存到本地

具体实现：

这里我把程序分成了两个部分：得到博客中需要的信息、把信息保存到文件中。这篇博客先讲怎样得到博客信息。

代码实现：

package cn.test12.WebRobot04;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads one CSDN blog page and extracts pieces of it with regular expressions:
 * the blogger's user name, the blog title, the avatar image URL, the visit statistics,
 * the article titles and the blog home-page links that appear on the page.
 *
 * <p>Usage: construct the object with a URL; the constructor downloads the page and stores
 * it, with all line breaks removed, in {@link #webLine}. Every getter then runs a regex
 * against that single line.
 *
 * <p>Note on converting a set to an array: use {@code set.toArray(new String[0])};
 * casting the result of {@code set.toArray()} to {@code String[]} throws
 * {@link ClassCastException}.
 */
public class GetCsdn {

    /** Group 1 is the full blog home URL, group 2 is the user name that follows the host. */
    private static final Pattern BLOG_HOME_LINK =
            Pattern.compile("href=\"(http://blog\\.csdn\\.net/(\\w+))\"");

    /** The div that wraps the blog title, e.g. {@code <div id="blog_title"> <h2> <a ...>Title</a></h2> ... </div>}. */
    private static final Pattern BLOG_TITLE_DIV =
            Pattern.compile("<div id=\"blog_title\">.+?</div>");

    /** A list item whose text starts with at least one CJK character. */
    private static final Pattern VISIT_ITEM =
            Pattern.compile("<li>[\u4E00-\u9FA5]+.+?</li>");

    /** The number directly in front of the "last page" pagination link; group 1 is that number. */
    private static final Pattern LAST_PAGE_LINK = Pattern.compile("([0-9]+)\">尾页");

    /** One article title entry in an article list page. */
    private static final Pattern ARTICLE_TITLE =
            Pattern.compile("<span class=\"link_title\">.+?</a></span>");

    /**
     * The avatar image tag. The single-character path segment after the host keeps the
     * pattern from matching other images served from the same host (an earlier, looser
     * pattern picked up the wrong image when the blogger had a column).
     */
    private static final Pattern AVATAR_IMG =
            Pattern.compile("<img src=\"http://avatar\\.csdn\\.net/\\w/.+?/>");

    /** Any HTML tag; used to strip markup from a matched fragment. */
    private static final Pattern ANY_TAG = Pattern.compile("<.+?>");

    /** The URL of the CSDN page this object was built from. */
    public String urlString = null;

    /** The whole page concatenated into one line so that regexes can match across line breaks. */
    public String webLine = null;

    /**
     * Downloads the page at {@code urlString} as UTF-8 and stores it in {@link #webLine}.
     *
     * @param urlString the URL of the page to download
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public GetCsdn(String urlString) throws Exception {
        this.urlString = urlString;

        URLConnection connection = new URL(this.urlString).openConnection();
        // Send a browser-like User-Agent header with the request.
        connection.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0");

        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even on failure.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() drops the line terminator, so appending yields a single line.
                page.append(line);
            }
        }
        this.webLine = page.toString();
    }

    /**
     * Returns the blogger's user name, taken from the first blog home link on the page.
     * The name is read from the page rather than from the constructor URL because the
     * constructor URL may point at a paginated list page.
     *
     * @return the user name that follows {@code http://blog.csdn.net/} in the first home link
     * @throws IllegalStateException if the page contains no blog home link
     */
    public String getBloger() {
        return requireMatch(BLOG_HOME_LINK, "blog home link").group(2);
    }

    /**
     * Returns the text of the blog title div with all tags removed and trimmed.
     *
     * @return the blog title text
     * @throws IllegalStateException if the page contains no blog title div
     */
    public String getWebName() {
        String titleDiv = requireMatch(BLOG_TITLE_DIV, "blog title div").group();
        return stripTags(titleDiv).trim();
    }

    /**
     * Returns the text of every list item whose content starts with a CJK character,
     * with tags removed. Duplicates are dropped; items keep their order of appearance.
     *
     * @return the distinct item texts; empty if none are found
     */
    public String[] getVisits() {
        Set<String> visits = new LinkedHashSet<>();
        Matcher matcher = VISIT_ITEM.matcher(this.webLine);
        while (matcher.find()) {
            visits.add(stripTags(matcher.group()));
        }
        return visits.toArray(new String[0]);
    }

    /**
     * Downloads every article list page ({@code urlString + "/article/list/" + pageNumber})
     * and returns the article titles in the order they appear, page by page.
     *
     * @return the article titles, duplicates included; empty if none are found
     * @throws Exception if a list page cannot be downloaded
     */
    public String[] getTitles() throws Exception {
        // The page count is the number in front of the "last page" link. When that link is
        // absent, fall back to a single page instead of failing (presumably the case for
        // blogs whose articles fit on one page).
        Matcher lastPage = LAST_PAGE_LINK.matcher(this.webLine);
        int pageCount = lastPage.find() ? Integer.parseInt(lastPage.group(1)) : 1;

        // A plain list keeps encounter order and duplicates, which is what the previous
        // always-greater comparator on a sorted set was emulating.
        List<String> titles = new ArrayList<>();
        for (int page = 1; page <= pageCount; page++) {
            String pageLine = new GetCsdn(this.urlString + "/article/list/" + page).webLine;
            // Each match looks like:
            //   <span class="link_title"><a href="/user/article/details/123"> Title </a></span>
            // Strip the tags, then trim the whitespace around the title.
            Matcher matcher = ARTICLE_TITLE.matcher(pageLine);
            while (matcher.find()) {
                titles.add(stripTags(matcher.group()).trim());
            }
        }
        return titles.toArray(new String[0]);
    }

    /**
     * Returns the URL of the blogger's avatar image.
     *
     * @return the value of the {@code src} attribute of the first avatar image tag
     * @throws IllegalStateException if the page contains no avatar image tag
     */
    public String getImgUrl() {
        // The match looks like:
        //   <img src="http://avatar.csdn.net/0/8/9/1_user.jpg" title="..." style="..."/>
        // Splitting on the double quote puts the src value in the second piece.
        String imgTag = requireMatch(AVATAR_IMG, "avatar image tag").group();
        return imgTag.split("\"")[1];
    }

    /**
     * Returns every distinct blog home-page URL linked from the page, in order of
     * first appearance.
     *
     * @return the distinct URLs; empty if none are found
     */
    public String[] getBlogHomePage() {
        Set<String> links = new LinkedHashSet<>();
        Matcher matcher = BLOG_HOME_LINK.matcher(this.webLine);
        while (matcher.find()) {
            links.add(matcher.group(1));
        }
        return links.toArray(new String[0]);
    }

    /** Finds the first match of {@code pattern} in the page, or fails with a descriptive message. */
    private Matcher requireMatch(Pattern pattern, String what) {
        Matcher matcher = pattern.matcher(this.webLine);
        if (!matcher.find()) {
            throw new IllegalStateException("No " + what + " found in page " + this.urlString);
        }
        return matcher;
    }

    /** Removes every HTML tag from {@code fragment}. */
    private static String stripTags(String fragment) {
        return ANY_TAG.matcher(fragment).replaceAll("");
    }

    /** Demo entry point: downloads one blog and prints each extracted piece. */
    public static void main(String[] args) throws Exception {
        GetCsdn gc = new GetCsdn("http://blog.csdn.net/wangquannetwork");
        System.out.println(gc.getBloger());
        System.out.println("----------------------------");
        System.out.println(gc.getImgUrl());
        System.out.println("----------------------------");
        System.out.println(gc.getWebName());
        System.out.println("----------------------------");
        System.out.println(Arrays.toString(gc.getTitles()));
        System.out.println("----------------------------");
        System.out.println(Arrays.toString(gc.getVisits()));
        System.out.println("----------------------------");
    }
}

5.显示结果

0 0