抓取 CSDN 指定用户的博文

来源:互联网 发布:知豆电动汽车zhidouzz 编辑:程序博客网 时间:2024/06/10 15:46

HTTP 请求类:

package com.blog.collection;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;

/**
 * Small blocking HTTP helper built on {@link URLConnection}.
 *
 * <p>All methods are static and never throw: on any failure the exception is
 * printed to the console and whatever part of the response was read so far
 * (possibly the empty string) is returned.
 *
 * <p>The public no-argument constructor is intentionally left available
 * because existing callers instantiate this class before calling the static
 * methods.
 */
public class HttpRequest {

    /** Charset used when the server does not declare a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Upper bound for establishing the connection, so a dead host cannot block forever. */
    private static final int CONNECT_TIMEOUT_MS = 15000;

    /** Upper bound for each blocking read of the response. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Sends a GET request to the given URL.
     *
     * @param url   the URL to request
     * @param param query string in the form {@code name1=value1&name2=value2};
     *              when {@code null} or empty, nothing is appended to {@code url}
     * @return the response body with every line terminated by {@code "\n"},
     *         or the part read before a failure occurred
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection connection = open(appendQuery(url, param));
            // Common request headers.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Cache-Control", "public, no-store, max-age=60");
            // NOTE(review): Content-Encoding describes a request body, which a GET
            // does not have; kept so the request stays identical on the wire.
            connection.setRequestProperty("Content-Encoding", "gzip");
            connection.setRequestProperty("user-agent","Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0");
            connection.connect();
            in = openReader(connection);
            appendLines(in, result, true);
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Sends a plain GET request to the given URL without any custom headers.
     *
     * @param url the complete URL to request
     * @return the response body with every line terminated by {@code "\n"},
     *         or the part read before a failure occurred
     */
    public static String send(String url) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection connection = open(url);
            connection.connect();
            in = openReader(connection);
            appendLines(in, result, true);
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Sends a POST request to the given URL.
     *
     * @param url   the URL to request
     * @param param request body in the form {@code name1=value1&name2=value2}
     * @return the response body with its lines joined <em>without</em> any
     *         separator (unlike {@link #sendGet} and {@link #send}; kept that
     *         way for backward compatibility), or the part read before a
     *         failure occurred
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            URLConnection conn = open(url);
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required for URLConnection to send a body and read the reply.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // Encode the body explicitly instead of relying on the platform charset.
            out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(), DEFAULT_CHARSET));
            out.print(param);
            out.flush();
            in = openReader(conn);
            appendLines(in, result, false);
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!"+e);
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
        return result.toString();
    }

    /** Opens a connection to {@code spec} with connect and read timeouts applied. */
    private static URLConnection open(String spec) throws IOException {
        URLConnection connection = new URL(spec).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        return connection;
    }

    /** Joins {@code url} and {@code param}, choosing "?" or "&" and skipping an empty query. */
    private static String appendQuery(String url, String param) {
        if (param == null || param.length() == 0) {
            return url;
        }
        String separator = url.indexOf('?') >= 0 ? "&" : "?";
        return url + separator + param;
    }

    /** Wraps the response stream in a reader that decodes with the response's charset. */
    private static BufferedReader openReader(URLConnection connection) throws IOException {
        // Fetch the stream first so that connection errors surface before header parsing.
        InputStream stream = connection.getInputStream();
        return new BufferedReader(new InputStreamReader(stream, charsetOf(connection)));
    }

    /** Returns the charset named in the Content-Type header, or UTF-8 when absent or unsupported. */
    private static String charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        String key = "charset=";
        String[] parts = contentType.split(";");
        // parts[0] is the media type itself; parameters start at index 1.
        for (int i = 1; i < parts.length; i++) {
            String part = parts[i].trim();
            if (!part.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = part.substring(key.length()).trim();
            if (name.length() > 1 && name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
                name = name.substring(1, name.length() - 1);
            }
            try {
                if (Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException ignored) {
                // Syntactically invalid charset name sent by the server: use the default.
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Appends every line from {@code reader} to {@code sink}, optionally followed by "\n". */
    private static void appendLines(BufferedReader reader, StringBuilder sink, boolean keepNewlines)
            throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            sink.append(line);
            if (keepNewlines) {
                sink.append("\n");
            }
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}

处理类:

package com.blog.collection;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import com.blog.model.Blog;public class CollectionHandler {private Progress progress;public void setProgress(Progress progress) {this.progress = progress;}public Progress getProgress() {return progress;}public void go(String user){HttpRequest request=new HttpRequest();System.out.println("加载中..."); String content=request.sendGet("http://blog.csdn.net/"+user+"/article/list/1", ""); //获取页码-摘要视图String count=matcher(content, "(?<=<div[\\s\\S]{0,10}id=\"papelist\"[\\s\\S]{0,10}class=\"pagelist\">[\\s\\S]{1,100}共)\\d+(?=页</span>)");Integer code=count.equals("")?0:Integer.parseInt(count);List<String> urls=new ArrayList<String>();getUrls(content, urls, null);for(int i=2;i<=code;i++){getUrls(null,urls, "http://blog.csdn.net/"+user+"/article/list/"+i);}System.out.println("数量:"+urls.size());for (String string : urls) {System.out.println(string);handler(string);}System.out.println("处理完成");}public void getUrls(String text,List<String> urls,String url){HttpRequest request=new HttpRequest();String content=null;if(text==null){content=request.sendGet(url, "");}else{content=text;}String regex="(?<=<span[\\s\\S]{0,10}class=\"link_title\"><a[\\s\\S]{0,10}\")[\\s\\S]*?(?=\">)";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(content);while(matcher.find()){urls.add("http://blog.csdn.net"+matcher.group());}}/** * 处理博文 * @param url */public void handler(String url){Blog blog=new Blog();HttpRequest request=new HttpRequest();String content=request.sendGet(url, "");//System.out.println(content);String regex = "(?<=<span class=\"link_title\"><a[\\s\\S]{0,1000}?>)[\\s\\S]*?(?=</a></span>)";//标题String title=matcher(content, regex).replaceAll("\n", "").replaceAll(" ", 
"");System.out.println("标题");System.out.println(title);blog.setTitle(title);//文章内容regex="(?<=<div[\\s\\S]{0,100}id=\"article_content\"[\\s\\S]{0,100}class=\"article_content\">)[\\s\\S]*?(?=</div>[\\s\\S]{0,100}<!--)";System.out.println("博文");String text=matcher(content, regex);blog.setContent(text);//分类regex="(?<=<span[\\s\\S]{0,100}class=\"link_categories\">[\\s\\S]{0,1000}<a[\\s\\S]{0,200}?>)[\\s\\S]*?(?=</a>)";System.out.println("分类");String type=matcher(content, regex);blog.setTags(type);System.out.println(type);if(this.progress!=null){progress.handler(blog, type);}}public String matcher(String content,String regex){Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(content);if (matcher.find()) {String group = matcher.group(0);return group;}return "";}}


1 0
原创粉丝点击