抓取 CSDN 指定用户的博文
来源:互联网 发布:知豆电动汽车zhidouzz 编辑:程序博客网 时间:2024/06/10 15:46
HTTP 请求类:
package com.blog.collection;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;

/**
 * Small blocking HTTP helper built on {@link URLConnection}.
 *
 * <p>All methods are best-effort: any failure is reported on standard output
 * (message plus stack trace) and whatever part of the response was read before
 * the failure is returned, which is the empty string when nothing was read.
 * No method throws to the caller.
 *
 * <p>The implicit public constructor is kept on purpose because existing
 * callers instantiate this class before calling its static methods.
 */
public class HttpRequest {

    /** Charset used when the response does not declare one, and for POST bodies. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /** Upper bound for establishing a connection, so a dead host cannot hang the caller. */
    private static final int CONNECT_TIMEOUT_MILLIS = 15000;

    /** Upper bound for a single blocking read, so a stalled response cannot hang the caller. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /**
     * Sends a GET request to the given URL.
     *
     * @param url   the URL to request
     * @param param the query string in {@code name1=value1&name2=value2} form;
     *              {@code null} or empty means "no query string"
     * @return the response body with every line terminated by {@code "\n"};
     *         empty (or partial) if the request failed
     */
    public static String sendGet(String url, String param) {
        StringBuilder body = new StringBuilder();
        try {
            URLConnection connection = open(appendQuery(url, param));
            // Generic request headers. "Content-Encoding" is intentionally not sent:
            // it describes a request body, and a GET request has none.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Cache-Control", "public, no-store, max-age=60");
            connection.setRequestProperty("user-agent",
                    "Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0");
            connection.connect();
            readLines(connection, body, "\n");
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return body.toString();
    }

    /**
     * Sends a GET request to the given URL without adding any request headers
     * or query string.
     *
     * @param url the complete URL to request
     * @return the response body with every line terminated by {@code "\n"};
     *         empty (or partial) if the request failed
     */
    public static String send(String url) {
        StringBuilder body = new StringBuilder();
        try {
            URLConnection connection = open(url);
            connection.connect();
            readLines(connection, body, "\n");
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return body.toString();
    }

    /**
     * Sends a POST request to the given URL.
     *
     * @param url   the URL to request
     * @param param the request body in {@code name1=value1&name2=value2} form,
     *              written as UTF-8
     * @return the response body with its lines concatenated <em>without</em>
     *         line terminators (unlike {@link #sendGet}); empty (or partial)
     *         if the request failed
     */
    public static String sendPost(String url, String param) {
        StringBuilder body = new StringBuilder();
        PrintWriter out = null;
        try {
            URLConnection conn = open(url);
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required for URLConnection to send a request body.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // Encode the body explicitly instead of relying on the platform charset.
            out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(), DEFAULT_CHARSET));
            out.print(param);
            out.flush();
            readLines(conn, body, "");
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            if (out != null) {
                out.close();
            }
        }
        return body.toString();
    }

    /** Opens a connection to {@code url} with connect and read timeouts applied. */
    private static URLConnection open(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        return connection;
    }

    /** Appends {@code param} to {@code url} as a query string; returns {@code url} unchanged when there is nothing to append. */
    private static String appendQuery(String url, String param) {
        if (param == null || param.length() == 0) {
            return url;
        }
        // Extend an existing query string instead of adding a second '?'.
        String separator = url.indexOf('?') >= 0 ? "&" : "?";
        return url + separator + param;
    }

    /**
     * Reads the response line by line into {@code sink}, appending
     * {@code lineTerminator} after each line. Lines read before a failure stay
     * in {@code sink}, so callers can still return a partial body.
     */
    private static void readLines(URLConnection connection, StringBuilder sink, String lineTerminator)
            throws IOException {
        BufferedReader reader = null;
        try {
            // getInputStream() is evaluated first, so a failed request surfaces
            // its own exception before any header is inspected.
            reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charsetOf(connection)));
            String line;
            while ((line = reader.readLine()) != null) {
                sink.append(line).append(lineTerminator);
            }
        } finally {
            closeAndReport(reader);
        }
    }

    /** Returns the charset declared in the response Content-Type header, or UTF-8 when absent or unusable. */
    private static Charset charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        String key = "charset=";
        String[] parts = contentType.split(";");
        // parts[0] is the media type itself; parameters start at index 1.
        for (int i = 1; i < parts.length; i++) {
            String part = parts[i].trim();
            if (!part.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = part.substring(key.length()).trim();
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException unknownCharset) {
                // Illegal or unsupported charset name: fall back to the default.
                return DEFAULT_CHARSET;
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Closes {@code resource} if non-null, printing (not propagating) any close failure. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
处理类:
package com.blog.collection;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import com.blog.model.Blog;public class CollectionHandler {private Progress progress;public void setProgress(Progress progress) {this.progress = progress;}public Progress getProgress() {return progress;}public void go(String user){HttpRequest request=new HttpRequest();System.out.println("加载中..."); String content=request.sendGet("http://blog.csdn.net/"+user+"/article/list/1", ""); //获取页码-摘要视图String count=matcher(content, "(?<=<div[\\s\\S]{0,10}id=\"papelist\"[\\s\\S]{0,10}class=\"pagelist\">[\\s\\S]{1,100}共)\\d+(?=页</span>)");Integer code=count.equals("")?0:Integer.parseInt(count);List<String> urls=new ArrayList<String>();getUrls(content, urls, null);for(int i=2;i<=code;i++){getUrls(null,urls, "http://blog.csdn.net/"+user+"/article/list/"+i);}System.out.println("数量:"+urls.size());for (String string : urls) {System.out.println(string);handler(string);}System.out.println("处理完成");}public void getUrls(String text,List<String> urls,String url){HttpRequest request=new HttpRequest();String content=null;if(text==null){content=request.sendGet(url, "");}else{content=text;}String regex="(?<=<span[\\s\\S]{0,10}class=\"link_title\"><a[\\s\\S]{0,10}\")[\\s\\S]*?(?=\">)";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(content);while(matcher.find()){urls.add("http://blog.csdn.net"+matcher.group());}}/** * 处理博文 * @param url */public void handler(String url){Blog blog=new Blog();HttpRequest request=new HttpRequest();String content=request.sendGet(url, "");//System.out.println(content);String regex = "(?<=<span class=\"link_title\"><a[\\s\\S]{0,1000}?>)[\\s\\S]*?(?=</a></span>)";//标题String title=matcher(content, regex).replaceAll("\n", "").replaceAll(" ", 
"");System.out.println("标题");System.out.println(title);blog.setTitle(title);//文章内容regex="(?<=<div[\\s\\S]{0,100}id=\"article_content\"[\\s\\S]{0,100}class=\"article_content\">)[\\s\\S]*?(?=</div>[\\s\\S]{0,100}<!--)";System.out.println("博文");String text=matcher(content, regex);blog.setContent(text);//分类regex="(?<=<span[\\s\\S]{0,100}class=\"link_categories\">[\\s\\S]{0,1000}<a[\\s\\S]{0,200}?>)[\\s\\S]*?(?=</a>)";System.out.println("分类");String type=matcher(content, regex);blog.setTags(type);System.out.println(type);if(this.progress!=null){progress.handler(blog, type);}}public String matcher(String content,String regex){Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(content);if (matcher.find()) {String group = matcher.group(0);return group;}return "";}}
1 0
- 抓取csdn指定用户的博文
- 抓取CSDN个人的用户访问量并且发邮件
- 抓取指定的html
- 抓取指定的Html
- 抓取csdn的数据
- heritrix 抓取指定的html
- php蜘蛛正常抓取,用户跳转指定页面
- python抓取CSDN博客首页的所有博文,对标题分词存入mongodb中
- 指定用户的授权
- 抓取指定网址的Html代码
- js抓取指定的子节点
- java-抓取指定URL网页的内容
- Python3 urllib抓取指定URL的内容
- Python3 urllib抓取指定URL的内容
- 通过指定的URL抓取网页内容
- 用Python抓取指定字符串的log
- php抓取网页上的指定内容
- CSDN上讨论抓取新闻的帖子
- 单调递增最长子序列
- 如何学习开源项目
- 磁盘容量计算
- MFC API——》GetAsyncKeyState 判断函数调用时指定虚拟键的状态
- c# socket:通常每个套接字地址(协议/网络地址/端口)只允许使用一次
- 抓取csdn指定用户的博文
- JVM内存设置
- iOS设备打印连接到同一Wifi的其余设备清单
- javascript读写json
- 谁抢走了中国男人的老婆?
- android 近百个源码项目
- HDU 1106 排序
- Cocos2d-x中通过JNI进行C++调用Java代码
- 黑马程序员-IOS-OC基础-ARC