java爬虫学习
来源:互联网 发布:工业机器人编程用什么 编辑:程序博客网 时间:2024/06/03 20:05
1.新建java项目,导入需要用到的jar包:
jsoup-1.8.1.jar
junit-4.8.2.jar
2.新建Rule.java
/**
 * Describes a single crawl request: where to send it, which parameters to
 * attach, and how to pick the interesting part out of the returned HTML.
 */
public class Rule {

    /** Request method constant: HTTP GET. */
    public final static int GET = 0;

    /** Request method constant: HTTP POST. */
    public final static int POST = 1;

    /** Selector type constant: {@code resultTagName} is a CSS class name. */
    public final static int CLASS = 0;

    /** Selector type constant: {@code resultTagName} is an element id. */
    public final static int ID = 1;

    /** Selector type constant: {@code resultTagName} is a selector expression. */
    public final static int SELECTION = 2;

    /** Target link. */
    private String url;

    /** Names of the request parameters. */
    private String[] params;

    /** Values matching {@link #params} position by position. */
    private String[] values;

    /** Tag/selector used for the first filtering pass over the returned HTML; set {@link #type} first. */
    private String resultTagName;

    /** How {@link #resultTagName} is interpreted: CLASS / ID / SELECTION. Defaults to ID. */
    private int type = ID;

    /** Request method: GET / POST. Defaults to GET. */
    private int requestMoethod = GET;

    /** Creates a rule with the default selector type (ID) and request method (GET). */
    public Rule() {
    }

    /**
     * Creates a fully populated rule.
     *
     * @param url            target link
     * @param params         request parameter names
     * @param values         request parameter values
     * @param resultTagName  tag/selector for the first filtering pass
     * @param type           one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
     * @param requestMoethod one of {@link #GET}, {@link #POST}
     */
    public Rule(String url, String[] params, String[] values,
                String resultTagName, int type, int requestMoethod) {
        this.requestMoethod = requestMoethod;
        this.type = type;
        this.resultTagName = resultTagName;
        this.values = values;
        this.params = params;
        this.url = url;
    }

    /** @return the target link */
    public String getUrl() {
        return this.url;
    }

    /** @param url the target link */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the request parameter names (the stored array itself, not a copy) */
    public String[] getParams() {
        return this.params;
    }

    /** @param params the request parameter names */
    public void setParams(String[] params) {
        this.params = params;
    }

    /** @return the request parameter values (the stored array itself, not a copy) */
    public String[] getValues() {
        return this.values;
    }

    /** @param values the request parameter values */
    public void setValues(String[] values) {
        this.values = values;
    }

    /** @return the tag/selector used for the first filtering pass */
    public String getResultTagName() {
        return this.resultTagName;
    }

    /** @param resultTagName the tag/selector used for the first filtering pass */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return the selector type */
    public int getType() {
        return this.type;
    }

    /** @param type the selector type */
    public void setType(int type) {
        this.type = type;
    }

    /** @return the request method */
    public int getRequestMoethod() {
        return this.requestMoethod;
    }

    /** @param requestMoethod the request method */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}
3.新建LinkTypeData.java
/**
 * One extracted entry: a link together with its title and optional
 * summary/content text.
 */
public class LinkTypeData {

    /** Numeric identifier of this entry. */
    private int id;

    /** Address of the link. */
    private String linkHref;

    /** Title of the link. */
    private String linkText;

    /** Summary text. */
    private String summary;

    /** Content text. */
    private String content;

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @param id the numeric identifier */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the address of the link */
    public String getLinkHref() {
        return this.linkHref;
    }

    /** @param linkHref the address of the link */
    public void setLinkHref(String linkHref) {
        this.linkHref = linkHref;
    }

    /** @return the title of the link */
    public String getLinkText() {
        return this.linkText;
    }

    /** @param linkText the title of the link */
    public void setLinkText(String linkText) {
        this.linkText = linkText;
    }

    /** @return the summary text */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary text */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the content text */
    public void setContent(String content) {
        this.content = content;
    }
}
4.新建TextUtil.java工具类
/** Small string helpers used by the crawler. */
public class TextUtil {

    /**
     * Tells whether a string carries no usable value.
     *
     * @param url the string to inspect (may be {@code null})
     * @return {@code true} if {@code url} is {@code null}, is the empty string,
     *         or lower-cases to the literal text {@code "null"}; {@code false} otherwise
     */
    public static boolean isEmpty(String url) {
        if (url == null) {
            return true;
        }
        return url.isEmpty() || "null".equals(url.toLowerCase());
    }
}
5.新建RuleException.java异常类
/**
 * Unchecked exception signalling a problem with a crawl rule.
 */
public class RuleException extends RuntimeException {

    /** Creates an exception with neither a message nor a cause. */
    public RuleException() {
        super();
    }

    /**
     * Creates an exception with a message.
     *
     * @param message description of the problem
     */
    public RuleException(String message) {
        super(message);
    }

    /**
     * Creates an exception wrapping another throwable.
     *
     * @param cause the underlying throwable
     */
    public RuleException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with both a message and a cause.
     *
     * @param message description of the problem
     * @param cause   the underlying throwable
     */
    public RuleException(String message, Throwable cause) {
        super(message, cause);
    }
}
6.ExtractService.java
import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.jsoup.Connection;import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class ExtractService { /** * @param rule * @return */ public static List<LinkTypeData> extract(Rule rule) { // 进行对rule的必要校验 validateRule(rule); List<LinkTypeData> datas = new ArrayList<LinkTypeData>(); LinkTypeData data = null; try { /** * 解析rule */ String url = rule.getUrl(); String[] params = rule.getParams(); String[] values = rule.getValues(); String resultTagName = rule.getResultTagName(); int type = rule.getType(); int requestType = rule.getRequestMoethod(); Connection conn = Jsoup.connect(url).ignoreContentType(true); //Document pod = Jsoup.connect(url).ignoreContentType(true).get(); // 设置查询参数 if (params != null) { for (int i = 0; i < params.length; i++) { conn.data(params[i], values[i]); } } // 设置请求类型 Document doc = null; switch (requestType) { case Rule.GET: doc = conn.timeout(100000).get(); break; case Rule.POST: doc = conn.timeout(100000).post(); break; } //处理返回数据 Elements results = new Elements(); switch (type) { case Rule.CLASS: results = doc.getElementsByClass(resultTagName); break; case Rule.ID: Element result = doc.getElementById(resultTagName); results.add(result); break; case Rule.SELECTION: results = doc.select(resultTagName); break; default: //当resultTagName为空时默认去body标签 if (TextUtil.isEmpty(resultTagName)) { results = doc.getElementsByTag("body"); } } for (Element result : results) { // Elements links = result.getElementsByTag("a"); // // for (Element link : links) // { // //必要的筛选 // String linkHref = link.attr("href"); // String linkText = link.text(); // // data = new LinkTypeData(); // data.setLinkHref(linkHref); // data.setLinkText(linkText); // // datas.add(data); // } Elements links = result.getElementsByTag("body"); for (Element link : links) { String linkHref = link.attr("href"); String linkText = 
link.text(); data = new LinkTypeData(); data.setLinkHref(linkHref); data.setLinkText(linkText); datas.add(data); } } } catch (IOException e) { e.printStackTrace(); } return datas; } /** * 对传入的参数进行必要的校验 */ private static void validateRule(Rule rule) { String url = rule.getUrl(); if (TextUtil.isEmpty(url)) { throw new RuleException("url不能为空!"); } if (!url.startsWith("http://")) { throw new RuleException("url的格式不正确!"); } if (rule.getParams() != null && rule.getValues() != null) { if (rule.getParams().length != rule.getValues().length) { throw new RuleException("参数的键值对个数不匹配!"); } } } }
7.测试类(junit单元测试)
import java.util.List;public class Test { @org.junit.Test public void getDatasByClass() { // Rule rule = new Rule( // "http://www1.sxcredit.gov.cn/public/infocomquery.do?method=publicIndexQuery", // new String[] { "query.enterprisename","query.registationnumber" }, new String[] { "兴网","" }, // "cont_right", Rule.CLASS, Rule.POST); Rule rule=new Rule("http://hiweshare.com/topicaction/gettopics.do", new String[]{"page","rows"}, new String[]{"1","12"}, null, -1, Rule.POST); List<LinkTypeData> extracts = ExtractService.extract(rule); printf(extracts); } public void printf(List<LinkTypeData> datas) { for (LinkTypeData data : datas) { System.out.println(data.getLinkText()); System.out.println(data.getLinkHref()); System.out.println("***********************************"); } } }
8.可能会出现的报错信息:
如果像下面这样获取连接,可能会报错:
Connection conn = Jsoup.connect(url)
报错信息:
Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml
修改为:
Connection conn = Jsoup.connect(url).ignoreContentType(true);
hiweshare.com是我的一个正在建设的网站,我先在上面测试了下爬虫
http://hiweshare.com/
感谢博主:http://blog.csdn.net/lmj623565791/article/details/23272657
0 0
- Java学习: 网络爬虫
- java爬虫学习
- 关于java爬虫的学习
- Java学习-简单爬虫系统
- java爬虫学习日记2-宽度优先爬虫代码实现
- java爬虫学习日记1-基本爬虫原理介绍
- Jsoup学习 JAVA爬虫爬取美女网站 JAVA爬虫爬取美图网站 爬虫
- Java微博爬虫的学习
- 学习简单的Java爬虫的心得
- 爬虫学习之Java(一)
- Java爬虫学习之元(ZERO)
- Java学习-URL和爬虫原理
- JS爬虫,Java爬虫
- 爬虫学习
- 爬虫学习
- 爬虫学习
- JAVA爬虫
- Java 爬虫
- 349. Intersection of Two Arrays的C++解法
- ScrollView上下滑动修改顶部title背景色淡入淡出
- html中引入js,不同时候引入的区别
- Android属性动画之XML定义方式
- Mybatis 多表关联查询(1) one-to-one关系
- java爬虫学习
- python库numpy的使用
- icon在线制作
- 关于数学中的范数
- 《直播疑难杂症排查》之二:播放卡顿
- 什么是医院随访系统
- 模拟线性队列
- LoadRunner使用
- java实现24点算法