Web Crawler (Spider)

I've recently been building a portal site and needed to collect content for its news module. When information gathering comes up, web crawlers immediately come to mind: we don't have the manpower to write the news ourselves, so, with apologies to Sina and Sohu, we borrow their content. The web encourages resource sharing, something I've always believed in, and I had long wanted to try using a crawler to pull resources off the web.
package com.opensky.util;

import java.util.HashMap;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Page parsing with HttpClient and HtmlParser.
 *
 * @author Administrator
 */
public class HtmlparseUtil {

    WebHttpClient util = new WebHttpClient();

    /**
     * Collect the hyperlinks in a page, storing each href and its text in a map: map(href, text).
     */
    public Map<String, String> linkGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> linkMap = new HashMap<String, String>();
        try {
            // Start parsing
            Parser parser = Parser.createParser(content, charset);
            // Filter for <a></a> tags
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            Node node = null;
            for (int i = 0; i < list.size(); i++) {
                node = list.elementAt(i);
                // Store each link as map(href, text)
                linkMap.put(((LinkTag) node).getLink(),
                        this.processText(((LinkTag) node).getLinkText()));
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return linkMap;
    }

    /**
     * Get the content of the page's <body></body> tag.
     */
    public String bodyGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        String body = "";
        try {
            Parser parser = Parser.createParser(content, charset);
            // Filter for the <body></body> tag
            NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
            NodeList list = parser.extractAllNodesThatMatch(bodyFilter);
            Node node = null;
            for (int i = 0; i < list.size(); i++) {
                node = list.elementAt(i);
                // Keep the body content (the last match wins)
                body = ((BodyTag) node).getBody();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return body;
    }

    /**
     * Filter for <span> elements with class "term" (and related spans) and collect their text.
     */
    public Map<String, String> termGet(String url, String charset) {
        // Fetch the page's full HTML
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> map = new HashMap<String, String>();
        try {
            // <span class="term">
            // TagNameFilter (tag name) combined with HasAttributeFilter (attribute name and value)
            Parser parser = Parser.createParser(content, charset);
            AndFilter filter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "term"));
            Node node = null;
            NodeList nodeList = parser.parse(filter);
            for (int i = 0; i < nodeList.size(); i++) {
                node = nodeList.elementAt(i);
                map.put("term", node.toPlainTextString());
            }

            // <span class="start-time">
            Parser parser2 = Parser.createParser(content, charset);
            AndFilter filter2 = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "start-time"));
            NodeList nodeList2 = parser2.parse(filter2);
            for (int i = 0; i < nodeList2.size(); i++) {
                node = nodeList2.elementAt(i);
                map.put("start-time", node.toPlainTextString());
            }

            // <span id="J_SingleEndTimeLabel">
            Parser parser3 = Parser.createParser(content, charset);
            AndFilter filter3 = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("id", "J_SingleEndTimeLabel"));
            NodeList nodeList3 = parser3.parse(filter3);
            for (int i = 0; i < nodeList3.size(); i++) {
                node = nodeList3.elementAt(i);
                map.put("end-time", node.toPlainTextString());
            }

            // <div class="box post">
            Parser parser4 = Parser.createParser(content, charset);
            AndFilter filter4 = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("class", "box post"));
            NodeList nodeList4 = parser4.parse(filter4);
            for (int i = 0; i < nodeList4.size(); i++) {
                node = nodeList4.elementAt(i);
                String temp = node.toPlainTextString().trim();
                temp = temp.substring(10, 20).trim();
                map.put("pre-term", temp);
            }

            // Elements with class "J_AwardNumber"
            Parser parser5 = Parser.createParser(content, charset);
            NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class", "J_AwardNumber"));
            StringBuffer buffer = new StringBuffer();
            for (int i = 0; i < nodeList5.size(); i++) {
                node = nodeList5.elementAt(i);
                buffer.append("," + node.toPlainTextString());
            }
            buffer.append("|");

            // Elements with class "blue J_AwardNumber"
            Parser parser6 = Parser.createParser(content, charset);
            NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class", "blue J_AwardNumber"));
            for (int i = 0; i < nodeList6.size(); i++) {
                node = nodeList6.elementAt(i);
                buffer.append(node.toPlainTextString() + ",");
            }
            map.put("numbers", buffer.toString());
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return map;
    }

    /**
     * Filter for the <ul> element with class "list_00f_f14" and collect the text of its
     * <li> children: the domestic headlines on Sina's China news page.
     */
    public Map<String, String> sinaChinaNewsGet(String url, String charset) {
        // Fetch the page's full HTML
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> map = new HashMap<String, String>();
        try {
            Parser parser = Parser.createParser(content, charset);
            // Match <li> elements whose parent is <ul class="list_00f_f14">
            AndFilter filter = new AndFilter(new TagNameFilter("li"),
                    new HasParentFilter(new AndFilter(new TagNameFilter("ul"),
                            new HasAttributeFilter("class", "list_00f_f14"))));
            Node node = null;
            NodeList nodeList = parser.parse(filter);
            for (int i = 0; i < nodeList.size(); i++) {
                node = nodeList.elementAt(i);
                // Headline text
                map.put("title" + i, node.toPlainTextString());
                // Walk the children to pull out each link's href
                NodeList nodeChildList = node.getChildren();
                Node nodeChild = null;
                for (int j = 0; j < nodeChildList.size(); j++) {
                    nodeChild = nodeChildList.elementAt(j);
                    if (nodeChild instanceof LinkTag) {
                        String hrefStr = ((LinkTag) nodeChild).getAttribute("href");
                        map.put("href" + i, hrefStr);
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return map;
    }

    // Trim the link text and strip embedded spaces
    private String processText(String content) {
        content = content.trim().replaceAll(" ", "");
        return content;
    }
}
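The class above depends on a WebHttpClient helper that the post doesn't show; all it needs here is a getWebContentByGet(url, charset) method that performs an HTTP GET and returns the response body as a string. A minimal sketch, assuming Apache Commons HttpClient 3.x (the class name and method signature come from the code above; everything else is my assumption):

package com.opensky.util;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

/**
 * Minimal sketch of the WebHttpClient helper used by HtmlparseUtil.
 * Assumes Commons HttpClient 3.x; only getWebContentByGet's signature
 * is taken from the original post.
 */
public class WebHttpClient {

    public String getWebContentByGet(String url, String charset) {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod(url);
        try {
            int status = client.executeMethod(get);
            if (status == 200) {
                // Read the raw bytes and decode with the caller-supplied charset
                byte[] body = get.getResponseBody();
                return new String(body, charset);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        return "";
    }
}

Any HTTP client would do in its place; the parsing code only ever sees the returned string.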
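To tie it together, a usage sketch: the URL and the gb2312 charset below are illustrative guesses (Sina's pages were GB-encoded at the time), not values taken from the post:

public static void main(String[] args) {
    HtmlparseUtil util = new HtmlparseUtil();
    // Hypothetical target page and charset -- adjust to the page actually crawled
    Map<String, String> news = util.sinaChinaNewsGet("http://news.sina.com.cn/china/", "gb2312");
    for (Map.Entry<String, String> entry : news.entrySet()) {
        System.out.println(entry.getKey() + " = " + entry.getValue());
    }
}

Dropped into HtmlparseUtil as a main method, this prints every collected pair: headlines are stored under "title0", "title1", ... and their links under the matching "href" keys.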