聚焦网络爬虫之Xpath+HttpClient快速实现页面新闻抓取

来源:互联网 发布:淘宝如何引流推广 编辑:程序博客网 时间:2024/05/16 01:02

最近因为项目需求,抓取了大大小小多个网站的新闻。刚开始用的是jsoup解析页面,每个站点都要写一套解析方案,效率较低;后来改用xpath解析,开发速度有了很大的提升,在一周内完成了一百多个站点的新闻抓取。

下面是我的一个简单示例。博主刚毕业,还是个技术小白,如有写得不对或不妥的地方,请评论指出来,大家共同进步。下图是测试效果,不同的网站只需要更改xpath即可。


为了帮助有需要的朋友,下面贴上我写的代码模型。由于新闻网站一般没有反爬,所以demo中没有反爬的相关策略。一般的爬虫项目由下载器、调度器、解析器组成,本demo中没有实现调度器。

1、项目是基于maven搭建的,首先引入相关依赖

<properties><project.build.sourceEncoding>UTF-8</project.build.sourceEncoding><!-- apache.httpclient --><httpclient_version>4.5.1</httpclient_version><!-- htmlcleaner --><htmlcleaner_version>2.16</htmlcleaner_version><!-- logger --><log4j_version>1.2.17</log4j_version></properties><dependencies><!-- https://mvnrepository.com/artifact/log4j/log4j --><dependency><groupId>log4j</groupId><artifactId>log4j</artifactId><version>${log4j_version}</version></dependency><dependency><groupId>com.google.guava</groupId><artifactId>guava</artifactId><version>19.0</version></dependency><dependency><groupId>commons-io</groupId><artifactId>commons-io</artifactId><version>2.4</version></dependency><dependency><groupId>commons-beanutils</groupId><artifactId>commons-beanutils</artifactId><version>1.9.2</version></dependency><dependency><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId><version>2.6</version></dependency><!-- https://mvnrepository.com/artifact/net.sourceforge.htmlcleaner/htmlcleaner --><dependency><groupId>net.sourceforge.htmlcleaner</groupId><artifactId>htmlcleaner</artifactId><version>${htmlcleaner_version}</version></dependency><dependency><groupId>junit</groupId><artifactId>junit</artifactId><version>3.8.1</version><scope>test</scope></dependency><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>fluent-hc</artifactId><version>${httpclient_version}</version></dependency><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpclient</artifactId><version>${httpclient_version}</version></dependency><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpcore</artifactId><version>4.4.3</version></dependency><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpmime</artifactId><version>${httpclient_version}</version></dependency></dependencies>

2、下载器使用的是apache的开源项目httpclient,包含了httpclient连接池,工具类等

2.1 HttpClient连接池

package com.zhb.ims.utils.httpclient;import java.security.cert.CertificateException;import java.security.cert.X509Certificate;import java.util.concurrent.atomic.AtomicBoolean;import java.util.concurrent.locks.Lock;import java.util.concurrent.locks.ReentrantLock;import javax.net.ssl.HostnameVerifier;import javax.net.ssl.SSLContext;import org.apache.http.client.HttpRequestRetryHandler;import org.apache.http.config.Registry;import org.apache.http.config.RegistryBuilder;import org.apache.http.conn.socket.ConnectionSocketFactory;import org.apache.http.conn.socket.PlainConnectionSocketFactory;import org.apache.http.conn.ssl.SSLConnectionSocketFactory;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;import org.apache.http.ssl.SSLContexts;import org.apache.http.ssl.TrustStrategy;public class HttpClientManger {private PoolingHttpClientConnectionManager connectionManager;private HttpRequestRetryHandler httpRequestRetryHandler;private static HttpClientManger httpClientManger;private static Lock lock = new ReentrantLock();private volatile AtomicBoolean isShutDown;public void init() {try {SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new DefaultTrustStrategy()).build();@SuppressWarnings("deprecation")HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER;SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, hostnameVerifier);Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory> create().register("http", PlainConnectionSocketFactory.getSocketFactory()).register("https", sslsf).build();connectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);connectionManager.setMaxTotal(800);connectionManager.setDefaultMaxPerRoute(20);httpRequestRetryHandler = new DefaultRequestRetryHandler();this.isShutDown = new 
AtomicBoolean(false);} catch (Exception e) {e.printStackTrace();}}private HttpClientManger() {super();this.isShutDown = new AtomicBoolean(true);init();}public static HttpClientManger newInstance() {lock.lock();if (httpClientManger == null) {httpClientManger = new HttpClientManger();}lock.unlock();return httpClientManger;}public CloseableHttpClient getClient() {CloseableHttpClient client = null;lock.lock();if (this.isShutDown.compareAndSet(false, true)) {client = HttpClients.custom().setConnectionManager(this.connectionManager).setRetryHandler(httpRequestRetryHandler).build();}else {init();client = HttpClients.custom().setConnectionManager(connectionManager).setRetryHandler(httpRequestRetryHandler).build();}lock.unlock();return client;}public void destory() {if (this.isShutDown.compareAndSet(false, true)) {this.connectionManager.shutdown();}isShutDown = new AtomicBoolean(true);}class DefaultTrustStrategy implements TrustStrategy{@Overridepublic boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {return true;}}}
2.2默认的重连策略

package com.zhb.ims.utils.httpclient;import java.io.IOException;import java.io.InterruptedIOException;import java.net.UnknownHostException;import java.util.Iterator;import java.util.List;import javax.net.ssl.SSLException;import javax.net.ssl.SSLHandshakeException;import org.apache.http.NoHttpResponseException;import org.apache.http.client.HttpRequestRetryHandler;import org.apache.http.conn.ConnectTimeoutException;import org.apache.http.protocol.HttpContext;import com.google.common.collect.Lists;public class DefaultRequestRetryHandler implements HttpRequestRetryHandler {private int executionCount;List<Class<? extends Exception>> ignoreException;List<Class<? extends Exception>> dealException;public DefaultRequestRetryHandler() {super();Init();}@SuppressWarnings("unchecked")public DefaultRequestRetryHandler(int executionCount) {super();this.executionCount = executionCount;ignoreException = ignoreException.isEmpty()? Lists.newArrayList(ConnectTimeoutException.class,SSLException.class,UnknownHostException.class,InterruptedIOException.class,SSLHandshakeException.class): ignoreException;dealException = dealException.isEmpty()? Lists.newArrayList(NoHttpResponseException.class): dealException;}@SuppressWarnings("unchecked")public DefaultRequestRetryHandler(int executionCount, List<Class<? extends Exception>> ignoreException) {super();this.executionCount = executionCount;this.ignoreException = ignoreException;dealException = dealException.isEmpty()? Lists.newArrayList(NoHttpResponseException.class): dealException;}public DefaultRequestRetryHandler(int executionCount, List<Class<? extends Exception>> ignoreException,List<Class<? extends Exception>> dealException) {super();this.executionCount = executionCount;this.ignoreException = ignoreException;this.dealException = dealException;}@SuppressWarnings("unchecked")private void Init() {executionCount = executionCount <= 0 ? 5 : executionCount;ignoreException = (ignoreException == null ||ignoreException.isEmpty())? 
Lists.newArrayList(ConnectTimeoutException.class,SSLException.class,UnknownHostException.class,InterruptedIOException.class,SSLHandshakeException.class): ignoreException;dealException = (dealException ==null || dealException.isEmpty())? Lists.newArrayList(NoHttpResponseException.class): dealException;}@Overridepublic boolean retryRequest(IOException exception, int executionCount, HttpContext context) {if (executionCount >= this.executionCount) {return false;}for (Iterator<Class<? extends Exception>> iterator = ignoreException.iterator(); iterator.hasNext();) {Class<? extends Exception> clazz = (Class<? extends Exception>) iterator.next();if (exception.getClass().isAssignableFrom(clazz)) {return false;}}for (Iterator<Class<? extends Exception>> iterator = dealException.iterator(); iterator.hasNext();) {Class<? extends Exception> clazz = (Class<? extends Exception>) iterator.next();if (exception.getClass().isAssignableFrom(clazz)) {return true;}}exception.printStackTrace();return false;}}
2.3工具类
package com.zhb.ims.utils.httpclient;import java.awt.image.BufferedImage;import java.io.ByteArrayInputStream;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.InputStream;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Map.Entry;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.imageio.ImageIO;import org.apache.commons.io.IOUtils;import org.apache.commons.lang.StringUtils;import org.apache.http.Header;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.methods.HttpRequestBase;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.message.BasicNameValuePair;import org.apache.http.util.EntityUtils;public class ClientMethodUtils {private static final String DefaultCharSet = "utf-8";/** * 给post或者get添加header参数的泛型方法 *  * @param c *            HttpPost/HttpGet对象 * @param map *            存放header的key-value的Map * @return * @throws Exception * @throws InstantiationException */public static <T extends HttpRequestBase> T addHeader(T t, Map<String, String> map) {if (t != null && map != null && map.size() > 0) {Iterator<Entry<String, String>> iterable = map.entrySet().iterator();while (iterable.hasNext()) {try {Entry<String, String> entry = iterable.next();t.setHeader(entry.getKey(), entry.getValue());} catch (Exception e) {System.out.println(e == null ? "HttpRequestBase Add Params Error!" 
: e.getMessage());}}} else {System.out.println("Parama Is Illegal!");}return t;}/** * 给post方法添加参数 *  * @param post * @param paramsMap * @param charSet * @return */public static HttpPost addPostWithParams(HttpPost post, Map<String, String> paramsMap, String charSet) {if (post != null && paramsMap != null && paramsMap.size() > 0) {List<BasicNameValuePair> nvps = new ArrayList<>();Iterator<Entry<String, String>> iterator = paramsMap.entrySet().iterator();try {while (iterator.hasNext()) {Entry<String, String> entry = iterator.next();String key = entry.getKey();String value = entry.getValue();if (key != null) {nvps.add(new BasicNameValuePair(key, value == null ? "" : value));} else {continue;}}post.setEntity(new UrlEncodedFormEntity(nvps, charSet));} catch (Exception e) {System.out.println("Add Params Error!");}} else {System.out.println("Params Is Illegal!");}return post;}/** * 从页面中解析字体编码 * @param htmlPage * @return */private static String getCharSet(final String htmlPage) {String regex1 = "<meta.*charset=([^;^\"]*).*";String value1 = com.lhh.util.StringUtils.getRegexIndex(htmlPage, regex1, 1);if (StringUtils.isNotBlank(value1)) {return value1;}return null;}/** * 从ResponseHeader头中读取字体编码 * @param response * @return */public static String charSet(final CloseableHttpResponse response){String charSet = null;if (response != null) {Header[] headers = response.getHeaders("Content-Type");String regex = "charset=([\\s\\S]*?);{0,1}";Pattern pattern = Pattern.compile(regex);for (Header header : headers) {String value = header.getValue().toLowerCase();Matcher matcher = pattern.matcher(value);while (matcher.find()) {charSet = matcher.group(1);return charSet;}}}return charSet;}/** *  * @param client * @param httpRequestBase * @param charSet * @return */public static String getContent(CloseableHttpClient client, HttpRequestBase httpRequestBase) {String pageContent = "";if (client != null && httpRequestBase != null) {try (ByteArrayOutputStream baos = new ByteArrayOutputStream();) 
{CloseableHttpResponse response = client.execute(httpRequestBase);IOUtils.copy(response.getEntity().getContent(), baos);InputStream stream1 = new ByteArrayInputStream(baos.toByteArray());InputStream stream2 = new ByteArrayInputStream(baos.toByteArray());String htmlPage = IOUtils.toString(stream1);String charSet = charSet(response);if (StringUtils.isBlank(charSet)) {charSet = getCharSet(htmlPage);}//未解析到字体编码,使用默认的字体编码pageContent = IOUtils.toString(stream2, StringUtils.isNotBlank(charSet) ? charSet : DefaultCharSet);response.getEntity().getContent().close();EntityUtils.consume(response.getEntity());response.close();httpRequestBase.abort();stream1.close();stream2.close();stream1 = null;stream2 = null;} catch (Exception e) {System.out.println(e == null ? "Do Execute Error!" : e.getMessage());e.printStackTrace();}finally {}} else {System.out.println("Params Is Illegal!");}return pageContent;}/** * 得到验证码图片 *  * @param client * @param url * @return */public static BufferedImage getImageByNet(CloseableHttpClient client, String url, String filePath) {HttpGet get = new HttpGet(url);CloseableHttpResponse response;BufferedImage image = null;try {response = client.execute(get);InputStream is = response.getEntity().getContent();File f = new File(filePath);if (!f.exists()) {f.createNewFile();}FileOutputStream fos = new FileOutputStream(f);byte[] b = new byte[1024];int len = -1;while ((len = is.read(b)) != -1) {fos.write(b, 0, len);}response.close();get.abort();fos.close();image = ImageIO.read(f);} catch (Exception e) {e.printStackTrace();}return image;}}

2.4简单下载器

package com.lhh.request;

import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;

import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;

/** Minimal downloader: fetches the body of a URL through the shared HTTP client manager. */
public class BaseTestRequest {

    /**
     * Issues a GET request.
     *
     * @param url address to fetch
     * @return whatever {@code ClientMethodUtils.getContent} returns for the request
     */
    public static String getContent(String url) {
        final HttpGet request = new HttpGet(url);
        final CloseableHttpClient httpClient = HttpClientManger.newInstance().getClient();
        return ClientMethodUtils.getContent(httpClient, request);
    }

    /**
     * Issues a POST request without a body.
     *
     * @param url address to post to
     * @return whatever {@code ClientMethodUtils.getContent} returns for the request
     */
    public static String postContent(String url) {
        final HttpPost request = new HttpPost(url);
        final CloseableHttpClient httpClient = HttpClientManger.newInstance().getClient();
        return ClientMethodUtils.getContent(httpClient, request);
    }
}


3、解析器代码

package com.lhh.parse;import java.util.HashMap;import java.util.Iterator;import java.util.Map;import java.util.Map.Entry;import javax.xml.namespace.QName;import javax.xml.xpath.XPath;import javax.xml.xpath.XPathConstants;import javax.xml.xpath.XPathFactory;import org.apache.commons.lang.StringUtils;import org.htmlcleaner.CleanerProperties;import org.htmlcleaner.DomSerializer;import org.htmlcleaner.HtmlCleaner;import org.htmlcleaner.TagNode;import org.w3c.dom.Document;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import com.lhh.util.LoggerUtils;import com.lhh.util.ObjectUtils;public class BaseParse {public static String getNodeValue(final Object result) {if (result != null) {if (result instanceof NodeList) {final StringBuffer stringBuffer = new StringBuffer();NodeList nodeList = (NodeList) result;for (int i = 0; i < nodeList.getLength(); i++) {Node node = nodeList.item(i);stringBuffer.append(node.getNodeValue().trim().replaceAll("\n", "") + " ");}return stringBuffer.toString();}else {LoggerUtils.warn("Result Is Not A Node Or NodeList");}}else {LoggerUtils.warn("Result Is Null");}return null;}public static <T> T parseObject(Class<T> clazz,String htmlPage, Map<String, String> itemMap) throws Exception {HtmlCleaner hcCleaner = new HtmlCleaner();TagNode tagNode = hcCleaner.clean(htmlPage);Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);Iterator<Entry<String, String>> iterator = itemMap.entrySet().iterator();XPath xPath = XPathFactory.newInstance().newXPath();Map<Object, Object> resultMap = new HashMap<>();while (iterator.hasNext()) {Entry<String, String> entry = iterator.next();String key = entry.getKey();String xpathStr = entry.getValue();if (StringUtils.isNotBlank(key) && StringUtils.isNotBlank(xpathStr)) {Object result = xPath.evaluate(xpathStr, dom, XPathConstants.NODESET);resultMap.put(key, getNodeValue(result));}else {LoggerUtils.warn("Key Or Xpath Is Blank!");}}T t = clazz.newInstance();ObjectUtils.copyWithMap(t, 
resultMap);return t;}public static Object parse(String htmlPage,String xPathStr,QName qName) throws Exception{Object result = null;if (StringUtils.isNotBlank(htmlPage) && StringUtils.isNotBlank(xPathStr)) {HtmlCleaner hcCleaner = new HtmlCleaner();TagNode tagNode = hcCleaner.clean(htmlPage);Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);XPath xPath = XPathFactory.newInstance().newXPath();result = xPath.evaluate(xPathStr, dom, qName);}else {LoggerUtils.warn("Key Or Xpath Is Blank!");}return result;}}
4、用到的工具类

4.1、Logger工具类

package com.lhh.util;import java.util.concurrent.locks.Lock;import java.util.concurrent.locks.ReentrantLock;import org.apache.log4j.Logger;//import org.apache.log4j.Logger;/** *  * @author liuhang * */public class LoggerUtils {static class LoggerWapper {private Logger logger;private StackTraceElement stackTraceElement;private String methodName;private int lineNum;private Object message;private Object wapperMessage;private Class<?> clazz;public Logger getLogger() {return logger;}public void setLogger(Logger logger) {this.logger = logger;}public StackTraceElement getStackTraceElement() {return stackTraceElement;}public void setStackTraceElement(StackTraceElement stackTraceElement) {this.stackTraceElement = stackTraceElement;}public String getMethodName() {return methodName;}public void setMethodName(String methodName) {this.methodName = methodName;}public int getLineNum() {return lineNum;}public void setLineNum(int lineNum) {this.lineNum = lineNum;}public Object getMessage() {return message;}public void setMessage(Object message) {this.message = message;}public Object getWapperMessage() {return wapperMessage;}public void setWapperMessage(Object wapperMessage) {this.wapperMessage = wapperMessage;}public Class<?> getClazz() {return clazz;}public void setClazz(Class<?> clazz) {this.clazz = clazz;}public LoggerWapper(Object message) {super();this.message = message;}@Overridepublic String toString() {return "LoggerWapper [logger=" + logger + ", stackTraceElement=" + stackTraceElement + ", methodName="+ methodName + ", lineNum=" + lineNum + ", message=" + message + ", wapperMessage=" + wapperMessage+ ", clazz=" + clazz + "]";}}private static Class<?> getInvokeClass(StackTraceElement stackTraceElement) {if (stackTraceElement != null) {Class<?> clazz;try {clazz = Class.forName(stackTraceElement.getClassName());return clazz;} catch (ClassNotFoundException e) {e.printStackTrace();}}return null;}private static String getInvokeMethodName(StackTraceElement stackTraceElement) {if 
(stackTraceElement != null) {String methodName = null;methodName = stackTraceElement.getMethodName();return methodName;}return null;}private static Object msgWapper(Object message, StackTraceElement stackTraceElement) {if (stackTraceElement != null) {StringBuffer stringBuffer = new StringBuffer("");int lineNum = getInvokeLineNum(stackTraceElement);String methodName = getInvokeMethodName(stackTraceElement);Class<?> clazz = getInvokeClass(stackTraceElement);if (lineNum > 0) {stringBuffer.append(clazz.getName() + "." + methodName + "(" + clazz.getSimpleName() + ".java:" + lineNum + ")");stringBuffer.append(" -  " + message);}return stringBuffer.toString();}return message;}private static int getInvokeLineNum(StackTraceElement stackTraceElement) {int num = 0;if (stackTraceElement != null) {num = stackTraceElement.getLineNumber();}return num;}private static StackTraceElement getInvokeInfo(int num) {if (num > -1) {Lock lock = new ReentrantLock();lock.lock();StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace();lock.unlock();if (stackTraceElements != null && stackTraceElements.length > num) {StackTraceElement stackTraceElement = stackTraceElements[num];return stackTraceElement;}}return null;}private static LoggerWapper getLoggerWapper(Object message) {LoggerWapper loggerWapper = new LoggerWapper(message);StackTraceElement stackTraceElement = getInvokeInfo(4);loggerWapper.setStackTraceElement(stackTraceElement);Class<?> clazz = getInvokeClass(stackTraceElement);loggerWapper.setClazz(clazz);Logger logger = Logger.getLogger(clazz);loggerWapper.setLogger(logger);String methodName = getInvokeMethodName(stackTraceElement);loggerWapper.setMethodName(methodName);int lineNum = getInvokeLineNum(stackTraceElement);loggerWapper.setLineNum(lineNum);Object wapperMessage = msgWapper(message, loggerWapper.getStackTraceElement());loggerWapper.setWapperMessage(wapperMessage);;return loggerWapper;}public static void debug(Object message) {LoggerWapper loggerWapper = 
getLoggerWapper(message);loggerWapper.getLogger().debug(loggerWapper.getWapperMessage());}public static void debug(Object message, Throwable t) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().debug(loggerWapper.getWapperMessage(), t);}public static void error(Object message) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().error(loggerWapper.getWapperMessage());}public static void error(Object message, Throwable t) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().error(loggerWapper.getWapperMessage(), t);}public static void fatal(Object message) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage());}public static void fatal(Object message, Throwable t) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage(), t);}public static void info(Object message) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().info(loggerWapper.getWapperMessage());}public static void info(Object message, Throwable t) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().info(loggerWapper.getWapperMessage(), t);}public static void warn(Object message) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().warn(loggerWapper.getWapperMessage());}public static void warn(Object message, Throwable t) {LoggerWapper loggerWapper = getLoggerWapper(message);loggerWapper.getLogger().warn(loggerWapper.getWapperMessage(), t);}}
4.2、字符串工具类

package com.lhh.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** commons-lang {@code StringUtils} extended with a regex capture-group helper. */
public class StringUtils extends org.apache.commons.lang.StringUtils {

    /**
     * Returns capture group {@code index} of the first match of {@code regex} in {@code str}.
     *
     * @param str   text to search
     * @param regex regular expression with at least {@code index} capture groups
     * @param index 1-based capture group number
     * @return the group's text; an empty string (after a warning) if the expression has fewer
     *         than {@code index} groups; {@code null} if nothing matches, or (after a warning) if
     *         {@code str} or {@code regex} is blank or {@code index} is below 1
     */
    public static String getRegexIndex(final String str, final String regex, final int index) {
        if (isBlank(regex) || isBlank(str)) {
            LoggerUtils.warn("Str Or Regex Is Blank!");
            return null;
        }
        if (index < 1) {
            LoggerUtils.warn("Index Is Illegal!");
            return null;
        }
        final Matcher firstMatch = Pattern.compile(regex).matcher(str);
        if (!firstMatch.find()) {
            return null;
        }
        if (firstMatch.groupCount() < index) {
            LoggerUtils.warn("Index Is OutOfBounds!");
            return "";
        }
        return firstMatch.group(index);
    }
}
4.3、对象工具类

package com.lhh.util;import java.lang.reflect.Field;import java.util.Iterator;import java.util.Map;import java.util.Map.Entry;public class ObjectUtils {/** * 把r对象的所有属性拷贝到t对象中 * @param t * @param r */public static <T, R> void copy(final T t, final R r) {if (t != null && r != null) {Field[] rfields = r.getClass().getDeclaredFields();Field[] tfields = t.getClass().getDeclaredFields();L: for (Field rfield : rfields) {rfield.setAccessible(true);for (Field tfield : tfields) {if (rfield.getName().equals(tfield.getName())) {tfield.setAccessible(true);try {tfield.set(t, rfield.get(r));} catch (Exception e) {continue L;}}}}}}/** * 把map对象的key-value拷贝到t对象中 * @param t * @param r */public static <T> void copyWithMap(final T t, final Map<Object, Object> resMap) {if (t != null && resMap != null) {Field[] tfields = t.getClass().getDeclaredFields();Iterator<Entry<Object, Object>> iterator = resMap.entrySet().iterator();L: while (iterator.hasNext()) {Entry<Object, Object> entry = iterator.next();if (entry != null) {Object key = entry.getKey();Object value = entry.getValue();if (key != null && entry != null) {for (Field tfield : tfields) {if (key.toString().equals(tfield.getName())) {tfield.setAccessible(true);try {tfield.set(t, value);} catch (Exception e) {continue L;}}}}}}}}}
5、模型对象

package com.lhh.model;public class NewModel {private String title;private String content;private String time;private String source;public String getTitle() {return title;}public void setTitle(String title) {this.title = title;}public String getContent() {return content;}public void setContent(String content) {this.content = content;}public String getTime() {return time;}public void setTime(String time) {this.time = time;}public String getSource() {return source;}public void setSource(String source) {this.source = source;}@Overridepublic String toString() {return "NewModel [title=" + title + ", content=" + content + ", time=" + time + ", source=" + source + "]";}}
6、测试

package com.lhh.test;import java.util.HashMap;import java.util.Map;import javax.xml.xpath.XPathConstants;import org.apache.http.client.methods.HttpGet;import com.lhh.model.NewModel;import com.lhh.parse.BaseParse;import com.lhh.request.BaseTestRequest;import com.lhh.util.LoggerUtils;import com.zhb.ims.utils.httpclient.ClientMethodUtils;import com.zhb.ims.utils.httpclient.HttpClientManger;public class Test {public static void main(String[] args) throws Exception {//新闻列表urlString newListUrl = "http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page=1";String newListPage = BaseTestRequest.getContent(newListUrl);//获取新闻列表页面上新闻UrlString xpath = "//div[@id='d_list']/ul/li/span[@class='c_tit']/a/@href";Object result = BaseParse.parse(newListPage, xpath, XPathConstants.NODESET);String urlList = BaseParse.getNodeValue(result);String [] urlArray = urlList.split(" ");for (int i = 0; i < urlArray.length; i++) {Map<String, String> map = new HashMap<>();//配置新闻标题的xpath<span style="white-space:pre"></span>map.put("title", "//*[@id='main_title']/text() | //*[@id='artibodyTitle']/text()");<span style="white-space:pre"></span>//配置新闻发布时间的xpath<span style="white-space:pre"></span>map.put("time", "//*[@id='page-tools']/span/span[@class='titer']/text() | //*[@id='navtimeSource']/text()");<span style="white-space:pre"></span>//配置新闻正文内容的xpath<span style="white-space:pre"></span>map.put("content", "//*[@id='artibody']/p/text()");HttpGet get = new HttpGet(urlArray[i]);//使用下载器下载页面元素String page = ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), get);//调用解析取解析页面数据NewModel weatherPojo = BaseParse.parseObject(NewModel.class, page, map);LoggerUtils.error(weatherPojo.toString());}}}










1 0
原创粉丝点击