使用HttpClient和Jsoup定向抓取数据
来源:互联网 发布:普宁淘宝培训学校在哪 编辑:程序博客网 时间:2024/05/17 23:00
1.业务需求:
从指定外网抓点货,冷启动
2.站点分析:
.限制IP…
.需要登录……
.对登录账号有抓取频率限制……….
.允许的抓取频率过低,稍快一点就直接跳验证码页面…………..
.验证码长度、模样(纯数字&字母数字混合)TM不固定………………..
“我们能不能不抓了?“
“不行!必须得抓…”
“……”
这么说,此前写的爬虫,多线程、生产者—>消费者 并发抓取压根行不通。多线程毫无意义。
3.使用技术:
1.HttpClient
:读取指定URL网页内容
2.Jsoup
:解析所要的页面数据——省得写恶心的正则表达式
3.Swing
:绘制用户操作界面
4.Tess4J
:自动识别验证码(http://tess4j.sourceforge.net/)
5.Exe4J
:生成可独立运行的exe程序——给每人机器安装一个,大家一起监控抓~
4.实现要点:
1.代理IP
从一些网站上抓取代理IP,并检测是否可以使用,如下:
package com.ydj.zhuaqu.proxy;import java.io.IOException;import java.net.InetSocketAddress;import java.net.Socket;import java.net.UnknownHostException;import java.util.ArrayList;import java.util.Collections;import java.util.List;import java.util.Random;import java.util.concurrent.ExecutorService;import java.util.concurrent.Executors;import java.util.concurrent.ScheduledExecutorService;import java.util.concurrent.TimeUnit;import net.sf.json.JSONArray;import net.sf.json.JSONObject;import org.apache.commons.collections.map.LRUMap;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.methods.GetMethod;import org.apache.commons.httpclient.params.HttpMethodParams;import org.apache.http.HttpEntity;import org.apache.http.HttpHost;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClientBuilder;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import com.ydj.common.kit.MyLog;/**** @author : Ares.yi* @createTime : 2014-11-10 上午11:13:42 * @version : 1.0 * @description : **/public class ProxyIpPool { /**设置最多IP数*/ private static final int MAX_IP = 100; /**设置最少IP数(最好控制和外部使用线程数一致)*/ @SuppressWarnings("unused") private static final int MIN_IP = 10;// public static ConcurrentHashMap<Integer,Integer> canUseIPs = new ConcurrentHashMap<Integer,Integer>(); public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP)); private static LRUMap notCanUseIPsTemp = new LRUMap(2000); /**每次抓取IP数*/ private static final int NUM = 20; private static final String ORDER_ID = "904557733280949"; private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid="+ORDER_ID+"&num="+NUM+"&quality=1&an_ha=1&dedup=1&format=json"; private ProxyIpPool(){ } /** * 启动抓取代理IP线程 * * @author : Ares.yi * @createTime : 
2015年10月29日 下午5:58:54 */ public static void startCrawl(){ final int period = 3; ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1); scheduledExecutorService.scheduleAtFixedRate(new Runnable() { int i = 0 ; @Override public void run() { produceIP(i); i++; } }, 1, period,TimeUnit.MINUTES); } private static void produceIP(int i){ int currentSize = canUseIpList.size(); if( currentSize >= MAX_IP){ MyLog.logInfo(i+":current proxyPool size is:"+currentSize+",no need crawl new ip.NotCanUseIPsTemp size is:"+notCanUseIPsTemp.size()); return ; } JSONArray ips = getIPFromKuaiDaiLi(); produceIP(ips); MyLog.logInfo(i+":current proxyPool size is:"+canUseIpList.size()+",notCanUseIPsTemp size is:"+notCanUseIPsTemp.size()); } private static void produceIP(JSONArray ips){ if(ips == null || ips.isEmpty()){ return ; } for(int i = 0 ;i < ips.size() ;i++ ){ Object one = ips.get(i); String s[] = one.toString().split(":"); String ip = s[0]; int port = Integer.valueOf(s[1]); ProxyIp proxyIp = new ProxyIp(ip, port); if(isCanUse(ip, port)){ addIP(proxyIp); }else{ removeIP(proxyIp); } } } public static ProxyIp useOneProxyIp(){ if(canUseIpList.isEmpty()){ MyLog.logInfo(Thread.currentThread().getName()+" useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP."); try { Thread.sleep(2 * 60 * 1000); } catch (InterruptedException e) { e.printStackTrace(); } } Collections.sort(canUseIpList); ProxyIp proxyIp = canUseIpList.remove(0); proxyIp.useThis(); return proxyIp; } public static void returnProxyIp(ProxyIp proxyIp){ proxyIp.setUseing(false); canUseIpList.add(proxyIp); return ; } /** * 从快代理网站获取代理IP * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午2:36:05 */ private static JSONArray getIPFromKuaiDaiLi(){ JSONArray ips = new JSONArray(); HttpClient client = new HttpClient(); GetMethod method = new GetMethod(KDL_URL); HttpMethodParams param = method.getParams(); param.setContentCharset("UTF-8"); try { client.executeMethod(method); String res 
= method.getResponseBodyAsString(); JSONObject json = JSONObject.fromObject(res); if(json != null && json.containsKey("data")){ ips = json.getJSONObject("data").getJSONArray("proxy_list"); MyLog.logInfo(ips); } } catch (Exception e) { e.printStackTrace(); } return ips; } /** * 从更多的网站获取代理IP * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午2:46:40 */ @SuppressWarnings("unused") private static JSONArray getIPFromXXX(){ JSONArray ips = new JSONArray(); HttpClient client = new HttpClient(); GetMethod method = new GetMethod("XXX"); HttpMethodParams param = method.getParams(); param.setContentCharset("UTF-8"); try { client.executeMethod(method); String res = method.getResponseBodyAsString(); JSONObject json = JSONObject.fromObject(res); if(json != null && json.containsKey("data")){ ips = json.getJSONObject("data").getJSONArray("proxy_list"); MyLog.logInfo(ips); } } catch (Exception e) { e.printStackTrace(); } return ips; } /** * 检测代理IP是否可用 * * @param ip * @param port * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午2:37:22 */ private static boolean isCanUse(String ip,int port){ if(port < 0 ){ return false; } if(notCanUseIPsTemp.containsKey(ip)){ MyLog.logInfo(ip+":"+port+" can't use again."); return false; } if(!checkIp(ip, port)){ return false; } return checkIpUseTargetSite(ip, port); } /** * 检测代理IP是否可用 * * @param ip * @param port * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午12:35:28 */ private static boolean checkIp(String ip,int port){ Socket server = null; try { server = new Socket(); InetSocketAddress address = new InetSocketAddress(ip,port); server.connect(address, 3000); MyLog.logInfo(ip+":"+port+" is ok!"); return true; }catch (UnknownHostException e) { //e.printStackTrace(); MyLog.logInfo(ip+":"+port+" is wrong!"); } catch (IOException e) { //e.printStackTrace(); MyLog.logInfo(ip+":"+port+" is wrong!!"); } return false; } /** * 到目标网站准确检测代理IP是否可用 * * @param ip * @param port * @return * * @author : Ares.yi * @createTime : 
2015年10月29日 下午12:06:03 */ private static boolean checkIpUseTargetSite(String ip,int port){ HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); CloseableHttpClient closeableHttpClient = httpClientBuilder.build(); HttpHost proxy = new HttpHost(ip,port, "http"); RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build(); HttpGet httpGet = new HttpGet("http://www.autozi.com/partCategory.html/"); httpGet.setConfig(config); try { CloseableHttpResponse response = closeableHttpClient.execute(httpGet); HttpEntity httpentity = response.getEntity(); String html = EntityUtils.toString(httpentity, "UTF-8"); if(Jsoup.parse(html).select("div[class=header fix]").first() != null){ return true; } } catch (Exception exc){// exc.printStackTrace(); MyLog.logError(exc.getMessage()); } return false; } public static void removeIP(ProxyIp proxyIp){ canUseIpList.remove(proxyIp); notCanUseIPsTemp.put(proxyIp.getIp(),proxyIp.getPort()); } public static void addIP(ProxyIp proxyIp){ canUseIpList.add(proxyIp); notCanUseIPsTemp.remove(proxyIp.getIp()); } /** * 测试使用代理IP * * @author : Ares.yi * @createTime : 2015年10月29日 下午6:00:16 */ private static void testUseProxyIp(){ ExecutorService threadPool = Executors.newFixedThreadPool(10); for(int i=0 ;i <20 ;i++){ final int flag = i; threadPool.execute(new Runnable() { @Override public void run() { ProxyIp proxyIp = useOneProxyIp(); MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" get proxyIp is : "+proxyIp.toString()); long millis = new Random().nextInt(10) * 1000; try { Thread.sleep(millis);//每个线程随机sleep N秒,模拟线程在工作 } catch (InterruptedException e) { e.printStackTrace(); } returnProxyIp(proxyIp); MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" use proxyIp is : "+proxyIp.toString()+",work use time "+millis+" end and return to pool."); } }); } }}
使用代理IP:
/** * 使用代理获取网页内容 * * @param url * @param proxyIp * @param proxyPort * @return * @throws ParseException * @throws IOException * * @author : Ares.yi * @createTime : 2015年10月30日 上午9:55:21 */ public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException { HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); CloseableHttpClient closeableHttpClient = httpClientBuilder.build(); HttpHost proxy = new HttpHost(proxyIp,proxyPort, "http"); RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build(); HttpPost httpGet = new HttpPost(url); httpGet.setConfig(config); String html = ""; CloseableHttpResponse response = null; try { response = closeableHttpClient.execute(httpGet); }catch(Exception exc){ exc.printStackTrace(); System.out.println("get请求失败!"); return "cannot connect"; } HttpEntity httpEntity = response.getEntity(); if (httpEntity != null) { // 打印响应内容 try{ html = EntityUtils.toString(httpEntity, "UTF-8"); }catch(Exception excep){ System.out.println(url); } }else{ return "cannot connect"; } closeableHttpClient.close(); return html; }
2.模拟登录
提取登录Cookie和User-Agent:
代码片段,如下:
public static String postRequest(String url, Map<String, String> parameterMap, String charSet) throws UnsupportedEncodingException { CloseableHttpClient client = HttpClients.createDefault(); HttpPost httpPost = new HttpPost(url); UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(parameterMap), charSet); httpPost.setEntity(postEntity); httpPost.addHeader("HOST", "sec.1688.com"); httpPost.addHeader("User-Agent", Constant.userAgent); httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); httpPost.addHeader("Cookie", Constant.cookie); MyLog.logInfo("request line:" + httpPost.getRequestLine()); try { // 执行post请求 HttpResponse httpResponse = client.execute(httpPost); Header header = httpResponse.getFirstHeader("Location"); if (header != null && Toolbox.isNotEmpty(header.getValue())) { MyLog.logInfo("location:" + header.getValue()); return "SUCCESS"; } else { String html = printResponse(httpResponse); return html; } } catch (IOException e) { e.printStackTrace(); } finally { try { client.close(); } catch (IOException e) { } } return ""; }
3.验证码
获取输入验证码页面信息:
/**
 * Extracts the hidden form fields and the captcha session id from a
 * verification-code page.
 *
 * @param url               passed through unchanged as the last constructor argument
 * @param checkCodePageHtml HTML of the verification-code page
 * @return the extracted form data, or null if {@code checkCodePageHtml} is empty.
 *         Fields missing from the page come back as empty strings, including the
 *         session id when the captcha image or its {@code sessionid} parameter is
 *         absent.
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url,String checkCodePageHtml){
    if (Toolbox.isEmptyString(checkCodePageHtml)) {
        return null;
    }

    Document doc = Jsoup.parse(checkCodePageHtml);

    String action = doc.select("input[name=action]").attr("value");
    String event_submit_do_query = doc.select("input[name=event_submit_do_query]").attr("value");
    String smPolicy = doc.select("input[name=smPolicy]").attr("value");
    String smReturn = doc.select("input[name=smReturn]").attr("value");
    String smApp = doc.select("input[name=smApp]").attr("value");
    String smCharset = doc.select("input[name=smCharset]").attr("value");
    String smTag = doc.select("input[name=smTag]").attr("value");
    String smSign = doc.select("input[name=smSign]").attr("value");
    String identity = doc.select("input[name=identity]").attr("value");
    String captcha = doc.select("input[name=captcha]").attr("value");

    // The session id is a query parameter of the captcha image URL. The end of the
    // value is searched for only after its start, and a missing parameter or a
    // missing trailing '&' no longer throws StringIndexOutOfBoundsException.
    String sessionKey = "sessionid=";
    String imgSrc = doc.select("img[id=checkcodeImg]").attr("src");
    String sessionid = "";
    int valueStart = imgSrc.indexOf(sessionKey);
    if (valueStart >= 0) {
        valueStart += sessionKey.length();
        int valueEnd = imgSrc.indexOf('&', valueStart);
        sessionid = valueEnd >= 0 ? imgSrc.substring(valueStart, valueEnd) : imgSrc.substring(valueStart);
    }

    return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset, smTag, smSign, identity, captcha, sessionid,url);
}
提交验证码:
/**
 * Submits a verification code together with the form fields stored in
 * {@code Constant.ali1688CheckCodeFormData}.
 *
 * @param checkcode the verification code to submit
 * @return {@code "SUCCESS"} if the post reports success; otherwise an empty
 *         string. On failure the stored form data is replaced by the data parsed
 *         from the response, unless nothing could be parsed, in which case the
 *         previous data is kept. An empty string is also returned when no form
 *         data is stored.
 * @throws UnsupportedEncodingException if UTF-8 encoding is unavailable
 * @throws IOException                  declared by the original signature
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException{
    // Read the shared form data once so that all fields come from the same snapshot.
    Ali1688CheckCodeFormData formData = Constant.ali1688CheckCodeFormData;
    if (formData == null) {
        return "";
    }

    String smApp = formData.getSmApp();
    String smPolicy = formData.getSmPolicy();
    String smCharset = formData.getSmCharset();
    String smTag = formData.getSmTag();
    String smReturn = formData.getSmReturn();
    String smSign = formData.getSmSign();

    // Only the values are URL-encoded. Encoding the assembled query string would
    // also escape '=' and '&' and collapse all parameters into one opaque token.
    String[][] queryPairs = {
        {"smApp", smApp},
        {"smPolicy", smPolicy},
        {"smCharset", smCharset},
        {"smTag", smTag},
        {"smReturn", smReturn},
        {"smSign", smSign}
    };
    StringBuilder query = new StringBuilder();
    for (String[] pair : queryPairs) {
        if (query.length() > 0) {
            query.append('&');
        }
        String value = pair[1] == null ? "" : pair[1];
        query.append(pair[0]).append('=').append(java.net.URLEncoder.encode(value, "UTF-8"));
    }

    String formAction = "https://sec.1688.com/query.htm?" + query;

    Map<String,String> parameterMap = new HashMap<String,String>();
    parameterMap.put("action", formData.getAction());
    parameterMap.put("event_submit_do_query", formData.getEvent_submit_do_query());
    parameterMap.put("smPolicy", smPolicy);
    parameterMap.put("smReturn", smReturn);
    parameterMap.put("smApp", smApp);
    parameterMap.put("smCharset", smCharset);
    parameterMap.put("smTag", smTag);
    parameterMap.put("smSign", smSign);
    parameterMap.put("identity", formData.getIdentity());
    parameterMap.put("captcha", formData.getCaptcha());
    parameterMap.put("checkcode", checkcode);

    String res = HttpKit.postRequest(formAction, parameterMap, "UTF-8");

    if (Toolbox.isNotEmpty(res) && "SUCCESS".equals(res)) {
        return "SUCCESS";
    }

    // The response is treated as a fresh verification page. The stored form data
    // is kept when nothing could be parsed, so that a later retry still has
    // fields to send.
    Ali1688CheckCodeFormData refreshed = getCheckCodeFormData(smReturn, res);
    if (refreshed != null) {
        Constant.ali1688CheckCodeFormData = refreshed;
    }

    return "";
}
4.exe4j操作:
5.部分界面:
6.源码:
https://github.com/Aresyi/simpleSpider
阅读全文
0 0
- 使用HttpClient和Jsoup定向抓取数据
- 使用HttpClient和Jsoup进行简单数据抓取、解析
- httpclient+jsoup抓取数据
- HttpClient + Jsoup 网页数据抓取
- 利用HttpClient和Jsoup实现从网站中抓取数据
- HttpClient+jsoup实现网页数据抓取和处理
- 使用Jsoup抓取数据
- httpClient及jsoup抓取解析网页数据
- Jsoup学习-使用Jsoup抓取页面数据
- 使用JSOUP抓取页面数据
- 使用java开源工具httpClient及jsoup抓取解析网页数据
- 使用java开源工具httpClient及jsoup抓取解析网页数据
- 使用java开源工具httpClient及jsoup抓取解析网页数据
- 使用httpclient及jsoup从oj上抓取题目信息
- HttpClient+Jsoup 抓取网页信息
- 网页数据抓取——使用jsoup
- 使用Jsoup进行网页数据抓取
- 使用Jsoup 抓取页面的数据
- Linux系统--时间戳计数器TSC
- hexo —— 简单、快速、强大的Node.js静态博客框架
- R语言绘制股票K线图
- linux采用scp命令拷贝文件到本地,拷贝本地文件到远程服务器
- 从零开始学习Java Web(七):认识8大Listener
- 使用HttpClient和Jsoup定向抓取数据
- ant工具
- SQL优化
- Java基础应用之条件语句(判断用户输入月份的季节)
- Jquery 事件篇
- Digital Count数位统计
- php 面试题整理
- 简单的存储过程案例
- 睡眠不足大脑将被吃掉