使用HttpClient和Jsoup定向抓取数据

来源:互联网 发布:普宁淘宝培训学校在哪 编辑:程序博客网 时间:2024/05/17 23:00

1.业务需求:

从指定外网抓点货,冷启动

2.站点分析:

.限制IP…
.需要登录……
.对登录账号有抓取频率限制……….
.允许的抓取频率过低,稍一超限就直接跳验证码页面…………..
.验证码长度、模样(纯数字&字母数字混合)TM不固定………………..

“我们能不能不抓了?”
“不行!必须得抓…”
“……”

这么说,此前写的爬虫,多线程、生产者—>消费者 并发抓取压根行不通。多线程毫无意义。

3.使用技术:

1.HttpClient:读取指定URL网页内容
2.Jsoup:解析所要的页面数据——省得写恶心的正则表达式
3.Swing:绘制用户操作界面
4.Tess4J:自动识别验证码(http://tess4j.sourceforge.net/)
5.Exe4J:生成可独立运行的exe程序——给每人机器安装一个,大家一起监控抓~

4.实现要点:

1.代理IP
从一些网站上抓取代理IP,并检测是否可以使用,如下:

package com.ydj.zhuaqu.proxy;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;

import com.ydj.common.kit.MyLog;

/**
 * Static pool of HTTP proxy addresses.
 *
 * <p>A scheduled background task periodically downloads candidate proxies,
 * keeps those that pass a TCP connect check and a fetch of the target site,
 * and remembers rejected addresses in a bounded LRU map so they are not
 * re-tested. Worker threads borrow a proxy with {@link #useOneProxyIp()} and
 * hand it back with {@link #returnProxyIp(ProxyIp)}.
 *
 * @author : Ares.yi
 * @createTime : 2014-11-10 11:13:42
 * @version : 1.0
 */
public class ProxyIpPool {

    /** Once the pool holds this many proxies, a crawl round fetches nothing. */
    private static final int MAX_IP = 100;

    /** Minimum pool size (ideally equal to the number of consumer threads); currently unused. */
    @SuppressWarnings("unused")
    private static final int MIN_IP = 10;

    /** Proxies that are currently available for borrowing. */
    public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP));

    /**
     * Recently rejected proxies (ip -> port), bounded to 2000 entries.
     * LRUMap is not thread-safe, so every access synchronizes on the map itself.
     */
    private static final LRUMap notCanUseIPsTemp = new LRUMap(2000);

    /** Number of proxies requested per crawl round. */
    private static final int NUM = 20;

    // NOTE(review): the order id is a credential committed to source; consider loading it from configuration.
    private static final String ORDER_ID = "904557733280949";

    private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid="+ORDER_ID+"&num="+NUM+"&quality=1&an_ha=1&dedup=1&format=json";

    /** How long a borrower sleeps between checks while the pool is empty. */
    private static final long EMPTY_POOL_WAIT_MILLIS = 2 * 60 * 1000L;

    private ProxyIpPool() {
    }

    /**
     * Starts the background task that refills the pool: first run after one
     * minute, then every three minutes.
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 17:58:54
     */
    public static void startCrawl() {
        final int period = 3;
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        scheduledExecutorService.scheduleAtFixedRate(new Runnable() {
            int i = 0;

            @Override
            public void run() {
                try {
                    produceIP(i);
                } catch (RuntimeException e) {
                    // scheduleAtFixedRate cancels all future runs if one run throws,
                    // so a single bad round must never escape this method.
                    MyLog.logError("produceIP round " + i + " failed: " + e);
                } finally {
                    i++;
                }
            }
        }, 1, period, TimeUnit.MINUTES);
    }

    /**
     * Runs one crawl round unless the pool is already full.
     *
     * @param i sequence number of the round, used only in log output
     */
    private static void produceIP(int i) {
        int currentSize = canUseIpList.size();
        if (currentSize >= MAX_IP) {
            MyLog.logInfo(i+":current proxyPool size is:"+currentSize+",no need crawl new ip.NotCanUseIPsTemp size is:"+rejectedCount());
            return;
        }

        JSONArray ips = getIPFromKuaiDaiLi();
        produceIP(ips);

        MyLog.logInfo(i+":current proxyPool size is:"+canUseIpList.size()+",notCanUseIPsTemp size is:"+rejectedCount());
    }

    /**
     * Validates each {@code "ip:port"} entry and adds it to, or removes it from, the pool.
     * Entries that are not of that form are logged and skipped.
     *
     * @param ips candidate proxies; may be {@code null} or empty
     */
    private static void produceIP(JSONArray ips) {
        if (ips == null || ips.isEmpty()) {
            return;
        }

        for (int i = 0; i < ips.size(); i++) {
            String entry = String.valueOf(ips.get(i));
            String[] parts = entry.split(":");
            if (parts.length < 2) {
                MyLog.logInfo("skip malformed proxy entry: " + entry);
                continue;
            }

            String ip = parts[0].trim();
            int port;
            try {
                port = Integer.parseInt(parts[1].trim());
            } catch (NumberFormatException e) {
                MyLog.logInfo("skip malformed proxy entry: " + entry);
                continue;
            }

            ProxyIp proxyIp = new ProxyIp(ip, port);
            if (isCanUse(ip, port)) {
                addIP(proxyIp);
            } else {
                removeIP(proxyIp);
            }
        }
    }

    /**
     * Borrows the first proxy of the sorted pool, blocking while the pool is empty.
     *
     * <p>The pool is re-checked every two minutes until a proxy is available.
     * Sorting and removal happen under the list's own lock so that two
     * borrowers cannot race for the same element. The sort order is whatever
     * {@code ProxyIp}'s natural ordering defines (not visible in this file).
     *
     * @return a proxy on which {@code useThis()} has been called
     * @throws IllegalStateException if the calling thread is interrupted while
     *         waiting; the interrupt flag is restored before throwing
     */
    public static ProxyIp useOneProxyIp() {
        final List<ProxyIp> pool = canUseIpList;

        while (true) {
            ProxyIp proxyIp = null;
            synchronized (pool) {
                if (!pool.isEmpty()) {
                    Collections.sort(pool);
                    proxyIp = pool.remove(0);
                }
            }
            if (proxyIp != null) {
                proxyIp.useThis();
                return proxyIp;
            }

            MyLog.logInfo(Thread.currentThread().getName()+" useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP.");
            try {
                Thread.sleep(EMPTY_POOL_WAIT_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("interrupted while waiting for a proxy IP", e);
            }
        }
    }

    /**
     * Returns a borrowed proxy to the pool and clears its in-use flag.
     *
     * @param proxyIp the proxy obtained from {@link #useOneProxyIp()}
     */
    public static void returnProxyIp(ProxyIp proxyIp) {
        proxyIp.setUseing(false);
        canUseIpList.add(proxyIp);
    }

    /**
     * Downloads a batch of candidate proxies from the kuaidaili API.
     *
     * @return the {@code data.proxy_list} array of the response, or an empty array on any failure
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:36:05
     */
    private static JSONArray getIPFromKuaiDaiLi() {
        return fetchProxyList(KDL_URL);
    }

    /**
     * Placeholder for additional proxy sources; expects the same JSON layout as the kuaidaili API.
     *
     * @return the {@code data.proxy_list} array of the response, or an empty array on any failure
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:46:40
     */
    @SuppressWarnings("unused")
    private static JSONArray getIPFromXXX() {
        return fetchProxyList("XXX");
    }

    /**
     * GETs {@code url} and extracts {@code data.proxy_list} from the JSON body.
     *
     * @param url address of a proxy-list API
     * @return the proxy list, or an empty array if the request or parsing fails
     */
    private static JSONArray fetchProxyList(String url) {
        JSONArray ips = new JSONArray();

        HttpClient client = new HttpClient();
        GetMethod method = new GetMethod(url);
        HttpMethodParams param = method.getParams();
        param.setContentCharset("UTF-8");

        try {
            client.executeMethod(method);
            String res = method.getResponseBodyAsString();

            JSONObject json = JSONObject.fromObject(res);
            if (json != null && json.containsKey("data")) {
                ips = json.getJSONObject("data").getJSONArray("proxy_list");
                MyLog.logInfo(ips);
            }
        } catch (Exception e) {
            // Best effort: a failed round simply yields no candidates.
            MyLog.logError("fetch proxy list failed: " + e);
        } finally {
            // Hand the connection back to the connection manager.
            method.releaseConnection();
        }

        return ips;
    }

    /**
     * Decides whether a proxy is usable: valid port, not recently rejected,
     * reachable over TCP, and able to fetch the target site.
     *
     * @param ip proxy host
     * @param port proxy port
     * @return {@code true} if every check passes
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:37:22
     */
    private static boolean isCanUse(String ip, int port) {
        // InetSocketAddress rejects ports outside 0-65535 with an unchecked exception,
        // and port 0 is not a connectable port.
        if (port <= 0 || port > 65535) {
            return false;
        }

        boolean rejectedBefore;
        synchronized (notCanUseIPsTemp) {
            rejectedBefore = notCanUseIPsTemp.containsKey(ip);
        }
        if (rejectedBefore) {
            MyLog.logInfo(ip+":"+port+" can't use again.");
            return false;
        }

        if (!checkIp(ip, port)) {
            return false;
        }

        return checkIpUseTargetSite(ip, port);
    }

    /**
     * Checks that a TCP connection to the proxy can be opened within three seconds.
     *
     * @param ip proxy host
     * @param port proxy port
     * @return {@code true} if the connection succeeded
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 12:35:28
     */
    private static boolean checkIp(String ip, int port) {
        Socket server = new Socket();
        try {
            InetSocketAddress address = new InetSocketAddress(ip, port);
            server.connect(address, 3000);

            MyLog.logInfo(ip+":"+port+" is ok!");
            return true;
        } catch (UnknownHostException e) {
            MyLog.logInfo(ip+":"+port+" is wrong!");
        } catch (IOException e) {
            MyLog.logInfo(ip+":"+port+" is wrong!!");
        } finally {
            try {
                server.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a probe socket fails.
            }
        }
        return false;
    }

    /**
     * Fetches a page of the target site through the proxy and checks that the
     * expected header element is present in the returned HTML.
     *
     * @param ip proxy host
     * @param port proxy port
     * @return {@code true} if the page was fetched and contains the expected element
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 12:06:03
     */
    private static boolean checkIpUseTargetSite(String ip, int port) {
        HttpHost proxy = new HttpHost(ip, port, "http");
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(3000)
                .setSocketTimeout(3000)
                .setProxy(proxy)
                .build();
        HttpGet httpGet = new HttpGet("http://www.autozi.com/partCategory.html/");
        httpGet.setConfig(config);

        CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
        CloseableHttpResponse response = null;
        try {
            response = closeableHttpClient.execute(httpGet);
            HttpEntity httpentity = response.getEntity();
            String html = EntityUtils.toString(httpentity, "UTF-8");

            return Jsoup.parse(html).select("div[class=header fix]").first() != null;
        } catch (Exception exc) {
            MyLog.logError(exc.getMessage());
            return false;
        } finally {
            closeQuietly(response);
            closeQuietly(closeableHttpClient);
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Closing is best effort; the check result has already been decided.
        }
    }

    /** Number of entries currently held in the rejected-proxy map. */
    private static int rejectedCount() {
        synchronized (notCanUseIPsTemp) {
            return notCanUseIPsTemp.size();
        }
    }

    /**
     * Removes a proxy from the pool and records it as rejected.
     *
     * @param proxyIp the proxy to drop
     */
    public static void removeIP(ProxyIp proxyIp) {
        canUseIpList.remove(proxyIp);
        synchronized (notCanUseIPsTemp) {
            notCanUseIPsTemp.put(proxyIp.getIp(), proxyIp.getPort());
        }
    }

    /**
     * Adds a proxy to the pool and clears any rejected record for its address.
     *
     * @param proxyIp the proxy to add
     */
    public static void addIP(ProxyIp proxyIp) {
        canUseIpList.add(proxyIp);
        synchronized (notCanUseIPsTemp) {
            notCanUseIPsTemp.remove(proxyIp.getIp());
        }
    }

    /**
     * Manual smoke test: 20 jobs on 10 threads each borrow a proxy, sleep a
     * random 0-9 seconds to simulate work, and return it.
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 18:00:16
     */
    private static void testUseProxyIp() {
        ExecutorService threadPool = Executors.newFixedThreadPool(10);

        for (int i = 0; i < 20; i++) {
            final int flag = i;

            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    ProxyIp proxyIp = useOneProxyIp();

                    MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" get proxyIp is : "+proxyIp.toString());

                    long millis = new Random().nextInt(10) * 1000;
                    try {
                        Thread.sleep(millis); // simulate the thread doing work
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }

                    returnProxyIp(proxyIp);

                    MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" use proxyIp is : "+proxyIp.toString()+",work use time "+millis+" end and return to pool.");
                }
            });
        }

        // Let the queued jobs finish, then allow the worker threads to exit.
        threadPool.shutdown();
    }
}

使用代理IP:

    /**     * 使用代理获取网页内容     *      * @param url     * @param proxyIp     * @param proxyPort     * @return     * @throws ParseException     * @throws IOException     *     * @author : Ares.yi     * @createTime : 2015年10月30日 上午9:55:21     */    public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException {        HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();        CloseableHttpClient closeableHttpClient = httpClientBuilder.build();        HttpHost proxy = new HttpHost(proxyIp,proxyPort, "http");        RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();        HttpPost httpGet = new HttpPost(url);        httpGet.setConfig(config);        String html = "";        CloseableHttpResponse response = null;        try {            response = closeableHttpClient.execute(httpGet);        }catch(Exception exc){            exc.printStackTrace();            System.out.println("get请求失败!");            return "cannot connect";        }        HttpEntity httpEntity = response.getEntity();        if (httpEntity != null) {            // 打印响应内容            try{                html =  EntityUtils.toString(httpEntity, "UTF-8");            }catch(Exception excep){                System.out.println(url);            }           }else{            return "cannot connect";        }        closeableHttpClient.close();        return html;    }

2.模拟登录
提取登录Cookie和User-Agent:
这里写图片描述

代码片段,如下:

public static String postRequest(String url,            Map<String, String> parameterMap, String charSet)            throws UnsupportedEncodingException {        CloseableHttpClient client = HttpClients.createDefault();        HttpPost httpPost = new HttpPost(url);        UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(parameterMap), charSet);        httpPost.setEntity(postEntity);        httpPost.addHeader("HOST", "sec.1688.com");        httpPost.addHeader("User-Agent", Constant.userAgent);        httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");        httpPost.addHeader("Cookie", Constant.cookie);        MyLog.logInfo("request line:" + httpPost.getRequestLine());        try {            // 执行post请求            HttpResponse httpResponse = client.execute(httpPost);            Header header = httpResponse.getFirstHeader("Location");            if (header != null && Toolbox.isNotEmpty(header.getValue())) {                MyLog.logInfo("location:" + header.getValue());                return "SUCCESS";            } else {                String html = printResponse(httpResponse);                return html;            }        } catch (IOException e) {            e.printStackTrace();        } finally {            try {                client.close();            } catch (IOException e) {            }        }        return "";    }

3.验证码
获取输入验证码页面信息:

/**
 * Extracts the hidden form fields and the captcha session id from a
 * verification-code page.
 *
 * <p>Fields that are missing from the page come back as empty strings, which
 * is what Jsoup's {@code attr} returns for an absent element or attribute.
 * The session id follows the same convention.
 *
 * @param url stored unchanged in the returned form data
 * @param checkCodePageHtml HTML of the verification-code page
 * @return the parsed form data, or {@code null} if {@code checkCodePageHtml} is empty
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url,String checkCodePageHtml){
    if (Toolbox.isEmptyString(checkCodePageHtml)) {
        return null;
    }

    Document doc = Jsoup.parse(checkCodePageHtml);

    String action = doc.select("input[name=action]").attr("value");
    String event_submit_do_query = doc.select("input[name=event_submit_do_query]").attr("value");
    String smPolicy = doc.select("input[name=smPolicy]").attr("value");
    String smReturn = doc.select("input[name=smReturn]").attr("value");
    String smApp = doc.select("input[name=smApp]").attr("value");
    String smCharset = doc.select("input[name=smCharset]").attr("value");
    String smTag = doc.select("input[name=smTag]").attr("value");
    String smSign = doc.select("input[name=smSign]").attr("value");
    String identity = doc.select("input[name=identity]").attr("value");
    String captcha = doc.select("input[name=captcha]").attr("value");

    String captchaSrc = doc.select("img[id=checkcodeImg]").attr("src");
    String sessionid = sessionIdFromCaptchaSrc(captchaSrc);

    return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset, smTag, smSign, identity, captcha, sessionid,url);
}

/**
 * Returns the value of the {@code sessionid} query parameter in {@code src}.
 *
 * <p>The value ends at the first {@code &} after the parameter, or at the end
 * of the string when it is the last parameter.
 *
 * @param src the captcha image URL; may be empty
 * @return the session id, or an empty string if the parameter is absent
 */
private static String sessionIdFromCaptchaSrc(String src) {
    final String key = "sessionid=";
    if (src == null) {
        return "";
    }
    int keyAt = src.indexOf(key);
    if (keyAt < 0) {
        return "";
    }
    int from = keyAt + key.length();
    // Search from the value start so an earlier '&' in the URL cannot be picked up.
    int to = src.indexOf('&', from);
    return to < 0 ? src.substring(from) : src.substring(from, to);
}

提交验证码:

/**
 * Submits a verification code together with the form fields captured from
 * the last verification-code page.
 *
 * <p>On failure the response is parsed as a new verification-code page and
 * stored in {@code Constant.ali1688CheckCodeFormData} for the next attempt.
 *
 * @param checkcode the code entered or recognised for the current captcha
 * @return {@code "SUCCESS"} if the post succeeded, otherwise an empty string
 * @throws IllegalStateException if no form data has been captured yet
 * @throws UnsupportedEncodingException if a query value cannot be encoded
 * @throws IOException declared for compatibility with existing callers
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException{
    Ali1688CheckCodeFormData formData = Constant.ali1688CheckCodeFormData;
    if (formData == null) {
        throw new IllegalStateException("no check-code form data loaded; parse a check-code page first");
    }

    String smApp = formData.getSmApp();
    String smPolicy = formData.getSmPolicy();
    String smCharset = formData.getSmCharset();
    String smTag = formData.getSmTag();
    String smReturn = formData.getSmReturn();
    String smSign = formData.getSmSign();

    // Encode each value on its own. Encoding the assembled string would also
    // escape the '=' and '&' separators and collapse the query into one token.
    String query = "smApp=" + encodeQueryValue(smApp)
            + "&smPolicy=" + encodeQueryValue(smPolicy)
            + "&smCharset=" + encodeQueryValue(smCharset)
            + "&smTag=" + encodeQueryValue(smTag)
            + "&smReturn=" + encodeQueryValue(smReturn)
            + "&smSign=" + encodeQueryValue(smSign);

    String formAction = "https://sec.1688.com/query.htm?" + query;

    Map<String,String> parameterMap = new HashMap<String,String>();
    parameterMap.put("action", formData.getAction());
    parameterMap.put("event_submit_do_query", formData.getEvent_submit_do_query());
    parameterMap.put("smPolicy", smPolicy);
    parameterMap.put("smReturn", smReturn);
    parameterMap.put("smApp", smApp);
    parameterMap.put("smCharset", smCharset);
    parameterMap.put("smTag", smTag);
    parameterMap.put("smSign", smSign);
    parameterMap.put("identity", formData.getIdentity());
    parameterMap.put("captcha", formData.getCaptcha());
    parameterMap.put("checkcode", checkcode);

    String res = HttpKit.postRequest(formAction, parameterMap,  "UTF-8");

    if (Toolbox.isNotEmpty(res) && "SUCCESS".equals(res)) {
        return "SUCCESS";
    }

    // Not accepted: the response is expected to be a fresh check-code page.
    Constant.ali1688CheckCodeFormData = getCheckCodeFormData(smReturn, res);
    return "";
}

/** URL-encodes a single query-string value as UTF-8, treating {@code null} as empty. */
private static String encodeQueryValue(String value) throws UnsupportedEncodingException {
    return java.net.URLEncoder.encode(value == null ? "" : value, "utf-8");
}

4.exe4j操作:
这里写图片描述

5.部分界面:

这里写图片描述
这里写图片描述
这里写图片描述

6.源码:

https://github.com/Aresyi/simpleSpider

原创粉丝点击