Scraping Sogou WeChat Official Accounts

Because of Sogou's anti-crawling measures, the search results on Sogou cannot be scraped directly. The first step is therefore to capture and analyze the traffic of a manual search:
I. Packet capture analysis:
1. The Sogou WeChat account search page:
[Screenshot: the Sogou WeChat account search page]

2. Capture the corresponding HTTP Referer:
[Screenshot: packet capture showing the HTTP Referer]

The HTTP Referer is part of the request header. When a browser sends a request to a web server, it usually carries a Referer telling the server which page the request was linked from, and the server can use this information in its processing. For example, if my homepage links to a friend's site, his server can count from the HTTP Referer how many visitors per day reach his site through the link on my homepage.
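
Sogou checks this header, which is why the crawler below sends it with every request. As a minimal sketch of how to set it (assuming Jsoup, which the scraping code below also appears to use for its Document/Elements types; the URL and User-Agent here are placeholders):

    import java.io.IOException;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class RefererDemo {
        public static void main(String[] args) throws IOException {
            // Send the search request with the Referer observed in the capture,
            // plus a browser-like User-Agent, so it resembles a real click-through.
            Document doc = Jsoup.connect("http://weixin.sogou.com/weixin?type=1&ie=utf8&query=%E6%B1%BD%E8%BD%A6")
                    .referrer("http://weixin.sogou.com/")
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                    .get();
            System.out.println(doc.title());
        }
    }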

3. Analyze the accessible URL obtained from the capture:

http://weixin.sogou.com/weixin?type=1
&sug_type=
&query=%E6%B1%BD%E8%BD%A6+%E4%BA%91%E5%8D%97
&ie=utf8
&sug=y
&w=01019900
&sut=2257
&sst0=1475908133010 // current timestamp (ms)
&lkt=0%2C0%2C0
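
The query value is just the keyword and city joined by "+" and URL-encoded. A quick check with the standard URLDecoder (the encoded string is the one captured above) shows it decodes to 汽车 云南, i.e. "car" plus the province Yunnan:

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    public class QueryDecode {
        public static void main(String[] args) throws UnsupportedEncodingException {
            // URLDecoder treats '+' as a space, so the captured value
            // comes back as "汽车 云南".
            String query = "%E6%B1%BD%E8%BD%A6+%E4%BA%91%E5%8D%97";
            System.out.println(URLDecoder.decode(query, "utf-8"));
        }
    }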

II. Data scraping:
Send the request with the Referer set and collect the response data. (Sogou apparently limits simulated requests by frequency; if you get blocked, access is usually restored after a while.)
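
The helper ToolkitForSpider.getHtmlDoc used below is never shown in the article. Here is a minimal sketch of what it could look like, assuming Jsoup plus a fixed delay and one retry to stay under the rate limit (the class name and method signature are taken from the usage below; the delay, retry count, and User-Agent are my assumptions):

    import java.io.IOException;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public final class ToolkitForSpider {
        private static final long DELAY_MS = 3000; // assumed pause between requests

        // Fetch a page with the Sogou Referer set; wait longer and retry once
        // if the request fails (e.g. when the crawler has been rate-limited).
        public static Document getHtmlDoc(String url) {
            for (int attempt = 0; attempt < 2; attempt++) {
                try {
                    Thread.sleep(DELAY_MS * (attempt + 1));
                    return Jsoup.connect(url)
                            .referrer("http://weixin.sogou.com/")
                            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                            .timeout(10000)
                            .get();
                } catch (IOException | InterruptedException e) {
                    e.printStackTrace();
                }
            }
            return null;
        }
    }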

Request URL parameters:

    import java.io.UnsupportedEncodingException;
    import java.net.URLEncoder;
    import java.util.Arrays;
    import java.util.List;

    private static final String OTHER_PARAMS = "&_sug_=y&w=01019900&sut=1790&lkt=0%2C0%2C0";
    private static final String BASE_URL = "http://weixin.sogou.com/weixin?type=1&ie=utf8&_sug_type_=" + OTHER_PARAMS;

    // Build one search URL per keyword/city pair and fetch it.
    private static void start() {
        String[] keyWords = {"汽车", "二手车", "车"};
        System.out.println("--- start ---");
        for (String keyWord : keyWords) {
            for (String city : getCity()) {
                try {
                    // query = "<keyword>+<city>", both parts URL-encoded.
                    String url_suffix = URLEncoder.encode(keyWord, "utf-8") + "+" + URLEncoder.encode(city, "utf-8");
                    // sst0 is the current timestamp in milliseconds (see the captured URL above).
                    String url = BASE_URL + "&query=" + url_suffix + "&sst0=" + System.currentTimeMillis();
                    System.out.println("fetch : " + url);
                    fetch(url);
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private static List<String> getCity() {
        String[] citys = new String[]{"贵州", "甘肃", "广西", "浙江", "福建", "安徽", "香港", "广东", "海南", "河北", "河南",
                "黑龙江", "重庆", "辽宁", "湖北", "湖南", "吉林", "江苏", "江西", "天津", "内蒙古", "四川", "宁夏", "青海",
                "山东", "山西", "陕西", "上海", "西藏", "北京", "新疆", "云南"};
        return Arrays.asList(citys);
    }

Fetching and storing the data:

    import java.sql.Connection;
    import java.sql.SQLException;
    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;

    private static void fetch(String url) {
        Document doc = ToolkitForSpider.getHtmlDoc(url);
        // Each result sits in a div.txt-box: h3 holds the account name, h4 span label the WeChat ID.
        Elements eles = doc.select("div div.txt-box");
        for (int i = 0; i < eles.size(); i++) {
            String name = eles.get(i).select("h3").text();
            String code = eles.get(i).select("h4 span label").text();
            Elements urlEles = doc.select("div.wx-rb.bg-blue.wx-rb_v1._item");
            String wechat_url = urlEles.get(i).attr("href");
            System.out.println(name + " " + code + " " + wechat_url);
            insert(name, code, wechat_url);
        }
        // Walk the pagination links (div.p a), skipping the last anchor (the "next page" control).
        Elements pageEles = doc.select("div.p a");
        for (int i = 0; i < pageEles.size() - 1; i++) {
            String pageUrl = "http://weixin.sogou.com/weixin" + pageEles.get(i).attr("href");
            System.out.println("fetch : " + pageUrl);
            Document pageDoc = ToolkitForSpider.getHtmlDoc(pageUrl);
            Elements pageUrlEles = pageDoc.select("div div.txt-box");
            for (int j = 0; j < pageUrlEles.size(); j++) {
                String name = pageUrlEles.get(j).select("h3").text();
                String code = pageUrlEles.get(j).select("h4 span label").text();
                // Select from pageDoc here, not from the first page's doc.
                Elements urlEles = pageDoc.select("div.wx-rb.bg-blue.wx-rb_v1._item");
                String wechat_url = urlEles.get(j).attr("href");
                System.out.println(name + " " + code + " " + wechat_url);
                insert(name, code, wechat_url);
            }
        }
    }

    // Upsert one account row; update_time is refreshed when the row already exists.
    private static void insert(String name, String code, String url) {
        Connection conn = ToolkitForSpider.getMySqlConnection();
        String sql = "insert into inf_weixin(wx_name, wx_code, biz_url, last_update, last_update_title, newrank_index, update_time) "
                + "values(?,?,?,?,?,?,now()) on duplicate key update update_time = now()";
        Object[] values = new Object[]{name, code, url, "1970-01-01", "-1", "-1"};
        try {
            DBLib.update(conn, sql, values);
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            ToolkitForSpider.close(conn);
        }
    }
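
DBLib.update is likewise not shown in the article; it presumably binds the values positionally and executes the statement. A sketch under that assumption (only the class and method names come from the usage above):

    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.SQLException;

    public final class DBLib {
        // Bind each value to its positional placeholder, then execute;
        // returns the number of affected rows.
        public static int update(Connection conn, String sql, Object[] values) throws SQLException {
            try (PreparedStatement ps = conn.prepareStatement(sql)) {
                for (int i = 0; i < values.length; i++) {
                    ps.setObject(i + 1, values[i]);
                }
                return ps.executeUpdate();
            }
        }
    }

Note that for the "on duplicate key update" clause to take effect, inf_weixin needs a unique key, most likely on wx_code.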