【网络爬虫】HttpClient抓取+解析+存储数据

来源:互联网 发布:足球战术 软件 编辑:程序博客网 时间:2024/06/02 05:30

前面已经使用HttpClient完成了数据抓取(http://blog.csdn.net/tsj11514oo/article/details/71023314),现在要对抓取到的数据进行解析和存储,实现一整套流程:抓取——解析——存储。本文的重点是工具类的封装。

结构:

这里写图片描述

代码:

(1)封装转JSON工具类:

```    package com.tsj.simple.utils;       import java.util.Iterator;      import org.json.simple.JSONArray;    import org.json.simple.JSONObject;    import org.json.simple.JSONValue;       /**     * json解析工具类     * @author tsj-pc     */    public class JsonOperatorUtil {            public static JSONObject toJSONObject(String str) {                    return (JSONObject) JSONValue.parse(str);            }               public static JSONArray toJSONArray(String str) {                    return (JSONArray) JSONValue.parse(str);            }               public static void main(String[] args) {                    String str = "[{\"one\":1,\"two\":\"2\"}]";    //      JSONObject jsonObject = JsonOperatorUtil.toJSONObject(str);            JSONArray jsonObject = JsonOperatorUtil.toJSONArray(str);                       Iterator<JSONObject> iterator=jsonObject.iterator();            while(iterator.hasNext()){                    System.out.println(iterator.next());            }            }    }

(2)测试转JSON工具类

这里写图片描述

(3)封装文件IO工具类

package com.tsj.simple.utils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * File output helper.
 *
 * @author tsj-pc
 */
public class IOUtil {

    /**
     * Writes {@code value} to the file at {@code filePath}, encoding the text
     * with the named charset.
     *
     * <p>Failures are not propagated: any exception raised while opening,
     * encoding or writing is printed with {@code printStackTrace()} and the
     * method returns normally.
     *
     * @param filePath path of the file to write
     * @param value    text to write
     * @param encoding charset name passed to {@link String#getBytes(String)}
     */
    public static void writeFile(String filePath, String value, String encoding) {
        FileOutputStream out = null;
        try {
            File target = new File(filePath);
            // The stream is opened before the text is encoded, so the file is
            // created even when the encoding name turns out to be invalid.
            out = new FileOutputStream(target);
            byte[] payload = value.getBytes(encoding);
            out.write(payload);
            out.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeAndReport(out);
        }
    }

    /** Closes {@code stream} if it was opened, printing any close failure. */
    private static void closeAndReport(FileOutputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Small demo: writes a short string to {@code test.txt}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        IOUtil.writeFile("test.txt", "hello world,123", "utf-8");
        System.out.println("done!");
    }
}

(4)测试文件IO工具类

这里写图片描述

(5)使用文件IO工具类和转JSON工具类

package com.tsj.simple.manager;import java.util.HashMap;import java.util.HashSet;import java.util.Map;import java.util.Set;import javax.swing.text.AbstractDocument.Content;import org.json.simple.JSONArray;import org.json.simple.JSONObject;import com.tsj.simple.impl.crawl.HttpClientCrawlerImpl;import com.tsj.simple.pojos.CrawlResultPojo;import com.tsj.simple.pojos.UrlPojo;import com.tsj.simple.utils.IOUtil;import com.tsj.simple.utils.JsonOperatorUtil;/** * 网易贷抓取管理器 * @author tsj-pc * */public class WangYiDaiCrawlManager {    public static HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl();    public static String[] column_key = { "platName", "locationAreaName",        "locationCityName", "platUrl" };    private static CrawlResultPojo crawlOnePage(UrlPojo urlPojo) {        CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo);        return resultPojo;    }    public static int item_count = 0;    public static String parserOnePage(String jsonStr) {        // 解析该json        JSONObject jsonObj = JsonOperatorUtil.toJSONObject(jsonStr);        JSONArray jsonArray = JsonOperatorUtil.toJSONArray(jsonObj.get("list")                .toString());        StringBuilder stringBuilder = new StringBuilder();        for (Object json : jsonArray) {            JSONObject itemJson = (JSONObject) json;            for (String column : column_key) {                stringBuilder.append(itemJson.get(column) + "\t");            }            stringBuilder.append("\n");            item_count++;        }        return stringBuilder.toString();    }    public static void processWangYiDai(String url, int max_page_number,            String filePath) {        // 存储所有的抓取条目        StringBuilder all_items = new StringBuilder();        UrlPojo urlPojo = new UrlPojo(url);        Map<String, Object> parasMap = new HashMap<String, Object>();        int have_download_page_count = 0;        Set<String> uniqSet = new HashSet<String>();        for (int pageNumber = 1; 
pageNumber <= max_page_number; pageNumber++) {            parasMap.put("currPage", pageNumber);            parasMap.put("params", "");            parasMap.put("sort", 0);            urlPojo.setParasMap(parasMap);            CrawlResultPojo resultPojo = crawlOnePage(urlPojo);            if (uniqSet.contains(resultPojo.getPageContent())) {                System.out.println("碰到重复,代表已抓取完成!");                break;            } else {                uniqSet.add(resultPojo.getPageContent());            }            if (resultPojo != null) {                String content = resultPojo.getPageContent();                String page_items = parserOnePage(content);                all_items.append(page_items);                have_download_page_count++;            }        }        System.out.println("all items size---" + item_count);        System.out.println("已经下载了---" + have_download_page_count);        IOUtil.writeFile(filePath, all_items.toString(), "utf-8");        System.out.println("save successfully~");    }    public static void main(String[] args) {        String url = "http://www.wangdaizhijia.com/front_select-plat";        int max_page_number = 100;        String fileName = "网易贷_数据集.txt";        processWangYiDai(url, max_page_number, fileName);        System.out.println("done!");    }   }

(6)测试文件IO工具类和转JSON工具类的综合使用情况

这里写图片描述

0 0
原创粉丝点击