数据爬取:爬取淘宝及国美在线搜索建议词

来源：互联网　发布：网络诈骗300元怎么办　编辑：程序博客网　时间：2024/04/28 07:37

分为两种形式的抓取:

1. 基于首字母的四轮次抓取，如：a、aa、aaa、aaaa

2. 基于汉语拼音的三轮次抓取（附拼音表）：

链接：http://pan.baidu.com/s/1eS5Kdmq 密码：n9pb

使用的框架为webmagic

淘宝爬取:

public class TaobaoPinyinSuggestWordPageProcessor implements PageProcessor{    // 部分一：抓取网站的相关配置，包括编码、抓取间隔、重试次数等    private Site site = Site.            me().            setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss").            setRetryTimes(5).setSleepTime(10);    @Override    // process是定制爬虫逻辑的核心接口，在这里编写抽取逻辑    public void process(Page page) {        //["童装","40827952"]        JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString());        for (Object aa : a) {            String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", "");                        String substring = replace.substring(0,replace.indexOf(","));            try {                IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true));                IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true));            } catch (FileNotFoundException e) {                e.printStackTrace();            } catch (IOException e) {                e.printStackTrace();            }        }        if(a.size()>9){            String url = page.getUrl().toString();            //https://suggest.taobao.com/sug?code=utf-8&q=            int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length();            String query = url.substring(index, url.length());            char[] chars = query.toCharArray();            int num = 0;            for (char c : chars) {                if(c>=65 && c <= 90){                    num++;                }            }            if(num <3){                List<String> temp = new ArrayList<String>();                for (String add : speeeds) {                    temp.add(url+add);                }                page.addTargetRequests(temp);            }        }    }    @Override    public Site getSite() {        return site;    }    private static String 
fileName ;    private static List<String> speeeds;    public static void main(String[] args) throws IOException {         if(args.length>1 && args.length<3){              fileName = args[0];              speeeds = FileUtils.readLines(new File(args[1]));          }else{              fileName = "E:\\temp\\temp_pinyin_suggest.txt";              speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt"));          }           long one = System.currentTimeMillis();           for (String q : speeeds) {                long temp = System.currentTimeMillis();                       String url ="https://suggest.taobao.com/sug?code=utf-8&q="+q;                       Spider.create(new TaobaoPinyinSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url).                       thread(7).run();              System.out.println("the speeed is : "+q+"  end and time is :" + (System.currentTimeMillis() -temp) + " ms");           }           long two = System.currentTimeMillis();           System.out.println("one end and time is :" + (two -one) + " ms");    }}

public class TaobaoSuggestWordPageProcessor implements PageProcessor{    // 部分一：抓取网站的相关配置，包括编码、抓取间隔、重试次数等    private Site site = Site.            me().            setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss").            setRetryTimes(5).setSleepTime(100);    @Override    // process是定制爬虫逻辑的核心接口，在这里编写抽取逻辑    public void process(Page page) {        //["童装","40827952"]        JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString());        for (Object aa : a) {            String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", "");                        String substring = replace.substring(0,replace.indexOf(","));            try {                IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true));                IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true));            } catch (FileNotFoundException e) {                e.printStackTrace();            } catch (IOException e) {                e.printStackTrace();            }        }        if(a.size()>9){            String url = page.getUrl().toString();            //https://suggest.taobao.com/sug?code=utf-8&q=            int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length();            String query = url.substring(index, url.length());            if(query.length() <4){                String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a",                        "s","d","f","g","h","j","k","l",                        "z","x","c","v","b","n","m"};                List<String> temp = new ArrayList<String>();                for (String add : speeeds) {                    temp.add(url+add);                }                page.addTargetRequests(temp);            }        }    }    @Override    public Site getSite() {        return site;    }    private static String fileName 
;    public static void main(String[] args) throws IOException {          if(args.length>0 && args.length<2){              fileName = args[0];          }else{              fileName = "E:\\temp\\temp_tb_suggest.txt";          }           String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a",                                "s","d","f","g","h","j","k","l",                                "z","x","c","v","b","n","m"};           long one = System.currentTimeMillis();           for (String q : speeeds) {               long temp = System.currentTimeMillis();                       String url ="https://suggest.taobao.com/sug?code=utf-8&q="+q;                       Spider.create(new TaobaoSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url).                       thread(7).run();                       System.out.println("the speeed is : "+q+"  end and time is :" + (System.currentTimeMillis() -temp) + " ms");           }           long two = System.currentTimeMillis();           System.out.println("one end and time is :" + (two -one) + " ms");    }}

国美抓取:

public class GomeSuggestWordPageProcessor implements PageProcessor{    // 部分一：抓取网站的相关配置，包括编码、抓取间隔、重试次数等    private Site site = Site.            me().            setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss").            setRetryTimes(5).setSleepTime(100);    @Override    // process是定制爬虫逻辑的核心接口，在这里编写抽取逻辑    public void process(Page page) {        //["童装","40827952"]        JSONArray a = JSON.parseArray(page.getJson().toString());        ArrayList<String> temp = new ArrayList<String>();        for (Object aa : a) {            if(aa.toString().indexOf("{\"cat\":")==-1){                String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", "");                            String substring = replace.substring(0,replace.indexOf(","));                temp.add(substring);            }        }        for (int i = 0; i < temp.size(); i++) {            try {                IOUtils.write(temp.get(i).getBytes(), new FileOutputStream(new File(fileName),true));                IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true));            } catch (FileNotFoundException e) {                e.printStackTrace();            } catch (IOException e) {                e.printStackTrace();            }        }        if(temp.size()>8){            String url = page.getUrl().toString();            //https://suggest.taobao.com/sug?code=utf-8&q=            int index = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=".length();            String query = url.substring(index, url.length());            if(query.length() <4){                String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a",                        "s","d","f","g","h","j","k","l",                        "z","x","c","v","b","n","m"};                List<String> temps = new ArrayList<String>();                for (String add : speeeds) {                    
temps.add(url+add);                }                page.addTargetRequests(temps);            }        }    }    @Override    public Site getSite() {        return site;    }    private static String fileName ;    public static void main(String[] args) throws IOException {        if(args.length>0 && args.length<2){            fileName = args[0];        }else{            fileName = "E:\\temp\\temp_gome_suggest.txt";        }           String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a",                                "s","d","f","g","h","j","k","l",                                "z","x","c","v","b","n","m"};        long one = System.currentTimeMillis();        for (String q : speeeds) {            long temp = System.currentTimeMillis();                  String url ="http://api.search.gome.com.cn/p/suggest?from=headSearch&query="+q;                  Spider.create(new GomeSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url).                  thread(7).run();           System.out.println("the speeed is : "+q+"  end and time is :" + (System.currentTimeMillis() -temp) + " ms");        }        long two = System.currentTimeMillis();        System.out.println("one end and time is :" + (two -one) + " ms");    }}

public class GomePinyinSuggestWordPageProcessor implements PageProcessor{    // 部分一：抓取网站的相关配置，包括编码、抓取间隔、重试次数等    private Site site = Site.            me().            setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss").            setRetryTimes(5).setSleepTime(50);    @Override    // process是定制爬虫逻辑的核心接口，在这里编写抽取逻辑    public void process(Page page) {        //["童装","40827952"]        JSONArray a = JSON.parseArray(page.getJson().toString());        ArrayList<String> temp = new ArrayList<String>();        for (Object aa : a) {            if(aa.toString().indexOf("{\"cat\":")==-1){                String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", "");                            String substring = replace.substring(0,replace.indexOf(","));                temp.add(substring);            }        }        for (int i = 0; i < temp.size(); i++) {            try {                IOUtils.write(temp.get(i).getBytes(), new FileOutputStream(new File(fileName),true));                IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true));            } catch (FileNotFoundException e) {                e.printStackTrace();            } catch (IOException e) {                e.printStackTrace();            }        }        if(temp.size()>8){            String url = page.getUrl().toString();            //https://suggest.taobao.com/sug?code=utf-8&q=            int index = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=".length();            String query = url.substring(index, url.length());            char[] chars = query.toCharArray();            int num = 0;            for (char c : chars) {                if(c>=65 && c <= 90){                    num++;                }            }            if(num <3){                List<String> temps = new ArrayList<String>();                for (String add : speeeds) {                    
temps.add(url+add);                }                page.addTargetRequests(temps);            }        }    }    @Override    public Site getSite() {        return site;    }    private static String fileName ;    private static List<String> speeeds;    public static void main(String[] args) throws IOException {         if(args.length>1 && args.length<3){              fileName = args[0];              speeeds = FileUtils.readLines(new File(args[1]));          }else{              fileName = "E:\\temp\\temp_pinyin_gome_suggest.txt";              speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt"));          }           long one = System.currentTimeMillis();           for (String q : speeeds) {                long temp = System.currentTimeMillis();                       String url ="http://api.search.gome.com.cn/p/suggest?from=headSearch&query="+q;                       Spider.create(new GomePinyinSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url).                       thread(5).run();              System.out.println("the speeed is : "+q+"  end and time is :" + (System.currentTimeMillis() -temp) + " ms");           }           long two = System.currentTimeMillis();           System.out.println("one end and time is :" + (two -one) + " ms");    }}

0 0