数据爬取:爬取淘宝及国美在线搜索建议词
来源:互联网 发布:网络诈骗300元怎么办 编辑:程序博客网 时间:2024/04/28 07:37
分为两种形式的抓取:
1.基于首字母的四轮次抓取,如:a、aa、aaa、aaaa
2.基于汉语拼音的三轮次抓取:附拼音表
链接:http://pan.baidu.com/s/1eS5Kdmq 密码:n9pb
使用的框架为webmagic
淘宝爬取:
国美抓取:
public class TaobaoPinyinSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(10); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString()); for (Object aa : a) { String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); try { IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(a.size()>9){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length(); String query = url.substring(index, url.length()); char[] chars = query.toCharArray(); int num = 0; for (char c : chars) { if(c>=65 && c <= 90){ num++; } } if(num <3){ List<String> temp = new ArrayList<String>(); for (String add : speeeds) { temp.add(url+add); } page.addTargetRequests(temp); } } } @Override public Site getSite() { return site; } private static String fileName ; private static List<String> speeeds; public static void main(String[] args) throws IOException { if(args.length>1 && args.length<3){ fileName = args[0]; speeeds = FileUtils.readLines(new File(args[1])); }else{ fileName = "E:\\temp\\temp_pinyin_suggest.txt"; speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt")); } long one = System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url 
="https://suggest.taobao.com/sug?code=utf-8&q="+q; Spider.create(new TaobaoPinyinSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(7).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); }}
public class TaobaoSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(100); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString()); for (Object aa : a) { String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); try { IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(a.size()>9){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length(); String query = url.substring(index, url.length()); if(query.length() <4){ String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; List<String> temp = new ArrayList<String>(); for (String add : speeeds) { temp.add(url+add); } page.addTargetRequests(temp); } } } @Override public Site getSite() { return site; } private static String fileName ; public static void main(String[] args) throws IOException { if(args.length>0 && args.length<2){ fileName = args[0]; }else{ fileName = "E:\\temp\\temp_tb_suggest.txt"; } String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; long one = System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url 
="https://suggest.taobao.com/sug?code=utf-8&q="+q; Spider.create(new TaobaoSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(7).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); }}
国美抓取:
public class GomeSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(100); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(page.getJson().toString()); ArrayList<String> temp = new ArrayList<String>(); for (Object aa : a) { if(aa.toString().indexOf("{\"cat\":")==-1){ String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); temp.add(substring); } } for (int i = 0; i < temp.size(); i++) { try { IOUtils.write(temp.get(i).getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(temp.size()>8){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=".length(); String query = url.substring(index, url.length()); if(query.length() <4){ String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; List<String> temps = new ArrayList<String>(); for (String add : speeeds) { temps.add(url+add); } page.addTargetRequests(temps); } } } @Override public Site getSite() { return site; } private static String fileName ; public static void main(String[] args) throws IOException { if(args.length>0 && args.length<2){ fileName = args[0]; }else{ fileName = "E:\\temp\\temp_gome_suggest.txt"; } String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; long one = 
System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url ="http://api.search.gome.com.cn/p/suggest?from=headSearch&query="+q; Spider.create(new GomeSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(7).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); }}
public class GomePinyinSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(50); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(page.getJson().toString()); ArrayList<String> temp = new ArrayList<String>(); for (Object aa : a) { if(aa.toString().indexOf("{\"cat\":")==-1){ String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); temp.add(substring); } } for (int i = 0; i < temp.size(); i++) { try { IOUtils.write(temp.get(i).getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(temp.size()>8){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=".length(); String query = url.substring(index, url.length()); char[] chars = query.toCharArray(); int num = 0; for (char c : chars) { if(c>=65 && c <= 90){ num++; } } if(num <3){ List<String> temps = new ArrayList<String>(); for (String add : speeeds) { temps.add(url+add); } page.addTargetRequests(temps); } } } @Override public Site getSite() { return site; } private static String fileName ; private static List<String> speeeds; public static void main(String[] args) throws IOException { if(args.length>1 && args.length<3){ fileName = args[0]; speeeds = FileUtils.readLines(new File(args[1])); }else{ fileName = "E:\\temp\\temp_pinyin_gome_suggest.txt"; speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt")); } long one = 
System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url ="http://api.search.gome.com.cn/p/suggest?from=headSearch&query="+q; Spider.create(new GomePinyinSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(5).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); }}
0 0
- 数据爬取:爬取淘宝及国美在线搜索建议词
- 淘宝搜索定向爬取
- 淘宝商品数据爬取
- Selenium 爬取淘宝数据
- python selenium 爬取淘宝商品数据
- python爬虫 爬取淘宝网页数据
- python学习,爬取淘宝评论数据
- Python爬取淘宝搜索页,使用Selenium+BeautifulSoup
- 用requests爬取百度搜索数据
- 淘宝商品评论爬取
- 爬取淘宝MM图片
- pyspider 爬取淘宝食品
- Selenium+PhantomJS爬取淘宝
- python3爬取淘宝信息
- python3爬取淘宝信息!
- python 爬取淘宝信息
- python-爬取淘宝商品
- Python爬取淘宝图片
- splice函数和slice函数的区别
- 关于MyBatis插入语句,返回主键的值
- tomcat容器启动web应用(sbt和scala环境下)
- java中对对象的理解
- 一道常被人轻视的前端 JS 面试题
- 数据爬取:爬取淘宝及国美在线搜索建议词
- 使用Java调用dll动态链接库
- 【NOIP2016普及组复赛】魔法阵
- react教程之this.state
- 数据结构实验之查找七:线性之哈希表
- 原生JS实现ajax请求
- C++ sort用法
- 翻牌变色游戏
- 安卓动画