网页数据抓取之当当数据
来源:互联网 发布:我生也有涯而知也无涯 编辑:程序博客网 时间:2024/05/16 03:45
package com.atman.baiye.store.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;

import com.atman.baiye.store.domain.AiCommonInfo;

/**
 * Extracts product listings from the HTML of a Dangdang search-result page.
 *
 * <p>Parsing is done purely by locating fixed text markers ({@code class="lineN"},
 * {@code search_now_price}, ...) in the page source, so it only works while the
 * page keeps using those markers. Every extraction step tolerates a missing
 * marker: the affected field falls back to an empty string (or its documented
 * default) instead of throwing {@link StringIndexOutOfBoundsException}.
 */
public class GoodsDangDangUtils {

    protected static Logger logger = Logger.getLogger(GoodsDangDangUtils.class);

    /** Search URL prefix; the search keyword is appended by the HTTP helper. */
    public static final String DANGDANG_URL = "http://search.dangdang.com/?key=";

    /** Highest item number that {@link #getItems(String)} looks for (line1..line19). */
    private static final int MAX_ITEMS = 19;

    /**
     * Number of characters to step back from a {@code class="lineN"} marker so that the
     * slice starts at the enclosing tag (presumably {@code "<li "} -- confirm against the page).
     */
    private static final int MARKER_LEAD = 4;

    /** Item number used as the end-of-list sentinel when trimming the page. */
    private static final int END_OF_LIST_ITEM = 50;

    /** Value passed to {@code AiCommonInfo.setType}; its meaning is defined outside this file. */
    private static final int GOODS_TYPE = 1006;

    /** Value passed to {@code AiCommonInfo.setSource}; its meaning is defined outside this file. */
    private static final int GOODS_SOURCE = 2;

    private static final String TITLE_ATTR = "title=\"";
    private static final String DEFAULT_STORE_NAME = "当当自营";
    private static final String DEFAULT_STORE_URL = "http://www.dangdang.com/";

    /**
     * Splits the result-list HTML into one fragment per product.
     *
     * <p>Item {@code i} runs from {@value #MARKER_LEAD} characters before the first
     * {@code class="line i"} marker up to {@value #MARKER_LEAD} characters before the last
     * {@code class="line i+1"} marker. If the next marker is absent the fragment runs to the
     * end of {@code data}; scanning stops at the first item number that is not present.
     *
     * @param data HTML of the result list; may be {@code null}
     * @return the fragments found, at most {@value #MAX_ITEMS}; never {@code null}
     */
    public static List<String> getItems(String data) {
        List<String> list = new ArrayList<String>();
        if (data == null) {
            return list;
        }
        for (int i = 1; i <= MAX_ITEMS; i++) {
            int startIndex = data.indexOf(itemMarker(i));
            if (startIndex == -1) {
                // Fewer items on the page than MAX_ITEMS: nothing more to collect.
                break;
            }
            int nextIndex = data.lastIndexOf(itemMarker(i + 1));
            int from = Math.max(0, startIndex - MARKER_LEAD);
            int to = (nextIndex == -1) ? data.length() : nextIndex - MARKER_LEAD;
            if (to < from) {
                // Markers are out of order; the remaining slices would be meaningless.
                break;
            }
            list.add(data.substring(from, to));
        }
        return list;
    }

    /**
     * Extracts the fields of a single product fragment.
     *
     * <p>The returned map always contains the keys {@code title}, {@code pic_url},
     * {@code detail_url}, {@code price}, {@code seserev_price}, {@code location},
     * {@code store_name} and {@code store_url}. A field whose markers cannot be found is an
     * empty string, except that {@code seserev_price} falls back to {@code price} and the
     * store name/URL fall back to the Dangdang defaults.
     *
     * @param dataitem one fragment as produced by {@link #getItems(String)}; may be {@code null}
     * @return a mutable map of the extracted fields; never {@code null}
     */
    public static Map<String, String> getElement(String dataitem) {
        String item = (dataitem == null) ? "" : dataitem;
        Map<String, String> map = new HashMap<String, String>();

        map.put("title", extractTitle(item));
        map.put("pic_url", extractPicUrl(item));
        map.put("detail_url", extractDetailUrl(item));

        String price = extractPrice(item, "search_now_price");
        if (price == null) {
            price = "";
        }
        map.put("price", price);

        // The key is misspelled ("seserev"), but it is part of the map contract, so it stays.
        String reservePrice = extractPrice(item, "search_pre_price");
        map.put("seserev_price", reservePrice == null ? price : reservePrice);

        map.put("location", extractLocation(item));
        map.put("store_name", extractStoreName(item));
        map.put("store_url", extractStoreUrl(item));

        if (logger.isDebugEnabled()) {
            logger.debug("parsed item: " + map);
        }
        return map;
    }

    /**
     * Parses a search-result page into {@link AiCommonInfo} records.
     *
     * <p>When both the {@code class="line1"} and {@code class="line50"} markers are present the
     * page is first trimmed to the span between them. Items whose price cannot be parsed as a
     * number are skipped with a warning; an unparsable reserve price falls back to the price.
     *
     * @param jsonInfo page source (HTML, despite the parameter name); may be {@code null}
     * @param keyword  search keyword stored on every record
     * @return the parsed records; never {@code null}
     */
    public static List<AiCommonInfo> getGoodsInfoList(String jsonInfo, String keyword) {
        List<AiCommonInfo> aiCommonInfoList = new ArrayList<AiCommonInfo>();
        if (jsonInfo == null) {
            return aiCommonInfoList;
        }

        String html = jsonInfo;
        int startIndex = html.indexOf(itemMarker(1));
        int endIndex = html.lastIndexOf(itemMarker(END_OF_LIST_ITEM));
        if (startIndex != -1 && endIndex != -1) {
            int from = Math.max(0, startIndex - MARKER_LEAD);
            int to = endIndex - MARKER_LEAD;
            if (to >= from) {
                html = html.substring(from, to);
            }
        }

        for (String dataitem : getItems(html)) {
            Map<String, String> map = getElement(dataitem);

            Double price = parsePrice(map.get("price"));
            if (price == null) {
                logger.warn("skipping item without a parsable price, title=" + map.get("title"));
                continue;
            }
            Double reservePrice = parsePrice(map.get("seserev_price"));
            if (reservePrice == null) {
                reservePrice = price;
            }

            AiCommonInfo aiCommonInfo = new AiCommonInfo();
            aiCommonInfo.setTitle(map.get("title"));
            aiCommonInfo.setPicUrl(map.get("pic_url"));
            aiCommonInfo.setDetailUrl(map.get("detail_url"));
            aiCommonInfo.setKeyword(keyword);
            aiCommonInfo.setType(GOODS_TYPE);
            aiCommonInfo.setSource(GOODS_SOURCE);
            aiCommonInfo.setPrice(price.doubleValue());
            aiCommonInfo.setReservePrice(reservePrice.doubleValue());
            aiCommonInfo.setStoreName(map.get("store_name"));
            aiCommonInfo.setStoreUrl(map.get("store_url"));
            aiCommonInfo.setLocation(map.get("location"));
            aiCommonInfoList.add(aiCommonInfo);
        }
        return aiCommonInfoList;
    }

    /** Demo entry point: fetches the result page for the keyword "JAVA" and parses it. */
    public static void main(String[] args) {
        String data = WebHttpClient.getBebContentByURL(DANGDANG_URL, "JAVA", false, "GBK");
        List<AiCommonInfo> goods = getGoodsInfoList(data, "JAVA");
        logger.info("parsed " + goods.size() + " items");
    }

    /** Builds the {@code class="lineN"} marker that tags the N-th result item. */
    private static String itemMarker(int index) {
        return "class=\"line" + index + "\"";
    }

    /** Returns the text between the first {@code title="} and the following {@code ddclick} marker. */
    private static String extractTitle(String item) {
        int begin = item.indexOf(TITLE_ATTR);
        if (begin == -1) {
            return "";
        }
        int end = item.indexOf("ddclick", begin);
        // The slice ends 4 characters before "ddclick"; presumably this drops the
        // closing quote and padding of the title attribute -- confirm against the page.
        if (end == -1 || end - 4 < begin) {
            return "";
        }
        return item.substring(begin, end - 4).replace(TITLE_ATTR, "").trim();
    }

    /** Returns the image URL from the {@code original=} attribute, or {@code src=} if that is absent. */
    private static String extractPicUrl(String item) {
        int begin = item.indexOf("original=");
        int append = 10; // length of original= plus the opening quote
        if (begin == -1) {
            begin = item.indexOf("src=");
            append = 5; // length of src= plus the opening quote
        }
        if (begin == -1 || begin + append > item.length()) {
            return "";
        }
        int valueStart = begin + append;
        int end = item.indexOf(".jpg", valueStart);
        if (end == -1) {
            return "";
        }
        return item.substring(valueStart, end + 4);
    }

    /** Returns the first product-page URL ({@code http://product.dangdang.com....html}). */
    private static String extractDetailUrl(String item) {
        int begin = item.indexOf("http://product.dangdang.com");
        if (begin == -1) {
            return "";
        }
        int end = item.indexOf(".html", begin);
        if (end == -1) {
            return "";
        }
        return item.substring(begin, end + 5);
    }

    /**
     * Returns the text between {@code yen;} and {@code </span} within the 50 characters that
     * follow {@code tag}, or {@code null} when the tag or either delimiter is missing.
     */
    private static String extractPrice(String item, String tag) {
        int tagIndex = item.indexOf(tag);
        if (tagIndex == -1) {
            return null;
        }
        String window = item.substring(tagIndex, Math.min(item.length(), tagIndex + 50));
        int begin = window.indexOf("yen;");
        if (begin == -1) {
            return null;
        }
        int end = window.indexOf("</span", begin);
        if (end == -1) {
            return null;
        }
        return window.substring(begin + 4, end);
    }

    /** Returns the {@code title='...'} value found within 200 characters after {@code P_cbs}. */
    private static String extractLocation(String item) {
        int tagIndex = item.indexOf("P_cbs");
        if (tagIndex == -1) {
            return "";
        }
        String window = item.substring(tagIndex, Math.min(item.length(), tagIndex + 200));
        String attr = "title='";
        int begin = window.indexOf(attr);
        if (begin == -1) {
            return "";
        }
        int valueStart = begin + attr.length();
        int end = window.indexOf("'>", valueStart);
        if (end == -1) {
            return "";
        }
        return window.substring(valueStart, end);
    }

    /** Returns the shop name, or the Dangdang default when the shop markup is absent or unparsable. */
    private static String extractStoreName(String item) {
        int begin = item.indexOf("itemlist-shop-name");
        if (begin == -1) {
            return DEFAULT_STORE_NAME;
        }
        int end = item.indexOf("search_star_line", begin);
        String window = (end == -1) ? item.substring(begin) : item.substring(begin, end);
        int attr = window.indexOf(TITLE_ATTR);
        if (attr == -1) {
            return DEFAULT_STORE_NAME;
        }
        int valueStart = attr + TITLE_ATTR.length();
        int valueEnd = window.indexOf("\">", valueStart);
        if (valueEnd == -1) {
            return DEFAULT_STORE_NAME;
        }
        return window.substring(valueStart, valueEnd);
    }

    /** Returns the shop URL, or the Dangdang home page when the shop markup is absent or unparsable. */
    private static String extractStoreUrl(String item) {
        String starttag = "icon_shangjia\"></span><a href=";
        String endtag = "name=\"itemlist-shop-name\"";
        int begin = item.indexOf(starttag);
        if (begin == -1) {
            return DEFAULT_STORE_URL;
        }
        int valueStart = begin + starttag.length();
        int end = item.indexOf(endtag, valueStart);
        if (end == -1) {
            return DEFAULT_STORE_URL;
        }
        // The raw slice is the whole attribute value, including its quotes and the
        // whitespace that separates it from the following name= attribute.
        return stripQuotes(item.substring(valueStart, end));
    }

    /** Trims whitespace and removes one pair of matching single or double quotes. */
    private static String stripQuotes(String raw) {
        String value = raw.trim();
        if (value.length() >= 2) {
            char first = value.charAt(0);
            char last = value.charAt(value.length() - 1);
            if ((first == '"' || first == '\'') && first == last) {
                return value.substring(1, value.length() - 1);
            }
        }
        return value;
    }

    /** Parses a price string, returning {@code null} instead of throwing when it is not numeric. */
    private static Double parsePrice(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Double.valueOf(text.trim());
        } catch (NumberFormatException ignored) {
            // Not a number (e.g. empty because the price markup was missing).
            return null;
        }
    }
}
需要什么数据可根据自己需求做相应更改
0 0
- 网页数据抓取之当当数据
- 网页数据抓取之读取网页数据
- python 抓取当当网的图书数据
- PYTHON抓取当当网商品数据
- 网页数据抓取之新浪新闻数据
- 网页数据抓取之淘宝数据
- 网页数据抓取之大众点评数据
- PHP之抓取网页数据-终结版
- 网页抓取工具之数据预处理
- 关于抓取网页数据
- 抓取网页数据
- 抓取网页中的数据
- java抓取网页数据
- 网页数据抓取
- delphi网页数据抓取
- C# 抓取网页数据
- 网页数据的抓取
- java网页数据抓取
- C6-2 字符串的回文子序列个数
- XSL中调用模板的时候传递可变参数
- Laravel - CSRF token禁用方法
- Parasoft C++test使用教程:桩函数
- 时间格式化函数
- 网页数据抓取之当当数据
- mac应用程序不能打开问题解决
- Elasticsearch 5.0 —— Head插件部署指南
- 支付宝 Android 版使用的开源组件
- 【Leetcode】172. Factorial Trailing Zeroes
- DelayQueue 实现简单的定时任务
- Oracle触发器示例
- mysql 用户角色权限表建立
- angularJS+html+Spring+Mybatis