网页数据抓取之当当数据

来源:互联网 发布:我生也有涯而知也无涯 编辑:程序博客网 时间:2024/05/16 03:45
package com.atman.baiye.store.utils;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import org.apache.log4j.Logger;import com.atman.baiye.store.domain.AiCommonInfo;public class GoodsDangDangUtils {    protected static Logger logger = Logger.getLogger(GoodsDangDangUtils.class);    public static final String DANGDANG_URL = "http://search.dangdang.com/?key=";     public static List<String> getItems(String data){        List<String> list = new ArrayList<String>();        for(int i = 1; i<20 ; i++){            int startIndex = data.indexOf("class=\"line"+i+"\"");            int endIndex = data.lastIndexOf("class=\"line"+(i+1)+"\"");            String item = data.substring(startIndex-4, endIndex-4);            list.add(item);        }        return list;    }        public static Map<String, String> getElement(String dataitem){        System.out.println("data:"+dataitem);        Map<String, String> map = new HashMap<String, String>();                //title         int beginIndex = dataitem.indexOf("title=\"");        int endIndex = dataitem.indexOf("ddclick");        String title = dataitem.substring(beginIndex, endIndex-4);        title = title.replace("title=\"", "").trim();        System.out.println("title:"+title);        map.put("title", title);                //pic_url        beginIndex = dataitem.indexOf("original=");        int append = 10;        if(beginIndex == -1){            beginIndex = dataitem.indexOf("src=");            append = 5;        }        endIndex = dataitem.indexOf(".jpg");        String pic_url = dataitem.substring(beginIndex+append, endIndex+4);        System.out.println("pic_url:"+pic_url);        map.put("pic_url", pic_url);                //detail_url        String tag = "http://product.dangdang.com";        
beginIndex = dataitem.indexOf(tag);        endIndex = dataitem.indexOf(".html");        String detail_url = dataitem.substring(beginIndex, endIndex+5);        System.out.println("detail_url:"+detail_url);        map.put("detail_url", detail_url);                //price        String pricetag = "search_now_price";        beginIndex = dataitem.indexOf(pricetag);        String priceStr = dataitem.substring(beginIndex, beginIndex+50);        beginIndex = priceStr.indexOf("yen;");        endIndex = priceStr.indexOf("</span");        String price = priceStr.substring(beginIndex+4, endIndex);        System.out.println("price:"+price);        map.put("price", price);                //seserev_price        String seserev_pricetag = "search_pre_price";        beginIndex = dataitem.indexOf(seserev_pricetag);        if(beginIndex == -1){            map.put("seserev_price", price);        }else{            String seserev_priceStr = dataitem.substring(beginIndex, beginIndex+50);            beginIndex = seserev_priceStr.indexOf("yen;");            endIndex = seserev_priceStr.indexOf("</span");            String seserev_price = seserev_priceStr.substring(beginIndex+4, endIndex);            System.out.println("seserev_price:"+seserev_price);            map.put("seserev_price", seserev_price);        }                //location        String locationtag = "P_cbs";        beginIndex = dataitem.indexOf(locationtag);        if(beginIndex == -1){            map.put("location", "");        }else{            String locationStr = dataitem.substring(beginIndex, beginIndex+200);            System.out.println("locationStr=="+locationStr);            beginIndex = locationStr.indexOf("title='");            endIndex = locationStr.indexOf("'>");            String location = locationStr.substring(beginIndex+"title='".length(), endIndex);            System.out.println("location:"+location);            map.put("location", location);        }                //store_name        String nametag = 
"itemlist-shop-name";        System.out.println("dataitem=="+dataitem);        beginIndex = dataitem.indexOf(nametag);        String store_name = "当当自营";        if(beginIndex == -1){        }else{            endIndex = dataitem.indexOf("search_star_line");            String nameStr = dataitem.substring(beginIndex, endIndex);            System.out.println("store_namestr="+nameStr);            beginIndex = nameStr.indexOf("title=\"");            endIndex = nameStr.indexOf("\">");            store_name = nameStr.substring(beginIndex+"title=".length()+1, endIndex);        }        System.out.println("store_name:"+store_name);        map.put("store_name", store_name);                //store_url;        String store_url = "http://www.dangdang.com/";        String starttag = "icon_shangjia\"></span><a href=";        String endtag = "name=\"itemlist-shop-name\"";        beginIndex = dataitem.indexOf(starttag);        if(beginIndex != -1){            endIndex = dataitem.indexOf(endtag);            store_url = dataitem.substring(beginIndex+starttag.length(), endIndex);        }        System.out.println("store_url:"+store_url);        map.put("store_url", store_url);                return map;    }        public static List<AiCommonInfo> getGoodsInfoList(String jsonInfo, String keyword) {        List<AiCommonInfo> aiCommonInfoList = new ArrayList<AiCommonInfo>();        int startIndex = jsonInfo.indexOf("class=\"line1\"");        int endIndex = jsonInfo.lastIndexOf("class=\"line50\"");        if(startIndex != -1 && endIndex != -1){            jsonInfo = jsonInfo.substring(startIndex-4, endIndex-4);        }        List<String> datalist = getItems(jsonInfo);        for (String dataitem : datalist) {            Map<String, String> map = getElement(dataitem);            AiCommonInfo aiCommonInfo = new AiCommonInfo();            aiCommonInfo.setTitle((String)map.get("title"));            aiCommonInfo.setPicUrl(map.get("pic_url"));            
aiCommonInfo.setDetailUrl(map.get("detail_url"));            aiCommonInfo.setKeyword(keyword);            aiCommonInfo.setType(1006);            aiCommonInfo.setSource(2);            String price = (String)map.get("price");            aiCommonInfo.setPrice(Double.parseDouble(price));            String reserve_price = map.get("seserev_price");            aiCommonInfo.setReservePrice(Double.parseDouble(reserve_price));            aiCommonInfo.setStoreName(map.get("store_name"));            aiCommonInfo.setStoreUrl(map.get("store_url"));            aiCommonInfo.setLocation(map.get("location"));            aiCommonInfoList.add(aiCommonInfo);        }        return aiCommonInfoList;    }        public static void main(String[] args) {       String data = WebHttpClient.getBebContentByURL(DANGDANG_URL,"JAVA", false, "GBK");       getGoodsInfoList(data, "JAVA");              //System.out.println(goodsMap.get("title"));    }    }
需要什么数据，可根据自己的需求做相应更改。
0 0
原创粉丝点击