抓取www.zara.cn衣服信息和图片

来源:互联网 发布:mac如何定时关机 编辑:程序博客网 时间:2024/04/27 17:50

这个较简单,直接上代码

YFMain.java

package com.yf.zara;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.List;import org.apache.commons.httpclient.params.HttpMethodParams;import org.apache.http.HttpResponse;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.params.CoreConnectionPNames;import org.apache.http.util.EntityUtils;import org.apache.log4j.Logger;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class YFMain {private Logger log = Logger.getLogger(YFMain.class);private String menuurl = "http://www.zara.cn/cn/zh/%E5%A5%B3%E5%A3%AB-c281502.html";private List<YFNode> yfTree = new ArrayList<YFNode>();public static boolean debug = false;public static int MAX_DEBUG_LINE = 5;private boolean downloadImage = false;private String DIR_ROOT = "F:/yf-dkf/zara";private String configFilePath = this.DIR_ROOT + "/config.ini";private String zaraFilePath = this.DIR_ROOT + "/zara.ini";private String exceptionLogFilePath = this.DIR_ROOT + "/error.log";HttpGet get = null;HttpResponse httpResponse = null;HttpClient httpclient  = null;SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");public void init(){configFilePath = this.DIR_ROOT + "/config.ini";zaraFilePath = this.DIR_ROOT + "/zara.txt";exceptionLogFilePath = this.DIR_ROOT + "/error.log";}public void start(){init();httpclient = new DefaultHttpClient();httpclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,  30 * 1000);//连接时间20shttpclient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,  60 * 1000);httpclient.getParams().setParameter(HttpMethodParams.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/30.0.1599.101 Safari/537.36");boolean beSuccess = false;long t1 = System.currentTimeMillis();String startedAt = sdf.format(new Date(t1));while(!beSuccess){try {beSuccess = this.step1();} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}}beSuccess = false;while(!beSuccess){beSuccess = this.step2();}long t2 = System.currentTimeMillis();String endAt = sdf.format(new Date(t2));long diff = (t2 -t1)/1000;long hour = diff/3600;long minite = (diff - hour*3600)/(60);long sec = diff % 60;println("");println(""+startedAt+" ---> "+ endAt + "" );println("it takes "+hour+" h "+minite+" m "+sec+" s ." );println("=======  Finish ======");};public void downloadPhotos (String url, String savePath, String saveNamge){savePath = savePath.replaceAll("\"", "");File f = new File(savePath);if(!f.exists())f.mkdirs();File storeFile = new File( savePath + "/" + saveNamge ); if(storeFile.exists()){print(  " # Ignore exists image @ "+storeFile.getAbsolutePath() );return ;}FileOutputStream output = null;try {get = new HttpGet( url );httpResponse = httpclient.execute(get);output = new FileOutputStream(storeFile);  //得到网络资源的字节数组,并写入文件  output.write( EntityUtils.toByteArray(httpResponse.getEntity()) );  output.close();  print( " # saved image @ "+storeFile.getAbsolutePath() );} catch (Exception e) {if(output!=null)try {output.close();} catch (IOException e1) {e1.printStackTrace();}if(storeFile.canWrite())storeFile.delete();print( " # faile save image @ "+storeFile.getAbsolutePath() + " from="+ url);e.printStackTrace();}}// Step 5public void downloadYFImage(YFImage img){try{println("");print(img.getSpace());this.downloadPhotos(img.getUrl(), img.getDir(), img.getSaveName());img.setDownloaded(true);}catch(Exception e){e.printStackTrace();}}// Step 4public boolean getProectDetail(YFProduct pro) throws ClientProtocolException, IOException{get = new HttpGet( pro.getUrl() );httpResponse = httpclient.execute(get);String htmlstring = 
EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Element product = html.getElementById("product");Elements right = product.select(".right");try{pro.setReference( right.select(".reference").text() );pro.setPrice( right.select("span.price").attr("data-price"));pro.setDescription( right.select(".description").text());}catch(Exception e){e.printStackTrace();}if(downloadImage==false)return true;//不下载图片//下载图片Elements leftimgs = product.select(".left .bigImageContainer .media-wrap a img");int num = 0;for(Element img : leftimgs){num ++;YFImage image = new YFImage();image.setId( num );image.setUrl( img.attr("src") );image.setDownloaded(false);image.setProduct(pro);image.setDir(pro.getDir());image.setSpacenum( pro.getSpacenum() + 4);image.setSaveName(img.attr("data-id")+"_"+num+".jpg");pro.add(image);downloadYFImage(image);// download imageif(!image.getDownloaded()){image.getFaliedImage().add(image);}}return true;}// Step 3public boolean getProducts(YFNode pNode, Element productListUL) throws ClientProtocolException, IOException{boolean bOk = true;if(productListUL!=null){Elements products = productListUL.select("li div a");int num = 0;for(Element product : products){num ++;YFProduct pro = new YFProduct();pro.setId( num );pro.setName( product.text() );pro.setUrl( product.attr("href") );pro.setSpacenum( pNode.getSpacenum() + 4);pro.setDir(pNode.getDir()+"/"+pro.getName());pNode.add(pro);bOk &= getProectDetail(pro);// Step 4tempstr = pro.getSpace() + "productname=" + pro.getName() + ",\t reference="+ pro.getReference() +  ", \tprice="+pro.getPrice() + ", \tdescription="+pro.getDescription();infoout.write((tempstr +"\r\n").getBytes());println("");print(tempstr +", \t url=" + pro.getUrl() );}}return bOk;}//Step 2public void getSubNoes(YFNode pNode) throws ClientProtocolException, IOException{//println(" getSubNoes ");println("");print( pNode.getSpace() + "nodename=" + pNode.getName() + "  url=" + pNode.getUrl());get = new HttpGet( 
pNode.getUrl() );httpResponse = httpclient.execute(get);String htmlstring = EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Element mainNavigationMenu = html.getElementById( "mainNavigationMenu");Element menuItemData = html.getElementById(pNode.getItemid());if(menuItemData==null){return ;}Elements ullis = menuItemData.select("ul>li");if(ullis==null || ullis.size()<1){// get productsgetProducts(pNode, html.getElementById("product-list")); // Step 3}else{int num = 0;for(Element currentmenu : ullis){num ++;YFNode node = new YFNode();node.setId(num);node.setItemid(currentmenu.attr("id"));node.setUrl( currentmenu.select("a").attr("href"));node.setName( currentmenu.select("a").text());node.setpNode(pNode);node.setSpacenum(pNode.getSpacenum()+4);node.setDir(pNode.getDir()+"/"+node.getName());pNode.add(node);tempstr = node.getSpace() + node.getName() +"\r\n";infoout.write(tempstr.getBytes());getSubNoes(node);//get sub node pNode.setCurrentNum(node.getId());}}}private String tempstr;FileOutputStream infoout;// get tree nodespublic boolean step1() throws ClientProtocolException, IOException{println("=======  Step 1 ======");if(infoout!=null)infoout.close();File zaraFile = new File(this.zaraFilePath);infoout = new FileOutputStream(zaraFile);yfTree.clear();//clear dataget = new HttpGet( this.menuurl );httpResponse = httpclient.execute(get);String htmlstring = EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Element mainNavigationMenu = html.getElementById("mainNavigationMenu");if(mainNavigationMenu==null){log.debug("mainNavigationMenu==null");return false;}Elements currentmenu = mainNavigationMenu.select("li.current");YFNode root = new YFNode();root.setId(1);root.setCurrentNum(1);root.setItemid(currentmenu.attr("id"));root.setUrl( currentmenu.select("a").first().attr("href"));root.setName( 
currentmenu.select("a").first().text());root.setpNode(null);root.setSpacenum(0);root.setDir(this.DIR_ROOT);yfTree.add(root);tempstr = root.getSpace() + root.getName() +"\r\n";infoout.write(tempstr.getBytes() );this.getSubNoes(root);//getSubNoes Step 2infoout.close();get.releaseConnection();return true;}///public boolean step2() {println("\n=======  Step 2  ======");boolean bOK = true;println("YFImage.failedImage.size="+YFImage.getFaliedImage().size());for(YFImage img : YFImage.getFaliedImage()){downloadYFImage(img);if(img.getDownloaded())YFImage.getFaliedImage().remove(img);elsebOK = false;}return bOK;}public static void main(String[] args) {YFMain yf = new YFMain();if(args!=null){if(args.length >=1 ){yf.setDIR_ROOT(args[0]);System.out.println("reset DIR_ROOT : "+yf.getDIR_ROOT());}if(args.length >=2 ){if(args[1].equals("true"))yf.setDownloadImage(true);elseyf.setDownloadImage(false);}}yf.start();}public static void print(String str){System.out.print(str);}public static void println(String str){System.out.println(str);}public String getDIR_ROOT() {return DIR_ROOT;}public void setDIR_ROOT(String dIR_ROOT) {DIR_ROOT = dIR_ROOT;}public boolean isDownloadImage() {return downloadImage;}public void setDownloadImage(boolean downloadImage) {this.downloadImage = downloadImage;}}

YFNode.java

package com.yf.zara;

import java.util.ArrayList;
import java.util.List;

/**
 * One entry of the crawled menu tree. A node knows its parent, its child nodes
 * and the products attached to it, plus the indentation depth and the
 * directory used when printing and saving.
 */
public class YFNode {

    /** Shared by all nodes; written through {@link #setCurrentNum(int)}. */
    public static int currentNum = 0;

    private Integer id;
    private String name;
    private String url;
    private String itemid;
    private YFNode pNode;
    private List<YFNode> subNodes;
    private List<YFProduct> products;
    private int spacenum = 0;
    private String dir;

    /**
     * Appends a child node; the child list is created on first use.
     *
     * @param node child to append
     * @return the result of {@link List#add(Object)}
     */
    public boolean add(YFNode node) {
        if (subNodes == null) {
            subNodes = new ArrayList<YFNode>();
        }
        return subNodes.add(node);
    }

    /**
     * Appends a product; the product list is created on first use.
     *
     * @param product product to append
     * @return the result of {@link List#add(Object)}
     */
    public boolean add(YFProduct product) {
        if (products == null) {
            products = new ArrayList<YFProduct>();
        }
        return products.add(product);
    }

    /**
     * Builds the indentation prefix for this node.
     *
     * @return a string of {@code spacenum} blanks; empty when {@code spacenum}
     *         is zero or negative
     */
    public String getSpace() {
        StringBuilder indent = new StringBuilder();
        int remaining = spacenum;
        while (remaining > 0) {
            indent.append(' ');
            remaining--;
        }
        return indent.toString();
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getItemid() {
        return itemid;
    }

    public void setItemid(String itemid) {
        this.itemid = itemid;
    }

    public YFNode getpNode() {
        return pNode;
    }

    public void setpNode(YFNode pNode) {
        this.pNode = pNode;
    }

    public int getSpacenum() {
        return spacenum;
    }

    public void setSpacenum(int spacenum) {
        this.spacenum = spacenum;
    }

    /** @return the child list, or {@code null} when no child was ever added or set */
    public List<YFNode> getSubNodes() {
        return subNodes;
    }

    public void setSubNodes(List<YFNode> subNodes) {
        this.subNodes = subNodes;
    }

    public String getDir() {
        return dir;
    }

    public void setDir(String dir) {
        this.dir = dir;
    }

    /** @return the class-wide {@code currentNum} value */
    public int getCurrentNum() {
        return YFNode.currentNum;
    }

    /** Updates the class-wide {@code currentNum}; the value is shared by every node. */
    public void setCurrentNum(int currentNum) {
        YFNode.currentNum = currentNum;
    }
}


YFProduct.java

package com.yf.zara;

import java.util.ArrayList;
import java.util.List;

/**
 * A product found on a listing page: name, URL, price, reference and
 * description, the images that belong to it, and the indentation depth and
 * directory used when printing and saving.
 */
public class YFProduct {

    /** Shared by all products; written through {@link #setCurrentNum(int)}. */
    public static int currentNum = 0;

    private Integer id;
    private String name;
    private String url;
    private String price;
    private String reference;
    private String description;
    private int spacenum = 0;
    private List<YFImage> imagelist;
    private String dir;

    /**
     * Appends an image; the image list is created on first use.
     *
     * @param img image to append
     * @return the result of {@link List#add(Object)}
     */
    public boolean add(YFImage img) {
        if (imagelist == null) {
            imagelist = new ArrayList<YFImage>();
        }
        return imagelist.add(img);
    }

    /**
     * Builds the indentation prefix for this product.
     *
     * @return a string of {@code spacenum} blanks; empty when {@code spacenum}
     *         is zero or negative
     */
    public String getSpace() {
        StringBuilder indent = new StringBuilder();
        int remaining = spacenum;
        while (remaining > 0) {
            indent.append(' ');
            remaining--;
        }
        return indent.toString();
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    /**
     * Stores the name. A {@code null} or blank name is replaced by
     * {@code "NoName_"} followed by the id as it is set at the time of the call.
     *
     * @param name product name, may be {@code null} or blank
     */
    public void setName(String name) {
        boolean blank = name == null || name.trim().isEmpty();
        this.name = blank ? "NoName_" + this.id : name;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getReference() {
        return reference;
    }

    public void setReference(String reference) {
        this.reference = reference;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public int getSpacenum() {
        return spacenum;
    }

    public void setSpacenum(int spacenum) {
        this.spacenum = spacenum;
    }

    public String getDir() {
        return dir;
    }

    public void setDir(String dir) {
        this.dir = dir;
    }

    /** @return the class-wide {@code currentNum} value */
    public int getCurrentNum() {
        return YFProduct.currentNum;
    }

    /** Updates the class-wide {@code currentNum}; the value is shared by every product. */
    public void setCurrentNum(int currentNum) {
        YFProduct.currentNum = currentNum;
    }
}

YFImage.java

package com.yf.zara;import java.util.ArrayList;import java.util.List;public class YFImage {private int id;private String url;private YFProduct product;private boolean downloaded;private String dir;private String saveName;private int spacenum = 0;private static List<YFImage> faliedImage = new ArrayList<YFImage>();//下载失败的图片public String getSpace() {String str="";for(int i=0; i<spacenum; i++)str += " ";return str;}public int getId() {return id;}public void setId(int id) {this.id = id;}public String getUrl() {return url;}public void setUrl(String url) {if(url.startsWith("//"))this.url = "http:"+url;elsethis.url = url;}public YFProduct getProduct() {return product;}public void setProduct(YFProduct product) {this.product = product;}public boolean getDownloaded() {return downloaded;}public void setDownloaded(boolean downloaded) {this.downloaded = downloaded;}public String getDir() {return dir;}public void setDir(String dir) {this.dir = dir;}public String getSaveName() {return saveName;}public void setSaveName(String saveName) {this.saveName = saveName;}public static List<YFImage> getFaliedImage() {return faliedImage;}public int getSpacenum() {return spacenum;}public void setSpacenum(int spacenum) {this.spacenum = spacenum;}}



0 0
原创粉丝点击