抓取凡客一万多商品，链接，商品名称，价格，图片~~~版本1

来源：互联网发布：北京国贸中心数据编辑：程序博客网时间：2024/04/23 20:38

版本1不能输出到txt中，还有一些没清干净的显示错误，

如：

需要屏蔽掉

市场价: >市场价￥299</>< class=

售价: >售价￥99</span>

</div>

</li>

中的一些杂乱符号项，留下市场价和售价就行了。。。。。。。。。

收工，先用笨方法，

AppendStringToATextFile.append(new File("d://List.txt"), "测试1");

把信息都存到D:/list.txt里边了，然后再用笨方法，用批量替换把符号都抠去，先把一份像样的结果保存到“D:/list改.txt”里边。

除此之外，我还需要抓取图片，然后在列表里显示图片的本地链接。

package newFolder;

import java.io.BufferedReader;

import java.io.InputStreamReader;

import java.net.HttpURLConnection;

import java.net.URL;

/**
 * Console scraper for the VANCL search listing.
 *
 * <p>Starting from {@code http://s.vancl.com/search.aspx}, {@link #main(String[])} downloads
 * each result page, prints link / name / market price / sale price for every product entry
 * it can locate, and then follows the pager's "next page" link until none is found.
 *
 * <p>The page is parsed by plain string splitting (no HTML parser), so every token index
 * used below is tied to the markup the original author observed on the site.
 */
public class CatchUrlFinalV1 {

    /** Site root that the relative "next page" links are appended to. */
    private static final String SITE_ROOT = "http://s.vancl.com";

    /** Number of products handled per page; also the stride used for product numbering. */
    private static final int ITEMS_PER_PAGE = 40;

    /** Text that precedes every product entry in the listing HTML. */
    private static final String ITEM_MARKER = "li class=\"scListArea";

    /** Link text of the pager's "next page" anchor. */
    private static final String NEXT_PAGE_TEXT = "下一页";

    /** Regex used to cut a fragment of HTML into tokens at every double quote. */
    private static final String QUOTE = "\"";

    /** Minimum token count needed to read the end-relative fields of a regular entry. */
    private static final int MIN_ENTRY_TOKENS = 10;

    // Token positions, counted from the start, used for the final entry of a page.
    private static final int TAIL_LINK_INDEX = 62;
    private static final int TAIL_NAME_INDEX = 64;
    private static final int TAIL_MARKET_PRICE_INDEX = 69;
    private static final int TAIL_SALE_PRICE_INDEX = 71;

    /**
     * Downloads the document at {@code urlString} and returns it as one string, with every
     * line terminated by {@code "\n"}.
     *
     * <p>As a side effect, prints the number of pieces obtained by splitting the page on
     * {@code "href"} (diagnostic output kept from the original implementation).
     *
     * @param urlString absolute HTTP URL of the page to fetch
     * @return the page text, or {@code null} if anything goes wrong (malformed URL,
     *         non-HTTP URL, I/O failure); the stack trace is printed in that case
     */
    public static String getHtml(String urlString) {
        try {
            URL url = new URL(urlString);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();

            StringBuilder html = new StringBuilder();
            // NOTE(review): the response is decoded with the platform default charset, as in
            // the original code; confirm the site's encoding before pinning an explicit one.
            try (BufferedReader br =
                    new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    html.append(line).append("\n");
                }
            }

            String page = html.toString();
            System.out.println(page.split("href").length);
            return page;
        } catch (Exception e) {
            // Deliberately broad: callers rely on "null means the download failed", which
            // also has to cover the ClassCastException raised for non-HTTP URLs.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Walks the search result pages and prints the product data of each page to stdout.
     *
     * <p>The crawl stops when a page cannot be downloaded or when no "next page" link can
     * be extracted from it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = SITE_ROOT + "/search.aspx";
        int pageIndex = 0;
        String nextPageUrl;

        do {
            nextPageUrl = "";
            String html = CatchUrlFinalV1.getHtml(url);
            if (html != null) {
                printProducts(html, pageIndex);
                nextPageUrl = findNextPageUrl(html);
            }
            if (!nextPageUrl.isEmpty()) {
                System.out.println("下一页的链接: " + nextPageUrl);
                // The extracted link is site-relative (e.g. /search.aspx?s=1&d=0&b=0&p=4&r=40),
                // so it has to be prefixed with the site root before it can be fetched.
                url = SITE_ROOT + nextPageUrl;
                pageIndex++;
            }
            // Compare by content: the former reference comparison against "" was always true.
        } while (!nextPageUrl.isEmpty());
    }

    /**
     * Prints the product entries found in one listing page.
     *
     * <p>The page is split on {@link #ITEM_MARKER}; piece 0 is whatever precedes the first
     * entry and is ignored. At most {@link #ITEMS_PER_PAGE} entries are processed.
     *
     * @param html      full text of the listing page
     * @param pageIndex zero-based index of the page, used to number the products
     */
    private static void printProducts(String html, int pageIndex) {
        String[] segments = html.split(ITEM_MARKER);
        int itemCount = Math.min(segments.length - 1, ITEMS_PER_PAGE);

        // Regular entries: the wanted fields sit at fixed offsets from the END of the piece.
        for (int j = 1; j < itemCount; j++) {
            String[] tokens = segments[j].split(QUOTE);
            if (tokens.length < MIN_ENTRY_TOKENS) {
                System.err.println("Skipping product entry " + j + ": unexpected markup");
                continue;
            }
            System.out.println("第" + (ITEMS_PER_PAGE * pageIndex + j) + "个商品");
            System.out.println("物品链接: " + tokens[tokens.length - 10]);
            System.out.println("物品名称: " + tokens[tokens.length - 8]);
            System.out.println("市场价: " + tokens[tokens.length - 3].replaceAll("span", ""));
            System.out.println("售价: " + tokens[tokens.length - 1]);
        }

        if (itemCount < 1) {
            return;
        }

        // Final entry: its piece runs on into the markup that follows the product list, so
        // end-relative offsets cannot be used; positions are counted from the START instead.
        String[] tail = segments[itemCount].split(QUOTE);
        if (tail.length <= TAIL_SALE_PRICE_INDEX) {
            System.err.println("Skipping product entry " + itemCount + ": unexpected markup");
            return;
        }
        System.out.println("当前为第" + (ITEMS_PER_PAGE * pageIndex + itemCount) + "个物品");
        System.out.println("物品链接: " + tail[TAIL_LINK_INDEX]);
        System.out.println("物品名称: " + tail[TAIL_NAME_INDEX]);
        System.out.println("市场价: " + tail[TAIL_MARKET_PRICE_INDEX]);
        System.out.println("售价: " + tail[TAIL_SALE_PRICE_INDEX]);
    }

    /**
     * Extracts the site-relative URL of the next result page.
     *
     * <p>The page is split on {@link #NEXT_PAGE_TEXT}; the second piece is then split on
     * double quotes and its second-to-last token is taken as the link.
     *
     * @param html full text of the listing page
     * @return the extracted link, or an empty string when the page does not contain the
     *         "next page" text or the expected tokens (previously this case surfaced as an
     *         {@code ArrayIndexOutOfBoundsException} on the last page)
     */
    private static String findNextPageUrl(String html) {
        String[] parts = html.split(NEXT_PAGE_TEXT);
        if (parts.length < 2) {
            return "";
        }
        String[] tokens = parts[1].split(QUOTE);
        if (tokens.length < 2) {
            return "";
        }
        return tokens[tokens.length - 2];
    }
}