I Love the World!

来源:互联网 发布:java高达 编辑:程序博客网 时间:2024/06/03 07:02

1.我的第一个博客

1.1我喜欢与人分享知识

1.2我与你们同在

2.我热爱生活

2.1我以后会把每天的感想都写在这里

2.2今天先测试一下

3.代码

public class JDProductMaster2 {
//private static final Jedis jedis = new Jedis("10.0.0.8", 6379);
// 创建固定大小的线程池(下载、解析、存储)
static ExecutorService threadPool = Executors.newFixedThreadPool(30);


// 队列---从首页和分页解析出来的文章url,存放在这个队列中
public static ArrayBlockingQueue<String> pidsQueue = new ArrayBlockingQueue<String>(1000);


// 队列---每个文章解析出来的html文档,存放这个队列中
public static ArrayBlockingQueue<String> urlsQueue = new ArrayBlockingQueue<String>(1000);


// 队列---每个文章的内容,也就是article对象,存放这个队列中
public static ArrayBlockingQueue<Product2> product2Queue = new ArrayBlockingQueue<Product2>(1000);




public static void main(String[] args) {
// 1.准备url
String indexUrl = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=aa9a99bb0895488197c2a663d777a51b";
try {
HttpGet httpGet = new HttpGet(indexUrl);
// 2.获取首页的信息
String html = getHtml(httpGet);
// 3.解析首页 此处不需要返回值,直接在方法中调用redis的jedis的客户端
parseHtml(html);
} catch (Exception e) {
System.out.println("首页访问失败!" + indexUrl);
System.out.println("错误信息" +  e);
}
// 4、做分页请求
int page = 1;
for (int num = 2; num <= 100; num++) {
page = (2 * num) - 1;
String pagingUrl="https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&click=0&page="+page;
HttpGet httpGet = new HttpGet(pagingUrl);
try {
String pagingHtml = getHtml(httpGet);
parseHtml(pagingHtml);
} catch (Exception e) {
System.out.println("请求分页失败,分页编号是:"+page);
System.out.println("错误信息:"+e);
}
try {
Thread.sleep(2*1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}


}


/**
* 解析搜索页的首页时,需要获取到pid

* @param html
*/
public static void parseHtml(String html) {
if (html != null) {
Document doc = Jsoup.parse(html);
Elements eles = doc.select("[data-pid]");
for (Element element : eles) {
//jedis.lpush("itcast:spider:jd:pids", element.attr("data-pid"));
try {
//把爬取到的商品id放入到pidsQueue中
pidsQueue.put(element.attr("data-pid"));
} catch (Exception e) {
e.printStackTrace();
}
}
}


}


/**
* 专门用来访问httpget请求的方法

* @param httpGet
* @return
* @throws IOException
* @throws ClientProtocolException
*/
public static String getHtml(HttpGet httpGet) throws IOException, ClientProtocolException {
String html = null;
httpGet.setHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding","gzip, deflate, br");
httpGet.setHeader("Accept-Language","zh-CN,zh;q=0.8");
httpGet.setHeader("Cache-Control","max-age=0");
httpGet.setHeader("Connection","keep-alive");
httpGet.setHeader("Cookie"," ");
httpGet.setHeader("Host","search.jd.com");
httpGet.setHeader("Upgrade-Insecure-Requests","1");
httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/60.0.3112.113 Safari/537.36");
CloseableHttpClient hc = HttpClients.createDefault();
CloseableHttpResponse res = hc.execute(httpGet);
if (res.getStatusLine().getStatusCode() == 200) {
HttpEntity entity = res.getEntity();
html = EntityUtils.toString(entity, Charset.forName("utf-8"));
}
return html;
}
}


原创粉丝点击