爬虫技术(2)--抓取网页java代码实现

来源：互联网 | 发布：betternet mac 下载 | 编辑：程序博客网 | 时间：2024/06/02 01:59
package creeper.part1.capturepage;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Crawler basics: fetches a single web page with Apache HttpClient and prints
 * its body to standard output.
 *
 * <p>The class name is kept in its original (lower-case) form so that existing
 * references to it keep working.
 */
public class capturePage {

    /** The page requested by this demo. */
    private static final String TARGET_URL = "http://www.baidu.com";

    /**
     * Issues a GET request for {@link #TARGET_URL} and prints the request URI
     * followed by the response body.
     *
     * <p>I/O failures, including a non-2xx status reported by the response
     * handler, are written to standard error instead of being silently
     * discarded; the program still terminates normally in that case.
     *
     * @param args command-line arguments (not used)
     * @throws Exception declared for compatibility with the original signature
     */
    public static void main(String[] args) throws Exception {
        // The HttpClient acts like a browser instance. Since HttpClient 4.3 the
        // closeable variant is the supported one; try-with-resources closes it
        // on every exit path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // Proxy configuration would go here (omitted).

            // Building the GET request is the equivalent of typing the address.
            HttpGet get = new HttpGet(TARGET_URL);
            System.out.println("---------URI----------");
            System.out.println(get.getURI());

            // Executing the request is the equivalent of pressing Enter.
            String responseBody = httpClient.execute(get, new BodyHandler());
            System.out.println("----------------responseBody-----------------");
            System.out.println(responseBody);
            System.out.println("----------------responseBody-----------------");
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so a rejected status
            // code is reported here as well.
            System.err.println("Failed to fetch " + TARGET_URL + ": " + e);
        }
    }

    /**
     * Converts a response into its body text, rejecting any status outside
     * the 2xx range.
     */
    private static final class BodyHandler implements ResponseHandler<String> {

        /**
         * @param response the response received for the request
         * @return the body as text, or {@code null} when the response carries no entity
         * @throws ClientProtocolException if the status code is not in [200, 300)
         * @throws IOException if reading the entity fails
         */
        @Override
        public String handleResponse(HttpResponse response)
                throws ClientProtocolException, IOException {
            int status = response.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300) {
                throw new ClientProtocolException("status:" + status);
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            // The charset declared by the response wins; UTF-8 is only the
            // fallback (the library default would otherwise be ISO-8859-1).
            return EntityUtils.toString(entity, StandardCharsets.UTF_8);
        }
    }
}
0 0