Java 爬虫

来源：互联网　发布：生化危机mac版　编辑：程序博客网　时间：2024/05/16 08:10

一、原理

首先创建 HttpClient 对象。发送 GET 请求时创建 HttpGet 对象并指定 URL，发送 POST 请求时创建 HttpPost 对象并指定 URL，然后调用 HttpClient 的 execute 方法发送请求并得到响应。

二、小例子

package com.xiang;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import org.apache.http.client.config.CookieSpecs;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;public class Spider {public static void main(String[] args) {// HttpClient 超时配置RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).setConnectionRequestTimeout(6000).setConnectTimeout(6000).build();CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();//for (int i = 0; i < 100; i++) {//页面上有页码用到，提高效率，并用多线程HttpGet httpGet = new HttpGet("http://www.baidu.com");//此处填写地址 创建一个get请求httpGet.addHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");CloseableHttpResponse response = null;InputStream in = null;try {// 不敢爬太快，封ip就不好了//Thread.sleep(3600);response = httpClient.execute(httpGet);in = response.getEntity().getContent();String html = convertStreamToString(in);new Thread(new BaiduParser(html)).start();} catch (Exception e) {//do nothing}finally{try {if(response != null){    response.close();}} catch (IOException e) {// do nothing}}//}}//将爬到的内容转化为Stringprivate static String convertStreamToString(InputStream in) {BufferedReader reader = new BufferedReader(new InputStreamReader(in));StringBuilder sb = new StringBuilder();String line = null;try {while ((line = reader.readLine()) != null) {sb.append(line + "\n");}} catch (IOException e) {e.printStackTrace();} finally {try {in.close();} catch (IOException e) {e.printStackTrace();}}return sb.toString();}}

package com.xiang;public class BaiduParser implements Runnable{String html;public BaiduParser(String html) {    this.html = html;}public void run() {System.out.println(html);//通过正则表达式或截取取得自己想要的内容}}

1 0