开始学习——爬取微博页面的html

来源：互联网发布：奥地利经济学派知乎编辑：程序博客网时间：2024/05/22 06:29

今天开始正式着手写爬取微博搜索结果的爬虫。上个星期找了很多资料，也做了很多尝试和思考；这个星期开始一点一点地写。作为新手入门，从最简单的开始：先爬取一个微博页面的 HTML。

package sina_weibo;

// Fetches the HTML of a Weibo page (based on Demo4).

import java.awt.Desktop;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

/**
 * Downloads the HTML of a single Weibo page with Apache HttpClient and,
 * optionally, opens the same page in the system default browser.
 *
 * <p>The class name and the method name {@code runBroswer} are kept as-is so
 * that existing callers continue to compile.
 */
@SuppressWarnings("deprecation") // BrowserCompatSpec and the *SpecFactory classes are deprecated
public class getHtml {

    /** Socket and connect timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 1000;

    /** Name under which the lenient cookie spec is registered. */
    private static final String EASY_COOKIE_SPEC = "easy";

    /** Value returned by {@link #getHTML(String)} when the page could not be fetched. */
    private static final String FETCH_FAILED = "html获取失败";

    /**
     * Opens the given URL in the default browser.
     *
     * <p>Does nothing when the platform has no desktop support or the desktop
     * does not support the BROWSE action.
     *
     * @param url the address to open
     * @throws URISyntaxException if {@code url} is not a valid URI
     * @throws IOException if the browser cannot be found or launched
     */
    public void runBroswer(String url) throws URISyntaxException, IOException {
        // Desktop.getDesktop() throws on unsupported/headless platforms, so the
        // support check has to come first.
        if (!Desktop.isDesktopSupported()) {
            return;
        }
        Desktop desktop = Desktop.getDesktop();
        if (desktop.isSupported(Desktop.Action.BROWSE)) {
            desktop.browse(new URI(url));
        }
    }

    /**
     * Fetches the body of {@code url} as a string.
     *
     * <p>An {@link IOException} raised while executing the request or reading
     * the body is reported on the console and not propagated; in that case, and
     * when the response has no body, the placeholder text {@code "html获取失败"}
     * is returned. No retry is performed.
     *
     * @param url the page address
     * @return the response body, or the failure placeholder
     * @throws URISyntaxException declared for compatibility with existing callers
     * @throws ClientProtocolException declared for compatibility with existing callers
     * @throws IOException declared for compatibility with existing callers
     */
    public String getHTML(String url) throws URISyntaxException, ClientProtocolException, IOException {
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec(EASY_COOKIE_SPEC)
                .setSocketTimeout(TIMEOUT_MILLIS)   // socket (read) timeout
                .setConnectTimeout(TIMEOUT_MILLIS)  // connect timeout
                .build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(requestConfig);

        String html = FETCH_FAILED; // stays unchanged if the fetch fails
        // try-with-resources releases the connection and the client's pool
        // on every path.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                     .setDefaultCookieSpecRegistry(buildCookieSpecRegistry())
                     .setDefaultRequestConfig(requestConfig)
                     .build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // UTF-8 is only the fallback; a charset declared in the
                // Content-Type header still takes precedence.
                html = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            }
        } catch (IOException e) {
            System.out.println("****连接超时，程序自动重连****");
            // Surface the real cause instead of silently discarding it.
            System.err.println("Failed to fetch " + url + ": " + e);
        }
        return html;
    }

    /**
     * Builds the cookie-spec registry: the two stock specs plus an "easy" spec
     * that is browser-compatible but skips cookie validation entirely.
     */
    private static Registry<CookieSpecProvider> buildCookieSpecRegistry() {
        CookieSpecProvider easySpecProvider = new CookieSpecProvider() {
            @Override
            public CookieSpec create(HttpContext context) {
                return new BrowserCompatSpec() {
                    @Override
                    public void validate(Cookie cookie, CookieOrigin origin)
                            throws MalformedCookieException {
                        // Intentionally empty: accept every cookie.
                    }
                };
            }
        };
        return RegistryBuilder.<CookieSpecProvider>create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory())
                .register(EASY_COOKIE_SPEC, easySpecProvider)
                .build();
    }

    /**
     * Opens the sample Weibo page in the browser and returns its HTML.
     *
     * @return the HTML of the sample page, or the failure placeholder
     * @throws IOException if the browser cannot be launched
     * @throws URISyntaxException if the sample URL is not a valid URI
     */
    public String get() throws IOException, URISyntaxException {
        String url = "http://weibo.com/1642088277/C8P1zpVDP";
        runBroswer(url);
        return getHTML(url);
    }

    public static void main(String[] args) throws IOException, URISyntaxException {
        getHtml test = new getHtml();
        System.out.println(test.get());
    }
}

返回的即为该微博页面的 HTML，之后可以用 jsoup 对其进行解析处理。

0 0