从国家统计局爬取山东省市区县乡镇

来源:互联网 发布:注销淘宝店铺重新开通 编辑:程序博客网 时间:2024/04/29 20:51
项目需要,写了个简单的程序爬取山东省市区县乡镇区划信息。

依赖的jar包来源于httpcomponents-client-4.2.5-bin.zip;


package org.apache.http.examples.test;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

/**
 * Crawls administrative-division pages (city / county / town rows) from the
 * National Bureau of Statistics site, starting at {@code <pre>.html}, and
 * writes every "href, name" pair it finds to {@code <pre>.html.txt} as well as
 * to standard output.
 *
 * <p>The crawl is recursive: every link found in a matching table row is
 * fetched in turn by {@link #get(String, String)}.
 */
public class DailySign {

    /** Base URL of the 2014 division-code listing; request paths are appended to it. */
    public static final String URL_GET = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2014/";

    /** Number of requests issued so far; used only for throttling. */
    public static int count = 0;

    /**
     * Leading digits that identify the "code" column link of a row (link text
     * starting with this value is treated as a code, anything else as a name).
     * NOTE(review): the surrounding article talks about Shandong province;
     * confirm that "12" is the intended province code.
     */
    public static String pre = "12";

    /** Entry page requested first, relative to {@link #URL_GET}. */
    public static String file = pre + ".html";

    /** Base name of the output file; ".txt" is appended in {@link #main(String[])}. */
    public static String fileName = pre + ".html";

    /** Output file writer, opened by {@link #main(String[])}. */
    public static FileWriter writer = null;

    /** Accumulates all output lines; flushed to {@link #writer} when the crawl ends. */
    public static StringBuffer sb = new StringBuffer();

    /** Matches a single-quoted anchor: group 1 is the href, group 2 the link text. */
    public static Pattern p = Pattern.compile("<a href='([^>]*)'>([^<]*)</a>");

    /** HTTP status code of a successful response. */
    private static final int HTTP_OK = 200;

    /**
     * Runs the crawl from the entry page and writes the collected rows to
     * {@code fileName + ".txt"}.
     *
     * <p>Whatever has been collected is written out even when the crawl aborts
     * with an exception, and the writer is always closed.
     *
     * @param args unused
     * @throws Exception if the output file cannot be opened or the crawl fails
     */
    public static void main(String[] args) throws Exception {
        writer = new FileWriter(fileName + ".txt");
        try {
            get("", file);
        } finally {
            try {
                writer.write(sb.toString());
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Fetches {@code URL_GET + prefix + req} and hands the response to
     * {@link #showResult(HttpResponse)}.
     *
     * @param prefix path segment inserted between the base URL and {@code req}; may be empty
     * @param req    request path of the page to fetch
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection or read failure
     * @throws InterruptedException    if the throttling sleep is interrupted
     */
    public static void get(String prefix, String req)
            throws ClientProtocolException, IOException, InterruptedException {
        count++;
        // Pause after every 200 requests so the site does not reject a burst of traffic.
        if (count % 200 == 0) {
            Thread.sleep(1000);
        }
        DefaultHttpClient client = new DefaultHttpClient();
        try {
            HttpGet request = new HttpGet(URL_GET + prefix + req);
            HttpResponse response = client.execute(request);
            showResult(response);
        } finally {
            // Each call owns its client, so its connection manager must be released here.
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Reads the response body, prints and records every link found in city,
     * county and town rows, and recursively fetches each linked page.
     *
     * <p>Responses with a status other than 200 are drained and ignored. The
     * body is read completely and closed before any linked page is fetched, so
     * no connection is held open across the recursive crawl.
     *
     * @param response the response to process
     * @throws IOException                  on a read failure, or a failure in a recursive fetch
     * @throws UnsupportedEncodingException if the GBK charset is not available
     * @throws InterruptedException         if a recursive fetch is interrupted while throttling
     */
    public static void showResult(HttpResponse response)
            throws IOException, UnsupportedEncodingException, InterruptedException {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return;
        }
        int status = response.getStatusLine().getStatusCode();
        if (status != HTTP_OK) {
            // Error pages carry no division rows; just release the connection.
            EntityUtils.consume(entity);
            return;
        }
        List<String> rows = readRegionRows(entity);
        for (String row : rows) {
            processRow(row);
        }
    }

    /** Returns true when the line is a city, county or town table row. */
    private static boolean isRegionRow(String line) {
        return line.startsWith("<tr class='citytr'>")
                || line.startsWith("<tr class='countytr'>")
                || line.startsWith("<tr class='towntr'>");
    }

    /** Reads the GBK-encoded body and returns its region rows in page order, closing the stream. */
    private static List<String> readRegionRows(HttpEntity entity) throws IOException {
        List<String> rows = new ArrayList<String>();
        InputStream instream = entity.getContent();
        try {
            // The reader already decodes GBK into Java strings; no further
            // re-encoding is needed (and doing so would corrupt the text).
            BufferedReader reader = new BufferedReader(new InputStreamReader(instream, "GBK"));
            String line;
            while ((line = reader.readLine()) != null) {
                if (isRegionRow(line)) {
                    rows.add(line);
                }
            }
        } finally {
            instream.close();
        }
        EntityUtils.consume(entity);
        return rows;
    }

    /** Emits every link of one region row and recursively fetches the pages behind name links. */
    private static void processRow(String row) throws IOException, InterruptedException {
        Matcher m = p.matcher(row);
        while (m.find()) {
            String code = m.group(1); // href of the anchor
            String name = m.group(2); // visible link text
            if (name.startsWith(pre)) {
                // Link text is the numeric code column: emit it and stay on the same output line.
                System.out.print(name + "\t");
                sb.append(name).append("\t");
            } else {
                // Link text is the region name: emit "href<TAB>name" and end the output line.
                System.out.print(code + "\t");
                sb.append(code).append("\t");
                sb.append(name).append("\r\n");
                System.out.println(name);
                String prefix = "";
                if (row.startsWith("<tr class='countytr'>")) {
                    // County hrefs are relative to the province directory, whose name is
                    // taken from characters 3-4 of the href.
                    // NOTE(review): assumes hrefs shaped like "01/120101.html" - confirm.
                    prefix = "/" + code.substring(3, 5);
                }
                // Recurse into the linked page.
                get(prefix, "/" + code);
            }
        }
    }
}


0 0
原创粉丝点击