抓取远程网页并解析HTML

来源:互联网 发布:淘宝钓鱼椅子图片价格, 编辑:程序博客网 时间:2024/05/22 02:27


正则表达式 HTML Apache

 学习 Java 的正则表达式:抓取远程网页并解析 HTML 的部分内容。

 

 
Java代码  收藏代码

    import java.util.regex.Matcher;  
    import java.util.regex.Pattern;  
    import org.apache.commons.httpclient.HttpClient;  
    import org.apache.commons.httpclient.HttpStatus;  
    import org.apache.commons.httpclient.methods.GetMethod;  
      
    public class HttpClientDemo {  
          
        /**
         *  
         * @param url
         * @return
         * @throws Exception
         */  
        public static String getHTML(String url) throws Exception {  
            HttpClient httpClient = new HttpClient();  
            GetMethod getMethod = new GetMethod(url);  
            int statusCode = httpClient.executeMethod(getMethod);  
            if (statusCode != HttpStatus.SC_OK) {  
                System.err.println("Method failed: " + getMethod.getStatusLine());  
                return null;  
            }  
            // 读取内容  
            byte[] responseBody = getMethod.getResponseBody();  
            getMethod.releaseConnection();  
            return new String(responseBody);  
      
        }  
        /**
         *  
         * @throws Exception
         */  
        public static void test(String url) throws Exception{  
              
            String html = getHTML(url);  
            Pattern p = null;  
            Matcher m = null;  
            StringBuffer sb0 = new StringBuffer();  
            // ul正则  
            String regex = "<ul class=\"d2_9\">([\\s\\S]*<li>)<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]</li>([\\s].*)";  
            // 链接正则  
            String regexa = "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]";  
            p = Pattern.compile(regex);  
            // m = p.matcher(sb.toString());  
            m = p.matcher(html);  
            int count = 0;  
            // ul字符串  
            while (m.find()) {  
                sb0.append(m.group());  
            }  
            //System.out.println(sb0.toString());  
            p = Pattern.compile(regexa);  
            m = p.matcher(sb0.toString());  
            // 链接地址和标题  
            while (m.find()) {  
                System.out.println("地址:" + m.group(1));  
                System.out.println("标题:" + m.group(2));  
                System.out.println("时间:" + m.group(3));  
                count++;  
            }  
              
            System.out.println("抓取条数:"+count);  
      
        }  
          
        public static void main(String[] args) throws Exception {  
            String url = "http://cpc.people.com.cn/GB/194302/194306/index.html";  
            test(url);  
              
        }  
    } 
0 0
原创粉丝点击