简单读取网站页面内容demo

来源：互联网 | 发布：怛罗斯之战知乎 | 编辑：程序博客网 | 时间：2024/05/21 22:46
package Test;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.MalformedURLException;import java.net.URL;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;public class Test {/** * 读取一个网页全部内容 * @param htmlurl * @return * @throws IOException */public String getOneHtml(String htmlurl) throws IOException{URL url;String temp;StringBuffer sb = new StringBuffer();try{url = new URL(htmlurl);BufferedReader in  = new BufferedReader(new InputStreamReader(url.openStream(),"utf-8"));while((temp = in.readLine())!=null){sb.append(temp);}in.close();}catch(MalformedURLException me){System.out.println("您输入的URL格式有问题！请检查后再次输入！");me.getMessage();throw me;}catch(IOException e){e.printStackTrace();throw e;}return sb.toString();}/** * 获得网页标题 * @param args */public String getTitle(String s){String regex;String title="";List<String> list = new ArrayList<String>();regex = "<title>.*?</title>";Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);Matcher ma = pa.matcher(s);while(ma.find()){list.add(ma.group());}for(int i=0;i<list.size();i++){title = title+list.get(i);}return outTag(title);}//获得链接public List<String> getLink(String s){String regex;String regex1;List<String> list = new ArrayList<String>();regex = "<a[^>]*href=('([^']*)'|'([^']*)'|([^s>]*))[^>]*>(.*?)</a>";Pattern pa = Pattern.compile(regex,Pattern.DOTALL);Matcher ma = pa.matcher(s);while(ma.find()){list.add(ma.group());}return list;}/** * 获得脚本代码 * @param s * @return */public List<String> getScript(String s){String regex ;List<String> list = new ArrayList<String>();regex = "<script.*?</script>";Pattern pa = Pattern.compile(regex, Pattern.DOTALL);Matcher ma = pa.matcher(s);while(ma.find()){list.add(ma.group());}return list;}/** * 获得CSS * @param s * @return */public List<String> getCSS(String s){String regex;List<String> list = new ArrayList<String>();regex = "<style.*?</style>";Pattern pa = 
Pattern.compile(regex,Pattern.DOTALL);Matcher ma = pa.matcher(s);while(ma.find()){list.add(ma.group());}return list;}public  String outTag(String s) { return s.replaceAll("<.*?>", "");}/** * 获取京东团购文章标题及内容 * @param args */public HashMap<String,String> getFrom360buy(String s){HashMap<String,String> hm = new HashMap<String,String>();StringBuffer sb = new StringBuffer();String html = "";System.out.println("------------开始读取网页("+s+")---------");try{html = getOneHtml(s);}catch(Exception e){e.getMessage();}System.out.println("--------------读取网页("+s+")结束----------");System.out.println("--------------分析("+s+")结果如下----------");String title = outTag(getTitle(html));//title = title.replace("京东团购", "");Pattern pa = Pattern.compile("<h1.*?</h1>",Pattern.DOTALL);Matcher ma = pa.matcher(html);while(ma.find()){sb.append(ma.group());}String temp = sb.toString();temp = temp.replaceAll("<p><em>.*?</em></p>","");hm.put("title", title);hm.put("no", outTag(temp));return hm;}/** * 测试一组网页 * @param args */public static void main(String[] args) {String url = "";List<String> list = new ArrayList<String>();System.out.println("输入URL，一行一个，输入结束后输入go程序开始运行");BufferedReader br = new BufferedReader(new InputStreamReader(System.in));try{while(!(url=br.readLine()).equals("go")){list.add(url);}}catch(Exception e){e.getMessage();} Test t = new Test(); HashMap<String, String> hm = new HashMap<String, String>();        for (int i = 0; i < list.size(); i++) {            hm = t.getFrom360buy(list.get(i));            System.out.println(list.get(i));            System.out.println("标题： " + hm.get("title"));            System.out.println("内容：  " + hm.get("no"));        }}}