抓取网页内容

来源:互联网 发布:淘宝子账号有哪些权限 编辑:程序博客网 时间:2024/06/14 22:27
/**
 * Opens a connection to {@code url} and reads the response line by line into a buffer.
 *
 * <p>NOTE(review): this listing is cut off inside the read loop. The end of the loop,
 * the {@code catch} clause and the final return statement are not visible here, so the
 * error handling and the success return value cannot be confirmed from this snippet.
 *
 * @param url the address to fetch
 * @return {@code null} when {@code url} is blank or contains no http(s) URL-like
 *         substring; the return value after a successful read is not visible here
 *         (presumably the accumulated buffer contents; confirm against the full source)
 */
public static final String generate(final String url) {
// Reject input that StringUtils.isBlank reports as blank.
if (StringUtils.isBlank(url)) {
return null;
}
// Accepts "http://" or "https://" followed by one or more word characters, '.', '-', '/' or ':'.
Pattern pattern = Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");
Matcher matcher = pattern.matcher(url);
// find() searches anywhere in the string, so this only requires that the input
// CONTAINS such a substring; it does not require the whole input to match.
if (!matcher.find()) {
return null;
}
StringBuffer sb = new StringBuffer();
try {
URL _url = new URL(url);
URLConnection urlConnection = _url.openConnection();
// No charset is passed to InputStreamReader, so the bytes are decoded with the
// platform default charset rather than the one the page declares.
// NOTE(review): no close() of this reader is visible in the truncated listing.
BufferedReader in = new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
String inputLine;
// readLine() strips line terminators and nothing is appended in their place,
// so the lines end up concatenated with no separator between them.
while ((inputLine = in.readLine()) != null) {

sb.append(inputLine);




String url = "http://www.bnu.edu.cn";//www.bnu.edu.cn";

02URL theUrl= newURL(url);
03openStream = theUrl.openStream();
04//<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
05//构建输入流的的字符集必须和HTML源码中的 charset一致
06bf = newBufferedReader(new InputStreamReader(openStream,"utf-8"));
07String line = null;
08while((line = bf.readLine())!=null) {
09    System.out.println(line);
10}