Extracting Web Page Information with Regular Expressions

The ObtainNews class below fetches a news article page from www.reuters.com, uses java.util.regex to pull out the title, body, timestamp, byline, channel, image link and video link, prints the fields, and appends them as a tab-separated record to D://News.txt.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class ObtainNews {


    /**
     * @param htmlurl the URL of the page to read
     * @return the full content of the web page as one string
     * @throws IOException
     */
    public String getHtml(final String htmlurl) throws IOException {
        URL url;
        String temp;
        final StringBuffer sb = new StringBuffer();
        try {
            url = new URL(htmlurl);
            // Read the whole page; readLine() drops the line breaks.
            final BufferedReader in = new BufferedReader(new InputStreamReader(
                    url.openStream(), "utf-8"));
            while ((temp = in.readLine()) != null) {
                sb.append(temp);
            }
            in.close();
        } catch (final MalformedURLException me) {
            System.out.println("The URL you entered is malformed: " + me.getMessage());
            throw me;
        } catch (final IOException e) {
            e.printStackTrace();
            throw e;
        }
        return sb.toString();
    }


    /**
     * Extract: title / body / publish time / publisher: null / source site: reuters /
     * reporter / channel / subject flag: null / images / video
     */
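    // Because getHtml() concatenates the page line by line without newlines, the
    // non-greedy ".*?" expressions in the extractor methods below can match across
    // the original line breaks without needing Pattern.DOTALL. Pattern.CANON_EQ only
    // enables canonical (Unicode) equivalence and does not affect that behaviour.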


    public String getTitle(final String s) {
        String regex;
        String title = "";
        final List<String> list = new ArrayList<String>();
        // The article title sits inside an <h1> element.
        regex = "<h1>.*?</h1>";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            title = title + list.get(i);
        }
        return title;
    }

    public String getContent(final String s) {
        String regex;
        String content = "";
        final List<String> list = new ArrayList<String>();
        // The body starts at the midArticle_start marker span.
        regex = "<span id=\"midArticle_start\"></span>.*?</span></span>";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            content = content + list.get(i);
        }
        return content;
    }

    public String getTime(final String s) {
        String regex;
        String time = "";
        final List<String> list = new ArrayList<String>();
        // The runs of spaces are literal and must match the page markup exactly.
        regex = "<span class=\"timestamp\">.*?</span>        </p>";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            time = time + list.get(i);
        }
        return time;
    }

    public String getReporter(final String s) {
        String regex;
        String reporter = "";
        final List<String> list = new ArrayList<String>();
        regex = "<p class=\"byline\">.*?</p>        <p>";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            reporter = reporter + " " + list.get(i);
        }
        return reporter;
    }

    public String getChannel(final String s) {
        String regex;
        String channel = "";
        final List<String> list = new ArrayList<String>();
        regex = "<div class=\"actionButton\">.*?</a></div>";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            channel = channel + " " + list.get(i);
        }
        return channel;
    }

    public String getImgsrc(final String s) {
        String regex;
        String imgsrc = "";
        final List<String> list = new ArrayList<String>();
        regex = "<img        src=\".*?\"        border";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            imgsrc = imgsrc + list.get(i);
        }
        return imgsrc;
    }

    public String getVideosrc(final String s) {
        String regex;
        String videosrc = "";
        final List<String> list = new ArrayList<String>();
        regex = "<div class=\"photo\">.*?<img";
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++) {
            videosrc = videosrc + list.get(i);
        }
        return videosrc;
    }

    /**
     * @param s
     * @return the input with all HTML tags stripped
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }
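    // A quick illustration (hypothetical input, not from the original post):
    //
    //     new ObtainNews().outTag("<h1>Some headline</h1>")   ->   "Some headline"
    //
    // The non-greedy <.*?> pattern removes each tag separately, so adjacent and
    // nested tags are all stripped while the text between them is kept.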


    /**
     * @param s the article URL
     * @return all extracted fields collected into a map
     */
    public HashMap<String, String> getFromWeb(final String s) {

        final HashMap<String, String> hm = new HashMap<String, String>();
        String html = "";
        System.out.println("\nReading page (" + s + ")");
        try {
            html = getHtml(s);
        } catch (final Exception e) {
            e.printStackTrace();
        }
        System.out.println(html);
        System.out.println("Parsing result for (" + s + ")\n");
        String title = outTag(getTitle(html));
        String content = outTag(getContent(html));
        String time = outTag(getTime(html));
        String reporter = outTag(getReporter(html)).replaceAll("By ", "");
        String channel = outTag(getChannel(html));
        // Strip the surrounding <img ... src="..." ... border markup, leaving the bare image URL.
        String imgsrc = getImgsrc(html)
                .replaceAll("<img        src=\"", "").replaceAll("\"        border", "").replaceAll(" ", "");
        // Strip the surrounding <div class="photo"><a href="..."><img markup, leaving a relative link.
        String videosrc = getVideosrc(html)
                .replaceAll("<div class=\"photo\"><a href=\"", "").replaceAll("\"><img", "");
        hm.put("title", title);
        hm.put("content", content);
        hm.put("time", time);
        hm.put("reporter", reporter);
        hm.put("channel", channel);
        hm.put("imgsrc", imgsrc);
        hm.put("videosrc", videosrc);
        return hm;
    }


    /**
     * @param args
     *            test pages from www.reuters.com
     */
    public static void main(final String[] args) {

        String url = "";
        final List<String> list = new ArrayList<String>();
        System.out.print("Enter news page URLs (one per line), then type run on its own line\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
                System.in));
        //http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        //http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            while (!(url = br.readLine()).equals("run")) {
                list.add(url);
            }
        } catch (final Exception e) {
            e.printStackTrace();
        }
        final ObtainNews on = new ObtainNews();
        HashMap<String, String> hm = new HashMap<String, String>();
        for (int i = 0; i < list.size(); i++) {
            hm = on.getFromWeb(list.get(i));
            String title = hm.get("title");
            String content = hm.get("content");
            String time = hm.get("time");
            String publisher = null;
            String site = "reuters";
            // Treat empty extraction results as null (compare string contents with isEmpty(), not ==).
            String reporter = hm.get("reporter");
            if (reporter.isEmpty()) reporter = null;
            String channel = hm.get("channel");
            if (channel.isEmpty()) channel = null;
            String subject = null;
            String imgsrc = hm.get("imgsrc");
            if (imgsrc.isEmpty()) imgsrc = null;
            String videosrc = hm.get("videosrc");
            if (videosrc.isEmpty()) videosrc = null;
            else {
                // The extracted video link is relative, so prepend the site root.
                videosrc = "http://www.reuters.com" + videosrc;
                videosrc = videosrc.replaceAll(" ", "");
            }
            String str = list.get(i) + "\t" + title + "\t" + content + "\t" + time + "\t" + publisher + "\t"
                    + site + "\t" + reporter + "\t" + channel + "\t" + subject + "\t" + imgsrc + "\t" + videosrc + "\n";

            System.out.println("URL: " + list.get(i));
            System.out.println("Title: " + title);
            System.out.println("Body: " + content);
            System.out.println("Publish time: " + time);
            System.out.println("Publisher: " + publisher);
            System.out.println("Source site: " + site);
            System.out.println("Reporter: " + reporter);
            System.out.println("Channel: " + channel);
            System.out.println("Subject: " + subject);
            System.out.println("Image link: " + imgsrc);
            System.out.println("Video link: " + videosrc);
            System.out.println(str);

            try {
                // Append the tab-separated record to D://News.txt in UTF-8.
                FileOutputStream fos = new FileOutputStream("D://News.txt", true);
                OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
                BufferedWriter bw = new BufferedWriter(osw);

                bw.write(str);
                bw.flush();
                bw.close();
            } catch (FileNotFoundException e1) {
                e1.printStackTrace();
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }

        }
    }
}
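If you would rather drive the extractor from code than from standard input, here is a minimal sketch of programmatic use. It assumes ObtainNews is compiled on the classpath, reuses one of the sample URLs from the comments in main(), and will only yield non-empty fields as long as the target page still uses the 2014-era Reuters markup the regular expressions were written for.

import java.util.HashMap;

public class ObtainNewsDemo {
    public static void main(final String[] args) {
        final ObtainNews on = new ObtainNews();
        // One of the sample Reuters URLs listed in main() above.
        final HashMap<String, String> fields = on.getFromWeb(
                "http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404");
        System.out.println("Title: " + fields.get("title"));
        System.out.println("Time:  " + fields.get("time"));
        System.out.println("Image: " + fields.get("imgsrc"));
    }
}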