利用正则表达式抽取网页信息
来源:互联网 发布:福建厦门广电网络 编辑:程序博客网 时间:2024/06/05 10:41
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ObtainNews {

    /*
     * Extraction patterns, compiled once instead of on every call.
     * CANON_EQ is kept from the original code; it enables canonical
     * equivalence matching and does not change results for these
     * ASCII-only patterns.
     */
    private static final Pattern TITLE_PAT =
            Pattern.compile("<h1>.*?</h1>", Pattern.CANON_EQ);
    private static final Pattern CONTENT_PAT =
            Pattern.compile("<span id=\"midArticle_start\"></span>.*?</span></span>", Pattern.CANON_EQ);
    private static final Pattern TIME_PAT =
            Pattern.compile("<span class=\"timestamp\">.*?</span> </p>", Pattern.CANON_EQ);
    private static final Pattern REPORTER_PAT =
            Pattern.compile("<p class=\"byline\">.*?</p> <p>", Pattern.CANON_EQ);
    private static final Pattern CHANNEL_PAT =
            Pattern.compile("<div class=\"actionButton\">.*?</a></div>", Pattern.CANON_EQ);
    private static final Pattern IMG_PAT =
            Pattern.compile("<img src=\".*?\" border", Pattern.CANON_EQ);
    private static final Pattern VIDEO_PAT =
            Pattern.compile("<div class=\"photo\">.*?<img", Pattern.CANON_EQ);

    /**
     * Reads the complete HTML of the page at {@code htmlurl}.
     *
     * @param htmlurl the page URL
     * @return the page content; lines are concatenated without separators,
     *         as in the original implementation
     * @throws IOException if the URL is malformed or the download fails
     */
    public String getHtml(final String htmlurl) throws IOException {
        final StringBuilder sb = new StringBuilder();
        try {
            final URL url = new URL(htmlurl);
            // try-with-resources guarantees the stream is closed even when
            // readLine() throws (the original leaked it on failure).
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(url.openStream(), "utf-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        return sb.toString();
    }

    /**
     * Concatenates every match of {@code pattern} in {@code s}, each match
     * preceded by {@code prefix}. Shared by all the extraction getters
     * below (the original repeated this loop seven times).
     */
    private String collect(final Pattern pattern, final String s, final String prefix) {
        final StringBuilder sb = new StringBuilder();
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            sb.append(prefix).append(ma.group());
        }
        return sb.toString();
    }

    /** @return every {@code <h1>...</h1>} fragment in {@code s}, concatenated. */
    public String getTitle(final String s) {
        return collect(TITLE_PAT, s, "");
    }

    /** @return every article-body fragment (midArticle_start span) in {@code s}, concatenated. */
    public String getContent(final String s) {
        return collect(CONTENT_PAT, s, "");
    }

    /** @return every timestamp span in {@code s}, concatenated. */
    public String getTime(final String s) {
        return collect(TIME_PAT, s, "");
    }

    /** @return every byline paragraph in {@code s}, each preceded by a space. */
    public String getReporter(final String s) {
        return collect(REPORTER_PAT, s, " ");
    }

    /** @return every actionButton div in {@code s}, each preceded by a space. */
    public String getChannel(final String s) {
        return collect(CHANNEL_PAT, s, " ");
    }

    /** @return every {@code <img src="..." border} fragment in {@code s}, concatenated. */
    public String getImgsrc(final String s) {
        return collect(IMG_PAT, s, "");
    }

    /** @return every photo-div fragment (up to {@code <img}) in {@code s}, concatenated. */
    public String getVideosrc(final String s) {
        return collect(VIDEO_PAT, s, "");
    }

    /**
     * Strips all HTML tags from {@code s}.
     *
     * @param s text possibly containing markup
     * @return {@code s} with every {@code <...>} tag removed
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }

    /**
     * Downloads one news page and extracts its fields.
     *
     * @param s the article URL
     * @return map with keys {@code title}, {@code content}, {@code time},
     *         {@code reporter}, {@code channel}, {@code imgsrc},
     *         {@code videosrc}; absent fields map to the empty string
     */
    public HashMap<String, String> getFromWeb(final String s) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        String html = "";
        System.out.println("\n开始读取网页(" + s + ")");
        try {
            html = getHtml(s);
        } catch (final IOException e) {
            // Report the failure instead of silently discarding it (the
            // original called e.getMessage() and ignored the result);
            // extraction then runs on "" and yields empty fields.
            e.printStackTrace();
        }
        System.out.println(html);
        System.out.println("分析(" + s + ")结果\n");
        hm.put("title", outTag(getTitle(html)));
        hm.put("content", outTag(getContent(html)));
        hm.put("time", outTag(getTime(html)));
        // Drop the "By " prefix from the byline text.
        hm.put("reporter", outTag(getReporter(html)).replaceAll("By ", ""));
        hm.put("channel", outTag(getChannel(html)));
        // Reduce the <img src="..." border matches to the bare image URLs.
        hm.put("imgsrc", getImgsrc(html)
                .replaceAll("<img src=\"", "").replaceAll("\" border", "").replaceAll(" ", ""));
        // Reduce the photo-div matches to the bare (relative) link target.
        hm.put("videosrc", getVideosrc(html)
                .replaceAll("<div class=\"photo\"><a href=\"", "").replaceAll("\"><img", ""));
        return hm;
    }

    /**
     * Reads article URLs from stdin (one per line, terminated by the word
     * "run"), extracts each article, prints the fields, and appends a
     * tab-separated record to D://News.txt.
     * Test pages: www.reuters.com
     *
     * @param args unused
     */
    public static void main(final String[] args) {
        final List<String> list = new ArrayList<String>();
        System.out.print("输入新闻页面网址,换行输入run\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
                System.in));
        // http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        // http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            String url;
            // Also stop at end-of-stream: the original dereferenced the
            // null returned by readLine() when stdin closed before "run".
            while ((url = br.readLine()) != null && !url.equals("run")) {
                list.add(url);
            }
        } catch (final IOException e) {
            e.printStackTrace();
        }
        final ObtainNews on = new ObtainNews();
        for (int i = 0; i < list.size(); i++) {
            final HashMap<String, String> hm = on.getFromWeb(list.get(i));
            final String title = hm.get("title");
            final String content = hm.get("content");
            final String time = hm.get("time");
            final String publisher = null; // not available on the page
            final String site = "reuters";
            final String subject = null;   // not available on the page
            // Empty extractions become null so the record prints "null".
            // (The original compared with == "", a reference comparison
            // that only worked through the accident of string interning.)
            String reporter = hm.get("reporter");
            if (reporter.isEmpty()) {
                reporter = null;
            }
            String channel = hm.get("channel");
            if (channel.isEmpty()) {
                channel = null;
            }
            String imgsrc = hm.get("imgsrc");
            if (imgsrc.isEmpty()) {
                imgsrc = null;
            }
            String videosrc = hm.get("videosrc");
            if (videosrc.isEmpty()) {
                videosrc = null;
            } else {
                // Video links on the page are relative; make them absolute.
                videosrc = ("http://www.reuters.com" + videosrc).replaceAll(" ", "");
            }
            final String str = list.get(i) + "\t" + title + "\t" + content + "\t" + time + "\t"
                    + publisher + "\t" + site + "\t" + reporter + "\t" + channel + "\t"
                    + subject + "\t" + imgsrc + "\t" + videosrc + "\n";
            System.out.println("URL: " + list.get(i));
            System.out.println("标题: " + title);
            System.out.println("正文: " + content);
            System.out.println("发布时间: " + time);
            System.out.println("发布者:" + publisher);
            System.out.println("来源站点:" + site);
            System.out.println("记者:" + reporter);
            System.out.println("分类频道:" + channel);
            System.out.println("主题:" + subject);
            System.out.println("图片链接:" + imgsrc);
            System.out.println("视频链接:" + videosrc);
            System.out.println(str);
            // try-with-resources closes the writer even when write() fails
            // (the original leaked the stream on any exception).
            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream("D://News.txt", true), "utf-8"))) {
                bw.write(str);
            } catch (final IOException e) {
                e.printStackTrace();
            }
        }
    }
}
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ObtainNews {

    /*
     * Extraction patterns, compiled once instead of on every call.
     * CANON_EQ is kept from the original code; it enables canonical
     * equivalence matching and does not change results for these
     * ASCII-only patterns.
     */
    private static final Pattern TITLE_PAT =
            Pattern.compile("<h1>.*?</h1>", Pattern.CANON_EQ);
    private static final Pattern CONTENT_PAT =
            Pattern.compile("<span id=\"midArticle_start\"></span>.*?</span></span>", Pattern.CANON_EQ);
    private static final Pattern TIME_PAT =
            Pattern.compile("<span class=\"timestamp\">.*?</span> </p>", Pattern.CANON_EQ);
    private static final Pattern REPORTER_PAT =
            Pattern.compile("<p class=\"byline\">.*?</p> <p>", Pattern.CANON_EQ);
    private static final Pattern CHANNEL_PAT =
            Pattern.compile("<div class=\"actionButton\">.*?</a></div>", Pattern.CANON_EQ);
    private static final Pattern IMG_PAT =
            Pattern.compile("<img src=\".*?\" border", Pattern.CANON_EQ);
    private static final Pattern VIDEO_PAT =
            Pattern.compile("<div class=\"photo\">.*?<img", Pattern.CANON_EQ);

    /**
     * Reads the complete HTML of the page at {@code htmlurl}.
     *
     * @param htmlurl the page URL
     * @return the page content; lines are concatenated without separators,
     *         as in the original implementation
     * @throws IOException if the URL is malformed or the download fails
     */
    public String getHtml(final String htmlurl) throws IOException {
        final StringBuilder sb = new StringBuilder();
        try {
            final URL url = new URL(htmlurl);
            // try-with-resources guarantees the stream is closed even when
            // readLine() throws (the original leaked it on failure).
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(url.openStream(), "utf-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        return sb.toString();
    }

    /**
     * Concatenates every match of {@code pattern} in {@code s}, each match
     * preceded by {@code prefix}. Shared by all the extraction getters
     * below (the original repeated this loop seven times).
     */
    private String collect(final Pattern pattern, final String s, final String prefix) {
        final StringBuilder sb = new StringBuilder();
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            sb.append(prefix).append(ma.group());
        }
        return sb.toString();
    }

    /** @return every {@code <h1>...</h1>} fragment in {@code s}, concatenated. */
    public String getTitle(final String s) {
        return collect(TITLE_PAT, s, "");
    }

    /** @return every article-body fragment (midArticle_start span) in {@code s}, concatenated. */
    public String getContent(final String s) {
        return collect(CONTENT_PAT, s, "");
    }

    /** @return every timestamp span in {@code s}, concatenated. */
    public String getTime(final String s) {
        return collect(TIME_PAT, s, "");
    }

    /** @return every byline paragraph in {@code s}, each preceded by a space. */
    public String getReporter(final String s) {
        return collect(REPORTER_PAT, s, " ");
    }

    /** @return every actionButton div in {@code s}, each preceded by a space. */
    public String getChannel(final String s) {
        return collect(CHANNEL_PAT, s, " ");
    }

    /** @return every {@code <img src="..." border} fragment in {@code s}, concatenated. */
    public String getImgsrc(final String s) {
        return collect(IMG_PAT, s, "");
    }

    /** @return every photo-div fragment (up to {@code <img}) in {@code s}, concatenated. */
    public String getVideosrc(final String s) {
        return collect(VIDEO_PAT, s, "");
    }

    /**
     * Strips all HTML tags from {@code s}.
     *
     * @param s text possibly containing markup
     * @return {@code s} with every {@code <...>} tag removed
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }

    /**
     * Downloads one news page and extracts its fields.
     *
     * @param s the article URL
     * @return map with keys {@code title}, {@code content}, {@code time},
     *         {@code reporter}, {@code channel}, {@code imgsrc},
     *         {@code videosrc}; absent fields map to the empty string
     */
    public HashMap<String, String> getFromWeb(final String s) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        String html = "";
        System.out.println("\n开始读取网页(" + s + ")");
        try {
            html = getHtml(s);
        } catch (final IOException e) {
            // Report the failure instead of silently discarding it (the
            // original called e.getMessage() and ignored the result);
            // extraction then runs on "" and yields empty fields.
            e.printStackTrace();
        }
        System.out.println(html);
        System.out.println("分析(" + s + ")结果\n");
        hm.put("title", outTag(getTitle(html)));
        hm.put("content", outTag(getContent(html)));
        hm.put("time", outTag(getTime(html)));
        // Drop the "By " prefix from the byline text.
        hm.put("reporter", outTag(getReporter(html)).replaceAll("By ", ""));
        hm.put("channel", outTag(getChannel(html)));
        // Reduce the <img src="..." border matches to the bare image URLs.
        hm.put("imgsrc", getImgsrc(html)
                .replaceAll("<img src=\"", "").replaceAll("\" border", "").replaceAll(" ", ""));
        // Reduce the photo-div matches to the bare (relative) link target.
        hm.put("videosrc", getVideosrc(html)
                .replaceAll("<div class=\"photo\"><a href=\"", "").replaceAll("\"><img", ""));
        return hm;
    }

    /**
     * Reads article URLs from stdin (one per line, terminated by the word
     * "run"), extracts each article, prints the fields, and appends a
     * tab-separated record to D://News.txt.
     * Test pages: www.reuters.com
     *
     * @param args unused
     */
    public static void main(final String[] args) {
        final List<String> list = new ArrayList<String>();
        System.out.print("输入新闻页面网址,换行输入run\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
                System.in));
        // http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        // http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            String url;
            // Also stop at end-of-stream: the original dereferenced the
            // null returned by readLine() when stdin closed before "run".
            while ((url = br.readLine()) != null && !url.equals("run")) {
                list.add(url);
            }
        } catch (final IOException e) {
            e.printStackTrace();
        }
        final ObtainNews on = new ObtainNews();
        for (int i = 0; i < list.size(); i++) {
            final HashMap<String, String> hm = on.getFromWeb(list.get(i));
            final String title = hm.get("title");
            final String content = hm.get("content");
            final String time = hm.get("time");
            final String publisher = null; // not available on the page
            final String site = "reuters";
            final String subject = null;   // not available on the page
            // Empty extractions become null so the record prints "null".
            // (The original compared with == "", a reference comparison
            // that only worked through the accident of string interning.)
            String reporter = hm.get("reporter");
            if (reporter.isEmpty()) {
                reporter = null;
            }
            String channel = hm.get("channel");
            if (channel.isEmpty()) {
                channel = null;
            }
            String imgsrc = hm.get("imgsrc");
            if (imgsrc.isEmpty()) {
                imgsrc = null;
            }
            String videosrc = hm.get("videosrc");
            if (videosrc.isEmpty()) {
                videosrc = null;
            } else {
                // Video links on the page are relative; make them absolute.
                videosrc = ("http://www.reuters.com" + videosrc).replaceAll(" ", "");
            }
            final String str = list.get(i) + "\t" + title + "\t" + content + "\t" + time + "\t"
                    + publisher + "\t" + site + "\t" + reporter + "\t" + channel + "\t"
                    + subject + "\t" + imgsrc + "\t" + videosrc + "\n";
            System.out.println("URL: " + list.get(i));
            System.out.println("标题: " + title);
            System.out.println("正文: " + content);
            System.out.println("发布时间: " + time);
            System.out.println("发布者:" + publisher);
            System.out.println("来源站点:" + site);
            System.out.println("记者:" + reporter);
            System.out.println("分类频道:" + channel);
            System.out.println("主题:" + subject);
            System.out.println("图片链接:" + imgsrc);
            System.out.println("视频链接:" + videosrc);
            System.out.println(str);
            // try-with-resources closes the writer even when write() fails
            // (the original leaked the stream on any exception).
            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream("D://News.txt", true), "utf-8"))) {
                bw.write(str);
            } catch (final IOException e) {
                e.printStackTrace();
            }
        }
    }
}
0 0
- 利用正则表达式抽取网页信息
- Java使用正则表达式及字符串操作,抽取网页信息
- 正则表达式抽取网页内容
- 利用正则表达式抽取句子
- 用正则表达式对网页进行有效内容抽取
- 正则表达式 抽取网页中的e-mail地址
- 用正则表达式对网页进行有效内容抽取
- 利用视觉模型对网页有效信息的抽取
- 抽取网页信息
- Java抽取网页信息
- 网页信息抽取阶段性成果
- Java中正则表达式、模式匹配与信息抽取
- Python中文文本信息抽取中常见的正则表达式
- ObjC利用正则表达式抓取网页内容
- UCI网页信息抽取技术
- 抓取网页信息,并用正则表达式分析后得到信息。
- 新浪爬虫微博个人地址公司等信息抽取正则表达式
- C#使用正则表达式提取网页中的信息数据
- iOS库 .a与.framework区别
- android的json解析
- Html的label的for属性
- 还是畅通工程
- 计算自己活了多少天 SimpleDateFormat Date getTime()
- 利用正则表达式抽取网页信息
- LeetCode-20 Valid Parentheses
- LINQ体验(13)——LINQ to SQL语句之运算符转换和ADO.NET与LINQ to SQL
- 查税( 斜率优化&单调队列维护凸包 &分块 )
- 使用Eclipse构建Maven项目
- Android使用 LruCache 缓存图片
- vs2015/MFC静态文本控件
- LINQ体验(12)——LINQ to SQL语句之对象标识和对象加载
- 第25章:Spinner的用法