一个Swing程序,用来判断一个URL页面内包含的好链接和坏链接数目

来源:互联网 发布:移动数据流量套餐退订 编辑:程序博客网 时间:2024/06/04 17:54

入口类

import java.awt.Dimension;import java.awt.Insets;import java.awt.event.ActionEvent;import java.awt.event.ActionListener;import java.io.IOException;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import javax.swing.JButton;import javax.swing.JFrame;import javax.swing.JLabel;import javax.swing.JMenuBar;import javax.swing.JScrollPane;import javax.swing.JTextArea;import javax.swing.JTextField;import javax.swing.ScrollPaneConstants;import javax.swing.SwingUtilities;/** * Description 检查URL是否是合法的URL,入口类,直接运行该类,将需要分析的URL地址粘入文本框即可 *  * @author wangxu *  */public class CheckLinks extends JFrame implements Runnable, ISpiderReportable {// Used by addNotifyboolean frameSizeAdjusted = false;JLabel label1 = new JLabel();JButton begin = new JButton();JTextField url = new JTextField();JScrollPane errorScroll = new JScrollPane();JTextArea errors = new JTextArea();JLabel current = new JLabel();JLabel goodLinksLabel = new JLabel();JLabel badLinksLabel = new JLabel();protected Thread backgroundThread;protected Spider spider;protected URL base;protected int badLinksCount = 0;protected int goodLinksCount = 0;private static final long serialVersionUID = 1L;public CheckLinks() {setTitle("Find Broken Links");// 设置JFrame的标题getContentPane().setLayout(null);// 设置布局方式setSize(405, 288);setVisible(true);label1.setText("Enter a URL:");getContentPane().add(label1);label1.setBounds(12, 12, 84, 12);begin.setText("Begin");begin.setActionCommand("Begin");getContentPane().add(begin);begin.setBounds(12, 36, 84, 24);// 设置坐标和宽、高getContentPane().add(url);url.setBounds(108, 36, 288, 24);errorScroll.setAutoscrolls(true);// 自动显示滚动条errorScroll.setHorizontalScrollBarPolicy(ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);// 水平方向始终显示errorScroll.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);// 垂直方向始终显示errorScroll.setOpaque(true);// 设置不透明getContentPane().add(errorScroll);errorScroll.setBounds(12, 120, 384, 156);errors.setEditable(false);// 
设置不可编辑errorScroll.getViewport().add(errors);// 将文本域添加进滚动条errors.setBounds(0, 0, 366, 138);current.setText("Currently Processing: ");getContentPane().add(current);// 加入显示当前信息的JLabelcurrent.setBounds(12, 72, 384, 12);goodLinksLabel.setText("Good Links: 0");getContentPane().add(goodLinksLabel);goodLinksLabel.setBounds(12, 96, 192, 12);badLinksLabel.setText("Bad Links: 0");getContentPane().add(badLinksLabel);badLinksLabel.setBounds(216, 96, 96, 12);SymAction lSymAction = new SymAction();// 实例化一个事件监听器begin.addActionListener(lSymAction);// 注册监听}static public void main(String args[]) {new CheckLinks();// 程序入口}public void addNotify() {// Record the size of the window prior to calling parent's addNotify.Dimension size = getSize();super.addNotify();if (frameSizeAdjusted)return;frameSizeAdjusted = true;// Adjust size of frame according to the insets and menu barInsets insets = getInsets();JMenuBar menuBar = getRootPane().getJMenuBar();int menuBarHeight = 0;if (menuBar != null)menuBarHeight = menuBar.getPreferredSize().height;setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight);}class SymAction implements ActionListener {public void actionPerformed(ActionEvent event) {Object object = event.getSource();if (object == begin)begin_actionPerformed(event);}}void begin_actionPerformed(ActionEvent event) {if (backgroundThread == null) {begin.setText("Cancel");backgroundThread = new Thread(this);// 用当前对象来实例化一个Thread对象backgroundThread.start();// 启动线程,执行run方法goodLinksCount = 0;badLinksCount = 0;} else {spider.cancel();// 设置标志位true}}@Overridepublic void run() {try {errors.setText("");spider = new Spider(this);// 用当前对象来实例化一个Spider对象,因为当前类实现了ISpiderReportable接口spider.clear();base = new URL(url.getText());// 取得需要搜索的URL地址spider.addURL(base);//将URL地址加入spiderspider.begin();//spider开始工作Runnable doLater = new Runnable() {public void run() {begin.setText("Begin");}};// 导致 doRun.run() 在 AWT 事件指派线程上异步执行。在所有挂起的 AWT// 事件被处理后才发生。此方法应该在应用程序线程需要更新该 
GUI时使用。在下面的示例中,invokeLater// 调用将事件指派线程上的 Runnable对象 doHelloWorld加入队列,然后输出一条信息。SwingUtilities.invokeLater(doLater);backgroundThread = null;// 将后台线程重新置空,以便接受下一个URL} catch (MalformedURLException e) {UpdateErrors err = new UpdateErrors();err.msg = "Bad address.";SwingUtilities.invokeLater(err);}}//检测两个URL地址是否属于同一主机,如果是返回true,否则false@Overridepublic boolean spiderFoundURL(URL base, URL url) {UpdateCurrentStats cs = new UpdateCurrentStats();cs.msg = url.toString();//将URL信息赋值给cs.msg,使用后台线程进行打印SwingUtilities.invokeLater(cs);if (!checkLink(url)) {UpdateErrors err = new UpdateErrors();err.msg = url + "(on page " + base + ")\n";SwingUtilities.invokeLater(err);badLinksCount++;return false;}goodLinksCount++;if (!url.getHost().equalsIgnoreCase(base.getHost()))return false;elsereturn true;}@Overridepublic void spiderURLError(URL url) {System.out.println("没找到的URL:" + url);}protected boolean checkLink(URL url) {try {URLConnection connection = url.openConnection();connection.connect();return true;} catch (IOException e) {return false;}}public void spiderFoundEMail(String email) {System.out.println("获得Email:" + email);}class UpdateErrors implements Runnable {public String msg;public void run() {errors.append(msg);}}class UpdateCurrentStats implements Runnable {public String msg;public void run() {current.setText("Currently Processing: " + msg);goodLinksLabel.setText("Good Links: " + goodLinksCount);badLinksLabel.setText("Bad Links: " + badLinksCount);}}}
import javax.swing.text.html.*;

/**
 * Thin {@link HTMLEditorKit} subclass whose only job is to make the kit's
 * HTML parser obtainable from other classes by re-declaring
 * {@code getParser()} as {@code public}.
 *
 * <p>Background: Swing's {@code JEditorPane} supports different content types
 * through pluggable {@code EditorKit}s, and {@code HTMLEditorKit} is the one
 * shipped for HTML. It targets HTML 3.2 with some extensions and is moving
 * towards 4.0; the {@code <applet>} tag is not supported, while the
 * {@code <object>} tag has partial support.
 *
 * @author wangxu
 */
public class HTMLParse extends HTMLEditorKit {

    private static final long serialVersionUID = 1L;

    /**
     * Returns the parser supplied by {@link HTMLEditorKit}.
     *
     * @return the parser obtained from the superclass
     */
    @Override
    public HTMLEditorKit.Parser getParser() {
        HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
import java.net.*;

/**
 * Callback interface through which a {@code Spider} reports what it finds
 * while crawling.
 */
public interface ISpiderReportable {

    /**
     * Called for every link found in a page.
     *
     * @param base the page the link was found on
     * @param url the link target, resolved against {@code base}
     * @return {@code true} if the spider should add {@code url} to its
     *         workload, {@code false} to leave it out
     */
    public boolean spiderFoundURL(URL base, URL url);

    /**
     * Called when reading a page failed with an I/O error.
     *
     * @param url the page that could not be processed
     */
    public void spiderURLError(URL url);

    /**
     * Called for every link whose target starts with {@code mailto:}.
     *
     * @param email the link target as written in the page (any
     *        {@code #fragment} already removed)
     */
    public void spiderFoundEMail(String email);
}
import java.util.*;import java.net.*;import java.io.*;import javax.swing.text.*;import javax.swing.text.html.*;public class Spider {// 装载错误的工作集protected Collection workloadError = new ArrayList(3);// 等待工作集protected Collection workloadWaiting = new ArrayList(3);// 已处理的工作集protected Collection workloadProcessed = new ArrayList(3);protected ISpiderReportable report;protected boolean cancel = false;public Spider(ISpiderReportable report) {this.report = report;}public Collection getWorkloadError() {return workloadError;}public Collection getWorkloadWaiting() {return workloadWaiting;}public Collection getWorkloadProcessed() {return workloadProcessed;}public void clear() {getWorkloadError().clear();getWorkloadWaiting().clear();getWorkloadProcessed().clear();}public void cancel() {cancel = true;}public void addURL(URL url) {if (getWorkloadWaiting().contains(url))// 如果等待的工作集中已经包含该URL,返回return;if (getWorkloadError().contains(url))// 如果出错的工作集中已经包含该URL,返回return;if (getWorkloadProcessed().contains(url))// 如果已处理的工作集中包含该URL,返回return;log("Adding to workload: " + url);getWorkloadWaiting().add(url);// 将其加入等待的工作集中}// 具体分析URL的方法public void processURL(URL url) {try {log("Processing: " + url);// 控制台打印处理的URL地址// get the URL's contentsURLConnection connection = url.openConnection();System.out.println(connection.getContentType() + "++++++++++++++++====");if ((connection.getContentType() != null) && !connection.getContentType().toLowerCase().startsWith("text/")) {getWorkloadWaiting().remove(url);getWorkloadProcessed().add(url);log("Not processing because content type is: " + connection.getContentType());return;}// read the URLInputStream is = connection.getInputStream();Reader r = new InputStreamReader(is);// parse the URLHTMLEditorKit.Parser parse = new HTMLParse().getParser();// Parse the given stream and drive the given callback with the// results of the parse. 
This method should be implemented to be// thread-safe.// 解析给定的流并通过解析的结果驱动给定的回调。该方法执行完之后,会调用给定的回调函数parse.parse(r, new Parser(url), true);} catch (IOException e) {// 如果出错getWorkloadWaiting().remove(url);// 从工作集中移除URLgetWorkloadError().add(url);// 将出错的URL加入错误的工作集log("Error: " + url);report.spiderURLError(url);// 报告该出错的URLreturn;}// mark URL as completegetWorkloadWaiting().remove(url);getWorkloadProcessed().add(url);log("Complete: " + url);}// 蜘蛛工作的方法,只要等待工作集不为空,并且标志位为false,那么一直从集合中取出URLpublic void begin() {cancel = false;while (!getWorkloadWaiting().isEmpty() && !cancel) {Object list[] = getWorkloadWaiting().toArray();for (int i = 0; (i < list.length) && !cancel; i++)processURL((URL) list[i]);// 调用分析URL的方法}}protected class Parser extends HTMLEditorKit.ParserCallback {protected URL base;public Parser(URL base) {this.base = base;}public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet mutableAttributeSet, int pos) {String href = (String) mutableAttributeSet.getAttribute(HTML.Attribute.HREF);// 获取href链接if ((href == null) && (tag == HTML.Tag.FRAME))href = (String) mutableAttributeSet.getAttribute(HTML.Attribute.SRC);if (href == null)return;int i = href.indexOf('#');if (i != -1)href = href.substring(0, i);// 开始截取到'#'字符if (href.toLowerCase().startsWith("mailto:")) {// 如果是邮件链接report.spiderFoundEMail(href);return;}if (tag == HTML.Tag.META) {String title = (String) mutableAttributeSet.getAttribute(HTML.Attribute.NAME);System.out.println("title:" + title);}// 处理新得到的链接handleLink(base, href);}public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {handleSimpleTag(t, a, pos); // handle the same way}// 处理链接的函数protected void handleLink(URL base, String str) {try {URL url = new URL(base, str);// 判断,如果属于同一主机,加入待处理工作集if (report.spiderFoundURL(base, url))addURL(url);} catch (MalformedURLException e) {log("Found malformed URL: " + str);}}}public void log(String entry) {System.out.println((new Date()) + ":" + entry);}}

0 0
原创粉丝点击