SearchCrawler (Web Crawler) Source Code


import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;

//A Web crawler (note: "crawling" here is used in the same sense as fetching or capturing pages)
public class SearchCrawler extends JFrame{
 //Max URLs drop-down values
 private static final String[] MAX_URLS={"50","100","500","1000"};
 
 //Cache of robots.txt disallow lists
 private HashMap disallowListCache=new HashMap();
 
 //Search GUI controls
 private JTextField startTextField;
 private JComboBox maxComboBox;
 private JCheckBox limitCheckBox;
 private JTextField logTextField;
 private JTextField searchTextField;
 private JCheckBox caseCheckBox;
 private JButton searchButton;
 
 //Search stats GUI controls
 private JLabel crawlingLabel2;
 private JLabel crawledLabel2;
 private JLabel toCrawlLabel2;
 private JProgressBar progressBar;
 private JLabel matchesLabel2;
 
 //Table listing search matches
 private JTable table;
 
 //Flag indicating whether crawling is underway
 private boolean crawling;
 
 //Writer for the matches log file
 private PrintWriter logFileWriter;
 
 //Constructor for the Search Crawler
 public SearchCrawler(){
  //Set application title bar text
  setTitle("Search Crawler");
  //Set window size
  setSize(600,600);
  
  //Handle window closing events
  addWindowListener(new WindowAdapter(){
   public void windowClosing(WindowEvent e){
    actionExit();
   }
  });
  
  //Set up the File menu
  JMenuBar menuBar=new JMenuBar();
  JMenu fileMenu=new JMenu("File");
  fileMenu.setMnemonic(KeyEvent.VK_F);
  JMenuItem fileExitMenuItem=new JMenuItem("Exit",KeyEvent.VK_X);
  fileExitMenuItem.addActionListener(new ActionListener(){
   public void actionPerformed(ActionEvent e){
    actionExit();
   }
  });
  fileMenu.add(fileExitMenuItem);
  menuBar.add(fileMenu);
  setJMenuBar(menuBar);
  
  //Set up the search panel
  JPanel searchPanel=new JPanel();
  GridBagConstraints constraints;
  GridBagLayout layout=new GridBagLayout();
  searchPanel.setLayout(layout);
  
  JLabel startLabel=new JLabel("Start URL:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(startLabel,constraints);
  searchPanel.add(startLabel);
  
  startTextField=new JTextField();
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,0,5);
  layout.setConstraints(startTextField,constraints);
  searchPanel.add(startTextField);
  
  JLabel maxLabel=new JLabel("Max URLs to Crawl (0 means unlimited):");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(maxLabel,constraints);
  searchPanel.add(maxLabel);
  
  
  maxComboBox=new JComboBox(MAX_URLS);
  maxComboBox.setEditable(true);
  constraints=new GridBagConstraints();
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(maxComboBox,constraints);
  searchPanel.add(maxComboBox);
  
  limitCheckBox=new JCheckBox("Limit crawling to Start URL site");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.WEST;
  constraints.insets=new Insets(0,10,0,0);
  layout.setConstraints(limitCheckBox,constraints);
  searchPanel.add(limitCheckBox);
  
  JLabel blankLabel=new JLabel();
  constraints=new GridBagConstraints();
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  layout.setConstraints(blankLabel,constraints);
  searchPanel.add(blankLabel);
  
  JLabel logLabel=new JLabel("Matches Log File:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(logLabel,constraints);
  searchPanel.add(logLabel);
  
  String file=System.getProperty("user.dir")+
     System.getProperty("file.separator")+
     "crawler.log";
  logTextField=new JTextField(file);
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,0,5);
  layout.setConstraints(logTextField,constraints);
  searchPanel.add(logTextField);
  
  JLabel searchLabel=new JLabel("Search String:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(searchLabel,constraints);
  searchPanel.add(searchLabel);
  
  searchTextField=new JTextField();
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.insets=new Insets(5,5,0,0);
  constraints.gridwidth=2;
  constraints.weightx=1.0d;
  layout.setConstraints(searchTextField,constraints);
  searchPanel.add(searchTextField);
  
  caseCheckBox=new JCheckBox("Case Sensitive");
  constraints=new GridBagConstraints();
  constraints.insets=new Insets(5,5,0,5);
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  layout.setConstraints(caseCheckBox,constraints);
  searchPanel.add(caseCheckBox);
  
  searchButton=new JButton("Search");
  searchButton.addActionListener(new ActionListener(){
   public void actionPerformed(ActionEvent e){
    actionSearch();
   }
  });
  constraints=new GridBagConstraints();
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,5,5);
  layout.setConstraints(searchButton,constraints);
  searchPanel.add(searchButton);
  
  JSeparator separator=new JSeparator();
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,5,5);
  layout.setConstraints(separator,constraints);
  searchPanel.add(separator);
  
  JLabel crawlingLabel1=new JLabel("Crawling:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(crawlingLabel1,constraints);
  searchPanel.add(crawlingLabel1);
  
  crawlingLabel2=new JLabel();
  crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,0,5);
  layout.setConstraints(crawlingLabel2,constraints);
  searchPanel.add(crawlingLabel2);
  
  
  JLabel crawledLabel1=new JLabel("Crawled URLs:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(crawledLabel1,constraints);
  searchPanel.add(crawledLabel1);
  
  crawledLabel2=new JLabel();
  crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,0,5);
  layout.setConstraints(crawledLabel2,constraints);
  searchPanel.add(crawledLabel2);
  
  JLabel toCrawlLabel1=new JLabel("URLs to Crawl:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(toCrawlLabel1,constraints);
  searchPanel.add(toCrawlLabel1);
  
  toCrawlLabel2=new JLabel();
  toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,0,5);
  layout.setConstraints(toCrawlLabel2,constraints);
  searchPanel.add(toCrawlLabel2);
  
  JLabel progressLabel=new JLabel("Crawling Progress:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,0,0);
  layout.setConstraints(progressLabel,constraints);
  searchPanel.add(progressLabel);
  
  progressBar=new JProgressBar();
  progressBar.setMinimum(0);
  progressBar.setStringPainted(true);
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,0,5);
  layout.setConstraints(progressBar,constraints);
  searchPanel.add(progressBar);
  
  JLabel matchesLabel1=new JLabel("Search Matches:");
  constraints=new GridBagConstraints();
  constraints.anchor=GridBagConstraints.EAST;
  constraints.insets=new Insets(5,5,10,0);
  layout.setConstraints(matchesLabel1,constraints);
  searchPanel.add(matchesLabel1);
  
  matchesLabel2=new JLabel();
  matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
  constraints=new GridBagConstraints();
  constraints.fill=GridBagConstraints.HORIZONTAL;
  constraints.gridwidth=GridBagConstraints.REMAINDER;
  constraints.insets=new Insets(5,5,10,5);
  layout.setConstraints(matchesLabel2,constraints);
  searchPanel.add(matchesLabel2);
  
  //Set up the matches table
  table=new JTable(new DefaultTableModel(new Object[][]{},new String[]{"URL"}){
   public boolean isCellEditable(int row,int column){
    return false;
   }
  });
  
  //Set up the matches panel
  JPanel matchesPanel=new JPanel();
  matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
  matchesPanel.setLayout(new BorderLayout());
  matchesPanel.add(new JScrollPane(table),BorderLayout.CENTER);
  
  //Add the panels to the frame
  getContentPane().setLayout(new BorderLayout());
  getContentPane().add(searchPanel,BorderLayout.NORTH);
  getContentPane().add(matchesPanel,BorderLayout.CENTER);
 }
 
 //Handle the Search/Stop button being clicked
 private void actionSearch(){
  //If the Stop button was clicked, turn off the crawling flag
  if(crawling){
   crawling=false;
   return;
  }
  
  ArrayList errorList=new ArrayList();
  
  //Validate that a start URL has been entered
  String startUrl=startTextField.getText().trim();
  if(startUrl.length()<1){
   errorList.add("Missing Start URL.");
  }else if(verifyUrl(startUrl)==null){//Verify the start URL
   errorList.add("Invalid Start URL.");
  }
  
  //Validate that Max URLs is either empty or a number
  int maxUrls=0;
  String max=((String)maxComboBox.getSelectedItem()).trim();
  if(max.length()>0){
   try{
    maxUrls=Integer.parseInt(max);
   }catch(NumberFormatException e){
   }
   
   if(maxUrls<1){
     errorList.add("Invalid Max URLs value.");
   }
  }
  
  //Validate that a matches log file has been entered
  String logFile=logTextField.getText().trim();
  if(logFile.length()<1){
   errorList.add("Missing Matches Log File.");
  }
  
  //Validate that a search string has been entered
  String searchString=searchTextField.getText().trim();
  if(searchString.length()<1){
   errorList.add("Missing Search String.");
  }
  
  //If there are errors, show them and return
  if(errorList.size()>0){
   StringBuffer message=new StringBuffer();
   
   //Concatenate all the errors into one string
   for(int i=0;i<errorList.size();i++){
    message.append(errorList.get(i));
    if(i+1<errorList.size()){
      message.append("\n");
    }
   }
   
   showError(message.toString());
   return;
  }
  
  //Remove "www" from the start URL
  startUrl=removeWwwFromUrl(startUrl);
  
  //Start the Search Crawler
  search(logFile,startUrl,maxUrls,searchString);
 }
 
 private void search(final String logFile,final String startUrl,
      final int maxUrls,final String searchString){
   //Start the search in a new thread
  Thread thread=new Thread(new Runnable(){
   public void run(){
     //Show the hourglass cursor while the search is underway
    setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
    
     //Disable the search controls
    startTextField.setEnabled(false);
    maxComboBox.setEnabled(false);
    limitCheckBox.setEnabled(false);
    logTextField.setEnabled(false);
    searchTextField.setEnabled(false);
    caseCheckBox.setEnabled(false);
    
     //Change the Search button to "Stop"
     searchButton.setText("Stop");
    
     //Reset the stats
    table.setModel(new DefaultTableModel(new Object[][]{},new String[]{"URL"}){
     public boolean isCellEditable(int row,int column){
      return false;
     }
    });
    updateStats(startUrl,0,0,maxUrls);
    
     //Open the matches log file
    try{
     logFileWriter=new PrintWriter(new FileWriter(logFile));
    }catch(Exception e){
      showError("Unable to open matches log file.");
     return;
    }
    
     //Turn the crawling flag on
     crawling=true;
    
     //Perform the actual crawling
    crawl(startUrl,maxUrls,limitCheckBox.isSelected(),searchString,caseCheckBox.isSelected());
    
     //Turn the crawling flag off
     crawling=false;
    
     //Close the matches log file
    try{
     logFileWriter.close();
    }catch(Exception e){
      showError("Unable to close matches log file.");
    }
    
     //Mark the search as done
     crawlingLabel2.setText("Done");
    
     //Re-enable the search controls
    startTextField.setEnabled(true);
    maxComboBox.setEnabled(true);
    limitCheckBox.setEnabled(true);
    logTextField.setEnabled(true);
    searchTextField.setEnabled(true);
    caseCheckBox.setEnabled(true);
    
     //Switch the Search button back to "Search"
     searchButton.setText("Search");
    
     //Return to the default cursor
    setCursor(Cursor.getDefaultCursor());
    
     //Show a message if the search string was not found
     if(table.getRowCount()==0){
      JOptionPane.showMessageDialog(SearchCrawler.this,"Your search string was not found. Please try another.","Search String Not Found",JOptionPane.WARNING_MESSAGE);
    }
   }
  });
  thread.start();
 }
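 //Note (added comment): the worker thread above reads and updates Swing components directly; strictly,
 //Swing updates belong on the event dispatch thread (for example via SwingUtilities.invokeLater), but this
 //listing keeps the simpler single-worker-thread pattern of the original code.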
 
 
 //Exit this program
 private void actionExit(){
  System.exit(0);
 }
 
 //Verify the URL format
 private URL verifyUrl(String url){
  //Only allow HTTP URLs
  if(!url.toLowerCase().startsWith("http://")){
   return null;
  }
  
  //Verify the format of the URL
  URL verifiedUrl=null;
  try{
   verifiedUrl=new URL(url);
  }catch(Exception e){
   return null;
  }
  
  return verifiedUrl;
 }
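 //Illustration (hypothetical values, not part of the original code): verifyUrl("http://example.com/index.html")
 //returns a URL object, while verifyUrl("ftp://example.com/") and verifyUrl("not a url") both return null,
 //since only well-formed HTTP URLs are accepted.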
 
 //Add a match to the matches table and the log file
 private void addMatch(String url){
  //Add the URL to the matches table
  DefaultTableModel model=(DefaultTableModel)table.getModel();
  model.addRow(new Object[]{url});
  
  //Add the URL to the log file
  try{
   logFileWriter.println(url);
  }catch(Exception e){
   showError("Unable to log match.");
  }
 }
 
 //Update the crawling stats
 private void updateStats(String crawling,int crawled,int toCrawl,int maxUrls){
  crawlingLabel2.setText(crawling);
  crawledLabel2.setText(""+crawled);
  toCrawlLabel2.setText(""+toCrawl);
  
  //Update the progress bar (no Max URLs value means the maximum grows with the crawl)
  if(maxUrls<1){
   progressBar.setMaximum(crawled+toCrawl);
  }else{
   progressBar.setMaximum(maxUrls);
  }
  progressBar.setValue(crawled);
  
  matchesLabel2.setText(""+table.getRowCount());
 }
 
 
 //Check if the robot is allowed to access the given URL
 private boolean isRobotAllowed(URL urlToCheck){
  String host=urlToCheck.getHost().toLowerCase();
  
  //Retrieve the host's disallow list from the cache
  ArrayList disallowList=(ArrayList)disallowListCache.get(host);
  
  //If the list is not in the cache, download it and add it to the cache
  if(disallowList==null){
   disallowList=new ArrayList();
   
   try{
    URL robotsFileUrl=new URL("http://"+host+"/robots.txt");
    
     //Open and read the robots.txt file
    BufferedReader reader=new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
    
     //Read the robots.txt file, building the list of disallowed paths
    String line;
    while((line=reader.readLine())!=null){
     if(line.indexOf("Disallow:")==0){
      String disallowPath=line.substring("Disallow:".length());
      
      //Strip any comment from the disallowed path, if present
      int commentIndex=disallowPath.indexOf("#");
      if(commentIndex!=-1){
       disallowPath=disallowPath.substring(0,commentIndex);
      }
      
      //Trim whitespace around the disallowed path
      disallowPath=disallowPath.trim();
      
      //Add the disallowed path to the list
      disallowList.add(disallowPath);
     }
    }
     
     //Add the new disallow list to the cache
     disallowListCache.put(host,disallowList);
    }catch(Exception e){
     //Assume all paths are allowed to be crawled if the robots.txt file does not exist
     return true;
    }
   }
  
  //Loop through the disallow list to check whether it blocks the given URL
  String file=urlToCheck.getFile();
  for(int i=0;i<disallowList.size();i++){
   String disallow=(String)disallowList.get(i);
   if(file.startsWith(disallow)){
    return false;
   }
  }
  return true;
 }
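 //Illustration (hypothetical robots.txt, not part of the original code): if http://example.com/robots.txt
 //contains the line "Disallow: /private", then isRobotAllowed() returns false for
 //http://example.com/private/page.html (its path starts with "/private") and true for
 //http://example.com/public.html.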
 
 //Download the page at the given URL
 private String downloadPage(URL pageUrl){
  try{
   //Open a connection to the URL for reading
   BufferedReader reader=new BufferedReader(new InputStreamReader(pageUrl.openStream()));
   
   //Read the page into a buffer
   String line;
   StringBuffer pageBuffer=new StringBuffer();
   while((line=reader.readLine())!=null){
    pageBuffer.append(line);
   }
   
   return pageBuffer.toString();
  }catch(Exception e){
  }
  
  return null;
 }
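 //Note (added comment): readLine() strips line terminators and the lines are appended with no separator,
 //so the page is returned as one long string in which text from adjacent lines runs together.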
 
 
 //Remove the leading "www" from a URL, if present
 private String removeWwwFromUrl(String url){
  int index=url.indexOf("://www");
  if(index!=-1){
   return url.substring(0,index+3)+url.substring(index+7);
  }
  return url;
 }
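 //Illustration (hypothetical URL, not part of the original code):
 //removeWwwFromUrl("http://www.example.com/index.html") returns "http://example.com/index.html";
 //a URL that does not contain "://www" is returned unchanged.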
 
 //Parse through the page contents and retrieve the links
 private ArrayList retrieveLinks(URL pageUrl,String pageContents,HashSet crawledList,boolean limitHost){
  //Compile the link matching pattern
  Pattern p=Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",Pattern.CASE_INSENSITIVE);
  Matcher m=p.matcher(pageContents);
  
  //Create a list of link matches
  ArrayList linkList=new ArrayList();
  while(m.find()){
   String link=m.group(1).trim();
   
   //Skip empty links
   if(link.length()<1){
    continue;
   }
   
   //Skip links that are just page anchors
   if(link.charAt(0)=='#'){
    continue;
   }
   
   //Skip mailto links
   if(link.indexOf("mailto:")!=-1){
    continue;
   }
   
   //Skip JavaScript links
   if(link.toLowerCase().indexOf("javascript")!=-1){
    continue;
   }
   
   //Prefix absolute and relative URLs if necessary
   if(link.indexOf("://")==-1){
    //Handle absolute (root-relative) URLs
    if(link.charAt(0)=='/'){
     link="http://"+pageUrl.getHost()+link;
    //Handle relative URLs
    }else{
     String file=pageUrl.getFile();
     if(file.indexOf('/')==-1){
      link="http://"+pageUrl.getHost()+"/"+link;
     }else{
      String path=file.substring(0,file.lastIndexOf('/')+1);
      link="http://"+pageUrl.getHost()+path+link;
     }
    }
   }
   
   //Remove anchors from the link
   int index=link.indexOf('#');
   if(index!=-1){
    link=link.substring(0,index);
   }
   
   //Remove the leading "www", if present
   link=removeWwwFromUrl(link);
   
   //Verify the link and skip it if invalid
   URL verifiedLink=verifyUrl(link);
   if(verifiedLink==null){
    continue;
   }
   
   //If limiting to the start site, skip links that point to a different host
   if(limitHost && !pageUrl.getHost().toLowerCase().equals(
       verifiedLink.getHost().toLowerCase())){
    continue;
   }
   
   //Skip the link if it has already been crawled
   if(crawledList.contains(link)){
    continue;
   }
   
   //Add the link to the list
   linkList.add(link);
  }
  
  
  return linkList;
 }
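 //Illustration (hypothetical page, not part of the original code): for a page at
 //http://example.com/dir/index.html, a root-relative href of "/about.html" becomes
 //"http://example.com/about.html" and a relative href of "page2.html" becomes
 //"http://example.com/dir/page2.html" before being verified and added to the list.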
 
 //Determine whether the search string is matched in the given page contents
 private boolean searchStringMatches(String pageContents,String searchString,boolean caseSensitive){
  String searchContents=pageContents;
  
  //If the search is case-insensitive, lowercase the page contents
  if(!caseSensitive){
   searchContents=pageContents.toLowerCase();
  }
  
  //Split the search string into individual search terms
  Pattern p=Pattern.compile("[\\s]+");
  String[] terms=p.split(searchString);
  
  //Check whether each term is matched
  for(int i=0;i<terms.length;i++){
   if(caseSensitive){
    if(searchContents.indexOf(terms[i])==-1){
     return false;
    }
   }else{
    if(searchContents.indexOf(terms[i].toLowerCase())==-1){
     return false;
    }
   }
  }
  
  return true;
 }
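 //Illustration (hypothetical values, not part of the original code): a search string of "java swing"
 //is split into the terms "java" and "swing"; the page matches only if every term occurs somewhere in
 //the page contents (with case folded when the search is case-insensitive).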
 
 //Perform the actual crawling, searching for the search string
 public void crawl(String startUrl,int maxUrls,boolean limitHost,String searchString,boolean caseSensitive){
  //Set up the crawl lists
  HashSet crawledList=new HashSet();
  LinkedHashSet toCrawlList=new LinkedHashSet();
  
  //Add the start URL to the to-crawl list
  toCrawlList.add(startUrl);
  
  //Loop through the to-crawl list, performing the actual crawling
  while(crawling && toCrawlList.size()>0){
   //If a Max URLs value was specified, check whether the limit has been reached
   if(maxUrls>0){
    if(crawledList.size()>=maxUrls){
     break;
    }
   }
   
   //Get the URL from the front of the to-crawl list
   String url=(String)toCrawlList.iterator().next();
   
   //Remove the URL from the to-crawl list
   toCrawlList.remove(url);
   
   //Convert the string URL to a URL object
   URL verifiedUrl=verifyUrl(url);
   
   //Skip the URL if robots.txt disallows access to it
   if(!isRobotAllowed(verifiedUrl)){
    continue;
   }
   
   //Update the crawling stats
   updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls);
   
   //Add the page to the crawled list
   crawledList.add(url);
   
   //Download the page at the given URL
   String pageContents=downloadPage(verifiedUrl);
   
   //If the page was downloaded successfully, retrieve all of its links and then check whether it contains the search string
   if(pageContents!=null&&pageContents.length()>0){
    //Retrieve a list of valid links from the page
    ArrayList links=retrieveLinks(verifiedUrl,pageContents,crawledList,limitHost);
    
    //Add the links to the to-crawl list
    toCrawlList.addAll(links);
    
    //Check whether the search string is present on the page; if so, record a match
    if(searchStringMatches(pageContents,searchString,caseSensitive)){
     addMatch(url);
    }
   }
   
   //Update the crawling stats
   updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls);
  }
  
 }
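 //Note (added comment): toCrawlList is a LinkedHashSet and the next URL is always taken from the front of
 //its iteration order, so the crawl proceeds breadth-first from the start URL; it stops when the list
 //empties, the Max URLs limit is reached, or the Stop button clears the crawling flag.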
 
 //Show a dialog with an error message
 private void showError(String message){
  JOptionPane.showMessageDialog(this,message,"Error",JOptionPane.ERROR_MESSAGE);
 }
 
 public static void main(String[] args){
  SearchCrawler crawler=new SearchCrawler();
  crawler.setVisible(true);
 }
}
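
A minimal usage sketch (an assumption about how the listing is run, not part of the original post): save the code above as SearchCrawler.java, then compile and launch it with a standard JDK:

 javac SearchCrawler.java
 java SearchCrawler

In the window that opens, enter a start URL (for example the hypothetical http://example.com), an optional Max URLs value, a matches log file path, and a search string, then click Search. URLs whose pages contain every search term are listed in the Matches table and appended to the log file.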