A simple HTTP PHP class to crawl a URL for internal and external URLs

来源：互联网发布：申请中文域名要多少钱编辑：程序博客网时间：2024/06/05 19:59

http://ericlondon.com/2012/01/31/a-simple-http-php-class-to-crawl-a-url-for-internal-and-external-urls.html

Here's a simple PHP class I wrote to crawl a URL and return a list of internal and external URLs. I've used it in the past, for development purposes only, to find 404s and repetition in URL structure. Note that it does not read robots.txt files or obey any similar rules. I just thought I'd pull it out of the archives and share it on the web.

#!/usr/bin/php
<?php

/**
 * Crawls a website starting from one URL and collects the internal URLs it
 * visited and the external URLs it saw linked.
 *
 * The whole crawl runs inside the constructor; call output_all_urls()
 * afterwards to print the result. The crawler does not read robots.txt and
 * applies no rate limiting, so it is meant for development use only.
 */
class Crawl {

  /** Regular expression capturing the href value of anchor tags. */
  protected $regex_link;

  /** The start URL exactly as passed to the constructor. */
  protected $website_url;

  /** Last two host labels of the start URL (e.g. "example.com"). */
  protected $website_url_base;

  /** Scheme, host and optional port of the start URL (e.g. "https://www.example.com"). */
  protected $website_url_origin;

  /** Internal URLs that have already been fetched (or attempted). */
  protected $urls_processed;

  /** URLs whose base differs from the start URL's base. */
  protected $urls_external;

  /** Queue of internal URLs still waiting to be fetched. */
  protected $urls_not_processed;

  /** Substrings; any URL containing one of them is skipped entirely. */
  protected $urls_ignored;

  /**
   * Sets up the crawler and immediately crawls the given website.
   *
   * If the URL is not a string starting with http:// or https://, or its host
   * cannot be determined, nothing is crawled and the URL lists stay empty.
   * (A constructor cannot report failure through its return value.)
   *
   * @param string|null $website_url
   *   Absolute http:// or https:// URL to start crawling from.
   */
  public function __construct($website_url = NULL) {
    // Initialise all state first so output_all_urls() is safe to call even
    // when the argument turns out to be invalid.
    $this->regex_link = "/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]/isU";
    $this->urls_processed = array();
    $this->urls_external = array();
    $this->urls_not_processed = array();
    $this->urls_ignored = array(
      '/search/apachesolr_search/',
      '/comment/reply/',
    );

    if (!$this->validate_arg_website_url($website_url)) {
      return;
    }
    $this->website_url = $website_url;

    $url_base = $this->get_url_base($this->website_url);
    if (!$url_base) {
      return;
    }
    $this->website_url_base = $url_base;
    $this->website_url_origin = $this->get_url_origin($this->website_url);

    // Seed the queue and keep fetching until no unvisited internal URL is left.
    $this->urls_not_processed[] = $this->website_url;
    while (count($this->urls_not_processed)) {
      $this->process_urls_not_processed();
    }

    sort($this->urls_processed);
    sort($this->urls_external);
  }

  /**
   * Checks that the start URL is a string beginning with http:// or https://.
   *
   * @param mixed $website_url
   *   Value to validate.
   *
   * @return bool
   *   TRUE when the value is acceptable, FALSE otherwise.
   */
  protected function validate_arg_website_url($website_url = NULL) {
    if (!is_string($website_url)) {
      return FALSE;
    }
    return strpos($website_url, 'http://') === 0 || strpos($website_url, 'https://') === 0;
  }

  /**
   * Returns the last two labels of a URL's host, lower-cased.
   *
   * "http://www.example.com/a" yields "example.com". A single-label host such
   * as "localhost" is returned unchanged. Only the last two labels are kept,
   * so multi-part suffixes (e.g. "example.co.uk") collapse to "co.uk".
   *
   * @param string|null $url
   *   URL to inspect.
   *
   * @return string|false
   *   The base, or FALSE when the URL is empty, unparsable or has no host.
   */
  protected function get_url_base($url = NULL) {
    if (!is_string($url) || $url === '') {
      return FALSE;
    }

    $url_parts = parse_url($url);
    // parse_url() omits the 'host' key for URLs such as "mailto:x" or "/path".
    if (!is_array($url_parts) || empty($url_parts['host'])) {
      return FALSE;
    }

    // Host names are case-insensitive; normalise so comparisons are reliable.
    $labels = explode('.', strtolower($url_parts['host']));
    $label_count = count($labels);
    if ($label_count < 2) {
      return $labels[0];
    }

    return $labels[$label_count - 2] . '.' . $labels[$label_count - 1];
  }

  /**
   * Returns "scheme://host[:port]" for a URL.
   *
   * @param string $url
   *   Absolute URL.
   *
   * @return string|false
   *   The origin, or FALSE when scheme or host cannot be determined.
   */
  protected function get_url_origin($url) {
    $url_parts = is_string($url) ? parse_url($url) : FALSE;
    if (!is_array($url_parts) || empty($url_parts['scheme']) || empty($url_parts['host'])) {
      return FALSE;
    }

    $origin = $url_parts['scheme'] . '://' . $url_parts['host'];
    if (isset($url_parts['port'])) {
      $origin .= ':' . $url_parts['port'];
    }
    return $origin;
  }

  /**
   * Fetches a URL and extracts the href values of its anchor tags.
   *
   * The URL is recorded as processed before it is fetched, so a failing URL
   * is never retried. Warnings raised by file_get_contents() are not
   * suppressed.
   *
   * @param string $url
   *   URL to fetch.
   *
   * @return array|false
   *   Unique href values found (possibly empty), or FALSE when the URL is
   *   invalid, was already processed, could not be loaded or returned an
   *   empty body.
   */
  protected function scan_url($url) {
    if (!is_string($url) || !$url) {
      return FALSE;
    }

    if (in_array($url, $this->urls_processed, TRUE)) {
      return FALSE;
    }
    $this->urls_processed[] = $url;

    // file_get_contents() signals failure by returning FALSE; the old
    // track_errors/$php_errormsg mechanism no longer exists in PHP 8.
    $page_contents = file_get_contents($url);
    if ($page_contents === FALSE || !strlen($page_contents)) {
      return FALSE;
    }

    if (preg_match_all($this->regex_link, $page_contents, $matches) === FALSE || !isset($matches[1])) {
      return FALSE;
    }

    return array_unique($matches[1]);
  }

  /**
   * Classifies extracted href values as ignored, external or to-be-crawled.
   *
   * Root-relative and relative links are resolved against the origin of the
   * start URL (relative links are resolved against the site root, not against
   * the page they were found on). Protocol-relative links inherit the start
   * URL's scheme. Fragment-only links and links with a scheme other than
   * http/https are skipped. One trailing slash is removed from every URL.
   *
   * @param array|null $matches
   *   Raw href values as returned by scan_url().
   *
   * @return bool
   *   FALSE when $matches is not a non-empty array, TRUE otherwise.
   */
  protected function process_matches($matches = NULL) {
    if (!is_array($matches) || empty($matches)) {
      return FALSE;
    }

    // Fall back to the historic "http://" + base form when no origin is set.
    $origin = $this->website_url_origin
      ? $this->website_url_origin
      : 'http://' . $this->website_url_base;
    $scheme = substr($origin, 0, strpos($origin, '://'));

    foreach ($matches as $match) {
      if (!is_string($match) || empty($match)) {
        continue;
      }

      // In-page anchors never lead to a new document.
      if ($match[0] === '#') {
        continue;
      }

      if (substr($match, 0, 2) === '//') {
        // Protocol-relative URL: names its own host, borrows our scheme.
        $match = $scheme . ':' . $match;
      }
      elseif ($match[0] === '/') {
        // Root-relative URL.
        $match = $origin . $match;
      }
      elseif (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $match)) {
        // Explicit scheme: only http(s) can be crawled; javascript:, mailto:,
        // tel:, ftp: and friends are skipped.
        if (!preg_match('#^https?://#i', $match)) {
          continue;
        }
      }
      else {
        // Plain relative path.
        $match = $origin . '/' . $match;
      }

      // Remove a single trailing slash so "/a" and "/a/" are the same URL.
      if (substr($match, -1) === '/') {
        $match = substr($match, 0, -1);
      }

      foreach ($this->urls_ignored as $ignored) {
        if (stripos($match, $ignored) !== FALSE) {
          continue 2;
        }
      }

      // Anything whose base differs (or cannot be determined) is external.
      if ($this->get_url_base($match) !== $this->website_url_base) {
        if (!in_array($match, $this->urls_external, TRUE)) {
          $this->urls_external[] = $match;
        }
        continue;
      }

      if (in_array($match, $this->urls_processed, TRUE)) {
        continue;
      }

      if (!in_array($match, $this->urls_not_processed, TRUE)) {
        $this->urls_not_processed[] = $match;
      }
    }

    return TRUE;
  }

  /**
   * Takes the next URL off the queue, fetches it and queues what it links to.
   *
   * @return bool
   *   FALSE when the queue was empty or the page yielded no links,
   *   TRUE otherwise.
   */
  protected function process_urls_not_processed() {
    if (empty($this->urls_not_processed)) {
      return FALSE;
    }

    $url = array_shift($this->urls_not_processed);

    $matches = $this->scan_url($url);
    if (!is_array($matches) || empty($matches)) {
      return FALSE;
    }

    $this->process_matches($matches);
    return TRUE;
  }

  /**
   * Prints the processed internal URLs followed by the external URLs,
   * one per line, each list under its own heading.
   */
  public function output_all_urls() {
    echo "===== INTERNAL URLS =====\n";
    foreach ($this->urls_processed as $url) {
      print $url . "\n";
    }

    echo "===== EXTERNAL URLS =====\n";
    foreach ($this->urls_external as $url) {
      print $url . "\n";
    }
  }

}
?>

It can be used as follows:

<?php
// Crawl the site starting at its home page (the crawl runs inside the
// constructor), then print the internal and external URLs that were found.
$crawler = new Crawl('http://www.example.com');
$crawler->output_all_urls();
?>

0 0