URL地址相对路径转绝对路径

来源:互联网 发布:可靠性分析软件下载 编辑:程序博客网 时间:2024/05/17 09:28

URL相对路径转绝对路径

在做爬虫或者网页解析时,经常会从网页中提取到很多相对URL。在做进一步爬取之前,我们需要先将这些相对地址转换成完整的绝对URL。URL和文件路径是相似的,转换看似很简单,实则有很多情况需要考虑(例如 `./`、`../`、只包含查询串或片段的引用等),网上提供的很多方法都会有各种各样的问题。这里提供了一个相对靠谱的方法,代码如下。

echo PhpUri::doIt("https://www.google.com/", "foo");// prints https://www.google.com/foo
/**
 * Splits a URL into its five generic components and resolves relative
 * references against it.
 *
 * Typical use: PhpUri::doIt($baseUrl, $relativeUrl).
 *
 * Components are compared against the empty string rather than with empty(),
 * so a path, query or fragment consisting of the single character "0" is kept.
 */
class PhpUri
{
    /**
     * Generic URI splitting expression (RFC 3986, Appendix B).
     * Capture groups: 2 = scheme, 4 = authority, 5 = path, 7 = query, 9 = fragment.
     * The "s" modifier lets the final ".*" span line breaks, so the pattern
     * matches every input string.
     */
    const URI_PATTERN = '/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/s';

    public $scheme;
    public $authority;
    public $path;
    public $query;
    public $fragment;

    /**
     * Resolve a relative URL against a base URL.
     *
     * @param string $baseUrl     absolute URL the reference was found in
     * @param string $relativeUrl reference to resolve (may already be absolute)
     * @return string the resolved URL
     */
    public static function doIt($baseUrl, $relativeUrl)
    {
        return self::parse($baseUrl)->join($relativeUrl);
    }

    /**
     * Split $url into scheme, authority, path, query and fragment.
     * Components that are not present are stored as empty strings.
     *
     * @param string $url
     */
    public function __construct($url)
    {
        $matches = array();
        preg_match(self::URI_PATTERN, (string) $url, $matches);

        $this->scheme    = self::captured($matches, 2);
        $this->authority = self::captured($matches, 4);
        $this->path      = self::captured($matches, 5);
        $this->query     = self::captured($matches, 7);
        $this->fragment  = self::captured($matches, 9);
    }

    /**
     * Parse a base URL; a missing path is replaced by "/".
     *
     * @param string $url
     * @return PhpUri
     */
    public static function parse($url)
    {
        $uri = new self($url);
        if (self::isBlank($uri->path)) {
            $uri->path = '/';
        }

        return $uri;
    }

    /**
     * Resolve $relative against this URI and return the result as a string.
     *
     * @param string $relative
     * @return string
     */
    public function join($relative)
    {
        $uri = new self($relative);

        // A reference carrying its own scheme or authority keeps its own
        // path and query; only the other kinds borrow from the base.
        $hasOwnLocation = !self::isBlank($uri->scheme) || !self::isBlank($uri->authority);

        if (!$hasOwnLocation) {
            if (self::isBlank($uri->path)) {
                // "?query", "#fragment" or "": same document as the base.
                $uri->path = $this->path;
                if (self::isBlank($uri->query)) {
                    $uri->query = $this->query;
                }
            } elseif (strpos($uri->path, '/') !== 0) {
                // Relative path: append to the directory of the base path.
                $uri->path = $this->baseDirectory() . $uri->path;
            }
            // Otherwise the path is absolute ("/x") and is used unchanged.
        }

        if (self::isBlank($uri->scheme)) {
            $uri->scheme = $this->scheme;
            if (self::isBlank($uri->authority)) {
                $uri->authority = $this->authority;
            }
        }

        return $uri->toStr();
    }

    /**
     * Reassemble the components into a URL string, normalizing the path.
     * Blank components are omitted together with their delimiter.
     *
     * @return string
     */
    private function toStr()
    {
        $ret = '';

        if (!self::isBlank($this->scheme)) {
            $ret .= "{$this->scheme}:";
        }
        if (!self::isBlank($this->authority)) {
            $ret .= "//{$this->authority}";
        }

        $ret .= $this->normalizePath($this->path);

        if (!self::isBlank($this->query)) {
            $ret .= "?{$this->query}";
        }
        if (!self::isBlank($this->fragment)) {
            $ret .= "#{$this->fragment}";
        }

        return $ret;
    }

    /**
     * Remove "." and ".." segments and collapse repeated slashes.
     *
     * The rewrite rules are applied repeatedly until none of them changes
     * the path any more.
     *
     * @param string $path
     * @return string
     */
    private function normalizePath($path)
    {
        if (self::isBlank($path)) {
            return '';
        }

        $normalized = $path;
        do {
            // "a//b" -> "a/b"
            $normalized = preg_replace('`//+`', '/', $normalized, -1, $c0);
            // Leading "/./" or "/../" (or a path that is exactly "/." or "/..")
            // cannot climb above the root.
            $normalized = preg_replace('`^/\\.\\.?(/|$)`', '/', $normalized, -1, $c1);
            // "/./" in the middle or "/." at the end.
            $normalized = preg_replace('`/\\.(/|$)`', '/', $normalized, -1, $c2);
            // "/segment/.." -> "/". Limit 1 (change by Dominik Habichtsberg,
            // 24 May 2015): with an unlimited replace, climbing up several
            // directory levels failed, so one pair is removed per pass.
            $normalized = preg_replace('`/[^/]*?/\\.\\.(/|$)`', '/', $normalized, 1, $c3);
        } while ($c0 + $c1 + $c2 + $c3 > 0);

        return $normalized;
    }

    /**
     * Directory part of this URI's path (everything up to and including the
     * last "/"), used as prefix for a relative path reference.
     *
     * @return string
     */
    private function baseDirectory()
    {
        $basePath = (string) $this->path;

        if (strpos($basePath, '/') === false) {
            $directory = '';
        } else {
            // Drop the last segment, keep the trailing slash.
            $directory = preg_replace('/\/[^\/]+$/', '/', $basePath);
        }

        if ($directory === '' && self::isBlank($this->authority)) {
            $directory = '/';
        }

        return $directory;
    }

    /**
     * Return capture group $index, or '' when the group did not participate
     * in the match (preg_match omits trailing unmatched groups).
     *
     * @param array $matches
     * @param int   $index
     * @return string
     */
    private static function captured(array $matches, $index)
    {
        return isset($matches[$index]) ? $matches[$index] : '';
    }

    /**
     * True when a component is absent. Unlike empty(), the string "0" counts
     * as present.
     *
     * @param mixed $value
     * @return bool
     */
    private static function isBlank($value)
    {
        return $value === null || $value === '';
    }
}

参考

  • monkeysuffrage/phpuri
0 0
原创粉丝点击