PHP搜索引擎

来源:互联网 发布:淘宝卖写真集违规吗 编辑:程序博客网 时间:2024/05/16 11:24

简单的PHP搜索引擎源代码，需要开启PHP的cURL扩展。功能：对某一网址进行检索，获取网站的基本信息，同时提取该页面中的所有链接。
<?php
/**
 * Minimal single-page crawler.
 *
 * Downloads one URL (via the cURL extension) and extracts the page's
 * <meta> tags, its <title> and every <a href="..."> link it contains.
 * Call start() to run the extraction and show() to dump the result.
 */
class Engine{
    private $_url = '';          // URL of the page to crawl
    // Extracted page information, keyed by 'url', 'meta', 'title' and 'links'.
    // Must start out as an array: since PHP 7.1 an empty string is no longer
    // silently converted to an array when a string key is assigned to it.
    private $_sites = array();

    /**
     * @param string $url Absolute URL of the page to crawl.
     */
    public function __construct($url){
        $this->_url = $url;
    }

    /**
     * Run the crawl: download the page and fill the internal result set.
     *
     * getDetail() and socketOpen() are available as well, but they are not
     * part of the default run.
     *
     * @return void
     */
    public function start(){
        $content = $this->getContent($this->_url);
        $this->_sites['url'] = $this->_url;
        $this->_sites['meta'] = $this->getMeta($content);
        $this->_sites['title'] = $this->getTitle($content);
        $this->_sites['links'] = $this->getLinks($content);
    }

    /**
     * Extract the <meta name="..." content="..."> tags of a page.
     *
     * get_meta_tags() only accepts a file name, so the markup is written to
     * a private temporary file that is removed again before returning.
     *
     * @param string $content HTML source of the page.
     * @return array Map of meta name => content; empty when nothing could be read.
     */
    public function getMeta($content){
        $file = tempnam(sys_get_temp_dir(),'meta');
        if($file === false){
            return array();
        }
        $meta = false;
        if(file_put_contents($file,$content) !== false){
            $meta = get_meta_tags($file);
        }
        unlink($file);
        return is_array($meta) ? $meta : array();
    }

    /**
     * Return the first 400 bytes of the <body> markup, with <script> and
     * <style> blocks removed.
     *
     * The cut is byte based, so a multi-byte character may be split at the
     * end of the excerpt.
     *
     * @param string $content HTML source of the page.
     * @return string The excerpt, or '' when the page has no <body> element.
     */
    public function getDetail($content){
        // "s" lets "." cross line breaks; [^>]* tolerates attributes on <body>.
        if(!preg_match('/<body\b[^>]*>(.*?)<\/body>/is',$content,$matchs)){
            return '';
        }
        $body = $this->stripHTML($matchs[1]);
        return substr($body,0,400);
    }

    /**
     * Extract the text of the first <title> element.
     *
     * @param string $content HTML source of the page.
     * @return string Title with surrounding whitespace trimmed, or '' when absent.
     */
    public function getTitle($content){
        // Non-greedy so that a second <title> (e.g. inside inline SVG) is not
        // swallowed; "s" so that a title spread over several lines still matches.
        if(!preg_match('/<title\b[^>]*>(.*?)<\/title>/is',$content,$matchs)){
            return '';
        }
        return trim($matchs[1]);
    }

    /**
     * Collect every <a ... href="...">text</a> link of a page.
     *
     * @param string $content HTML source of the page.
     * @return array array('href' => list of targets, 'name' => list of link
     *               texts with tags stripped); both lists share the same indexes.
     */
    public function getLinks($content){
        // \b keeps <abbr>, <area>, ... out; the back-reference \1 accepts both
        // single- and double-quoted href values.
        $pat = '/<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1[^>]*>(.*?)<\/a>/is';
        preg_match_all($pat,$content,$matchs);
        $result = array();
        $result['href'] = $matchs[2];
        $result['name'] = $this->stripTags($matchs[3]);
        return $result;
    }

    /**
     * Fetch a page over a raw socket instead of cURL.
     *
     * Accepts a full URL or a bare host name. On failure the error is echoed
     * and false is returned.
     *
     * @param string $url URL or host name to request.
     * @return string|false Raw HTTP response (status line and headers
     *                      included), or false when the connection failed.
     */
    public function socketOpen($url){
        // parse_url() only recognises the host when a scheme is present.
        if(!preg_match('#^[a-z][a-z0-9+.\-]*://#i',$url)){
            $url = 'http://'.$url;
        }
        $parts = parse_url($url);
        $errno = 0;
        $errstr = 'invalid URL';
        $fp = false;
        if($parts !== false && isset($parts['host'])){
            $host = $parts['host'];
            $secure = isset($parts['scheme']) && strtolower($parts['scheme']) === 'https';
            $defaultPort = $secure ? 443 : 80;
            $port = isset($parts['port']) ? $parts['port'] : $defaultPort;
            // fsockopen() expects a host name (optionally with a transport
            // prefix), never a complete URL.
            $fp = fsockopen(($secure ? 'ssl://' : '').$host,$port,$errno,$errstr,30);
        }
        if($fp === false){
            echo "连接失败:$errstr($errno)<br/>";
            return false;
        }

        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        if(isset($parts['query'])){
            $path .= '?'.$parts['query'];
        }
        $hostHeader = ($port == $defaultPort) ? $host : $host.':'.$port;

        $out = "GET $path HTTP/1.1\r\n";
        $out .= "Host: $hostHeader\r\n";
        // The empty line terminates the header block; without it the server
        // keeps waiting for more headers.
        $out .= "Connection: Close\r\n\r\n";
        fwrite($fp,$out);

        $content = '';
        while(!feof($fp)){
            $line = fgets($fp,1024);
            if($line === false){
                // Read error or timeout: stop instead of spinning on feof().
                break;
            }
            $content .= $line;
        }
        fclose($fp);
        return $content;
    }

    /**
     * Download the body of a URL with cURL.
     *
     * @param string $url URL to download.
     * @return string Response body, or '' when the transfer failed.
     */
    public function getContent($url){
        $ch = curl_init($url);
        if($ch === false){
            return '';
        }
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.0)");
        // Have curl_exec() return the body instead of printing it, so no
        // output buffering is needed to capture it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $content = curl_exec($ch);
        if(PHP_VERSION_ID < 80000){
            // Handles are objects freed automatically from PHP 8.0 on.
            curl_close($ch);
        }
        return is_string($content) ? $content : '';
    }

    /**
     * Remove <script> and <style> elements, including their contents.
     *
     * @param string $string HTML fragment.
     * @return string The fragment without script and style blocks.
     */
    public function stripHTML($string){
        // [^>]* also matches tags without attributes; "s" covers blocks that
        // span several lines.
        $pat = array(
            '/<script\b[^>]*>.*?<\/script>/is',
            '/<style\b[^>]*>.*?<\/style>/is'
        );
        return preg_replace($pat,'',$string);
    }

    /**
     * Strip HTML tags from every string of a (possibly nested) array.
     *
     * The array is modified in place and also returned.
     *
     * @param array $arr Array of strings and/or nested arrays.
     * @return array The cleaned array.
     */
    public function stripTags(&$arr){
        foreach ($arr as $key => $val )
        {
            if(is_array($val)){
                $this->stripTags($arr[$key]);
            }
            else{
                $arr[$key] = strip_tags($val);
            }
        }
        return $arr;
    }

    /**
     * Print the collected page information as a preformatted HTML dump.
     *
     * @return void
     */
    public function show(){
        // The data comes from a remote page, so it is escaped before being
        // echoed. ISO-8859-1 makes the escaping purely byte-wise: only the
        // ASCII characters & < > " ' are replaced, which leaves UTF-8 and GBK
        // text intact regardless of the crawled page's charset.
        $dump = htmlspecialchars(print_r($this->_sites,true),ENT_QUOTES,'ISO-8859-1');
        echo "<pre>";
        echo $dump;
        echo "</pre>";
    }
    //End Class Engine
}
// Demo run: crawl one seed page and print everything that was extracted.
$seedUrl = 'http://www.163.com';
$engine = new Engine($seedUrl);
$engine->start();
$engine->show();
?>
这只是引擎的主要部分。接下来要做的是把相关信息存入数据库，再对所有获取到的链接继续检索，并把检索到的信息同样存入数据库。核心部分在于：获取这些信息之后，根据信息内容为网站设定关键字，并给它一个排名，供以后搜索使用。


本文来自PHP100

0 0
原创粉丝点击