统计数据库中的无效网址

来源：互联网　发布：mac 当前用户路径　编辑：程序博客网　时间：2024/05/20 23:06

内容组之前录入的网址中有很大一部分是无效网址，需要把这些无效网址筛选出来，因此我写了一个脚本来处理。处理思路如下：

刚开始用的是 PHP 自带的函数 get_headers()。由于数据有几千条，运行时出现了执行超时的错误。为了弄清程序的时间主要消耗在什么地方，我使用 microtime() 来统计各段代码的运行时间。

microtime(true) 返回的是以秒为单位的浮点数时间戳；在一段代码前后各取一次并相减，即可得到这段代码的耗时。

// Batch job: probe the stored URL of each selected record and write the name
// of every record whose URL does not answer with HTTP 200 to a report file.

// Each probe can block on the network, so raise the script time limit.
set_time_limit(300);

$map["parentid"] = 3;
$ce_data = $M->where($map)->limit(200)->select();

// The report file is (re)created with a banner line; names are appended below.
$txt = APP_PATH."Instution2.txt";
file_put_contents($txt, "\r\n---------------------------------------------2---博物馆前300-500数据啊-----\r\n");

foreach ($ce_data as $key => $row) {
    // Records without a URL are neither probed nor reported.
    if (empty($row["ins_url"])) {
        continue;
    }

    // Prepend a scheme when the stored value does not start with "http".
    // The comparison is case-insensitive so values such as "HTTPS://..."
    // are not turned into "http://HTTPS://...".
    if (stripos($row['ins_url'], "http") !== 0) {
        $ce_data[$key]["ins_url"] = "http://".$row['ins_url'];
    }

    $res = get_code($ce_data[$key]["ins_url"]);

    // Any falsy result (no response) and any status other than 200 is treated
    // as invalid: blank the URL in the in-memory result set and log the name.
    if ($res != 200) {
        $ce_data[$key]["ins_url"] = "";
        file_put_contents($txt, $row["name"]."\r\n", FILE_APPEND);
    }
}

最后发现时间主要消耗在 get_headers() 上。为了节约时间，改用基于 cURL 的如下函数：

/**
 * Fetch the response headers of a URL with cURL and return them as an array.
 *
 * Only the headers are requested (CURLOPT_NOBODY). Each "Name: value" line
 * becomes a string-keyed entry; a line without ": " (such as the status line)
 * is kept under an integer key.
 *
 * NOTE(review): entries are passed through sort() before being flattened, so
 * the integer index of the status line is its post-sort position and is not
 * guaranteed to be 0. Repeated header names overwrite one another.
 *
 * @param  string $url     URL to query
 * @param  int    $timeout maximum time in seconds for the whole transfer
 * @return array  parsed headers; an empty array when the transfer fails
 */
function get_url_headers($url, $timeout = 10)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);          // include headers in the returned string
    curl_setopt($ch, CURLOPT_NOBODY, true);          // do not download the body
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the output instead of printing it
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

    $raw = curl_exec($ch);
    if ($raw === false) {
        // Transfer failed: there is nothing to parse.
        return array();
    }

    // Turn every non-empty line into a one-element array:
    //   "Name: value" => array('Name' => 'value')
    //   anything else => array(0 => line)
    $data = array_filter(array_map(function ($line) {
        $line = trim($line);
        if (!$line) {
            return null; // blank lines are dropped by array_filter()
        }
        $parts = preg_split('/:\s/', $line, 2);
        return count($parts) === 2 ? array($parts[0] => $parts[1]) : $parts;
    }, preg_split('/\n/', $raw)));

    sort($data);

    // Flatten the one-element arrays into the result array itself.
    foreach ($data as $key => $value) {
        $itemKey = array_keys($value)[0];
        if (is_int($itemKey)) {
            $data[$key] = $value[$itemKey];
        } elseif (is_string($itemKey)) {
            $data[$itemKey] = $value[$itemKey];
            unset($data[$key]);
        }
    }

    return $data;
}

/**
 * Get the HTTP status code returned for a URL.   hhb 20160927
 *
 * Redirects are followed, so the code of the last response is reported.
 *
 * @param  string $url     URL to probe
 * @param  int    $timeout maximum time in seconds for the whole transfer
 *                         (defaults to the previously hard-coded 200)
 * @return int    HTTP status code; callers treat a falsy value as "no response"
 */
function get_code($url, $timeout = 200)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_HEADER, false);         // do not include headers in the output
    curl_setopt($ch, CURLOPT_NOBODY, true);          // do not download the body
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the output instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // follow redirects
    // Send "GET" as the request method string.
    // NOTE(review): CURLOPT_NOBODY is still set, so the body is not read even
    // though the verb is GET - confirm this combination is intended.
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET');
    curl_exec($ch);

    return curl_getinfo($ch, CURLINFO_HTTP_CODE);
}

节省了时间。

最后再送大家一个甜点：

PHP 自带的函数 basename() 依赖当前的区域设置（locale），在区域设置不匹配时处理中文文件名可能会丢失字符。需要获取中文文件名时，可以使用如下函数：

/**
 * Return the last component of a path without relying on basename().
 *
 * Both "/" and "\" are accepted as separators. The match is byte-based and
 * the separators are single ASCII bytes, so UTF-8 file names (for example
 * Chinese ones) are returned untouched.
 *
 * @param  string $filename path or file name
 * @return string the part after the last separator; the input itself when it
 *                contains no separator
 */
function get_basename($filename)
{
    // Ignore trailing separators, as basename() does, so "dir/sub/" gives "sub".
    $path = rtrim($filename, '/\\');

    // Greedy ".*" strips everything up to and including the LAST separator.
    // ".*" (not ".+") also handles a separator at position 0 ("/file.txt"),
    // and the "s" flag lets "." cross newlines inside directory names.
    return preg_replace('/^.*[\\\\\\/]/s', '', $path);
}

0 0