php多线程实现方法汇总

来源:互联网 发布:照片后期调色软件 编辑:程序博客网 时间:2024/04/29 15:56

方法一:

 

用php自带的curl功能实现的多线程下载工具,比file_get_contents,以及linux自带的命令行curl、wget效率高多了,我亲自测试过的。

大家如果觉得好,就拿去直接用吧。

/**
 * Download one or more URLs concurrently through the cURL multi interface.
 *
 * Redirects are followed (at most 2 hops), compressed responses are decoded,
 * and each connection attempt is limited to 5 seconds. The whole batch is
 * given roughly 10 seconds; transfers still running after that are abandoned
 * and whatever cURL has buffered for them so far is returned.
 *
 * @param mixed $urlArray A single URL string, or an array of URL strings.
 * @return mixed false when $urlArray is empty; a string with the response body
 *               when a single URL string was passed; otherwise an array of
 *               response bodies keyed by URL (duplicate URLs share one entry).
 */
function multiDownload($urlArray) {
    if (empty($urlArray)) return false;

    $isStr = false;
    if (is_string($urlArray)) {
        $urlArray = array($urlArray);
        $isStr = true;
    }

    self::log(sprintf("%s Multi thread download begin...", __METHOD__));
    $mh = curl_multi_init();
    $curlArray = array();

    foreach ($urlArray as $i => $url) {
        self::log(sprintf("%s Download url: |%s|...", __METHOD__, $url));

        $curlArray[$i] = curl_init($url);
        curl_setopt_array($curlArray[$i], array(
            CURLOPT_RETURNTRANSFER => true, // keep the body in memory instead of printing it
            CURLOPT_AUTOREFERER    => true, // set Referer automatically when following a redirect
            CURLOPT_FOLLOWLOCATION => true, // follow 301/302 redirects
            CURLOPT_MAXREDIRS      => 2,    // maximum number of redirects to follow
            CURLOPT_HEADER         => 0,    // do not include response headers in the body
            CURLOPT_ENCODING       => "",   // advertise and decode every supported content encoding
            CURLOPT_CONNECTTIMEOUT => 5,    // connection timeout in seconds
        ));
        curl_multi_add_handle($mh, $curlArray[$i]);
    }

    // Drive the transfers. curl_multi_select() blocks until a socket is ready,
    // so progress is made as soon as data arrives rather than once per fixed
    // 100 ms sleep, and the first curl_multi_exec() call is not delayed.
    $running = 0;
    $deadline = microtime(true) + 10.0; // overall budget for the whole batch
    while (true) {
        $status = curl_multi_exec($mh, $running);
        if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) break;
        if ($running <= 0) break;

        $remaining = $deadline - microtime(true);
        if ($remaining <= 0) break; // budget exhausted: give up on unfinished transfers

        if (curl_multi_select($mh, min($remaining, 1.0)) === -1) {
            // select() can fail immediately on some platforms; back off briefly
            // so the loop does not spin.
            usleep(1000);
        }
    }

    $content = array();
    foreach ($urlArray as $i => $url) {
        $content[$url] = curl_multi_getcontent($curlArray[$i]);
    }

    foreach ($urlArray as $i => $url) {
        curl_multi_remove_handle($mh, $curlArray[$i]);
    }
    curl_multi_close($mh);

    self::log(sprintf("%s Multi thread download end...", __METHOD__));

    // A string argument yields a string result.
    if ($isStr) $content = implode('', $content);
    return $content;
}

 

 

方法二

 

php实现异步加载页面模块时(不同模块不同地址),怎么并发请求,搜索了些资料,找到这篇文章转载之。感觉以后生产环境确实能用的上。

核心提示:一般CURL 抓网页的方法, 是一页一页抓, 假设要抓 4页, 所费时间各别是 5,10,7,5 秒, 那全部总合所花的时间就是 5 + 10 + 7 + 5 = 27 秒。

一般CURL 抓网页的方法, 是一页一页抓, 假设要抓 4页, 所费时间各别是 5,10,7,5 秒, 那全部总合所花的时间就是 5 + 10 + 7 + 5 = 27 秒。

若能同时间去抓取多个网页, 所花费的时间 5,10,7,5 秒, 全部总合所花的时间是 10 秒。(花费最多时间的秒数)

于JavaScript 可使用 AJAX 的 async(YAHOO.util.Connect.asyncRequest)来达成, 于 PHP 可以用 CURL 来达成此 Multi-Threading 的效果。

程序(async.php)

 函数:

/**
 * Fetch several URLs concurrently through the cURL multi interface.
 *
 * Each request follows up to 7 redirects and is limited to 30 seconds.
 *
 * @param array $url_array URLs to fetch.
 * @param int   $wait_usec Optional pause, in microseconds, between polling
 *                         rounds (e.g. 250000 = 0.25 s). With 0 the function
 *                         blocks on socket activity instead of sleeping.
 * @return array|false false when $url_array is not an array; otherwise a list
 *                     indexed 0..n-1 in the iteration order of $url_array,
 *                     each entry being the response body, or false when that
 *                     transfer failed.
 */
function async_get_url($url_array, $wait_usec = 0)
{
    if (!is_array($url_array)) {
        return false;
    }

    $wait_usec = intval($wait_usec);

    $data    = array();
    $handle  = array();
    $results = array(); // cURL result code per handle index, once the transfer is done
    $running = 0;

    $mh = curl_multi_init();

    foreach ($url_array as $url) {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body, do not print it
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow 302 redirects
        curl_setopt($ch, CURLOPT_MAXREDIRS, 7);

        curl_multi_add_handle($mh, $ch);

        $handle[] = $ch;
    }

    /* Run the transfers */
    do {
        $status = curl_multi_exec($mh, $running);

        // Per-transfer results are only available from the multi handle's
        // message queue; curl_errno() on an easy handle stays 0 until these
        // messages have been read.
        while (($info = curl_multi_info_read($mh)) !== false) {
            $idx = array_search($info['handle'], $handle, true);
            if ($idx !== false) {
                $results[$idx] = $info['result'];
            }
        }

        if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
            break; // multi-level failure: unfinished transfers are reported as false
        }

        if ($running > 0) {
            if ($wait_usec > 0) {
                usleep($wait_usec);
            } elseif (curl_multi_select($mh, 1.0) === -1) {
                // select() can fail immediately on some platforms; back off
                // briefly so the loop does not spin.
                usleep(1000);
            }
        }
    } while ($running > 0);

    /* Collect the bodies */
    foreach ($handle as $i => $ch) {
        $content  = curl_multi_getcontent($ch);
        $ok       = isset($results[$i]) && $results[$i] === CURLE_OK;
        $data[$i] = $ok ? $content : false;
    }

    /* Detach the handles */
    foreach ($handle as $ch) {
        curl_multi_remove_handle($mh, $ch);
    }

    curl_multi_close($mh);

    return $data;
}



 调用:

// Demo: fetch three slow test URLs at once and report the elapsed wall time.
$startedAt = time();

$culs = array(
    "http://a.com/sleep.php?t=3",
    "http://a.com/sleep.php?t=6",
    "http://a.com/sleep.php?t=4",
);

$responses = async_get_url($culs);
print_r($responses); // one entry per URL, indexed from 0 in request order

echo "<br>执行时间:", time() - $startedAt;

 

sleep.php

// Test endpoint: prints "sleep<N>" and then sleeps for N seconds, where N comes
// from the "t" query parameter. A missing, empty, non-numeric or non-positive
// value falls back to 3 seconds.
$raw = isset($_GET['t']) ? $_GET['t'] : null;
$t = is_numeric($raw) ? (int) $raw : 0;
if ($t <= 0) {
    $t = 3;
}
// Cap the delay so a request cannot tie up a worker for an arbitrary time.
$t = min($t, 60);

// $t is an integer here, so echoing it cannot inject markup into the page.
echo 'sleep', $t;
sleep($t);

 

 

 

方法三

 

/**
 * Fetches a set of URLs concurrently through the cURL multi interface and
 * prints each response.
 *
 * The setters return $this so calls can be chained.
 */
class MultiHttpRequest
{
    /** @var array|false URLs to fetch; start() only runs when this is a non-empty array. */
    public $urls = array();

    /** @var mixed Value passed to CURLOPT_HEADER (truthy = include response headers in the output). */
    public $curlopt_header = 1;

    /**
     * @var string Upper-cased HTTP method name.
     *             NOTE(review): this value is stored but add_handle() does not
     *             apply it to the requests, so every request uses cURL's default.
     */
    public $method = "GET";

    /**
     * @param array|false $urls URLs to fetch; may also be supplied later via set_urls().
     */
    public function __construct($urls = false)
    {
        $this->urls = $urls;
    }

    /**
     * Replace the list of URLs to fetch.
     *
     * @param array $urls
     * @return $this
     */
    public function set_urls($urls)
    {
        $this->urls = $urls;
        return $this;
    }

    /**
     * Choose whether response headers are included in the output.
     *
     * @param mixed $b Value passed to CURLOPT_HEADER.
     * @return $this
     */
    public function is_return_header($b)
    {
        $this->curlopt_header = $b;
        return $this;
    }

    /**
     * Store the HTTP method name, normalised to upper case.
     *
     * @param string $m
     * @return $this
     */
    public function set_method($m)
    {
        // Write to the declared $method property (the misspelled name created
        // an unrelated dynamic property and left $method untouched).
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Fetch all configured URLs concurrently and echo each response followed
     * by a blank line.
     *
     * @return array|false false when no URLs are configured; otherwise the
     *                     responses keyed like $this->urls.
     */
    public function start()
    {
        if (!is_array($this->urls) || count($this->urls) === 0) {
            return false;
        }

        $curl = array();
        $text = array();

        $handle = curl_multi_init();

        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);

        foreach ($this->urls as $k => $v) {
            $text[$k] = curl_multi_getcontent($curl[$k]);
            echo $text[$k], "\n\n";
            curl_multi_remove_handle($handle, $curl[$k]);
        }

        curl_multi_close($handle);

        return $text;
    }

    /**
     * Create an easy handle for $url and attach it to the multi handle.
     *
     * @param mixed  $handle cURL multi handle.
     * @param string $url
     * @return mixed The new cURL easy handle.
     */
    private function add_handle($handle, $url)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    /**
     * Run every transfer attached to the multi handle until all have finished.
     *
     * @param mixed $handle cURL multi handle.
     * @return void
     */
    private function exec_handle($handle)
    {
        $active = 0;
        do {
            $status = curl_multi_exec($handle, $active);
            if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
                break;
            }
            // Block until a socket is ready instead of spinning the CPU.
            if ($active > 0 && curl_multi_select($handle, 1.0) === -1) {
                // select() can fail immediately on some platforms; back off briefly.
                usleep(1000);
            }
        } while ($active > 0);
    }
}

// Demo: request three sites at once and print each response.
$urls = array(
    "http://baidu.com",
    "http://dzone.com",
    "http://www.g.cn",
);
$mp = new MultiHttpRequest();
$mp->set_urls($urls)->start();

 

 

方法四:

 

PHP 利用 Curl Functions 可以完成各种传送文件操作,比如模拟浏览器发送 GET、POST 请求等等。受限于 PHP 语言本身不支持多线程,开发爬虫程序的效率并不高,这时候往往需要借助 Curl Multi Functions,它可以并发地访问多个 URL 地址(实际上是单线程内的并发 I/O,效果类似多线程)。既然 Curl Multi Functions 如此强大,能否用它来编写并发下载文件的程序呢?当然可以,下面给出我的代码:

代码1:将获得的代码直接写入某个文件

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
<?php
// Fetch several pages concurrently and let cURL write the responses straight
// into one output file.
$urls = array(
    'http://www.sina.com.cn/',
    'http://www.sohu.com/',
    'http://www.163.com/'
); // pages to fetch
$save_to = '/test.txt'; // fetched content is appended to this file
$st = fopen($save_to, "a");
if ($st === false) {
    exit("Unable to open {$save_to} for appending\n");
}

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Every handle writes into the same stream.
    // NOTE(review): the transfers run concurrently, so pieces of different
    // responses can presumably end up interleaved in the file - confirm that
    // this is acceptable for the use case.
    curl_setopt($conn[$i], CURLOPT_FILE, $st);
    curl_multi_add_handle($mh, $conn[$i]);
} // set-up

// Run all transfers to completion.
$active = 0;
do {
    $status = curl_multi_exec($mh, $active);
    if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
        break;
    }
    // Block until a socket is ready instead of spinning the CPU.
    if ($active > 0 && curl_multi_select($mh, 1.0) === -1) {
        // select() can fail immediately on some platforms; back off briefly.
        usleep(1000);
    }
} while ($active > 0);

foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
} // clean-up
curl_multi_close($mh);
fclose($st);
?>

代码2:将获得的代码先放入变量,再写入某个文件

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
<?php
// Fetch several pages concurrently, collect each response as a string, then
// append the responses to one output file in the order the URLs are listed.
$urls = array(
    'http://www.sina.com.cn/',
    'http://www.sohu.com/',
    'http://www.163.com/'
);
$save_to = '/test.txt'; // fetched content is appended to this file
$st = fopen($save_to, "a");
if ($st === false) {
    exit("Unable to open {$save_to} for appending\n");
}

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true); // return the body as a string instead of printing it
    curl_multi_add_handle($mh, $conn[$i]);
}

// Run all transfers to completion.
$active = 0;
do {
    $status = curl_multi_exec($mh, $active);
    if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
        break;
    }
    // Block until a socket is ready instead of spinning the CPU.
    if ($active > 0 && curl_multi_select($mh, 1.0) === -1) {
        // select() can fail immediately on some platforms; back off briefly.
        usleep(1000);
    }
} while ($active > 0);

foreach ($urls as $i => $url) {
    $data = curl_multi_getcontent($conn[$i]); // the fetched body as a string
    fwrite($st, $data); // written to the file here; it could equally be stored elsewhere, e.g. in a database
}

foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
}
curl_multi_close($mh);
fclose($st);
?>

 

原创粉丝点击