PHPquery php爬虫

来源:互联网 发布:实名淘宝小号单个购买 编辑:程序博客网 时间:2024/05/17 10:26

PHPQuery

// Load the phpQuery library (third-party; must be reachable via the include path).
include 'phpQuery.php';

// Example 1: parse a local XHTML file and select elements from it.
phpQuery::newDocumentFileXHTML('my-xhtml.html')->find('p');
$ul = pq('ul');

// Example 2: fetch a remote page and read parts of it.
$url = 'http://www.baidu.com';
$data = file_get_contents($url);         // raw HTML of the page as a string
$doc = phpQuery::newDocumentFile($url);  // parsed document; also becomes the document pq() queries by default

echo pq("title")->text();       // print the page title
echo pq("div#header")->html();  // print the inner HTML of <div id="header">

// Example 3: walk a list of articles and collect link, title, body and time for each entry.
// NOTE(review): $base (URL prefix for relative links) and getContent() are not
// defined in this snippet; they must be supplied by the surrounding script.
$proxyArray = array();
foreach (pq('.articleList2 ul li', $doc) as $liOne) {
    $proxyOne = array();

    // Items yielded by iterating a phpQuery result are plain DOM nodes;
    // wrap them with pq() to get the phpQuery API back.
    foreach (pq('a', $liOne) as $aOne) {
        $a = pq($aOne)->text();
        $href = $aOne->getAttribute('href');
        $proxyOne['href'] = $base . $href;
        $proxyOne['content'] = getContent($proxyOne['href']);
        $proxyOne['title'] = trim($a);
    }

    foreach (pq('span', $liOne) as $spanOne) {
        $span = pq($spanOne)->text();
        // Strip the surrounding square brackets before parsing the date text.
        $proxyOne['time'] = strtotime(trim($span, '[]'));
    }

    $proxyArray[] = $proxyOne;
}
phpQuery::newDocument($html, $contentType = null)
根据标记(markup)字符串新建一个文档。如果 $contentType 为空,则根据文档自动检测编码;检测失败时,对于 text/html 类型的文档自动赋予 utf-8 编码。

phpQuery::newDocumentFile($file, $contentType = null)
根据文件新建一个文档,用法类似于 newDocument()。

phpQuery::newDocumentHTML($html, $charset = 'utf-8')
phpQuery::newDocumentXHTML($html, $charset = 'utf-8')
phpQuery::newDocumentXML($html, $charset = 'utf-8')
phpQuery::newDocumentPHP($html, $contentType = null)
phpQuery::newDocumentFileHTML($file, $charset = 'utf-8')
phpQuery::newDocumentFileXHTML($file, $charset = 'utf-8')
phpQuery::newDocumentFileXML($file, $charset = 'utf-8')
phpQuery::newDocumentFilePHP($file, $contentType)
pq($param, $context = null);

pq() 相当于 jQuery 的 $()。它主要完成三件事情:

1. 载入标记
// 载入到当前选中的文档(输入字符串的开头不接受文本类型的节点):
pq('<div/>')
// 载入到 ID 为 $pq->getDocumentID() 的文档:
pq('<div/>', $pq->getDocumentID())
// 载入到该 DOM 节点所属的文档:
pq('<div/>', DOMNode)
// 载入到 phpQuery 对象所属的文档:
pq('<div/>', $pq)

2. 运行查询
// 在最后一个选中的文档上执行查询:
pq('div.myClass')
// 在 ID 为 $pq->getDocumentID() 的文档上执行查询:
pq('div.myClass', $pq->getDocumentID())
// 在该 DOM 节点所属的文档上执行查询,并以该节点作为查询的根节点:
pq('div.myClass', DOMNode)
// 在 phpQuery 对象所属的文档上执行查询,
// 并以该对象的节点栈作为查询的根节点:
pq('div.myClass', $pq)

3. 用 phpQuery 对象包装 DOM 节点
foreach (pq('li') as $li)
    // $li 是纯 DOM 节点,用 pq() 将它变为 phpQuery 对象:
    pq($li);
/**
 * Fetch a page with cURL and return the response body.
 *
 * @param string      $url    Address to request.
 * @param bool        $https  When true, SSL peer and host verification are switched off.
 * @param bool|string $proxy  true routes the request through the built-in default
 *                            proxy (61.191.41.130:80); a non-empty string is handed
 *                            to CURLOPT_PROXY as the proxy address; anything else
 *                            means no proxy.
 * @param string      $method 'post' (case-insensitive) sends a POST request; any
 *                            other value leaves cURL's default method in place.
 * @param mixed       $data   Value passed to CURLOPT_POSTFIELDS for POST requests.
 *
 * @return string|false Response body, or false when the handle cannot be created
 *                      or the transfer fails.
 */
function request($url, $https = true, $proxy = false, $method = 'get', $data = null)
{
    // 1. Initialise the handle; bail out instead of passing false to curl_setopt().
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }

    // 2. Configure the transfer.
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Accept compressed responses.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');

    // Derive the Referer from the URL's host. URLs without a host
    // (e.g. file:// or malformed input) simply get no Referer header.
    $host = parse_url($url, PHP_URL_HOST);
    if (is_string($host) && $host !== '') {
        curl_setopt($ch, CURLOPT_REFERER, 'http://' . $host);
    }
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36');

    // Optional proxy.
    if ($proxy === true) {
        curl_setopt($ch, CURLOPT_PROXY, '61.191.41.130');
        curl_setopt($ch, CURLOPT_PROXYPORT, 80);
    } elseif (is_string($proxy) && $proxy !== '') {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    }

    // NOTE(review): this disables certificate validation, which exposes the
    // request to man-in-the-middle attacks. Kept because it is the function's
    // existing default behaviour; pass $https = false to keep verification on.
    if ($https === true) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    // POST support; the comparison is case-insensitive so 'POST' works too.
    if (is_string($method) && strtolower($method) === 'post') {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    }

    // 3. Send the request.
    $content = curl_exec($ch);

    // 4. Release the handle.
    curl_close($ch);

    return $content;
}