PHP正则采集类,采集moviereleased.net站点,可以无限扩展

来源:互联网 发布:简谱视唱教学软件 编辑:程序博客网 时间:2024/06/16 08:19
<?php
/**
 * Regex-driven page scraper plus a small driver script.
 *
 * The Collection class downloads a listing page, extracts links from it with
 * two regular expressions, optionally performs a form login, then downloads
 * every linked page and applies the remaining regular expressions to it.
 * The driver at the bottom of the file configures it for moviereleased.net.
 */

// Cookie jar file, stored next to this script; used by the login request and
// by every detail-page request.
define('COOKIE_PATH', dirname(__FILE__) . '/cookie.txt');

class Collection
{
    /** @var string|null URL of the listing page. */
    private $_url;

    /**
     * @var array|null Ordered pattern list: entry 1 is applied to the listing
     *                 page, entry 2 to each fragment captured by entry 1, and
     *                 all further entries to each detail page.
     */
    private $_regex;

    /** @var array Links collected by the last matchLink() call. */
    private $_match_href = array();

    /** @var string Path of the cookie jar file. */
    private $_cookie_file = COOKIE_PATH;

    /** @var string|null URL the login form data is posted to. */
    private $_login_url;

    /** @var array Results of the last matchDetail() call, keyed by pattern key. */
    private $_detail = array();

    public function __construct()
    {
    }

    /**
     * Configures the listing URL and, optionally, the login URL.
     *
     * @param array $url Map with 'target_url' and optionally 'login_url'.
     * @return false|null False when $url is not an array, otherwise nothing.
     */
    public function setUrl($url)
    {
        if (!is_array($url)) {
            return false;
        }

        // isset()/empty() guards avoid "undefined index" notices when a key
        // is left out of the configuration array.
        $this->_url = isset($url['target_url']) ? $url['target_url'] : null;
        if (!empty($url['login_url'])) {
            $this->_login_url = $url['login_url'];
        }
    }

    /**
     * Stores the ordered pattern list (see the $_regex property for the
     * meaning of each position).
     *
     * @param array $regex Pattern list; each detail pattern must define a named
     *                     group with the same name as its array key, because
     *                     matchDetail() reads $match[$key].
     * @return false|null False when $regex is not an array, otherwise nothing.
     */
    public function setRegex($regex)
    {
        if (!is_array($regex)) {
            return false;
        }

        $this->_regex = $regex;
    }

    /**
     * Posts the login data to the configured login URL, storing any cookies
     * in the cookie jar file.
     *
     * @param string|array $user_data Request body handed to CURLOPT_POSTFIELDS.
     * @return void
     */
    public function userLogin($user_data)
    {
        if (!file_exists($this->_cookie_file)) {
            file_put_contents($this->_cookie_file, '');
        }

        // Without a login URL there is nothing to post to; the original code
        // issued a request against a null URL here.
        if (empty($this->_login_url)) {
            return;
        }

        $this->getData($this->_login_url, $user_data, $this->_cookie_file);
    }

    /**
     * Downloads the listing page and collects the link of every entry.
     *
     * The first pattern is matched globally against the page; each fragment
     * captured by one of its NAMED groups is then matched against the second
     * pattern, whose 'href' group supplies the link.
     *
     * The pattern list is no longer consumed, and the link list is rebuilt on
     * every call, so the method can safely be called more than once.
     * NOTE(review): links are used exactly as captured; relative URLs are not
     * resolved against the listing URL.
     *
     * @return array List of captured href values (possibly empty).
     */
    public function matchLink()
    {
        $this->_match_href = array();

        $patterns = is_array($this->_regex) ? array_values($this->_regex) : array();
        if (count($patterns) < 2) {
            return $this->_match_href;
        }
        list($listPattern, $linkPattern) = $patterns;

        $html = $this->getData($this->_url);
        if (!is_string($html) || $html === '') {
            // Download failed or returned nothing: no links.
            return $this->_match_href;
        }

        if (!preg_match_all($listPattern, $html, $blocks)) {
            return $this->_match_href;
        }

        foreach ($blocks as $group => $fragments) {
            // preg_match_all() reports every named group under its numeric
            // index as well; only the named copy is processed.
            if (is_int($group)) {
                continue;
            }
            foreach ($fragments as $fragment) {
                // Fragments without a usable link are skipped instead of
                // queueing a null/empty URL for download.
                if (preg_match($linkPattern, $fragment, $link)
                    && isset($link['href'])
                    && $link['href'] !== ''
                ) {
                    $this->_match_href[] = $link['href'];
                }
            }
        }

        return $this->_match_href;
    }

    /**
     * Collects the links, optionally logs in, then scrapes every linked page.
     *
     * For each page, every detail pattern is applied once and the value of the
     * named group equal to the pattern's key is appended to the result list of
     * that key. A pattern that does not match contributes null, so all result
     * lists stay aligned with the link list.
     *
     * @param string|array|false $user_data Login request body, or false to skip login.
     * @return array|false Map of pattern key => list of captured values, or
     *                     false when no links were found.
     */
    public function matchDetail($user_data = false)
    {
        $this->matchLink();

        if ($user_data) {
            $this->userLogin($user_data);
        }

        if (empty($this->_match_href)) {
            return false;
        }

        // Everything after the two listing patterns. array_slice() renumbers
        // integer keys and keeps string keys, exactly as the two array_shift()
        // calls of the original implementation did.
        $detailPatterns = array_slice($this->_regex, 2);

        $this->_detail = array();
        foreach ($this->_match_href as $href) {
            // A failed download (false) is treated as an empty page.
            $page = (string) $this->getData($href, false, $this->_cookie_file);

            foreach ($detailPatterns as $key => $pattern) {
                $found = preg_match($pattern, $page, $match);
                $this->_detail[$key][] = ($found && isset($match[$key])) ? $match[$key] : null;
            }
        }

        return $this->_detail;
    }

    /**
     * Performs an HTTP request with cURL and returns the response body.
     *
     * @param string             $url         Address to request.
     * @param string|array|false $data        When truthy, sent as a POST body.
     * @param string|false       $cookie_file When truthy, cookies are read from
     *                                        and written back to this file.
     * @param int                $timeout     Connection timeout in seconds
     *                                        (CURLOPT_CONNECTTIMEOUT).
     * @return string|false Response body, or false when the request failed.
     */
    public function getData($url, $data = false, $cookie_file = false, $timeout = 3)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        $options = array(
            CURLOPT_URL            => $url,
            CURLOPT_HEADER         => 0,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_CONNECTTIMEOUT => $timeout,
        );
        if ($data) {
            $options[CURLOPT_POST] = true;
            $options[CURLOPT_POSTFIELDS] = $data;
        }
        if ($cookie_file) {
            $options[CURLOPT_COOKIEFILE] = $cookie_file;
            $options[CURLOPT_COOKIEJAR] = $cookie_file;
        }
        curl_setopt_array($ch, $options);

        $response = curl_exec($ch);
        curl_close($ch);

        return $response;
    }
}

// ---------------------------------------------------------------------------
// Driver: scrape moviereleased.net and print the captured post images.
// ---------------------------------------------------------------------------
$c = new Collection();

// Order matters: the first two patterns drive matchLink(), the rest are the
// per-page detail patterns whose key names the capture group to extract.
$regex = array(
    'list_h2'      => '/<h2\s*class="\s*posttitle\s*"\s*>(?<link>.*?)<\/h2>/is',
    'alink'        => '/<a\s*href="(?<href>.*?)">.*?<\/a>/is',
    'post_title'   => '/<h2\s*class="\s*posttitle\s*"\s*><a.*?href=".*?".*?>(?<post_title>.*?)<\/a><\/h2>/is',
    'post_content' => '/<div\s*class="postcontent"\s*><p>(?<post_content>.*?)<div\s*class="wumii-hook">/is',
    'post_img'     => '/<div\s*class="postcontent"\s*><p><a\s*href="(?<large_img>.*?)"\s*><img.*?src="(?<post_img>.*?)".*?><\/a>/is',
    'review'       => '/<li.*?class="comment\s*byuser\s*comment-author-admin.*?".*?>(?<review>.*?)<\/li>/is',
);

$url = array(
    'target_url' => 'http://moviereleased.net/',
    'login_url'  => 'http://moviereleased.net/wp-login.php',
);

$c->setRegex($regex);
$c->setUrl($url);

// Login form body; presumably the field names of the site's wp-login.php
// form -- verify against the live form.
$user_data = 'log=testuser&pwd=testuser';

$data = $c->matchDetail($user_data);

// matchDetail() returns false when no links were found; only index the result
// when it really is an array.
if (is_array($data) && isset($data['post_img'])) {
    print_r($data['post_img']);
}