基于dede的一个自定义采集器
来源:互联网 发布:semantic ui 怎么 js 编辑:程序博客网 时间:2024/04/26 11:03
搞不懂dede的采集器,有些东西难以满足需求,于是写了这个
<?php
/**
 * Scrapes articles from juexiang.com and submits each one to a DedeCMS
 * back end by POSTing the "add article" admin form.
 *
 * The script runs top to bottom when loaded: it walks a fixed set of remote
 * list pages, fetches every article linked from them and posts the result.
 *
 * @author 秦仙游 <dab1117@163.com>
 * @version 1.0
 * @package suibiwu.com
 */

/* Remove the execution time limit for this request. */
set_time_limit(0);
header('Content-Type:text/html;charset=utf8');
require_once("simple_html_dom.php");
require_once("./framework/tool/MySQLDBTool.class.php");
require_once("./framework/tool/ImageTool.class.php");

/*
 * Crawl map. The key is sent to DedeCMS as the article's typeid; the value is
 * appended to the remote list URL (http://www.juexiang.com/list/<value>).
 */
$arr = array();
$arr['24'] = 10196;
$arr['21'] = 10197;

/* Remote site root, used to turn relative links into absolute ones. */
$GLOBALS['prefix'] = 'http://www.juexiang.com';

if (count($arr) > 0) {
    foreach ($arr as $key => $value) {
        echo "<h2 style='color:red;'>{$key}分类</h2>";
        $typeid = $key;
        /* Secondary category id; always posted empty. */
        $typeid2 = '';
        $base_url = 'http://www.juexiang.com/list/' . $value;

        /* Only list pages 6..8 are crawled. */
        for ($i = 6; $i <= 8; $i++) {
            $list_url = $base_url . '?p=' . $i;
            $items = getArcticleList($list_url);
            echo "<h3>第{$i}页</h3>";

            foreach ($items as $v) {
                $id = $v['id'];
                console("获取到id号:$id");
                myFlush();
                /* Throttle requests to the remote site. */
                sleep(1);

                $obj = getArticle('detail/' . $id);
                console("获取到文档信息:$id");
                $obj['typeid'] = $typeid;
                $obj['typeid2'] = $typeid2;

                /* The admin page answers with HTML; show only its text. */
                $info = htmlToText(postToDede($obj)) . "\r\n";
                console("执行结果:$info");
                echo '<hr/>';
                myFlush();
            }

            /* Pause between list pages (anti-hotlinking precaution; may have no effect). */
            sleep(3);
        }
    }
}

/**
 * Prints a progress message as a small HTML paragraph.
 *
 * @param string $msg Message to print; it is echoed without escaping.
 * @return void
 */
function console($msg) {
    echo "<p style='line-height:20px; font-size:12px; line-height:20px;'>{$msg}</p>";
}

/**
 * Pushes pending output to the client so progress shows up while the crawl
 * is still running.
 *
 * @return void
 */
function myFlush() {
    /* ob_flush() raises a notice when no output buffer is active. */
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}

/**
 * Fetches one article and decorates it with the extra fields that the
 * DedeCMS form / tables expect.
 *
 * @param string $id Path fragment of the article, e.g. "detail/12345".
 * @return array Article data from pickOne() plus description, source, flag
 *               (list of single-letter flags), keywords and the randomly
 *               generated vote counters.
 */
function getArticle($id) {
    $obj = pickOne($id);

    /* Description: first 150 characters of the tag-free body. */
    $body_without_html = htmlToText($obj['body']);
    $obj['description'] = mb_substr($body_without_html, 0, 150, 'UTF-8');
    $obj['source'] = '绝想日记网';

    /*
     * Article attribute flags. They are accumulated into a list; each of
     * c / h / s is added with probability 1/3, f and p only when the article
     * has a thumbnail.
     */
    $flags = array();
    if (!empty($obj['litpic'])) {
        $flags[] = 'f';
        $flags[] = 'p';
    }
    foreach (array('c', 'h', 's') as $letter) {
        if (rand(1, 3) == 2) {
            $flags[] = $letter;
        }
    }
    $obj['flag'] = $flags;

    $obj['keywords'] = '随笔坞';

    /* Random starting values for the custom vote counters. */
    $obj['qianbian'] = rand(0, 20);
    $obj['zhichi'] = rand(0, 300);
    $obj['zhaoma'] = rand(0, 70);
    $obj['gaoxiao'] = rand(0, 80);
    $obj['chedan'] = rand(0, 100);
    $obj['bujie'] = rand(0, 200);
    $obj['chijing'] = rand(0, 50);
    $obj['henbang'] = rand(0, 200);

    return $obj;
}

/**
 * Submits an article to the DedeCMS "add article" admin page via HTTP POST.
 *
 * @param array $obj Article as produced by getArticle(), with typeid and
 *                   typeid2 already set by the caller.
 * @return string|false Response body of the admin page, or false when the
 *                      cURL request fails.
 */
function postToDede($obj) {
    /* Form fields of the admin page. */
    $data = array(
        'channelid' => '1',
        'dopost' => 'save',
        'title' => $obj['title'],
        'shorttitle' => '',
        'redirecturl' => '',
        'tags' => '',
        'weight' => '1',
        'picname' => '',
        'litpic' => '',
        'source' => $obj['source'],
        'writer' => $obj['writer'],
        'typeid' => $obj['typeid'],
        'typeid2' => $obj['typeid2'],
        'keywords' => '',
        'autokey' => '1',
        'description' => '',
        'qianbian' => $obj['qianbian'],
        'zhichi' => $obj['zhichi'],
        'zhaoma' => $obj['zhaoma'],
        'gaoxiao' => $obj['gaoxiao'],
        'chedan' => $obj['chedan'],
        'bujie' => $obj['bujie'],
        'chijing' => $obj['chijing'],
        'henbang' => $obj['henbang'],
        'music' => $obj['music'],
        'dede_addonfields' => 'qianbian,int;zhichi,int;zhaoma,int;gaoxiao,int;chedan,int;bujie,int;chijing,int;henbang,int;music,text',
        'remote' => '1',
        'dellink' => '1',
        'autolitpic' => '1',
        'needwatermark' => '1',
        'sptype' => 'hand',
        /* NOTE(review): 'voteid' looks like a copy/paste slip for a page size — confirm. */
        'spsize' => 'voteid',
        'body' => $obj['body'],
        'voteid' => '',
        'notpost' => '0',
        'click' => $obj['views'],
        'sortup' => '0',
        'color' => '',
        'arcrank' => '',
        'money' => '0',
        'pubdate' => date('Y-m-d H:i:s', $obj['senddate']),
        'ishtml' => '0',
        'filename' => '',
        'templet' => '',
        /* Click coordinates of the image submit button. */
        'imageField.x' => '24',
        'imageField.y' => '14'
    );

    /* Flags are posted as flag[0], flag[1], ... */
    if (isset($obj['flag']) && is_array($obj['flag'])) {
        foreach (array_values($obj['flag']) as $index => $value) {
            $data['flag[' . $index . ']'] = $value;
        }
    }

    $url = 'http://sx.cc/administrator/article_add.php';
    $pro = curl_init();
    curl_setopt($pro, CURLOPT_URL, $url);
    curl_setopt($pro, CURLOPT_POST, true);
    curl_setopt($pro, CURLOPT_RETURNTRANSFER, 1);
    /* Session cookie string of a logged-in admin; must be replaced with your own. */
    curl_setopt($pro, CURLOPT_COOKIE, 'menuitems=1_1%2C2_1%2C3_1%2C4_1; Hm_lvt_86f43783acc56b0c8abb5bb039edc763=1447468152; lastCid=12; lastCid__ckMd5=32334e2dceed96e5; PHPSESSID=3i9hfpit8h8gvoho4mbptou1h7; DedeUserID=1; DedeUserID__ckMd5=21656f81551e2194; DedeLoginTime=1447951987; DedeLoginTime__ckMd5=e8b68eb16c46c0a4; dede_vote_2365=1; ENV_GOBACK_URL=%2Fadministrator%2Fcontent_list.php%3Fchannelid%3D1');
    curl_setopt($pro, CURLOPT_REFERER, 'http://sx.cc/administrator/article_add.php?channelid=1&cid=0');
    curl_setopt($pro, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36');
    curl_setopt($pro, CURLOPT_POSTFIELDS, $data);

    $result = curl_exec($pro);
    /* Release the handle explicitly, as downloadPicture() does. */
    curl_close($pro);

    return $result;
}

/**
 * Writes an article straight into the DedeCMS tables.
 *
 * Not used by the crawl above (the original author marked it unusable and
 * recommends posting through postToDede() instead).
 *
 * NOTE(review): every statement is built by string interpolation of scraped
 * content, so a quote in the title or body breaks the SQL and the function is
 * open to SQL injection. It must be moved to parameterised queries before it
 * is used; that needs the MySQLDBTool API, which is not visible here.
 *
 * @param array $obj Article as produced by getArticle().
 * @return bool True when all rows were written, false when the title already
 *              exists or an insert failed.
 */
function saveToDatabase($obj) {
    $option = array(
        'host' => 'localhost',
        'port' => '3306',
        'user' => 'root',
        'password' => 'sa123',
        'charset' => 'utf8',
        'database' => 'dedecmsv57utf8sp1',
        'prefix' => 'dede_'
    );
    $db = MySQLDBTool::getInstance($option);

    /* Skip articles whose title is already stored. */
    $sql = "select count(*) from dede_archives where title='{$obj['title']}'";
    $count = $db->executeScalar($sql);
    if ($count > 0) {
        return false;
    }

    /* getArticle() yields the flags as a list; the column takes "f,p,c". */
    $flag = '';
    if (isset($obj['flag'])) {
        $flag = is_array($obj['flag']) ? implode(',', $obj['flag']) : $obj['flag'];
    }
    /* pickOne() does not provide a thumbnail, so default it. */
    $litpic = isset($obj['litpic']) ? $obj['litpic'] : '';

    $sql1 = "INSERT INTO dede_arctiny(typeid,typeid2,arcrank,channel,senddate,sortrank,`mid`) VALUES ('{$obj["typeid"]}','{$obj["typeid2"]}',0,1,'{$obj["senddate"]}','{$obj["sortrank"]}',1)";
    /* Presumably runs the INSERT and returns the new id — confirm against MySQLDBTool. */
    $id = $db->last_insert_id($sql1);
    if (!($id > 0)) {
        return false;
    }

    $sql2 = "INSERT INTO dede_archives(id,typeid,typeid2,sortrank,flag,ismake,channel,arcrank,click,money,title,shorttitle,color,writer,source,litpic,"
        . "pubdate,senddate,`mid`,keywords,lastpost,scores,goodpost,badpost,voteid,notpost,description,filename,dutyadmin,tackid,mtype,weight) VALUES"
        . "({$id},'{$obj['typeid']}','{$obj['typeid2']}','{$obj['sortrank']}','{$flag}','-1','1','0','{$obj['views']}','0','{$obj['title']}','','','{$obj['writer']}',"
        . "'{$obj['source']}','{$litpic}','{$obj['pubdate']}','{$obj['pubdate']}','1','{$obj['keywords']}','0','0','0','0','0','0','{$obj['description']}','','1','0','0','0')";
    $sql2_r = $db->exec($sql2);

    $sql3 = "INSERT INTO dede_addonarticle(aid,typeid,body,redirecturl,templet,userip,qianbian,zhichi,zhaoma,gaoxiao,chedan,bujie,chijing,henbang,music) VALUES "
        . "({$id},{$obj['typeid']},'{$obj['body']}','','','127.0.0.1','{$obj['qianbian']}','{$obj['zhichi']}','{$obj['zhaoma']}','{$obj['gaoxiao']}','{$obj['chedan']}','{$obj['bujie']}','{$obj['chijing']}','{$obj['henbang']}','{$obj['music']}')";
    $sql3_r = $db->exec($sql3);

    /* Manual rollback: remove whatever was written if either insert failed. */
    if ($sql3_r < 0 or $sql2_r < 0) {
        $db->exec('delete from dede_archives where id=' . $id);
        $db->exec('delete from dede_addonarticle where aid=' . $id);
        $db->exec('delete from dede_arctiny where id=' . $id);
        return false;
    }

    return true;
}

/**
 * Reduces an HTML fragment to its text: tags are removed and so is all
 * whitespace.
 *
 * @param string $html_str HTML fragment.
 * @return string Text without tags and without any whitespace.
 */
function htmlToText($html_str) {
    /* Drop every tag (ungreedy, across line breaks). */
    $text = preg_replace('/<.*>/sU', '', $html_str);
    /*
     * Drop non-breaking spaces in entity and UTF-8 byte form, plus plain
     * spaces; \s below does not cover the first two.
     */
    $text = preg_replace('/&nbsp;|\xC2\xA0| /', '', $text);
    return preg_replace('/\s+/', '', $text);
}

/**
 * Adds a number of days to a Unix timestamp (the way DedeCMS computes
 * sortrank).
 *
 * @param int $ntime Base timestamp.
 * @param int $aday Number of days to add.
 * @return int Resulting timestamp.
 */
function AddDay($ntime, $aday) {
    $dayst = 3600 * 24;
    return $ntime + ($aday * $dayst);
}

/**
 * Collects the article links found on one remote list page.
 *
 * @param string $url Address of the list page.
 * @return array List of arrays with the keys href (absolute link), title and
 *               id (numeric article id); empty when nothing was found.
 */
function getArcticleList($url) {
    /* Always return an array so callers can iterate an empty page safely. */
    $res = array();

    $dom = new simple_html_dom;
    $dom->load_file($url);

    $items = $dom->find('.left .item .arttitle');
    if (is_array($items)) {
        foreach ($items as $v) {
            /* The link is the first child of the title node. */
            if (!isset($v->children[0]) || !isset($v->children[0]->attr['href'])) {
                continue;
            }
            $link = $v->children[0];
            $href = $GLOBALS['prefix'] . $link->attr['href'];

            /* Entries without a numeric id cannot be fetched later; skip them. */
            if (!preg_match('/(\d+)\.html/isU', $href, $temp_1)) {
                continue;
            }

            $res[] = array(
                'href' => $href,
                'title' => $link->innertext,
                'id' => $temp_1[1]
            );
        }
    }

    /* Release the parser's memory. */
    $dom->clear();
    unset($dom);

    return $res;
}

/**
 * Downloads one article page from the remote site and extracts its data.
 *
 * @param string $id Path fragment of the article, e.g. "detail/12345".
 * @return array Keys: title, writer, body (cleaned HTML), pubdate, senddate,
 *               views, sortrank and music (mp3 URL or '').
 */
function pickOne($id) {
    $url = $GLOBALS['prefix'] . "/{$id}.html";
    echo $url;

    $dom = new simple_html_dom;
    $dom->load_file($url);

    $postHeader_title = $dom->find('.left h1[0]');
    $pubtime = $dom->find('.pubtime');
    $info = $dom->find('.author a');
    $content = $dom->find('.content');
    $views = $dom->find('.views b');

    /* Take the first match of each selector; fall back to '' when missing. */
    $writer = isset($info[0]) ? $info[0]->innertext : '';
    $title = isset($postHeader_title[0]) ? $postHeader_title[0]->innertext : '';
    $senddate = $pubdate = strtotime(isset($pubtime[0]) ? $pubtime[0]->innertext : '');
    $body = isset($content[0]) ? $content[0]->innertext : '';
    $views = isset($views[0]) ? $views[0]->innertext : '';

    /* Release the parser's memory. */
    $dom->clear();
    unset($dom);

    /* Background music: first "?mp3=....mp3" that is followed by "autostart". */
    $music = '';
    if (preg_match_all('/\?mp3=(.+\.mp3).*autostart/Us', $body, $res)) {
        $music = $res[1][0];
    }

    /* Clean-up, in this order: style blocks, leading empty paragraph, divs, links. */
    $body = preg_replace('/\s*<style.+<\/style>/s', '', $body);
    $body = preg_replace('/^<p>\s*<\/p>/', '', $body);
    $body = preg_replace('/<div.*>(.*)<\/div>/isU', '', $body);
    /* Links are unwrapped: the anchor text stays, the tag goes. */
    $body = preg_replace('/<a.*>(.*)<\/a>/isU', '$1', $body);

    /* Rewrite the image tags in the body; the list of matches is not needed here. */
    saveImages($body, $res_img);

    $obj = array(
        'title' => $title,
        'writer' => $writer,
        'body' => $body,
        'pubdate' => $pubdate,
        'senddate' => $senddate,
        'views' => $views
    );
    $obj['sortrank'] = AddDay($senddate, 0);
    $obj['music'] = $music;

    return $obj;
}

/**
 * Creates a 300x300 thumbnail of an image and applies the watermark
 * ./upload/cklogo.png through ImageTool.
 *
 * @param string $file Full file name of the image.
 * @return string Name returned by ImageTool::makeThumb().
 */
function makeThumb($file) {
    $image_tool = new ImageTool($file);
    $new_name = $image_tool->makeThumb(300, 300);
    $image_tool->waterMark('./upload/cklogo.png');
    return $new_name;
}

/**
 * Rewrites every <img> tag of an HTML fragment through noHttp() and reports
 * the image tags of the rewritten fragment.
 *
 * To download the images instead of linking to them, use 'savePic' as the
 * callback in place of 'noHttp'.
 *
 * @param string $html_str HTML fragment; modified in place.
 * @param array $result Receives the preg_match_all() matches of the image
 *                      pattern on the rewritten fragment.
 * @return void
 */
function saveImages(& $html_str, & $result) {
    $reg = '/<img.+src\s*=\s*[\'"]\s*(.+\.(\w{3,5}))\s*[\'"].*>/iU';
    $html_str = preg_replace_callback($reg, 'noHttp', $html_str);
    preg_match_all($reg, $html_str, $result);
}

/**
 * preg_replace_callback() handler: makes the image URL absolute and returns
 * a normalised, centred <img> tag.
 *
 * @param array $matches Match of the image pattern; index 1 is the URL.
 * @return string Replacement markup.
 */
function noHttp($matches) {
    $image_url = $matches[1];
    /* Anything that is not absolute or protocol-relative belongs to the remote site. */
    if (!preg_match('#^(https?:)?//#i', $image_url)) {
        $image_url = $GLOBALS['prefix'] . '/' . ltrim($image_url, '/');
    }
    return '<center><img src="' . $image_url . '" style="max-width:680px;"/></center>';
}

/**
 * preg_replace_callback() handler: downloads the image and returns an <img>
 * tag pointing at the local copy, or at a placeholder when the download
 * fails.
 *
 * @param array $matches Match of the image pattern; index 1 is the URL,
 *                       index 2 the file extension.
 * @return string Replacement markup.
 */
function savePic($matches) {
    $error_img = '/upload/yimo.png';
    $filename = uniqid() . '.' . $matches[2];
    if (downloadPicture($matches[1], './uploads/cj/', $filename)) {
        $src = 'http://eshop.cc/uploads/cj/' . $filename;
    } else {
        $src = $error_img;
    }
    return "<img style=\"max-width:500px\" src=\"$src\"/>";
}

/**
 * Downloads a remote image into a local directory.
 *
 * @param string $image_url Address of the image; relative addresses are
 *                          resolved against the remote site root.
 * @param string $save_path Directory to store the image in; created when
 *                          missing.
 * @param string $filename Target file name; 'rand' generates a unique one.
 * @return int|false Number of bytes written, or false on failure.
 */
function downloadPicture($image_url, $save_path, $filename = 'rand') {
    /*
     * strpos() returns 0 for a URL that starts with "http://", so a plain
     * truthiness test would treat absolute URLs as relative.
     */
    if (!preg_match('#^https?://#i', $image_url)) {
        $image_url = $GLOBALS['prefix'] . '/' . ltrim($image_url, '/');
    }

    /* Normalise the directory: no trailing slash or backslash. */
    $save_path = rtrim($save_path, '/\\');

    if (!is_dir($save_path)) {
        /* A mode of 0 would create directories nobody can enter. */
        if (!mkdir($save_path, 0755, true)) {
            die('创建目录失败');
        }
    }

    if ($filename === 'rand') {
        $filename = uniqid();
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $image_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_COOKIE, 'juexiangssid=acnjkmock336p1ofl7sbgf2p97');
    curl_setopt($ch, CURLOPT_REFERER, 'http://www.juexiang.com/');
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36');
    $img = curl_exec($ch);
    curl_close($ch);
    unset($ch);

    /* Do not leave an empty file behind when the request failed. */
    if ($img === false) {
        return false;
    }

    return file_put_contents($save_path . DIRECTORY_SEPARATOR . $filename, $img);
}
0 0
- 基于dede的一个自定义采集器
- dede dede:arclist 自定义字段的调用
- QueryList一个基于phpQuery的无比强大的采集工具
- 基于Mina实现的一个简单数据采集中间件
- 基于高并发的数据采集器
- TCollector基于opentsdb的采集器
- [dede应用]采集过滤
- 织梦dede采集教程
- DEDE采集教程
- DEDE采集教程
- dede采集正则过滤
- dede采集教程
- dede采集过滤规则
- dede采集教程
- dede:list调用自定义属性和 dede:arclist的区别
- dede的采集不好用 火车头2010才是王道
- 火车头locoysp3采集器的一个bug
- 一个基于UIViewController的高度自定义TabBarController
- 蓝懿ios 技术内容交流和学习心得 11.20
- 我的linux笔记
- 【java】匿名内部类
- Eclipse下如何安装genymotion
- Basic CalculatorII
- 基于dede的一个自定义采集器
- Android Resource篇--- II 访问资源文件
- 提问的智慧
- 苹果Mac安装Dr.com
- STL中set的相关运用
- hdu 1856 并查序 特殊优化
- python初级了解
- 我的dubbo学习笔记
- [leetcode] 149. Max Points on a Line