采集

来源:互联网 发布:淘宝 炉石卡组 编辑:程序博客网 时间:2024/04/29 19:18

采集时,如果 img 标签和文字结合在一起,例如:

<a href="news.html"><img src="pic.jpg">新闻标题</a>

那么,如果你在采集过程中只需要标题或文字、不需要图片,就需要把图片标签过滤掉。

<[^<>]*>

1. ^ 在方括号内表示"非"(取反)

2. < > 表示标签两端的尖括号

3. * 表示前面的字符可以重复任意多次

因此可以完美解决这样的问题!

注意采集网页的文字编码!如果页面不是 UTF-8 编码,应该用 iconv 进行转换:

iconv(in_charset, out_charset, str)

in_charset:转换前(原始)的文字编码

out_charset:转换后(目标)的文字编码

str:要转换的字符串

如采集"http://sports.sohu.com/nba.shtml";

<?php
/**
 * Scrapes the headline column of the Sohu NBA page and stores one row per
 * headline in the `sport` table (columns: h4, h3, content, img).
 *
 * Steps:
 *   1. Fetch the page with Snoopy and convert it from GBK to UTF-8.
 *   2. Narrow the HTML down to the "heavyColumn" block with two regexes.
 *   3. Extract paragraph text, h4 titles, h3 titles and image URLs.
 *   4. Merge the four lists by position and insert them with a prepared
 *      statement.
 *
 * The script stops with a message as soon as a step yields nothing, instead
 * of continuing with undefined match arrays.
 */
include "Snoopy.class.php";

$url = "http://sports.sohu.com/nba.shtml";

$sp = new Snoopy;
$sp->fetch($url);
$html = $sp->results;
if (!is_string($html) || $html === '') {
    exit("Failed to fetch {$url}\n");
}

// The page is served as GBK; everything downstream (regexes, DB) expects UTF-8.
$html = iconv('GBK', 'utf-8', $html);
if ($html === false || $html === '') {
    exit("Failed to convert the page from GBK to UTF-8\n");
}

// First cut: from the headline column up to the media list.
$reg = '#<div style="HEIGHT: 290px; OVERFLOW: hidden" id="columnID" class="heavyColumn">.*<div id="media-list" class="media-list clear">#isU';
if (!preg_match($reg, $html, $content)) {
    exit("Headline column not found; the page layout may have changed\n");
}

// Second cut: keep only the part up to the first clearing div.
$reg1 = '#<div style="HEIGHT: 290px; OVERFLOW: hidden" id="columnID" class="heavyColumn">.*<div class="clear">#isU';
if (!preg_match($reg1, $content[0], $arr)) {
    exit("Headline block not found; the page layout may have changed\n");
}

$reg2 = '#<p>(.*)<a onFocus=".*" href=".*" target=".*">.*</a></p>#isU';
$reg3 = '#<h4><a onFocus=".*" .* href=".*" target=".*">(.*)</a></h4>#isU';
$reg4 = '#<h3><a onFocus=".*" href=".*" target=".*">(.*)</a></h3>#isU';
$reg5 = '#<a onFocus=".*" .* href=".*" target=".*"><img .* src="(.*)" .*></a>#isU';

preg_match_all($reg2, $arr[0], $arr2); // paragraph text
preg_match_all($reg3, $arr[0], $arr3); // h4 titles (the original ran $reg2 here by mistake)
preg_match_all($reg4, $arr[0], $arr4); // h3 titles
preg_match_all($reg5, $arr[0], $arr5); // image URLs

/* Optional: download every matched image into ../image/.
foreach ($arr5[1] as $key => $va) {
    $str = file_get_contents($va); // fetch the image bytes
    $exc = substr($va, strrpos($va, "."));
    $filename = '../image/'.time().rand(1000,9999).$exc;
    file_put_contents($filename, $str);
}*/

// Merge the four match lists by position: row N gets the N-th match of each
// pattern. The lists may differ in length, so a row can lack some fields.
$fields = array(
    'content' => $arr2[1],
    'h4'      => $arr3[1],
    'h3'      => $arr4[1],
    'img'     => $arr5[1],
);
$rows = array();
foreach ($fields as $name => $values) {
    foreach ($values as $i => $value) {
        $rows[$i][$name] = $value;
    }
}

$db = new PDO("mysql:host=127.0.0.1;dbname=exam", "root", "root");
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$db->exec("set names utf8");

// Scraped HTML is untrusted input: bind it as parameters instead of
// concatenating it into the SQL text.
$stmt = $db->prepare(
    "insert into sport (h4,h3,content,img) values (:h4, :h3, :content, :img)"
);
foreach ($rows as $row) {
    $stmt->execute(array(
        ':h4'      => isset($row['h4']) ? $row['h4'] : '',
        ':h3'      => isset($row['h3']) ? $row['h3'] : '',
        ':content' => isset($row['content']) ? $row['content'] : '',
        ':img'     => isset($row['img']) ? $row['img'] : '',
    ));
}
?>

0 0
原创粉丝点击