blog search in Perl 量化的角度看blog
来源:互联网 发布:免费发货单打印软件 编辑:程序博客网 时间:2024/09/21 09:23
###################################################################
#
#
# this perl script is written by Zui Tao (wineinceramic@163.com)
# It is used to claw blog page url from the internet and store them into a local
# MS access database. though this script is simple and strait forward,
# blog itself as linked by the owner's favor may indecate a kind of
# relationship that help to form a map and show us a whole view of ciber community
# farther research would be using some data mining algorithm such as apriori....
#
#
#
# Though it is succeed in clawing a minimun of blog links , it should be upgraded to
# be a serious research。This script could be copied and rewirte at anyone's will.
#
#
# 无论如何,初始化第一个开始的URL是比较重要的一个线索
# 这里本人选择在sina的blog里人气比较热的http://blog.sina.com.cn/m/xiaojingzi来作为初始化URL
# 目的在于研究blog作为一个媒介的联系关系,研究其间人际的网络是如何铺展的
#
#
# 3-15-2006
#
###################################################################
#read start index and total number and search ranks of links from config file
$index;
$totalNumber;
$ranks;
$iHASH = 'NUN';
use FileHandle;
$fh = new FileHandle "config.txt", "r";
if(defined $fh)
{
$s = $fh->getline();
$index = 0 + $s;
$s = $fh->getline();
$totalNumber = 0 + $s;
$s = $fh->getline();
$ranks = 0 + $s;
undef $fh;
}
#store total number to current number
$currentTotalNumber = $totalNumber;
#open database first
use Win32::ODBC;
$dsn = "testDB";
$table = "DATASTORE";
# ====== Check if the database can be opened correctly
if (!($db = new Win32::ODBC($dsn))) {
print "Error in opening DSN /"$dsn/"!/n";
print "Error: " . Win32::ODBC::Error(). "/n";
exit;
}
#begin search loop
#if search ranks exist and total - pre number > search ranks than quit
while(($currentTotalNumber - $totalNumber) < $ranks)
{
# ====== Select all fields from the given table
$sql = "select * from $table where INDEXNUM = $index;";
#read url from database by the index
if ($db->Sql($sql)) {
print "Error in SQL query: /"$sql/"!/n";
print "Error: " . $db->Error() . "/n";
$db->Close();
exit;
}
while($db->FetchRow()) {
($HASH, $URL) = $db->Data("HASHCODE", "URL");
}
print $URL ,"/n";
#begin to get each link
use LWP::UserAgent;
use FileHandle;
$ua = LWP::UserAgent->new;
my $req = HTTP::Request->new(GET => $URL);
$res = $ua->request($req, "tempHTMLStoreFile.txt");
if ($res->is_success)
{
print "ok/n";
use HTML::TreeBuilder;
my $tree = HTML::TreeBuilder->new();
$tree->parse_file("tempHTMLStoreFile.txt");
foreach my $a ($tree->look_down('_tag', 'a',))
{
#$tag = $a->tag;
if(defined $a)
{
$hrefvalue = $a->attr('href');
if(defined $hrefvalue)
{
$hrefvalue = lc $hrefvalue;
#pre check if this link is a blog link simply to see if it contain the word 'blog'
###################################################################
#
#this section is used for filtering some names that supposed not
#to be blog urls ,some may be image links
#
####################################################################
$_ = $hrefvalue;
/blog/;
if(!defined($&))
{
next; #only the url contain blog can go farther
}
$_ = $hrefvalue;
/image/;
if($& eq 'image')
{
next; #only the url contain blog can go farther
}
$_ = $hrefvalue;
//'/;
if($& eq '/'')
{
next; #only the url contain blog can go farther
}
#pre check if this link is a blog link simply to see if it contain the word 'http://'
$_ = $hrefvalue;
/http://///;
######################################################################
if($& eq 'http://')
{
#try to store this url here and increment the currentTotalNumber
#.................................
#.................................
print $hrefvalue,"/n";
use Digest::MD5 qw(md5_hex);
$hashcode = md5_hex($hrefvalue);
$sqlLink = "select * from $table where HASHCODE = '$hashcode';";
if ($db->Sql($sqlLink))
{
print "Error in SQL query: /"$sql/"!/n";
print "Error: " . $db->Error() . "/n";
$db->Close();
exit;
}
if(defined $iHASH)
{
$iHASH = 'NUN';
}
while($db->FetchRow())
{
($iHASH,$iLINK) = $db->Data("HASHCODE","LINKNO");
}
print $iHASH ,"/n";
#if links exist add the index to its link field
#else store the links and total number increment 1
if($iHASH eq 'NUN')#store the href
{
$iStr = sprintf("%d",$index);
$currentTotalNumber = $currentTotalNumber + 1;
$sqlinsert = "INSERT INTO DATASTORE VALUES ($currentTotalNumber,'$hashcode','$hrefvalue', '$iStr')";
$rc = $db->Sql($sqlinsert);
die qq(SQL 失败 "$sqlinsert": ), $db->Error(), qq(n) if $rc;
}
else #save the index
{
$iStr = sprintf("%d",$index);
$iLINK = join('',$iLINK,", ",$iStr);
$sqlupdate = "UPDATE DATASTORE SET LINKNO = '$iLINK' WHERE HASHCODE = '$iHASH'";
$rc = $db->Sql($sqlupdate);
die qq(SQL 失败 "$sqlupdate": ), $db->Error(), qq(n) if $rc;
}
}#if(defined($&))
}#if(defined $hrefvalue)
}#if(defined $a)
}#foreach my $a ($tree->look_down('_tag', 'a',))
$tree->delete;
}#if ($res->is_success)
else
{
print $res->status_line, "/n";
}
#end getting each links
#start index increment
$index = $index + 1;
#store current index and total number in to config file
use IO::File;
$filehdler = new IO::File " > config.txt";
if (defined $filehdler) {
print $filehdler $index ,"/n";
print $filehdler $currentTotalNumber ,"/n";
print $filehdler $ranks ,"/n";
$filehdler->close;
}
}
#end search loop
#close database
$db->Close();
print $currentTotalNumber,"/n";
- blog search in Perl 量化的角度看blog
- 看各家的blog
- 看师兄的blog有感!
- 看别人的blog有感
- Quickly Search Articles in My Blog - 快速搜索本专栏内的文章
- 041213看candy的blog有感!
- 看了好一歇别人的 blog
- 推荐程序员看的一些BLOG
- 我看blog
- 待看blog
- perl 中文手册 and blog
- 从jvm管理角度看java类的静态属性和静态方法----------------转自http://ruixin.iteye.com/blog/897171
- blog??
- blog
- blog
- blog
- blog
- blog
- 日常记录
- 成才之道
- Lack of self-confidence
- 大学时代看系统分析员
- 王怡推荐的自由主义书单
- blog search in Perl 量化的角度看blog
- 【协议】TCP/IP基础及详解
- 新的准备实施的CSDN用户激活制度:
- 在frame里的页面的关闭方法
- windows自定义消息
- 金山为什么永远做不到颠峰
- __osplatform 已经在 atlmincrt.lib(atlinit.obj) 中定义解决方法
- 用SQL语句判断数据库中的记录是否存在
- windows下修改使用的JDK版本的方法