asp采集器技术分析与源代码

来源:互联网 发布:c语言实验正弦信号 编辑:程序博客网 时间:2024/06/05 10:11

 最近新开发了一个asp的采集器,简单说一下功能方面的分析吧。

第一、采集器的原理

        网络上常见的采集器有很多重,有开发成熟一些的都是带后台的,可以直接生成采集文件直接将采集回来的内容入库的。你想想比如你运营一个网站的话每天都花很多时间去录资料,想想就觉得头疼。那么在仔细想想,采集信息在哪采集?

       有的朋友直接想说当然是在别人的网站上,实际上是采集别人的网页内容。

 

第二、采集器的制作步骤

 

       上面说明了采集器就是采集别人的网页内容,那么你首先要做的就是搜索一下“asp返回网页内容源代码”,如果你会写这样的程序那就没有搜索的必要了,接下来就是网页内容处理了,你把别人网页整页的源代码都返回来了,那么你一定很头疼,因为你想要得到的只是文章的标题和文章的内容吧,所以这就需要把你想要的部份提出来,或你直接把没用的代码过滤掉也是可以的,你可以在网站搜“asp截取指定位置内容”如果有了这个函数,你就可以轻松采集别人网站上的信息了!

 

 

第三、源代码

        说了这么多还是拿点实际的出来吧,要不大家该说,你小子到底会不会呀,呵呵,说句实话,我就是写了一个带后台的采集器,发一个生成出来的采集程序的代码吧,上面提到的函数这里都有,你自己挑吧.

<%

'绝对原创-极限风暴制作
Dim DB,Connstr,Conn
DB="Databases/qaz537zjkj123.mdb"
On Error Resume Next
Connstr="DBQ="+Server.MapPath(""&DB&"")+";Driver={Microsoft Access Driver (*.mdb)};"
Set Conn=Server.CreateObject("ADODB.Connection")
 Conn.Open Connstr
If Err Then
Err.Clear
Response.Write("数据库连接出错!")
Response.End()
End if
Function GetURL(URL)
Set http=Server.CreateObject("Microsoft.XMLHTTP")
 On Error Resume Next
 http.Open "GET",URL,False
 http.send()
 if Err then
 Err.Clear
 Response.Write("没有找到网页!")
 Response.End()
 End if
 getHTTPPage=bytesToBSTR(Http.responseBody,"gb2312")
set http=nothing
GetURL=getHTTPPage
End Function
Function BytesToBstr(body,Cset)
dim objstream
set objstream = Server.CreateObject("adodb.stream")
objstream.Type = 1
objstream.Mode =3
objstream.Open
objstream.Write body
objstream.Position = 0
objstream.Type = 2
objstream.Charset = Cset
BytesToBstr = objstream.ReadText
objstream.Close
set objstream = nothing
End Function
Function Html(StrText)
StrText = Replace(StrText,"&lt;","<")
StrText = Replace(StrText,"&quot;","")
StrText = Replace(StrText,"&gt;",">")
StrText = Replace(StrText,"&amp;","&")
StrText = Replace(StrText,"&#183;","·")
Html=StrText
End Function
Function RemoveHTML(strHTML)
StrHtml = Replace(StrHtml,vbCrLf,"")
StrHtml = Replace(StrHtml,Chr(13)&Chr(10),"")
StrHtml = Replace(StrHtml,Chr(13),"")
StrHtml = Replace(StrHtml,Chr(10),"")
StrHtml = Replace(StrHtml,"&nbsp;","")
StrHtml = Replace(StrHtml,"    ","")
 Dim objRegExp, Match, Matches 
 Set objRegExp = New Regexp 
 objRegExp.IgnoreCase = True
 objRegExp.Global = True
 objRegExp.Pattern = "<style(.+?)/style>"
 Set Matches = objRegExp.Execute(strHTML) 
 For Each Match in Matches 
 strHtml=Replace(strHTML,Match.Value,"")
 Next
 objRegExp.Pattern = "<script(.+?)/script>" 
 Set Matches = objRegExp.Execute(strHTML)
 For Each Match in Matches 
 strHtml=Replace(strHTML,Match.Value,"")
 Next
 objRegExp.Pattern = "<title(.+?)/title>"
 Set Matches = objRegExp.Execute(strHTML)  
 For Each Match in Matches 
 strHtml=Replace(strHTML,Match.Value,"")
 Next
 objRegExp.Pattern = "<!--(.+?)-->"
 Set Matches = objRegExp.Execute(strHTML)
 For Each Match in Matches 
 strHtml=Replace(strHTML,Match.Value,"")
 Next
 objRegExp.Pattern = "<.+?>"
 Set Matches = objRegExp.Execute(strHTML) 
 For Each Match in Matches 
 strHtml=Replace(strHTML,Match.Value,"")
 Next
 RemoveHTML=strHTML
 Set objRegExp = Nothing
End Function
Function CreateMDB(DBPach)
DBName = Server.MapPath(DBPach)
Set myFso=Server.CreateObject("Scripting.FileSystemObject")
If myFso.FileExists(DBName) Then
CreateMDB=DBPach&"文件已存在!"
Exit Function
End if
Set myCreate = Server.CreateObject( "ADOX.Catalog" )
 myCreate.Create("Provider=Microsoft.Jet.OLEDB.4.0;Data Source="&DBName)
Set myCreate=NotHing
Set myFso=NotHing
End Function
Function SearchURL(strng)
Set regEx=New RegExp  
patrn="(?:/[[^/]]+/]([^/[]+)/[//[^/]]+/])|(?:(?:href|src)=([^/s|^>]+)[""|/s/'])"
regEx.Pattern=patrn  
regEx.IgnoreCase=True  
regEx.Global=True  
strng=content
Set Matches=regEx.Execute(strng)  
z=0
For Each Match in Matches '//遍历匹配集合
if Checkword(match.value)=False Then
Search=StringText(match.value)
RetStr=RetStr&Search
RetStr=RetStr&"|"
End if
Next
SearchURL=RetStr
End Function
Function Checkword(words)
Const Invaildwords=".css|.js|.gif|.jpg|.jpeg|.png|.ico|javascript:|mailto:"
Checkword=True
Invaildword=Split(Invaildwords,"|")
inwords=LCase(Trim(words))
For i=LBound(Invaildword) To UBound(Invaildword)
If Instr(inwords,Invaildword(i))>0 Then
Checkword=True
Exit Function
End If
Next
Checkword=False
End Function
Function StringText(Content)
If Len(Content)<>0 Then
Content = Replace(Content,"HREF=","")
Content = Replace(Content,"SRC=","")
Content = Replace(Content,"href=""","")
Content = Replace(Content,"Src=""","")
Content = Replace(Content,"""","")
Content = Replace(Content,"href='","")
Content = Replace(Content,"Src='","")
Content = Replace(Content,"'","")
Content = Replace(Content,"Src=","")
Content = Replace(Content,"src=","")
Content = Replace(Content,"href=","")
Content = Replace(Content,";","")
Content = Replace(Content,"#","")
Content = Replace(Content,"+","")
End if
StringText=Content
End Function
Function strCut(strContent,StartStr,EndStr,CutType)
    Dim strHtml,S1,S2
    strHtml = strContent
    On Error Resume Next
    Select Case CutType
    Case 1
        S1 = InStr(strHtml,StartStr)
        S2 = InStr(S1,strHtml,EndStr)+Len(EndStr)
    Case 2
        S1 = InStr(strHtml,StartStr)+Len(StartStr)
        S2 = InStr(S1,strHtml,EndStr)
    End Select
    If Err Then
        strCute = "<p align='center'>没有找到需要的内容。</p>"
        Err.Clear
        Exit Function
    Else
        strCut = Mid(strHtml,S1,S2-S1)
    End If
End Function
Function CreateFileName(Ex)
Str=year(now())
month(now())
day(now())
Hour(now())
Minute(now())
Second(now())
CreateFileName=Str&Ex
End Function
Function SplitURL(URL)
On Error Resume Next
StrURL=Split(URL,"/")
SplitURL=Replace(URL,StrURL(Clng(UBound(StrURL))),"")
If Err Then
Err.Clear
SplitURL=URL
End If
End Function
Domain=Trim(Request("Domain"))
If Domain<>"" Then
ContentStr="<!--正文开始-->"
ContentEnd="<!--关于内容的操作开始-->"
StrType=2
TitleStarStr="<div id=""artibodyTitle"">"
TitleEndStr="<div class=""from_info"">"
TitleStrType=2
ContentStarStr="<!--正文内容开始-->"
ContentEndStr="<!--正文内容结束-->"
ContentStrType=2
getHTTPPage=GetURL(Domain)
PageContent=strCut(getHTTPPage,ContentStr,ContentEnd,StrType)
CreateTitle=Trim(RemoveHTML(strCut(PageContent,TitleStarStr,TitleEndStr,TitleStrType)))
CreateContent=Trim(strCut(getHTTPPage,ContentStarStr,ContentEndStr,ContentStrType))
If CreateTitle<>"" Then
Set Rs1=Server.CreateObject("ADODB.Recordset")
 Sql1="Select * From News Order By OrderID Desc"
 Rs1.Open Sql1,Conn,1,3
 MaxID=Rs1("OrderID")
 Rs1.Close
Set Rs1=NotHing
Set Rs=Server.CreateObject("ADODB.Recordset")
 Sql="Select * From News Where Title='"&CreateTitle&"'"
 Rs.Open Sql,Conn,1,3
 If Rs.Eof Then
 Rs.AddNew
 Rs("Title")=Replace(CreateTitle,"endMain"">","")
 Rs("Content")=CreateContent
 Rs("BigClassName")="综合新闻"
 Rs("SmallClassName")="娱乐新闻"
 Rs("Hits")=0
 Rs("UpdateTime")=Now()
 Rs("FTitle")=""
 Rs("Elite")=True
 Rs("Passed")=True
 Rs("IncludePic")=True
 Rs("OrderID")=MaxID+1
 Rs.Update
 End If
 Rs.Close
Set Rs=notHing
End if
Response.Write(CreateTitle)
Response.Write(CreateContent)
End If
%>

原创粉丝点击