asp采集器技术分析与源代码
来源:互联网 发布:c语言实验正弦信号 编辑:程序博客网 时间:2024/06/05 10:11
最近新开发了一个asp的采集器,简单说一下功能方面的分析吧。
第一、采集器的原理
网络上常见的采集器有很多重,有开发成熟一些的都是带后台的,可以直接生成采集文件直接将采集回来的内容入库的。你想想比如你运营一个网站的话每天都花很多时间去录资料,想想就觉得头疼。那么在仔细想想,采集信息在哪采集?
有的朋友直接想说当然是在别人的网站上,实际上是采集别人的网页内容。
第二、采集器的制作步骤
上面说明了采集器就是采集别人的网页内容,那么你首先要做的就是搜索一下“asp返回网页内容源代码”,如果你会写这样的程序那就没有搜索的必要了,接下来就是网页内容处理了,你把别人网页整页的源代码都返回来了,那么你一定很头疼,因为你想要得到的只是文章的标题和文章的内容吧,所以这就需要把你想要的部份提出来,或你直接把没用的代码过滤掉也是可以的,你可以在网站搜“asp截取指定位置内容”如果有了这个函数,你就可以轻松采集别人网站上的信息了!
第三、源代码
说了这么多还是拿点实际的出来吧,要不大家该说,你小子到底会不会呀,呵呵,说句实话,我就是写了一个带后台的采集器,发一个生成出来的采集程序的代码吧,上面提到的函数这里都有,你自己挑吧.
<%
'绝对原创-极限风暴制作
Dim DB,Connstr,Conn
DB="Databases/qaz537zjkj123.mdb"
On Error Resume Next
Connstr="DBQ="+Server.MapPath(""&DB&"")+";Driver={Microsoft Access Driver (*.mdb)};"
Set Conn=Server.CreateObject("ADODB.Connection")
Conn.Open Connstr
If Err Then
Err.Clear
Response.Write("数据库连接出错!")
Response.End()
End if
Function GetURL(URL)
Set http=Server.CreateObject("Microsoft.XMLHTTP")
On Error Resume Next
http.Open "GET",URL,False
http.send()
if Err then
Err.Clear
Response.Write("没有找到网页!")
Response.End()
End if
getHTTPPage=bytesToBSTR(Http.responseBody,"gb2312")
set http=nothing
GetURL=getHTTPPage
End Function
Function BytesToBstr(body,Cset)
dim objstream
set objstream = Server.CreateObject("adodb.stream")
objstream.Type = 1
objstream.Mode =3
objstream.Open
objstream.Write body
objstream.Position = 0
objstream.Type = 2
objstream.Charset = Cset
BytesToBstr = objstream.ReadText
objstream.Close
set objstream = nothing
End Function
Function Html(StrText)
StrText = Replace(StrText,"<","<")
StrText = Replace(StrText,""","")
StrText = Replace(StrText,">",">")
StrText = Replace(StrText,"&","&")
StrText = Replace(StrText,"·","·")
Html=StrText
End Function
Function RemoveHTML(strHTML)
StrHtml = Replace(StrHtml,vbCrLf,"")
StrHtml = Replace(StrHtml,Chr(13)&Chr(10),"")
StrHtml = Replace(StrHtml,Chr(13),"")
StrHtml = Replace(StrHtml,Chr(10),"")
StrHtml = Replace(StrHtml," ","")
StrHtml = Replace(StrHtml," ","")
Dim objRegExp, Match, Matches
Set objRegExp = New Regexp
objRegExp.IgnoreCase = True
objRegExp.Global = True
objRegExp.Pattern = "<style(.+?)/style>"
Set Matches = objRegExp.Execute(strHTML)
For Each Match in Matches
strHtml=Replace(strHTML,Match.Value,"")
Next
objRegExp.Pattern = "<script(.+?)/script>"
Set Matches = objRegExp.Execute(strHTML)
For Each Match in Matches
strHtml=Replace(strHTML,Match.Value,"")
Next
objRegExp.Pattern = "<title(.+?)/title>"
Set Matches = objRegExp.Execute(strHTML)
For Each Match in Matches
strHtml=Replace(strHTML,Match.Value,"")
Next
objRegExp.Pattern = "<!--(.+?)-->"
Set Matches = objRegExp.Execute(strHTML)
For Each Match in Matches
strHtml=Replace(strHTML,Match.Value,"")
Next
objRegExp.Pattern = "<.+?>"
Set Matches = objRegExp.Execute(strHTML)
For Each Match in Matches
strHtml=Replace(strHTML,Match.Value,"")
Next
RemoveHTML=strHTML
Set objRegExp = Nothing
End Function
Function CreateMDB(DBPach)
DBName = Server.MapPath(DBPach)
Set myFso=Server.CreateObject("Scripting.FileSystemObject")
If myFso.FileExists(DBName) Then
CreateMDB=DBPach&"文件已存在!"
Exit Function
End if
Set myCreate = Server.CreateObject( "ADOX.Catalog" )
myCreate.Create("Provider=Microsoft.Jet.OLEDB.4.0;Data Source="&DBName)
Set myCreate=NotHing
Set myFso=NotHing
End Function
Function SearchURL(strng)
Set regEx=New RegExp
patrn="(?:/[[^/]]+/]([^/[]+)/[//[^/]]+/])|(?:(?:href|src)=([^/s|^>]+)[""|/s/'])"
regEx.Pattern=patrn
regEx.IgnoreCase=True
regEx.Global=True
strng=content
Set Matches=regEx.Execute(strng)
z=0
For Each Match in Matches '//遍历匹配集合
if Checkword(match.value)=False Then
Search=StringText(match.value)
RetStr=RetStr&Search
RetStr=RetStr&"|"
End if
Next
SearchURL=RetStr
End Function
Function Checkword(words)
Const Invaildwords=".css|.js|.gif|.jpg|.jpeg|.png|.ico|javascript:|mailto:"
Checkword=True
Invaildword=Split(Invaildwords,"|")
inwords=LCase(Trim(words))
For i=LBound(Invaildword) To UBound(Invaildword)
If Instr(inwords,Invaildword(i))>0 Then
Checkword=True
Exit Function
End If
Next
Checkword=False
End Function
Function StringText(Content)
If Len(Content)<>0 Then
Content = Replace(Content,"HREF=","")
Content = Replace(Content,"SRC=","")
Content = Replace(Content,"href=""","")
Content = Replace(Content,"Src=""","")
Content = Replace(Content,"""","")
Content = Replace(Content,"href='","")
Content = Replace(Content,"Src='","")
Content = Replace(Content,"'","")
Content = Replace(Content,"Src=","")
Content = Replace(Content,"src=","")
Content = Replace(Content,"href=","")
Content = Replace(Content,";","")
Content = Replace(Content,"#","")
Content = Replace(Content,"+","")
End if
StringText=Content
End Function
Function strCut(strContent,StartStr,EndStr,CutType)
Dim strHtml,S1,S2
strHtml = strContent
On Error Resume Next
Select Case CutType
Case 1
S1 = InStr(strHtml,StartStr)
S2 = InStr(S1,strHtml,EndStr)+Len(EndStr)
Case 2
S1 = InStr(strHtml,StartStr)+Len(StartStr)
S2 = InStr(S1,strHtml,EndStr)
End Select
If Err Then
strCute = "<p align='center'>没有找到需要的内容。</p>"
Err.Clear
Exit Function
Else
strCut = Mid(strHtml,S1,S2-S1)
End If
End Function
Function CreateFileName(Ex)
Str=year(now())
month(now())
day(now())
Hour(now())
Minute(now())
Second(now())
CreateFileName=Str&Ex
End Function
Function SplitURL(URL)
On Error Resume Next
StrURL=Split(URL,"/")
SplitURL=Replace(URL,StrURL(Clng(UBound(StrURL))),"")
If Err Then
Err.Clear
SplitURL=URL
End If
End Function
Domain=Trim(Request("Domain"))
If Domain<>"" Then
ContentStr="<!--正文开始-->"
ContentEnd="<!--关于内容的操作开始-->"
StrType=2
TitleStarStr="<div id=""artibodyTitle"">"
TitleEndStr="<div class=""from_info"">"
TitleStrType=2
ContentStarStr="<!--正文内容开始-->"
ContentEndStr="<!--正文内容结束-->"
ContentStrType=2
getHTTPPage=GetURL(Domain)
PageContent=strCut(getHTTPPage,ContentStr,ContentEnd,StrType)
CreateTitle=Trim(RemoveHTML(strCut(PageContent,TitleStarStr,TitleEndStr,TitleStrType)))
CreateContent=Trim(strCut(getHTTPPage,ContentStarStr,ContentEndStr,ContentStrType))
If CreateTitle<>"" Then
Set Rs1=Server.CreateObject("ADODB.Recordset")
Sql1="Select * From News Order By OrderID Desc"
Rs1.Open Sql1,Conn,1,3
MaxID=Rs1("OrderID")
Rs1.Close
Set Rs1=NotHing
Set Rs=Server.CreateObject("ADODB.Recordset")
Sql="Select * From News Where Title='"&CreateTitle&"'"
Rs.Open Sql,Conn,1,3
If Rs.Eof Then
Rs.AddNew
Rs("Title")=Replace(CreateTitle,"endMain"">","")
Rs("Content")=CreateContent
Rs("BigClassName")="综合新闻"
Rs("SmallClassName")="娱乐新闻"
Rs("Hits")=0
Rs("UpdateTime")=Now()
Rs("FTitle")=""
Rs("Elite")=True
Rs("Passed")=True
Rs("IncludePic")=True
Rs("OrderID")=MaxID+1
Rs.Update
End If
Rs.Close
Set Rs=notHing
End if
Response.Write(CreateTitle)
Response.Write(CreateContent)
End If
%>
- asp采集器技术分析与源代码
- 常用网络流量采集技术分析
- 图形建模, 矢量图编辑, 科学分析VC++源代码, 数据采集与显示组件库, 软件, VC++源代码
- 数据采集与反采集原理分析
- asp.net采集函数(采集、分析、替换、入库)
- asp.net采集函数(采集、分析、替换、入库一体)
- asp.net采集函数(采集、分析、替换、入库)
- asp.net采集函数(采集、分析、替换、入库一体)
- ASP.NET截取字符串函数(用于网页内容分析与采集
- 机器人技术—基于DELPHI的数据采集与分析类上位机软件的编写
- Asp木马技术分析
- 使用源代码分析工具分析ASP源代码的方法
- 巧用ASP技术保护DHTML源代码
- 巧用ASP技术保护DHTML源代码
- 优酷专辑采集与分析
- 房屋价格数据采集与分析
- 新计划--天气数据采集与分析
- ASP.NET编程技术-在ASP.NET下用Microsoft Excel进行数据分析与报表
- Delphi 让程序只运行一次
- [VIM技巧]global命令详解
- HQL的使用-part2 hql的查询(4)
- 执行触发器失败 :-6519:ORA-06519: 检测到活动的自治事务处理, 已经回退
- 如何反注册DLL文件
- asp采集器技术分析与源代码
- 关于指针的判空问题
- Run-Time Check Failure #2 - Stack around the variable 'var' was corrupted
- cvs
- 以色列科技进步的启示:公民智慧成最大的资源
- 我在08年的最后一天,,,
- 网站开发规范
- 软件工程师日语---仕様書2
- 使用xampp安装mantis