多线程内容匹配抓取(线程池)

来源：互联网发布：手机淘宝怎么看差评编辑：程序博客网时间：2024/04/30 08:19

多线程内容匹配抓取(2009-03-26 15:36:52)

说明：在线程池控制本线程时，同时使用下面3个全局变量
catch_urllist: TList; 待抓取的URL结构列表
matching_urllist: TList; 符合内容的URL结构列表
error_urllist: TStrings; 不符合内容的URL

*******************************************************************************
unit threadspider;
interface

uses
Classes, Windows, Messages, IdHTTP, ComCtrls, SysUtils, PerlRegEx, SyncObjs;

const
WM_THREADSTART = WM_USER + 100; { 线程启动 }
WM_THREADSTOP = WM_USER + 101; { 一次抓取完毕 }
WM_THREADGETSTOP = WM_USER + 102; { 线程结束 }

type
TUrlInfo = record
url: string[100]; //网址
count: Integer; //累计次数
end;

PMatchInfo = ^TMatchInfo;
TMatchInfo = record
url: array [0..100] of Char; //网址
Title: array [0..100] of Char; //标题
count: Integer; //累计次数
end;

TThreadSpider = class(TThread)
private
FHandle: THandle;
Fhttp: TIdHTTP;
FTimeOut: Integer;
FLock: TCriticalSection; //数据共享保护
procedure SetName;
function GetHtml(url: string): string;
procedure LinksFromHtml(html: string);
function GetPageTitle(html: string): string;
function ExistFilter(html: string): Boolean;
function ExistInclude(html: string): Boolean;
{ ********************* }
function GetUrlInfo(var UrlInfo: TUrlInfo): Boolean;
function ExistMacthList(url: string): Boolean;
function ExistErrorList(url: string): Boolean;
function ExistCatchList(url: string): Boolean;
procedure AddToMacthList(matchinfo: TMatchInfo);
procedure AddToErrorList(urlinfo: TUrlInfo);
procedure AddToCatchList(url: string);
{ ********************* }
protected
procedure Execute; override;
public
constructor Create(Handle: THandle; TimeOut: Integer);
destructor Destroy; override;
end;

implementation

uses variable;
type
TThreadNameInfo = record
FType: LongWord; // must be 0x1000
FName: PChar; // pointer to name (in user address space)
FThreadID: LongWord; // thread ID (-1 indicates caller thread)
FFlags: LongWord; // reserved for future use, must be zero
end;

{ TThreadSpider }

procedure TThreadSpider.SetName;
var
ThreadNameInfo: TThreadNameInfo;
begin
ThreadNameInfo.FType := $1000;
ThreadNameInfo.FName := 'ThreadSpider';
ThreadNameInfo.FThreadID := $FFFFFFFF;
ThreadNameInfo.FFlags := 0;
try
RaiseException( $406D1388, 0, sizeof(ThreadNameInfo) div sizeof(LongWord), @ThreadNameInfo );
except
end;
end;

procedure TThreadSpider.Execute;
var
html: string;
urlinfo: TUrlInfo;
matchinfo: TMatchInfo;
begin
SetName;
while not Terminated do
begin
//已没有新的URL可采集，则退出，销毁线程
if not GetUrlInfo(urlinfo) then Exit;
html := GetHtml(urlinfo.url);
//用户停止线程
if Terminated then Exit;
if html = '' then
begin
//说明页面无法访问
AddToErrorList(urlinfo);
end
//判断过滤条件
else if (not ExistFilter(html)) and ExistInclude(html) then
begin
//说明此URL符合要求，加至符合要求的LIST中
FillChar(matchinfo, SizeOf(Tmatchinfo), #0);
StrPCopy(matchinfo.url, urlinfo.url);
StrPCopy(matchinfo.Title, GetPageTitle(html));
matchinfo.count := urlinfo.count;
AddToMacthList(matchinfo);
//取该连接下面的URL
LinksFromHtml(html);
end else
begin
//说明页面中不存在相关的内容，废弃
AddToErrorList(urlinfo);
end;
//一次抓取完毕。
SendMessage(FHandle, WM_THREADGETSTOP, 0, 0);
end;
{ Place thread code here }
end;

constructor TThreadSpider.Create(Handle: THandle; TimeOut: Integer);
begin
inherited Create(True);
Fhttp := TIdHTTP.Create(nil);
FLock := TCriticalSection.Create;
Fhttp.ReadTimeout := TimeOut * 1000; //抓取超时
FHandle := Handle;
FTimeOut := TimeOut;
FreeOnTerminate := True;
SendMessage(FHandle, WM_THREADSTART, 0, 0);
end;

destructor TThreadSpider.Destroy;
begin
Fhttp.Disconnect;
Fhttp.Free;
FLock.Free;
SendMessage(FHandle, WM_THREADSTOP, Integer(Self), 0);
inherited Destroy;
end;

function TThreadSpider.GetHtml(url: string): string;
var
html: string;
begin
try
html := Fhttp.Get(url);
except
html := '';
end;
Result := html;
end;

{ 取HTML页面内的URL地址 }
procedure TThreadSpider.LinksFromHtml(html: string);
var
MyRegex: TPerlRegEx;
List: TStrings;
url: string;
I: Integer;
begin
MyRegex := TPerlRegEx.Create(nil);
List := TStringList.Create;
try
MyRegex.RegEx := 'http://(?P<domain>[a-z-A-Z0-9.]+)';
MyRegex.Subject := Html;
if MyRegex.Match then
begin
repeat
url := 'http://' + MyRegex.SubExpressions[1] + '/';
if List.IndexOf(url)<0 then List.Add(url);
until not MyRegex.MatchAgain;
end;
//开始判断URL是否存在和所在区域（已过滤，已匹配）
for i := 0 to List.Count -1 do
begin
url := List.Strings[i];
if ExistMacthList(url) then Continue; // 已匹配
if ExistErrorList(url) then Continue; // 已过滤
{ 如果不存在，就添加到待采集的LIST中 }
if not ExistCatchList(url) then AddToCatchList(url);
end;
finally
List.Free;
MyRegex.Free;
end;
end;

{ 取网页TITLE }
function TThreadSpider.GetPageTitle(html: string): string;
var
MyRegex: TPerlRegEx;
begin
MyRegex := TPerlRegEx.Create(nil);
try
MyRegex.RegEx := '<title>[s ]*(.*?)[s ]*</title>';
MyRegex.Subject := Html;
if MyRegex.Match then
begin
Result := MyRegex.SubExpressions[1];
end;
finally
MyRegex.Free;
end;
end;

//取欲获取HTML的URL结构
function TThreadSpider.GetUrlInfo(var UrlInfo: TUrlInfo): Boolean;
begin
Result := False;
try
FLock.Enter;
if catch_urllist.Count > 0 then
begin
//取最前面的
urlinfo := TUrlInfo(catch_urllist.Items[0]^);
//取得后删除，
FreeMem(catch_urllist.Items[0]);
catch_urllist.Delete(0);
Result := True;
end;
finally
FLock.Leave;
end;
end;

{ 过滤不允许的关键字 }
function TThreadSpider.ExistFilter(html: string): Boolean;
var
i: Integer;
begin
Result := False;
try
FLock.Enter;
for i := 0 to Word_Filter.Count -1 do
begin
if Pos(LowerCase(Word_Filter.Strings[i]), LowerCase(html)) > 0 then {Word_Filter 外部TStringList}
Result := True;
Break;
end;
finally
FLock.Leave;
end;
end;

{ 查询包含的关键字 }
function TThreadSpider.ExistInclude(html: string): Boolean;
var
i: Integer;
begin
Result := False;
try
FLock.Enter;
for i := 0 to Word_Include.Count -1 do
begin
if Pos(LowerCase(Word_Include.Strings[i]), LowerCase(html)) > 0 then {Word_Include 外部TStringList}
Result := True;
Break;
end;
finally
FLock.Leave;
end;
end;

{ 判断是否存在于已符合要求的URL LIST中 }
function TThreadSpider.ExistMacthList(url: string): Boolean;
var
i, count: Integer;
MatchInfo: TMatchInfo;
begin
Result := False;
try
FLock.Enter;
for i := 0 to matching_urllist.Count -1 do
begin
MatchInfo := TMatchInfo(matching_urllist.Items[i]^);
if LowerCase(Trim(MatchInfo.url)) = LowerCase(Trim(url)) then
begin
count := MatchInfo.count + 1;
TMatchInfo(matching_urllist.Items[i]^).count := count;
Result := True;
Break;
end;
end;
finally
FLock.Leave;
end;
end;

{ 判断是否存在于不符合要求的URL LIST中 }
function TThreadSpider.ExistErrorList(url: string): Boolean;
var
i: Integer;
begin
Result := False;
try
FLock.Enter;
for i := 0 to error_urllist.Count -1 do
begin
if LowerCase(error_urllist.Strings[i]) = LowerCase(url) then
begin
Result := True;
Break;
end;
end;
finally
FLock.Leave;
end;
end;

{ 判断是否存在于待采集的URL LIST中 }
function TThreadSpider.ExistCatchList(url: string): Boolean;
var
i, count: Integer;
urlinfo: TUrlInfo;
begin
Result := False;
try
FLock.Enter;
for i := 0 to catch_urllist.Count -1 do
begin
urlinfo := TUrlInfo(catch_urllist.Items[i]^);
if LowerCase(urlinfo.url) = LowerCase(url) then
begin
count := urlinfo.count + 1;
TUrlInfo(catch_urllist.Items[i]^).count := count;
Result := True;
Break;
end;
end;
finally
FLock.Leave;
end;
end;

procedure TThreadSpider.AddToMacthList(matchinfo: TMatchInfo);
var
info: PMatchInfo;
begin
info := AllocMem(SizeOf(TMatchInfo));
info.url := matchinfo.url;
info.Title := matchinfo.Title;
info.count := matchinfo.count;
try
FLock.Enter;
matching_urllist.Add(info);
finally
FLock.Leave;
end;
end;

procedure TThreadSpider.AddToErrorList(urlinfo: TUrlInfo);
begin
try
FLock.Enter;
error_urllist.Add(urlinfo.url);
finally
FLock.Leave;
end;
end;

{ 添加一个新的URL }
procedure TThreadSpider.AddToCatchList(url: string);
var
info: ^TUrlInfo;
begin
info := AllocMem(SizeOf(TUrlInfo));
info^.url := Trim(url);
info^.count := 1; //新的，默认为 1
try
FLock.Enter;
catch_urllist.Add(info);
finally
FLock.Leave;
end;
end;

end.