CCrawl::DoCrawl()

来源：互联网 | 发布：淘宝怎么继续延长收货 | 编辑：程序博客网 | 时间：2024/04/30 01:59

CCrawl::DoCrawl()

//该函数的功能是：将url从种子url文件和未访问的url文件中读出来//并将这些url存储起来，该函数并没有抓取网页
void CCrawl::DoCrawl()
{ /* set the signal function */
//如果SIGTERM、SIGKILL、SIGINT，那么就触发SigTerm函数
//如果是SIGPIPE或者SIGCHLD，那么就触发SIG_IGN函数
signal(SIGTERM, SigTerm); signal(SIGKILL, SigTerm);
signal(SIGINT, SigTerm); signal(SIGPIPE, SIG_IGN);
signal(SIGCHLD,SIG_IGN);
// output the begin time char strTime[128]; time_t tDate;
memset(strTime,0,128); time(&tDate);
//gmtime():Converts a time value to a structure.
//将格式化的时间存入strTime strftime(strTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); cout << "/n/nBegin at: " << strTime << "/n/n";
// get the other info from file
//将访问过的url和网页的md5值添加到集合setVisitedUrlMD5和setVisitedPageMD5中
GetVisitedUrlMD5(); GetVisitedPageMD5();
//将ip段从文件中读出，然后插入mapIpBlock GetIpBlock();
//获得无法访问的主机，并且将它的md5值存储在集合setUnreachHostMD5中 GetUnreachHostMD5();
// open the seed url file //读入种子url
ifstream ifsSeed(m_sInputFileName.c_str());
if (!ifsSeed){ cerr << "Cannot open " << m_sInputFileName << " for input/n"; return; }
// open the files for output //打开所需要的所有文件 OpenFilesForOutput();
// Create thread ID structures.
pthread_t *tids = (pthread_t*)malloc(NUM_WORKERS * sizeof(pthread_t)); if( tids == NULL){ cerr << "malloc error" << endl; }
//线程数 /* UNIX环境创建线程函数，具体格式：　　
#include　　
int pthread_create(pthread_t *restrict tidp,const pthread_attr_t *restrict attr,
void*（*start_rtn)(void*),void *restrict arg);　　
返回值：若成功则返回0，否则返回出错编号　　
返回成功时，由tidp指向的内存单元被设置为新创建线程的线程ID。
attr参数用于制定各种不同的线程属性。
新创建的线程从start_rtn函数的地址开始运行，该函数只有一个无指针参数arg，如果需要向start_rtn函数传递的参数不止一个，那么需要把这些参数放到一个结构中，然后把这个结构的地址作为arg的参数传入。 */
for(unsigned int i=0; i< NUM_WORKERS; i++){ if( pthread_create( &tids[i], NULL, start, this)) cerr << "create threads error" << endl; }
string strUrl; CPage iCPage;
//从种子文件中读入种子url,处理后加入待抓取url集合
while( getline(ifsSeed, strUrl) ){ string::size_type idx;
//空 if(strUrl[0]=='/0' strUrl[0]=='#' strUrl[0]== '/n'){ continue; }
//去除制表符
idx = strUrl.find('/t'); if(idx != string::npos){ strUrl = strUrl.substr(0,idx); }
//idx = strUrl.find("http");
//规格化url
idx = CStrFun::FindCase(strUrl, "http"); if(idx == string::npos){ //continue; idx = strUrl.find('/'); if( idx == string::npos ){ strUrl = "http://" + strUrl + "/"; }else{ strUrl = "http://" + strUrl; } }//if
//if( strUrl.length() < 8 ) continue;
//如果该url不合规格，那么取下一条记录
if( iCPage.IsFilterLink(strUrl) ) continue;
//否则将url加入url集合 AddUrl(strUrl.c_str()); }//while
// Get the unvisited URL //获得还未访问的url，将其处理后加入待抓取url集合
ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str()); if( ifsUnvisitedUrl ){
//按行读入
while( getline(ifsUnvisitedUrl, strUrl) ){ string::size_type idx;
//空行 if( strUrl[0]=='/0' strUrl[0]=='#' strUrl[0]== '/n'){ continue; }
//清除制表符
idx = strUrl.find('/t'); if(idx != string::npos){ strUrl = strUrl.substr(0,idx); }
// filter invalid urls
if( iCPage.IsFilterLink(strUrl) ) continue;
AddUrl(strUrl.c_str()); } }else{ //cerr << "Cannot open " << UNVISITED_FILE << " for input/n"; }
// sleep(30);
b_fOver = true; cout << "finished to get all unvisited urls." << endl;
// Wait for the threads. /* void pthread_exit(void *retval) int pthread_join(pthread_t th, void **thread_return) pthread_join()的调用者将挂起并等待th线程终止， retval是pthread_exit()调用者线程（线程ID为th）的返回值，如果thread_return不为NULL，则*thread_return=retval。需要注意的是一个线程仅允许唯一的一个线程使用 pthread_join()等待它的终止，并且被等待的线程应该处于可join状态，即非DETACHED状态。 */
for (unsigned int i = 0; i < NUM_WORKERS; ++i){ (void)pthread_join(tids[i], NULL); }
cout << "closed " << NUM_WORKERS << " threads." << endl;
//将未访问过的url存入文件 SaveUnvisitedUrl();
SaveReplicas("repli");
memset(strTime,0,128); time(&tDate); strftime(strTime, 128,"%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); cout << "/n/nEnd at: " << strTime << "/n/n";}