Compiling and running larbin, plus a read-through of its three configuration files


Original article: http://blog.csdn.net/wbybeyond/article/details/8173075


larbin-2.6.3.tar.gz official site: http://larbin.sourceforge.net/index-eng.html

First unpack the archive: tar -xvzf larbin-2.6.3.tar.gz
Install gcc, g++ and xutils-dev:
sudo apt-get install gcc g++ xutils-dev
Next edit ./adns/internal.h and comment out lines 569-571, which contain this declaration:
adns_status adns__parse_domain(adns_state ads, int serv, adns_query qu,
vbuf *vb, parsedomain_flags flags,
const byte *dgram, int dglen, int *cbyte_io, int max);
Then run ./configure
followed by make
make will stop with: error: iostream.h: No such file or directory
In every file that triggers this error, change #include<iostream.h> to #include<iostream> and add using namespace std;
Run make again until the error no longer appears and the build succeeds.
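Putting the steps above together, the whole build looks roughly like this (a sketch that assumes the stock larbin-2.6.3 sources on a Debian/Ubuntu system; the line numbers in adns/internal.h and the files that still include iostream.h may differ in other versions):

tar -xvzf larbin-2.6.3.tar.gz
cd larbin-2.6.3
sudo apt-get install gcc g++ xutils-dev
# comment out the adns__parse_domain declaration (lines 569-571 in 2.6.3)
sed -i '569,571s|^|//|' adns/internal.h
./configure
make
# whenever make stops on "iostream.h: No such file or directory", change that
# file's #include<iostream.h> to #include<iostream>, add "using namespace std;",
# and run make again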

Then run ./larbin. By default larbin produces no output; its behaviour is changed by editing option.h or types.h and running make again to recompile.
larbin.conf sets the starting url(s), whether the crawl is limited to a given domain, whether it may reach external sites, and so on; changes there take effect without recompiling.
Stop larbin with Ctrl+C, and start crawling again from the beginning with ./larbin -scratch (this presupposes that RELOAD has been enabled in option.h).
A different configuration file can be given with ./larbin -c filename; the default is larbin.conf.
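For example, assuming the default httpPort of 8081 and a hypothetical configuration file named mycrawl.conf, you can launch a crawl and watch it through the built-in statistics webserver:

./larbin -c mycrawl.conf
curl http://localhost:8081/        # stats page served on httpPort from larbin.conf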

Below is a read-through of larbin.conf, option.h and types.h.
larbin.conf 

###############################################
# Who are you ?
# mail of the one who launched larbin (YOUR mail)
From larbin2.6.3@unspecified.mail
# name of the bot (sent with http headers)
UserAgent larbin_2.6.3

############################################
# What are the inputs and ouputs of larbin
# port on which is launched the http statistic webserver
# if unset or set to 0, no webserver is launched
# [port for the built-in statistics webserver]
httpPort 8081
# port on which you can submit urls to fetch [port for submitting urls]
# no input is possible if you comment this line or use port 0
#inputPort 1976

############################################
# parameters to adapt depending on your network
# Number of connexions in parallel (to adapt depending of your network speed)
# [number of pages handled in parallel at the same time, depending on your bandwidth]
pagesConnexions 100
# Number of dns calls in parallel [number of parallel dns lookups]
dnsConnexions 5
# How deep do you want to go in a site [crawl depth within a site]
depthInSite 5
# do you want to follow external links [whether to follow links to external pages]
#noExternalLinks
# time between 2 calls on the same server (in sec) : NEVER less than 30
# [interval between two requests to the same server]
waitDuration 60
# Make requests through a proxy (use with care)
# [whether to send requests through a proxy; see the proxy note]
#proxy www 8080

##############################################
# now, let's customize the search
# first page to fetch (you can specify several urls)
# [starting page(s) of the crawl; several may be given]
startUrl http://slashdot.org/
# Do you want to limit your search to a specific domain ?
# [whether to restrict the crawl to certain domains]
# if yes, uncomment the following line
#limitToDomain .fr .dk .uk end
# What are the extensions you surely don't want [extensions you do not want]
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
.tar .gz .tgz .zip .Z .rpm .deb
.ps .dvi .pdf
.png .jpg .jpeg .bmp .smi .tiff .gif
.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
.jar .java .class .diff
.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end
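To illustrate how these directives combine, a minimal configuration for a crawl restricted to one domain might look like the following (a sketch only; the mail address, start url and domain are placeholders, and the syntax simply follows the sample file above):

From me@example.com
UserAgent larbin_2.6.3
httpPort 8081
pagesConnexions 50
dnsConnexions 3
depthInSite 5
waitDuration 60
startUrl http://www.example.com/
limitToDomain .example.com end
forbiddenExtensions .tar .gz .zip .pdf .png .jpg .gif .mp3 .exe end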


option.h 
/////////////////////////////////////////////////////////////
// Select the output module
#define default_output   // default output: does nothing in particular with the pages
//#define simple_save    // save pages to plain files named d******-fyyyyyy
//#define mirror_save    // save pages following the site's own hierarchy
//#define stats_output   // only gather statistics on the pages

/////////////////////////////////////////////////////////////
// Set up a specific search
//#define SPECIFICSEARCH
//#define contentTypes ((char *[]) { "audio/mpeg", NULL })
//#define privilegedExts ((char *[]) { ".mp3", NULL })

// how do you want to manage specific pages (select one of the followings)
//#define DEFAULT_SPECIFIC
//#define SAVE_SPECIFIC
//#define DYNAMIC_SPECIFIC

//////////////////////////////////////////////////////////
// What do you want the crawler to do

// do you want to follow links in pages
#define FOLLOW_LINKS

// do you want the crawler to associate to each page the list of its sons
//#define LINKS_INFO

// do you want to associate a tag to pages (given in input)
// this allows to follow a page from input to output (and follow redirection)
//#define URL_TAGS

// do you want to suppress duplicate pages
//#define NO_DUP

// do you want larbin to stop when everything has been fetched
//#define EXIT_AT_END

// do you want to fetch images
// if you enable this option, update forbiddenExtensions in larbin.conf
//#define IMAGES

// downlaod everything (ie no check of content type in http headers)
//#define ANYTYPE

// do you want to manage cookies
//#define COOKIES

//////////////////////////////////////////////////////////
// Various options

// do you want to get cgi (see the cgi note)
// 0 : yes ; 1 : no ; 2 : NO !
#define CGILEVEL 1

// limit bandwith usage (in octets/sec)
// be carefull, larbin might use 10 to 20% more
//#define MAXBANDWIDTH 200000

// the depth is initialized each time a link goes to another site
#define DEPTHBYSITE

//////////////////////////////////////////////////////////
// Efficiency vs feature

// do we need a special thread for output
// This is compulsory if it can block
// (not needed if you did not add code yourself)
//#define THREAD_OUTPUT

// if this option is set, larbin saves the hashtable from time to time
// this way it can restart from where it last stopped
// by reloading the table
//#define RELOAD

//////////////////////////////////////////////////////////
// now it's just if you need to know how it works

// do not launch the webserver
// this can be usefull in order to launch no thread at all
//#define NOWEBSERVER

// do you want nice graphs for in the stats page
#define GRAPH

// uncomment if you are not interested in debugging information
//#define NDEBUG

// enable this if you really dislike stats (in the webserver)
//#define NOSTATS

// enable this if you really like stats (on stdout)
//#define STATS
//#define BIGSTATS

// Please enable this option if you want to report a crash
// then compile with "make debug"
//#define CRASH

#endif // LARBIN_CONFIG
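As an example of how these switches are meant to be combined, a targeted mp3 crawl could enable the specific-search block roughly as follows (a sketch built from the commented-out lines above; after editing option.h you must run make again, and .mp3 should then be removed from forbiddenExtensions in larbin.conf):

// option.h excerpt: only keep documents whose content type is audio/mpeg
#define SPECIFICSEARCH
#define contentTypes ((char *[]) { "audio/mpeg", NULL })
#define privilegedExts ((char *[]) { ".mp3", NULL })
#define SAVE_SPECIFIC   // write the matching documents under specific/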


types.h 

// Larbin
// Sebastien Ailleret
// 12-01-00 -> 10-12-01

#ifndef TYPES_H
#define TYPES_H

// Size of the HashSize (max number of urls that can be fetched)
#define hashSize 64000000

// Size of the duplicate hashTable
#define dupSize hashSize
#define dupFile "dupfile.bak"

// Size of the arrays of Sites in main memory
#define namedSiteListSize 20000
#define IPSiteListSize 10000

// Max number of urls in ram
#define ramUrls 100000
#define maxIPUrls 80000   // this should allow less dns call

// Max number of urls per site in Url
#define maxUrlsBySite 40  // must fit in uint8_t

// time out when reading a page (in sec)
#define timeoutPage 30    // default time out
#define timeoutIncr 2000  // number of bytes for 1 more sec

// How long do we keep dns answers and robots.txt
#define dnsValidTime 2*24*3600

// Maximum size of a page
#define maxPageSize 100000
#define nearlyFullPage 90000

// Maximum size of a robots.txt that is read
// the value used is min(maxPageSize, maxRobotsSize)
#define maxRobotsSize 10000

// How many forbidden items do we accept in a robots.txt
#define maxRobotsItem 100

// file name used for storing urls on disk
#define fifoFile "fifo"
#define fifoFileWait "fifowait"

// number of urls per file on disk
// should be equal to ramUrls for good interaction with restart
#define urlByFile ramUrls

// Size of the buffer used to read sockets
#define BUF_SIZE 16384
#define STRING_SIZE 1024

// Max size for a url
#define maxUrlSize 512
#define maxSiteSize 40    // max size for the name of a site

// max size for cookies
#define maxCookieSize 128

// Standard size of a fifo in a Site
#define StdVectSize maxRobotsItem

// maximum number of input connections
#define maxInput 5

// if we save files, how many files per directory and where
#define filesPerDir 2000
#define saveDir "save/"
#define indexFile "index.html"  // for MIRROR_SAVE
#define nbDir 1000              // for MIRROR_SAVE

// options for SPECIFICSEARCH (except with DEFAULT_SPECIFIC)
#define specDir "specific/"
#define maxSpecSize 5000000

// Various reasons of error when getting a page
#define nbAnswers 16
enum FetchError {
  success,          // fetched successfully
  noDNS,            // no answer from the dns server
  noConnection,     // could not connect
  forbiddenRobots,  // forbidden by robots.txt
  timeout,          // timed out
  badType,
  tooBig,
  err30X,
  err40X,
  earlyStop,
  duplicate,
  fastRobots,
  fastNoConn,
  fastNoDns,
  tooDeep,
  urlDup
};

// standard types
typedef unsigned int uint;

#endif // TYPES_H
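Since these values are compiled in, any change to types.h also requires running make again. As an illustration only (these numbers are not from the original file), a small test crawl on a machine with little memory might lower a few of the limits:

// types.h excerpt with reduced limits for a small test crawl (illustrative values)
#define hashSize 8000000     // fewer urls can be fetched in total
#define ramUrls 20000        // fewer urls kept in ram
#define maxIPUrls 16000      // kept below ramUrls, as in the original file
#define maxPageSize 50000    // give up on pages larger than about 50 KB
#define nearlyFullPage 45000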
