Nutch recrawl script
来源:互联网 发布:java调用dll实例 编辑:程序博客网 时间:2024/05/23 00:29
For Apache Nutch 1.8 or later.
After a time interval, Apache Nutch will begin to recrawl previously crawled URLs.
But the original "crawl" command does not fully support recrawling.
So, we need a script to do this kind of work. More information of recrawl: http://wiki.apache.org/nutch/Recrawl
It supports running in a Hadoop environment.
#!/usr/bin/env bash
# recrawl: run the Nutch bot for crawling and re-crawling.
# Usage: bin/recrawl
# Author: Jerome (based on Joey's and Susam Pal's work; see
#   http://blog.csdn.net/iterjpnic/article/details/7644407 and
#   http://wiki.apache.org/nutch/Crawl)
#
# NOTE(review): the original source had been collapsed onto a single line,
# which turned the whole script into one comment; restored to proper lines.

#if [ $# != 3 ]
#then
#  echo "Usage: recrawl DATADIR URLDIR SOLRSERVERADDRESS"
#  echo "where "
#  echo "  DATADIR is the parent directory where crawled data will be stored "
#  echo "  URLDIR is the parent directory of the injected url files "
#  echo "  SOLRSERVERADDRESS is the address of solr index server "
#  echo "eg: recrawl hdfs://localhost:9000/user/root/mydir \
#       hdfs://localhost:9000/user/root/urls http://master:8983/solr/collection1_shard1_replica1"
#  exit 1
#fi

# DATADIR is the parent directory where crawled data will be stored
DATADIR="mydir"
# URLDIR is the parent directory of the injected url files
URLDIR="urls"
#URLDIR="seed.txt"
# SOLRADDRESS is the address of the Solr index server
SOLRADDRESS="http://slave1:8983/solr/collection1"
# number of slave nodes
numSlaves=8
# time interval (days) after which a page needs to be re-fetched
addDays=0
# total number of available tasks; used for Hadoop param "mapred.reduce.tasks"
numTasks=$(( numSlaves * 5 ))
# number of urls to fetch in one iteration
sizeFetchlist=$(( numSlaves * 4000 ))
#sizeFetchlist=1
# time limit (minutes) for fetching
timeLimitFetch=180
# number of threads for fetching
numThreads=50
# iterations of re-crawl
depth=10
# temp segments dir in Hadoop DFS
TEMPSEGMENTSDIR="tempsegments"
BACKUPSEGDIR="backdir/segments"
# hadoop dfs shell ls/cp/rm/mv commands
LSCOMMAND="hadoop fs -ls"
RMCOMMAND="hadoop fs -rm -r"
CPCOMMAND="hadoop fs -cp"
MVCOMMAND="hadoop fs -mv"

# Fall back to a default NUTCH_HOME when the environment does not provide one.
if [ -z "$NUTCH_HOME" ]; then
  NUTCH_HOME="/usr/nutch/apache-nutch/runtime/deploy/bin"
  echo "recrawl: $0 could not find environment variable NUTCH_HOME"
  echo "recrawl: NUTCH_HOME=$NUTCH_HOME has been set by the script"
else
  echo "recrawl: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
fi
# Note that some of the options listed here could be set in the
# corresponding hadoop site xml param file.
commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"

for (( j = 0; j > -1; j++ )); do
  # "----- See if it will go on to crawl -----"
  # BUG FIX: the original read the switcher file with single quotes
  # ('cat ...' instead of $(cat ...)), so $switch held the literal command
  # text, and the test [$switch == "off"] was missing the spaces that the
  # [ builtin requires.
  switch=$(cat "$NUTCH_HOME/recrawlswitcher")
  if [ "$switch" == "off" ]; then
    echo "--- Shut down the recrawl due to recrawl switcher is off ---"
    break
  fi

  echo "--- Beginning at count $(( j + 1 )) ---"
  steps=6

  echo "----- Inject (Step 1 of $steps) -----"
  # BUG FIX (applies to every step below): the original pattern
  #   if [ $? -ne 0 ] then exit $? fi
  # always exited with status 0, because the [ test itself resets $?.
  # Capture the status first, then exit with it.
  "$NUTCH_HOME/nutch" inject "$DATADIR/crawldb" "$URLDIR"
  rc=$?
  if [ $rc -ne 0 ]; then exit $rc; fi

  echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
  for (( i = 0; i < depth; i++ )); do
    echo "--- Beginning crawl at depth $(( i + 1 )) of $depth ---"
    # BUG FIX: original used the misspelled $commomOptions (expands empty).
    # $commonOptions is intentionally unquoted: it is a word list of options.
    "$NUTCH_HOME/nutch" generate $commonOptions "$DATADIR/crawldb" "$DATADIR/segments" \
      -topN "$sizeFetchlist" -numFetchers "$numSlaves" -adddays "$addDays"
    if [ $? -ne 0 ]; then
      echo "recrawl: Stopping at depth $depth. No more URLs to fetch."
      break
    fi

    # The newest segment is the last entry of the HDFS listing.
    segment=$($LSCOMMAND "$DATADIR/segments/" | tail -1 | awk '{print $8}')
    echo "--- fetch into segment:$segment ---"
    "$NUTCH_HOME/nutch" fetch $commonOptions -D fetcher.timelimit.mins="$timeLimitFetch" \
      "$segment" -noParsing -threads "$numThreads"
    if [ $? -ne 0 ]; then
      echo "recrawl: fetch $segment at depth $(( i + 1 )) failed."
      echo "recrawl: Deleting segment $segment."
      $RMCOMMAND "$segment"
      continue
    fi

    echo "--- Beginning parsing ---"
    skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
    # BUG FIX: $commomOptions misspelling here too.
    "$NUTCH_HOME/nutch" parse $commonOptions $skipRecordsOptions "$segment"
    rc=$?
    if [ $rc -ne 0 ]; then exit $rc; fi

    echo "--- Begin updatedb ---"
    "$NUTCH_HOME/nutch" updatedb $commonOptions "$DATADIR/crawldb" "$segment" #-filter
    rc=$?
    if [ $rc -ne 0 ]; then exit $rc; fi

    echo "--- Begin copy segment dir to temp segments dir ---"
    $CPCOMMAND "$segment" "$TEMPSEGMENTSDIR/"

    echo "--- Merge Segments (Step 3 of $steps) ---"
    "$NUTCH_HOME/nutch" mergesegs "$DATADIR/MERGEDsegments" -dir "$DATADIR/segments" #-filter
    rc=$?
    if [ $rc -ne 0 ]; then exit $rc; fi

    echo "--- Backup old segments dir, delete backups when this-count crawl finished ---"
    $MVCOMMAND "$DATADIR/segments" "$DATADIR/segmentsBackup"
    if [ $? -ne 0 ]; then
      echo "recrawl: Failed to backup current segments, so exit in case of data loss"
      exit 1
    fi

    echo "--- Move the MERGEDsegments to segments dir ---"
    $MVCOMMAND "$DATADIR/MERGEDsegments" "$DATADIR/segments"
    if [ $? -ne 0 ]; then
      echo "recrawl: Failed to move MERGEDsegments to segments, so exit in case of data loss"
      exit 1
    fi

    echo "----- Invert Links (Step 4 of $steps) -----"
    "$NUTCH_HOME/nutch" invertlinks "$DATADIR/linkdb" -dir "$TEMPSEGMENTSDIR"
    rc=$?
    if [ $rc -ne 0 ]; then exit $rc; fi

    echo "----- Delete Duplicates (Step 5 of $steps) -----"
    "$NUTCH_HOME/nutch" dedup "$DATADIR/crawldb"
    rc=$?
    if [ $rc -ne 0 ]; then exit $rc; fi

    echo "----- Index (Step 6 of $steps) -----"
    "$NUTCH_HOME/nutch" index -D solr.server.url="$SOLRADDRESS" "$DATADIR/crawldb" \
      -linkdb "$DATADIR/linkdb" -dir "$TEMPSEGMENTSDIR"
    rc=$?
    if [ $rc -ne 0 ]; then exit $rc; fi

    echo "--- The main recrawl process is done, now gona delete the temp segments dir ---"
    $MVCOMMAND "$TEMPSEGMENTSDIR"/* "$BACKUPSEGDIR/"

    echo "--- Delete the temp old segments backups ---"
    $RMCOMMAND "$DATADIR/segmentsBackup"
    $RMCOMMAND .Trash

    echo "recrawl: FINISHED: Crawl $(( i + 1 )) -th completed!"
  done

  echo "----- Clean Index -----"
  "$NUTCH_HOME/nutch" clean -D solr.server.url="$SOLRADDRESS" "$DATADIR/crawldb"
  rc=$?
  if [ $rc -ne 0 ]; then exit $rc; fi

  # NOTE(review): the outer loop unconditionally breaks after one pass, so
  # the recrawl-switcher check above only ever runs once. Preserved as the
  # original behaved; external scheduling (e.g. cron) drives repetition.
  break
done

echo "All FINISHED with $(( j + 1 )) count..."
0 0
- Nutch recrawl script
- nutch的重爬recrawl
- Nutch 笔记(二):Craw more urls and Recrawl(收藏)
- nutch recrawl中出现的问题及解决
- nutch 1.2 增量爬取url 完成 recrawl.sh 编写
- nutch 1.4 的增量爬取(recrawl)脚本
- Nutch 笔记(二):Craw more urls and Recrawl
- Dissecting The Nutch Crawler - The "nutch" shell script
- nutch
- nutch
- Nutch
- Nutch
- nutch
- Nutch
- nutch
- Nutch
- Nutch
- nutch
- 如何编写LVS对Real Server的健康状态检测脚本
- VS2010动态库不生成lib
- java中的匿名内部类
- LVS+keepalived实现DR模式负载均衡
- 人大金仓数据库兼容oracle参数调整compatible_level
- Nutch recrawl script
- cocos2d-x 网络编程Curl
- linux 路由表 的一些相关资料
- Linux命令学习:30道运维面试题
- linux下vi命令大全
- C# 多线程与跨线程访问界面控件
- Android学习之Button事件实现方法的总结
- CentOS 64位安装hadoop
- Android meta-data的使用以及含义