Nutch recrawl script


For Apache Nutch 1.8 or later.

In Apache Nutch, once a configured time interval has elapsed, previously crawled URLs become due to be fetched again.
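That interval is governed by Nutch's db.fetch.interval.default property, and the crawldb records each URL's fetch state. A minimal sketch of checking both, assuming the crawldb lives under mydir/crawldb as in the script below (the weekly value is only an illustration):

# db.fetch.interval.default (conf/nutch-site.xml) is the number of seconds
# before a fetched page becomes due again; Nutch ships with 2592000 (30 days).
#   <property>
#     <name>db.fetch.interval.default</name>
#     <value>604800</value>   <!-- example: re-fetch weekly -->
#   </property>

# Inspect the crawldb to see how many URLs are in each fetch state:
bin/nutch readdb mydir/crawldb -stats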

But the stock "crawl" command does not fully support recrawling.

So we need a script to do this kind of work. More information on recrawling: http://wiki.apache.org/nutch/Recrawl


The script supports running in a Hadoop environment; a sketch of preparing the HDFS input follows.
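In deploy mode, DATADIR and URLDIR are HDFS paths, so the seed list has to be uploaded before the first cycle. A minimal sketch, assuming the mydir/urls names hard-coded in the script and a local seed.txt file:

# Upload the seed list into the HDFS directory the script injects from:
hadoop fs -mkdir -p urls
hadoop fs -put seed.txt urls/
# After a cycle, the generated segments can be checked with:
hadoop fs -ls mydir/segments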


#! /usr/bin/env bash
# recrawl script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/recrawl
# Author: Jerome (based on Joey's and Susam Pal's work; see
# http://blog.csdn.net/iterjpnic/article/details/7644407 and
# http://wiki.apache.org/nutch/Crawl)

#if [ $# != 3 ]
#then
#  echo "Usage: recrawl DATADIR URLDIR SOLRSERVERADDRESS"
#  echo "where"
#  echo "        DATADIR is the parent directory where crawled data will be stored"
#  echo "        URLDIR is the parent directory of the injected url files"
#  echo "        SOLRSERVERADDRESS is the address of the solr index server"
#  echo "eg: recrawl hdfs://localhost:9000/user/root/mydir \
#  hdfs://localhost:9000/user/root/urls http://master:8983/solr/collection1_shard1_replica1"
#  exit 1
#fi

# DATADIR is the parent directory where crawled data will be stored
DATADIR="mydir"
# URLDIR is the parent directory of the injected url files
URLDIR="urls"
#URLDIR="seed.txt"
# SOLRADDRESS is the address of the solr index server
SOLRADDRESS="http://slave1:8983/solr/collection1"
# number of slave nodes
numSlaves=8
# number of days added to the current time when deciding which pages are
# due for re-fetch (passed to "generate -adddays")
addDays=0
# total number of available tasks;
# sets the Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 5`
# number of urls to fetch in one iteration
sizeFetchlist=`expr $numSlaves \* 4000`
#sizeFetchlist=1
# time limit (minutes) for fetching
timeLimitFetch=180
# number of threads for fetching
numThreads=50
# iterations of re-crawl
depth=10
# temp segments dir in Hadoop DFS
TEMPSEGMENTSDIR="tempsegments"
BACKUPSEGDIR="backdir/segments"
# hadoop fs shell commands for ls/rm/cp/mv
LSCOMMAND="hadoop fs -ls"
RMCOMMAND="hadoop fs -rm -r"
CPCOMMAND="hadoop fs -cp"
MVCOMMAND="hadoop fs -mv"

if [ -z "$NUTCH_HOME" ]
then
  NUTCH_HOME="/usr/nutch/apache-nutch/runtime/deploy/bin"
  echo "recrawl: $0 could not find environment variable NUTCH_HOME"
  echo "recrawl: NUTCH_HOME=$NUTCH_HOME has been set by the script"
else
  echo "recrawl: $0 found environment variable NUTCH_HOME=$NUTCH_HOME"
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"

for((j=0;j>-1;j++))
do
  # ----- See if the crawl should go on -----
  # read the first word of the switch file
  for switch in `cat $NUTCH_HOME/recrawlswitcher`
  do
    break
  done
  if [ "$switch" == "off" ]
  then
      echo "--- Shutting down the recrawl because the recrawl switcher is off ---"
      break
  fi

  echo "--- Beginning at count `expr $j + 1` ---"
  steps=6

  echo "----- Inject (Step 1 of $steps) -----"
  $NUTCH_HOME/nutch inject $DATADIR/crawldb $URLDIR
  if [ $? -ne 0 ]
  then exit 1
  fi

  echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
  for((i=0;i<$depth;i++))
  do
    echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
    $NUTCH_HOME/nutch generate $commonOptions $DATADIR/crawldb $DATADIR/segments -topN $sizeFetchlist -numFetchers $numSlaves -adddays $addDays
    if [ $? -ne 0 ]
    then
        echo "recrawl: Stopping at depth `expr $i + 1`. No more URLs to fetch."
        break
    fi
    # the newest segment is the one generate just created
    segment=`$LSCOMMAND $DATADIR/segments/ | tail -1 | awk '{print $8}'`
    echo "--- Fetching segment $segment ---"
    $NUTCH_HOME/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $segment -noParsing -threads $numThreads
    if [ $? -ne 0 ]
    then
        echo "recrawl: fetch $segment at depth `expr $i + 1` failed."
        echo "recrawl: Deleting segment $segment."
        $RMCOMMAND $segment
        continue
    fi

    echo "--- Beginning parsing ---"
    # tolerate a few bad records instead of failing the whole parse job
    skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
    $NUTCH_HOME/nutch parse $commonOptions $skipRecordsOptions $segment
    if [ $? -ne 0 ]
    then exit 1
    fi

    echo "--- Beginning updatedb ---"
    $NUTCH_HOME/nutch updatedb $commonOptions $DATADIR/crawldb $segment #-filter
    if [ $? -ne 0 ]
    then exit 1
    fi

    echo "--- Copying segment dir to temp segments dir ---"
    $CPCOMMAND $segment $TEMPSEGMENTSDIR/

    echo "----- Merge Segments (Step 3 of $steps) -----"
    $NUTCH_HOME/nutch mergesegs $DATADIR/MERGEDsegments -dir $DATADIR/segments #-filter
    if [ $? -ne 0 ]
    then exit 1
    fi

    echo "--- Backing up old segments dir; backups are deleted when this crawl count finishes ---"
    $MVCOMMAND $DATADIR/segments $DATADIR/segmentsBackup
    if [ $? -ne 0 ]
    then
      echo "recrawl: Failed to back up current segments, exiting to avoid data loss"
      exit 1
    fi
    echo "--- Moving MERGEDsegments to segments dir ---"
    $MVCOMMAND $DATADIR/MERGEDsegments $DATADIR/segments
    if [ $? -ne 0 ]
    then
        echo "recrawl: Failed to move MERGEDsegments to segments, exiting to avoid data loss"
        exit 1
    fi

    echo "----- Invert Links (Step 4 of $steps) -----"
    $NUTCH_HOME/nutch invertlinks $DATADIR/linkdb -dir $TEMPSEGMENTSDIR
    if [ $? -ne 0 ]
    then exit 1
    fi

    echo "----- Delete Duplicates (Step 5 of $steps) -----"
    $NUTCH_HOME/nutch dedup $DATADIR/crawldb
    if [ $? -ne 0 ]
    then exit 1
    fi

    echo "----- Index (Step 6 of $steps) -----"
    $NUTCH_HOME/nutch index -D solr.server.url=$SOLRADDRESS $DATADIR/crawldb -linkdb $DATADIR/linkdb -dir $TEMPSEGMENTSDIR
    if [ $? -ne 0 ]
    then exit 1
    fi

    echo "--- The main recrawl process is done; deleting the temp segments dir ---"
    $MVCOMMAND $TEMPSEGMENTSDIR/* $BACKUPSEGDIR/
    echo "--- Deleting the old segments backups ---"
    $RMCOMMAND $DATADIR/segmentsBackup
    # empty the HDFS trash so the deleted segments free space immediately
    $RMCOMMAND .Trash
    echo "recrawl: FINISHED: crawl iteration `expr $i + 1` completed!"
  done

  echo "----- Clean Index -----"
  $NUTCH_HOME/nutch clean -D solr.server.url=$SOLRADDRESS $DATADIR/crawldb
  if [ $? -ne 0 ]
  then exit 1
  fi
  # exit the outer loop after one full pass; remove this break (and rely on
  # the recrawlswitcher file) to keep recrawling continuously
  break
done
echo "All FINISHED with `expr $j + 1` count..."
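The outer loop reads an on/off flag from $NUTCH_HOME/recrawlswitcher at the top of each cycle. As written the loop breaks after one full pass, but if that final break is removed for continuous recrawling, the switch file gives a clean shutdown point. One way to drive it (the path assumes the script's default NUTCH_HOME):

# Create the switch file and start the recrawl in the background:
echo "on" > /usr/nutch/apache-nutch/runtime/deploy/bin/recrawlswitcher
nohup ./recrawl > recrawl.log 2>&1 &

# To stop gracefully, flip the switch; the loop exits before starting
# the next cycle:
echo "off" > /usr/nutch/apache-nutch/runtime/deploy/bin/recrawlswitcher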

