nutch1.9_mysql

来源:互联网 发布:侠客风云传前传mac版 编辑:程序博客网 时间:2024/04/30 00:53
一、ivysetting.xml
注释掉:
<property name="repo.maven.org"
    value="http://repo1.maven.org/maven2/"
    override="false"/>

加上:
<property name="repo.maven.org"
      value="http://maven.oschina.net/content/groups/public/"
          override="false"/>
 
 
 备用镜像(可选): http://mirrors.ibiblio.org/maven2/
二、ivy.xml


 添加:
<dependency org="mysql" name="mysql-connector-java" rev="5.1.34"/>
        <dependency org="org.springframework" name="spring-jdbc" rev="4.0.8.RELEASE"/>
        <dependency org="commons-dbcp" name="commons-dbcp" rev="1.3"/>
<dependency org="com.googlecode.juniversalchardet" name="juniversalchardet" rev="1.0.3"/>

三、建库:db_news  utf-8
建表:
-- Table storing raw fetched pages uploaded by the patched Fetcher (step 4).
-- Changes vs. the original dump:
--   * dropped KEY `index_id` (`id`) — it duplicated the PRIMARY KEY and only
--     slowed down inserts;
--   * dropped AUTO_INCREMENT=8990828 — a leftover counter from an existing
--     database; a fresh install should start ids at 1;
--   * added IF NOT EXISTS so the setup script is idempotent.
-- ENGINE/charset kept as in the tutorial (MyISAM, utf8); consider
-- InnoDB + utf8mb4 for a production deployment.
CREATE TABLE IF NOT EXISTS `tb_content` (
  `id` int(11) NOT NULL AUTO_INCREMENT,   -- surrogate key
  `html` longtext,                        -- raw page content
  `url` varchar(200) DEFAULT NULL,        -- source URL of the page
  `status` int(11) DEFAULT NULL,          -- downstream processing status flag
  PRIMARY KEY (`id`),
  KEY `index_status` (`status`)           -- lookups by processing status
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

四、src/java/org/...../fetcher
在同级目录放入 DBHelper.java
Fetcher.java   case ProtocolStatus.SUCCESS: 下面插入
// Persist the fetched page to MySQL via DBHelper. Only textual content is
// uploaded; any failure is logged and swallowed so a database error can
// never kill the fetcher thread.
try {
    String contentType = content.getContentType();
    // getContentType() may return null when the response carries no
    // Content-Type header — guard against the NPE the original code had.
    if (contentType != null && contentType.contains("text")) {
        int result = DBHelper.addArticle(content.getUrl(), content.getContent());
        Log.info("Upload " + content.getUrl() + " result=" + result);
    }
} catch (Exception ex) {
    // Best-effort upload: report and continue fetching.
    System.out.println("Upload Failed:" + ex.toString());
    Log.info("Upload Failed:" + ex.toString());
}

注释掉:
    /*   if (!rules.isAllowed(fit.u.toString())) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Denied by robots.txt: " + fit.url);
                }
                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                reporter.incrCounter("FetcherStatus", "robots_denied", 1);
                continue;
              }
   */


五、src/bin/crawl
注释、修改下面相应内容


# Positional arguments for bin/crawl. Solr indexing has been removed, so the
# third argument becomes the round limit instead of the Solr URL.
SEEDDIR="$1"
CRAWL_PATH="$2"
# Original third argument (Solr URL) — disabled because pages are written to
# MySQL by the patched Fetcher instead of being indexed into Solr.
#SOLRURL="$3"
LIMIT="$3"




# Solr-URL validation is no longer needed once SOLRURL is disabled above.
#if [ "$SOLRURL" = "" ]; then
#    echo "Missing SOLRURL : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"
#    exit -1;
#fi


# set the number of slaves nodes
numSlaves=1


# num threads for fetching
numThreads=100






  # parsing the segment
 # echo "Parsing : $SEGMENT"
  # enable the skipping of records for the parsing so that a dodgy document
  # does not fail the full task
 # skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
 # "$bin/nutch" parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT


#  if [ $? -ne 0 ] 
#  then exit $? 
 # fi






#  echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
#  "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
  
#  if [ $? -ne 0 ] 
#   then exit $? 
#  fi


 # echo "Cleanup on SOLR index -> $SOLRURL"
 # "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
  
 # if [ $? -ne 0 ] 
 #  then exit $? 
 # fi


 六、conf/nutch-site.xml
 
 
<property>
  <name>fetcher.parse</name>
  <value>true</value>
  <description>If true, fetcher will parse content. Default is false, which means
  that a separate parsing step is required after fetching is finished.</description>
</property>
<property>
  <name>fetcher.store.content</name>
  <value>false</value>
  <description>If true, fetcher will store content.</description>
</property>
<property>
  <name>http.content.limit</name>
  <value>-1</value>
  <description>The length limit for downloaded content using the http://
  protocol, in bytes. If this value is nonnegative (>=0), content longer
  than it will be truncated; otherwise, no truncation at all. Do not
  confuse this setting with the file.content.limit setting.
  </description>
</property>


<property>
  <name>fetcher.threads.per.queue</name>
  <value>15</value>
  <description>This number is the maximum number of threads that
    should be allowed to access a queue at one time.
   </description>
</property>
<property>
  <name>http.agent.name</name>
  <value>nutch</value>
  <description>HTTP 'User-Agent' request header. MUST NOT be empty -
  please set this to a single word uniquely related to your organization.


  NOTE: You should also check other related properties:


        http.robots.agents
        http.agent.description
        http.agent.url
        http.agent.email
        http.agent.version


  and set their values appropriately.


  </description>
</property>
<property>  
  <name>parser.skip.truncated</name>  
  <value>false</value>  
</property>


七、多机情况
model=distributed
local=false

八、conf/regex-urlfilter.txt
在禁止抓取的后缀正则列表中追加: flv|FLV (跳过视频文件)

九、bin/crawl urls crawl1 5

0 0
原创粉丝点击