nutch-0.8在eclipse中运行

来源:互联网 发布:矩阵式项目管理组织 编辑:程序博客网 时间:2024/05/29 04:23
nutch-0.8在eclipse中运行
运行环境:
  1. jdk 1.5
  2. eclipse 3.2
  3. nutch-0.8已解压至D:/ (注:此目录是随意的;下文均以解压后的目录D:/nutch-0.8.1为例)

nutch 0.8缺少的两个jar包:
  1. 下载---关于parse-mp3的jid3lib-0.5.1.jar
  2. 下载---关于parse-rtf的rtf-parser.jar
分别拷贝至
  1. D:/nutch-0.8.1/src/plugin/parse-mp3/lib
  2. D:/nutch-0.8.1/src/plugin/parse-rtf/lib

配置文件的更改:
主要更改的配置文件为如下三个:
  1. nutch-default.xml
    更改plugin.folders的value为D:/nutch-0.8.1/src/plugin
    更改http.agent.name的value为godric(注:此处随意)
  2. 在D:/nutch-0.8.1下添加文件夹urls,在urls目录下新建文本文档nutch.txt(注:文件名随意),内容为你要爬取的网站url,例如:http://www.iscas.ac.cn
  3. crawl-urlfilter.txt
    修改为如下所示:
    # accept hosts in MY.DOMAIN.NAME
    +^http://www.iscas.ac.cn/

项目文件:
  1. .classpath 其内容为:
    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Eclipse build path for the Nutch project.
      Entry order is kept exactly as published, because classpath order
      decides which copy wins when the same class appears more than once.
    -->
    <classpath>
        <!-- Source folders: configuration, core sources/tests, then each plugin's sources/tests -->
        <classpathentry kind="src" path="conf"/>
        <classpathentry kind="src" path="src/java"/>
        <classpathentry kind="src" path="src/test"/>
        <classpathentry kind="src" path="src/plugin/creativecommons/src/java"/>
        <classpathentry kind="src" path="src/plugin/summary-lucene/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rtf/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-url/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rss/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msword/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/test"/>
        <classpathentry kind="src" path="src/plugin/protocol-http/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-pdf/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-zip/src/java"/>
        <classpathentry kind="src" path="src/plugin/summary-basic/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rtf/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-swf/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-oo/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/test"/>
        <classpathentry kind="src" path="src/plugin/protocol-httpclient/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msword/src/test"/>
        <classpathentry kind="src" path="src/plugin/subcollection/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-oo/src/test"/>
        <classpathentry kind="src" path="src/plugin/protocol-http/src/test"/>
        <classpathentry kind="src" path="src/plugin/query-url/src/test"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-prefix/src/java"/>
        <classpathentry kind="src" path="src/plugin/protocol-ftp/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-site/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rss/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-swf/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-zip/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-text/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-pdf/src/java"/>
        <classpathentry kind="src" path="src/plugin/protocol-file/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-more/src/java"/>
        <classpathentry kind="src" path="src/plugin/scoring-opic/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/test"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-basic/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/java"/>
        <classpathentry kind="src" path="src/plugin/subcollection/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-ext/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-mp3/src/test"/>
        <classpathentry kind="src" path="src/plugin/ontology/src/test"/>
        <classpathentry kind="src" path="src/plugin/creativecommons/src/test"/>
        <classpathentry kind="src" path="src/plugin/lib-parsems/src/java"/>
        <classpathentry kind="src" path="src/plugin/ontology/src/java"/>
        <classpathentry kind="src" path="src/plugin/index-basic/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-js/src/java"/>
        <classpathentry kind="src" path="src/plugin/index-more/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-regex-filter/src/test"/>
        <classpathentry kind="src" path="src/plugin/analysis-de/src/java"/>
        <classpathentry kind="src" path="src/plugin/microformats-reltag/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-mspowerpoint/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-http/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msexcel/src/test"/>
        <classpathentry kind="src" path="src/plugin/languageidentifier/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-regex-filter/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-html/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-html/src/java"/>
        <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-ext/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-mspowerpoint/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-mp3/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msexcel/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-http/src/test"/>
        <classpathentry kind="src" path="src/plugin/analysis-fr/src/java"/>
        <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/java"/>
        <classpathentry kind="src" path="src/plugin/languageidentifier/src/test"/>

        <!-- Jars from the top-level lib directory -->
        <classpathentry kind="lib" path="lib/commons-cli-2.0-SNAPSHOT.jar"/>
        <classpathentry kind="lib" path="lib/commons-lang-2.1.jar"/>
        <classpathentry kind="lib" path="lib/commons-logging-1.0.4.jar"/>
        <classpathentry kind="lib" path="lib/commons-logging-api-1.0.4.jar"/>
        <classpathentry kind="lib" path="lib/concurrent-1.3.4.jar"/>
        <classpathentry kind="lib" path="lib/hadoop-0.4.0-patched.jar"/>
        <classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
        <classpathentry kind="lib" path="lib/jetty-5.1.4.jar"/>
        <classpathentry kind="lib" path="lib/junit-3.8.1.jar"/>
        <classpathentry kind="lib" path="lib/log4j-1.2.13.jar"/>
        <classpathentry kind="lib" path="lib/lucene-core-1.9.1.jar"/>
        <classpathentry kind="lib" path="lib/lucene-misc-1.9.1.jar"/>
        <classpathentry kind="lib" path="lib/servlet-api.jar"/>
        <classpathentry kind="lib" path="lib/taglibs-i18n.jar"/>
        <classpathentry kind="lib" path="lib/xerces-2_6_2.jar"/>
        <classpathentry kind="lib" path="lib/xerces-2_6_2-apis.jar"/>

        <!-- Jars located in the lib folders of individual plugins -->
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-local-core.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-common.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-collections-3.1-patched.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jaxen-core.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jaxen-jdom.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jdom.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/saxpath.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/xercesImpl.jar"/>
        <classpathentry kind="lib" path="src/plugin/ontology/lib/commons-logging-1.0.3.jar"/>
        <classpathentry kind="lib" path="src/plugin/ontology/lib/icu4j_2_6_1.jar"/>
        <classpathentry kind="lib" path="src/plugin/ontology/lib/jena-2.1.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-html/lib/tagsoup-1.0rc3.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-swf/lib/javaswf.jar"/>
        <classpathentry kind="lib" path="src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/>
        <classpathentry kind="lib" path="src/plugin/protocol-httpclient/lib/commons-codec.jar"/>
        <classpathentry kind="lib" path="src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar"/>
        <classpathentry kind="lib" path="src/plugin/urlfilter-automaton/lib/automaton.jar"/>

        <!-- The two jars that must be downloaded separately and copied into these plugin lib folders -->
        <classpathentry kind="lib" path="src/plugin/parse-mp3/lib/jid3lib-0.5.1.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-rtf/lib/rtf-parser.jar"/>

        <!-- JRE system library container -->
        <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>

        <!-- Folder that receives the compiled output -->
        <classpathentry kind="output" path="tmp_build"/>
    </classpath>

  2. .project 其内容为:
    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Eclipse project descriptor: a plain Java project named nutch-0.8. -->
    <projectDescription>
        <name>nutch-0.8</name>
        <comment></comment>
        <!-- No referenced projects -->
        <projects>
        </projects>
        <!-- Single builder: the JDT Java builder, with no arguments -->
        <buildSpec>
            <buildCommand>
                <name>org.eclipse.jdt.core.javabuilder</name>
                <arguments>
                </arguments>
            </buildCommand>
        </buildSpec>
        <!-- Java nature marks this as a Java project -->
        <natures>
            <nature>org.eclipse.jdt.core.javanature</nature>
        </natures>
    </projectDescription>


导入eclipse:
  1. File->Import选择目录D:/nutch-0.8.1
  2. Run->Run...
    配置如下:


    Main class:  org.apache.nutch.crawl.Crawl
    Program arguments: urls -dir crawl.iscas -depth 2 -topN 50
    VM arguments: -Dhadoop.log.dir=logs -Dhadoop.log.file=hadoop.log



不知您是否运行成功,如果还没有成功,请email我:godric.wu@gmail.com






原创粉丝点击