用nutch和solrnet建立自己的搜索网站

来源:互联网 发布:路由器配置内网ip域名 编辑:程序博客网 时间:2024/06/10 07:57

下载解压nutch和solr

添加javahome

 export  JAVA_HOME=/opt/bitnami/java

修改vi conf/regex-urlfilter.txt
+^http://([a-z0-9]*\.)*letv.com/

新建urls目录,用于存放首要抓取的url列表,我们存放http://www.letv.com/

[zhouhh@Hadoop48 nutch]$ mkdir urls
[zhouhh@Hadoop48 nutch]$ cd urls
[zhouhh@Hadoop48 urls]$ vi seed.txt
http://www.letv.com/
修改
 nutch-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>


<!-- Put site-specific property overrides in this file. -->


<configuration>
<property>
  <name>http.agent.name</name>
  <value>myag</value>
  <description>HTTP 'User-Agent' request header. MUST NOT be empty - 
  please set this to a single word uniquely related to your organization.


  NOTE: You should also check other related properties:


http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version


  and set their values appropriately.


  </description>
</property>
<property> 
  <name>parser.skip.truncated</name> 
  <value>false</value>
</property>
<property>
  <name>http.robots.agents</name>
  <value>myag,*</value>
  <description>The agent strings we'll look for in robots.txt files,
  comma-separated, in decreasing order of precedence. You should
  put the value of http.agent.name as the first agent name, and keep the
  default * at the end of the list. E.g.: BlurflDev,Blurfl,*
  </description>
</property>
</configuration>

复制nutch/conf里面的schema-solr4.xml到solr并改名为
schema.xml
并在fields里面添加一行
 <field name="_version_" type="long" indexed="true" stored="true"/>

到solr目录启动solr

java -jar start.jar
cd /opt/bitnami/nutch/
后台不挂起运行nutch
不带错误log
nohup ./bin/crawl /opt/bitnami/nutch/urls /opt/bitnami/nutch/sp http://localhost:8983/solr/ 50 >/dev/null 2>&1 &
带错误log
nohup ./bin/crawl /opt/bitnami/nutch/urls /opt/bitnami/nutch/sp http://localhost:8983/solr/ 50 >/dev/null 2>log &
前台运行
./bin/crawl /opt/bitnami/nutch/urls /opt/bitnami/nutch/sp http://localhost:8983/solr/ 50 
查看记录数
./bin/nutch readdb /opt/bitnami/nutch/sp/crawldb -stats
到solrweb管理界面
http://localhost:8983/solr/ 
然后用solrnet调用这个solr做测试网站
 public partial class gosearch : System.Web.UI.Page    {        public string pidx = "0";        public string q = "tgbus";        protected void Page_Load(object senderEventArgs e)        {            if (!IsPostBack)            {                if (Request.QueryString["q"] != null)                {                    q = Request.QueryString["q"].ToString();                    C1InputText1.Text = q;                    if (Request.QueryString["pidx"] != null)                    {                        pidx = Request.QueryString["pidx"].ToString();                        Label1.Text = "当前为" + (int.Parse(pidx) + 1).ToString() + "页";                        bind();                    }                }                else                {                    pidx = "0";                    bind();                }            }                                 /* C1Pager1.PageCount = 100;            Label1.Text = pidx;*/        }         private void bind()        {            Startup.Init<prt>("http://192.168.1.157:8983/solr");                     /*  var mapper = new AllPropertiesMappingManager();              mapper.SetUniqueKey(typeof(prt).GetProperty("id"));*/            var solr = ServiceLocator.Current.GetInstance<ISolrOperations<prt>>();            QueryOptions qo = new QueryOptions();            qo.Rows = 20;            qo.Start = int.Parse(pidx);            //   var results = solr.Query(new SolrQueryByField("url", "tgbus"));            ISolrQueryResults<prtresults = solr.Query(qqo);          //  dataGridView1.AutoGenerateColumns = true;           // MessageBox.Show(results.NumFound.ToString());            //   dataGridView1.Rows.Clear();            C1GridView1.DataSource = results.ToList();            C1GridView1.DataBind();            if (results.NumFound>20)            {                 C1Pager1.PageCount = results.NumFound/20;            }            else            {                C1Pager1.PageCount = 1;            }            C1Pager1.PageIndex = 
int.Parse(pidx);            Label2.Text ="共找到"results.NumFound.ToString()+"项";        }         protected void C1Pager1_PageIndexChanged(object senderEventArgs e)        {            C1Pager cp = sender as C1Pager;            Label1.Text ="当前为"+( cp.PageIndex+1).ToString()+"页";           // Response.Redirect("gosearch.aspx?pidx="+cp.PageIndex.ToString());            pidx = cp.PageIndex.ToString();            q = C1InputText1.Text;            bind();        }         protected void Button1_Click(object senderEventArgs e)        {            if (this.C1InputText1.Text != "")            {                Response.Redirect("gosearch.aspx?q=" + this.C1InputText1.Text + "&pidx=0");            }        }        public static bool IsChinese(string CString)        {            return System.Text.RegularExpressions.Regex.IsMatch(CString@"[\u4e00-\u9fa5]");           // return Regex.IsMatch(CString, @"^[\u4e00-\u9fa5]+$");        }    }}
测试地址
http://asearch.azurewebsites.net

0 0
原创粉丝点击