开发搜索引擎初步（一）建立索引（Lucene实现）

来源：互联网发布：天津话发音软件编辑：程序博客网时间：2024/05/17 03:01

开发自己的搜索引擎完成了一段时间了，现在准备开始梳理一下思路，把以前的总结一下，为以后做真正的“谷歌”埋下伏笔，呵呵。。。。。。

一.Lucene的下载

牛逼的Apache旗下的Lucene，呵呵，无人不知啊。去 http://lucene.apache.org/ 这个地址自己下载，别说不会Download。

二.使用Lucene建立索引

将下载下来的包解压，把里面的Core,memory,analyzer啥的都拿出来，配置到自己的Eclipse上面，下面的事情就是写代码了。

view plaincopy to clipboardprint?

package com.dreamers.creatindex;
import java.io.File;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.dom4j.DocumentException;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.dreamers.xml.*;
import com.dreamers.read.*;
/**
* @category 创建所有XML索引
* @author bird
*
*/
public class CreatIndex {
private String INDEX_STORE_PATH ;
//创建索引
@SuppressWarnings("deprecation")
public void creatIndex(){
try{
GetPath path = new GetPath();
INDEX_STORE_PATH = path.getIndexPath();
File file = new File(INDEX_STORE_PATH);
Analyzer analyzer = new IKAnalyzer();
XmlReader xml = new XmlReader();
FSDirectory directory = FSDirectory.open(file);
IndexWriter writer = new IndexWriter(directory, analyzer, true,IndexWriter.MaxFieldLength.LIMITED);
ArrayList<String> lisId = xml.getId();
ArrayList<String> lisTitle = xml.getTitle();
ArrayList<String> lisKeyWords = xml.getKeyWords();
ArrayList<String> lisKind = xml.getKind();
ArrayList<String> lisDescribe = xml.getDescribe();
ArrayList<String> lisDate = xml.getDate();
ArrayList<String> lisUrl = xml.getUrl();
ArrayList<String> lisAuthor = xml.getAuthor();
ArrayList<String> lisPublisher = xml.getPublisher();
//System.out.println(lisUrl.get(5));
for (int i = 0; i < xml.getCount();i++){
Document doc = new Document();
//为ID创建Field
Field field = new Field("id",lisId.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED );
doc.add(field);
//为title创建索引
field = new Field("title",lisTitle.get(i),Field.Store.YES,Field.Index.ANALYZED);
doc.add(field);
//为keywords创建索引
field = new Field("keywords",lisKeyWords.get(i),Field.Store.YES,Field.Index.ANALYZED);
doc.add(field);
//为kind创建索引
field = new Field("kind",lisKind.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
doc.add(field);
//为describe创建索引
field = new Field("describe",lisDescribe.get(i),Field.Store.YES,Field.Index.ANALYZED);
doc.add(field);
//为data创建索引
field = new Field("date",lisDate.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
doc.add(field);
//为URL创建索引
field = new Field("url",lisUrl.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
doc.add(field);
//为author创建索引
field = new Field("author",lisAuthor.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
doc.add(field);
//为publisher创建索引
field = new Field("publisher",lisPublisher.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
doc.add(field);
}
writer.addDocument(doc);
}
writer.close();
//directory.close();
System.out.println("索引创建完毕");
} catch (Exception e){
e.printStackTrace();
}
}
public static void main(String [] args) throws DocumentException{
CreatIndex index = new CreatIndex();
index.creatIndex();
}
}

package com.dreamers.creatindex;

import java.io.File;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.dom4j.DocumentException;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.dreamers.xml.*;
import com.dreamers.read.*;

/**
 * Builds the Lucene index for all records exposed by {@link XmlReader}.
 *
 * @author bird
 */
public class CreatIndex {
    // Index directory path; assigned from GetPath each time creatIndex() runs.
    private String INDEX_STORE_PATH;

    /**
     * Creates the index: one Lucene document per record, with every field
     * stored. title, keywords and describe are analyzed with IKAnalyzer; id,
     * kind, date, url, author and publisher are indexed verbatim.
     *
     * Any failure is caught and its stack trace printed, so this method never
     * throws. The writer is closed on both the success and the failure path.
     */
    @SuppressWarnings("deprecation")
    public void creatIndex() {
        try {
            GetPath path = new GetPath();
            INDEX_STORE_PATH = path.getIndexPath();
            File file = new File(INDEX_STORE_PATH);
            Analyzer analyzer = new IKAnalyzer();
            XmlReader xml = new XmlReader();
            FSDirectory directory = FSDirectory.open(file);
            IndexWriter writer = new IndexWriter(directory, analyzer, true,
                    IndexWriter.MaxFieldLength.LIMITED);
            try {
                // Parallel lists: element i of every list belongs to record i.
                ArrayList<String> lisId = xml.getId();
                ArrayList<String> lisTitle = xml.getTitle();
                ArrayList<String> lisKeyWords = xml.getKeyWords();
                ArrayList<String> lisKind = xml.getKind();
                ArrayList<String> lisDescribe = xml.getDescribe();
                ArrayList<String> lisDate = xml.getDate();
                ArrayList<String> lisUrl = xml.getUrl();
                ArrayList<String> lisAuthor = xml.getAuthor();
                ArrayList<String> lisPublisher = xml.getPublisher();

                for (int i = 0; i < xml.getCount(); i++) {
                    Document doc = new Document();
                    // Field for the record id
                    doc.add(new Field("id", lisId.get(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Field for the title
                    doc.add(new Field("title", lisTitle.get(i), Field.Store.YES, Field.Index.ANALYZED));
                    // Field for the keywords
                    doc.add(new Field("keywords", lisKeyWords.get(i), Field.Store.YES, Field.Index.ANALYZED));
                    // Field for the kind
                    doc.add(new Field("kind", lisKind.get(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Field for the description
                    doc.add(new Field("describe", lisDescribe.get(i), Field.Store.YES, Field.Index.ANALYZED));
                    // Field for the date
                    doc.add(new Field("date", lisDate.get(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Field for the URL
                    doc.add(new Field("url", lisUrl.get(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Field for the author
                    doc.add(new Field("author", lisAuthor.get(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Field for the publisher
                    doc.add(new Field("publisher", lisPublisher.get(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Must stay inside the loop: doc is per-record.
                    writer.addDocument(doc);
                }
            } finally {
                // Close even when indexing fails so the writer is not leaked.
                writer.close();
            }
            System.out.println("索引创建完毕");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point: builds the index once.
     *
     * @param args unused
     * @throws DocumentException declared for dom4j-based XML reading
     */
    public static void main(String[] args) throws DocumentException {
        CreatIndex index = new CreatIndex();
        index.creatIndex();
    }
}

这里不多说，最上面的每个list里面都藏有巨大的信息，都是一些字符串，就当是放到容器里的字符串吧。然后下面建立索引的过程都是一样的，代码比较短，就不需要什么注释了，呵呵。