An Analysis of Lucene Search Engine Internals (a Code Scenario Analysis)


6. The Test Main Program

Convention: code set in bold black marks the lines that will be analyzed in depth.

try {
    Directory directory = new RAMDirectory();
    Analyzer analyzer = new SimpleAnalyzer();
    IndexWriter writer = new IndexWriter(directory, analyzer, true);

    String[] docs = {
        "a b c d e",
        "a b c d e a b c d e",
        "a b c d e f g h i j",
        "a c e",
        "e c a",
        "a c e a c e",
        "a c e a b c"
    };

    for (int j = 0; j < docs.length; j++) {
        Document d = new Document();
        d.add(Field.Text("contents", docs[j]));
        writer.addDocument(d);
    }
    writer.close();

The code above is the preparation step: it builds the index.
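As a side note on what the writer actually indexes: SimpleAnalyzer splits on non-letter characters and lowercases, so "a b c d e" becomes the tokens a, b, c, d, e at positions 0 through 4. A minimal standalone sketch, assuming the 1.4-era analysis API:

    import java.io.StringReader;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class AnalyzerDemo {
        public static void main(String[] args) throws Exception {
            // Print the tokens SimpleAnalyzer produces for one test document.
            TokenStream ts = new SimpleAnalyzer()
                .tokenStream("contents", new StringReader("a b c d e"));
            for (Token t = ts.next(); t != null; t = ts.next())
                System.out.println(t.termText());   // prints a, b, c, d, e
            ts.close();
        }
    }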

Searcher searcher = new IndexSearcher(directory);

The line above initializes the searcher; it is analyzed in section 1.1.

    String[] queries = {
        "\"a c e\"",
    };

    Hits hits = null;
    QueryParser parser = new QueryParser("contents", analyzer);
    parser.setPhraseSlop(0);

    for (int j = 0; j < queries.length; j++) {
        Query query = parser.parse(queries[j]);

Here query is a PhraseQuery.

System.out.println("Query: " + query.toString("contents"));

hits = searcher.search(query);

The search call above is analyzed in section 1.2.

        System.out.println(hits.length() + " total results");
        for (int i = 0; i < hits.length() && i < 10; i++) {
            Document d = hits.doc(i);
            System.out.println(i + " " + hits.score(i)
                // + " " + DateField.stringToDate(d.get("modified"))
                + " " + d.get("contents"));
        }
    }
    searcher.close();
} catch (Exception e) {
    System.out.println(" caught a " + e.getClass() +
        "\n with message: " + e.getMessage());
}


Query results:

Query: "a c e"

3 total results

0 1.0 a c e a c e

1 0.9428091 a c e

2 0.7071068 a c e a b c


1.1. Searcher searcher = new IndexSearcher(directory)

1.1.1. Initialization

An index searcher is created from the directory. The call chain:

IndexSearcher ::
    public IndexSearcher(Directory directory) throws IOException {
        this(IndexReader.open(directory), true);
    }

which delegates to

    private IndexSearcher(IndexReader r, boolean closeReader) {
        reader = r;
        this.closeReader = closeReader;
    }

IndexReader.open(directory) in turn runs:

IndexReader ::
    private static IndexReader open(final Directory directory, final boolean closeDirectory) throws IOException {
        synchronized (directory) {                     // in- & inter-process sync
            return (IndexReader)new Lock.With(
                directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
                IndexWriter.COMMIT_LOCK_TIMEOUT) {
                public Object doBody() throws IOException {
                    SegmentInfos infos = new SegmentInfos();
                    infos.read(directory);             // read the SegmentInfos from the directory
                    if (infos.size() == 1) {           // index is optimized
                        return new SegmentReader(infos, infos.info(0), closeDirectory);
                    } else {
                        IndexReader[] readers = new IndexReader[infos.size()];
                        for (int i = 0; i < infos.size(); i++)
                            readers[i] = new SegmentReader(infos.info(i));
                        return new MultiReader(directory, infos, closeDirectory, readers);
                    }
                }
            }.run();
        }
    }

At this point the segments file has been read and the segment information obtained. The test index contains a single segment, so return new SegmentReader(infos, infos.info(0), closeDirectory); is executed. Keep in mind that from here on, IndexReader = SegmentReader.

infos.read(directory):

/** Reads the segments file under the given directory.
 *  Code walkthrough:
 *  1. Read the format value. A value below 0 means the file carries explicit format
 *     information; below -1 the format is unknown, since the lowest defined format is -1.
 *  2. If the value is below 0, read the version and the segment counter next.
 *  3. If it is 0 or above, the head of the segments file holds no version information,
 *     only the segment counter (the value just read is the counter itself).
 *  4. Read the number of segments.
 *  5. Loop over the segments, build a SegmentInfo object for each, and add them all to
 *     the segment collection.
 *  6. If the format value was 0 or above, check whether a version is stored at the end
 *     of the file; if so assign it to version, otherwise version = 0. */

This code is straightforward; readers can follow it in the src tree.
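For reference, here is a simplified paraphrase of those six steps (condensed from the 1.4-era SegmentInfos.read; not a verbatim copy):

    public final void read(Directory directory) throws IOException {
        InputStream input = directory.openFile("segments");
        try {
            int format = input.readInt();                // step 1: format marker
            if (format < 0) {                            // explicit format info present
                if (format < FORMAT)
                    throw new IOException("Unknown format version: " + format);
                version = input.readLong();              // step 2: version ...
                counter = input.readInt();               // ... and segment counter
            } else {
                counter = format;                        // step 3: old format, value is the counter
            }
            for (int i = input.readInt(); i > 0; i--) {  // steps 4-5: count, then each segment
                SegmentInfo si = new SegmentInfo(input.readString(), input.readInt(), directory);
                addElement(si);
            }
            if (format >= 0) {                           // step 6: version may trail the file
                if (input.getFilePointer() >= input.length())
                    version = 0;                         // old format without a version number
                else
                    version = input.readLong();
            }
        } finally {
            input.close();
        }
    }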

return new SegmentReader(infos, infos.info(0), closeDirectory);

SegmentReader ::
    SegmentReader(SegmentInfos sis, SegmentInfo si, boolean closeDir)
            throws IOException {
        super(si.dir, sis, closeDir);
        initialize(si);
    }

super(si.dir, sis, closeDir);

IndexReader ::
    IndexReader(Directory directory, SegmentInfos segmentInfos, boolean closeDirectory) {
        this.directory = directory;
        this.segmentInfos = segmentInfos;
        directoryOwner = true;
        this.closeDirectory = closeDirectory;
        stale = false;
        hasChanges = false;
        writeLock = null;
    }

SegmentReader :: initialize(si);

/** Initializes this segment. The method:
 *  1. reads the field info (field names only), and
 *  2. opens the stored-field data and stored-field index files.
 */
private void initialize(SegmentInfo si) throws IOException {
    segment = si.name;

    // Use compound file directory for some files, if it exists
    Directory cfsDir = directory();                 // the directory holding this segment
    // CompoundFileReader is itself a subclass of Directory
    if (directory().fileExists(segment + ".cfs")) {
        cfsReader = new CompoundFileReader(directory(), segment + ".cfs");
        cfsDir = cfsReader;
    }

    // 1. read the field info (field names only)
    fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");  // all field info is read in here

    // 2. open the stored-field data and stored-field index files
    fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos);

    tis = new TermInfosReader(cfsDir, segment, fieldInfos);

    if (hasDeletions(si))
        deletedDocs = new BitVector(directory(), segment + ".del");  // read the deletions bit vector

    freqStream = cfsDir.openFile(segment + ".frq");  // open the frequency file
    proxStream = cfsDir.openFile(segment + ".prx");  // open the positions file
    openNorms(cfsDir);  // open the norm files segment.f1, segment.f2, ..., building a hashtable
    if (fieldInfos.hasVectors()) {                   // open term vector files only as needed
        termVectorsReader = new TermVectorsReader(cfsDir, segment, fieldInfos);
    }
}
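To see what initialize(si) opens, one can list the directory's files after indexing. A hedged usage sketch (directory is the RAMDirectory from the main program; it assumes the non-compound, 1.4-era file layout, and the segment name "_a" below is only illustrative):

    // List the files backing the freshly built index.
    String[] files = directory.list();
    for (int i = 0; i < files.length; i++)
        System.out.println(files[i]);
    // Typical entries for one segment "_a":
    //   _a.fnm            field names
    //   _a.fdx / _a.fdt   stored-field index / data
    //   _a.tis / _a.tii   term infos / term-info index
    //   _a.frq            frequencies        _a.prx   positions
    //   _a.f0, _a.f1 ...  norms, one file per field
    //   segments          the segment table read by infos.read(directory)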

1.2. hits = searcher.search(query);

Here searcher is the IndexSearcher. Tracing the call:

calls:  return search(query, (Filter)null);
calls:  return new Hits(this, query, filter);
calls:

Hits ::
    Hits(Searcher s, Query q, Filter f) throws IOException {
        query = q;
        searcher = s;
        filter = f;
        getMoreDocs(50);            // retrieve 100 initially
    }
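The pairing of getMoreDocs(50) with the comment "retrieve 100 initially" is explained inside getMoreDocs, which doubles the requested minimum before searching. A paraphrase of the 1.4-era method (the caching of returned ScoreDocs is elided):

    private final void getMoreDocs(int min) throws IOException {
        if (hitDocs.size() > min)
            min = hitDocs.size();
        int n = min * 2;                                   // double # retrieved
        TopDocs topDocs = searcher.search(query, filter, n);
        length = topDocs.totalHits;
        // ... append the returned ScoreDocs to the hitDocs cache ...
    }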

getMoreDocs(int min) then calls:  TopDocs topDocs = searcher.search(query, filter, n);
and searcher.search(query, filter, n) calls:  Scorer scorer = query.weight(this).scorer(reader);

IndexSearcher ::
    public TopDocs search(Query query, Filter filter, final int nDocs)
            throws IOException {
        Scorer scorer = query.weight(this).scorer(reader);
        if (scorer == null)
            return new TopDocs(0, new ScoreDoc[0]);

        final BitSet bits = filter != null ? filter.bits(reader) : null;
        final HitQueue hq = new HitQueue(nDocs);
        final int[] totalHits = new int[1];
        scorer.score(new HitCollector() {
            private float minScore = 0.0f;
            public final void collect(int doc, float score) {
                if (score > 0.0f &&                       // ignore zeroed buckets
                    (bits == null || bits.get(doc))) {    // skip docs not in bits
                    totalHits[0]++;
                    if (hq.size() < nDocs || score >= minScore) {
                        hq.insert(new ScoreDoc(doc, score));
                        minScore = ((ScoreDoc)hq.top()).score;  // maintain minScore
                    }
                }
            }
        });

        ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
        for (int i = hq.size()-1; i >= 0; i--)            // put docs in array
            scoreDocs[i] = (ScoreDoc)hq.pop();
        return new TopDocs(totalHits[0], scoreDocs);
    }
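HitQueue is the priority queue that retains only the best nDocs hits; its least element sits on top, which is why hq.top() supplies the running minScore cut-off. A sketch of its ordering, consistent with the 1.4-era source (score ties are broken in favor of the smaller document number):

    final class HitQueue extends PriorityQueue {
        HitQueue(int size) {
            initialize(size);                   // allocate the heap
        }
        protected final boolean lessThan(Object a, Object b) {
            ScoreDoc hitA = (ScoreDoc)a;
            ScoreDoc hitB = (ScoreDoc)b;
            if (hitA.score == hitB.score)
                return hitA.doc > hitB.doc;     // tie: larger doc id ranks lower
            return hitA.score < hitB.score;     // otherwise lower score ranks lower
        }
    }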

1.2.1. Scorer scorer = query.weight(this).scorer(reader);

Parameter analysis: query = PhraseQuery (initialized by Query query = parser.parse(queries[j]); in the main test program); this = IndexSearcher (already initialized with the main index files open; see 1.1).

From the code:

(1) PhraseQuery ::
    protected Weight createWeight(Searcher searcher) {
        if (terms.size() == 1) {              // optimize one-term case
            Term term = (Term)terms.elementAt(0);
            Query termQuery = new TermQuery(term);
            termQuery.setBoost(getBoost());
            return termQuery.createWeight(searcher);
        }
        return new PhraseWeight(searcher);
    }
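A quick illustration of the one-term branch (a hedged sketch; searcher stands for any open Searcher such as the one from 1.1):

    // A phrase of a single term is weighted exactly like a plain TermQuery.
    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("contents", "a"));
    Weight w = pq.weight(searcher);   // internally: new TermQuery(...).createWeight(searcher)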


query.weight(this) creates the PhraseWeight(searcher), so Scorer scorer = query.weight(this).scorer(reader) amounts to new PhraseWeight(searcher).scorer(reader), i.e. the following code runs:

(2) PhraseWeight ::
    public Scorer scorer(IndexReader reader) throws IOException {
        if (terms.size() == 0)               // optimize zero-term case
            return null;

        // read each term's position information
        TermPositions[] tps = new TermPositions[terms.size()];
        for (int i = 0; i < terms.size(); i++) {
            TermPositions p = reader.termPositions((Term)terms.elementAt(i));
            if (p == null)
                return null;
            tps[i] = p;
        }
        // now every term's postings are in hand: TermPositions[] = SegmentTermPositions[]

        if (slop == 0)                       // optimize exact case
            return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
                                         reader.norms(field));
        // (the slop > 0 branch, which returns a SloppyPhraseScorer, is omitted here;
        //  this test calls setPhraseSlop(0))
    }


TermPositions p = reader.termPositions((Term)terms.elementAt(i));

Here the Term text is one of the terms from the query.

public TermPositions termPositions(Term term) throws IOException {
    TermPositions termPositions = termPositions();
    termPositions.seek(term);
    return termPositions;
}

termPositions() ::

SegmentReader ::
    public final TermPositions termPositions() throws IOException {
        return new SegmentTermPositions(this);
    }

parent = SegmentReader, i.e. the segment reader opened earlier
tis = new TermInfosReader(cfsDir, segment, fieldInfos), i.e. the term-info reader

SegmentTermPositions(this) ::

SegmentTermPositions ::
    SegmentTermPositions(SegmentReader p) throws IOException {
        super(p);
        this.proxStream = (InputStream)parent.proxStream.clone();
    }

super(p) ::

SegmentTermDocs ::
    SegmentTermDocs(SegmentReader parent)
            throws IOException {
        this.parent = parent;
        this.freqStream = (InputStream)parent.freqStream.clone();
        this.deletedDocs = parent.deletedDocs;
        this.skipInterval = parent.tis.getSkipInterval();
    }

termPositions.seek(term);

public void seek(Term term) throws IOException {
    // look up the term's TermInfo in the term-info reader; this method is thread-safe
    TermInfo ti = parent.tis.get(term);
    seek(ti);
}

seek(TermInfo ti)

Resets this SegmentTermDocs to the information of the term just read:

void seek(TermInfo ti) throws IOException {
    count = 0;
    if (ti == null) {
        df = 0;
    } else {
        df = ti.docFreq;
        doc = 0;
        skipDoc = 0;
        skipCount = 0;
        numSkips = df / skipInterval;
        freqPointer = ti.freqPointer;
        proxPointer = ti.proxPointer;
        skipPointer = freqPointer + ti.skipOffset;
        freqStream.seek(freqPointer);
        haveSkipped = false;
    }
}
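To make the seek/next protocol concrete, here is a hedged usage sketch that walks the postings of the term "a" in the test index (reader is the SegmentReader opened in 1.1; the methods are those of the 1.4-era TermPositions interface):

    TermPositions tp = reader.termPositions(new Term("contents", "a"));
    while (tp.next()) {                                 // advance document by document
        System.out.print("doc " + tp.doc() + " freq " + tp.freq() + " positions:");
        for (int i = 0; i < tp.freq(); i++)
            System.out.print(" " + tp.nextPosition());  // in-document term positions
        System.out.println();
    }
    tp.close();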

new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), reader.norms(field));

This invokes the constructor

ExactPhraseScorer ::
    ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
                      byte[] norms) throws IOException {
        super(weight, tps, positions, similarity, norms);
    }

whose superclass constructor wraps each term's frequency and position information and builds a priority queue:

PhraseScorer ::
    PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
                 byte[] norms) {
        super(similarity);
        this.norms = norms;
        this.weight = weight;
        this.value = weight.getValue();

        // convert tps to a list:
        // chain the PhrasePositions into an ordinary linked list
        for (int i = 0; i < tps.length; i++) {
            PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
            if (last != null) {              // add next to end of list
                last.next = pp;
            } else
                first = pp;
            last = pp;
        }

        pq = new PhraseQueue(tps.length);    // construct empty pq
    }
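The PhraseQueue constructed here orders the PhrasePositions first by document number and then by position within the document, so the term lagging furthest behind is always on top. A sketch consistent with the 1.4-era source:

    final class PhraseQueue extends PriorityQueue {
        PhraseQueue(int size) {
            initialize(size);
        }
        protected final boolean lessThan(Object o1, Object o2) {
            PhrasePositions pp1 = (PhrasePositions)o1;
            PhrasePositions pp2 = (PhrasePositions)o2;
            if (pp1.doc == pp2.doc)
                return pp1.position < pp2.position;   // same doc: earlier position first
            return pp1.doc < pp2.doc;                 // otherwise smaller doc first
        }
    }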

The scorer then scores the matching documents and hands them to the collector:

scorer.score(new HitCollector() ...)

public void score(HitCollector hc) throws IOException {
    while (next()) {
        hc.collect(doc(), score());
    }
}
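Inside next(), ExactPhraseScorer lines all terms up on the same document and counts exact occurrences. The idea: shift each term's positions left by the term's offset in the phrase; a document position shared by every shifted list marks one exact match. A standalone illustration of that alignment (a hypothetical helper, not Lucene source):

    import java.util.HashSet;
    import java.util.Set;

    public class ExactPhraseFreqDemo {
        // positionsPerTerm[i] = positions of the phrase's i-th term in one document.
        static int phraseFreq(int[][] positionsPerTerm) {
            Set<Integer> starts = new HashSet<Integer>();
            for (int p : positionsPerTerm[0])
                starts.add(p);                      // term 0 has offset 0
            for (int i = 1; i < positionsPerTerm.length; i++) {
                Set<Integer> shifted = new HashSet<Integer>();
                for (int p : positionsPerTerm[i])
                    shifted.add(p - i);             // shift by the term's phrase offset
                starts.retainAll(shifted);          // keep starts where this term aligns
            }
            return starts.size();
        }

        public static void main(String[] args) {
            // "a c e a c e": a at 0,3; c at 1,4; e at 2,5 -> "a c e" occurs twice
            int[][] doc = { {0, 3}, {1, 4}, {2, 5} };
            System.out.println(phraseFreq(doc));    // prints 2
        }
    }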

hc.collect(doc(), score());

score() is called next; value is the weight's value:

PhraseScorer ::
    public float score() throws IOException {
        //System.out.println("scoring " + first.doc);
        float raw = getSimilarity().tf(freq) * value;            // raw score
        return raw * Similarity.decodeNorm(norms[first.doc]);    // normalize
    }
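This formula accounts for the ratios in the query results above. With the default Similarity, tf(freq) = sqrt(freq). Hits 0 ("a c e a c e") and 2 ("a c e a b c") are both six tokens long, so they share the same norm and the same weight value; their phrase frequencies are 2 and 1, so their scores differ only by tf: sqrt(1)/sqrt(2) ≈ 0.7071, exactly the printed ratio 0.7071068 / 1.0. (Hits divides all scores by the top score whenever that score exceeds 1.0, which is why the first hit shows 1.0.)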

collect() gathers each hit's document number and score:

public final void collect(int doc, float score) {
    if (score > 0.0f &&                       // ignore zeroed buckets
        (bits == null || bits.get(doc))) {    // skip docs not in bits
        totalHits[0]++;
        if (hq.size() < nDocs || score >= minScore) {
            hq.insert(new ScoreDoc(doc, score));
            minScore = ((ScoreDoc)hq.top()).score;  // maintain minScore
        }
    }
}

At this point the matching documents and their scores have been produced, already ranked and filtered as specified.