lucene-使用项向量

来源:互联网 发布:星际老男孩淘宝店没了 编辑:程序博客网 时间:2024/05/22 01:44

1、项向量是一组由项-频率对组成的集合。

1)假设文档中只包含cat和dog两个项,一共有2个文档。每个文档可以表示为图上的一个向量(有方向的线段),即一个向量对应一个文档。因为只有2个项,所以向量空间是二维的:cat为y轴,dog为x轴。向量从原点(0,0)出发,到(x,y)截止,其中x表示dog在该向量所表示的文档中出现的频率,y表示cat在该向量所表示的文档中出现的频率。

如果是3个文档,则有三个向量,这三个向量表示为3条从原点出发的直线,在第一象限。如果有3个项,5个文档,则表示为一个三维空间,空间内有5条向量,分别表示5个文档。

2)向量之间的夹角越小,这2个向量的特征就越相似,它们所表示的2个文档也就越相似。

2、查找相似书

1)

public class BooksLikeThis{

   public static void main()throws IOException{

       String indexDir=System.getProperty("index.dir");

       FSDirectory directory=FSDirectory.getDirectory(indexDir,false);

       IndexReader reader=IndexReader.open(directory);

       int numDocs=reader.maxDoc();

       BooksLikeThis blt=new BooksLinkThis(reader);

       for(int i=0;i<numDocs;i++){

           System.out.println();

           Document  doc=reader.document(i);

            System.out.println(doc.get("title"));

           //查找与这本书类似的书,遍历每一本书

           Document[] docs=blt.docsLike(i,10);

           if (docs.length==0){

               System.out.println("   None likethis");

           }

            for(int j=0;j<docs.length;j++){

               Document likeThisDoc=docs[j];

               System.out.println("->"+likeThisDoc.get("title"));

           }

       }        
   }

   private IndexReaderreader;

   private IndexSearchersearcher;

   publicBooksLinkeThis(IndexReader reader){

       this.reader=reader;

       searcher=newIndexSearcher(reader); 

   }

  

   public Document[]docsLike() throws IOException{

      Document doc=reader.document(id);

//对作者相同的书进行因子增强,一本书可以有多个作者

      String[] authors=doc.getValues("author");

      BooleanQuery authorquery=new BooleanQuery();

      for (int i=0;i<authors.length;i++){

           String author=authors[i];

            authorQuery.add(newTermQuery(new Term("author",author)),false,false);

     }

     authorQuery.setBoost(2.0f);

      //使用项向量,项为subject,getTermFreqVector得到项的频率

    TermFreqVector vector=reader.getTermFreqVector(id,"subject");

     BooleanQuerysubjectQuery=new BooleanQuery();

    for (int j=0;j<vector.size();j++){

          TermQuerytq=new TermQuery(new Term("subject",vector.getTerms()[j]));

         subjectQuery.add(tq,false,false);

    }

     //创造最终查询对象

    BooleanQuery likeThisQuery=new BooleanQuery();

    likeThisQuery.add(authorQuery,false,false);

     likeThisQuery.add(subjectQuery,false,false);

    

     likeThisQuery.add(newTermQuery(newTerm("isbn",dco.get("isbn"))),false,true); 

    Hitshits=searcher.search(likeThisQuery);   

    int size=max;

    if (max>hits.length()) size=hits.length();

   

     Document[]docs=new Document(size);

    for(int i=0;i<size;i++){

         docs[i]=hits.doc[i];

    }

    return docs;

   }

}

2)按向量夹角计算分类:如果一本书的主题中包括extreme、agile、methodology,则这本书属于/technology/computers/programming/methodology分类。

public void testCategorization() throws Exception{

   assertEquals("/technology/computers/programming/methodology",getCategory("extremeagilemethodology"));   

}

为每个类别建立向量

public class CategorizerTest extends testcase{

   Map categoryMap;

   protected void setUp()throws Exception{

      super.setUp();

      categoryMap=new TreeMap();

      buildCategoryVectors();

   }
}

private void buildCategoryVectors() throws IOException{

   IndexReader reader=IndexReader.open(directory);

    intmaxDoc=reader.maxDoc();

   

    for (inti=0;i<macDoc;i++){

       if (!reader.isDeleted(i)){

           Document doc=reader.document(i);

           String category=doc.get("category");

           Map vectorMap=(Map) categoryMap.get(category);

           if (vectorMap==null){ 

               vectorMap=new TreeMap();

               categoryMap.put(category,vectorMap);

           }

           TermFreqVectortermFreqVector=reader.getTermFreqVector(i,"subject");

          addTermFreqToMap(vectorMap,termFreqVector);//将文档各个项的频率加入

                                                      //到分类中。             

       }

    }

}

private void addTermFreqToMap(Map vectorMap,TermFreqVectortermFreq){

    String[] terms=termFreqVector.getTerms();

    int[] freqs=termFreqVector.getTermFrequencies();

     

    for (int i=0;i<term.length;i++){

         Stringterm=terms[i];    

         if (vectorMap.contiansKey(term)){

             Integer value=(Integer) vectorMap.get(term);

             vectorMap.put(term,newInteger(value.intValue()+freqs[i])); 

          }

         else {

             vectorMap.put(term,new Integer(freq[i]));

         }

    }

 }

得到新书与每个类别向量之间的夹角,找到最匹配的类别

private String getCategory(String subject){

   String[]words=subject.split(" ");

   IteratorcategoryIterator=categoryMap.keySet().iterator();

   doublebestAngle=Double.MAX_VALUE;

   StringbestCategory=null;

   while(categoryIterator.hasNext()){

         Stringcategory=(String) categoryIterator.next();

        double angle=computeAngle(words,category);

        if (angle<bestAngle){

             bestAngle=angle;

             bestCategory=category;

         }

      }

    return bestCategory;

}

计算向量夹角

private double computAngle(String[] words,String category){

    MapvectorMap=(Map) categoryMap.get(category);

   

    intdtProduct=0;

    intsumOfSquares=0;

    for (inti=0;i<words.length;i++){

         String word=words[i];

         int categoryWrodFreq=0;

         if (vectorMap.containsKey(word)){

              categoryWordFreq=((Integer)vectorMap.get(word)).intValue();

         }

         doProduct+=categoryWordFreq;

         sumOfSquares+=categoryWrodFreq*categoryWordFreq;

    }

    doubledenominator;

    if(sumOfSquares==words.length){

        denominator=sumOfSquares;

   }else{

        denominator=Math.sqrt(sumOfSquares)+Math.sqrt(words.length);

    }

    double ratio=dotProduct/denomiator;

    returnMath.acos(ratio);     

}

原创粉丝点击