自然语言处理基于java实现(4) 之 基于VSM模型的信息检索程序

来源:互联网 发布:java程序开发培训中心 编辑:程序博客网 时间:2024/04/29 14:13

一. 题目如下:
依据实验3建立的索引文件,实现一个基于VSM模型的信息检索程序。

解释下,如何实现一个基于VSM模型的信息检索程序?

                //      段落序号,词条, tf-idf值   的二维空间表    private final Table<Integer,String,Double> vectorTable = HashBasedTable.create();

没错,简单来说,就是给定一个检索语句,把它同样表示成tf-idf向量,然后从这张二维空间表中找出与它余弦相似度最高(即最佳匹配)的段落。这种检索模型就叫向量空间模型(VSM,Vector Space Model)。

二. 实现步骤
1. 构造数据结构
2. 实现算法
3. 没了,做个测试吧

三. 源代码
1.二维空间表数据结构(准确地讲,它已经实现了检索算法)

package experiment4;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;

/**
 * A vector-space index: a two-dimensional table of tf-idf weights keyed by
 * (paragraph index, term), together with the cosine-similarity search that
 * runs against it.
 */
public class VectorSpace {

    // Two-dimensional table: (paragraph index, term) -> tf-idf weight.
    private final Table<Integer, String, Double> vectorTable = HashBasedTable.create();

    protected VectorSpace() {
    }

    /**
     * VSM retrieval: finds the row (paragraph) whose weight vector has the
     * highest cosine similarity to the query vector.
     *
     * @param words query vector, term -> weight
     * @return the row key of the best-matching paragraph, or {@code 0} when no
     *         row has a similarity strictly greater than zero (including when
     *         the table is empty)
     */
    public int VSM(Map<String, Double> words) {
        int index = 0;
        double max = 0;
        for (int rkey : vectorTable.rowKeySet()) {
            double cos = cos(vectorTable.row(rkey), words);
            // Strict comparison: on ties the first row encountered wins.
            if (cos > max) {
                max = cos;
                index = rkey;
            }
        }
        return index;
    }

    /**
     * Computes the cosine similarity of two sparse vectors:
     * {@code dot(a, b) / (|a| * |b|)}. Terms missing from a map count as 0.
     *
     * @param map1 first vector, term -> weight
     * @param map2 second vector, term -> weight
     * @return the cosine similarity, or {@code 0} when either vector has zero
     *         length (the similarity is undefined in that case)
     */
    public double cos(Map<String, Double> map1, Map<String, Double> map2) {
        Set<String> set = new HashSet<>();
        set.addAll(map1.keySet());
        set.addAll(map2.keySet());

        double fenzi = 0;   // numerator: dot product
        double fenmu1 = 0;  // squared norm of map1
        double fenmu2 = 0;  // squared norm of map2
        for (String word : set) {
            double x1 = map1.getOrDefault(word, 0.0);
            double x2 = map2.getOrDefault(word, 0.0);
            fenzi += x1 * x2;
            fenmu1 += x1 * x1;
            fenmu2 += x2 * x2;
        }

        // The denominator is the PRODUCT of the two norms (not their sum).
        double norm = Math.sqrt(fenmu1) * Math.sqrt(fenmu2);
        if (norm == 0) {
            // Avoid 0/0 = NaN for empty or all-zero vectors.
            return 0;
        }
        return fenzi / norm;
    }

    /**
     * Returns the backing table itself (not a copy); callers populate the
     * index through it.
     */
    public Table<Integer, String, Double> getVectorTable() {
        return vectorTable;
    }
}

2.简化下操作,写个VectorSpaces类

package experiment4;

import java.util.HashMap;
import java.util.Map;

import experiment1.Thesaurus;
import experiment2.HMM;
import experiment2.HMMFactory;
import experiment3.Term;
import experiment3.TermList;
import util.FileRW;

/**
 * Strictly speaking this is a combined lexicon class covering experiments
 * 1, 2, 3 and 4, although the original intent was only to make it a utility
 * class for {@link VectorSpace}.
 */
public final class VectorSpaces {

    // Registered statistics, keyed by the path passed to create().
    private static final Map<String, WordStatistics> map = new HashMap<>();

    /**
     * Builds the word statistics for the file at {@code path} and registers
     * them under that same path. Calling it again with the same path replaces
     * the earlier entry.
     *
     * @param path file to read; also the name to pass to {@link #VSM}
     */
    public static void create(String path) {
        map.put(path, new WordStatistics(FileRW.read(path)));
    }

    /**
     * Runs a VSM query against a previously created statistics set.
     *
     * @param name  the path that was passed to {@link #create}
     * @param words the query sentence
     * @return whatever {@code TermList.indexOfDocument} yields for the
     *         best-matching paragraph index, or {@code null} when nothing is
     *         registered under {@code name}
     */
    public static String VSM(String name, String words) {
        WordStatistics ws = map.get(name);
        if (ws == null) {
            return null;
        }

        // Count how often each segmented token occurs in the query.
        Map<String, Integer> counts = new HashMap<>();
        for (String token : ws.thesaurus.spitWord(words)) {
            counts.merge(token, 1, Integer::sum);
        }

        // Turn the counts into a tf-idf query vector.
        Map<String, Double> vsm = new HashMap<>();
        counts.forEach((term, times) ->
                vsm.put(term, tf_idf(ws.termList, ws.thesaurus, term, times)));

        return ws.termList.indexOfDocument(ws.vec.VSM(vsm));
    }

    /**
     * Computes the tf-idf weight {@code log(1 + times) * log(N / (ndoc + 1))}.
     *
     * <p>The two logarithms are multiplied rather than nested: nesting them
     * produces NaN or -Infinity whenever the idf logarithm is not positive.
     *
     * @param termList  source of the term entry for {@code word}
     * @param thesaurus supplies N via {@code numOfSize()}
     *                  (presumably the corpus size; verify against Thesaurus)
     * @param word      the term
     * @param times     occurrence count of the term
     * @return the weight, or {@code 0} when {@code termList} has no entry for
     *         {@code word}
     */
    private static double tf_idf(TermList termList, Thesaurus thesaurus, String word, int times) {
        Term term = termList.getTerm(word);
        if (term == null) {
            return 0;
        }
        double tf = Math.log(1 + times);
        // "* 1.0" forces floating-point division; "+ 1" smooths the denominator.
        double idf = Math.log(thesaurus.numOfSize() * 1.0 / (term.getNdoc() + 1));
        return tf * idf;
    }

    /**
     * Combined word-statistics store built from one text.
     */
    protected static class WordStatistics {
        public final VectorSpace vec;
        public final TermList termList;
        public final HMM hmm;
        public final Thesaurus thesaurus;

        /**
         * Builds the thesaurus, HMM and term list from {@code content}, then
         * fills the vector space with one tf-idf weight per
         * (key of the term's map, term word) pair.
         *
         * @param content the text to analyse
         */
        public WordStatistics(String content) {
            thesaurus = new Thesaurus(content);
            hmm = HMMFactory.createHMM(content);
            termList = new TermList(content);
            vec = new VectorSpace();
            for (Term term : termList.getList()) {
                // presumably key = paragraph index, value = occurrence count there; confirm in Term
                term.getMap().forEach((key, value) -> {
                    vec.getVectorTable().put(key, term.getWord(),
                            tf_idf(termList, thesaurus, term.getWord(), value));
                });
            }
        }
    }
}

3.测试一下

package test;

import experiment4.VectorSpaces;

/**
 * Manual smoke test: builds the statistics for one corpus file and prints the
 * VSM result for four sample queries, with separator lines in between.
 */
public class Test4 {

    public static void main(String[] args) {
        final String corpus = "199801.txt";
        VectorSpaces.create(corpus);

        printMatch(corpus, "美国政府对朝鲜实行打压政策");
        System.out.println("=============================================================================");
        printMatch(corpus, "在这一年中,中国的改革开放和现代化建设继续向前迈进。国民经济保持了“高增长、低通胀”的良好发展态势。");
        System.out.println("==============================================================================");
        printMatch(corpus, "中国收复台湾指日可待");
        System.out.println("==============================================================================");
        printMatch(corpus, "我曾经想画幅漫画位能把各种人造卫星送上太空的中国专家,低头拿出国产圆珠笔要在文件上签字,把纸划破了,字还是写不出来");
    }

    /** Prints the VSM result for a single query against the given corpus. */
    private static void printMatch(String corpus, String query) {
        System.out.println(VectorSpaces.VSM(corpus, query));
    }
}

程序源码,在第一篇开篇处

1 0
原创粉丝点击