LargeNGramModel API 语言模型

来源:互联网 发布:局域网视频网站源码 编辑:程序博客网 时间:2024/06/07 06:21
<span style="font-size:24px;">public class LargeNGramModel implements LanguageModel, BackoffLanguageModel使用一个二进制NGram(n元)语言模型文件(“DMP 文件”)的语言模型。此语言模型是通过SphinxBase sphinx_lm_convert产生的。本类的属性:@S4String(mandatory = false)public static final String PROP_QUERY_LOG_FILE = "queryLogFile";记录了所有询问N-grams的文件名属性。如果此属性值为null,则它意味着询问的N-grams没有被记录。  @S4Integer(defaultValue = 100000)public static final String PROP_NGRAM_CACHE_SIZE = "ngramCacheSize";属性定义了能缓存的ngrams的最大的数目即个数。@S4Boolean(defaultValue = false)public static final String PROP_CLEAR_CACHES_AFTER_UTTERANCE = "clearCachesAfterUtterance";属性用于控制在每一个utterance后是否清空ngram缓存。@S4Double(defaultValue = 1.0f)public final static String PROP_LANGUAGE_WEIGHT = "languageWeight";属性定义了搜索所使用的语言权重。@S4Component(type = LogMath.class)public final static String PROP_LOG_MATH = "logMath";属性定义了logmath组件。@S4Boolean(defaultValue = false)public final static String PROP_APPLY_LANGUAGE_WEIGHT_AND_WIP = "applyLanguageWeightAndWip";属性用于控制语言模型是否将会应用语言权重和字插入概率。@S4Double(defaultValue = 1.0f)public final static String PROP_WORD_INSERTION_PROBABILITY = "wordInsertionProbability";字插入概率属性。@S4Boolean(defaultValue = false)public final static String PROP_FULL_SMEAR = "fullSmear";如果为true,使用全bigram信息来决定smear。public static final int BYTES_PER_NGRAM = 4;public static final int BYTES_PER_NMAXGRAM = 2;由the CMU-Cambridge Statistical Language Modeling Toolkit统计语言模型工具产生的语言模型文件中每一个N-gram所占的字节数。private final static int SMEAR_MAGIC = 0xC0CAC01A;smear魔法数。事情将会更好。配置数据:URL location;    protected Logger logger;    protected LogMath logMath;    protected int maxDepth;    protected int ngramCacheSize;    protected boolean clearCacheAfterUtterance;    protected boolean fullSmear;    protected Dictionary dictionary;    protected String format;    protected boolean applyLanguageWeightAndWip;    protected float languageWeight;    protected float unigramWeight;    protected double wip;统计数据:private int ngramMisses;private int ngramHits;private int smearTermCount;protected String 
ngramLogFile;子组件:private BinaryLoader loader;private PrintWriter logFile;工作时数据:private Map<Word, UnigramProbability> unigramIDMap;    private Map<WordSequence, NGramBuffer>[] loadedNGramBuffers;    private LRUCache<WordSequence, ProbDepth> ngramDepthCache;    private Map<Long, Float> bigramSmearMap;    private NGramBuffer[] loadedBigramBuffers;    private UnigramProbability[] unigrams;    private int[][] ngramSegmentTable;    private float[][] ngramProbTable;    private float[][] ngramBackoffTable;    private float[] unigramSmearTerm;本类的构造方法:public LargeNGramModel( String format, URL location, String ngramLogFile,int maxNGramCacheSize, boolean clearCacheAfterUtterance, int maxDepth,  LogMath logMath, Dictionary dictionary,boolean applyLanguageWeightAndWip, float languageWeight,double wip, float unigramWeight, boolean fullSmear );给定参数创建对象。public LargeNGramModel();空的构造方法。本类的方法:public void newProperties(PropertySheet ps);对属性进行设置。public void allocate();分配资源。private void buildUnigramIDMap(Dictionary dictionary);建立word与UnigramProbability对的map即对unigramIDMap进行了设置。往其中放入word与UnigramProbability对。public void start();在识别之前调用。public void stop();在识别后调用。在本方法中清空了缓存和logfile。private void clearCache();清空ngram缓存。public ProbDepth getProbDepth(WordSequence wordSequence);返回预测的概率和深度。用于较高阶的ngrams。wordSequence为字序列用来获得概率。private ProbDepth getUnigramProbDepth(WordSequence wordSequence) ;返回的是给定unigram的unigram概率。参数:wordSequence 为unigram字序列。返回的是unigram的概率。返回的是wordSequence 中第一个字对应的unigram的概率即ProbDepth 。public float getProbability(WordSequence wordSequence);获得字序列的ngram概率。概率是在log域的。private NGramProbability findNGram(WordSequence wordSequence);返回的是一个给定ngram的NGramProbability。wordSequence为要装载的ngram。本方法查找或装载给定ngram的NGramProbability。private boolean is32bits();指明模型是16位的还是32位的。是32位返回为true。private NGramBuffer loadNGramBuffer(WordSequence ws);返回的是给定字序列的所有NGram跟随者的一个NGramBuffer对象,ws为n-1gram用来查找跟随者。本方法把给定的n-1gram的所有ngram跟随者装载入一个缓存中。private NGramBuffer getBigramBuffer(int 
firstWordID);返回给定字的bigrams。输入参数:firstWordID为字的id。返回的是字的bigrams。private NGramBuffer getNGramBuffer(WordSequence wordSequence);返回的是给定字序列的ngrams,wordSequence用来得到缓存。返回的是字序列的ngrambuffer。  private int getFirstNGramEntry(NGramProbability nMinus1Gram, int firstNMinus1GramEntry, int n);返回的是给定n-1gram的第一个ngram项的索引。输入参数:nMinus1Gram为我们所查找的首个ngram项的n-1gram。firstNMinus1GramEntry为在考虑的n-1gram的第一个n-1gram项。n为ngram的阶。private ProbDepth getUnigramProbDepth(WordSequence wordSequence);返回的是给定unigram的unigram概率。wordSequence为unigram字序列。private UnigramProbability getUnigram(Word unigram);如果此语言模型有给定的unigram,则返回它的UnigramProbability。否则为null。unigram为要查找的unigram。private UnigramProbability getUnigram(Word unigram);如果此语言模型有给定的unigram,则返回它的UnigramProbability 。通过给定字来获得相应的UnigramProbability 对象。private boolean hasUnigram(Word unigram) ;如果此语言模型包括输入的unigram 字,则返回为true。否则为false。public final int getWordID(Word word);返回给定字的id。public float getSmearOld(WordSequence wordSequence);得到给定字序列的smear项。返回的是与此字序列相关联的smear term。public float getSmear(WordSequence wordSequence);返回与输入相关的smear term。private int getNumberBigramFollowers(int wordID);返回的是一个字的bigrams跟随者的数目即个数。wordID 为字的id。返回的是bigram跟随者的数目。public int getMaxDepth() ;返回的是语言模型的最大深度。即maxDepth属性。public Set<String> getVocabulary();返回的是在语言模型中的字拼写的集合。此集合是不可改变的。String 为字的拼写。public int getNGramMisses();返回当一个ngram被询问,但是在语言模型中却不存在此ngram的次数。在这种情况下它使用的是backoff概率。返回的是ngram未命中的次数,即ngramMisses属性。public int getNGramHits();返回的是ngram 命中的次数,即ngramHits属性。private NGramBuffer getBigramBuffer(int firstWordID);返回的是给定字的bigrams。firstWordID为字的id。返回的是存储bigrams的ngrambuffer。private NGramBuffer loadTrigramBuffer(int firstWordID, int secondWordID);把给定bigram的所有trigram跟随者装载入缓存。输入参数:firstWordID为首字id,secondWordID为第二个字id。返回的是存储trigram跟随者的ngrambuffer。private void buildSmearInfo();创建smear信息。private void dumpProbs(double[] ugNumerator, double[] ugDenominator, int i,int j, float logugprob, float logbgprob, double ugprob,double bgprob, double backoffbgprob, 
double logbackoffbgprob);打印出概率信息。private void writeSmearInfo(String filename);把smear信息写入到指定的文件中。private void readSmearInfo(String filename);从给定文件中读取smear信息。private void putSmearTerm(int word1, int word2, float smearTerm);为2个字放置smear term。private Float getSmearTerm(int word1, int word2);获得2个字的smear term。private float getBigramProb(int word1, int word2);获得2个给定字的bigram概率。public void deallocate();释放相应的资源。调用了loader.deallocate()方法。private void readSmearInfo(String filename);从给定文件中读取smear信息。</span>

0 0
原创粉丝点击