LargeNGramModel API 语言模型

来源：互联网发布：局域网视频网站源码编辑：程序博客网时间：2024/06/07 06:21
<span style="font-size:24px;">public class LargeNGramModel implements LanguageModel, BackoffLanguageModel使用一个二进制NGram（n元）语言模型文件（“DMP 文件”）的语言模型。此语言模型是通过SphinxBase sphinx_lm_convert产生的。本类的属性：@S4String(mandatory = false)public static final String PROP_QUERY_LOG_FILE = "queryLogFile";用于记录所有被询问的N-grams的文件的文件名属性。如果此属性值为null，则它意味着被询问的N-grams没有被记录。  @S4Integer(defaultValue = 100000)public static final String PROP_NGRAM_CACHE_SIZE = "ngramCacheSize";属性定义了能缓存的ngrams的最大的数目即个数。@S4Boolean(defaultValue = false)public static final String PROP_CLEAR_CACHES_AFTER_UTTERANCE = "clearCachesAfterUtterance";属性用于控制在每一个utterance后是否清空ngram缓存。@S4Double(defaultValue = 1.0f)public final static String PROP_LANGUAGE_WEIGHT = "languageWeight";属性为搜索定义了语言权重。@S4Component(type = LogMath.class)public final static String PROP_LOG_MATH = "logMath";属性定义了logmath组件。@S4Boolean(defaultValue = false)public final static String PROP_APPLY_LANGUAGE_WEIGHT_AND_WIP = "applyLanguageWeightAndWip";属性用于控制语言模型是否将会应用语言权重和字插入概率。@S4Double(defaultValue = 1.0f)public final static String PROP_WORD_INSERTION_PROBABILITY = "wordInsertionProbability";字插入概率属性。@S4Boolean(defaultValue = false)public final static String PROP_FULL_SMEAR = "fullSmear";如果为true，使用全bigram信息来决定smear。public static final int BYTES_PER_NGRAM = 4;public static final int BYTES_PER_NMAXGRAM = 2;由the CMU-Cambridge Statistical Language Modeling Toolkit统计语言模型工具产生的语言模型文件中每一个N-gram所占的字节数。private final static int SMEAR_MAGIC = 0xC0CAC01A;smear魔法数。事情将会更好。配置数据：URL location;    protected Logger logger;    protected LogMath logMath;    protected int maxDepth;    protected int ngramCacheSize;    protected boolean clearCacheAfterUtterance;    protected boolean fullSmear;    protected Dictionary dictionary;    protected String format;    protected boolean applyLanguageWeightAndWip;    protected float languageWeight;    protected float unigramWeight;    protected double wip;统计数据：private int ngramMisses;private int ngramHits;private int smearTermCount;protected String 
ngramLogFile;子组件：private BinaryLoader loader;private PrintWriter logFile;工作时数据：private Map<Word, UnigramProbability> unigramIDMap;    private Map<WordSequence, NGramBuffer>[] loadedNGramBuffers;    private LRUCache<WordSequence, ProbDepth> ngramDepthCache;    private Map<Long, Float> bigramSmearMap;    private NGramBuffer[] loadedBigramBuffers;    private UnigramProbability[] unigrams;    private int[][] ngramSegmentTable;    private float[][] ngramProbTable;    private float[][] ngramBackoffTable;    private float[] unigramSmearTerm;本类的构造方法：public LargeNGramModel( String format, URL location, String ngramLogFile,int maxNGramCacheSize, boolean clearCacheAfterUtterance, int maxDepth,  LogMath logMath, Dictionary dictionary,boolean applyLanguageWeightAndWip, float languageWeight,double wip, float unigramWeight, boolean fullSmear )；给定参数创建对象。public LargeNGramModel()；空的构造方法。本类的方法：public void newProperties(PropertySheet ps)；对属性进行设置。public void allocate()；分配资源。private void buildUnigramIDMap(Dictionary dictionary)；建立word与UnigramProbability对的map即对unigramIDMap进行了设置。往其中放入word与UnigramProbability对。public void start()；在识别之前调用。public void stop()；在识别后调用。在本方法中清空了缓存和logfile。private void clearCache()；清空ngram缓存。public ProbDepth getProbDepth(WordSequence wordSequence)；返回预测的概率和深度。用于高阶的ngrams。wordSequence为字序列用来获得概率。private ProbDepth getUnigramProbDepth(WordSequence wordSequence) ；返回的是给定unigram的unigram概率。参数：wordSequence 为unigram字序列。返回的是unigram的概率。返回的是wordSequence 中第一个字对应的unigram的概率即ProbDepth 。public float getProbability(WordSequence wordSequence)；获得字序列的ngram概率。概率是在log域的。private NGramProbability findNGram(WordSequence wordSequence)；返回的是一个给定ngram的NGramProbability。wordSequence为要装载的ngram。本方法查找或装载给定ngram的NGramProbability。private boolean is32bits()；告诉模型是16位的还是32位的。是32位返回为true。private NGramBuffer loadNGramBuffer(WordSequence ws)；返回的是给定字序列的所有NGram跟随者的一个NGramBuffer对象，ws为n-1gram用来查找跟随者。本方法把给定的n-1gram的所有ngram跟随者装载入一个缓存中。private NGramBuffer getBigramBuffer(int 
firstWordID)；返回给定字的bigrams。输入参数：firstWordID为字的id。返回的是字的bigrams。private NGramBuffer getNGramBuffer(WordSequence wordSequence)；返回的是给定字序列的ngrams，wordSequence用来得到缓存。返回的是字序列的ngrambuffer。  private int getFirstNGramEntry(NGramProbability nMinus1Gram, int firstNMinus1GramEntry, int n)；返回的是给定n-1gram的第一个ngram项的索引。输入参数：nMinus1Gram为我们所查找的首个ngram项的n-1gram。firstNMinus1GramEntry为在考虑的n-1gram的第一个n-1gram项。n为ngram的阶。private ProbDepth getUnigramProbDepth(WordSequence wordSequence)；返回的是给定unigram的unigram概率。wordSequence为unigram字序列。private UnigramProbability getUnigram(Word unigram)；如果此语言模型有给定的unigram，则返回它的UnigramProbability。否则为null。unigram为要查找的unigram。private UnigramProbability getUnigram(Word unigram)；如果此语言模型有给定的unigram，则返回它的UnigramProbability 。通过给定字来获得相应的UnigramProbability 对象。private boolean hasUnigram(Word unigram) ；如果此语言模型包括输入的unigram 字，则返回为true。否则为false。public final int getWordID(Word word)；返回给定字的id。public float getSmearOld(WordSequence wordSequence)；得到给定字序列的smear项。返回的是与此字序列相关联的smear项。public float getSmear(WordSequence wordSequence)；返回与输入相关的smear项。private int getNumberBigramFollowers(int wordID)；返回的是一个字的bigrams跟随者的数目即个数。wordID 为字的id。返回的是bigram跟随者的个数。public int getMaxDepth() ；返回的是语言模型的最大深度。即maxDepth属性。public Set<String> getVocabulary()；返回的是在语言模型中的字拼写的集合。此集合是不可改变的。String 为字的拼写。public int getNGramMisses()；返回当一个ngram被询问，但是在语言模型中却不存在此ngram的次数。在这种情况下它使用的是backoff概率。返回的是ngram丢失的次数。即NGramMisses属性。public int getNGramHits()；返回的是ngram 命中的次数，即NGramHits属性。private NGramBuffer getBigramBuffer(int firstWordID)；返回的是给定字的bigrams。firstWordID为字的id。返回的是存储bigrams的ngrambuffer。private NGramBuffer loadTrigramBuffer(int firstWordID, int secondWordID)；把给定bigram的所有trigram跟随者存入缓存。输入参数：firstWordID为首字id，secondWordID为第二个字id。返回的是存储trigram跟随者的ngrambuffer。private void buildSmearInfo()；创建smear信息。private void dumpProbs(double[] ugNumerator, double[] ugDenominator, int i,int j, float logugprob, float logbgprob, double ugprob,double bgprob, double backoffbgprob, 
double logbackoffbgprob)；打印出概率信息。private void writeSmearInfo(String filename)；把smear信息写入到指定的文件中。private void readSmearInfo(String filename)；从给定文件中读取smear信息。private void putSmearTerm(int word1, int word2, float smearTerm)；为2个字放置smear term。private Float getSmearTerm(int word1, int word2)；获得2个字的smear term。private float getBigramProb(int word1, int word2)；获得2个给定字的bigram概率。public void deallocate()；释放相应的资源。调用了loader.deallocate()方法。private void readSmearInfo(String filename)；从给定文件中读取smear信息。</span>
0 0