HTK3.4程序员手册(2.3)--特征参数提取HParm.c

来源:互联网 发布:2017年旅游业数据分析 编辑:程序博客网 时间:2024/05/22 00:23
HTK3.4程序员手册(2.3)--特征参数提取HParm.c
by 云龙


HTK book中提到的参数有11种:
"LPC", "LPREFC", "LPCEPSTRA",   "LPDELCEP", "IREFC", "MFCC", "FBANK", "MELSPEC","DISCRETE", "PLP","ANON"



但HTK3.4中是否都支持呢?
请看ConvertFrame()函数中的以下代码:
   switch(btgt){
   case LPC:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      v = cf->a; bsize = cf->lpcOrder;
      break;
   case LPREFC:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      v = cf->k; bsize = cf->lpcOrder;
      break;     
   case LPCEPSTRA:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      LPC2Cepstrum(cf->a,cf->c);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c; bsize = cf->numCepCoef;
      break;
   case MELSPEC:
   case FBANK:
      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
      v = cf->fbank; bsize = cf->numChans;
      break;
   case MFCC:
      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
      FBank2MFCC(cf->fbank, cf->c, cf->numCepCoef);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c; bsize = cf->numCepCoef;
      break;
   case PLP:
      Wave2FBank(cf->s, cf->fbank, rawE ? NULL : &te, cf->fbInfo);
      FBank2ASpec(cf->fbank, cf->as, cf->eql, cf->compressFact, cf->fbInfo);
      ASpec2LPCep(cf->as, cf->ac, cf->lp, cf->c, cf->cm);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c;
      bsize = cf->numCepCoef;
      break;
   default:
      HError(6321,"ConvertFrame: target %s is not a parameterised form",
             ParmKind2Str(cf->tgtPK,buf));
   }
可以看出HTK3.4的ConvertFrame()实际支持7种参数:LPC,LPREFC,LPCEPSTRA,MELSPEC,FBANK,MFCC,PLP。
参数转换顺序可以参照HTK book Fig.5.9:


IOConfigRec
该数据结构存放着特征提取过程中用到的众多配置参数(以下为节选):

typedef struct {

   /* ------- Overrideable parameters ------- */

   ParmKind srcPK;            /* Source ParmKind */

   FileFormat srcFF;          /* Source File format */

   HTime srcSampRate;         /* Source Sample Rate */

   Boolean zMeanSrc;          /* Zero Mean the Source */

   ParmKind tgtPK;            /* Target ParmKind */

   FileFormat tgtFF;          /* Target File format */ 
...... 

}IOConfigRec;

 

ValidCodeParms()函数检查analysis.conf的参数是否合理。

/* ValidCodeParms: check to ensure reasonable wave->parm code params */

static void ValidCodeParms(IOConfig cf)

 

/* SetUpForCoding: set style, sizes and  working storage */

static void SetUpForCoding(MemHeap *x, IOConfig cf, int frSize)

 

ValidConversion()函数检查原格式到目标格式的转换是否可能完成。

/* EXPORT->ValidConversion: checks that src -> tgt conversion is possible */

Boolean ValidConversion (ParmKind src, ParmKind tgt)

 

TotalComps()函数返回特征参数的维度。

/* TotalComps: return the total number of components in a parameter vector

   with nStatic components and ParmKind pk */

static int TotalComps(int nStatic, ParmKind pk)

 

OpenAsChannel()函数中,计算特征参数所需的内存空间:

dBytes = cf->nCols * pbuf->main.maxRows * sizeof(float);

      = 39 * 243 * 4 = 37908

 

在调用FillBufFromChannel()函数提取特征参数之前,先调用了StartBuffer()函数,那么StartBuffer()函数有什么作用呢?

/* EXPORT->StartBuffer: start audio and fill the buffer */

void StartBuffer(ParmBuf pbuf)

{

……

   if (pbuf->status == PB_INIT) {

      if (pbuf->cf->useSilDet) ChangeState(pbuf,PB_WAITING);

      else ChangeState(pbuf,PB_FILLING);

   }

……

}

/* PBStatus: life-cycle states of a parameter buffer (ParmBuf).
   The declaration order is significant: OpenAsChannel() keeps filling
   while (pbuf->status < PB_STOPPED), so every state listed before
   PB_STOPPED counts as "not yet stopped". */
typedef enum {

   PB_INIT,     /* Buffer is initialised and empty */

   PB_WAITING,  /* Buffer is waiting for speech */

   PB_STOPPING, /* Buffer is waiting for silence */

   PB_FILLING,  /* Buffer is filling */

   PB_STOPPED,  /* Buffer has stopped but not yet empty */

   PB_CLEARED   /* Buffer has been emptied */

} PBStatus;

PBStatus status;    /* status of this buffer */

通过ChangeState()的调用可以看出,ParmBuf pbuf有一个状态标志PBStatus status,而StartBuffer()函数就是在缓冲区处于PB_INIT状态时改变该标志:若启用了静音检测(useSilDet),则改为PB_WAITING(等待语音);否则改为PB_FILLING,表示正在填装数据。

 

FillBufFromChannel()函数提取wav数据的特征,FillBufFromChannel()函数在OpenAsChannel()函数中被调用。

/* OpenAsChannel: open and create an audio input buffer */

static ReturnStatus OpenAsChannel(ParmBuf pbuf, int maxObs,

                                  char *fname, FileFormat ff,

                                  TriState silMeasure)

{

……

   if (maxObs==0) {

      /* maxObs==0 indicates want a table straight away */

      StartBuffer(pbuf);

      while(pbuf->status<PB_STOPPED)

         FillBufFromChannel(pbuf,MAX_INT);

   }

……

}

 

FillBufFromChannel()函数中调用了函数FramesInChannel()和函数GetFrameFromChannel()。

 

从调用FramesInChannel()处的注释看,好像是提取参数,但仔细一看,原来它返回的是可以无阻塞读取的帧数(wav语音窗数)。

   /* Fill Buffer with converted static coef vectors */

   newRows=FramesInChannel(pbuf,pbuf->chType);

 

/* Return number of frames that can be read without blocking */

/*       -1 == Done, no more to read. */

/*        0 == May block on reading first frame. */

/*        N == Can read N frames immediately without blocking. */

/*  INT_MAX == Will not block. */

static int FramesInChannel(ParmBuf pbuf,int chType){

……

}

再来看看GetFrameFromChannel()函数,

/* Get a single frame from particular channel */

/*  Return value indicates number of frames read okay */

static int GetFrameFromChannel(ParmBuf pbuf,int chType,void *vp)

 

 

FillBufFromChannel中逐窗提取语音特征参数的for循环:

   /* Read the necessary frames */

   for (i=0; i<newRows; i++) {

      /* But have final check on read just in case */

      if (pbuf->dShort) {

         if (GetFrameFromChannel(pbuf,pbuf->chType,sp1)!=1) {

            pbuf->chClear=TRUE;

            break;

         }

         sp1 += cf->nCols;

      }

      else {

         if (GetFrameFromChannel(pbuf,pbuf->chType,fp1)!=1) {       //提取特征参数

            pbuf->chClear=TRUE;

            break;

         }

         fp1 += cf->nCols;

      }

      pbuf->inRow++;pbuf->main.nRows++;

   }

 
 

//fp1表示存放特征参数的buffer,MFCC参数是float型。

// pbuf->main.data的原型:void *data;       /* parameterised data for this block */

fp1 = (float*) pbuf->main.data + pbuf->main.nRows*cf->nCols;

 

static void FillBufFromChannel(ParmBuf pbuf,int minRows)

{

……

   for (i=0; i<newRows; i++) { //此处newRows就是wav语音文件的窗数(Frame Number)

      /* But have final check on read just in case */

      if (pbuf->dShort) {

         if (GetFrameFromChannel(pbuf,pbuf->chType,sp1)!=1) {

            pbuf->chClear=TRUE;

            break;

         }

         sp1 += cf->nCols;

      }

      else {

         if (GetFrameFromChannel(pbuf,pbuf->chType,fp1)!=1) { //调用这里

            pbuf->chClear=TRUE;

            break;

         }

         fp1 += cf->nCols;

      }

      pbuf->inRow++;pbuf->main.nRows++;

   }

……

}

 

GetFrameFromChannel()函数调用ConvertFrame()来将语音转换为特征参数。

/* Get a single frame from particular channel */

/*  Return value indicates number of frames read okay */

static int GetFrameFromChannel(ParmBuf pbuf,int chType,void *vp)

{

……

      /* Then convert it to a frame */

      if (ConvertFrame(cf, (float *) vp) != cf->nCvrt)

……

}

 

ConvertFrame()函数是最直接的参数提取函数,原wav语音数据存放于cf->s中,而cf->s是Vector类型(Vector s),即float*类型。

如何将单声道16bit wav语音存放为float型呢?

HTK将样本点由short int强制转换为float型,这一步在GetWave()函数中实现。

/* EXPORT->GetWave: Get next nFrames from w and store in buf */

void GetWave(Wave w, int nFrames, float *buf)

{
..... 

*buf++ = w->data[w->frIdx+k];          //short int转换为float,存放于cf->s
...... 

}

 

/* ConvertFrame: convert frame in cf->s and store in pbuf, return total

   parameters stored in pbuf.

   cf   - IO configuration; supplies the input frame (cf->s), the working
          vectors (cf->a, cf->k, cf->c, cf->fbank, ...) and all options.
   pbuf - output array; receives bsize base coefficients, then optionally
          C0 (if cf->tgtPK has HASZEROC) and log energy (if HASENERGY).

   Returns the number of floats written (p - pbuf).

   Side effects: cf->s is modified in place (dither, zero-mean,
   pre-emphasis, Hamming window), and HASZEROC / HASENERGY are OR-ed into
   cf->curPK when the corresponding value is appended.
   Vectors are indexed from 1 (see the loops running 1..VectorSize). */

static int ConvertFrame(IOConfig cf, float *pbuf)

{

   /* base target kind with the qualifier bits masked off */
   ParmKind btgt = cf->tgtPK&BASEMASK;

   float re,rawte=0.0,te,*p, cepScale = 1.0;

   int i,bsize=0;

   Vector v=NULL;

   char buf[50];

   Boolean rawE;

  

   p = pbuf;

   /* raw energy is switched off for base kinds below MFCC in v1Compat mode */
   rawE = cf->rawEnergy;

   if (btgt<MFCC && cf->v1Compat)

      rawE = FALSE;

 

   /* optional dither: add cf->addDither * (RandomValue()*2.0 - 1.0) to each sample */
   if (cf->addDither!=0.0)

      for (i=1; i<=VectorSize(cf->s); i++)

         cf->s[i] += (RandomValue()*2.0 - 1.0)*cf->addDither;

 

   if (cf->zMeanSrc && !cf->v1Compat)

      ZeroMeanFrame(cf->s);

   /* raw energy = sum of squared samples, taken BEFORE pre-emphasis
      and windowing are applied below */
   if ((cf->tgtPK&HASENERGY) && rawE){

      rawte = 0.0;

      for (i=1; i<=VectorSize(cf->s); i++)

         rawte += cf->s[i] * cf->s[i];

   }

   if (cf->preEmph>0.0)

      PreEmphasise(cf->s,cf->preEmph);

   if (cf->useHam) Ham(cf->s);

   /* per-kind analysis: each case leaves the result vector in v and the
      number of base coefficients in bsize */
   switch(btgt){

   case LPC:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);

      v = cf->a; bsize = cf->lpcOrder;

      break;

   case LPREFC:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);

      v = cf->k; bsize = cf->lpcOrder;

      break;     

   case LPCEPSTRA:

      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);

      LPC2Cepstrum(cf->a,cf->c);

      if (cf->cepLifter > 0)

         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);

      v = cf->c; bsize = cf->numCepCoef;

      break;

   /* MELSPEC and FBANK share the same code path here */
   case MELSPEC:

   case FBANK:

      /* NULL is passed instead of &te when raw energy is in use;
         presumably Wave2FBank then skips its own energy output - TODO confirm */
      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);

      v = cf->fbank; bsize = cf->numChans;

      break;

   case MFCC:

      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);

      FBank2MFCC(cf->fbank, cf->c, cf->numCepCoef);

      if (cf->cepLifter > 0)

         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);

      v = cf->c; bsize = cf->numCepCoef;

      break;

   case PLP:

      Wave2FBank(cf->s, cf->fbank, rawE ? NULL : &te, cf->fbInfo);

      FBank2ASpec(cf->fbank, cf->as, cf->eql, cf->compressFact, cf->fbInfo);

      ASpec2LPCep(cf->as, cf->ac, cf->lp, cf->c, cf->cm);

      if (cf->cepLifter > 0)

         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);

      v = cf->c;

      bsize = cf->numCepCoef;

      break;

   default:

      /* NOTE(review): v stays NULL and bsize stays 0 on this path; whether
         HError returns to the caller is not visible here - confirm */
      HError(6321,"ConvertFrame: target %s is not a parameterised form",

             ParmKind2Str(cf->tgtPK,buf));

   }

 

   /* cepScale stays 1.0 except for PLP/MFCC outside v1Compat mode */
   if (btgt == PLP || btgt == MFCC)

      cepScale = (cf->v1Compat) ? 1.0 : cf->cepScale;

   /* copy the bsize base coefficients (1-based in v) to the output */
   for (i=1; i<=bsize; i++)

      *p++ = v[i] * cepScale;

 

   /* optional C0 term */
   if (cf->tgtPK&HASZEROC){

      if (btgt == MFCC) {

         *p = FBank2C0(cf->fbank) * cepScale;

         if (cf->v1Compat) *p *= cf->eScale;

         ++p;

      }

      /* this branch is taken for every non-MFCC kind, not only PLP:
         it reads element bsize+1 of v */
      else      /* For PLP include gain as C0 */

         *p++ = v[bsize+1] * cepScale;  

      cf->curPK|=HASZEROC ;

   }

   /* optional log energy, floored to LZERO when te is below MINLARG */
   if (cf->tgtPK&HASENERGY) {

      if (rawE) te = rawte;

      *p++ = (te<MINLARG) ? LZERO : log(te); 

      cf->curPK|=HASENERGY;

   }

   return p - pbuf;

}

0 0
原创粉丝点击