live555中关于mpeg4的处理

来源：互联网发布：ps mac破解版百度云编辑：程序博客网时间：2024/06/08 05:17

live555支持MPEG4的ES(Elementary Stream)流，相关类为MPEG4VideoStreamFramer、MPEG4ESVideoRTPSink。我想扩展其对avi格式的支持，将avi中的MPEG4数据包解析出来后，交给MPEG4VideoStreamFramer进行处理。后来发现，这样根本不行。问题在于，MPEG4VideoStreamFramer处理的是严格的MPEG4 ES流。

先简单的说明一下MPEG4的ES流：

MPEG4 Elementary Stream 组成如下：
VOS->VO->VOL->GOV(可选)->VOP
VOS 视觉对象序列
VO 视觉对象
VOL 视觉对象层
GOV 视觉对象平面组(VOP组)
VOP 视觉对象平面

紧跟着VOP起始码的，有一个2 bit的标志，用来表示这个Frame到底是I Frame、P Frame、B Frame，抑或是S Frame（GMC-VOP）。
标志如下：
00: I Frame
01: P Frame
10: B Frame
11: S Frame

起始符及结束符定义如下:

[cpp] view plaincopyprint?
// MPEG-4 visual start/end codes.  Each value is the 3-byte prefix 00 00 01
// followed by a one-byte code (0xB0, 0xB1, 0xB3, 0xB5 or 0xB6).
#define VISUAL_OBJECT_SEQUENCE_START_CODE 0x000001B0  
#define VISUAL_OBJECT_SEQUENCE_END_CODE   0x000001B1  
#define GROUP_VOP_START_CODE              0x000001B3  
#define VISUAL_OBJECT_START_CODE          0x000001B5  
#define VOP_START_CODE                    0x000001B6  

用二进制方式打开avi文件，发现只存在vop开始符，说明只存在VOP层次，而不是严格的ES流。可以认为一个VOP对应着一个帧。

后来发现，live555中实现了另一个类，MPEG4VideoStreamDiscreteFramer，继承自MPEG4VideoStreamFramer。它可以处理VOS，也可以处理一个个的GOV及VOP，正好可以满足需求。
看一下MPEG4VideoStreamDiscreteFramer对MPEG4数据的处理

[cpp] view plaincopyprint?
void MPEG4VideoStreamDiscreteFramer  
::afterGettingFrame1(unsigned frameSize, unsigned numTruncatedBytes,  
                     struct timeval presentationTime,  
                     unsigned durationInMicroseconds) {  
  // Check that the first 4 bytes are a system code:  
  if (frameSize >= 4 && fTo[0] == 0 && fTo[1] == 0 && fTo[2] == 1) {  
    fPictureEndMarker = True; // Assume that we have a complete 'picture' here  
    unsigned i = 3;  
  
  
    //  
    //视觉对象序列，按照完整的MPEG4 Elemental Stream进行解析  
    //  
    if (fTo[i] == 0xB0) { // VISUAL_OBJECT_SEQUENCE_START_CODE  
      // The next byte is the "profile_and_level_indication":  
      if (frameSize >= 5) fProfileAndLevelIndication = fTo[4];  
  
  
      // The start of this frame - up to the first GROUP_VOP_START_CODE  
      // or VOP_START_CODE - is stream configuration information.  Save this:  
      for (i = 7; i < frameSize; ++i) {  
    if ((fTo[i] == 0xB3 /*GROUP_VOP_START_CODE*/ ||  
         fTo[i] == 0xB6 /*VOP_START_CODE*/)  
        && fTo[i-1] == 1 && fTo[i-2] == 0 && fTo[i-3] == 0) {  
      break; // The configuration information ends here  
    }  
      }  
      fNumConfigBytes = i < frameSize ? i-3 : frameSize;  
      delete[] fConfigBytes; fConfigBytes = new unsigned char[fNumConfigBytes];  
      for (unsigned j = 0; j < fNumConfigBytes; ++j) fConfigBytes[j] = fTo[j];  
  
  
      // This information (should) also contain a VOL header, which we need  
      // to analyze, to get "vop_time_increment_resolution" (which we need  
      // - along with "vop_time_increment" - in order to generate accurate  
      // presentation times for "B" frames).  
      analyzeVOLHeader();  
    }  
  
  
    if (i < frameSize) {  
      u_int8_t nextCode = fTo[i];  
  
  
      //  
      //VOP组  
      //  
      if (nextCode == 0xB3 /*GROUP_VOP_START_CODE*/) {  
    // Skip to the following VOP_START_CODE (if any):  
    for (i += 4; i < frameSize; ++i) {  
      if (fTo[i] == 0xB6 /*VOP_START_CODE*/  
          && fTo[i-1] == 1 && fTo[i-2] == 0 && fTo[i-3] == 0) {  
        nextCode = fTo[i];  
        break;  
      }  
    }  
      }  
  
  
      //  
      //视觉对象平面  
      //  
      if (nextCode == 0xB6 /*VOP_START_CODE*/ && i+5 < frameSize) {  
    ++i;  
  
  
    // Get the "vop_coding_type" from the next byte:  
    u_int8_t nextByte = fTo[i++];  
    u_int8_t vop_coding_type = nextByte>>6;   //VOP开始符后的2bit，表示帧类型I/P/B/S  
  
  
    // Next, get the "modulo_time_base" by counting the '1' bits that  
    // follow.  We look at the next 32-bits only.  
    // This should be enough in most cases.  
    u_int32_t next4Bytes  
      = (fTo[i]<<24)|(fTo[i+1]<<16)|(fTo[i+2]<<8)|fTo[i+3];  
    i += 4;  
    u_int32_t timeInfo = (nextByte<<(32-6))|(next4Bytes>>6);  
    unsigned modulo_time_base = 0;  
    u_int32_t mask = 0x80000000;  
    while ((timeInfo&mask) != 0) {  
      ++modulo_time_base;  
      mask >>= 1;  
    }  
    mask >>= 2;  
  
  
    // Then, get the "vop_time_increment".  
    unsigned vop_time_increment = 0;  
    // First, make sure we have enough bits left for this:  
    if ((mask>>(fNumVTIRBits-1)) != 0) {  
      for (unsigned i = 0; i < fNumVTIRBits; ++i) {  
        vop_time_increment |= timeInfo&mask;  
        mask >>= 1;  
      }  
      while (mask != 0) {  
        vop_time_increment >>= 1;  
        mask >>= 1;  
      }  
    }  
  
  
    //  
    //若是"B"frame, 需要修正时间时间戳  
    //  
    // If this is a "B" frame, then we have to tweak "presentationTime":  
    if (vop_coding_type == 2/*B*/  
        && (fLastNonBFramePresentationTime.tv_usec > 0 ||  
        fLastNonBFramePresentationTime.tv_sec > 0)) {  
      int timeIncrement  
        = fLastNonBFrameVop_time_increment - vop_time_increment;  
      if (timeIncrement<0) timeIncrement += vop_time_increment_resolution;  
      unsigned const MILLION = 1000000;  
      double usIncrement = vop_time_increment_resolution == 0 ? 0.0  
        : ((double)timeIncrement*MILLION)/vop_time_increment_resolution;  
      unsigned secondsToSubtract = (unsigned)(usIncrement/MILLION);  
      unsigned uSecondsToSubtract = ((unsigned)usIncrement)%MILLION;  
  
  
      presentationTime = fLastNonBFramePresentationTime;  
      if ((unsigned)presentationTime.tv_usec < uSecondsToSubtract) {  
        presentationTime.tv_usec += MILLION;  
        if (presentationTime.tv_sec > 0) --presentationTime.tv_sec;  
      }  
      presentationTime.tv_usec -= uSecondsToSubtract;  
      if ((unsigned)presentationTime.tv_sec > secondsToSubtract) {  
        presentationTime.tv_sec -= secondsToSubtract;  
      } else {  
        presentationTime.tv_sec = presentationTime.tv_usec = 0;  
      }  
    } else {  
      fLastNonBFramePresentationTime = presentationTime;  
      fLastNonBFrameVop_time_increment = vop_time_increment;  
    }  
      }  
    }  
  }  
  
  
  // Complete delivery to the client:  
  fFrameSize = frameSize;  
  fNumTruncatedBytes = numTruncatedBytes;  
  fPresentationTime = presentationTime;  
  fDurationInMicroseconds = durationInMicroseconds;  
  afterGetting(this);  
}  

上面的代码，其实只完成一个功能，就是当当前VOP为B帧时，调整时间戳。

最后关注一下MPEG4 ES流时间戳的处理。在处理MPEG4的ES流时，使用MPEG4VideoStreamFramer作为source，使用分析器MPEG4VideoStreamParser对完整的MPEG4 Elementary Stream进行分析，主要是解析出其中的时间信息。

[cpp] view plaincopyprint?
void MPEGVideoStreamFramer::continueReadProcessing() {  
  unsigned acquiredFrameSize = fParser->parse();  
  if (acquiredFrameSize > 0) {  
    // We were able to acquire a frame from the input.  
    // It has already been copied to the reader's space.  
    fFrameSize = acquiredFrameSize;  
    fNumTruncatedBytes = fParser->numTruncatedBytes();  
  
  
    // "fPresentationTime" should have already been computed.  
  
  
    //  
    //根据帧计数及帧率计算帧的持续时间  
    //  
    // Compute "fDurationInMicroseconds" now:  
    fDurationInMicroseconds  
      = (fFrameRate == 0.0 || ((int)fPictureCount) < 0) ? 0  
      : (unsigned)((fPictureCount*1000000)/fFrameRate);  
  
  
    fPictureCount = 0;  
  
  
    // Call our own 'after getting' function.  Because we're not a 'leaf'  
    // source, we can call this directly, without risking infinite recursion.  
    afterGetting(this);  
  } else {  
    // We were unable to parse a complete frame from the input, because:  
    // - we had to read more data from the source stream, or  
    // - the source stream has ended.  
  }  
}  

计算fDurationInMicroseconds需要frame rate参数fFrameRate, 它是通过分析VOL头确定的

[cpp] view plaincopyprint?
void MPEG4VideoStreamParser::analyzeVOLHeader() {  
    //  
    // Parse the timing information out of the VOL header  
    //  
  // Extract timing information (in particular,  
  // "vop_time_increment_resolution") from the VOL Header:  
...  
  do {  
  
  
      ...  
  
  
    // Use "vop_time_increment_resolution" as the 'frame rate'  
    // (really, 'tick rate'):  
    usingSource()->fFrameRate = (double)vop_time_increment_resolution;  // frame rate  
  
  
    return;  
  } while (0);  
  
...  
}