LZ77源码阅读笔记

来源：互联网发布：三星scx3401扫描软件编辑：程序博客网时间：2024/05/28 15:35

以LZ77作为关键字可以搜索到一份被转载了很多次的源码，不过这份代码基本上一行注释都没有，阅读和学习都非常不方便。于是决定给这份代码的关键函数加上注释，作为阅读的学习笔记。

/*********************************************************************
*
* Project description:
* Lz77 compression/decompression algorithm.
*
*********************************************************************/

#include <windows.h>
#include <conio.h>
#include <stdio.h>
#include <assert.h>

/* Number of bits used to encode a match offset in the compressed stream. */
#define OFFSET_CODING_LENGTH    (10)
/* Sliding-window size in bytes; numerically equal to 1<<OFFSET_CODING_LENGTH. */
#define MAX_WND_SIZE            1024
//#define MAX_WND_SIZE          (1<<OFFSET_CODING_LENGTH)
/* Bit mask with the low OFFSET_CODING_LENGTH bits set (MAX_WND_SIZE-1);
 * not referenced by the functions shown in this excerpt. */
#define OFFSET_MASK_CODE        (MAX_WND_SIZE-1)

const ULONG m=3; // m is the constant (parameter) used by the Golomb coding

/* Two 2 MiB global byte buffers.
 * NOTE(review): presumably the input/output work buffers of the driver
 * code; they are not referenced in the visible excerpt - confirm. */
UCHAR __buffer1__[0x200000];
UCHAR __buffer2__[0x200000];

因为只分析重点的函数，所以代码只是片段。上面是源码用到的常量的定义。

首先看Write1ToBitStream函数，用来在给定的Buff偏移位置写入1：

/*
 * Write1ToBitStream - set one bit to 1 in a byte-addressed bit stream.
 *
 * pBuffer:     start of the bit stream.
 * ulBitOffset: index of the bit to set, counted from bit 0 of pBuffer[0];
 *              inside each byte, bits are numbered from the least
 *              significant bit upward.
 *
 * The bit is OR-ed in, so the other bits of the target byte are preserved.
 * No bounds checking is performed on pBuffer.
 */
void
Write1ToBitStream(
   PUCHAR pBuffer,
   ULONG   ulBitOffset
   )
{
   ULONG   ulByteBoundary;
   ULONG   ulOffsetInByte;

   /* Eight bits per byte: shifting right by 3 gives the index of the byte
    * that holds the requested bit. */
   ulByteBoundary = ulBitOffset>>3;

   /* The low three bits (mask 0x7) give the bit position inside that byte. */
   ulOffsetInByte = ulBitOffset&7;

   *(pBuffer+ulByteBoundary) |= (unsigned char)(1u<<ulOffsetInByte);
}

读取的过程正好和写入的过程对称：

/*
 * ReadBitFromBitStream - read one bit from a byte-addressed bit stream.
 *
 * pBuffer:     start of the bit stream.
 * ulBitOffset: index of the bit to read, counted from bit 0 of pBuffer[0];
 *              inside each byte, bits are numbered from the least
 *              significant bit upward.
 *
 * Returns 0 or 1, the value of the addressed bit.
 * No bounds checking is performed on pBuffer.
 */
ULONG
ReadBitFromBitStream(
   PUCHAR pBuffer,
   ULONG   ulBitOffset
   )
{
   ULONG   ulByteBoundary;
   ULONG   ulOffsetInByte;

   /* Split the bit offset into a byte index and a bit index in that byte. */
   ulByteBoundary = ulBitOffset>>3;
   ulOffsetInByte = ulBitOffset&7;

   /* Load exactly the one byte that contains the bit. A wider load from this
    * address could be misaligned and would touch bytes beyond the one being
    * addressed, which is out of bounds at the end of the buffer. */
   return ((ULONG)pBuffer[ulByteBoundary]>>ulOffsetInByte)&1;
}

Golomb编码：

先看一下Golomb编码的规范：

Golomb 编码。假设对正整数 x 进行 Golomb 编码，选择参数 m，令

b = 2^m

q = INT((x - 1)/b)

r = x - q*b - 1

则 x 可以被编码为两部分，第一部分是由 q 个 1 加 1 个 0 组成，第二部分为 m 位二进制数，其值为 r。我们将 m = 0, 1, 2, 3 时的 Golomb 编码表列出：

   值 x        m = 0       m = 1       m = 2       m = 3
-------------------------------------------------------------
    1             0         0 0        0 00        0 000
    2            10         0 1        0 01        0 001
    3           110        10 0        0 10        0 010
    4          1110        10 1        0 11        0 011
    5         11110       110 0       10 00        0 100
    6        111110       110 1       10 01        0 101
    7       1111110      1110 0       10 10        0 110
    8      11111110      1110 1       10 11        0 111
    9     111111110     11110 0      110 00       10 000
从表中我们可以看出，Golomb 编码不但符合前缀编码的规律，而且可以用较少的位表示
较小的 x 值，而用较长的位表示较大的 x 值。这样，如果 x 的取值倾向于比较小的数值，Golomb 编码就可以有效地节省空间。当然，根据 x 的分布规律不同，我们可以选取不同的 m 值以达到最好的压缩效果。

对我们上面讨论的三元组 len 值，我们可以采用 Golomb 方式编码。上面的讨论中 len 可能取 0，我们只需用 len + 1 的 Golomb 编码即可。至于参数 m 的选择，一般经验是取 3 或 4 即可。

/*
 * WriteGolombCode - append the Golomb code of x to a bit stream.
 *
 * With b = 2^m (m is the file-level constant), the code consists of
 *   q = (x-1)/b   one-bits, followed by
 *   a single zero-bit separator, followed by
 *   r = x - q*b - 1   written as an m-bit binary number, least significant
 *   bit first.
 *
 * x:           value to encode; must be at least 1 (x == 0 would wrap
 *              around in x-1 and produce an enormous run of one-bits).
 * pBuffer:     start of the output bit stream.
 * ulBitOffset: bit position at which the code starts. It is passed by
 *              value, so the caller's offset is not advanced.
 *
 * Returns the number of bits written (m + q + 1); the caller adds this to
 * its own bit offset.
 */
ULONG WINAPI
WriteGolombCode(
   ULONG   x,
   PUCHAR pBuffer,
   ULONG   ulBitOffset
   )
{
   ULONG           q, r;
   ULONG           i;      /* unsigned, to match the unsigned loop bounds */

   assert( x>=1 );

   q = (x-1)>>m;
   r = x-(q<<m)-1;

   /* First write q one-bits. */
   for(i=0; i<q; i++, ulBitOffset++)
   {
       Write1ToBitStream(pBuffer, ulBitOffset);
   }

   /* A zero-bit separates the q ones from the m-bit binary code of r. */
   Write0ToBitStream(pBuffer, ulBitOffset);
   ulBitOffset++;

   /* The m-bit binary code of r, least significant bit first. */
   for(i=0; i<m; i++, ulBitOffset++)
   {
       if( (r>>i)&1 )
       {
           Write1ToBitStream(pBuffer, ulBitOffset);
       }
       else
       {
           Write0ToBitStream(pBuffer, ulBitOffset);
       }
   }

   /* Length of the Golomb code in bits; used by the caller as the
    * increment for its bit offset. */
   return m+q+1;
}

然后看写入Bit的操作WriteBits，用来在指定的Buff偏移位置写入一串bits：

/*
 * WriteBits - store a group of bits into a buffer that is addressed as an
 * array of 32-bit words.
 *
 * pDataBuffer:     start of the buffer; accessed through a PULONG cast.
 * ulOffsetToWrite: bit position at which the group starts.
 * ulBits:          the bits to store, right-aligned. They are not masked
 *                  here, so any bits set above ulBitLength are written too.
 * ulBitLength:     number of bits in the group; used only to decide whether
 *                  the group spills over into the following word.
 *
 * Three cases:
 *   - the position is word-aligned: the whole word is overwritten with
 *     ulBits;
 *   - the group fits in the rest of the current word: ulBits is shifted into
 *     place and OR-ed in;
 *   - the group straddles two words: the low part is OR-ed into the current
 *     word and the next word is overwritten with the remaining high part.
 */
void
WriteBits(
   PUCHAR pDataBuffer,
   ULONG   ulOffsetToWrite,
   ULONG   ulBits,
   ULONG   ulBitLength
   )
{
   PULONG  pWord   = (PULONG)pDataBuffer + (ulOffsetToWrite>>5);
   ULONG   ulShift = ulOffsetToWrite&31;
   ULONG   ulRoom;

   /* Word-aligned start: plain store, nothing to merge with. */
   if( ulShift==0 )
   {
       *pWord = ulBits;
       return;
   }

   /* Unaligned start: merge the (low part of the) group into this word. */
   *pWord |= (ulBits<<ulShift);

   /* Bits still free in the current word from the start position onward. */
   ulRoom = 32 - ulShift;
   if( ulRoom<ulBitLength )
   {
       /* The group does not fit: the remaining high bits start the next word. */
       pWord[1] = ulBits>>ulRoom;
   }
}

看起来有点绕，不过通过示意图就很明了了：

下面就是LZ77压缩的主体函数：

/*
 * Lz77PutBitsLsbFirst - write the low ulBitCount bits of ulValue to the
 * output bit stream, least significant bit first, starting at ulBitOffset.
 *
 * Returns the bit offset just past the last bit written.
 */
static ULONG
Lz77PutBitsLsbFirst(
   PUCHAR pOutputBuffer,
   ULONG   ulBitOffset,
   ULONG   ulValue,
   ULONG   ulBitCount
   )
{
   ULONG   i;

   for(i=0; i<ulBitCount; i++, ulBitOffset++)
   {
       if( (ulValue>>i)&1 )
       {
           Write1ToBitStream(pOutputBuffer, ulBitOffset);
       }
       else
       {
           Write0ToBitStream(pOutputBuffer, ulBitOffset);
       }
   }

   return ulBitOffset;
}

/*
 * lz77compress - LZ77-compress a byte buffer into a bit stream.
 *
 * pDataBuffer:     input data.
 * ulDataLength:    number of input bytes.
 * pOutputBuffer:   receives the compressed bit stream. No size is passed in
 *                  and no bounds check is done, so the caller must supply a
 *                  buffer that is large enough.
 * pulNumberOfBits: receives the total number of bits written.
 *
 * Output format, repeated until all input is consumed:
 *   match (length > 1): flag bit 1, the match offset in OFFSET_CODING_LENGTH
 *                       bits (LSB first), then the Golomb code of the length;
 *   literal:            flag bit 0, then the 8 bits of the byte (LSB first).
 */
void
lz77compress(
   PUCHAR pDataBuffer,
   ULONG   ulDataLength,
   PUCHAR pOutputBuffer,
   PULONG pulNumberOfBits
   )
{
   LONG        iSlideWindowPtr;
   ULONG       ulBytesCoded;
   ULONG       ulMaxlength;
   PUCHAR      pSlideWindowPtr;
   PUCHAR      pUnprocessedDataPtr;

   /* Initialised so the checks below never read indeterminate values. */
   ULONG       offset = 0;
   ULONG       length = 0;
   ULONG       ulCodingLength;

   ULONG       ulBitOffset;

   /* The window pointer starts at -MAX_WND_SIZE: nothing has been encoded
    * yet, so the right edge of the sliding window coincides with the start
    * of the data to encode. */
   iSlideWindowPtr = -MAX_WND_SIZE;
   pSlideWindowPtr = NULL;
   ulBitOffset = 0;
   ulBytesCoded = 0;

   while( ulBytesCoded<ulDataLength )
   {
       if( iSlideWindowPtr>=0 )
       {
           /* At least MAX_WND_SIZE bytes have been encoded: the window is
            * full and starts iSlideWindowPtr bytes into the input. */
           pSlideWindowPtr = pDataBuffer+iSlideWindowPtr;
           ulMaxlength = MAX_WND_SIZE;
       }
       else if( iSlideWindowPtr>=-MAX_WND_SIZE )
       {
           /* Fewer than MAX_WND_SIZE bytes encoded so far: the window is
            * everything from the start of the input. */
           pSlideWindowPtr = pDataBuffer;
           ulMaxlength = MAX_WND_SIZE + iSlideWindowPtr;
       }
       else
       {
           /* Defensive fallback; the pointer starts at -MAX_WND_SIZE and
            * only grows, so this is not expected to be taken. */
           pSlideWindowPtr = NULL;
           ulMaxlength = 0;
       }

       pUnprocessedDataPtr = pDataBuffer + ulBytesCoded;

       /* Never look for a match longer than the data that is left. */
       if( ulMaxlength>ulDataLength-ulBytesCoded )
       {
           ulMaxlength = ulDataLength-ulBytesCoded;
       }

       /* Search the already-encoded data for the longest match with the
        * data still to be encoded; record its offset and length. */
       FindLongestSubstring(
           pSlideWindowPtr,
           pUnprocessedDataPtr,
           ulMaxlength,
           &offset,
           &length
           );

       assert( length<=MAX_WND_SIZE );
       assert( offset<MAX_WND_SIZE );

       if( length>1 )
       {
           /* Match: flag bit 1, then the offset. The window is
            * MAX_WND_SIZE bytes, so the offset fits in
            * OFFSET_CODING_LENGTH bits. Finally the Golomb-coded length. */
           Write1ToBitStream(pOutputBuffer, ulBitOffset);
           ulBitOffset++;

           ulBitOffset = Lz77PutBitsLsbFirst(
               pOutputBuffer, ulBitOffset, offset, OFFSET_CODING_LENGTH);

           ulCodingLength = WriteGolombCode(length, pOutputBuffer, ulBitOffset);

           ulBitOffset += ulCodingLength;
           iSlideWindowPtr += length;
           ulBytesCoded += length;
       }
       else
       {
           /* No usable match (length <= 1): flag bit 0, then the 8 bits of
            * the literal byte. */
           Write0ToBitStream(pOutputBuffer, ulBitOffset);
           ulBitOffset++;

           ulBitOffset = Lz77PutBitsLsbFirst(
               pOutputBuffer, ulBitOffset, *pUnprocessedDataPtr, 8);

           iSlideWindowPtr++;
           ulBytesCoded++;
       }
   }

   /* Every input byte must have been consumed exactly once. */
   assert( ulBytesCoded==ulDataLength );

   *pulNumberOfBits = ulBitOffset;
}

LZ77的解码过程是上面的逆过程，就不赘述了。