数据压缩实验三——霍夫曼编解码算法实现

来源：互联网发布：网络借贷还不起怎么办编辑：程序博客网时间：2024/06/05 17:52

一、实验原理

霍夫曼编码

Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式，Huffman编码是可变字长编码(VLC)的一种。
Huffman 编码基于信源的概率统计模型，它的基本思路是：出现概率大的信源符号编短码，出现概率小的信源符号编长码，从而使平均码长最小。
在程序实现中常使用一种叫做树的数据结构实现Huffman编码，由它编出的码是即时码

霍夫曼编码方法

统计符号的发生概率
把频率按从小到大的顺序排列
每一次选出最小的两个值，作为二叉树的两个叶子节点，将和作为它们的根节点，这两个叶子节点不再参与比较，新的根节点参与比较
重复上一步（选最小的两个值合并），直到最后得到概率和为1的根节点
将形成的二叉树的左节点标0，右节点标1，把从最上面的根节点到最下面的叶子节点途中遇到的0，1序列串起来，就得到了各个符号的编码。

二、实验流程及代码分析

实验流程

读入待编码的源文件
第一次扫描：统计文件中各个字符出现频率
建立Huffman树
将码表及其他必要信息写入输出文件
第二次扫描：对源文件进行编码并输出

霍夫曼编码的数据结构

霍夫曼节点结构：

/*
 * One node of the Huffman tree.
 *
 * A node is either a leaf (isLeaf != 0), in which case `symbol` is the
 * valid union member, or an internal node, in which case `zero` and `one`
 * are valid. The two alternatives share storage through the anonymous
 * union, so isLeaf must be checked before reading either of them.
 */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;               /* non-zero when this node is a leaf */
    unsigned long count;                /* accumulated weight (occurrence count) of the symbols under this node */
    struct huffman_node_tag *parent;    /* parent node pointer */

    union
    {
        struct
        {
            /* child pointers for the 0 branch and the 1 branch */
            struct huffman_node_tag *zero, *one;
        };
        unsigned char symbol;           /* symbol represented by a leaf node */
    };
} huffman_node;

霍夫曼码结构：

/*
 * A single Huffman code word.
 */
typedef struct huffman_code_tag
{
    unsigned long numbits;      /* number of bits used by this code */

    /*
     * Pointer to the bit string of this code. Layout as sketched in the
     * original listing (bit index -> byte):
     *     76543210 | bits[0]
     *     ******98 | bits[1]
     */
    unsigned char *bits;
} huffman_code;

霍夫曼统计结构

/*
 * Per-symbol statistics of a Huffman encoding run: for each of the 256
 * possible byte values, its relative frequency, its code length in bits
 * and the code word itself.
 */
typedef struct huffman_statistics_result
{
    float freq[256];                /* relative frequency of each symbol */
    unsigned long numbits[256];     /* code length in bits; 0 when the symbol has no code */
    unsigned char bits[256][100];   /* code word of each symbol, stored byte by byte */
} huffman_stat;

代码分析

编码流程

主程序

读入待编码源文件及调用霍夫曼编码函数。

int main(int argc, char** argv){    char memory = 0;    // 是否读取内存数据操作，0为否    char compress = 1;    // 压缩或解压缩，1为压缩    // ......省略文件初始化    //step1:add by yzhang for huffman statistics    FILE * outTable = NULL;    // 输出的码字统计结果excel表格文件    //end by yzhang    /* Get the command line arguments. */    while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)     {        // 使用getopt函数读取命令行参数        switch(opt)        {        case 'i':            file_in = optarg;            break;        case 'o':            file_out = optarg;            break;        ......        }    }    /* If an input file is given then open it. */    ......    /* If an output file is given then create it. */    ......    /* If an out table file is given then create it. */    ......    if(memory)    {        // 是否读取内存数据进行压缩或解压操作        return compress ?            memory_encode_file(in, out) : memory_decode_file(in, out);    }    if(compress)  //change by yzhang        // 是否对读入文件进行压缩或解压缩操作        huffman_encode_file(in, out,outTable);        //step1:changed by yzhang from huffman_encode_file(in, out) to huffman_encode_file(in, out,outTable)    else    huffman_decode_file(in, out);    //......省略关闭文件操作    return 0;}

函数嵌套结构与功能
– huffman_encode_file 读文件，输出码表、编码后文件
– get_symbol_frequencies 构造叶子节点
– init_frequencies 初始化叶子节点
– new_leaf_node 构造新的叶子节点
– huffST_getSymFrequencies 保存码字频率统计结果
– calculate_huffman_codes 组织霍夫曼二叉树结构
– qsort / SFComp 排序/规则为从小到大
– new_nonleaf_node 构造非叶子节点
– build_symbol_encoder 生成码字
– huffST_getcodeword 保存码字
– output_huffman_statistics 输出码字统计结果
– fprintf 写文件
– write_code_table 码表写入文件
– do_file_encode 读文件，查表编码，写入文件

霍夫曼编码函数

主函数

int huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)  //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table){    SymbolFrequencies sf;    // 叶子结点结构体数组    SymbolEncoder *se;    // 码节点结构体数组    huffman_node *root = NULL;    // 根节点，初始化为空    int rc;    unsigned int symbol_count;    // 总符号数    //step2:add by yzhang for huffman statistics    huffman_stat hs;    // 霍夫曼统计结构体    //end by yzhang    /* Get the frequency of each symbol in the input file. */    symbol_count = get_symbol_frequencies(&sf, in);     // 为出现过的符号建立叶子节点，并返回总符号数    //step3:add by yzhang for huffman statistics,...  get the frequency of each symbol     huffST_getSymFrequencies(&sf,&hs,symbol_count);    // 保存各码字频率统计结果，给霍夫曼统计结构体    //end by yzhang    /* Build an optimal table from the symbolCount. */    se = calculate_huffman_codes(&sf);    // 构造非叶子节点、组织霍夫曼二叉树结构、生成霍夫曼码字，返回霍夫曼码字结构体    root = sf[0];    // 保存根节点    //step3:add by yzhang for huffman statistics... output the statistics to file    huffST_getcodeword(se, &hs);    // 保存码字    output_huffman_statistics(&hs,out_Table);    // 输出统计结果    //end by yzhang    /* Scan the file again and, using the table       previously built, encode it into the output file. */    rewind(in);    // 文件指针重新指向首地址    rc = write_code_table(out, se, symbol_count);    // 将码表写入文件    if(rc == 0)        // 读取输入文件，查表编码，写入文件        rc = do_file_encode(in, out, se);    /* Free the Huffman tree. */    free_huffman_tree(root);    // 由根节点递归释放叶子节点和非叶子结点内存空间    free_encoder(se);    // 释放霍夫曼码结构体    return rc;}

第一次扫描：统计文件中各个字符出现频率

/*
 * First pass over the input: count how often every byte value occurs.
 *
 * A leaf node is created in *pSF the first time a byte value is seen and
 * its count is incremented on every occurrence.
 *
 * pSF - frequency table to fill; reset through init_frequencies first
 *       (the original listing notes this is a memset of the table to 0)
 * in  - input stream, read byte by byte until EOF
 *
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int bytes_seen = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input file. */
    for(;;)
    {
        /* fgetc returns one byte per call and advances the stream */
        int ch = fgetc(in);
        unsigned char sym;

        if(ch == EOF)
            break;

        sym = ch;
        /* first occurrence of this byte value: create its leaf node */
        if((*pSF)[sym] == NULL)
            (*pSF)[sym] = new_leaf_node(sym);

        (*pSF)[sym]->count += 1;
        bytes_seen += 1;
    }

    return bytes_seen;
}

#define MAX_SYMBOLS 256typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];// 定义一个指针数组SymbolFrequencies，数组中每个元素是指向一个霍夫曼节点的指针typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];// 定义一个指针数组SymbolEncoder，数组中每个元素是指向码（叶子）节点的指针

/*
 * Allocate and initialise a new leaf node for `symbol`.
 *
 * The node starts with a count of 0 and no parent.
 *
 * Returns the new node, or NULL when the allocation fails.
 * The caller owns the returned node.
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));

    /* do not write through a failed allocation */
    if(p == NULL)
        return NULL;

    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;
    p->parent = 0;
    return p;
}

/*
 * Store the relative frequency of every symbol in the statistics record.
 *
 * SF          - frequency table; a NULL slot means the symbol never occurred
 * st          - statistics record whose freq[] array is filled
 * total_count - total number of symbols that were counted
 *
 * Returns 1 when the per-symbol counts add up to total_count (sanity
 * check), 0 otherwise.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int idx;
    int counted = 0;    /* sum of all per-symbol counts */

    for(idx = 0; idx < MAX_SYMBOLS; ++idx)
    {
        const huffman_node *node = (*SF)[idx];

        if(node == NULL)
        {
            /* symbol never occurred */
            st->freq[idx] = 0;
            continue;
        }

        /* relative frequency of this symbol */
        st->freq[idx] = (float)node->count / total_count;
        counted += node->count;
    }

    /* the counts must add up to the reported total */
    return counted == total_count ? 1 : 0;
}

建立Huffman树

/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes index by symbol value.
 *
 * An empty table (no symbol occurred at all) yields an encoder with no
 * entries. Returns NULL when an allocation fails; the nodes built so far
 * remain referenced from pSF in that case.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 1
    /* print the node order before sorting */
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort the symbol frequency array by ascending frequency. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    /* print the node order after sorting by ascending frequency */
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Get the number of symbols: NULL entries were sorted to the end. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * The loop performs n - 1 merges. It is written as i + 1 < n because
     * n is unsigned: with n == 0 the expression n - 1 would wrap around
     * and the loop would dereference NULL entries.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        huffman_node *merged;

        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        merged = new_nonleaf_node(m1->count + m2->count, m1, m2);
        if(merged == NULL)
            return NULL;
        (*pSF)[0] = m1->parent = m2->parent = merged;
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

qsort函数介绍
功能：使用快速排序例程进行排序
头文件：stdlib.h
用法：

void qsort(void *base, size_t nelem, size_t width, int (*fcmp)(const void *, const void *));

参数：
1 待排序数组首地址
2 数组中待排序元素数量
3 各元素的占用空间大小
4 指向函数的指针，用于确定排序的顺序

compare函数原型

compare( (void *) & elem1, (void *) & elem2 );

compare 函数返回值的含义：小于 0 表示 elem1 将被排在 elem2 前面；等于 0 表示 elem1 等于 elem2；大于 0 表示 elem1 将被排在 elem2 后面。

/* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. */static intSFComp(const void *p1, const void *p2){    const huffman_node *hn1 = *(const huffman_node**)p1;    const huffman_node *hn2 = *(const huffman_node**)p2;    // 将传入指针的地址设为霍夫曼节点指针指向的地址    /* Sort all NULLs to the end. */    // 第一类，当有传入节点为空时，将空节点排在后面    if(hn1 == NULL && hn2 == NULL)        return 0;    if(hn1 == NULL)        return 1;    if(hn2 == NULL)        return -1;    // 第二类，当传入节点非空时，进行比较    if(hn1->count > hn2->count)        return 1;    else if(hn1->count < hn2->count)        return -1;    return 0;}

/*
 * Allocate and initialise a new internal (non-leaf) node.
 *
 * count - weight stored in the node
 * zero  - child reached through the 0 branch (may be NULL)
 * one   - child reached through the 1 branch (may be NULL)
 *
 * The parent pointers of the children are not touched here. The new node
 * starts without a parent.
 *
 * Returns the new node, or NULL when the allocation fails.
 * The caller owns the returned node.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));

    /* do not write through a failed allocation */
    if(p == NULL)
        return NULL;

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = 0;
    return p;
}

/*
 * build_symbol_encoder fills a SymbolEncoder by walking the Huffman tree
 * recursively: internal nodes are descended into (0 branch first, then
 * 1 branch), and for every leaf the code produced by new_code is stored
 * in the slot indexed by the leaf's symbol.
 *
 * subtree - root of the (sub)tree to walk; NULL is accepted and ignored
 * pSF     - encoder table receiving one code per leaf
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    if(!subtree->isLeaf)
    {
        /* internal node: keep descending until leaves are reached */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* leaf: build the code structure for its symbol */
    (*pSF)[subtree->symbol] = new_code(subtree);
}

/*
 * Copy every code word of the encoder into the statistics record.
 *
 * se - encoder table; a NULL slot means the symbol has no code
 * st - statistics record; numbits[] and bits[][] are filled
 *
 * Symbols without a code get numbits 0 and their bits are left untouched.
 * Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long sym;

    for(sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        huffman_code *code = (*se)[sym];
        unsigned int nbytes;
        unsigned long k;

        if(code == NULL)
        {
            st->numbits[sym] = 0;
            continue;
        }

        st->numbits[sym] = code->numbits;

        /* number of bytes needed to hold the code's bits */
        nbytes = numbytes_from_numbits(code->numbits);

        /* store the code word byte by byte */
        for(k = 0; k < nbytes; ++k)
            st->bits[sym][k] = code->bits[k];
    }

    return 0;
}

/*
 * Write the per-symbol statistics as a tab-separated table: one header
 * line, then one line per symbol with its value, relative frequency,
 * code length in bits and the code word printed bit by bit.
 *
 * st        - statistics record to print
 * out_Table - destination stream
 *
 * Does nothing when either argument is NULL.
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j;    /* bit index; same type as numbits to avoid a signed/unsigned comparison */
    unsigned char c;

    if(st == NULL || out_Table == NULL)
        return;

    fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        fprintf(out_Table,"%d\t   ",i);
        fprintf(out_Table,"%f\t   ",st->freq[i]);
        /* numbits is unsigned long, so it must be printed with %lu */
        fprintf(out_Table,"%lu\t    ",st->numbits[i]);
        if(st->numbits[i])
        {
            /* the symbol has a code: print it one bit at a time */
            for(j = 0; j < st->numbits[i]; ++j)
            {
                c = get_bit(st->bits[i], j);
                fprintf(out_Table,"%d",c);
            }
        }
        fprintf(out_Table,"\n");
    }
}

fprintf函数介绍
功能：根据指定的format(格式)发送信息(参数)到由stream(流)指定的文件
头文件：stdio.h
用法：

int fprintf (FILE* stream, const char*format, [argument]);

参数：
1 文件指针
2 输出格式
3 附加参数列表

将码表及其他必要信息写入输出文件。

/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the begginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 *
 * Both header fields are written through an `unsigned int`, the same type
 * read_code_table reads them into, so writer and reader agree on the
 * field width even where `unsigned long` is wider than 4 bytes.
 * NOTE(review): this assumes `unsigned int` is 32 bits wide - confirm for
 * the target platforms.
 *
 * Returns 0 on success, 1 when a write fails.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i, count = 0;
    unsigned int net_value;    /* header field converted to network byte order */

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order.
     * Network byte order is big-endian: 0x0A0B0C0D is transmitted as
     * 0A 0B 0C 0D. */
    net_value = htonl((unsigned int)count);
    if(fwrite(&net_value, sizeof(net_value), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    net_value = htonl(symbol_count);
    if(fwrite(&net_value, sizeof(net_value), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;

            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;

            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

第二次扫描：对源文件进行编码并输出

/*
 * Second pass over the input: replace every input byte by its Huffman
 * code and write the resulting bit stream to `out`.
 *
 * Bits are collected in curbyte starting at bit 0; a byte is written each
 * time 8 bits have been gathered, so code words are not aligned to byte
 * boundaries in the output.
 *
 * in  - input stream, read byte by byte until EOF
 * out - output stream receiving the packed code bits
 * se  - encoder table indexed by symbol value
 *
 * Returns 0 on success, 1 when a byte has no code in `se` or when a
 * write fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;    /* output byte being assembled */
    unsigned char curbit = 0;     /* number of bits already placed in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* a byte that was not present when the table was built has no code */
        if(code == NULL)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= get_bit(code->bits, i) << curbit;

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

解码流程

读入编码文件
提取必要信息，依照码表重建霍夫曼树
从根节点开始依据从文件中读取的霍夫曼码字沿树行走，至叶结点时输出符号至输出文件，并回到根节点
所有 Huffman 码字解码完毕，文件解码完成

读取码表并重建霍夫曼树

huffman_node* read_code_table(FILE* in, unsigned int *pDataBytes) {    huffman_node *root = new_nonleaf_node(0, NULL, NULL);    // 生成根节点    unsigned int count;     if(fread(&count, sizeof(count), 1, in) != 1)     // 得到码表中的符号数      {           free_huffman_tree(root);        return NULL;    }     count = ntohl(count);    // 读文件用小端方式（主机字节序），而符号数count用大端方式（网络字节序）存，所以需要转换    /* Read the number of data bytes this encoding represents. */      if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)     {           free_huffman_tree(root);         return NULL;      }      *pDataBytes = ntohl(*pDataBytes);    // 字节数与符号数同理，需要转换    /* Read the entries. */      while(count-- > 0)     // 检查是否仍有叶节点未建立，每循环一次建立起一条由根节点至叶结点（符号）的路径     {           int c;        unsigned int curbit;        unsigned char symbol;        unsigned char numbits;        unsigned char numbytes;        unsigned char *bytes;        huffman_node *p = root;        if((c = fgetc(in)) == EOF)         {                free_huffman_tree(root);            return NULL;        }           symbol = (unsigned char)c;         // 每次读取一字节，作为符号        if((c = fgetc(in)) == EOF)           {            free_huffman_tree(root);                return NULL;          }              numbits = (unsigned char)c;         // 每次读取一字节，作为码长          numbytes = (unsigned char)numbytes_from_numbits(numbits);          // 码长转换为字节数        bytes = (unsigned char*)malloc(numbytes);         // 为读取码字分配空间          if(fread(bytes, 1, numbytes, in) != numbytes)         // 读取码字           {                free(bytes);                free_huffman_tree(root);            return NULL;           }          /*             * Add the entry to the Huffman tree. The value             * of the current bit is used switch between             * zero and one child nodes in the tree. New nodes            * are added as needed in the tree.             
*/           for(curbit = 0; curbit < numbits; ++curbit)         // 读取当前码字的每一位，并依据读取的结果逐步建立起由根节点至该符号叶结点的路径           {                if(get_bit(bytes, curbit)) // 当前读取位是否为’1’                {                 // 当前读取位为’1’                     if(p->one == NULL)                    {                          p->one = curbit == (unsigned char)(numbits - 1)                     // 是否是当前码字的最后一位，是，则新建叶结点；不是，则新建非叶结点。                               ? new_leaf_node(symbol)                               : new_nonleaf_node(0, NULL, NULL);                     p->one->parent = p;                     // ‘1’的一枝的父节点指向当前节点                     }                     p = p->one;                 // 沿’1’方向下移一级                }                else             {                  // 当前读取位为’0’                     if(p->zero == NULL)                     {                          p->zero = curbit == (unsigned char)(numbits - 1)                               ? new_leaf_node(symbol)                              : new_nonleaf_node(0, NULL, NULL);                    p->zero->parent = p;                     }                     p = p->zero;                }         }              free(bytes);    }      return root;     // 返回 Huffman 树的根结点  }

ntohl函数介绍
功能：将一个32位无符号整数从网络字节顺序（大端）转换为主机字节顺序（x86 等平台上为小端）。
头文件：Winsock2.h（Windows）；POSIX 系统为 arpa/inet.h
用法：

uint32_t ntohl(uint32_t netlong);

netlong：一个以网络字节顺序表达的32位数。

读取霍夫曼码字，并解码输出

int huffman_decode_file(FILE *in, FILE *out) {      huffman_node *root, *p;    int c;    unsigned int data_count;    /* Read the Huffman code table. */    root = read_code_table(in, &data_count);    if(!root)        return 1;         // Huffman 树建立失败      /* Decode the file. */      p = root;      while(data_count > 0 && (c = fgetc(in)) != EOF)     // data_count >0 ：逻辑上仍有数据；(c = fgetc(in)) != EOF)：文件中仍有数据。      {           unsigned char byte = (unsigned char)c;         // 1byte 的码字           unsigned char mask = 1;         // mask 用于逐位读出码字           while(data_count > 0 && mask)         // loop9: mask = 0x00000000,跳出循环           {                p = byte & mask ? p->one : p->zero;             // 沿 Huffman 树前进                mask <<= 1;            // loop1: byte & 0x00000001               // loop2: byte & 0x00000010                // ……                   // loop8: byte & 0x10000000              if(p->isLeaf)             // 至叶结点（解码完毕）                {                     fputc(p->symbol, out);                 p = root;                    --data_count;            }         }    }      free_huffman_tree(root);     // 所有 Huffman 码字均已解码输出，文件解码完毕      return 0;}

实验结果

对9种不同格式的文件进行霍夫曼编码，并输出码统计结果。

out_table示例：

symbol freq codelength code 0 0.001801 9 11101101 1 0.000173 12 10101110110 2 0.000181 12 11000100000 3 0.000196 12 11001110100 4 0.000215 12 11011101001 5 0.000202 12 11011101000 6 0.000242 12 11110111111

各文件字符概率分布图

压缩效率表

文件类型平均码长信源熵(bit/sym) 原文件大小(kB) 压缩后文件大小(kB) 压缩比 yuv 3.478396 3.572306 798 344 2.319767 ppt 5.547454 5.580906 185 131 1.412214 docx 7.81157 7.829101 44 43 1.023256 png 7.974747 7.98686 677 676 1.001479 jpg 7.912941 7.940257 264 263 1.003802 pdf 7.99668 7.99996 430 430 1 wav 6.122056 6.143849 67 53 1.264151 exe 7.477447 7.508548 980 921 1.064061 rar 7.983922 7.999291 14 15 0.933333

由上表可知霍夫曼编码的平均码长接近信源熵。最大信源熵为8bit/sym。观察高压缩比文件的概率分布图，发现符号概率分布越不均匀的文件，经霍夫曼编码后，压缩比越高，因为霍夫曼编码中概率越大的符号码长越短。

0 0