实验三 Huffman编解码算法实现与压缩效率分析

来源：互联网　发布：淘宝客怎样设置佣金　编辑：程序博客网　时间：2024/05/24 05:49

一、Huffman编解码原理

1. Huffman编码

对原始文件进行Huffman编码，首先需要解决以下几点问题：

文件符号的概率分布情况是怎样的？
Huffman树是如何建立的？
建立起Huffman树后，又是怎样读出符号对应码字的？

这三个问题在程序中的实现思路如下图：

（原文此处为程序实现思路的流程图，图片未能保留。）

将待编码文件里的数据参照已形成的Huffman码表一一进行转换，就可以得到编码后的文件了。

2. Huffman解码

Huffman解码是查表＋翻译的过程。读取随接收文件传来的码表后，再逐位读取文件实际数据，对照码表进行翻译即可。

二、程序实现

流程中最关键的是对Huffman树的操作，在程序中主要通过两个结构体实现：huffman_node和huffman_code。
建立的二叉树上每个节点都是huffman_node类型。节点之间的关系主要有父子关系和兄弟关系，huffman_node中用指向父节点的指针parent以及指向两个孩子的指针zero、one来表示这些关系；此外还包含节点本身的属性：isLeaf（是否为叶节点）、count（符号出现次数）和symbol（符号值）。
码字则定义为huffman_code结构体，其属性包括码字占用的比特数（numbits）和码字本身（bits）。
具体程序如下，部分理解在注释中给出。

Huffcode.c

/*
 *  huffcode - Encode/Decode files using Huffman encoding.
 *  http://huffman.sourceforge.net
 *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <assert.h>

#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endif

static int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);

/* Print the program name, version and copyright notice to `out`. */
static void
version(FILE *out)
{
    fputs("huffcode 0.3\n"
          "Copyright (C) 2003 Douglas Ryan Richardson"
          "; Gauss Interprise, Inc\n",
          out);
}

/*
 * Print the command line help to `out`.
 *
 * All help lines are adjacent string literals so that they form one
 * string; separating them with commas would pass extra arguments to
 * fputs.
 */
static void
usage(FILE* out)
{
    fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c]\n"
          "-i - input file (default is standard input)\n"
          "-o - output file (default is standard output)\n"
          "-d - decompress\n"
          "-c - compress (default)\n"
          "-m - read file into memory, compress, then write to file (not default)\n"
          /* Added for the Huffman statistics table. */
          "-t - output huffman statistics\n",
          out);
}

/*
 * Entry point: parse the options, open the requested streams and run
 * the selected operation.
 *
 * Returns 0 on success and 1 on any failure (bad option, a file that
 * cannot be opened, or an encode/decode/write error).
 */
int
main(int argc, char** argv)
{
    char memory = 0;
    char compress = 1;
    int opt;
    int rc = 1;
    const char *file_in = NULL, *file_out = NULL;
    /* Destination of the statistics table; stays NULL without -t. */
    const char *file_out_table = NULL;
    FILE *in = stdin;
    FILE *out = stdout;
    FILE *outTable = NULL;

    /* Get the command line arguments. */
    while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)
    {
        switch(opt)
        {
        case 'i':
            file_in = optarg;
            break;
        case 'o':
            file_out = optarg;
            break;
        case 'c':
            compress = 1; /* compress */
            break;
        case 'd':
            compress = 0; /* decompress */
            break;
        case 'h':
            usage(stdout);
            return 0;
        case 'v':
            version(stdout);
            return 0;
        case 'm':
            memory = 1;
            break;
        case 't':
            file_out_table = optarg;
            break;
        default:
            usage(stderr);
            return 1;
        }
    }

    /* If an input file is given then open it. */
    if(file_in)
    {
        in = fopen(file_in, "rb");
        if(!in)
        {
            fprintf(stderr,
                    "Can't open input file '%s': %s\n",
                    file_in, strerror(errno));
            goto done;
        }
    }

    /* If an output file is given then create it. */
    if(file_out)
    {
        out = fopen(file_out, "wb");
        if(!out)
        {
            fprintf(stderr,
                    "Can't open output file '%s': %s\n",
                    file_out, strerror(errno));
            goto done;
        }
    }

    /* If a statistics file is given then create it (text mode). */
    if(file_out_table)
    {
        outTable = fopen(file_out_table, "w");
        if(!outTable)
        {
            fprintf(stderr,
                "Can't open output file '%s': %s\n",
                file_out_table, strerror(errno));
            goto done;
        }
    }

    /* Run the selected operation and keep its status as the exit code. */
    if(memory)
        rc = compress ? memory_encode_file(in, out) : memory_decode_file(in, out);
    else if(compress)
        rc = huffman_encode_file(in, out, outTable);
    else
        rc = huffman_decode_file(in, out);

done:
    /* Every path, including the in-memory one, releases its streams. */
    if(in)
        fclose(in);
    /* fclose flushes buffered output, so a failure here is a write error. */
    if(out && fclose(out) != 0 && rc == 0)
    {
        fprintf(stderr, "Error writing output: %s\n", strerror(errno));
        rc = 1;
    }
    if(outTable)
        fclose(outTable);

    return rc;
}

/*
 * Read all remaining bytes of `in` into a newly allocated buffer.
 *
 * On success returns 0 and stores the buffer in *pbuf (never NULL, the
 * caller frees it) and the byte count in *plen. Returns 1 on allocation
 * failure, on a read error, or when the data does not fit the unsigned
 * int length used by the memory API.
 */
static int
read_stream(FILE *in, unsigned char **pbuf, unsigned int *plen)
{
    const unsigned int inc = 1024;
    unsigned char *buf = NULL;
    unsigned int cap = 0, len = 0;
    size_t got;

    do
    {
        unsigned char *tmp;

        if(cap > UINT_MAX - inc)
        {
            free(buf);
            return 1;
        }
        tmp = (unsigned char*)realloc(buf, cap + inc);
        if(!tmp)
        {
            free(buf);
            return 1;
        }
        buf = tmp;
        cap += inc;

        got = fread(buf + len, 1, inc, in);
        len += (unsigned int)got;
        /* A short read means end of file or an error; stop either way
           instead of looping on feof(), which never ends on an error. */
    } while(got == inc);

    if(ferror(in))
    {
        free(buf);
        return 1;
    }

    *pbuf = buf;
    *plen = len;
    return 0;
}

/*
 * Compress `in` to `out` by reading the whole input into memory,
 * encoding it with huffman_encode_memory and writing the result.
 * Returns 0 on success, 1 on failure.
 */
static int
memory_encode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int cur = 0, bufoutlen = 0;
    int rc;

    assert(in && out);

    /* Read the file into memory. */
    if(read_stream(in, &buf, &cur))
        return 1;

    /* Encode the memory. */
    rc = huffman_encode_memory(buf, cur, &bufout, &bufoutlen);
    free(buf);
    if(rc)
        return 1;

    /* Write the memory to the file. */
    rc = fwrite(bufout, 1, bufoutlen, out) != bufoutlen;
    free(bufout);
    return rc;
}

/*
 * Decompress `in` to `out` by reading the whole input into memory,
 * decoding it with huffman_decode_memory and writing the result.
 * Returns 0 on success, 1 on failure.
 */
static int
memory_decode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int cur = 0, bufoutlen = 0;
    int rc;

    assert(in && out);

    /* Read the file into memory. */
    if(read_stream(in, &buf, &cur))
        return 1;

    /* Decode the memory. */
    rc = huffman_decode_memory(buf, cur, &bufout, &bufoutlen);
    free(buf);
    if(rc)
        return 1;

    /* Write the memory to the file. */
    rc = fwrite(bufout, 1, bufoutlen, out) != bufoutlen;
    free(bufout);
    return rc;
}

Huffman.c

/* *  huffman - Encode/Decode files using Huffman encoding. *  http://huffman.sourceforge.net *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc * *  This library is free software; you can redistribute it and/or *  modify it under the terms of the GNU Lesser General Public *  License as published by the Free Software Foundation; either *  version 2.1 of the License, or (at your option) any later version. * *  This library is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU *  Lesser General Public License for more details. * *  You should have received a copy of the GNU Lesser General Public *  License along with this library; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include "huffman.h"#ifdef WIN32#include <winsock2.h>#include <malloc.h>#define alloca _alloca#else#include <netinet/in.h>#endiftypedef struct huffman_node_tag{    unsigned char isLeaf;    unsigned long count;    struct huffman_node_tag *parent;    union    {        struct        {            struct huffman_node_tag *zero, *one;        };        unsigned char symbol;    };} huffman_node;typedef struct huffman_code_tag{    /* The length of this code in bits. */    unsigned long numbits;    /* The bits that make up this code. The first       bit is at position 0 in bits[0]. The second       bit is at position 1 in bits[0]. The eighth       bit is at position 7 in bits[0]. The ninth       bit is at position 0 in bits[1]. 
*/    unsigned char *bits;} huffman_code;//step2:add by yzhang for huffman statistics//存放信源符号的信息:符号频率、比特数、符号码字typedef struct huffman_statistics_result{    float freq[256];    unsigned long numbits[256];    unsigned char bits[256][100];}huffman_stat;/*huffman_stat *init_huffstatistics(){   huffman_stat *p;    int i;    p = (huffman_stat*)malloc(sizeof(huffman_stat));    p->freq = (float *)malloc(sizeof(float)*256 );    p->numbits = (unsigned long *) malloc(sizeof(unsigned long)*256);    for (i=0 ; i<256;i++)        p->bits[i] = (unsigned char *)malloc(sizeof(unsigned char)*100);     return p;}*///end by yzhang//将bit数转换为其对应的byte数,不能被8整除的部分要多分配一整个byte给它static unsigned longnumbytes_from_numbits(unsigned long numbits){    return numbits / 8 + (numbits % 8 ? 1 : 0);}/* * get_bit returns the ith bit in the bits array * in the 0th position of the return value. */static unsigned charget_bit(unsigned char* bits, unsigned long i){    return (bits[i / 8] >> i % 8) & 1;}//由于程序中从二叉树形成码字的过程是从叶到根的,所以需要bit反转函数来获得顺序正确的码字,同时以byte为单位对其进行规范化//例:传入倒序码字为010111011,通过bit反转函数变为00000001 10111010static voidreverse_bits(unsigned char* bits, unsigned long numbits){    unsigned long numbytes = numbytes_from_numbits(numbits);    unsigned char *tmp =        (unsigned char*)alloca(numbytes);//alloca与malloc功能相似，但alloca会自动释放申请的空间    unsigned long curbit;    long curbyte = 0;    memset(tmp, 0, numbytes);//将tmp指向空间的前numbytes个字节内容全部置0    for(curbit = 0; curbit < numbits; ++curbit)    {        unsigned int bitpos = curbit % 8;        //如果一个byte写满了，就跳到下一个byte继续写        if(curbit > 0 && curbit % 8 == 0)            ++curbyte;        //通过get_bit函数从传入的bits里获得当前操作的比特结果,用移位运算将其移动到在一个byte里对应的位置        //由于tmp的指向操作是以byte为单位的，这里只能通过按位取或(|=)来把bit一个一个写到tmp指向的空间里去        //bit反转是靠numbits-curbit-1实现的        tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);    }    memcpy(bits, tmp, numbytes);//把反转后的tmp写回到bits里}/* * new_code builds a huffman_code from a leaf in * a Huffman tree. 
*/static huffman_code*new_code(const huffman_node* leaf){    /* Build the huffman code by walking up to     * the root node and then reversing the bits,     * since the Huffman code is calculated by     * walking down the tree. */    unsigned long numbits = 0;    unsigned char* bits = NULL;    huffman_code *p;    //此段while循环的目的是从传入的叶结点开始向上进行寻根,得到该叶结点对应的码字    while(leaf && leaf->parent)    {        huffman_node *parent = leaf->parent;        unsigned char cur_bit = (unsigned char)(numbits % 8);        unsigned long cur_byte = numbits / 8;        /* If we need another byte to hold the code,           then allocate it. */        if(cur_bit == 0)        {            size_t newSize = cur_byte + 1;            bits = (unsigned char*)realloc(bits, newSize);//把bits所占的空间大小调整为newSize个字节            bits[newSize - 1] = 0; /* Initialize the new byte. */        }        /* If a one must be added then or it in. If a zero         * must be added then do nothing, since the byte         * was initialized to zero. 
*/        if(leaf == parent->one)//如果叶结点的地址等于该叶结点的爹妈的1孩子地址,则进行对应的移位操作            bits[cur_byte] |= 1 << cur_bit;        ++numbits;        leaf = parent;    }    if(bits)        reverse_bits(bits, numbits);    p = (huffman_code*)malloc(sizeof(huffman_code));    p->numbits = numbits;    p->bits = bits;    return p;//p里包含了编完的码字、码字长度}#define MAX_SYMBOLS 256typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];//传入符号,建立其对应的叶结点,设置参数static huffman_node*new_leaf_node(unsigned char symbol){    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));    p->isLeaf = 1;    p->symbol = symbol;    p->count = 0;    p->parent = 0;    return p;}//建立一个非叶结点,并将它的0、1孩子地址设置为传入的0、1结点地址static huffman_node*new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one){    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));    p->isLeaf = 0;    p->count = count;    p->zero = zero;    p->one = one;    p->parent = 0;    return p;}static voidfree_huffman_tree(huffman_node *subtree){    if(subtree == NULL)        return;    if(!subtree->isLeaf)    {        free_huffman_tree(subtree->zero);        free_huffman_tree(subtree->one);    }    free(subtree);}static voidfree_code(huffman_code* p){    free(p->bits);    free(p);}static voidfree_encoder(SymbolEncoder *pSE){    unsigned long i;    for(i = 0; i < MAX_SYMBOLS; ++i)    {        huffman_code *p = (*pSE)[i];        if(p)            free_code(p);    }    free(pSE);}static voidinit_frequencies(SymbolFrequencies *pSF){    memset(*pSF, 0, sizeof(SymbolFrequencies));#if 0    unsigned int i;    for(i = 0; i < MAX_SYMBOLS; ++i)    {        unsigned char uc = (unsigned char)i;        (*pSF)[i] = new_leaf_node(uc);    }#endif}typedef struct buf_cache_tag{    unsigned char *cache;    unsigned int cache_len;    unsigned int cache_cur;    unsigned char **pbufout;    unsigned int *pbufoutlen;} buf_cache;static int init_cache(buf_cache* pc,                      unsigned int 
cache_size,                      unsigned char **pbufout,                      unsigned int *pbufoutlen){    assert(pc && pbufout && pbufoutlen);    if(!pbufout || !pbufoutlen)        return 1;    pc->cache = (unsigned char*)malloc(cache_size);    pc->cache_len = cache_size;    pc->cache_cur = 0;    pc->pbufout = pbufout;    *pbufout = NULL;    pc->pbufoutlen = pbufoutlen;    *pbufoutlen = 0;    return pc->cache ? 0 : 1;}static void free_cache(buf_cache* pc){    assert(pc);    if(pc->cache)    {        free(pc->cache);        pc->cache = NULL;    }}static int flush_cache(buf_cache* pc){    assert(pc);    if(pc->cache_cur > 0)    {        unsigned int newlen = pc->cache_cur + *pc->pbufoutlen;        unsigned char* tmp = realloc(*pc->pbufout, newlen);        if(!tmp)            return 1;        memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur);        *pc->pbufout = tmp;        *pc->pbufoutlen = newlen;        pc->cache_cur = 0;    }    return 0;}static int write_cache(buf_cache* pc,                       const void *to_write,                       unsigned int to_write_len){    unsigned char* tmp;    assert(pc && to_write);    assert(pc->cache_len >= pc->cache_cur);    /* If trying to write more than the cache will hold     * flush the cache and allocate enough space immediately,     * that is, don't use the cache. */    if(to_write_len > pc->cache_len - pc->cache_cur)    {        unsigned int newlen;        flush_cache(pc);        newlen = *pc->pbufoutlen + to_write_len;        tmp = realloc(*pc->pbufout, newlen);        if(!tmp)            return 1;        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);        *pc->pbufout = tmp;        *pc->pbufoutlen = newlen;    }    else    {        /* Write the data to the cache. 
*/        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);        pc->cache_cur += to_write_len;    }    return 0;}//为信源符号建立叶结点,统计次数static unsigned intget_symbol_frequencies(SymbolFrequencies *pSF, FILE *in){    int c;    unsigned int total_count = 0;    /* Set all frequencies to 0. */    init_frequencies(pSF);    /* Count the frequency of each symbol in the input file. */    while((c = fgetc(in)) != EOF)    {        unsigned char uc = c;        if(!(*pSF)[uc])//如果第一次遇到这个符号,则新建该符号的叶结点            (*pSF)[uc] = new_leaf_node(uc);        ++(*pSF)[uc]->count;//对所有符号出现的次数分别进行计数        ++total_count;    }    return total_count;}static unsigned intget_symbol_frequencies_from_memory(SymbolFrequencies *pSF,                                   const unsigned char *bufin,                                   unsigned int bufinlen){    unsigned int i;    unsigned int total_count = 0;    /* Set all frequencies to 0. */    init_frequencies(pSF);    /* Count the frequency of each symbol in the input file. */    for(i = 0; i < bufinlen; ++i)    {        unsigned char uc = bufin[i];        if(!(*pSF)[uc])            (*pSF)[uc] = new_leaf_node(uc);        ++(*pSF)[uc]->count;        ++total_count;    }    return total_count;}/* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. */static intSFComp(const void *p1, const void *p2){    const huffman_node *hn1 = *(const huffman_node**)p1;    const huffman_node *hn2 = *(const huffman_node**)p2;    /* Sort all NULLs to the end. 
*/    if(hn1 == NULL && hn2 == NULL)        return 0;    if(hn1 == NULL)        return 1;    if(hn2 == NULL)        return -1;    if(hn1->count > hn2->count)        return 1;    else if(hn1->count < hn2->count)        return -1;    return 0;}#if 1static voidprint_freqs(SymbolFrequencies * pSF){    size_t i;    for(i = 0; i < MAX_SYMBOLS; ++i)    {        if((*pSF)[i])            printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count);        else            printf("NULL\n");    }}#endif/* * build_symbol_encoder builds a SymbolEncoder by walking * down to the leaves of the Huffman tree and then, * for each leaf, determines its code. */static voidbuild_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF){    if(subtree == NULL)        return;    //如果传入的结点是叶结点,对其进行编码并存放在对应的指针指向的空间里;如果不是,用递归方法不断调用自身传入该结点的左、右孩子,直到叶结点    if(subtree->isLeaf)        (*pSF)[subtree->symbol] = new_code(subtree);    else    {   //递归        build_symbol_encoder(subtree->zero, pSF);        build_symbol_encoder(subtree->one, pSF);    }}/* * calculate_huffman_codes turns pSF into an array * with a single entry that is the root of the * huffman tree. The return value is a SymbolEncoder, * which is an array of huffman codes index by symbol value. */static SymbolEncoder*calculate_huffman_codes(SymbolFrequencies * pSF){    unsigned int i = 0;    unsigned int n = 0;    huffman_node *m1 = NULL, *m2 = NULL;    SymbolEncoder *pSE = NULL;#if 1    printf("BEFORE SORT\n");    print_freqs(pSF);   //演示堆栈的使用#endif    /* Sort the symbol frequency array by ascending frequency. */    //qsort是自带的快速排序函数,参数为待排序数组的首地址(*pSF),排序元素数量(MAX_SYMBOLS),每个元素的长度(sizeof((*pSF)[0])),自定义的比较函数(SFComp,返回1则前〉后,-1则后〉前)    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);   //讲解SFComp函数的作用，断点在调试程序里的作用#if 1       printf("AFTER SORT\n");    print_freqs(pSF);#endif    /* Get the number of symbols. */    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)        ;    /*     * Construct a Huffman tree. 
This code is based     * on the algorithm given in Managing Gigabytes     * by Ian Witten et al, 2nd edition, page 34.     * Note that this implementation uses a simple     * count instead of probability.     */    for(i = 0; i < n - 1; ++i)    {        /* Set m1 and m2 to the two subsets of least probability. */            m1 = (*pSF)[0];        m2 = (*pSF)[1];        /* Replace m1 and m2 with a set {m1, m2} whose probability         * is the sum of that of m1 and m2. */        (*pSF)[0] = m1->parent = m2->parent =            new_nonleaf_node(m1->count + m2->count, m1, m2);        (*pSF)[1] = NULL;        /* Put newSet into the correct count position in pSF. */        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);    }    /* Build the SymbolEncoder array from the tree. */    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));    memset(pSE, 0, sizeof(SymbolEncoder));    build_symbol_encoder((*pSF)[0], pSE);    return pSE;}/* * Write the huffman code table. The format is: * 4 byte code count in network byte order. * 4 byte number of bytes encoded *   (if you decode the data, you should get this number of bytes) * code1 * ... * codeN, where N is the count read at the begginning of the file. * Each codeI has the following format: * 1 byte symbol, 1 byte code bit length, code bytes. * Each entry has numbytes_from_numbits code bytes. * The last byte of each code may have extra bits, if the number of * bits in the code is not a multiple of 8. */static intwrite_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count){    unsigned long i, count = 0;    /* Determine the number of entries in se. */    for(i = 0; i < MAX_SYMBOLS; ++i)    {        if((*se)[i])            ++count;    }    /* Write the number of entries in network byte order. 
*/    i = htonl(count);    //在网络传输中，采用big-endian序，对于0x0A0B0C0D ，传输顺序就是0A 0B 0C 0D ，    //因此big-endian作为network byte order，little-endian作为host byte order。    //little-endian的优势在于unsigned char/short/int/long类型转换时，存储位置无需改变    if(fwrite(&i, sizeof(i), 1, out) != 1)        return 1;    /* Write the number of bytes that will be encoded. */    symbol_count = htonl(symbol_count);    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)        return 1;    /* Write the entries. */    for(i = 0; i < MAX_SYMBOLS; ++i)    {        huffman_code *p = (*se)[i];        if(p)        {            unsigned int numbytes;            /* Write the 1 byte symbol. */            fputc((unsigned char)i, out);            /* Write the 1 byte code bit length. */            fputc(p->numbits, out);            /* Write the code bytes. */            numbytes = numbytes_from_numbits(p->numbits);            if(fwrite(p->bits, 1, numbytes, out) != numbytes)                return 1;        }    }    return 0;}/* * Allocates memory and sets *pbufout to point to it. The memory * contains the code table. */static intwrite_code_table_to_memory(buf_cache *pc,                           SymbolEncoder *se,                           unsigned int symbol_count){    unsigned long i, count = 0;    /* Determine the number of entries in se. */    for(i = 0; i < MAX_SYMBOLS; ++i)    {        if((*se)[i])            ++count;    }    /* Write the number of entries in network byte order. */    i = htonl(count);    if(write_cache(pc, &i, sizeof(i)))        return 1;    /* Write the number of bytes that will be encoded. */    symbol_count = htonl(symbol_count);    if(write_cache(pc, &symbol_count, sizeof(symbol_count)))        return 1;    /* Write the entries. */    for(i = 0; i < MAX_SYMBOLS; ++i)    {        huffman_code *p = (*se)[i];        if(p)        {            unsigned int numbytes;            /* The value of i is < MAX_SYMBOLS (256), so it can            be stored in an unsigned char. 
*/            unsigned char uc = (unsigned char)i;            /* Write the 1 byte symbol. */            if(write_cache(pc, &uc, sizeof(uc)))                return 1;            /* Write the 1 byte code bit length. */            uc = (unsigned char)p->numbits;            if(write_cache(pc, &uc, sizeof(uc)))                return 1;            /* Write the code bytes. */            numbytes = numbytes_from_numbits(p->numbits);            if(write_cache(pc, p->bits, numbytes))                return 1;        }    }    return 0;}/* * read_code_table builds a Huffman tree from the code * in the in file. This function returns NULL on error. * The returned value should be freed with free_huffman_tree. */static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){    //在解码端重建huffman树    huffman_node *root = new_nonleaf_node(0, NULL, NULL);    unsigned int count;    /* Read the number of entries.       (it is stored in network byte order). */    if(fread(&count, sizeof(count), 1, in) != 1)    {        free_huffman_tree(root);        return NULL;    }    count = ntohl(count);//将一个无符号长整形数从网络字节顺序转换为主机字节顺序    /* Read the number of data bytes this encoding represents. */    if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)    {        free_huffman_tree(root);        return NULL;    }    *pDataBytes = ntohl(*pDataBytes);    /* Read the entries. 
*/    while(count-- > 0)    {        int c;        unsigned int curbit;        unsigned char symbol;        unsigned char numbits;        unsigned char numbytes;        unsigned char *bytes;        huffman_node *p = root;        if((c = fgetc(in)) == EOF)//读取符号并判断        {            free_huffman_tree(root);            return NULL;        }        symbol = (unsigned char)c;        if((c = fgetc(in)) == EOF)//读取字符长度并判断        {            free_huffman_tree(root);            return NULL;        }        numbits = (unsigned char)c;        numbytes = (unsigned char)numbytes_from_numbits(numbits);        bytes = (unsigned char*)malloc(numbytes);        if(fread(bytes, 1, numbytes, in) != numbytes)        {            free(bytes);            free_huffman_tree(root);            return NULL;        }        /*         * Add the entry to the Huffman tree. The value         * of the current bit is used switch between         * zero and one child nodes in the tree. New nodes         * are added as needed in the tree.         */        for(curbit = 0; curbit < numbits; ++curbit)        {            if(get_bit(bytes, curbit))            {                if(p->one == NULL)                {                    p->one = curbit == (unsigned char)(numbits - 1)                        ? new_leaf_node(symbol)                        : new_nonleaf_node(0, NULL, NULL);                    p->one->parent = p;                }                p = p->one;            }            else            {                if(p->zero == NULL)                {                    p->zero = curbit == (unsigned char)(numbits - 1)                        ? 
new_leaf_node(symbol)                        : new_nonleaf_node(0, NULL, NULL);                    p->zero->parent = p;                }                p = p->zero;            }        }        free(bytes);    }    return root;}static intmemread(const unsigned char* buf,        unsigned int buflen,        unsigned int *pindex,        void* bufout,        unsigned int readlen){    assert(buf && pindex && bufout);    assert(buflen >= *pindex);    if(buflen < *pindex)        return 1;    if(readlen + *pindex >= buflen)        return 1;    memcpy(bufout, buf + *pindex, readlen);    *pindex += readlen;    return 0;}static huffman_node*read_code_table_from_memory(const unsigned char* bufin,                            unsigned int bufinlen,                            unsigned int *pindex,                            unsigned int *pDataBytes){    huffman_node *root = new_nonleaf_node(0, NULL, NULL);    unsigned int count;    /* Read the number of entries.       (it is stored in network byte order). */    if(memread(bufin, bufinlen, pindex, &count, sizeof(count)))    {        free_huffman_tree(root);        return NULL;    }    count = ntohl(count);    /* Read the number of data bytes this encoding represents. */    if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))    {        free_huffman_tree(root);        return NULL;    }    *pDataBytes = ntohl(*pDataBytes);    /* Read the entries. 
*/    while(count-- > 0)    {        unsigned int curbit;        unsigned char symbol;        unsigned char numbits;        unsigned char numbytes;        unsigned char *bytes;        huffman_node *p = root;        if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))        {            free_huffman_tree(root);            return NULL;        }        if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))        {            free_huffman_tree(root);            return NULL;        }        numbytes = (unsigned char)numbytes_from_numbits(numbits);        bytes = (unsigned char*)malloc(numbytes);        if(memread(bufin, bufinlen, pindex, bytes, numbytes))        {            free(bytes);            free_huffman_tree(root);            return NULL;        }        /*         * Add the entry to the Huffman tree. The value         * of the current bit is used switch between         * zero and one child nodes in the tree. New nodes         * are added as needed in the tree.         */        for(curbit = 0; curbit < numbits; ++curbit)        {            if(get_bit(bytes, curbit))            {                if(p->one == NULL)                {                    p->one = curbit == (unsigned char)(numbits - 1)                        ? new_leaf_node(symbol)                        : new_nonleaf_node(0, NULL, NULL);                    p->one->parent = p;                }                p = p->one;            }            else            {                if(p->zero == NULL)                {                    p->zero = curbit == (unsigned char)(numbits - 1)                        ? 
new_leaf_node(symbol)                        : new_nonleaf_node(0, NULL, NULL);                    p->zero->parent = p;                }                p = p->zero;            }        }        free(bytes);    }    return root;}static intdo_file_encode(FILE* in, FILE* out, SymbolEncoder *se){    unsigned char curbyte = 0;    unsigned char curbit = 0;    int c;    while((c = fgetc(in)) != EOF)    {        unsigned char uc = (unsigned char)c;        huffman_code *code = (*se)[uc];        unsigned long i;        for(i = 0; i < code->numbits; ++i)        {            /* Add the current bit to curbyte. */            curbyte |= get_bit(code->bits, i) << curbit;            /* If this byte is filled up then write it             * out and reset the curbit and curbyte. */            if(++curbit == 8)            {                fputc(curbyte, out);                curbyte = 0;                curbit = 0;            }        }    }    /*     * If there is data in curbyte that has not been     * output yet, which means that the last encoded     * character did not fall on a byte boundary,     * then output it.     */    if(curbit > 0)//写最后一个符号没写满8bit的情况        fputc(curbyte, out);    return 0;}static intdo_memory_encode(buf_cache *pc,                 const unsigned char* bufin,                 unsigned int bufinlen,                 SymbolEncoder *se){    unsigned char curbyte = 0;    unsigned char curbit = 0;    unsigned int i;    for(i = 0; i < bufinlen; ++i)    {        unsigned char uc = bufin[i];        huffman_code *code = (*se)[uc];        unsigned long i;        for(i = 0; i < code->numbits; ++i)        {            /* Add the current bit to curbyte. */            curbyte |= get_bit(code->bits, i) << curbit;            /* If this byte is filled up then write it             * out and reset the curbit and curbyte. 
*/            if(++curbit == 8)            {                if(write_cache(pc, &curbyte, sizeof(curbyte)))                    return 1;                curbyte = 0;                curbit = 0;            }        }    }    /*     * If there is data in curbyte that has not been     * output yet, which means that the last encoded     * character did not fall on a byte boundary,     * then output it.     */    return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;}//step3:add by yzhang for huffman statisticsint huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count){    int i,count =0;    for(i = 0; i < MAX_SYMBOLS; ++i)    {           if((*SF)[i])        {            st->freq[i]=(float)(*SF)[i]->count/total_count;            count+=(*SF)[i]->count;        }        else         {            st->freq[i]= 0;        }    }    if(count==total_count)        return 1;    else        return 0;}int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st){    unsigned long i,j;    for(i = 0; i < MAX_SYMBOLS; ++i)    {        huffman_code *p = (*se)[i];        if(p)        {            unsigned int numbytes;            st->numbits[i] = p->numbits;            numbytes = numbytes_from_numbits(p->numbits);            for (j=0;j<numbytes;j++)                st->bits[i][j] = p->bits[j];        }        else            st->numbits[i] =0;    }    return 0;}void output_huffman_statistics(huffman_stat *st,FILE *out_Table){    int i,j;    unsigned char c;    fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");    for(i = 0; i < MAX_SYMBOLS; ++i)    {           fprintf(out_Table,"%d\t   ",i);        fprintf(out_Table,"%f\t   ",st->freq[i]);        fprintf(out_Table,"%d\t    ",st->numbits[i]);        if(st->numbits[i])        {            for(j = 0; j < st->numbits[i]; ++j)            {                c =get_bit(st->bits[i], j);                fprintf(out_Table,"%d",c);            }        }        fprintf(out_Table,"\n");    }}//end by 
yzhang/* * huffman_encode_file huffman encodes in to out. */inthuffman_encode_file(FILE *in, FILE *out, FILE *out_Table)  //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table){    SymbolFrequencies sf;    SymbolEncoder *se;    huffman_node *root = NULL;    int rc;    unsigned int symbol_count;    //step2:add by yzhang for huffman statistics    huffman_stat hs;    //end by yzhang    /* Get the frequency of each symbol in the input file. */    symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后，SF指针数组的每个元素的构成    //step3:add by yzhang for huffman statistics,...  get the frequency of each symbol     huffST_getSymFrequencies(&sf,&hs,symbol_count);    //end by yzhang    /* Build an optimal table from the symbolCount. */    se = calculate_huffman_codes(&sf);    root = sf[0];    //step3:add by yzhang for huffman statistics... output the statistics to file    huffST_getcodeword(se, &hs);    output_huffman_statistics(&hs,out_Table);    //end by yzhang    /* Scan the file again and, using the table       previously built, encode it into the output file. */    rewind(in);    rc = write_code_table(out, se, symbol_count);    if(rc == 0)        rc = do_file_encode(in, out, se);    /* Free the Huffman tree. */    free_huffman_tree(root);    free_encoder(se);    return rc;}inthuffman_decode_file(FILE *in, FILE *out){    huffman_node *root, *p;    int c;    unsigned int data_count;    /* Read the Huffman code table. */    root = read_code_table(in, &data_count);    if(!root)        return 1;    /* Decode the file. */    p = root;    while(data_count > 0 && (c = fgetc(in)) != EOF)    {        unsigned char byte = (unsigned char)c;        unsigned char mask = 1;        while(data_count > 0 && mask)        {            p = byte & mask ? 
p->one : p->zero;            mask <<= 1;            if(p->isLeaf)            {                fputc(p->symbol, out);                p = root;                --data_count;            }        }    }    free_huffman_tree(root);    return 0;}#define CACHE_SIZE 1024int huffman_encode_memory(const unsigned char *bufin,                          unsigned int bufinlen,                          unsigned char **pbufout,                          unsigned int *pbufoutlen){    SymbolFrequencies sf;    SymbolEncoder *se;    huffman_node *root = NULL;    int rc;    unsigned int symbol_count;    buf_cache cache;    /* Ensure the arguments are valid. */    if(!pbufout || !pbufoutlen)        return 1;    if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))        return 1;    /* Get the frequency of each symbol in the input memory. */    symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);    /* Build an optimal table from the symbolCount. */    se = calculate_huffman_codes(&sf);    root = sf[0];    /* Scan the memory again and, using the table       previously built, encode it into the output memory. */    rc = write_code_table_to_memory(&cache, se, symbol_count);    if(rc == 0)        rc = do_memory_encode(&cache, bufin, bufinlen, se);    /* Flush the cache. */    flush_cache(&cache);    /* Free the Huffman tree. */    free_huffman_tree(root);    free_encoder(se);    free_cache(&cache);    return rc;}int huffman_decode_memory(const unsigned char *bufin,                          unsigned int bufinlen,                          unsigned char **pbufout,                          unsigned int *pbufoutlen){    huffman_node *root, *p;    unsigned int data_count;    unsigned int i = 0;    unsigned char *buf;    unsigned int bufcur = 0;    /* Ensure the arguments are valid. */    if(!pbufout || !pbufoutlen)        return 1;    /* Read the Huffman code table. 
*/    root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);    if(!root)        return 1;    buf = (unsigned char*)malloc(data_count);    /* Decode the memory. */    p = root;    for(; i < bufinlen && data_count > 0; ++i)     {        unsigned char byte = bufin[i];        unsigned char mask = 1;        while(data_count > 0 && mask)        {            p = byte & mask ? p->one : p->zero;            mask <<= 1;            if(p->isLeaf)            {                buf[bufcur++] = p->symbol;                p = root;                --data_count;            }        }    }    free_huffman_tree(root);    *pbufout = buf;    *pbufoutlen = bufcur;    return 0;}

三、结果分析

实验选取了10种文件类型进行Huffman编码，分别为bmp、doc、exe、pdf、png、ppt、rar、wav、xls、yuv。对编码后的文件进行分析，得到以下结果图表：
这里写图片描述
可以看到，进行Huffman编码后，大多数文件都变小了，压缩比在1到4之间。但也有rar这样经过编码后不小反大的文件。

再观察每个文件的字符概率分布情况：
这里写图片描述

将两张图表结合起来对比可以发现，压缩比是由符号的概率分布决定的：相比于实验选用的bmp、doc等字符概率比较集中的文件，字符概率分布较为均匀的文件（如rar、png、pdf）信源熵更大，因此压缩比更小。

阅读全文

0 0