Huffman编解码完全注释

来源：互联网发布：如何查看本机端口号编辑：程序博客网时间：2024/05/16 02:05
Huffman编解码完全注释

/* *  huffman - Encode/Decode files using Huffman encoding. *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc * *  This library is free software; you can redistribute it and/or *  modify it under the terms of the GNU Lesser General Public *  License as published by the Free Software Foundation; either *  version 2.1 of the License, or (at your option) any later version. * *  This library is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU *  Lesser General Public License for more details. * *  You should have received a copy of the GNU Lesser General Public *  License along with this library; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */ /* * 针对可打印字符的huffman编解码. */ #include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include "huffman.h"  #ifdef WIN32#include <winsock2.h>#include <winsock.h>#include <malloc.h>#define alloca _alloca#else#include <netinet/in.h>#include <sys/socket.h>#include <sys/types.h>#endif   // ---------------------------------------------------------------------------------------/* 描述huffman树的节点 */typedef struct huffman_node_tag{    unsigned char        isLeaf;       // 当前节点是否为叶子节点    unsigned long        count;     // 权(字符出现的次数)    struct huffman_node_tag *parent;   // 当前节点的父亲    union {                                  /*        * 如果改节点为叶子节点, 则有unsigned char symbol字段,        * 如果该节点不是叶子节点, 则具有左孩子右孩子字段.        */       struct {           struct huffman_node_tag *zero;  // 左孩子           struct huffman_node_tag *one;   // 右孩子       };       unsigned char symbol;    // 存储的字符.    };} huffman_node;/* 编码 */typedef struct huffman_code_tag{    /*     * numbits 这个字段是用来记录从leaf--->root一共走了多少步.     * 也就是叶子节点所含字符的编码长度     * bits 是用来存储编码的, 但是它是以Byte为单位的, 而编码是     * 以bit为单位的, 所以需要根据 numbits 去从 bits 里提取出前     * numbits 个bit。 这才是真正的编码.     *     * 比如     * numbits=4     * bits = 0110 1011     * 那么我们应该提取bits的低4位 1011     *     * 注意 numbits 和 bits是有关系的, bits一定不可能超过 numbits/8      */         /* The length of this code in bits. */    unsigned long numbits;/* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */       /* The bits that make up this code. The first       bit is at position 0 in bits[0]. The second       bit is at position 1 in bits[0]. The eighth       bit is at position 7 in bits[0]. The ninth       bit is at position 0 in bits[1]. */    unsigned char *bits;     /* 用来存储编码String */} huffman_code;    intmain(int argc, char** argv){    /*    char       memory = 0;    char       compress = 1;    int        opt;    const char    *file_in = NULL;    const char  *file_out = NULL;    */    //FILE     *in = stdin;    //FILE     *out = stdout;          FILE       *in = NULL;    FILE       *out = NULL;    FILE       *out2 = NULL;       in = fopen("test.txt", "r+");    out = fopen("test1.txt", "w+");    //out2 = fopen("test2.txt", "w+");       int result = 0;    result = huffman_encode_file(in, out);    //result = huffman_decode_file(out, out2);       printf("result : %d/n", result);       system("pause");    return 1;}  // ---------------------------------------------------------------------------------------/** * bit-->byte转换, 不足8位则补齐8位. */static unsigned longnumbytes_from_numbits(unsigned long numbits){    return numbits / 8 + (numbits % 8 ? 1 : 0);} /* * get_bit returns the ith bit in the bits array * in the 0th position of the return value. * 取出第i位(从低位往高位排) */static unsigned charget_bit(unsigned char *bits, unsigned long i){    return ( bits[i/8] >> (i%8) ) & 1;} /** * 反转数组的前((numbits/8)+1)个字符 * */static voidreverse_bits(unsigned char* bits, unsigned long numbits){    unsigned long numbytes = numbytes_from_numbits(numbits);    // 所占字节数.    unsigned char *tmp = (unsigned char*)calloc(numbytes, sizeof(unsigned char));    // 分配内存    unsigned long curbit;           // index -- 当前位    long          curbyte = 0;  // 当前是byte的index          memset(tmp, 0, numbytes);   // 将tmp指向的buffer清零    /*     *     */    for(curbit=0; curbit<numbits; ++curbit) {       unsigned int bitpos = curbit % 8;  // 当前byte里的index       if(curbit>0 && curbit%8==0) {      //           ++curbyte;       }       /*        * 按位 OR        *        */       tmp[curbyte] |= ( get_bit(bits, numbits-curbit-1) << bitpos );    }    memcpy(bits, tmp, numbytes);} /* * new_code builds a huffman_code from a leaf in * a Huffman tree. * 对指定的叶子进行编码. */static huffman_code*new_code(const huffman_node* leaf){    /* Build the huffman code by walking up to     * the root node and then reversing the bits,     * since the Huffman code is calculated by     * walking down the tree. */    unsigned long numbits = 0;    unsigned char *bits = NULL;    huffman_code  *p;       while(leaf!=NULL && leaf->parent!=NULL) {       huffman_node *parent = leaf->parent;   // 当前活动节点的双亲       unsigned long cur_byte   = numbits / 8;    // 当前byte       unsigned char cur_bit    = (unsigned char)(numbits % 8); // 当前byte里的当前bit             /* If we need another byte to hold the code,          then allocate it. */       if(cur_bit == 0) {   // 满了一个byte就需要重新分配内存           size_t newSize = cur_byte + 1;           bits = (char*)realloc(bits, newSize);           bits[newSize - 1] = 0; /* Initialize the new byte. */       }             /*        * ************************************************************************        * 需要注意的是右孩子的处理, 因为左孩子已经是0, 初始分配bits的时候就已经把每位都设置为0了.        * ************************************************************************        */               /* If a one must be added then or it in. If a zero        * must be added then do nothing, since the byte        * was initialized to zero. */       if(leaf == parent->one) {           /*            * 右孩子的话就需要把当前位置为1            */           bits[cur_byte] |= (1<<cur_bit);       }             ++numbits;       leaf = parent;    }       if( bits != 0) {       /*        * 如果编码里头含有1, 则需要进行反转, 如果全为0, 反转后和反转前是一样的, 就没必要反转了        */       reverse_bits(bits, numbits);    }       p = (huffman_code*)malloc(sizeof(huffman_code));    p->numbits    = numbits; /* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */    p->bits    = bits;       /* 用来存储编码的区间 */          return p;} /** * 创建一个孤立的"叶子"节点, 该节点为一个单独的树 * symbol : 该叶子节点的权,即字符 */static huffman_node*new_leaf_node(unsigned char symbol){    huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );       p->isLeaf = 1;    p->symbol = symbol;    p->count = 0;    p->parent = 0;       return p;} /** * 创建节点(该节点不是叶子节点) * * count  : 字符出现的次数 * zero   : 左孩子 * one    : 右兄弟 * * return : 返回一个节点对象 */static huffman_node*new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one){    huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );    p->isLeaf = 0;    p->count = count;    p->zero = zero;    p->one = one;    p->parent = 0;    return p;} /** * 释放树占用的内存空间 */static voidfree_huffman_tree(huffman_node *subtree){    if(subtree == NULL)       return;       if( !(subtree->isLeaf) ) {       /*        * 先序遍历进行递归调用        */       free_huffman_tree( subtree->zero );       free_huffman_tree( subtree->one );    }    free( subtree );}/** * free code */static voidfree_code(huffman_code* p){    free(p->bits);    free(p);}     // ----------------------------------------------------------------------------------------#define MAX_SYMBOLS 256typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];   /*  */typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];       /*  *//*  */static voidfree_encoder(SymbolEncoder *pSE){    unsigned long i;    for(i = 0; i < MAX_SYMBOLS; ++i) {       huffman_code *p = (*pSE)[i];       if( p )    free_code(p);    }}/*  */static voidinit_frequencies(SymbolFrequencies *pSF){    memset(*pSF, 0, sizeof(SymbolFrequencies) );  /* 清零 */}  // ----------------------------------------------------------------------------------------typedef struct buf_cache_tag{    /*     * 该结构主要描述了两个部分, 一个是cache, 一个是bufout.     * cache是一个临时存储数据的buffer, cache会将数据写往bufout区间,     * bufout类似一个仓库, 会一直存储cache写入的数据.     * cache可以多次网bufout内写数据, bufout会一直保存这些数据.     * bufout是一个动态的buffer, cache每一次往bufout内写数据的时候bufout都需要realloc一次.     */    unsigned char *cache;       // 指向真正存储数据的buffer    unsigned int cache_len; // buffer的长度, 初始的时候就可以设置 cache的大小的    unsigned int cache_cur; // 数据结尾处(或者说是光标位置)    unsigned char **pbufout; /*                             * cache要写数据就往这个空间内写(类似一个动态仓库, 一定是动态的)                             * (*pbufout)就是真实的存储区                             */    unsigned int *pbufoutlen;  // 仓库的大小} buf_cache;/* 初始化一个buf_cache */static int init_cache(buf_cache    *pc,                    unsigned int    cache_size,                    unsigned char **pbufout,                    unsigned int    *pbufoutlen){    assert(pc && pbufout && pbufoutlen);    if(!pbufout || !pbufoutlen) return 1;       pc->cache     = (unsigned char*)malloc(cache_size);  // 分配存储空间    pc->cache_len = cache_size; //    pc->cache_cur = 0;       // 光标从0开始    pc->pbufout   = pbufout; //    *pbufout      = NULL;       //    pc->pbufoutlen    = pbufoutlen;    *pbufoutlen   = 0;       //       return (pc->cache==NULL ? 0 : 1);} /* 释放buf_cache */static void free_cache(buf_cache* pc){    assert( pc );    if( pc->cache != NULL)    {       /*        * 我觉得这里没有必要free( pc->cache );, 直接执行pc->cache = NULL;就可以保证        * 逻辑上清cache了, 至于真实的存储区内是否仍然有数据并没有意义.        */       free( pc->cache );       pc->cache = NULL;    }}/* * 将cache内的数据写到pbufout中去, 并清洗cache * 成功则返回0, 失败返回1. */static int flush_cache(buf_cache* pc){    assert( pc );       if(pc->cache_cur > 0)    // 确定cache_cur有平移, 这样才能确定cache内有数据. 才可以flush    {       /* 当前要写的数据长度为pc->cache_cur, 原来的bufout里头本身还有*(pc->pbufoutlen)长度的数据 */       unsigned int newlen = pc->cache_cur + *(pc->pbufoutlen);       /* 需要重新为*(pc->pbufout)分配空间, tmp为这个新buffer的首地址 */       unsigned char*    tmp = realloc(*(pc->pbufout), newlen);             if( !tmp ) return 1;             /* 追加到pbufout结尾处, 而不是覆盖 */       memcpy(tmp + *(pc->pbufoutlen), pc->cache, pc->cache_cur);       *pc->pbufout = tmp;         // pbufout指针重定位到新的扩大了的内存区.         *pc->pbufoutlen = newlen;   // 重新计算pbufoutlen       pc->cache_cur = 0;          // cache逻辑上清零    }       return 0;}/* 写cache */static int write_cache(buf_cache* pc,                     const void *to_write,                     unsigned int to_write_len){    unsigned char* tmp;       assert(pc && to_write);    assert(pc->cache_len >= pc->cache_cur);       /* If trying to write more than the cache will hold     * flush the cache and allocate enough space immediately,     * that is, don't use the cache. */    if(to_write_len > pc->cache_len - pc->cache_cur)    {       /*        * to_write_len : 需要往cache内写的数据长度        * pc->cache_len-pc->cache_cur : cache内剩余空间        * 如果cache存储能力不够则先清洗cache, 再将数据直接写到        * pbufout中去, 不使用cache.        */       unsigned int newlen;             flush_cache( pc );             newlen = *pc->pbufoutlen + to_write_len;       tmp = realloc(*pc->pbufout, newlen);       if( !tmp ) return 1;       memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);       *pc->pbufout = tmp;       *pc->pbufoutlen = newlen;    }    else    {       /*        * Write the data to the cache        * 如果cache存储能力足够,则往cache内追加数据且将当前光标移动到新的数据尾部.        */       memcpy(pc->cache+pc->cache_cur, to_write, to_write_len);       pc->cache_cur += to_write_len;    }       return 0;}  // -------------------------------------------------------------------------------------/* * 计算FILE对象内的各个字符出现的频率 * 扫描FILE对象, */static unsigned intget_symbol_frequencies(SymbolFrequencies *pSF, FILE *in){    int c;    unsigned int total_count = 0;   // FILE对象内的字符总数       init_frequencies( pSF ); /* Set all frequencies to 0. */       /* Count the frequency of each symbol in the input file. */    while( (c=fgetc(in)) != EOF )    {       unsigned char uc = c;        if( !(*pSF)[uc] ) // 如果这个字符没有出现过则为这个字符建立一个叶子       {           /*           * 这里设计的相当有意思,           * fgetc会获得当前光标所在的字符, 这个字符必然是一个char, 也就是一个           * unsinged int型数.           * 我们前面有定义           * #define MAX_SYMBOLS 256           * typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];           * typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];           * 这里我们采用SymbolFrequencies[uc]就存储c这个字符.           * 也就是说           * SymbolFrequencies[0]==0x00           * SymbolFrequencies[1]==0x01           *  ......           * SymbolFrequencies[65]==A  ( A==65 :) )           * SymbolFrequencies[66]==B  ( B==66 :) )           *  ......           * 起初在typedf SymbolFrequencies的时候, 我着实没看懂, 实在太巧妙了.           */           (*pSF)[uc] = new_leaf_node( uc );       }       /*        * 这个自加也非常巧妙, 遇到uc字符则将第uc个huffman_node的count自加        * 实际就是一个字符一个字符读下去,,        * 读到了A就 ++(pSF['A'].count) == ++(pSF[65].count)        * 读到了B就 ++(pSF['B'].count) == ++(pSF[66].count)        * 真TNND的牛X!        */       ++( (*pSF)[uc]->count );       ++total_count;    }     return total_count;}/* 计算buffer内各个字符的频率,和get_symbol_frequencies函数同理 */static unsigned intget_symbol_frequencies_from_memory(SymbolFrequencies    *pSF,                               const unsigned char    *bufin,                               unsigned int       bufinlen){    unsigned int i;    unsigned int total_count = 0;       /* Set all frequencies to 0. */    init_frequencies(pSF);       /* Count the frequency of each symbol in the input file. */    for(i = 0; i < bufinlen; ++i)    {       unsigned char uc = bufin[i];       if( !(*pSF)[uc] )       {           (*pSF)[uc] = new_leaf_node(uc);       }       ++(*pSF)[uc]->count;       ++total_count;    }     return total_count;} /* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. * * 两个huffman_node进行对比, 以count作为比较依据. * 即对比两个不同 symbol 出现的频率 * 非叶子节点通通排在后面 */static intSFComp(const void *p1, const void *p2){    const huffman_node *hn1 = *(const huffman_node**)p1;    const huffman_node *hn2 = *(const huffman_node**)p2;       /* Sort all NULLs to the end. */    if(hn1 == NULL && hn2 == NULL)     return 0;    if(hn1 == NULL)                    return 1;    if(hn2 == NULL)                    return -1;       if(hn1->count > hn2->count)        return 1;    else if(hn1->count < hn2->count)   return -1;       return 0;} /* * build_symbol_encoder builds a SymbolEncoder by walking * down to the leaves of the Huffman tree and then, * for each leaf, determines its code. * * 递归方式为每个叶子节点编码 */static voidbuild_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSE){    if(subtree == NULL)  return;       if( subtree->isLeaf )    {       (*pSE)[subtree->symbol] = new_code( subtree );    }    else    {       build_symbol_encoder(subtree->zero, pSE);       build_symbol_encoder(subtree->one, pSE);    }} /* * calculate_huffman_codes turns pSF into an array * with a single entry that is the root of the * huffman tree. The return value is a SymbolEncoder, * which is an array of huffman codes index by symbol value. * * 为每个node编码. 这个函数比较重要, 精华就是在这个函数里头的for循环. 哈哈 * 整个tree的建立全都依赖这个函数 */static SymbolEncoder*calculate_huffman_codes(SymbolFrequencies * pSF){    unsigned int i = 0;    unsigned int n = 0;    huffman_node *m1  = NULL, *m2 = NULL;    SymbolEncoder *pSE = NULL;       /*     * Sort the symbol frequency array by ascending frequency.     * 快速排序例程进行排序     * 以symbol频率为关键字做升序排列     * 有symbol的节点都会按升序排列, 没有symbol的节点会统一排在后面,     * 通过一个for就能计算出symbol的个数了.     */    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);       /*     * Get the number of symbols.     * 计算huffman树中的字符数, 这个实现可读性不够好     */    for(n = 0; (n<MAX_SYMBOLS) && (*pSF)[n]; ++n)       ;       /*     * Construct a Huffman tree. This code is based     * on the algorithm given in Managing Gigabytes     * by Ian Witten et al, 2nd edition, page 34.     * Note that this implementation uses a simple     * count instead of probability.     */    for(i = 0; i < (n-1); ++i)    {       /* Set m1 and m2 to the two subsets of least probability. */       m1 = (*pSF)[0];       m2 = (*pSF)[1];       /* Replace m1 and m2 with a set {m1, m2} whose probability        * is the sum of that of m1 and m2.        * 这个算法有优化的余地的, 因为n在一直减小.        * 将最小的两个元素合并后得到一个一个节点为m12, 此时m1,m2已经建立起来了关系.        * 这个m12的地址又被pSF[0]存储, 循环直至整个Tree建立成功.        * 指针在这里运用的实在太巧妙了.        * 这一行代码就是建树, 靠，NBA!        */       (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);       (*pSF)[1] = NULL;       /*        * Put newSet into the correct count position in pSF.        * 这里应该可以再进行优化, 是否有必要再进行排序, 或者被排序的数组过长了.        * 实际上每循环一次n都减少了一次        */       qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);    }/* for完毕的时候就求出了root, pSF[0]就是root, 后面的元素都是NULL      * 而树通过for循环里头的      * (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);      * 已经建立完成了*/       /* Build the SymbolEncoder array from the tree. */    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));    memset(pSE, 0, sizeof(SymbolEncoder));    build_symbol_encoder((*pSF)[0], pSE);       return pSE;} /* * Write the huffman code table. The format is: * 4 byte code count in network byte order. * 4 byte number of bytes encoded *   (if you decode the data, you should get this number of bytes) * code1 * ... * codeN, where N is the count read at the begginning of the file. * Each codeI has the following format: * 1 byte symbol, 1 byte code bit length, code bytes. * Each entry has numbytes_from_numbits code bytes. * The last byte of each code may have extra bits, if the number of * bits in the code is not a multiple of 8. * * 编码后的格式 : * 0-3个byte是FILE内出现的不同字符个数(几不同的字符个数) * 4-7个byte是FILE内出现的全部字符个数(所有的字符) * 8-X是真正的编码后值 * */static intwrite_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count){    unsigned long i, count = 0;       /*     * Determine the number of entries in se     * 计算 SymbolEncoder 内具有编码值的元素个数.     * 即有几种字符     */    for(i = 0; i < MAX_SYMBOLS; ++i)       if( (*se)[i] )           ++count;       /*     * Write the number of entries in network byte order.     * 将字符种数写入到文件头部, 即[0, 3]一共4个字节     */    //i = htonl( count );    i = count;    if(fwrite(&i, sizeof(i), 1, out) != 1) return 1;       /*     * Write the number of bytes that will be encoded.     * 将字符个数追加到[4,7]一共4个字节     */    //symbol_count = htonl(symbol_count);    symbol_count = symbol_count;    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)   return 1;       /*     * Write the entries.     */    for(i = 0; i < MAX_SYMBOLS; ++i)    {       huffman_code *p = (*se)[i];       if( p != NULL )       {   /*            * 每个单元分为三个部分 :             * symbol  -- 字符            * numbits  -- 叶子走到root需要的步数            * bits    -- 叶子走到root的方式(即最终的编码, 比如说0101)            */           unsigned int numbytes;           /* Write the 1 byte symbol. */           fputc((unsigned char)i, out);             /* Write the 1 byte code bit length. */           fputc(p->numbits, out);           /* Write the code bytes. 她这个注释就没有说是几byte, 值得思考一下 */           numbytes = numbytes_from_numbits( p->numbits );           /* 将叶子走到root的方式写进去, 这个方式会被整理为byte格式, 不够就补0 */           if(fwrite(p->bits, 1, numbytes, out) != numbytes)    return 1;       }    }     return 0;} /* * Allocates memory and sets *pbufout to point to it. The memory * contains the code table. * * 以指定的格式将编码后的数据写入到cache中去, 实际是写到pbufout中去了. * */static intwrite_code_table_to_memory(buf_cache      *pc,                        SymbolEncoder   *se,                        unsigned int    symbol_count){    unsigned long i, count = 0;       /* Determine the number of entries in se. */    for(i = 0; i < MAX_SYMBOLS; ++i)    {       if((*se)[i])       {           ++count;   // 计算不同字符的个数       }    }     /* Write the number of entries in network byte order. */    //i = htonl(count);    i = count;    if( write_cache(pc, &i, sizeof(i)) )   // 前四个字节是memory内所有字符数       return 1;       /* Write the number of bytes that will be encoded. */    //symbol_count = htonl(symbol_count);    symbol_count = symbol_count;    if( write_cache(pc, &symbol_count, sizeof(symbol_count)) )  // 4-8字节是不同字符个数       return 1;     /* Write the entries. */    for(i = 0; i < MAX_SYMBOLS; ++i)    {       huffman_code *p = (*se)[i];       if( p )       {           /*            * 对于每次循环来说, 如果p不为NULL, 则将该字符对应的编码写入到cache内.            * 存储格式为三个字节作为一个单位.            * byte0 --- 字符本身            * byte1 --- 该字符编码后的码值长度(即2进制的位数)            * byte2 --- 该字符对应的码值            */           unsigned int numbytes;           /*            * The value of i is < MAX_SYMBOLS (256), so it can            * be stored in an unsigned char.            * 将i转换为char型, 可以对应到字符集            */           unsigned char uc = (unsigned char)i;           /*            * Write the 1 byte symbol.            * 将字符写到cache内            */           if(write_cache(pc, &uc, sizeof(uc)))   return 1;           /*            * Write the 1 byte code bit length.            * 将叶子节点到root所需要经过的步数写到cache内, 也就是编码的长度            * 这个数据是为了解码使用的.            */           uc = (unsigned char)p->numbits;           if(write_cache(pc, &uc, sizeof(uc)))   return 1;           /*            * Write the code bytes.            * 将编码值对齐并写如到cache内            * 事先必须知道编码由几位组成, 如果编码为9位, 那么就需要2个byte来存储这个码值            * 如果编码为4位, 那么就需要1个byte来存储了,            */           numbytes = numbytes_from_numbits(p->numbits);           if(write_cache(pc, p->bits, numbytes)) return 1;       }    }     return 0;} /* * read_code_table builds a Huffman tree from the code * in the in file. This function returns NULL on error. * The returned value should be freed with free_huffman_tree. * * */static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){    huffman_node *root = new_nonleaf_node(0, NULL, NULL);    unsigned int count;       /*     * Read the number of entries.     * (it is stored in network byte order).     * 获得字符种数, 前2个byte就是出现的字符种数     */    if( fread(&count, sizeof(count), 1, in) != 1 )    {       free_huffman_tree( root );       return NULL;    }     //count = ntohl(count);    count = count;    /*     * Read the number of data bytes this encoding represents.     * 一个有多少个字符     */    if( fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1 )    {       free_huffman_tree(root);       return NULL;    }       //*pDataBytes = ntohl(*pDataBytes);    *pDataBytes = *pDataBytes;       /* Read the entries. */    while(count-- > 0)    {       int           c;       unsigned int curbit;       unsigned char symbol;       unsigned char numbits;       unsigned char numbytes;       unsigned char *bytes;       huffman_node *p = root;             if( (c=fgetc(in)) == EOF )       {           free_huffman_tree( root );           return NULL;       }       symbol = (unsigned char)c;             if( (c=fgetc(in)) == EOF )       {           free_huffman_tree( root );           return NULL;       }             numbits    = (unsigned char)c;       numbytes   = (unsigned char)numbytes_from_numbits( numbits );       bytes      = (unsigned char*)malloc( numbytes );       if( fread(bytes, 1, numbytes, in) != numbytes )       {           free(bytes);           free_huffman_tree(root);           return NULL;       }        /*        * Add the entry to the Huffman tree. The value        * of the current bit is used switch between        * zero and one child nodes in the tree. New nodes        * are added as needed in the tree.        */       for(curbit = 0; curbit < numbits; ++curbit)       {           if(get_bit(bytes, curbit))           {              if(p->one == NULL)              {                  p->one =                     curbit == (unsigned char)(numbits-1) ?                     new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);                  p->one->parent = p;              }              p = p->one;           }           else           {              if(p->zero == NULL)              {                  p->zero =                     curbit == (unsigned char)(numbits - 1) ?                     new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);                  p->zero->parent = p;              }              p = p->zero;           }       }             free(bytes);    }       return root;}/* * 将数据从buf读到bufout中, 成功则返回0, 其他则返回1. * pindex  -- 拷贝的起点 */static intmemread(const unsigned char*    buf,       unsigned int         buflen,       unsigned int         *pindex,       void*                bufout,       unsigned int         readlen){    assert(buf && pindex && bufout);    assert(buflen >= *pindex);       // 错误    if(buflen < *pindex)        return 1;    if(readlen + *pindex >= buflen) return 1;       memcpy(bufout, buf + *pindex, readlen);    *pindex += readlen;       return 0;}/* * 从编码后的buf内读数据. */static huffman_node*read_code_table_from_memory(const unsigned char* bufin,                         unsigned int         bufinlen,                         unsigned int         *pindex,                         unsigned int         *pDataBytes){    huffman_node *root = new_nonleaf_node(0, NULL, NULL);    unsigned int count;       /*     * Read the number of entries.     * (it is stored in network byte order).     * 读取     */    if( memread(bufin, bufinlen, pindex, &count, sizeof(count)) )    {       free_huffman_tree(root);       return NULL;    }     //count = ntohl(count);    count = count;       /* Read the number of data bytes this encoding represents. */    if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))    {       free_huffman_tree(root);       return NULL;    }       //*pDataBytes = ntohl(*pDataBytes);    *pDataBytes = *pDataBytes;       /* Read the entries. */    while( (count--) > 0 )    {       unsigned int curbit;       unsigned char symbol;       unsigned char numbits;       unsigned char numbytes;       unsigned char *bytes;       huffman_node *p = root;        if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))       {           free_huffman_tree(root);           return NULL;       }        if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))       {           free_huffman_tree(root);           return NULL;       }             numbytes = (unsigned char)numbytes_from_numbits(numbits);       bytes = (unsigned char*)malloc(numbytes);       if(memread(bufin, bufinlen, pindex, bytes, numbytes))       {           free(bytes);           free_huffman_tree(root);           return NULL;       }        /*        * Add the entry to the Huffman tree. The value        * of the current bit is used switch between        * zero and one child nodes in the tree. New nodes        * are added as needed in the tree.        */       for(curbit = 0; curbit < numbits; ++curbit)       {           if(get_bit(bytes, curbit))           {              if(p->one == NULL)              {                  p->one = ( curbit==(unsigned char)(numbits - 1) ) ?                  new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);                  p->one->parent = p;              }              p = p->one;           }           else           {              if(p->zero == NULL)              {                  p->zero = curbit == (unsigned char)(numbits - 1)                     ? new_leaf_node(symbol)                     : new_nonleaf_node(0, NULL, NULL);                  p->zero->parent = p;              }              p = p->zero;           }       }             free(bytes);    }     return root;}/* * 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作 * 也就是不将编码强制为byte类型了, 而是直接写入到out中. */static intdo_file_encode(FILE* in, FILE* out, SymbolEncoder *se){    unsigned char curbyte = 0;    unsigned char curbit  = 0;    int c;       while( (c = fgetc(in)) != EOF)    {       //printf("c=%c   /n", c);       unsigned char uc     = (unsigned char)c;       huffman_code *code = (*se)[uc];       unsigned long i;             for(i = 0; i < code->numbits; ++i)       {           //printf("i=%d   /n", i);           /* Add the current bit to curbyte. */           curbyte |= get_bit(code->bits, i) << curbit;           /* If this byte is filled up then write it            * out and reset the curbit and curbyte. */           if(++curbit == 8)           {              /*               * 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作               * 也就是不将编码强制为byte类型了, 而是直接写入到out中.               */              fputc(curbyte, out);              printf("curbyte=%d/n", curbyte);                        // printf("curbyte=%d   /n", curbyte);              curbyte = 0;              curbit = 0;           }       }       printf("/n");    }     /*     * If there is data in curbyte that has not been     * output yet, which means that the last encoded     * character did not fall on a byte boundary,     * then output it.     */    if(curbit > 0)       fputc(curbyte, out);       return 0;}/* * 对memory进行编码. * * pc      -- cache * bufin   -- 待编码的buffer * bufinlen   -- buffer长, 即字符个数 * se      -- */static intdo_memory_encode(buf_cache             *pc,               const unsigned char *bufin,               unsigned int         bufinlen,               SymbolEncoder           *se){    unsigned char curbyte    = 0;       //    unsigned char curbit = 0;    unsigned int i;       /* 对 bufin 内的字符依次循环 */    for(i = 0; i < bufinlen; ++i)    {       unsigned char uc = bufin[i];       huffman_code *code = (*se)[uc]; // 取出第i个字符的编码       unsigned long i;                        /* 对第i个字符编码长度进行循环 */       for(i = 0; i < code->numbits; ++i)       {           /*            * Add the current bit to curbyte.            * 依次取出            */           curbyte |= ( get_bit(code->bits, i) << curbit );                     /*            * If this byte is filled up then write it            * out and reset the curbit and curbyte            */           if(++curbit == 8)           {              /*               * 满了一个字节则写cache               *               */              if(write_cache(pc, &curbyte, sizeof(curbyte)))   return 1;              curbyte = 0;              curbit  = 0;           }       }    }     /*     * If there is data in curbyte that has not been     * output yet, which means that the last encoded     * character did not fall on a byte boundary,     * then output it.     */    return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;} /* * huffman_encode_file huffman encodes in to out. * 对FILE对象进行编码, 将*in编码后写入*out. */inthuffman_encode_file(FILE *in, FILE *out){    SymbolFrequencies sf;    SymbolEncoder *se;    huffman_node *root = NULL;    int rc;    unsigned int symbol_count;       /*     * Get the frequency of each symbol in the input file.     * 将FILE中的数据写入到SymbolFrequencies中, 计算出了各个字符出现频率,     * 也计算出了FILE对象内的总字符数.     */    symbol_count = get_symbol_frequencies(&sf, in);          /*     * Build an optimal table from the symbolCount.     * 建树完毕, root就是sf[0], sf[1]=sf[2]=...sf[MAX_SYMBOLS]=NULL了     */    se = calculate_huffman_codes( &sf );    root = sf[0];       /*     * Scan the file again and, using the table     * previously built, encode it into the output file.     */    rewind( in ); // 将文件指针重新指向一个流的开头    /*     * 将编码写如文件流     */    rc = write_code_table(out, se, symbol_count);    if(rc == 0)       rc = do_file_encode(in, out, se);       /* Free the Huffman tree. */    free_huffman_tree( root );    free_encoder(se);       return rc;} /** * 对FILE进行解码 */inthuffman_decode_file(FILE *in, FILE *out){    huffman_node *root;    huffman_node *p;    int           c;    unsigned int data_count;       /* Read the Huffman code table. */    root = read_code_table(in, &data_count);    if( !root )   return 1;     /* Decode the file. */    p = root;    while(data_count>0 && (c=fgetc(in))!=EOF)    {       unsigned char byte = (unsigned char)c;       unsigned char mask = 1;       while(data_count > 0 && mask)       {           p = ( (byte&mask)? p->one : p->zero );           mask <<= 1;                     if( p->isLeaf )           {              fputc(p->symbol, out);              p = root;              --data_count;           }       }    }       free_huffman_tree( root );       return 0;}  // --------------------------------------------------------------------------------------#define CACHE_SIZE 1024     /*                          * 对memory编码需要考虑到数据源的速度和bufout的接收速度不匹配, cache                          * cache是为了连接不同速度的设备而设计的                          *//** * 对buffer进行编码 * */int huffman_encode_memory(const unsigned char    *bufin,                       unsigned int           bufinlen,                       unsigned char       **pbufout,                       unsigned int           *pbufoutlen){    SymbolFrequencies    sf;    SymbolEncoder     *se;       huffman_node *root = NULL;    int rc;    unsigned int symbol_count;  // memory中的字符个数       buf_cache cache;         //       /* Ensure the arguments are valid. 检测参数合法性 */    if(!pbufout || !pbufoutlen) return 1;    if( init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen) )   return 1;       /*     * Get the frequency of each symbol in the input memory     * 计算bufin内各个字符出现的频率, 并求得bufin内存储的字符个数.     */    symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);       /*     * Build an optimal table from the symbolCount.     * 为每个Node编码, 如果这个Node的symbol为NULL, 则不编码了.     */    se = calculate_huffman_codes( &sf );    root = sf[0]; // root来拉, 哈哈, 逻辑树出来了.       /*     * Scan the memory again and, using the table     * previously built, encode it into the output memory.     * 将se内的数据统统的写入到cache中克.     */    rc = write_code_table_to_memory(&cache, se, symbol_count);    if(rc == 0)    {       /*        * 为什么write_code_table_to_memory成功之后还要要执行一次do_memory_encode?        *        */       rc = do_memory_encode(&cache, bufin, bufinlen, se);    }       /* Flush the cache. */    flush_cache( &cache );       /* Free the Huffman tree. */    free_huffman_tree( root );    free_encoder( se );    free_cache( &cache );       return rc;} /** * 对bufin进行解码. 将解码后的数据写入到bufout中. */int huffman_decode_memory(const unsigned char    *bufin,                       unsigned int           bufinlen,                       unsigned char       **pbufout,                       unsigned int           *pbufoutlen){    huffman_node *root, *p;    unsigned int data_count;    unsigned int i = 0;    unsigned char *buf;    unsigned int bufcur = 0;     /* Ensure the arguments are valid. */    if(!pbufout || !pbufoutlen) return 1;          /* Read the Huffman code table. */    root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);    if(!root)  return 1;     buf = (unsigned char*)malloc(data_count);     /* Decode the memory. */    p = root;    for(; i < bufinlen && data_count > 0; ++i)    {       unsigned char byte = bufin[i];       unsigned char mask = 1;       while(data_count > 0 && mask)       {           p = byte & mask ? p->one : p->zero;           mask <<= 1;            if(p->isLeaf)           {              buf[bufcur++] = p->symbol;              p = root;              --data_count;           }       }    }     free_huffman_tree(root);    *pbufout = buf;    *pbufoutlen = bufcur;       return 0;} /*-------------------------------------------------------------------------------- 不妨假设待编码的buffer为 "ABCAADC"手工分析可得该树的形状  :当然也可以也可以将这个树沿y方向翻转180度               root                //               /  /              A    *                  //                 /  /                C    *                    //                   /  /                  B   D 现在我们知道的两个事实是 :这个buffer内的字符数为         : symbol_count    = 7  ( "ABCAADC"一个有7个字符 )这个buffer内出现的字符种数为    : count       = 4 ( 只出现了ABCD四种字符 ) 接下来人工分析各个字符 :symbol |   count  |       bits--------|-----------|---------------------  A     |     3     |   0000 0000  B     |     1     |   0000 0110  C     |     2     |   0000 0010  D     |     1     |    0000 0111我们设置左边为0, 右边为1. bits为从叶子节点走到root的路径. 分析完毕后, 需要实现整个编码过程. 编码过程暂时跳过.假设成功编码完毕, 需要把编码后的数据写入到bufout内. bufout内的0-3个byte为字符种数 count4-7个byte为字符个数 symbol_count  然后是遍历SymbolEncoder, 依次对每种字符进行编码(我们这个例子只进行4次编码)我们对每种字符都会进行编码, 每个字符编码后的输出不妨称为frame那么这个frame是由三个部分组成的: (这个我们可以肯定一个char肯定是1byte的)symbol (1byte)    -- 字符 (这个我们可以肯定就算这个树根本没有分支, 永远只有左/右孩子, 那也了不起是是256的深度)numbits (1byte)   -- 叶子走到root需要的步数 bits   (1byte)    -- 叶子走到root的方式(即最终的编码, 比如说011)开始我对这个bites到底占了多少个byte很是怀疑, 因为我不知道从叶子走到root到底耗费了几步. 这里需要好好研究一下, 最好和最差情况. 暂时假设是个变化的byte吧.但是有一点bites跟numbits是有关系的, 所以只要知道numbits还是可以知道bites占据了多少byte的, 也知道bits到底是有几位.    byte       content---------------------------------------------    0-3(4)     count    4-7(4)     symbol_count                  A占xa个byte frame_struct    B占xb个byte frame_struct    C占xc个byte frame_struct    D占xd个byte frame_struct          X          X       这个X是do_file_encode函数写到bufout中去的数据, 那么这个数据是什么呢?实际上它是循环的把出现的字符的bits写到bufout中,根据这个数据,解码的时候就可以依次的找到第0,1,2...个位置出现的是什么字符了--------------------------------------------------------------------------------*/