Huffman Coding 原理与C/C++代码

来源：互联网发布：淘宝买家情趣内衣秀编辑：程序博客网时间：2024/06/16 22:03

Huffman编码的代码计划一直躺在我的Evernote里面。这几天正好是论文初稿的提交后的空窗期，就花两天把这项todolist干掉。

Huffman Coding 原理

Huffman Coding（霍夫曼编码）是通信专业必学的一个知识点，在研僧期间老蔡《信息论》的课上也是再次强调了数遍。Huffman Coding在数据编码领域里面相当重要，在诸如数据压缩、音频编码、图像编码中都得到了广泛的应用，例如，MPEG1音频标准的LayerIII、H.263视频编码标准中都使用Huffman Coding来进行数据压缩。

Huffman Coding是由Huffman在1951年提出的。当时，Huffman和他在MIT信息论的同学需要选择是完成学期报告还是期末考试。导师Fano给他们的学期报告题目是，查找最有效的二进制编码。由于无法证明哪个已有编码是最有效的，Huffman放弃了对已有编码的研究，转向新的探索，最终发现了基于有序频率二叉树编码的想法，并很快证明了这个方法的有效性。Huffman Coding的具体文献如[1]所示，如果你想要深入研究Huffman Coding，那么最好研读一番。

这个方法完全依据字符出现的概率来构造平均长度最短的码字。具体过程如下：

先对各个字符出现的概率进行统计；
然后按照各个字符出现概率的大小排列，把最小的两个概率相加，作为新的概率和剩余的概率重新排队；
再把最小的两个概率相加，再重新排队，直到最后变成1。每次相加时都把“0”和“1”赋给相加的两个概率，读出时由该符号开始一直到最后的“1”。

Huffman Coding的过程

Pseudo Code

begin
    count frequencies of each single character
    sort them into a non-decreasing sequence
    create a leaf node (character, frequency c, left son = NULL, right son = NULL)
        of the tree for each character and put the nodes into queue F
    while (|F| >= 2) do
    begin
        pop the first two nodes (u1, u2) with the lowest
            frequencies from the sorted queue
        create a node evaluated with the sum of the chosen units,
            successors are the chosen units (eps, c(u1)+c(u2), u1, u2)
        insert the new node into the queue
    end
    label each node with the path from root to leaf node (left son 0, right son 1)
    create output from the coded input characters
end

C Code

#include <stdio.h>
#include <stdlib.h>
#ifdef _WIN32
#include <conio.h> /* only available on DOS/Windows toolchains */
#endif

/* Upper bound on the depth of the Huffman tree, i.e. on the code length. */
#define MAX_TREE_HT 100

/* One node of the Huffman tree. Leaves carry a character; internal nodes
 * carry the placeholder '$' and the summed frequency of their children. */
typedef struct tagNode
{
    char character;
    unsigned frequency;
    struct tagNode *left, *right;
} HNode;

/* Binary min-heap of tree nodes, ordered by frequency. */
typedef struct tagHeap
{
    unsigned size;   /* number of nodes currently stored   */
    unsigned space;  /* number of slots allocated in array */
    HNode **array;
} HHeap;

/* Allocate `bytes` bytes or terminate the program: the demo has no sensible
 * way to continue without memory, so out-of-memory is treated as fatal.
 * NOTE: casts on allocation results are kept throughout so the file builds
 * with either a C or a C++ compiler. */
static void *xmalloc(size_t bytes)
{
    void *p = malloc(bytes ? bytes : 1);
    if (p == NULL)
    {
        fprintf(stderr, "error: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return p;
}

/* Release a whole Huffman tree; accepts NULL. */
static void freeHuffmanTree(HNode *root)
{
    if (root == NULL)
    {
        return;
    }
    freeHuffmanTree(root->left);
    freeHuffmanTree(root->right);
    free(root);
}

/* Create a childless node for `character` with the given frequency.
 * The caller owns the returned node. */
HNode* newNode(char character, unsigned frequency)
{
    HNode *temp = (HNode*)xmalloc(sizeof *temp);
    temp->left = NULL;
    temp->right = NULL;
    temp->character = character;
    temp->frequency = frequency;
    return temp;
}

/* Create an empty heap with room for `space` nodes. */
HHeap* createHHeap(unsigned space)
{
    HHeap *HHeapX = (HHeap*)xmalloc(sizeof *HHeapX);
    HHeapX->size = 0;
    HHeapX->space = space;
    HHeapX->array = (HNode**)xmalloc((size_t)space * sizeof *HHeapX->array);
    return HHeapX;
}

/* Exchange two node pointers. */
void swapHNode(HNode** a, HNode** b)
{
    HNode *t = *a;
    *a = *b;
    *b = t;
}

/* Sift the node at index `idx` down until neither child has a smaller
 * frequency than it. */
void HHeapify(HHeap* HHeapX, int idx)
{
    const int count = (int)HHeapX->size;

    for (;;)
    {
        int smallest = idx;
        int left = 2*idx + 1;
        int right = 2*idx + 2;

        if (left < count &&
            HHeapX->array[left]->frequency < HHeapX->array[smallest]->frequency)
        {
            smallest = left;
        }
        if (right < count &&
            HHeapX->array[right]->frequency < HHeapX->array[smallest]->frequency)
        {
            smallest = right;
        }
        if (smallest == idx)
        {
            return;
        }
        swapHNode(&HHeapX->array[smallest], &HHeapX->array[idx]);
        idx = smallest;
    }
}

/* Return non-zero when exactly one node is left in the heap. */
int isSizeOne(HHeap* HHeapX)
{
    return (HHeapX->size == 1);
}

/* Remove and return the node with the lowest frequency,
 * or NULL when the heap is empty. */
HNode* extractMin(HHeap* HHeapX)
{
    HNode *temp;

    if (HHeapX->size == 0)
    {
        return NULL;
    }
    temp = HHeapX->array[0];
    HHeapX->array[0] = HHeapX->array[HHeapX->size - 1];
    --HHeapX->size;
    HHeapify(HHeapX, 0);
    return temp;
}

/* Insert a node, growing the storage if it is full. */
void insertHHeap(HHeap* HHeapX, HNode* HNodeX)
{
    int i;

    if (HHeapX->size >= HHeapX->space)
    {
        unsigned grown = HHeapX->space ? HHeapX->space * 2 : 1;
        HNode **tmp = (HNode**)realloc(HHeapX->array, (size_t)grown * sizeof *tmp);
        if (tmp == NULL)
        {
            fprintf(stderr, "error: out of memory\n");
            exit(EXIT_FAILURE);
        }
        HHeapX->array = tmp;
        HHeapX->space = grown;
    }

    i = (int)HHeapX->size;
    ++HHeapX->size;
    /* Sift up all the way to the root. The loop must run while i > 0:
     * stopping at i > 1 never compares index 1 with the root and can leave
     * a node smaller than the root sitting at index 1. */
    while ((i > 0) && HNodeX->frequency < HHeapX->array[(i-1)/2]->frequency)
    {
        HHeapX->array[i] = HHeapX->array[(i-1)/2];
        i = (i-1)/2;
    }
    HHeapX->array[i] = HNodeX;
}

/* Turn the (arbitrarily ordered) contents of the heap array into a min-heap. */
void buildHHeap(HHeap* HHeapX)
{
    int i;

    if (HHeapX->size < 2)
    {
        return; /* zero or one element is already a heap */
    }
    /* Start at the parent of the last element and work back to the root. */
    for (i = ((int)HHeapX->size - 2) / 2; i >= 0; --i)
    {
        HHeapify(HHeapX, i);
    }
}

/* Print the first `n` entries of `arr` without separators, then a newline. */
void printArr(int arr[], int n)
{
    int i;
    for (i = 0; i < n; i++)
    {
        printf("%d", arr[i]);
    }
    printf("\n");
}

/* Return non-zero when `root` has no children. */
int isLeaf(HNode* root)
{
    return !(root->left) && !(root->right);
}

/* Build a min-heap holding one leaf per (character[i], frequency[i]) pair. */
HHeap* createAndBuildHHeap(char character[], int frequency[], int size)
{
    int i;
    HHeap *HHeapX = createHHeap(size > 0 ? (unsigned)size : 0u);

    for (i = 0; i < size; ++i)
    {
        HHeapX->array[i] = newNode(character[i], (unsigned)frequency[i]);
    }
    HHeapX->size = size > 0 ? (unsigned)size : 0u;
    buildHHeap(HHeapX);
    return HHeapX;
}

/* Build the Huffman tree for `size` symbols.
 * Returns the root (owned by the caller), or NULL when size <= 0. */
HNode* buildHuffmanTree(char character[], int frequency[], int size)
{
    HNode *l, *r, *top;
    HHeap *heap;

    if (size <= 0)
    {
        return NULL;
    }

    heap = createAndBuildHHeap(character, frequency, size);
    while (!isSizeOne(heap))
    {
        /* Merge the two least frequent nodes under a new internal node. */
        l = extractMin(heap);
        r = extractMin(heap);
        top = newNode('$', l->frequency + r->frequency);
        top->left = l;
        top->right = r;
        insertHHeap(heap, top);
    }
    top = extractMin(heap);

    /* The heap itself is no longer needed; only the tree is handed back. */
    free(heap->array);
    free(heap);
    return top;
}

/* Print "<character>: <code>" for every leaf below `root`.
 * arr[0..top-1] holds the bits of the path taken so far
 * (left edge = 0, right edge = 1). */
void printCodes(HNode* root, int arr[], int top)
{
    if (root->left)
    {
        arr[top] = 0;
        printCodes(root->left, arr, top + 1);
    }
    if (root->right)
    {
        arr[top] = 1;
        printCodes(root->right, arr, top + 1);
    }
    if (isLeaf(root))
    {
        printf("%c: ", root->character);
        printArr(arr, top);
    }
}

/* Build the Huffman tree for the given symbols and print every code.
 * Prints nothing when size <= 0. */
void HuffmanCoding(char character[], int frequency[], int size)
{
    int arr[MAX_TREE_HT];
    HNode *root = buildHuffmanTree(character, frequency, size);

    if (root == NULL)
    {
        return;
    }
    if (isLeaf(root))
    {
        /* A single distinct symbol would otherwise get an empty code;
         * give it the one-bit code "0". */
        arr[0] = 0;
        printf("%c: ", root->character);
        printArr(arr, 1);
    }
    else
    {
        printCodes(root, arr, 0);
    }
    freeHuffmanTree(root);
}

/* Count how often each ASCII character occurs in `s`.
 * character[] and frequency[] must each have room for 128 entries; the
 * distinct characters are written in ascending code order.
 * Bytes outside 0..127 are skipped because the table only covers ASCII.
 * Returns the number of distinct characters found. */
int countStrFreq(const char *s, char character[], int frequency[])
{
    int freq[128] = {0};
    int c = 0;
    int i;

    for (; *s; ++s)
    {
        /* Go through unsigned char: plain char may be negative. */
        unsigned char byte = (unsigned char)*s;
        if (byte < 128)
        {
            freq[byte]++;
        }
    }

    for (i = 0; i < 128; i++)
    {
        if (freq[i] != 0)
        {
            character[c] = (char)i;
            frequency[c] = freq[i];
            c++;
        }
    }
    return c;
}

int main(void)
{
    /* Input string */
    const char *str = "this is an example for huffman encoding";
    /* ASCII has 128 characters, so the tables are sized 128 */
    char cha[128];
    int freq[128] = {0};

    /* Count the frequency of every character in the string */
    int val = countStrFreq(str, cha, freq);

    /* Run the Huffman coding */
    HuffmanCoding(cha, freq, val);

#ifdef _WIN32
    system("pause"); /* keep the console window open on Windows */
#endif
    return 0;
}

C++ Code

在改写C++代码的时候遇到了两个bug。
bug1：
在C++文件流处理中，当利用file.eof()函数来判断文件末尾的时候，会出现文件末尾重复的现象，即原始为abc，会变成abcc。这里的解决方案是在while循环中加入if(file.eof()) break 来提前退出；
bug2：
当文件指针到达eof后，seekg()函数会失效，这个时候需要使用file.clear()函数来恢复流的状态。

/****************************************************************
*   Huffman coding algorithm Version 1.0
*   Author: Sergey Tikhonov
*   Modifier: Jeremy Lin
*   Email: jianmin1990@outlook.com
*   Date: 2015.03.14 pm HQU
*   More detail: http://blog.csdn.net/linj_m
****************************************************************/
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
using namespace std;

// A symbol together with its relative frequency in the input.
struct cNode
{
    char ch;   // character
    float pro; // probability
};

// A node of the Huffman tree. lcode/rcode are the bits ('0' or '1')
// attached to the edges leading to the left and right child.
struct treeNode: public cNode
{
    char lcode;
    char rcode;
    treeNode *left;  // left child
    treeNode *right; // right child
};

// qsort() comparator that orders cNode entries by DECREASING probability.
static int nodeCompare(const void *elem1, const void *elem2)
{
    const cNode *a = (const cNode*)elem1;
    const cNode *b = (const cNode*)elem2;
    if (a->pro < b->pro)
        return 1;
    else if (a->pro > b->pro)
        return -1;
    else
        return 0;
}

// Huffman encoder/decoder working on files.
//
// Encoded file layout (text):
//   <number of symbols>
//   <char>\t<probability>\t<code>      (one line per symbol)
//   <empty line>
//   <stream of '0'/'1' characters>
class HCode
{
private:
    int tsize;                // table size (number of chars)
    vector<cNode> ptable;     // table of probabilities, sorted decreasingly
    map<char, string> codes;  // codeword for each char

public:
    HCode() : tsize(0) {}

    // Encode inputFilepath into outputFilepath and echo the result to stdout.
    // Returns true on success; on failure an error is written to stderr.
    bool enCode(const char* inputFilepath, const char* outputFilepath)
    {
        map<char, int> freqs; // frequency for each char from input text

        // Opening input file
        //
        ifstream inputFile;
        inputFile.open(inputFilepath, ifstream::in);
        if (!inputFile)
        {
            cerr<<"error: unable to open input file: " << inputFilepath <<endl;
            return false;
        }

        // Counting chars
        //
        // bug 1: testing eof() before the read makes the last char appear
        // twice. Testing the result of get() itself stops on end of file
        // AND on read errors, so the loop cannot spin on a failed stream.
        char ch;
        unsigned total = 0;
        while (inputFile.get(ch))
        {
            freqs[ch]++;
            total++;
        }
        if (total == 0)
        {
            // Nothing to build a tree from.
            cerr<<"error: input file is empty: " << inputFilepath <<endl;
            return false;
        }
        tsize = (int)freqs.size();

        // Building decreasing freqs table
        //
        const float ftot = float(total);
        ptable.clear();
        for (map<char, int>::const_iterator fi = freqs.begin(); fi != freqs.end(); ++fi)
        {
            cNode entry;
            entry.ch = fi->first;
            entry.pro = float(fi->second)/ftot;
            ptable.push_back(entry);
        }
        qsort(&ptable[0], ptable.size(), sizeof(cNode), nodeCompare);

        // Encoding
        //
        codes.clear();
        EncHuffman();

        // Opening output file
        //
        ofstream outputFile;
        outputFile.open(outputFilepath, ofstream::out);
        if (!outputFile)
        {
            cerr<<"error: unable to open output file: " << outputFilepath <<endl;
            codes.clear();
            ptable.clear();
            return false;
        }

        // Outputing ptable and codes
        //
        std::cout<<endl<<tsize<<endl;
        outputFile<<tsize<<endl;
        for (int i = 0; i < tsize; i++)
        {
            const string &code = codes[ptable[i].ch];
            std::cout <<ptable[i].ch<<"\t"<<ptable[i].pro<<"\t"<<code<<endl;
            outputFile<<ptable[i].ch<<"\t"<<ptable[i].pro<<"\t"<<code<<endl;
        }

        // Outputing encoded text
        //
        // bug 2: once eofbit is set, seekg() no longer works, so the
        // stream state has to be reset with clear() before rewinding.
        inputFile.clear();
        inputFile.seekg(0, ios::beg);
        std::cout<<endl;
        outputFile<<endl;
        while (inputFile.get(ch))
        {
            const string &code = codes[ch];
            std::cout<<code;
            outputFile<<code;
        }
        std::cout<<endl;

        // Cleaning
        //
        codes.clear();
        ptable.clear();

        // Closing files
        //
        outputFile.flush();
        const bool written = !outputFile.fail();
        if (!written)
        {
            cerr<<"error: unable to write output file: " << outputFilepath <<endl;
        }
        outputFile.close();
        inputFile.close();
        return written;
    }

    // Decode a file produced by enCode() into outputFilename and echo the
    // decoded text to stdout.
    // Returns true on success; on failure an error is written to stderr.
    bool Decode(const char* inputFilename, const char* outputFilename)
    {
        // Opening input file
        ifstream inputFile;
        inputFile.open(inputFilename);
        if (!inputFile)
        {
            cerr<<"error: unable to open input file: " << inputFilename <<endl;
            return false;
        }

        // Loading codes
        //
        int entries = 0;
        if (!(inputFile >> entries) || entries <= 0)
        {
            cerr<<"error: invalid code table in input file: " << inputFilename <<endl;
            return false;
        }
        tsize = entries;
        inputFile.get(); // newline after the table size

        map<string, char> symbols; // codeword -> char, for direct lookup
        codes.clear();
        for (int i = 0; i < tsize; i++)
        {
            char ch;
            float p;
            string code; // std::string: a char[128] could be overrun by >>
            // The symbol is read raw because it may itself be whitespace.
            if (!inputFile.get(ch) || !(inputFile >> p >> code))
            {
                cerr<<"error: invalid code table in input file: " << inputFilename <<endl;
                codes.clear();
                return false;
            }
            codes[ch] = code;
            symbols[code] = ch;
            inputFile.get(); // newline ending the table line
        }
        inputFile.get(); // empty line separating table and bit stream

        // Opening output file
        //
        ofstream outputFile;
        outputFile.open(outputFilename);
        if (!outputFile)
        {
            cerr<<"error: unable to open output file: "<<outputFilename<<endl;
            codes.clear();
            return false;
        }

        // Decoding and outputing to file
        //
        // Huffman codes are prefix-free, so the first time the accumulated
        // bits equal a codeword, that codeword is the only possible match.
        string accum;
        char bit;
        while (inputFile.get(bit))
        {
            if (bit != '0' && bit != '1')
                continue; // e.g. a trailing newline; not part of the stream
            accum += bit;
            map<string, char>::const_iterator hit = symbols.find(accum);
            if (hit != symbols.end())
            {
                std::cout<<hit->second;
                outputFile<<hit->second;
                accum.clear();
            }
        }
        std::cout<<endl;

        // Cleaning
        //
        codes.clear();
        outputFile.flush();
        const bool written = !outputFile.fail();
        if (!written)
        {
            cerr<<"error: unable to write output file: "<<outputFilename<<endl;
        }
        outputFile.close();
        inputFile.close();
        return written;
    }

private:
    // Build the Huffman tree from ptable (which must be sorted by
    // decreasing probability) and fill `codes`.
    void EncHuffman()
    {
        if (ptable.empty())
            return;

        // Creating leaves (initial top-nodes)
        //
        vector<treeNode*> tops; // top-nodes, kept in decreasing order
        for (size_t i = 0; i < ptable.size(); i++)
        {
            treeNode *leaf = new treeNode;
            leaf->ch = ptable[i].ch;
            leaf->pro = ptable[i].pro;
            leaf->lcode = '\0';
            leaf->rcode = '\0';
            leaf->left = NULL;
            leaf->right = NULL;
            tops.push_back(leaf);
        }

        // Building binary tree.
        // Combining last two nodes, replacing them by new node
        // without invalidating sort
        //
        while (tops.size() > 1)
        {
            treeNode *n = new treeNode;
            n->ch = '\0';
            n->left = tops[tops.size() - 2];
            n->right = tops[tops.size() - 1];
            n->pro = n->left->pro + n->right->pro;
            if (n->left->pro < n->right->pro)
            {
                n->lcode = '0';
                n->rcode = '1';
            }
            else
            {
                n->lcode = '1';
                n->rcode = '0';
            }
            tops.pop_back();
            tops.pop_back();

            // Insert in front of the first node with a smaller probability
            // (or at the end if there is none).
            vector<treeNode*>::iterator ti = tops.begin();
            while (ti != tops.end() && !((*ti)->pro < n->pro))
                ++ti;
            tops.insert(ti, n);
        }

        // Building codes
        //
        treeNode *root = tops[0];
        if (!root->left && !root->right)
        {
            // Only one distinct symbol: the tree has no edges, so give the
            // symbol the one-bit code "0" instead of an empty code that
            // could not be decoded.
            codes[root->ch] = "0";
        }
        else
        {
            GenerateCode(root);
        }

        // Cleaning
        //
        DestroyNode(root);
    }

    // Store the codeword of every leaf below `node`.
    // `prefix` holds the bits on the path from the root to `node`;
    // outside callers pass only the root.
    void GenerateCode(treeNode *node, const string &prefix = "")
    {
        if (!node->left && !node->right)
        {
            codes[node->ch] = prefix;
            return;
        }
        if (node->left)
            GenerateCode(node->left, prefix + node->lcode);
        if (node->right)
            GenerateCode(node->right, prefix + node->rcode);
    }

    // Delete `node` and everything below it; accepts NULL.
    void DestroyNode(treeNode *node)
    {
        if (!node)
            return;
        DestroyNode(node->left);
        DestroyNode(node->right);
        delete node;
    }
};

// Print the usage text and terminate the program; this never returns.
int show_usage()
{
    cout<<"Huffman Coding Algorithm Version 1.0"<<endl;
    cout<<"  Modifier:Jeremy Lin 2015-03-14 @HQU"<<endl;
    cout<<"  Email:jianmin1990@outlook.com"<<endl;
    cout<<endl;
    cout<<"Usage:"<<endl;
    cout<<" huffman [OPTIONS] input [output]"<<endl;
    cout<<" The defaul action is to encode the input file."<<endl;
    cout<<" -d\tDecode file."<<endl;
    cout<<endl;
    cout<<"Examples:"<<endl;
    cout<<" huffman input.txt"<<endl;
    cout<<" huffman input.txt encoded.txt"<<endl;
    cout<<" huffman -d encoded.txt"<<endl;
    exit(0);
}

int main(int argc, char **argv)
{
    int i = 1;
    bool decFlag = false;   // decode flag

    if (argc < 2)
    {
        show_usage();
    }
    if (strcmp(argv[i],"-d") == 0)
    {
        decFlag = true;
        ++i;
        if (i == argc)
        {
            show_usage();
        }
    }

    // std::string instead of fixed char[128] buffers: paths given on the
    // command line can be longer than 127 characters.
    const string inputFilename = argv[i];
    ++i;
    string outputFilename;
    if (i < argc)
    {
        outputFilename = argv[i];
    }
    else
    {
        outputFilename = decFlag ? "decoded.txt" : "encoded.txt";
    }

    // Calling encoding or decoding subroutine
    //
    HCode coder;
    const bool ok = decFlag
        ? coder.Decode(inputFilename.c_str(), outputFilename.c_str())
        : coder.enCode(inputFilename.c_str(), outputFilename.c_str());
    return ok ? 0 : EXIT_FAILURE;
}

这里写图片描述

本文地址：http://blog.csdn.net/linj_m/article/details/44241543
更多资源请关注博客： LinJM-机器视觉微博：林建民-机器视觉

[1] Huffman, D.A., A method for the construction of minimum redundancy codes. Proceedings of the IRE, 1952. 40(9): p. 1098-1101.
[2] http://scanftree.com/Data_Structure/huffman-code

0 0