Huffman编码压缩和解压文档,C++实现

来源:互联网 发布:犀牛mac破解版下载 编辑:程序博客网 时间:2024/06/14 19:07

注:本演示代码采用自上而下得到huffman编码(二叉树)
关于huffman树及相关算法这里就略过,这里探讨的是如何进行编码和解压缩。先说一下大致步骤
1.首先,读取文档(txt格式),将其存入string类型的变量pretext里
2.进行词频统计
3.创建Huffman树,并以此得到各字符的二进制编码
4.对pretext进行遍历,通过上面得到编码表,将其转化为二进制字符串code,这个二进制串可能十分长
5.将二进制串进行八位一编码,得到压缩后的compresstext文本
至此,压缩完成
还原步骤如下:
1.将compresstext(去掉首部的补位长度后)逐字符还原为二进制串newcode,并去掉末尾补上的位
2.将二进制串newcode对应如上得到的huffman树,根据二进制串依次遍历该树,得到对应原始字符,这些字符连接起来,即得到原文本newtext(也可对二进制串进行打表,注:打表也需要逐个遍历,查看对应二进制字符是否在表内,经博主测试,大量数据下,根据huffman树解码快一些)

需要注意的问题:
1.将二进制八位一编码,可能最后剩余并非是整八位,因此需要对最后几位进行补位,使其成为整八位,并记录补位长度(补了几位,记录有效位也可),这里代码为了简便,将补位长度直接放在compresstext的首部
2.一些数字和二进制之间的转换可能需要自己写函数实现
3.创建Huffman树时,采用stl的优先队列注意排序写法
4.中文字符为多字节编码(GBK下为2个字节,UTF-8下通常为3个字节),程序按字节统计和编码,处理时把一个汉字看做多个字符即可

#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>

using namespace std;

// Demo of Huffman compression / decompression of a text held in memory.
//
// Pipeline (see main): text -> byte frequencies -> Huffman tree -> per-byte
// bit strings -> one long string of '0'/'1' characters -> packed bytes, and
// back again. The '0'/'1' string is an intentionally naive representation
// used to make each step visible.

// Originally `#define MaxBitL 256;` (the stray ';' was pasted into every use).
constexpr int MaxBitL = 256;

// One node of the Huffman tree.
// - leafchar: the byte this node stands for (only meaningful on leaves).
// - bit:      the bit contributed by this node on the path from the root;
//             false for the root and for left children, true for right children.
// - weight:   frequency of the byte (leaf) or sum of both children (inner node).
struct HuffmanTreeNode
{
    unsigned char leafchar;
    bool bit;
    unsigned int weight; // unsigned to match the frequency table and the constructor
    HuffmanTreeNode *left,*right;
    HuffmanTreeNode(unsigned char c,unsigned int w,HuffmanTreeNode *l,HuffmanTreeNode *r)
        :leafchar(c),bit(false),weight(w),left(l),right(r){}
};

// Orders the priority queue as a min-heap on weight (lightest node on top).
struct cmp
{
    bool operator ()(const HuffmanTreeNode *a,const HuffmanTreeNode *b) const
    {
        return a->weight>b->weight;
    }
};

unordered_map<unsigned char,unsigned int> freqTable;   // byte -> number of occurrences
unordered_map<unsigned char,string> dictTable;         // byte -> Huffman bit string
map<string,unsigned char> RedictTable;                 // Huffman bit string -> byte
unordered_map<string,char> codeToCharTable;            // 8-character bit string -> packed byte (used when compressing)
unordered_map<char,string> charToCodeTable;            // packed byte -> 8-character bit string (used when decompressing)
priority_queue<HuffmanTreeNode*,vector<HuffmanTreeNode*>,cmp> pq; // work queue used while building the tree

// Adds the byte counts of `text` to the global freqTable.
// The table is not cleared first, so repeated calls accumulate.
void ComputeFreqTable(const string &text)
{
    for(unsigned char c:text)
        freqTable[c]++;
    cout<<"ComputeFreqTable Finished"<<endl;
}

// Builds the Huffman tree from freqTable and returns its root
// (nullptr when freqTable is empty). The nodes are heap-allocated; release
// them with FreeHuffmanTree. The global queue `pq` is empty again on return.
HuffmanTreeNode * CreatHuffmanTree()
{
    for(const auto &entry:freqTable)
        pq.push(new HuffmanTreeNode(entry.first,entry.second,nullptr,nullptr));

    HuffmanTreeNode *root=nullptr;
    while(!pq.empty())
    {
        HuffmanTreeNode *first=pq.top();
        pq.pop();
        if(pq.empty())
        {
            // Only one node left: it is the root of the finished tree.
            root=first;
            break;
        }
        HuffmanTreeNode *second=pq.top();
        pq.pop();
        // The min-heap already guarantees first->weight <= second->weight,
        // so the lighter node becomes the left child.
        HuffmanTreeNode *parent=new HuffmanTreeNode('\0',first->weight+second->weight,first,second);
        second->bit=true; // right child contributes a '1'
        pq.push(parent);
    }
    cout<<"CreatHuffmanTree Finished"<<endl;
    return root;
}

// Releases every node of a tree returned by CreatHuffmanTree. Accepts nullptr.
void FreeHuffmanTree(HuffmanTreeNode *r)
{
    if(!r) return;
    FreeHuffmanTree(r->left);
    FreeHuffmanTree(r->right);
    delete r;
}

// Depth-first walk that records the bit string of every leaf in dictTable
// and RedictTable. `bin` holds the bits of the path walked so far.
// Note: the root's own bit ('0') is pushed as well, so every code starts
// with a leading '0'. The decoders below rely on this layout; it also gives
// a non-empty code when the text contains a single distinct byte.
void dictHelp(HuffmanTreeNode *r,string &bin)
{
    if(!r) return;
    bin.push_back(r->bit?'1':'0');
    if(!r->left&&!r->right)
    {
        dictTable[r->leafchar]=bin;
        RedictTable[bin]=r->leafchar;
    }
    dictHelp(r->left,bin);
    dictHelp(r->right,bin);
    bin.pop_back();
}

// Fills dictTable / RedictTable from the tree rooted at `r`.
void ComputeDictTable(HuffmanTreeNode *r)
{
    string bin;
    dictHelp(r,bin);
    cout<<"ComputeDictTable Finished"<<endl;
}

// Returns the concatenated Huffman bit strings of all bytes of `Text`.
// Bytes without an entry in dictTable contribute nothing.
string enCode(const string &Text)
{
    string code;
    for(const char ch:Text)
    {
        // find() instead of operator[] so unknown bytes do not insert empty entries.
        const auto it=dictTable.find(static_cast<unsigned char>(ch));
        if(it!=dictTable.end()) code+=it->second;
    }
    cout<<"encode Finished"<<endl;
    return code;
}

// Returns the binary digits of `n`, least significant digit first, padded
// with trailing '0' characters up to `bitnum` characters.
// `n` is treated as an unsigned bit pattern. If more than `bitnum` digits
// are needed, all digits are returned unpadded (the original computed a
// wrapped-around padding length in that case).
string ConvertDecToBinStr(int n,int bitnum)
{
    string bin;
    for(unsigned int v=static_cast<unsigned int>(n);v!=0;v/=2)
        bin.push_back(static_cast<char>('0'+v%2));
    if(bitnum>0&&bin.size()<static_cast<size_t>(bitnum))
        bin.append(static_cast<size_t>(bitnum)-bin.size(),'0');
    return bin;
}

// Builds the two lookup tables between the 256 byte values and their
// 8-character bit strings (least significant bit first).
void CreatcodeToCharTable()
{
    for(int i=0;i<256;++i)
    {
        const string bits=ConvertDecToBinStr(i,8);
        const char byte=static_cast<char>(static_cast<unsigned char>(i));
        charToCodeTable[byte]=bits;
        codeToCharTable[bits]=byte;
    }
    cout<<"CreatcodeToCharTable Finished"<<endl;
}

// Packs the '0'/'1' string `code` into bytes, 8 characters per byte.
// Layout of the result: one header character '0'..'7' giving the number of
// '0' padding bits appended to the last group, followed by the packed bytes.
// Requires CreatcodeToCharTable() to have been called.
string compressCode(const string &code)
{
    string compressText;
    compressText.reserve(code.size()/8+2);
    compressText.push_back('0'); // header placeholder, patched below
    int len=0;                   // number of padding bits in the last byte
    cout<<"try! : ";
    // Walk the input by index. The original re-copied the whole remaining
    // string with substr() after every 8 bits, which made this step quadratic.
    for(size_t pos=0;pos<code.size();pos+=8)
    {
        string chunk=code.substr(pos,8);
        if(chunk.size()<8)
        {
            len=static_cast<int>(8-chunk.size());
            chunk.append(static_cast<size_t>(len),'0');
        }
        compressText+=codeToCharTable[chunk];
    }
    compressText[0]=static_cast<char>('0'+len);
    cout<<"compressCode Finished"<<endl;
    return compressText;
}

// Inverse of compressCode: expands every packed byte back to 8 characters
// and removes the padding announced by the header character.
// Returns an empty string for empty input (the original threw
// std::out_of_range there). Requires CreatcodeToCharTable() to have been called.
string decompressCode(const string &compressText)
{
    string newcode;
    if(!compressText.empty())
    {
        newcode.reserve((compressText.size()-1)*8);
        for(size_t k=1;k<compressText.size();++k)
            newcode+=charToCodeTable[compressText[k]];
        const int len=compressText[0]-'0';
        // Strip the padding only when the header is plausible; otherwise keep
        // everything, which is what the original's wrapped substr() returned.
        if(len>0&&static_cast<size_t>(len)<=newcode.size())
            newcode.resize(newcode.size()-static_cast<size_t>(len));
    }
    cout<<"decompressCode Finished"<<endl;
    return newcode;
}

// Table-driven decoder: grows a candidate bit string one character at a time
// and emits a byte whenever the candidate is a key of RedictTable.
// Trailing bits that do not complete a code are dropped.
string ConvertCodeToText(const string &code)
{
    string newtext,bin;
    for(const char bitChar:code)
    {
        bin.push_back(bitChar);
        const auto it=RedictTable.find(bin);
        if(it==RedictTable.end()) continue;
        newtext.push_back(static_cast<char>(it->second));
        bin.clear();
    }
    cout<<"ConvertCodeToText Finished"<<endl;
    return newtext;
}

// Prints `Text` between two separator lines.
void disStr(const string &Text)
{
    cout<<"------------------------"<<endl;
    cout<<Text<<endl;
    cout<<"------------------------"<<endl;
}

// Appends `size` pseudo-random characters with codes 33..125 to `Text`
// and echoes each of them to stdout. Reseeds rand() from the current time.
void WriteTextByRand(string &Text,unsigned int size)
{
    srand(static_cast<unsigned>(time(nullptr)));
    for(unsigned int i=0;i<size;++i)
    {
        const char ch=static_cast<char>(rand()%(126-33)+33);
        cout<<ch;
        Text.push_back(ch);
    }
}

// Loads the whole file "1.txt" into `Text`; leaves `Text` untouched and
// prints a message when the file cannot be opened.
void WriteTextByFile(string &Text)
{
    // Binary mode so the bytes are read exactly as stored (no newline translation).
    ifstream in("1.txt",ios::binary); // input file name
    if(in.is_open())
    {
        stringstream buffer;
        buffer<<in.rdbuf();
        Text=buffer.str();
    }
    else cout<<"can not find this file"<<endl;
    cout<<"WriteTextByFile Finished"<<endl;
}

// Prints the processor time elapsed since `s` in whole milliseconds.
// clock() counts in units of 1/CLOCKS_PER_SEC, which is not milliseconds on
// every platform, so the difference is converted explicitly.
void prtime(clock_t s)
{
    const double ms=1000.0*static_cast<double>(clock()-s)/CLOCKS_PER_SEC;
    cout<<"usetime : "<<static_cast<long long>(ms)<<" ms"<<endl;
}

// Decodes one symbol. On entry `i` indexes the leading root bit of a code in
// `str`; on success `i` is left on the last bit of that code and the leaf's
// byte is returned. Returns '\0' when `r` is null or the input ends in the
// middle of a code (with `i` advanced to str.size()).
// Any character other than '1' is followed as a left branch.
char HelpPlus(const string &str,unsigned int &i,HuffmanTreeNode *r)
{
    const size_t len=str.size();
    while(i<len&&r)
    {
        if(!r->left&&!r->right) return static_cast<char>(r->leafchar);
        if(static_cast<size_t>(i)+1>=len)
        {
            // No bit left to descend with: the code is truncated.
            i=static_cast<unsigned int>(len);
            break;
        }
        const int bit=str[++i]-'0';
        r=(bit==1)?r->right:r->left;
    }
    return '\0';
}

// Tree-driven decoder: walks the tree rooted at `r` along the bits of
// `decompressText` and returns the decoded text.
// Returns an empty string when `r` is null, and stops at (without emitting
// anything for) a trailing incomplete code; the original appended '\0'
// characters in both situations.
string DeCodeToText(const string &decompressText,HuffmanTreeNode *r)
{
    string NewText;
    if(!r) return NewText;
    const size_t len=decompressText.size();
    size_t i=0; // index of the root bit of the code being decoded
    while(i<len)
    {
        const HuffmanTreeNode *node=r;
        while(node&&(node->left||node->right))
        {
            if(++i>=len) return NewText; // input ended inside a code
            node=(decompressText[i]=='1')?node->right:node->left;
        }
        if(!node) return NewText;
        NewText.push_back(static_cast<char>(node->leafchar));
        ++i; // step onto the next code's root bit
    }
    return NewText;
}

// Runs the full round trip on "1.txt", reports timings and the compression
// rate, and writes the packed bytes to "compressText.txt" and the decoded
// text to "newtext.txt". Only the packed bytes are stored, not the tree.
int main()
{
    const clock_t start_time=clock();
    string PreText; // the original text
    //WriteTextByRand(PreText,10000);
    WriteTextByFile(PreText);
    prtime(start_time);
    //disStr(PreText);
    ComputeFreqTable(PreText);prtime(start_time);
    HuffmanTreeNode *huff=CreatHuffmanTree();prtime(start_time);
    ComputeDictTable(huff);prtime(start_time);
    CreatcodeToCharTable();prtime(start_time);
    const string code=enCode(PreText);prtime(start_time);
    //cout<<code<<endl;
    const string compressText=compressCode(code);prtime(start_time);
    //disStr(compressText);
    const string decode=decompressCode(compressText);prtime(start_time);
    //cout<<decode<<endl;
    //string newtext=ConvertCodeToText(decode); // decode through the lookup table
    const string newtext=DeCodeToText(decode,huff); // decode by walking the Huffman tree
    prtime(start_time);
    //disStr(newtext);
    if(code==decode) cout<<"Same Code"<<endl;
    else cout<<"Different Code"<<endl;
    if(PreText==newtext) cout<<"Same Text"<<endl;
    else cout<<"Different Text"<<endl;
    cout<<"PreText Length : "<<PreText.size()<<"   ComText Length : "<<compressText.size()<<endl;
    // The rate is undefined for an empty input (division by zero), so skip it.
    if(!PreText.empty())
        cout<<"Rate : "<<100.0*compressText.size()/PreText.size()<<"%"<<endl;

    // Binary mode and write() so the files hold exactly the produced bytes:
    // no newline translation and no extra trailing newline.
    ofstream o("compressText.txt",ios::binary);
    o.write(compressText.data(),static_cast<streamsize>(compressText.size()));
    o.close();
    ofstream out("newtext.txt",ios::binary);
    out.write(newtext.data(),static_cast<streamsize>(newtext.size()));
    out.close();

    FreeHuffmanTree(huff);

    const clock_t end_time=clock();
    cout<<"rt : "<<(end_time-start_time)*1.0/CLOCKS_PER_SEC<<" s"<<endl;
    return 0;
}

运行结果:
1.对随机字符:随机生成10000个字符,进行huffman编码,压缩率仅仅为95%左右

ComputeFreqTable Finished
usetime : 488 ms
CreatHuffmanTree Finished
usetime : 493 ms
ComputeDictTable Finished
usetime : 497 ms
CreatcodeToCharTable Finished
usetime : 504 ms
encode Finished
usetime : 508 ms
try! : compressCode Finished
usetime : 573 ms
decompressCode Finished
usetime : 577 ms
usetime : 581 ms
Same Code
Same Text
PreText Length : 10000   ComText Length : 9481
Rate : 94.81%
rt : 0.591 s

2.对英语文章:文章选自china daily,题材来源不同,共计12199字,多次测试,压缩率稳定在70%左右

WriteTextByFile Finished
usetime : 1 ms
ComputeFreqTable Finished
usetime : 3 ms
CreatHuffmanTree Finished
usetime : 3 ms
ComputeDictTable Finished
usetime : 3 ms
CreatcodeToCharTable Finished
usetime : 4 ms
encode Finished
usetime : 7 ms
try! : compressCode Finished
usetime : 61 ms
decompressCode Finished
usetime : 64 ms
usetime : 66 ms
Same Code
Same Text
PreText Length : 12199   ComText Length : 8489
Rate : 69.5877%
rt : 0.068 s

3.中文字符:文章选自《福尔摩斯探案集》东方探案部分,文字总计437700字,压缩率89%左右,花费时间较长,为342.452 s

WriteTextByFile Finished
usetime : 4 ms
ComputeFreqTable Finished
usetime : 59 ms
CreatHuffmanTree Finished
usetime : 60 ms
ComputeDictTable Finished
usetime : 60 ms
CreatcodeToCharTable Finished
usetime : 61 ms
encode Finished
usetime : 168 ms
try! : compressCode Finished
usetime : 342272 ms
decompressCode Finished
usetime : 342356 ms
usetime : 342420 ms
Same Code
Same Text
PreText Length : 437700   ComText Length : 389238
Rate : 88.928%
rt : 342.452 s

4.时间对比:将英文累积到432673,花费时间为193.675s,远远小于编码中文时间,压缩率稳定在70%左右

WriteTextByFile Finished
usetime : 7 ms
ComputeFreqTable Finished
usetime : 80 ms
CreatHuffmanTree Finished
usetime : 81 ms
ComputeDictTable Finished
usetime : 81 ms
CreatcodeToCharTable Finished
usetime : 82 ms
encode Finished
usetime : 183 ms
try! : compressCode Finished
usetime : 193529 ms
decompressCode Finished
usetime : 193597 ms
usetime : 193652 ms
Same Code
Same Text
PreText Length : 432673   ComText Length : 300885
Rate : 69.541%
rt : 193.675 s

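总结:Huffman编码对英文支持较好。上述测试中,时间主要花费在compressCode(将二进制字符串每八位转化为一个字符)这一步上:测试所用的实现每处理8位就用substr把剩余的二进制串整体复制一次,总开销为O(n²),所以二进制串越长耗时增长越快(中文文本编码后的二进制串更长,因此比同等长度的英文更慢);改为按下标顺序遍历即可降为O(n)。另外,用'0'/'1'字符串表示二进制仅仅是为了演示这一过程,实际实现中直接按位读写字节即可,不需要这一步转换。因此,huffman编码在通信中还是有较大的应用价值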