Huffman编码压缩和解压文档,C++实现
来源:互联网 发布:犀牛mac破解版下载 编辑:程序博客网 时间:2024/06/14 19:07
注:本演示代码采用自上而下得到huffman编码(二叉树)
关于huffman树及相关算法这里就略过,这里探讨的是如何进行编码和解压缩。先说一下大致步骤
1.首先,读取文档(txt格式),将其存入string类型的变量pretext里
2.进行词频统计
3.创建Huffman树,并以此得到各字符的二进制编码
4.对pretext进行遍历,通过上面得到编码表,将其转化为二进制字符串code,这个二进制串可能十分长
5.将二进制串进行八位一编码,得到压缩后的compresstext文本
至此,压缩完成
还原步骤如下:
1.将压缩文本compresstext还原为二进制串newcode(先读取首部记录的补位长度,还原后去掉末尾的补位)
2.将二进制串newcode对应如上得到的huffman树,根据二进制串依次遍历该树,得到对应原始字符,这些字符连接起来,即得到原文本newtext(也可对二进制串进行打表,注:打表也需要逐个遍历,查看对应二进制字符是否在表内,经博主测试,大量数据下,根据huffman树解码快一些)
需要注意的问题:
1.将二进制八位一编码,可能最后剩余并非是整八位,因此需要对最后几位进行补位,使其成为整八位,并记录补位长度(补了几位,记录有效位也可),这里代码为了简便,将补位长度直接放在compresstext的首部
2.一些数字和二进制之间的转换可能需要自己写函数实现
3.创建Huffman树时,采用stl的优先队列注意排序写法
4.中文字符占多个字节(GBK编码下为两个字节,UTF-8编码下通常为三个字节),处理时按字节逐个看作字符即可
#include <iostream>#include <vector>#include <queue>#include <string>#include <map>#include <stack>#include <unordered_map>#include <sstream>#include <algorithm>#include <ctime>#include <fstream>using namespace std;#define MaxBitL 256;struct HuffmanTreeNode{ unsigned char leafchar; bool bit; int weight; HuffmanTreeNode *left,*right; HuffmanTreeNode(unsigned char c,unsigned int w,HuffmanTreeNode *l,HuffmanTreeNode *r):leafchar(c),bit(0),weight(w),left(l),right(r){}};struct cmp{ bool operator ()(HuffmanTreeNode* &a,HuffmanTreeNode* &b) const { return a->weight>b->weight; }};unordered_map<unsigned char,unsigned int> freqTable;//字符频率统计表unordered_map<unsigned char,string> dictTable;//字符到二进制序列的映射map<string,unsigned char> RedictTable;//二进制序列到字符的映射unordered_map<string,char> codeToCharTable;//二进制到字符(压缩)unordered_map<char,string> charToCodeTable;//二进制到字符(解压)priority_queue<HuffmanTreeNode*,vector<HuffmanTreeNode*>,cmp> pq;//优先队列void ComputeFreqTable(const string &text){ for(unsigned char c:text) freqTable[c]++; cout<<"ComputeFreqTable Finished"<<endl;}HuffmanTreeNode * CreatHuffmanTree(){ HuffmanTreeNode *root=NULL; for(auto &x:freqTable) { HuffmanTreeNode *node=new HuffmanTreeNode(x.first,x.second,NULL,NULL); pq.push(node); } while(!pq.empty()) { HuffmanTreeNode *first=pq.top(); pq.pop(); if(pq.empty()) { root=first; break; } HuffmanTreeNode *second=pq.top(); pq.pop(); if(first->weight>second->weight) swap(first,second); HuffmanTreeNode *s=new HuffmanTreeNode('\0',first->weight+second->weight,first,second); s->right->bit=1; pq.push(s); } cout<<"CreatHuffmanTree Finished"<<endl; return root;}void dictHelp(HuffmanTreeNode *r,string &bin){ if(r) { bin.push_back(r->bit+'0'); if(!r->left&&!r->right) { dictTable[r->leafchar]=bin; RedictTable[bin]=r->leafchar; } dictHelp(r->left,bin); dictHelp(r->right,bin); bin.pop_back(); }}void ComputeDictTable(HuffmanTreeNode *r){ string bin; dictHelp(r,bin); cout<<"ComputeDictTable Finished"<<endl;}string enCode(const string &Text){ string 
code; for(auto &x:Text) { code+=dictTable[x]; } cout<<"encode Finished"<<endl; return code;}string ConvertDecToBinStr(int n,int bitnum){ string bin; while(n) { bin+='0'+n%2; n/=2; } string tail(bitnum-bin.size(),'0'); return bin+tail;}void CreatcodeToCharTable(){ for(int i=0;i<256;++i) { string b=ConvertDecToBinStr(i,8); charToCodeTable[i+'\0']=b; codeToCharTable[b]=i+'\0'; } cout<<"CreatcodeToCharTable Finished"<<endl;}string compressCode(const string &code){ string compressText,codetemp=code; int len=0; cout<<"try! : "; while(!codetemp.empty()) { //cout<<"."; string temp; if(codetemp.size()>=8) { temp=codetemp.substr(0,8); codetemp=codetemp.substr(8,codetemp.size()-8); } else { len=8-codetemp.size(); string tail(len,'0'); codetemp+=tail; compressText+=codeToCharTable[codetemp]; break; } compressText+=codeToCharTable[temp]; } char head='0'+len; compressText=head+compressText; cout<<"compressCode Finished"<<endl; return compressText;}string decompressCode(const string &compressText){ string comTexttemp=compressText,newcode; int len=comTexttemp[0]-'0'; comTexttemp=comTexttemp.substr(1,compressText.size()-1); for(unsigned char x:comTexttemp) { newcode+=charToCodeTable[x-'\0']; } cout<<"decompressCode Finished"<<endl; return newcode.substr(0,newcode.size()-len);}string ConvertCodeToText(const string &code){ string newtext,bin; for(unsigned char x:code) { bin+=x; auto it=RedictTable.find(bin); if(it!=RedictTable.end()) { newtext+=it->second; bin.clear(); } } cout<<"ConvertCodeToText Finished"<<endl; return newtext;}void disStr(const string &Text){ cout<<"------------------------"<<endl; cout<<Text<<endl; cout<<"------------------------"<<endl;}void WriteTextByRand(string &Text,unsigned int size){ srand((unsigned)time(NULL)); for(unsigned int i=0;i<size;++i) { int r=rand()%(126-33)+33; cout<<(char)r; Text.push_back((char)r); }}void WriteTextByFile(string &Text){ ifstream in; in.open("1.txt");//文件名 if(in.is_open()) { std::stringstream buffer; buffer<<in.rdbuf(); 
Text=buffer.str(); } else cout<<"can not find this file"<<endl; in.close(); cout<<"WriteTextByFile Finished"<<endl;}void prtime(clock_t s){ cout<<"usetime : "<<clock()-s<<" ms"<<endl;}char HelpPlus(const string &str,unsigned int &i,HuffmanTreeNode *r){ unsigned int len=str.size(); while(i<len&&r) { if(!r->left&&!r->right) {return r->leafchar;} int bit=str[++i]-'0'; if(bit==1) r=r->right; else r=r->left; } return '\0';}string DeCodeToText(string &decompressText,HuffmanTreeNode *r){ string NewText=""; unsigned int i=0; while(i<decompressText.size()) { NewText.push_back(HelpPlus(decompressText,i,r)); ++i; } return NewText;}int main(){ clock_t start_time=clock(); string PreText="";//文本 //WriteTextByRand(PreText,10000); WriteTextByFile(PreText); prtime(start_time); //disStr(PreText); ComputeFreqTable(PreText);prtime(start_time); HuffmanTreeNode *huff=CreatHuffmanTree();prtime(start_time); ComputeDictTable(huff);prtime(start_time); CreatcodeToCharTable();prtime(start_time); string code=enCode(PreText);prtime(start_time); //cout<<code<<endl; string compressText=compressCode(code);prtime(start_time); //disStr(compressText); string decode=decompressCode(compressText);prtime(start_time); //cout<<decode<<endl; //string newtext=ConvertCodeToText(decode);//根据打表解码 string newtext=DeCodeToText(decode,huff);//根据huffman树解码 prtime(start_time); //disStr(newtext); if(code==decode) cout<<"Same Code"<<endl; else cout<<"Different Code"<<endl; if(PreText==newtext) cout<<"Same Text"<<endl; else cout<<"Different Text"<<endl; cout<<"PreText Length : "<<PreText.size()<<" ComText Length : "<<compressText.size()<<endl; cout<<"Rate : "<<100.0*compressText.size()/PreText.size()<<"%"<<endl; ofstream o("compressText.txt"); o<<compressText<<endl; o.close(); ofstream out("newtext.txt"); out<<newtext<<endl; out.close(); clock_t end_time=clock(); cout<<"rt : "<<(end_time-start_time)*1.0/CLOCKS_PER_SEC<<" s"<<endl; return 0;}
运行结果:
1.对随机字符:随机生成10000个字符,进行huffman编码,压缩率仅仅为95%左右
ComputeFreqTable Finishedusetime : 488 msCreatHuffmanTree Finishedusetime : 493 msComputeDictTable Finishedusetime : 497 msCreatcodeToCharTable Finishedusetime : 504 msencode Finishedusetime : 508 mstry! : compressCode Finishedusetime : 573 msdecompressCode Finishedusetime : 577 msusetime : 581 msSame CodeSame TextPreText Length : 10000 ComText Length : 9481Rate : 94.81%rt : 0.591 s
2.对英语文章:文章选自china daily,题材来源不同,共计12199字,多次测试,压缩率稳定在70%左右
WriteTextByFile Finishedusetime : 1 msComputeFreqTable Finishedusetime : 3 msCreatHuffmanTree Finishedusetime : 3 msComputeDictTable Finishedusetime : 3 msCreatcodeToCharTable Finishedusetime : 4 msencode Finishedusetime : 7 mstry! : compressCode Finishedusetime : 61 msdecompressCode Finishedusetime : 64 msusetime : 66 msSame CodeSame TextPreText Length : 12199 ComText Length : 8489Rate : 69.5877%rt : 0.068 s
3.中文字符:文章选自《福尔摩斯探案集》东方探案部分,文字总计437700字,压缩率89%左右,花费时间较长,为342.452 s
WriteTextByFile Finishedusetime : 4 msComputeFreqTable Finishedusetime : 59 msCreatHuffmanTree Finishedusetime : 60 msComputeDictTable Finishedusetime : 60 msCreatcodeToCharTable Finishedusetime : 61 msencode Finishedusetime : 168 mstry! : compressCode Finishedusetime : 342272 msdecompressCode Finishedusetime : 342356 msusetime : 342420 msSame CodeSame TextPreText Length : 437700 ComText Length : 389238Rate : 88.928%rt : 342.452 s
4.时间对比:将英文累积到432673,花费时间为193.675s,远远小于编码中文时间,压缩率稳定在70%左右
WriteTextByFile Finishedusetime : 7 msComputeFreqTable Finishedusetime : 80 msCreatHuffmanTree Finishedusetime : 81 msComputeDictTable Finishedusetime : 81 msCreatcodeToCharTable Finishedusetime : 82 msencode Finishedusetime : 183 mstry! : compressCode Finishedusetime : 193529 msdecompressCode Finishedusetime : 193597 msusetime : 193652 msSame CodeSame TextPreText Length : 432673 ComText Length : 300885Rate : 69.541%rt : 193.675 s
总结:Huffman编码对英文支持较好。上面的时间主要花费在compressCode中将二进制字符串转化为字符这一过程上(每处理八位就用substr把剩余的整个字符串重新拷贝一次,整体耗时随长度呈平方增长);但是在实际计算机中,二进制序列是自动化为字符的,这里仅仅是为了模拟这一过程,因此,huffman编码在通信中还是有较大的应用价值
- Huffman编码压缩和解压文档,C++实现
- Huffman编码实现压缩、解压文件
- C++ Huffman编码压缩解压
- Huffman压缩和解压txt
- 数据压缩实验三:用c语言实现Huffman编码和压缩效率分析
- Huffman编码实现文本文件压缩
- Huffman编码实现压缩解压缩
- Huffman编码C实现
- 用哈弗曼编码实现文件压缩和解压
- Huffman编码压缩算法之压缩与解压篇
- 任意ASCII码格式信息的huffman tree压缩(编码)和解压(译码)
- Jcompress: 一款基于huffman编码和最小堆的压缩、解压缩小程序
- 【c++】Huffman实现文件压缩
- 一个用C实现的哈弗曼编码实现文件压缩和解压
- Huffman压缩解压器
- Linux下实现Huffman编码压缩算法
- Linux下实现Huffman编码压缩算法
- Huffman霍夫曼压缩编码算法实现分析
- mysql 两表联合查询数据多了很慢
- 常用的正则表达式(一)
- 最长公共子字符串
- Kotlin学习中
- 1004:Let the Balloon Rise
- Huffman编码压缩和解压文档,C++实现
- hibernate(三)
- scanf(“%c”)等问题,清空输入输出缓存
- JavaScript的选项卡操作
- Java技能清单
- Android音频系统
- Filter与Servlet的区别与联系
- git push 失败问题
- 平台与上游对账逻辑