[HTTP]_[C/C++]_[HTML特殊字符转义成正常字符]
来源:互联网 发布:python退出程序命令 编辑:程序博客网 时间:2024/06/06 01:40
场景:
1.在没有HTML库时(谁有好的html库介绍下,C/C++的?tinyXML?),以SAX方式解析HTML时,会读入特殊字符,这时候需要转义成正常字符才能使用。
2.耗时,4-6小时(被打扰)。
3.replace虽然挺好,但是会循环整个字符串执行替换,效率应该没有一次过替换高.
#include <algorithm>
void replace( iterator start, iterator end, const TYPE& old_value, const TYPE& new_value );
4.所有转义字符的网址:
http://114.xixik.com/character/
文件1:test_htmlescape.cpp
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <string>

// Returns 1 when the host stores the least significant byte of an int first,
// 0 otherwise.
int IsLittleEndian()
{
    const int probe = 1;
    return (*reinterpret_cast<const char*>(&probe) == 1) ? 1 : 0;
}

// Converts a decimal entity code (for example "8736") into a single 16-bit
// code unit written low byte first into html_char[0] and html_char[1].
//
// Only the low 16 bits of the parsed number are kept, so this helper cannot
// represent code points above U+FFFF. The bytes are produced with shifts, so
// the result does not depend on the byte order of the host.
//
// entity_code: NUL-terminated decimal number; NULL is treated as 0.
// html_char:   output buffer of at least 2 bytes; NULL is ignored.
void HtmEscapeEntityCodeToUnicode(const char* entity_code, char* html_char)
{
    if (!html_char)
    {
        return;
    }
    const unsigned long value = entity_code ? strtoul(entity_code, NULL, 10) : 0UL;
    html_char[0] = static_cast<char>(value & 0xFF);
    html_char[1] = static_cast<char>((value >> 8) & 0xFF);
}

// Encodes one code point as UTF-8.
//
// unicode_char:        the code point as raw bytes, least significant first.
// unicode_char_length: number of bytes to read from unicode_char; at most 4
//                      bytes are used.
// utf_char:            output buffer; 1 to 4 bytes are written depending on
//                      the value and NO terminating NUL is added.
//
// Values above 0x10FFFF print "value too big." to stderr and trip assert(0).
void OneUnicode2UTF8(const char* unicode_char, size_t unicode_char_length, char* utf_char)
{
    // unicode: 0x192 -> 110010010, utf8: 0xC692 -> 1100011010010010
    if (unicode_char_length > 4)
    {
        unicode_char_length = 4;
    }

    // Assemble the value byte by byte so the result is host-order independent.
    unsigned long value = 0;
    for (size_t i = 0; i < unicode_char_length; ++i)
    {
        const unsigned long byte = static_cast<unsigned char>(unicode_char[i]);
        value |= byte << (8 * i);
    }

    if (value <= 0x007F)
    {
        utf_char[0] = static_cast<char>(value);
    }
    else if (value <= 0x07FF)
    {
        utf_char[0] = static_cast<char>((value >> 6) | 0xC0);
        utf_char[1] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else if (value <= 0xFFFF)
    {
        utf_char[0] = static_cast<char>((value >> 12) | 0xE0);
        utf_char[1] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else if (value <= 0x10FFFF)
    {
        utf_char[0] = static_cast<char>((value >> 18) | 0xF0);
        utf_char[1] = static_cast<char>(((value >> 12) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[3] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else
    {
        std::cerr << "value too big." << std::endl;
        assert(0);
    }
}

// Flat table of (entity name, decimal code point) pairs.
static const char* kEntityNameToEntityCodeMap[] = {
    "oelig", "339",
    "amp", "38",
    "rArr", "8658",
    "fnof", "402"
};

// Looks up the decimal code point string for an entity name such as "amp".
// Returns NULL when the name is NULL or not present in the table.
// Note: the lookup is a linear scan; there is a lot of room for optimization.
const char* HtmEscapeEntityNameToEntityCode(const char* entity_name)
{
    if (!entity_name)
    {
        return NULL;
    }
    const size_t length =
        sizeof(kEntityNameToEntityCodeMap) / sizeof(kEntityNameToEntityCodeMap[0]);
    for (size_t i = 0; i + 1 < length; i += 2)
    {
        if (!strcmp(entity_name, kEntityNameToEntityCodeMap[i]))
        {
            return kEntityNameToEntityCodeMap[i + 1];
        }
    }
    return NULL;
}

// Appends the UTF-8 encoding of code_point to out.
// Returns false and appends nothing when code_point is 0, a surrogate
// (0xD800-0xDFFF) or greater than 0x10FFFF.
static bool AppendCodePointAsUTF8(unsigned long code_point, std::string& out)
{
    if (code_point == 0 || code_point > 0x10FFFF ||
        (code_point >= 0xD800 && code_point <= 0xDFFF))
    {
        return false;
    }

    // OneUnicode2UTF8 expects the value as bytes, least significant first.
    char packed[4];
    for (int i = 0; i < 4; ++i)
    {
        packed[i] = static_cast<char>((code_point >> (8 * i)) & 0xFF);
    }

    char utf8[4] = {0, 0, 0, 0};
    OneUnicode2UTF8(packed, sizeof(packed), utf8);

    // The encoder does not terminate its output, so append by explicit length.
    size_t utf8_length = 4;
    if (code_point <= 0x7F)
    {
        utf8_length = 1;
    }
    else if (code_point <= 0x7FF)
    {
        utf8_length = 2;
    }
    else if (code_point <= 0xFFFF)
    {
        utf8_length = 3;
    }
    out.append(utf8, utf8_length);
    return true;
}

// Parses the part of a numeric character reference that follows "&#":
// decimal digits, or 'x'/'X' followed by hexadecimal digits, then ';'.
// On success stores the value in *code_point and returns a pointer just past
// the ';'. Returns NULL when there are no digits or the ';' is missing.
// Values beyond 0x10FFFF stop accumulating (so they cannot overflow) and are
// reported as some value greater than 0x10FFFF.
static const char* ParseNumericReference(const char* body, unsigned long* code_point)
{
    unsigned long base = 10;
    const char* cursor = body;
    if (*cursor == 'x' || *cursor == 'X')
    {
        base = 16;
        ++cursor;
    }

    const char* const first_digit = cursor;
    unsigned long value = 0;
    for (;; ++cursor)
    {
        const unsigned char ch = static_cast<unsigned char>(*cursor);
        unsigned long digit = 0;
        if (isdigit(ch))
        {
            digit = static_cast<unsigned long>(ch - '0');
        }
        else if (base == 16 && isxdigit(ch))
        {
            digit = static_cast<unsigned long>(tolower(ch) - 'a' + 10);
        }
        else
        {
            break;
        }
        if (value <= 0x10FFFF)
        {
            value = value * base + digit;
        }
    }

    if (cursor == first_digit || *cursor != ';')
    {
        return NULL;
    }
    *code_point = value;
    return cursor + 1;
}

// Replaces HTML character references in str with their UTF-8 encoding.
//
// Handled forms:
//   &name;   - name is looked up with HtmEscapeEntityNameToEntityCode
//   &#NNN;   - decimal code point
//   &#xHHH;  - hexadecimal code point
//
// Anything that is not a complete, known and valid reference (a lone '&', an
// unknown name, a missing ';', an out-of-range number) is copied to the
// result unchanged, so no input text is lost.
//
// str: NUL-terminated input; NULL yields an empty string.
// Returns the unescaped text.
std::string UnescapeUTFHTMLContent(const char* str)
{
    std::string result;
    if (!str)
    {
        return result;
    }

    const char* cursor = str;
    while (*cursor)
    {
        const char* const amp = strchr(cursor, '&');
        if (!amp)
        {
            result.append(cursor);
            break;
        }
        // Copy the plain text in front of the '&'.
        result.append(cursor, static_cast<size_t>(amp - cursor));

        const char* const after_amp = amp + 1;
        const char* resume = NULL;  // stays NULL unless a reference was decoded

        if (*after_amp == '#')
        {
            unsigned long code_point = 0;
            const char* const end = ParseNumericReference(after_amp + 1, &code_point);
            if (end && AppendCodePointAsUTF8(code_point, result))
            {
                resume = end;
            }
        }
        else if (isalpha(static_cast<unsigned char>(*after_amp)))
        {
            // The name is the run of alphanumerics directly after '&'; it must
            // be closed by ';' right away, so a ';' further along in ordinary
            // text is never mistaken for the terminator.
            const char* name_end = after_amp;
            while (isalnum(static_cast<unsigned char>(*name_end)))
            {
                ++name_end;
            }
            if (*name_end == ';')
            {
                const std::string name(after_amp, static_cast<size_t>(name_end - after_amp));
                const char* const code = HtmEscapeEntityNameToEntityCode(name.c_str());
                if (code && AppendCodePointAsUTF8(strtoul(code, NULL, 10), result))
                {
                    resume = name_end + 1;
                }
            }
        }

        if (!resume)
        {
            // Not a reference we can decode: keep the '&' and carry on with
            // the character after it. This always advances the cursor.
            result.push_back('&');
            resume = after_amp;
        }
        cursor = resume;
    }
    return result;
}

// Runs UnescapeUTFHTMLContent over a few sample strings and prints the results.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    printf("Hello, world\n");
    std::string str;

    const char *html_str = "ΖabcdΕhello©<a>⇒⇒" "ƒ…</a>" "asfas‡dfeΥΨΩ<img>n↓n⋅nωmmm</img>1jh";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "str: " << str << std::endl;

    html_str = "<td>&#8736;</td>";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "str: " << str << std::endl;

    html_str = "<td>&#8736;</td>;;#8736;";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "str: " << str << std::endl;

    html_str = "<td>&#8736;</td>;&";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "str: " << str << std::endl;

    html_str = "<td>&#8736;</td>;&";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "str: " << str << std::endl;

    html_str = "<td>&#8736;</td>;&";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "str1: " << str << std::endl;

    html_str = "ab";
    str = UnescapeUTFHTMLContent(html_str);
    std::cout << "ab: " << str << std::endl;

    return 0;
}
str: ΖabcdΕhello©<a>⇒⇒ƒ…</a>asfas‡dfeΥPsiΩ<img>n↓nsdotnωmmm</img>1jh
str: <td>∠</td>
str: <td>∠</td>;;#8736;
str: <td>∠</td>;&
str: <td>∠</td>;amp
str1: <td>∠</td>;&
ab: ab
- [HTTP]_[C/C++]_[HTML特殊字符转义成正常字符]
- [HTTP]_[C/C++]_[解析URL的转义字符百分比字符串]
- html 特殊转义字符
- 2_转义字符
- C语言中的特殊转义字符
- ORACLE 特殊字符 _
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- HTML中的特殊转义字符
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- HTML特殊转义字符列表
- Oracle 转义HTML特殊字符
- html - 常用特殊转义字符
- Unittest使用和代码简介
- 2012年的最后一天
- 设计模式之4.1 Spring入门初步之Spring bean工厂
- 目前游戏行业内部主要几款游戏引擎的技术对比
- 求建议:现在做PC上的中大型游戏,有哪些引擎可供选择
- [HTTP]_[C/C++]_[HTML特殊字符转义成正常字符]
- Raspberry Pi 树莓派的备份
- Sql Server 里的向上取整、向下取整、四舍五入取整的实例!
- [C/C++]_[单个Unicode转UTF8]
- 从cmos中读取数据并显示
- link和@import的区别
- Capitulum Sampling
- 教务这段时间
- VBO__PBO与FBO