C++将HTML特殊字符转换成正常字符
来源:互联网 发布:极速开票软件 编辑:程序博客网 时间:2024/05/22 12:49
文件1:HtmlEscape.cpp
// HTML entity unescaping helpers: convert named (&amp;) and numeric (&#8658;,
// &#x21D2;) character references into UTF-8 text.
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <string>

using namespace std;

// Returns 1 when the first byte of an int holding 1 is itself 1 (the least
// significant byte is stored first), otherwise 0.
int IsLittleEndian()
{
    const int probe = 1;
    return (*reinterpret_cast<const char*>(&probe) == 1) ? 1 : 0;
}

// Converts a decimal entity code string (e.g. "402") into two bytes holding
// the low 16 bits of the value, least significant byte first.
//
// entity_code: NUL-terminated decimal number.
// html_char:   output buffer of at least 2 bytes; html_char[0] receives the
//              low byte and html_char[1] the next byte.
//
// The bytes are produced with shifts rather than by aliasing the int, so the
// result no longer depends on host byte order (the previous big-endian branch
// selected the two most significant bytes of the int).
void HtmEscapeEntityCodeToUnicode(const char* entity_code, char* html_char)
{
    const unsigned long value =
        static_cast<unsigned long>(strtol(entity_code, NULL, 10));
    html_char[0] = static_cast<char>(value & 0xFF);
    html_char[1] = static_cast<char>((value >> 8) & 0xFF);
}

// Encodes one code point as UTF-8.
//
// unicode_char:        code point stored least significant byte first.
// unicode_char_length: number of bytes available in unicode_char; at most 4
//                      are read.
// utf_char:            output buffer; 1 to 4 bytes are written and no NUL
//                      terminator is added, so callers that need a C string
//                      must zero a buffer of at least 5 bytes beforehand.
//
// Example: 0x192 (110010010b) encodes as 0xC6 0x92.
// Values above 0x10FFFF print a diagnostic and trip assert(0).
void OneUnicode2UTF8(const char* unicode_char, size_t unicode_char_length,
                     char* utf_char)
{
    // A code point never needs more than 4 bytes; clamping also keeps the
    // accumulator below from being overrun by an oversized length.
    const size_t kMaxBytes = 4;
    if (unicode_char_length > kMaxBytes)
    {
        unicode_char_length = kMaxBytes;
    }

    // Assemble the value byte by byte so the result is independent of the
    // host's byte order.
    unsigned long value = 0;
    for (size_t i = 0; i < unicode_char_length; ++i)
    {
        const unsigned long byte_value =
            static_cast<unsigned char>(unicode_char[i]);
        value |= byte_value << (8 * i);
    }

    if (value <= 0x007F)
    {
        utf_char[0] = static_cast<char>(value);
    }
    else if (value <= 0x07FF)
    {
        utf_char[0] = static_cast<char>((value >> 6) | 0xC0);
        utf_char[1] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else if (value <= 0xFFFF)
    {
        utf_char[0] = static_cast<char>((value >> 12) | 0xE0);
        utf_char[1] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else if (value <= 0x10FFFF)
    {
        utf_char[0] = static_cast<char>((value >> 18) | 0xF0);
        utf_char[1] = static_cast<char>(((value >> 12) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[3] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else
    {
        cerr << "value too big." << endl;
        assert(0);
    }
}

// Flat table of (entity name, decimal code point) pairs.
static const char* kEntityNameToEntityCodeMap[] =
{
    "oelig", "339",
    "amp", "38",
    "rArr", "8658",
    "fnof", "402"
};

// Looks up the decimal code string for an entity name (without '&' and ';').
// Returns NULL when the name is NULL or not present in the table.
// Note: this is a linear scan; there is a lot of room for optimization here.
const char* HtmEscapeEntityNameToEntityCode(const char* entity_name)
{
    if (!entity_name)
    {
        return NULL;
    }
    static const size_t length =
        sizeof(kEntityNameToEntityCodeMap) / sizeof(char*);
    for (size_t i = 0; i + 1 < length; i += 2)
    {
        if (!strcmp(entity_name, kEntityNameToEntityCodeMap[i]))
        {
            return kEntityNameToEntityCodeMap[i + 1];
        }
    }
    return NULL;
}

// Parses the digits in [begin, end) as a number in the given base (10 or 16).
// Returns false for an empty range, a character that is not a digit of that
// base, or a value above 0x10FFFF; *code_point is only written on success.
static bool ParseCodePoint(const char* begin, const char* end, int base,
                           unsigned long* code_point)
{
    if (begin == end)
    {
        return false;
    }
    unsigned long value = 0;
    for (const char* p = begin; p != end; ++p)
    {
        const unsigned char c = static_cast<unsigned char>(*p);
        int digit = 0;
        if (isdigit(c))
        {
            digit = c - '0';
        }
        else if (base == 16 && isxdigit(c))
        {
            digit = tolower(c) - 'a' + 10;
        }
        else
        {
            return false;
        }
        value = value * static_cast<unsigned long>(base) +
                static_cast<unsigned long>(digit);
        // Checking after every digit keeps the accumulator from overflowing.
        if (value > 0x10FFFF)
        {
            return false;
        }
    }
    *code_point = value;
    return true;
}

// Appends the UTF-8 encoding of code_point to out. Returns false, appending
// nothing, for 0, surrogates (0xD800-0xDFFF) and values above 0x10FFFF.
static bool AppendCodePointAsUtf8(unsigned long code_point, string& out)
{
    const bool is_surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;
    if (code_point == 0 || code_point > 0x10FFFF || is_surrogate)
    {
        return false;
    }
    char le_bytes[4];
    for (int i = 0; i < 4; ++i)
    {
        le_bytes[i] = static_cast<char>((code_point >> (8 * i)) & 0xFF);
    }
    // One spare zeroed byte because OneUnicode2UTF8 does not NUL-terminate.
    char utf8[5] = {0};
    OneUnicode2UTF8(le_bytes, sizeof(le_bytes), utf8);
    out.append(utf8);
    return true;
}

// Replaces HTML character references in str with their UTF-8 encoding.
//
// Recognized forms:
//   &name;   - names present in kEntityNameToEntityCodeMap
//   &#NNNN;  - decimal code point
//   &#xHHHH; - hexadecimal code point
//
// Anything else that starts with '&' (unknown name, missing ';', malformed or
// out-of-range number, a lone '&') is copied to the output unchanged, so no
// input text is lost.
//
// str: NUL-terminated input; NULL is treated as an empty string.
// Returns the unescaped text.
string UnescapeUTFHTMLContent(const char* str)
{
    string result;
    if (!str)
    {
        return result;
    }

    const char* cursor = str;
    while (*cursor)
    {
        const char* amp = strchr(cursor, '&');
        if (!amp)
        {
            result.append(cursor);
            break;
        }
        // Copy the plain text that precedes the '&'.
        result.append(cursor, static_cast<size_t>(amp - cursor));

        const char* body = amp + 1;
        // Set to the terminating ';' only when a reference was decoded.
        const char* reference_end = NULL;

        if (isalpha(static_cast<unsigned char>(*body)))
        {
            const char* p = body;
            while (isalnum(static_cast<unsigned char>(*p)))
            {
                ++p;
            }
            if (*p == ';')
            {
                const string name(body, static_cast<size_t>(p - body));
                const char* code = HtmEscapeEntityNameToEntityCode(name.c_str());
                unsigned long code_point = 0;
                if (code &&
                    ParseCodePoint(code, code + strlen(code), 10, &code_point) &&
                    AppendCodePointAsUtf8(code_point, result))
                {
                    reference_end = p;
                }
            }
        }
        else if (*body == '#')
        {
            const char* digits = body + 1;
            int base = 10;
            if (*digits == 'x' || *digits == 'X')
            {
                base = 16;
                ++digits;
            }
            const char* p = digits;
            while (isalnum(static_cast<unsigned char>(*p)))
            {
                ++p;
            }
            unsigned long code_point = 0;
            if (*p == ';' &&
                ParseCodePoint(digits, p, base, &code_point) &&
                AppendCodePointAsUtf8(code_point, result))
            {
                reference_end = p;
            }
        }

        if (reference_end)
        {
            cursor = reference_end + 1;
        }
        else
        {
            // Not a reference we can decode: keep the '&' literally and
            // continue right after it, which always makes progress.
            result.push_back('&');
            cursor = body;
        }
    }
    return result;
}
// Demo driver: prints a greeting, then runs UnescapeUTFHTMLContent over a
// fixed list of sample strings and prints each result after its label.
int main(int argc, char *argv[])
{
    // One sample per output line: the label printed first, then the input
    // handed to UnescapeUTFHTMLContent.
    struct Sample
    {
        const char* label;
        const char* input;
    };
    static const Sample kSamples[] =
    {
        { "str: ", "ΖabcdΕhello©<a>⇒⇒" "ƒ…</a>" "asfas‡dfeΥΨΩ<img>n↓n⋅nωmmm</img>1jh" },
        { "str: ", "<td>∠</td>" },
        { "str: ", "<td>∠</td>;;#8736;" },
        { "str: ", "<td>∠</td>;&" },
        { "str: ", "<td>∠</td>;&" },
        { "str1: ", "<td>∠</td>;&" },
        { "ab: ", "ab" }
    };
    const size_t sample_count = sizeof(kSamples) / sizeof(kSamples[0]);

    printf("Hello, world\n");

    for (size_t i = 0; i < sample_count; ++i)
    {
        const std::string decoded = UnescapeUTFHTMLContent(kSamples[i].input);
        std::cout << kSamples[i].label << decoded << std::endl;
    }
    return 0;
}
输出结果为:
str: ΖabcdΕhello©<a>⇒⇒ƒ…</a>asfas‡dfeΥPsiΩ<img>n↓nsdotnωmmm</img>1jh
str: <td>∠</td>
str: <td>∠</td>;;#8736;
str: <td>∠</td>;&
str: <td>∠</td>;amp
str1: <td>∠</td>;&
ab: ab
0 0
- C++将HTML特殊字符转换成正常字符
- java将html实体字符转换成正常字符
- java将html实体字符转换成正常字符
- java将html实体字符转换成正常字符
- android 特殊字符(Html)转换正常
- [HTTP]_[C/C++]_[HTML特殊字符转义成正常字符]
- html特殊字符转换
- phpcms html 特殊字符转换
- 用JS将xml中特殊字符转换成Unicode
- 防sql注入,将特殊字符转换成%20
- php应用如何转换HTML特殊字符
- html特殊字符转换(java)
- html特殊字符转换(java)
- java实现html特殊字符转换
- Java 如何将html转以后的字符转化成正常显示的字符
- javascript的转换特殊字符为HTML实体字符
- java转换 HTML字符实体,java特殊字符转义字符串
- selenium webdriver获取不到标签文本及将html特殊符号&转换为正常字符的方法
- 设计模式-单例模式
- IT 圈里经常被读错的词
- PHP将操作数据库得到的数据生成Excel导出
- reunet 在家开着电脑挂个东西就能每天有收入
- eclipse经常卡死的问题
- C++将HTML特殊字符转换成正常字符
- 客户端存储技术-PersistJS
- PHP将服务器上的文件下载(例如下载Excel模板)
- C++构造函数 explicit
- 程序员面试题目总结(持续更新)
- OC学习(一)类的定义和对象的创建
- HDU 1102
- 线程问题 之 支付时支付界面未弹出问题(C++调用Java)
- c++ 指针值和指针地址的输出&指针和数组保存字符串的区别