C++11 UTF-8 substr

来源:互联网 发布:淘宝知识侵权 编辑:程序博客网 时间:2024/06/05 23:55
改编自:http://www.zedwood.com/article/cpp-utf-8-mb_substr-function
原文的实现也有问题,我基于它改了一个更完善的版本,另外还写了一个统计文字(字符)个数的函数。


/**
 * Counts the characters (code points) in a UTF-8 encoded string.
 *
 * Each character is recognised from its lead byte:
 *   0xxxxxxx -> 1 byte, 110xxxxx -> 2 bytes, 1110xxxx -> 3 bytes,
 *   11110xxx -> 4 bytes.
 * The obsolete 5- and 6-byte forms are not accepted.
 *
 * @param str  Byte string expected to hold UTF-8 text.
 * @return Number of characters in @p str, or 0 when @p str is empty or is
 *         not structurally valid UTF-8 (unknown lead byte, a sequence cut
 *         off by the end of the string, or a byte inside a sequence that is
 *         not a 10xxxxxx continuation byte).
 *
 * @note Only the byte structure is checked; overlong encodings and
 *       surrogate code points are not rejected.
 */
uint64_t pinyin::utf8_len(const std::string &str) {
    typedef std::string::size_type size_type;

    const size_type size = str.length();
    uint64_t count = 0;

    size_type pos = 0;
    while (pos < size) {
        const unsigned char lead = static_cast<unsigned char>(str[pos]);

        // Total number of bytes the character starting at `pos` occupies.
        size_type seq_len;
        if (lead <= 0x7F) {
            seq_len = 1;
        } else if ((lead & 0xE0) == 0xC0) {
            seq_len = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            seq_len = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            seq_len = 4;
        } else {
            return 0;  // stray continuation byte or unsupported lead byte
        }

        // A sequence must not run past the end of the string.
        if (seq_len > size - pos) {
            return 0;
        }

        // Every byte after the lead byte must look like 10xxxxxx.
        for (size_type k = 1; k < seq_len; ++k) {
            const unsigned char next = static_cast<unsigned char>(str[pos + k]);
            if ((next & 0xC0) != 0x80) {
                return 0;
            }
        }

        pos += seq_len;
        ++count;
    }
    return count;
}

std::string pinyin::utf8_substr(const std::string &str,uint64_t start, uint64_t leng) {
    if (leng == 0) { return""; }
    uint64_t c, i, ix, q,min = std::string::npos, max = std::string::npos;
    for (q = 0, i = 0, ix =str.length(); i < ix; i++, q++) {
       if (q == start) { min = i; }
       if (q <= start + leng || leng ==std::string::npos) { max = i; }

       c = (unsigned char) str[i];
       if (c >= 0 && c <= 127) i +=0;
       else if ((c & 0xE0) == 0xC0) i += 1;
       else if ((c & 0xF0) == 0xE0) i += 2;
       else if ((c & 0xF8) == 0xF0) i += 3;
           //else if(($c & 0xFC) == 0xF8) i+=4; // 111110bb //byte 5, unnecessaryin 4 byte UTF-8
           //else if(($c & 0xFE) == 0xFC) i+=5; // 1111110b //byte 6, unnecessaryin 4 byte UTF-8
       else return "";//invalid utf8
    }
    if (q <= start + leng|| leng == std::string::npos) { max = i; }
    if (min ==std::string::npos || max == std::string::npos) { return ""; }
    return str.substr(min,max - min);
}
阅读全文
0 0
原创粉丝点击