Lua中对于UTF-8字符串的处理

来源:互联网 发布:网络销售新三板靠谱吗 编辑:程序博客网 时间:2024/06/05 14:41

UTF-8字符串工具类


Lua代码:utf8_simple.lua

-- utf8_simple.lua: small helpers for working with UTF-8 encoded Lua strings.
--
-- ABNF from RFC 3629
--
-- UTF8-octets = *( UTF8-char )
-- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
-- UTF8-1      = %x00-7F
-- UTF8-2      = %xC2-DF UTF8-tail
-- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
--               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
--               %xF4 %x80-8F 2( UTF8-tail )
-- UTF8-tail   = %x80-BF
--
-- 0xxxxxxx                            | 007F   (127)
-- 110xxxxx 10xxxxxx                   | 07FF   (2047)
-- 1110xxxx 10xxxxxx 10xxxxxx          | FFFF   (65535)
-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 10FFFF (1114111)

-- One character = one lead byte (NUL, 0x01-0x7F or 0xC2-0xF4) followed by any
-- number of continuation bytes (0x80-0xBF). The pattern does not check that the
-- number of continuation bytes matches the lead byte.
-- %z matches NUL; Lua 5.1 patterns cannot contain an embedded zero.
local pattern = '[%z\1-\127\194-\244][\128-\191]*'

-- Converts a negative (counted from the end) position into a positive one,
-- in the manner of string.sub. Non-negative positions are returned unchanged.
local function posrelat(pos, len)
	if pos < 0 then
		pos = len + pos + 1
	end

	return pos
end

-- Number of substrings of s matched by `pattern`, i.e. the number of
-- characters that utf8.map / utf8.chars will visit.
local function count_chars(s)
	local _, n = s:gsub(pattern, '')

	return n
end

local utf8 = {}

-- THE MEAT

-- Calls f once for every character matched in s, in order.
--   s       : string to walk
--   f       : callback, invoked as f(char_index, char, byte_index)
--   no_subs : when truthy, the second callback argument is the character's
--             length in bytes instead of the character itself
-- Returns nothing.
utf8.map = function (s, f, no_subs)
	local i = 0

	if no_subs then
		-- two position captures: start byte and the byte just past the end
		for b, e in s:gmatch('()' .. pattern .. '()') do
			i = i + 1
			f(i, e - b, b)
		end
	else
		for b, c in s:gmatch('()(' .. pattern .. ')') do
			i = i + 1
			f(i, c, b)
		end
	end
end

-- THE REST

-- Generator form of utf8.map: returns a function that yields the same
-- (char_index, char_or_byte_length, byte_index) triples one call at a time,
-- so it can drive a generic `for` loop.
utf8.chars = function (s, no_subs)
	return coroutine.wrap(function ()
		return utf8.map(s, coroutine.yield, no_subs)
	end)
end

-- Returns the number of characters in a UTF-8 string.
-- Implemented by counting every byte outside 0x80-0xC1 (i.e. everything that
-- is not a continuation byte or one of the lead values 0xC0/0xC1). Note that
-- bytes 0xF5-0xFF are counted here although `pattern` never matches them.
utf8.len = function (s)
	return select(2, s:gsub('[^\128-\193]', ''))
end

-- Replaces characters of s through `map`, which may be anything string.gsub
-- accepts as a replacement (string, table or function).
-- Returns both string.gsub results: the new string and the match count.
utf8.replace = function (s, map)
	return s:gsub(pattern, map)
end

-- Returns s with its characters in reverse order.
utf8.reverse = function (s)
	-- Pre-reverse the bytes of every multi-byte character so that the
	-- byte-wise reverse below puts them back in their original byte order.
	-- Returning false from the callback keeps single-byte matches unchanged.
	s = s:gsub(pattern, function (c) return #c > 1 and c:reverse() end)

	return s:reverse()
end

-- Removes every multi-byte (non-ASCII) character from s.
-- Returns both string.gsub results: the stripped string and the match count.
utf8.strip = function (s)
	return s:gsub(pattern, function (c) return #c > 1 and '' end)
end

-- Like string.sub(), but i and j are character indices rather than byte
-- indices. Negative indices count from the end; j defaults to the last
-- character. Returns '' when the resulting range is empty.
utf8.sub = function (s, i, j)
	-- Count with the same pattern the loop below iterates with, so that the
	-- clamped indices always refer to characters the loop will actually reach
	-- (utf8.len also counts bytes 0xF5-0xFF, which the pattern skips).
	local l = count_chars(s)

	i =       posrelat(i, l)
	j = j and posrelat(j, l) or l

	if i < 1 then i = 1 end
	if j > l then j = l end

	if i > j then return '' end

	local first_byte, last_byte
	local n = 0

	for b, e in s:gmatch('()' .. pattern .. '()') do
		n = n + 1

		if not first_byte and n >= i then
			first_byte = b
		end

		if n >= j then
			-- e is the position just past the matched character
			last_byte = e - 1
			break
		end
	end

	return string.sub(s, first_byte, last_byte)
end

return utf8
参考项目:https://github.com/Pogs/lua-utf8-simple


使用示例


-- Usage example for the utf8_simple module.
local utf8 = require('utf8_simple')

local str = "你好!"

-- Print the length of the string as reported by utf8.len
print(str .. " len = " .. utf8.len(str))

-- Take a substring by index (Lua string indices start at 1)
print("sub str = " .. utf8.sub(str, 1, 2))
运行效果:

你好! len = 3
sub str = 你好


0 0
原创粉丝点击