汉字转拼音的类

来源:互联网 发布:java数据清洗 编辑:程序博客网 时间:2024/04/28 15:56

这个类使用查表法获取汉字的汉语拼音。对多音字只取常用的读音

涵盖 GBK 字库的全部汉字

对照表采用 UCS-2 内码顺序排列,有地址连续的优点。拼音信息(声母、韵母、调号)被压缩在两个字节内

所以有体积小,检索速度快的特点

可自动识别传入串的字符集(仅区分 UTF-8 与 GBK:能通过 UTF-8 校验的按 UTF-8 处理,否则按 GBK 处理),返回结果使用与传入串相同的字符集

/**
 * Converts Chinese characters to Hanyu Pinyin (with tone marks) through a
 * binary lookup table.
 *
 * Table layout: one 16-bit record per code point, starting at U+4E00, so the
 * record for code point C is stored at byte offset (C - 0x4E00) * 2. Each
 * record is the decimal number IIFFT where II is an index into the list of
 * initials ($dict[0]), FF an index into the list of finals ($dict[1]) and T
 * the tone digit. A record of 0 means "no entry for this character".
 *
 * Records are written and read with pack('S') / unpack('S'), i.e. in machine
 * byte order, so a table file is only valid on machines with the same
 * endianness as the one that generated it.
 */
class Tpinyin
{
    /** @var array [0] => initials, [1] => finals; positions are the codes stored in the table. */
    protected $dict = array();

    /** @var string Path of the binary lookup table. */
    protected $dictfile = 'py.dict';

    /** @var array Tone-marked vowels in the output charset: vowel => array of 4 marks (tones 1-4). */
    protected $tune = array();

    /** @var string Charset detected for the last input ('UTF-8' or 'GBK'); also used for output. */
    private $charset = '';

    /** @var int Size of the lookup table in bytes. */
    private $maxlength = 0;

    /** @var resource|null Read handle on the lookup table. */
    private $fp;

    public function __construct()
    {
        $this->dict[] = explode(',', ',b,p,m,f,d,t,n,l,g,k,h,j,q,x,zh,ch,sh,r,z,c,s,y,e,w,a,o,e');
        $this->dict[] = explode(',', ',a,ai,an,ang,ao,e,ei,en,eng,ev,i,ia,ian,iang,iao,ie,in,ing,iong,iou,iu,o,ong,ou,r,u,ua,uai,uan,uang,ue,uei,uen,ueng,ui,un,uo,uong,v,van,vn,ve,n,ng');
    }

    public function __destruct()
    {
        $this->closedict();
    }

    /**
     * Builds the binary lookup table ($dictfile) from a text source.
     *
     * Each source line is expected to look like "<character>,<pinyin><tone digit>"
     * (for example "中,zhong1"); the character may be UTF-8 or GBK encoded.
     * Lines that do not match, that name an unknown initial/final, or whose
     * character lies below U+4E00 are skipped.
     *
     * @param string $fn Path of the text source.
     * @throws RuntimeException When the source cannot be read or the table cannot be written.
     */
    public function loaddict($fn = 'pylib.bmp')
    {
        // Validate the source BEFORE touching the table: truncating first would
        // leave an empty table behind that pinyin() never rebuilds.
        if (!is_file($fn) || !is_readable($fn)) {
            throw new RuntimeException("Pinyin source file '{$fn}' is missing or unreadable");
        }
        $src = fopen($fn, 'r');
        if ($src === false) {
            throw new RuntimeException("Cannot open pinyin source file '{$fn}'");
        }
        $out = fopen($this->dictfile, 'wb');
        if ($out === false) {
            fclose($src);
            throw new RuntimeException("Cannot create dictionary file '{$this->dictfile}'");
        }

        // Compare against false explicitly so a line consisting of "0" does not end the loop.
        while (($line = fgets($src)) !== false) {
            $fields = explode(',', $line);
            if (count($fields) < 2 || !preg_match('/(.h?)([a-z]*)(\d)/', $fields[1], $m)) {
                continue;
            }

            $initial = array_search($m[1], $this->dict[0], true);
            $final = array_search($m[2], $this->dict[1], true);
            if ($initial === false || $final === false) {
                continue;
            }

            $ucs = mb_convert_encoding($fields[0], 'UCS-2', 'UTF-8, GBK');
            if (strlen($ucs) !== 2) {
                continue; // not exactly one BMP character
            }
            $offset = hexdec(bin2hex($ucs)) - 0x4e00;
            if ($offset < 0) {
                continue; // below the first code point covered by the table
            }

            $code = ($initial * 100 + $final) * 10 + (int) $m[3];
            fseek($out, $offset * 2);
            fwrite($out, pack('S', $code));
        }

        fclose($src);
        fclose($out);
    }

    /**
     * Fills $tune with the tone-marked vowels converted to the current charset.
     *
     * The source table is UCS-2; 'v' stands for u-umlaut. Falls back to UTF-8
     * when no charset has been detected yet.
     */
    public function settune()
    {
        $marks = array(
            'a' => array("\x01\x01", "\x00\xe1", "\x01\xce", "\x00\xe0"),
            'e' => array("\x01\x13", "\x00\xe9", "\x01\x1b", "\x00\xe8"),
            'i' => array("\x01\x2b", "\x00\xed", "\x01\xd0", "\x00\xec"),
            'o' => array("\x01\x4d", "\x00\xf3", "\x01\xd2", "\x00\xf2"),
            'u' => array("\x01\x6b", "\x00\xfa", "\x01\xd4", "\x00\xf9"),
            'v' => array("\x01\xd6", "\x01\xd8", "\x01\xda", "\x01\xdc"),
        );
        $charset = $this->outcharset();
        foreach ($marks as $vowel => $tones) {
            foreach ($tones as $i => $ucs) {
                $this->tune[$vowel][$i] = mb_convert_encoding($ucs, $charset, 'UCS-2');
            }
        }
    }

    /**
     * Converts a string to pinyin, one array element per input character.
     *
     * Characters without a table entry (ASCII, punctuation, unknown characters)
     * are returned unchanged. The result uses the same charset as the input,
     * which is taken to be UTF-8 if it validates as UTF-8 and GBK otherwise.
     *
     * @param string $str Input text, UTF-8 or GBK.
     * @return array List of strings; empty for an empty input.
     * @throws RuntimeException When the lookup table cannot be built or opened.
     */
    public function pinyin($str)
    {
        $str = (string) $str;
        $this->charset = mb_check_encoding($str, 'UTF-8') ? 'UTF-8' : 'GBK';
        $this->settune();

        if ($str === '') {
            return array();
        }

        if (!file_exists($this->dictfile)) {
            $this->loaddict();
        }

        // Release the handle of a previous call instead of leaking one per call.
        // The new handle stays open afterwards so pinyin_back() remains callable.
        $this->closedict();
        $fp = fopen($this->dictfile, 'rb');
        if ($fp === false) {
            throw new RuntimeException("Cannot open dictionary file '{$this->dictfile}'");
        }
        $this->fp = $fp;
        // fstat() is not affected by PHP's stat cache, unlike filesize().
        $stat = fstat($fp);
        $this->maxlength = $stat === false ? 0 : $stat['size'];

        // The charset is already known; pass it explicitly rather than re-detecting.
        $ucs = mb_convert_encoding($str, 'UCS-2', $this->charset);

        return array_map(array($this, 'pinyin_back'), str_split($ucs, 2));
    }

    /**
     * Converts a single UCS-2 (big-endian, 2-byte) character to pinyin.
     *
     * @param string $ch Exactly two bytes of UCS-2.
     * @return string Pinyin with tone mark, or the character itself (in the
     *                output charset) when the table has no entry for it.
     */
    public function pinyin_back($ch)
    {
        if (strlen($ch) !== 2) {
            return $ch;
        }

        $hi = ord($ch[0]);
        $lo = ord($ch[1]);
        if ($hi === 0 && $lo < 0x80) {
            return $ch[1]; // plain ASCII is identical in UTF-8 and GBK
        }

        $o = (($hi << 8) | $lo) - 0x4e00;
        // $maxlength is in BYTES while $o counts 2-byte records.
        if ($o < 0 || $o * 2 + 2 > $this->maxlength || !is_resource($this->fp)) {
            return $this->passthrough($ch);
        }

        if (fseek($this->fp, $o * 2) !== 0) {
            return $this->passthrough($ch);
        }
        $raw = fread($this->fp, 2);
        if ($raw === false || strlen($raw) !== 2) {
            return $this->passthrough($ch);
        }

        $code = current(unpack('S', $raw));
        if ($code === 0) {
            return $this->passthrough($ch); // hole in the table: no reading known
        }

        // Decode IIFFT.
        $tone = $code % 10;
        $final = (int) ($code / 10) % 100;
        $initial = (int) ($code / 1000);
        if (!isset($this->dict[0][$initial], $this->dict[1][$final])) {
            return $this->passthrough($ch); // corrupt record
        }

        return $this->applytone($this->dict[0][$initial] . $this->dict[1][$final], $tone);
    }

    /**
     * Puts the tone mark on the correct vowel of an ASCII pinyin syllable.
     *
     * Placement follows the standard rule: 'a' if present, otherwise 'o' or
     * 'e', otherwise the last of i/u/v (so "iu" marks the u and "ui" the i).
     * Tones outside 1-4 (neutral tone) and syllables without a vowel are
     * returned unmarked.
     */
    private function applytone($syllable, $tone)
    {
        if ($tone < 1 || $tone > 4) {
            return $syllable;
        }

        $pos = false;
        foreach (array('a', 'o', 'e') as $vowel) {
            $pos = strpos($syllable, $vowel);
            if ($pos !== false) {
                break;
            }
        }
        if ($pos === false) {
            for ($i = strlen($syllable) - 1; $i >= 0; $i--) {
                if (strpos('iuv', $syllable[$i]) !== false) {
                    $pos = $i;
                    break;
                }
            }
        }
        if ($pos === false || !isset($this->tune[$syllable[$pos]][$tone - 1])) {
            return $syllable;
        }

        // The syllable is pure ASCII here, so byte offsets are character offsets.
        return substr($syllable, 0, $pos)
            . $this->tune[$syllable[$pos]][$tone - 1]
            . substr($syllable, $pos + 1);
    }

    /** Returns a UCS-2 character unchanged, re-encoded in the output charset. */
    private function passthrough($ch)
    {
        return mb_convert_encoding($ch, $this->outcharset(), 'UCS-2');
    }

    /** Output charset; UTF-8 until pinyin() has detected one. */
    private function outcharset()
    {
        return $this->charset !== '' ? $this->charset : 'UTF-8';
    }

    /** Closes the table handle if one is open. */
    private function closedict()
    {
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
        $this->fp = null;
    }
}

示例

// Convert a mixed string and print the per-character results joined together.
$converter = new Tpinyin();
$syllables = $converter->pinyin('啊!中国');
echo implode('', $syllables);

将输出 ā!zhōngguó


原创粉丝点击