PHP多音字拼音处理方案

来源:互联网 发布:容桂讯宝淘宝分部 编辑:程序博客网 时间:2024/04/20 01:31

很多时候我们会遇到这样的需求:在类似 suggest(搜索建议)的功能中,需要把输入的汉字转为拼音,或者根据拼音来生成热门关键词。

热门关键词在这里暂时不做讨论,我们只说拼音处理的方法。拼音处理有一个必不可少的前提:必须有一个拼音库。下面会给出拼音库的下载地址,供大家下载使用。

此拼音处理类存在的问题是效率过低,我们在处理的时候建议写成PHP扩展的模式来进行处理。在下一期中我们将使用PHP扩展的模式来进行处理和讲解。

拼音库的下载地址:http://www.mdbg.net/chindict/chindict.php?page=cedict

下面是使用演示:

echo Pinyin::trans('带着希望去旅行,比到达终点更美好'), "\n";

// output: "dài zhe xī wàng qù lǔ xíng bǐ dào dá zhōng diǎn gèng měi hǎo"

下面是实现的代码:
/**
 * Converts Chinese text to pinyin using a CC-CEDICT dictionary file.
 *
 * The dictionary is parsed once into a PHP array (longest words first) and
 * cached as an includable PHP file under ./cache. Conversion replaces every
 * dictionary word found in the input with its numbered pinyin, then either
 * turns the tone numbers into tone marks or strips them.
 *
 * Settings are stored statically, so anything passed to set() or trans()
 * stays in effect for later calls.
 */
class Pinyin
{
    /**
     * Path of the CC-CEDICT dictionary file.
     *
     * @var string
     */
    protected $dictionary;

    /**
     * Conversion settings.
     *
     * - delimiter: string placed between pinyin syllables in the result.
     * - accent:    true to output tone marks, false to drop the tones.
     *
     * @var array
     */
    protected static $setting = array(
        'delimiter' => ' ',
        'accent'    => true,
    );

    /**
     * Shared instance used by the static API.
     *
     * @var Pinyin
     */
    protected static $instance;

    /**
     * Set the dictionary path and the memory limit used while converting.
     *
     * NOTE(review): ini_set() is unconditional, so it also replaces a
     * memory_limit that was configured higher than 160M.
     */
    public function __construct()
    {
        ini_set('memory_limit', '160M');
        $this->dictionary = __DIR__ . '/cedict/cedict_ts.u8';
    }

    /**
     * Merge the given settings into the shared settings.
     *
     * @param array $setting settings to override (see $setting).
     */
    public static function set(array $setting = array())
    {
        self::$setting = array_merge(self::$setting, $setting);
    }

    /**
     * Get the shared Pinyin instance, creating it on first use.
     *
     * @return Pinyin
     */
    public static function getInstance()
    {
        if (self::$instance === null) {
            self::$instance = new self;
        }

        return self::$instance;
    }

    /**
     * Convert a Chinese string to pinyin.
     *
     * @param string $string  source string.
     * @param array  $setting settings; merged into the shared settings, so
     *                        they also apply to subsequent calls.
     *
     * @return string
     */
    public static function trans($string, array $setting = array())
    {
        $instance = self::getInstance();

        if (!empty($setting)) {
            self::set($setting);
        }

        // Only touch the (large) dictionary when there is something to convert.
        if ($instance->containsChinese($string)) {
            foreach ($instance->loadDictionary() as $line) {
                $string = str_replace($line['simplified'], "{$line['pinyin_marks']} ", $string);

                // Stop scanning the dictionary once every Chinese character is gone.
                if (!$instance->containsChinese($string)) {
                    break;
                }
            }
        }

        // Multibyte-safe lowercasing; strtolower() is byte-wise.
        $string = mb_strtolower($string, 'UTF-8');

        if (self::$setting['accent']) {
            $string = $instance->pinyin_addaccents($string);
        } else {
            $string = $instance->removeTone($string);
        }

        $string = $instance->removeUnwantedCharacters($string);
        $string = $instance->addDelimiter($string);

        return $instance->escape($string);
    }

    /**
     * Load the parsed dictionary, from the cache file when one exists.
     *
     * @return array list of array('simplified' => ..., 'pinyin_marks' => ...)
     *
     * @throws RuntimeException when the dictionary file cannot be read.
     */
    protected function loadDictionary()
    {
        $cacheFilename = $this->getCacheFilename($this->dictionary);

        if (file_exists($cacheFilename)) {
            $cached = $this->loadFromCache($cacheFilename);

            // A truncated or foreign cache file must not be used as a dictionary.
            if (is_array($cached)) {
                return $cached;
            }
        }

        $parsedDictionary = $this->parseDictionary($this->dictionary);
        $this->cache($cacheFilename, $parsedDictionary);

        return $parsedDictionary;
    }

    /**
     * Get the cache file name for a dictionary, creating ./cache if needed.
     *
     * The name is derived from the dictionary path only, so replacing the
     * dictionary file in place does not invalidate an existing cache.
     *
     * @param string $dictionary dictionary path.
     *
     * @return string
     */
    protected function getCacheFilename($dictionary)
    {
        $cacheDir = __DIR__ . '/cache/';

        is_dir($cacheDir) || mkdir($cacheDir, 0755, true);

        return $cacheDir . md5($dictionary);
    }

    /**
     * Parse the CC-CEDICT file into an array sorted by word length.
     *
     * Each data line looks like "TRADITIONAL SIMPLIFIED [pin1 yin1] /gloss/".
     * Comment lines (starting with '#') and lines that do not match that
     * shape are skipped.
     *
     * @param string $dictionary path of dictionary file.
     *
     * @return array
     *
     * @throws RuntimeException when the dictionary file cannot be read.
     */
    protected function parseDictionary($dictionary)
    {
        $lines = is_readable($dictionary) ? file($dictionary) : false;

        if ($lines === false) {
            // Failing loudly keeps an empty dictionary from being cached.
            throw new RuntimeException("Unable to read dictionary file: {$dictionary}");
        }

        $regex   = '#(.*?) (.*?) \[(.*?)\] /(.*)/#';
        $content = array();

        foreach ($lines as $entry) {
            if (0 === strpos($entry, '#')) {
                continue;
            }

            if (!preg_match($regex, $entry, $matches)) {
                continue;
            }

            // $matches[1] is the traditional form and $matches[4] the
            // translation; neither is needed for conversion.
            $content[] = array(
                'simplified'   => $matches[2],
                'pinyin_marks' => $matches[3],
            );
        }

        // Longest words first, so multi-character words are replaced before
        // the single characters they contain.
        usort($content, function ($a, $b) {
            return mb_strlen($b['simplified'], 'UTF-8') - mb_strlen($a['simplified'], 'UTF-8');
        });

        return $content;
    }

    /**
     * Load the dictionary from a cached PHP file.
     *
     * @param string $dictionary cached file name.
     *
     * @return array
     */
    protected function loadFromCache($dictionary)
    {
        return include $dictionary;
    }

    /**
     * Write the parsed dictionary to a PHP file that returns the array.
     *
     * The format has to be includable because loadFromCache() reads the
     * file with "return include".
     *
     * @param string $filename filename.
     * @param array  $array    parsed dictionary.
     *
     * @return void
     */
    protected function cache($filename, $array)
    {
        file_put_contents($filename, "<?php\nreturn " . var_export($array, true) . ";\n");
    }

    /**
     * Check whether the string still contains Chinese characters.
     *
     * NOTE(review): reconstructed; the published listing lost this method.
     *
     * @param string $string
     *
     * @return bool
     */
    protected function containsChinese($string)
    {
        return preg_match('/\p{Han}/u', $string) === 1;
    }

    /**
     * Remove everything except Latin letters, ASCII digits and whitespace.
     *
     * NOTE(review): reconstructed; the published listing lost this method.
     * The allowed set is inferred from the article's sample output, where
     * punctuation is dropped and tone-marked letters are kept.
     *
     * @param string $string
     *
     * @return string empty string when the input is not valid UTF-8.
     */
    protected function removeUnwantedCharacters($string)
    {
        $clean = preg_replace('/[^\p{Latin}0-9\s]+/u', '', $string);

        return $clean === null ? '' : $clean;
    }

    /**
     * Join the syllables with the configured delimiter.
     *
     * NOTE(review): reconstructed; the published listing lost this method.
     *
     * @param string $string whitespace-separated syllables.
     *
     * @return string
     */
    protected function addDelimiter($string)
    {
        $words = preg_split('/\s+/u', trim($string), -1, PREG_SPLIT_NO_EMPTY);

        return implode(self::$setting['delimiter'], $words === false ? array() : $words);
    }

    /**
     * Escape HTML special characters in the result.
     *
     * NOTE(review): reconstructed; the published listing lost this method
     * and its exact escaping rules are unknown - confirm against the
     * original project before relying on it.
     *
     * @param string $value
     *
     * @return string
     */
    protected function escape($value)
    {
        return htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Strip tone information from numbered pinyin.
     *
     * NOTE(review): the first pattern is reconstructed; only its
     * replacement 'u' survived in the published listing.
     *
     * @param string $string numbered pinyin, e.g. "lu:3 xing2".
     *
     * @return string pinyin without tones, e.g. "lu xing".
     */
    protected function removeTone($string)
    {
        $replacement = array(
            '/u:/' => 'u',
            '/\d/' => '',
        );

        return preg_replace(array_keys($replacement), $replacement, $string);
    }

    /**
     * Convert numbered pinyin to pinyin with tone marks.
     *
     * Credits for these 2 functions go to Bouke Versteegh, who shared these
     * at http://stackoverflow.com/questions/1598856/convert-numbered-to-accentuated-pinyin
     *
     * @param string $string The pinyin string with tone numbers, i.e. "ni3 hao3"
     *
     * @return string The formatted string with tone marks, i.e. "nǐ hǎo"
     */
    protected function pinyin_addaccents($string)
    {
        // CC-CEDICT writes ü as "u:". Convert it first: once the tone mark is
        // on the "u" the "u:" sequence no longer exists and "lu:3" would end
        // up as "lǔ" instead of "lǚ".
        $string = str_replace(array('u:', 'U:'), array('ü', 'Ü'), $string);

        // Find words with a number behind them, and replace with callback fn.
        $result = preg_replace_callback(
            '~([a-zA-ZüÜ]+)(\d)~u',
            array($this, 'pinyin_addaccents_cb'),
            $string
        );

        // preg_replace_callback() returns null on error (e.g. invalid UTF-8).
        return $result === null ? $string : $result;
    }

    /**
     * Callback for pinyin_addaccents(): put the tone mark on one syllable.
     *
     * @param array $match array(full match, syllable, tone digit).
     *
     * @return string the accented syllable, or the untouched match when the
     *                digit is not a pinyin tone (1-5).
     */
    protected function pinyin_addaccents_cb($match)
    {
        static $accentmap = null;

        if ($accentmap === null) {
            // Where to place the accent marks
            $stars =
                'a* e* i* o* u* ü* ' .
                'A* E* I* O* U* Ü* ' .
                'a*i a*o e*i ia* ia*o ie* io* iu* ' .
                'A*I A*O E*I IA* IA*O IE* IO* IU* ' .
                'o*u ua* ua*i ue* ui* uo* üe* ' .
                'O*U UA* UA*I UE* UI* UO* ÜE*';
            $nostars =
                'a e i o u ü ' .
                'A E I O U Ü ' .
                'ai ao ei ia iao ie io iu ' .
                'AI AO EI IA IAO IE IO IU ' .
                'ou ua uai ue ui uo üe ' .
                'OU UA UAI UE UI UO ÜE';

            // Build an array like array('a' => 'a*') and store statically
            $accentmap = array_combine(explode(' ', $nostars), explode(' ', $stars));
        }

        static $vowels = array('a*', 'e*', 'i*', 'o*', 'u*', 'ü*', 'A*', 'E*', 'I*', 'O*', 'U*', 'Ü*');

        static $pinyin = array(
            1 => array('ā', 'ē', 'ī', 'ō', 'ū', 'ǖ', 'Ā', 'Ē', 'Ī', 'Ō', 'Ū', 'Ǖ'),
            2 => array('á', 'é', 'í', 'ó', 'ú', 'ǘ', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ǘ'),
            3 => array('ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'Ǎ', 'Ě', 'Ǐ', 'Ǒ', 'Ǔ', 'Ǚ'),
            4 => array('à', 'è', 'ì', 'ò', 'ù', 'ǜ', 'À', 'È', 'Ì', 'Ò', 'Ù', 'Ǜ'),
            5 => array('a', 'e', 'i', 'o', 'u', 'ü', 'A', 'E', 'I', 'O', 'U', 'Ü'),
        );

        list(, $word, $tone) = $match;

        // Digits other than 1-5 are not tones (e.g. "win7"); leave the text alone.
        if (!isset($pinyin[$tone])) {
            return $match[0];
        }

        // Add star to vowelcluster (strtr picks the longest matching cluster)
        $word = strtr($word, $accentmap);

        // Replace starred letter with accented
        return str_replace($vowels, $pinyin[$tone], $word);
    }
}

感谢本文的作者:@安小超 www.joychao.cc

0 0