一种不太完善的OpenStreetMap字典汉化方法

来源:互联网 发布:在哪购买淘宝小号靠谱 编辑:程序博客网 时间:2024/06/06 01:43

根据世界地名词典对 OpenStreetMap 数据进行汉化,使用了下面的代码。实现还不太完善,这里仅贴出来供参考。
地名词典可在我的资源"世界地名大词典"中下载。

#include <QCoreApplication>
#include <QDebug>
#include <QFile>
#include <QHash>
#include <QMap>
#include <QRegExp>
#include <QString>
#include <QStringList>
#include <QSqlDatabase>
#include <QSqlError>
#include <QSqlQuery>
#include <QTextStream>
#include <QVector>

#include <algorithm>

// Dictionary layout used throughout this file:
//   key   : upper-cased, '_'-joined prefix of a foreign place name
//   value : map from an integer rank to the candidate Chinese roots recorded
//           under that rank. The rank is (number of suffixes stripped from the
//           Chinese name) - (index of the last word in the prefix); lookups
//           take the entry with the smallest rank.

/// Builds the translation dictionary from the `national_place_names` table.
/// Throws a QString (the SQL error text) if the query fails.
QHash <QString, QMap<int,QVector<QString> > >  make_dictionary(QSqlDatabase db);

/// Dumps the dictionary to `dict.txt` next to the executable (best effort:
/// silently does nothing if the file cannot be opened).
void outputDictionary(QHash <QString, QMap<int,QVector<QString> > > dict);

/// Scans `tableName` and collects, in three parallel vectors, every row whose
/// name could be translated with `dict`.
/// Throws a QString (the SQL error text) if the query fails.
void prepareToTranslate(const QHash <QString, QMap<int,QVector<QString> > > dict,
                        QSqlDatabase db,
                        const QString & tableName,
                        QVector<qint64> & vec_osmid,
                        QVector<QString> & vec_rawName,
                        QVector<QString> & vec_TransName
                        );

/// Normalises a place name into its list of key words: upper-case, trim,
/// turn every separator (space, comma, '-', '.') into '_' and split on '_'.
/// Empty parts are kept, exactly as QString::split() produces them, so the
/// result always contains at least one element.
static QStringList splitIntoKeyWords(const QString & name)
{
    QString upperKey = name.toUpper().trimmed();
    upperKey.replace(QRegExp(QString::fromUtf8("[ ,, ]")),"_");
    upperKey.replace("-","_");
    upperKey.replace(".","_");
    return upperKey.split("_");
}

/// Joins the first `count` entries of `words` with '_'.
static QString joinKeyPrefix(const QStringList & words, int count)
{
    QString key;
    for (int j = 0; j < count; ++j)
    {
        if (j)
            key += "_";
        key += words.at(j);
    }
    return key;
}

/// Repeatedly strips the first matching entry of `suffixes` from the end of
/// `name` and returns what is left. A suffix is never stripped if that would
/// leave an empty string. `strippedCount` receives the number of suffixes
/// that were removed.
static QString stripPlaceSuffixes(const QString & name,
                                  const QVector<QString> & suffixes,
                                  int & strippedCount)
{
    QString root = name;
    strippedCount = 0;
    bool strippedOne = false;
    do
    {
        strippedOne = false;
        for (int k = 0; k < suffixes.size(); ++k)
        {
            const QString & suffix = suffixes.at(k);
            if (!root.endsWith(suffix))
                continue;
            const QString shorter = root.left(root.length() - suffix.length());
            if (shorter.isEmpty())
                continue; // the whole name is this suffix; try the next one
            root = shorter;
            ++strippedCount;
            strippedOne = true;
            break; // restart from the top of the (longest-first) list
        }
    } while (strippedOne);
    return root;
}

/// Entry point: connects to the local PostGIS database, builds the place-name
/// dictionary, and rewrites the `name` / `trans_name_chs` columns of the four
/// planet_osm_* tables inside a single transaction. Every applied translation
/// is also logged to `trans.txt` next to the executable.
int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);
    QTextStream Stdout(stdout,QIODevice::WriteOnly);

    QSqlDatabase db = QSqlDatabase::addDatabase("QPSQL");
    if (!db.isValid())
        return 0;
    db.setHostName("127.0.0.1");
    db.setDatabaseName("gis");
    db.setUserName("archosm");
    db.setPassword("archosm");
    if (!db.open())
    {
        Stdout << db.lastError().text()<<"\n";
        qDebug() << db.lastError().text();
        return 0;
    }

    try
    {
        const QHash <QString, QMap<int,QVector<QString> > > dict = make_dictionary(db);
        outputDictionary(dict);

        // Start to translate.
        const QString tableNames[4] = {
            QString("planet_osm_line"),
            QString("planet_osm_point"),
            QString("planet_osm_polygon"),
            QString("planet_osm_roads")
        };

        // Log file listing every translation that gets applied.
        QFile fpDict(QCoreApplication::applicationDirPath()+"/trans.txt");
        if (!fpDict.open(QIODevice::WriteOnly))
        {
            // Nothing has been modified yet; report the problem and leave cleanly.
            Stdout << fpDict.errorString() << "\n";
            qDebug() << fpDict.errorString();
            db.close();
            return 0;
        }
        QTextStream stout(&fpDict);

        QSqlQuery queryUpdate(db);
        queryUpdate.setForwardOnly(true);
        db.transaction();
        for (int i = 0; i < 4; ++i)
        {
            QVector<qint64>  vec_osmid;
            QVector<QString> vec_rawName;
            QVector<QString> vec_TransName;
            prepareToTranslate(dict,db,tableNames[i],vec_osmid,vec_rawName,vec_TransName);

            // Collapse duplicates: one UPDATE per distinct raw name is enough,
            // because the statement matches rows by name.
            QMap<QString, QString> map_trans;
            const int nTransed = vec_osmid.size();
            for (int j = 0; j < nTransed; ++j)
                map_trans[vec_rawName[j]] = vec_TransName[j];

            // The statement text only depends on the table, so prepare it once
            // per table instead of once per row.
            if (!queryUpdate.prepare(QString("update %1 set name = ? , trans_name_chs = ? where name = ? and trans_name_chs is null;").arg(tableNames[i])))
                throw queryUpdate.lastError().text();

            for (QMap<QString, QString>::const_iterator it = map_trans.constBegin();
                 it != map_trans.constEnd(); ++it)
            {
                const QString & str_rawName = it.key();
                const QString & strTransName = it.value();
                stout<<tableNames[i]<<","<<str_rawName<<","<<strTransName<<"\n";

                queryUpdate.bindValue(0, str_rawName + ","+strTransName);
                queryUpdate.bindValue(1, strTransName);
                queryUpdate.bindValue(2, str_rawName);
                if (!queryUpdate.exec())
                    throw queryUpdate.lastError().text();

                // Flush per row so the log stays usable if the run is interrupted.
                stout.flush();
                fpDict.flush();
            }
        }
        db.commit();
        fpDict.close();
    }
    catch (const QString & errMessage)
    {
        db.rollback();
        Stdout<<"Error!"<<errMessage<<"\n";
        qDebug()<<"Error!"<<errMessage;
    }

    db.close();
    Stdout<<"Finished!\n";
    qDebug()<<"Finished!";
    // This is a one-shot batch tool: no event loop is needed, so just return.
    return 0;
}

// Preprocess the raw data and build the dictionary.
QHash <QString, QMap<int,QVector<QString> > >  make_dictionary(QSqlDatabase db)
{
    // Stripping these suffixes yields more usable word roots. Order matters:
    // the first matching entry wins, so longer suffixes come first.
    static const char * const kTails[] = {
        "国家野生动物保护区", "国家森林公园", "野生动物保护区", "森林公园",
        "国家公园", "深海平原", "海底峡谷", "断裂带", "自治区",
        "裂口", "盐湖", "内湖", "海岭", "环礁", "大区", "机场", "山口",
        "公园", "半岛", "冰川", "沙漠", "峡谷", "山谷", "海沟", "水道",
        "水库", "大坝", "神庙", "干河", "平原", "海岸", "群岛", "火山",
        "浅滩", "大桥", "洼地", "瀑布", "海峡", "熔岩",
        "岛", "湖", "湾", "山", "河", "滩", "村", "市", "坝", "港",
        "区", "县", "省", "礁", "角", "峰", "站", "岭"
    };
    const int remvSz = static_cast<int>(sizeof(kTails) / sizeof(kTails[0]));
    QVector<QString> lst_tails;
    lst_tails.reserve(remvSz);
    for (int k = 0; k < remvSz; ++k)
        lst_tails.push_back(QString::fromUtf8(kTails[k]));

    QSqlQuery query(db);
    query.setForwardOnly(true);
    if (!query.exec("select * from national_place_names"))
        throw query.lastError().text();

    // Separators inside the foreign name (including the cross-reference
    // marker "见") and inside the Chinese translation.
    const QRegExp nameSeparators(QString::fromUtf8("[〈〉见,()]"));
    const QRegExp transSeparators(QString::fromUtf8("[()(),;。]"));

    QHash <QString, QMap<int,QVector<QString> > >  hash_dict;
    // Make dictionary.
    while (query.next())
    {
        // Drop the HTML markup embedded in the source data.
        QString raw_name = query.value("place_name").toString();
        raw_name.replace("<u>","")
                .replace("</u>","")
                .replace("<rt>","")
                .replace("</rt>","")
                .replace("<ruby>","")
                .replace("</ruby>","");
        const QString raw_trans = query.value("trans_name").toString();

        // Only the first alternative of the foreign name is indexed.
        const QStringList lst_raw_name = raw_name.split(nameSeparators,QString::SkipEmptyParts);
        if (lst_raw_name.isEmpty())
            continue;

        // Only the first alternative of the Chinese translation is used. It
        // does not depend on the key prefix, so compute it once per row.
        const QStringList chslists = raw_trans.split(transSeparators,QString::SkipEmptyParts);
        if (chslists.isEmpty())
            continue;
        int deleted = 0;
        const QString chs_value = stripPlaceSuffixes(chslists.first(), lst_tails, deleted);

        // Register the root under every prefix of the key words:
        // W0, W0_W1, W0_W1_W2, ...
        const QStringList listWordsKey = splitIntoKeyWords(lst_raw_name.first());
        const int n = listWordsKey.size();
        QString finalKey;
        for (int i = 0; i < n; ++i)
        {
            if (i)
                finalKey += "_";
            finalKey += listWordsKey.at(i);
            hash_dict[finalKey][deleted-i].push_back(chs_value);
        }
    }
    return hash_dict;
}

// Writes one line per key to dict.txt: "KEY:rank={v1,v2,rank}; rank={...}; ".
void outputDictionary(const QHash <QString, QMap<int,QVector<QString> > > dict)
{
    QFile fpDict(QCoreApplication::applicationDirPath()+"/dict.txt");
    if (!fpDict.open(QIODevice::WriteOnly))
        return;
    QTextStream stout(&fpDict);

    QList<QString> words = dict.keys();
    std::sort(words.begin(),words.end());
    for (int w = 0; w < words.size(); ++w)
    {
        const QString & word = words.at(w);
        stout<<word<<":";
        // value() is a const lookup: unlike operator[] on a non-const copy it
        // never detaches (deep-copies) the implicitly shared hash.
        const QMap<int,QVector<QString> > vals = dict.value(word);
        for (QMap<int,QVector<QString> >::const_iterator it = vals.constBegin();
             it != vals.constEnd(); ++it)
        {
            const int simrt = it.key();
            const QVector<QString> & transs = it.value();
            stout<<simrt<<"={";
            for (int i = 0; i < transs.size(); ++i)
                stout<<transs[i]<<",";
            stout<<simrt<<"}; ";
        }
        stout<<"\n";
    }
    fpDict.close();
}

// For every named row of `tableName`, looks up the longest dictionary key that
// is a prefix of the row's key words and, on success, appends
// (osm_id, raw name, translation) to the three output vectors.
void prepareToTranslate(const QHash <QString, QMap<int,QVector<QString> > > dict,
                        QSqlDatabase db,
                        const QString & tableName,
                        QVector<qint64> & vec_osmid,
                        QVector<QString> & vec_rawName,
                        QVector<QString> & vec_TransName
                        )
{
    QSqlQuery query(db);
    query.setForwardOnly(true);
    if (!query.exec(QString("select osm_id,name from %1 where name > ' ';").arg(tableName)))
        throw query.lastError().text();

    while (query.next())
    {
        const qint64 osmid = query.value(0).toLongLong();
        const QString strRawName = query.value(1).toString();
        QString transName;

        if (strRawName.size() > 1)
        {
            const QStringList listWordsKey = splitIntoKeyWords(strRawName);
            const int n = listWordsKey.size();
            // Try prefixes from the longest to the shortest and stop at the
            // first one the dictionary knows.
            for (int i = n-1; i >= 0; --i)
            {
                const QString finalKey = joinKeyPrefix(listWordsKey, i + 1);
                const QHash <QString, QMap<int,QVector<QString> > >::const_iterator hit
                        = dict.constFind(finalKey);
                if (hit != dict.constEnd())
                {
                    // Very short keys (3 characters or fewer) are ignored.
                    // Otherwise take the first candidate stored under the
                    // smallest rank.
                    if (finalKey.size() > 3
                            && !hit.value().isEmpty()
                            && !hit.value().constBegin().value().isEmpty())
                        transName = hit.value().constBegin().value().first();
                    // NOTE(review): only the leading prefix is ever translated;
                    // the remaining words of the name are not looked up.
                    break;
                }
                // A lone first word is only tried when the name has exactly
                // one word; multi-word names stop after the two-word prefix.
                if (i < 2)
                    break;
            }
        }

        if (transName.size())
        {
            vec_osmid.push_back(osmid);
            vec_rawName.push_back(strRawName);
            vec_TransName.push_back(transName);
        }
    }
}
原创粉丝点击