Python 省市分词提取

来源:互联网 发布:linux 网卡配置网关 编辑:程序博客网 时间:2024/06/04 23:20
第一次使用 Python 编写项目程序:实现地址的省、市提取,并补全地址中缺省的省市名称。
已知地址表file1.xls,包含省、市、县名称。
从数据库取出的地址通过jieba分词,再与本地表比较,最后将省和市分别列出来。
# Extract the province and city of customer addresses.
#
# Addresses are read from MySQL, tokenized with jieba, and the leading tokens
# are compared against a local reference table whose first three columns are
# used as province, city and county names. The customers, with two extra
# columns ("Province" and "City"), are written to an Excel file.

from contextlib import closing

import numpy as np
import pandas as pd

# Connection settings; the values are the redacted placeholders from the
# original script and must be filled in before running.
DB_CONFIG = {
    "host": "*.*.*.*",
    "user": "*",
    "passwd": "*",
    "port": 0,
    "db": "*",
    "charset": "utf8",
}
CUSTOMER_QUERY = "SELECT customer_id,customer_name,address FROM t_customer  ORDER BY customer_id limit 50"
REGION_TABLE_PATH = 'D:\\file\\file1.xls'
OUTPUT_PATH = 'D:\\file\\省市分类表.xlsx'

# Marker written to "City" when only the province could be identified.
NO_CITY = "not"


def regions_from_frame(table):
    """Convert the reference table into a list of plain string triples.

    Args:
        table: DataFrame whose first three columns are read as province,
            city and county names.

    Returns:
        A list of ``(province, city, county)`` tuples of ``str``. Missing
        cells become ``""`` so that substring tests against them are simply
        false instead of raising ``TypeError`` on a float NaN.

    Raises:
        ValueError: if the table has fewer than three columns.
    """
    if table.shape[1] < 3:
        raise ValueError("region table needs province, city and county columns")
    regions = []
    for row in table.iloc[:, :3].itertuples(index=False, name=None):
        regions.append(tuple("" if pd.isnull(cell) else str(cell) for cell in row))
    return regions


def match_province_city(tokens, regions):
    """Look up the province and city for one tokenized address.

    Only the first two tokens are used. The regions are scanned in order:

    * if the first token is contained in a row's county or city name, that
      row's province and city are returned immediately;
    * if it is contained in the province name, the province is recorded and
      the second token is tried against that row's city and county; when it
      does not match, the scan continues with the following rows.

    Args:
        tokens: sequence of non-empty address tokens, in address order.
        regions: iterable of ``(province, city, county)`` string triples.

    Returns:
        ``(province, city)``. Both are ``""`` when nothing matched; city is
        ``NO_CITY`` when only a province was found.
    """
    province, city = "", ""
    if not tokens:
        return province, city

    first = tokens[0]
    second = tokens[1] if len(tokens) > 1 else None

    for region_province, region_city, region_county in regions:
        if first in region_county or first in region_city:
            return region_province, region_city
        if first in region_province:
            province = region_province
            if second is None:
                # Nothing left to narrow the city down with.
                return province, NO_CITY
            if second in region_city or second in region_county:
                return province, region_city
            # Keep scanning: a later row of this province may hold the city.
            city = NO_CITY
    return province, city


def tokenize_address(address):
    """Split an address string into jieba tokens, dropping whitespace tokens.

    Whitespace-only tokens are removed because a leading blank would
    otherwise become the "first token" and prevent any match.
    """
    # Imported lazily so the matching helpers can be imported and tested
    # without jieba installed.
    import jieba

    return [token for token in jieba.cut(address, cut_all=False) if token.strip()]


def add_province_city(customers, regions, tokenize=tokenize_address):
    """Return a copy of ``customers`` with "Province" and "City" columns.

    Args:
        customers: DataFrame with an ``address`` column.
        regions: list of ``(province, city, county)`` string triples.
        tokenize: callable turning an address string into a token list.

    Returns:
        A new DataFrame; rows whose address is missing or cannot be matched
        get ``""`` in both new columns.
    """
    provinces = []
    cities = []
    for address in customers["address"]:
        # pd.isnull covers both None (SQL NULL) and float NaN; the original
        # identity test against numpy's NaN object missed both in practice.
        tokens = [] if pd.isnull(address) else tokenize(str(address))
        province, city = match_province_city(tokens, regions)
        provinces.append(province)
        cities.append(city)

    result = customers.copy()
    result["Province"] = provinces
    result["City"] = cities
    return result


def load_customers():
    """Fetch the customer rows from MySQL and always release the connection."""
    # Imported lazily so the rest of the module works without the DB driver.
    import MySQLdb

    conn = MySQLdb.connect(**DB_CONFIG)
    try:
        with closing(conn.cursor()) as cursor:
            # Kept from the original script to avoid garbled text.
            cursor.execute("SET NAMES utf8")
        return pd.read_sql(CUSTOMER_QUERY, con=conn)
    finally:
        conn.close()


def main():
    """Run the extraction end to end and write the result to OUTPUT_PATH."""
    # Load the local table first so a bad path fails before a connection
    # is opened.
    regions = regions_from_frame(pd.read_excel(REGION_TABLE_PATH))
    customers = load_customers()
    add_province_city(customers, regions).to_excel(OUTPUT_PATH)


if __name__ == "__main__":
    main()