如何利用python中的langid，对文本语种进行分类

来源：互联网　发布：java闰年判断switch　编辑：程序博客网　时间：2024/05/23 01:17

1、首先下载langid

链接：Github主页：https://github.com/saffsd/langid.py

下载并解压到本地文件夹，例如：F:\githubDownLoad\langid.py-master

2、在该文件夹下面新建一个python文件 languageID.py

import langid #引入langid模块
import os
# Classify every file in Findpath by its dominant language and write one
# "<path>\t<label>" line per file to languageID.txt.
#
# Each non-blank line of a file is classified with langid; the label with the
# most lines wins. Labels: ch (Chinese), en (English), ko (Korean),
# ja (Japanese), x (anything else, or no classifiable lines at all).

Findpath = "D:/歌词/"  # directory containing the files to classify

# langid language code -> label written to the report. Japanese is "ja";
# the code "mr" used previously is Marathi, so Japanese lines were never
# counted as Japanese.
CODE_TO_LABEL = {"zh": "ch", "en": "en", "ko": "ko", "ja": "ja"}

# When several labels share the highest count, the earliest one here wins.
LABEL_PRIORITY = ("ch", "en", "ko", "ja", "x")

# `with` guarantees both files are closed even if classification raises.
with open("languageID.txt", 'w') as fout:
    for fn in os.listdir(Findpath):
        fullfilename = os.path.join(Findpath, fn)
        if not os.path.isfile(fullfilename):
            # os.listdir also returns sub-directories, which cannot be opened.
            continue

        counts = dict.fromkeys(LABEL_PRIORITY, 0)

        # Read bytes and decode explicitly: calling .decode() on a text-mode
        # line fails on Python 3, while bytes decode on both Python 2 and 3.
        with open(fullfilename, 'rb') as fin:
            for eachLine in fin:
                line = eachLine.strip().decode('utf-8', 'ignore')
                if not line:
                    # A blank line carries no language evidence; do not let it
                    # vote.
                    continue
                # classify() returns a 2-tuple; item 0 is the language code.
                code = langid.classify(line)[0]
                counts[CODE_TO_LABEL.get(code, "x")] += 1

        if any(counts.values()):
            # max() returns the first maximal item, so ties follow
            # LABEL_PRIORITY order.
            label = max(LABEL_PRIORITY, key=lambda name: counts[name])
        else:
            # No classifiable lines: report "unknown" rather than defaulting
            # to the first label.
            label = "x"

        fout.write(fullfilename + '\t' + label + '\n')

0 0