python分词脚本 注意python对于中文的编码方式

来源:互联网 发布:淘宝销量为0敢买吗 编辑:程序博客网 时间:2024/05/21 05:59

处理中文文本以及Windows下含中文的路径时需要格外注意,尤其是编码方式。

ASCII不能存储中文

Unicode是中文(以及其他文本)在内存中的表示方式,对应Python 2中的unicode类型。

UTF-8是中文保存到硬盘(文件)时常用的一种编码方式。

两者之间需要相互转换,尤其是在读写文件的时候。

下面的代码先decode的目的在于,将原本存于硬盘的UTF-8字节解析成Unicode,这样才能按字(而不是按字节)逐个处理,然后再转换成UTF-8写入文件

还有就是split对于分词来说十分有用

python下标是从0开始的。。。

# -*- coding: UTF-8 -*-
"""Turn a "text/tag" token corpus into one "<char> <label>" row per character.

Each input line is split on two spaces; the first token is skipped
(presumably a sentence id -- confirm against the data).  Every remaining
token has the form ``text/tag``.  Characters of tokens tagged nr, ns, nz or
nt are labelled ``<TAG>-B`` (first character) and ``I`` (the rest); all
other characters are labelled ``O``.  A blank line is written after every
input line.

The code runs unchanged on Python 2.7 and Python 3.3+.
"""
import io
import os
import sys
import re

# Unicode literal, so the non-ASCII Windows path needs no unicode(...) call.
SOURCE_PATH = u'C:/Users/Hit/Desktop/文本/199801.txt'
OUTPUT_PATH = 'new.txt'

# Tags whose tokens are labelled as entities.
ENTITY_TAGS = ('nr', 'ns', 'nz', 'nt')


def tag_line(line):
    """Return the list of "<char> <label>" rows for one decoded input line.

    Args:
        line: one line of the corpus as unicode text, with or without its
            trailing newline.

    Returns:
        A list of unicode strings without line terminators.  The list is
        empty when the line holds nothing but the skipped first token.
    """
    rows = []
    # Tokens are separated by two spaces; the first one is never tagged.
    for token in line.split(u'  ')[1:]:
        fields = token.split(u'/')
        if len(fields) < 2:
            # No "/tag" part: this is the newline left over after the last
            # separator, or an empty token from extra spaces.  Skipping it
            # avoids the IndexError the tag lookup would otherwise raise.
            continue
        text = fields[0]
        # split() drops the newline that clings to the last tag of a line.
        for tag in fields[1].split():
            if tag in ENTITY_TAGS:
                # A leading "[" on an entity token is not emitted.
                chars = text[1:] if text.startswith(u'[') else text
                for position, char in enumerate(chars):
                    if position == 0:
                        label = tag.upper() + u'-B'
                    else:
                        label = u'I'
                    rows.append(char + u' ' + label)
            else:
                rows.extend(char + u' O' for char in text)
    return rows


def convert_corpus(source_path=SOURCE_PATH, output_path=OUTPUT_PATH):
    """Read source_path line by line and write the labelled rows to output_path.

    io.open decodes the UTF-8 bytes on disk into unicode text when reading
    and encodes the text back to UTF-8 when writing, so the conversion
    happens once at the file boundary instead of once per character.
    The ``with`` blocks close both files even if processing fails.
    """
    with io.open(source_path, encoding='utf-8') as corpus:
        with io.open(output_path, 'w', encoding='utf-8') as out:
            for line in corpus:
                for row in tag_line(line):
                    out.write(row + u'\n')
                # Blank line after the rows of each input line.
                out.write(u'\n')


if __name__ == '__main__':
    convert_corpus()

# -*- coding: UTF-8 -*-
"""Label one corpus line per character, merging adjacent same-tag entities.

Each input line is split on two spaces; the first token is skipped
(presumably a sentence id -- confirm against the data).  Every remaining
token has the form ``text/tag``.  A token tagged nr, ns, nz or nt starts a
new entity (``<TAG>-B`` on its first character, ``I`` on the rest) unless
the token right before it carried the same tag, in which case all of its
characters are labelled ``I``.  Characters of other tokens are labelled
``O``.

The code runs unchanged on Python 2.7 and Python 3.3+.
"""
import io
import os
import sys
import re

# Unicode literal, so the non-ASCII Windows path needs no unicode(...) call.
SOURCE_PATH = u'C:/Users/Hit/Desktop/文本/199801.txt'
OUTPUT_PATH = 'new.txt'

# Tags whose tokens are labelled as entities.
ENTITY_TAGS = ('nr', 'ns', 'nz', 'nt')

# 1-based number of the only input line that gets converted by default.
TARGET_LINE = 4


def tag_line_merged(line):
    """Return the list of "<char> <label>" rows for one decoded input line.

    Args:
        line: one line of the corpus as unicode text, with or without its
            trailing newline.

    Returns:
        A list of unicode strings without line terminators.
    """
    rows = []
    previous_tag = u''
    # Tokens are separated by two spaces; the first one is never tagged.
    for token in line.split(u'  ')[1:]:
        fields = token.split(u'/')
        if len(fields) < 2:
            # No "/tag" part (the newline after the last separator, or an
            # empty token).  Skipping it here also keeps the previous-tag
            # bookkeeping from indexing a field that does not exist.
            continue
        text = fields[0]
        # strip() removes the newline that clings to the last tag of a line.
        tag = fields[1].strip()
        if tag in ENTITY_TAGS:
            continues_entity = tag == previous_tag
            # A leading "[" on an entity token is not emitted.
            chars = text[1:] if text.startswith(u'[') else text
            for position, char in enumerate(chars):
                if position == 0 and not continues_entity:
                    label = tag.upper() + u'-B'
                else:
                    label = u'I'
                rows.append(char + u' ' + label)
        else:
            rows.extend(char + u' O' for char in text)
        # Recorded for every tagged token, including the first word of the
        # line, so that a same-tag token following it is a continuation.
        previous_tag = tag
    return rows


def convert_corpus_merged(source_path=SOURCE_PATH, output_path=OUTPUT_PATH,
                          only_line=TARGET_LINE):
    """Write the labelled rows of the selected input line(s) to output_path.

    Args:
        source_path: UTF-8 text file to read.
        output_path: UTF-8 text file to create or overwrite.
        only_line: 1-based number of the single line to convert; pass None
            to convert every line.

    io.open decodes the UTF-8 bytes on disk into unicode text when reading
    and encodes the text back to UTF-8 when writing.  The ``with`` blocks
    close both files even if processing fails.
    """
    with io.open(source_path, encoding='utf-8') as corpus:
        with io.open(output_path, 'w', encoding='utf-8') as out:
            for number, line in enumerate(corpus, 1):
                if only_line is not None and number != only_line:
                    continue
                for row in tag_line_merged(line):
                    out.write(row + u'\n')
                # Blank line after the rows of each converted line.
                out.write(u'\n')


if __name__ == '__main__':
    convert_corpus_merged()


原创粉丝点击