删除文件中重复的词语

来源:互联网 发布:好的手机管理软件知乎 编辑:程序博客网 时间:2024/04/26 15:06
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Delete duplicate lines from a text file.

Reads the file given with ``-f/--file``, drops every line that repeats an
earlier one (first occurrences keep their original order) and writes the
result to ``output.txt`` in the current working directory.
"""
import sys
from optparse import OptionParser


def readfile(filename):
    """Return the lines of *filename* with their line terminators removed.

    Every line is stripped, including the last one, so a final line that
    ends with a newline compares equal to an identical earlier line.

    Prints "No such file" and exits with a non-zero status when the file
    cannot be opened.
    """
    try:
        f = open(filename)
    except OSError:
        print("No such file")
        # A failed run must not report success to the calling shell.
        sys.exit(1)
    with f:
        return [line.rstrip("\n") for line in f]


def unique(arr):
    """Return the distinct items of *arr* in first-occurrence order."""
    seen = set()
    result = []
    for item in arr:
        # Set membership keeps the whole pass linear; sorting a set by
        # ``arr.index`` would rescan the list for every element.
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def main():
    """Parse the command line, deduplicate the input file and report counts."""
    parser = OptionParser()
    # -f/--file is the path of the input file.
    # Example IDE run configuration (script parameters): -f path/to/words.txt
    parser.add_option("-f", "--file", dest="filename",
                      help="read the lines to deduplicate from FILE",
                      metavar="FILE")
    options, _args = parser.parse_args()
    if options.filename is None:
        # Without this check open(None) fails with a misleading message.
        parser.error("no input file given, use -f FILE")

    text = readfile(options.filename)
    text_dealed = unique(text)

    # Every output line is newline-terminated so the result is a regular
    # text file that can be fed back into this script unchanged.
    with open("output.txt", "w") as f:
        f.writelines(line + "\n" for line in text_dealed)

    deduplication_num = len(text) - len(text_dealed)
    print("success")
    print("The num of data from the source file        :" + str(len(text)))
    print("The num of data from the preprocessed file: :" + str(len(text_dealed)))
    print("The num of data removed                     :" + str(deduplication_num))


if __name__ == '__main__':
    main()
原创粉丝点击