
来源:互联网 发布:使用sql语句创建数据库 编辑:程序博客网 时间:2024/06/05 19:02

<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">继TIMIT之后，这次来分析LibriSpeech的词频，文件组织结构如图所示：</span>







"""Count word frequencies over every transcript file below ``rootDir``.

Step 1 lists every ``.txt`` file found under ``rootDir`` into pathDoc.txt,
step 2 counts the words of each listed file, and step 3 writes one
``word count`` line per distinct word into keyWords.txt.
"""
import os
import os.path

rootDir = "dev-clean"

# Output locations. Built with os.path.join so the separator is right on every
# platform (the result on Windows is the same path the script always used).
outputDir = "dreaminghzAnalysedData"
pathDocFile = os.path.join(outputDir, "pathDoc.txt")
keyWordsFile = os.path.join(outputDir, "keyWords.txt")


def keywordCounter(fileDir, keywordContainer):
    """Add the word counts of one transcript file to ``keywordContainer``.

    The first whitespace-separated token of every line is skipped
    (presumably the utterance id of a LibriSpeech transcript line -- confirm
    against the data); every remaining token is counted as one word.

    Args:
        fileDir: Path of the transcript file to read.
        keywordContainer: Dict mapping word -> count. It is updated in place,
            so counts accumulate across calls.

    Returns:
        None.
    """
    with open(fileDir) as f:
        for row in f:
            # split() with no argument ignores the trailing newline, repeated
            # spaces and blank lines, so no empty "word" is ever counted and
            # a final line without a newline keeps its last character.
            for word in row.split()[1:]:
                keywordContainer[word] = keywordContainer.get(word, 0) + 1


def main():
    """Run the three steps: list transcripts, count words, save the counts."""
    # The output files cannot be created unless their directory exists.
    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    # 1
    print("Step 1: Get absolute directory of all the transcript file inside this folder")
    # NOTE: the recorded paths are rootDir joined with the sub-path, so they
    # are only absolute if rootDir itself is absolute.
    with open(pathDocFile, "w") as transPathDoc:
        for parent, dirnames, filenames in os.walk(rootDir):
            for filename in filenames:
                if filename.endswith(".txt"):
                    transPathDoc.write(os.path.join(parent, filename) + "\n")
    print("Step 1 finished")

    # 2
    print("Step 2: Read in all the transcript file and do the word counting")
    keywordContainer = {}
    with open(pathDocFile) as pathes:
        for pathLine in pathes:
            transcriptPath = pathLine.rstrip("\n")
            if transcriptPath:
                keywordCounter(transcriptPath, keywordContainer)
    print("Step 2 finished")

    # claim
    print("There's totally " + str(len(keywordContainer)) + " keywords")

    # 3
    print("Step 3: Save keyword into " + keyWordsFile)
    with open(keyWordsFile, "w") as outfile:
        for word, count in keywordContainer.items():
            outfile.write(word + " " + str(count) + "\n")


if __name__ == "__main__":
    main()


"""Find the high frequency keywords.

Reads ``keyWords.txt`` (one ``word count`` pair per line), asks for a lower
bound and copies every row whose count is at least that bound into
``keywords-noless-<bound>-times.txt``.
"""


def askLowerBound(prompt="Input the lower bound of frequency as positive integer pls:",
                  readLine=None):
    """Prompt until a positive integer is entered and return it.

    Args:
        prompt: Text shown for every attempt.
        readLine: Callable taking the prompt and returning the typed text.
            Defaults to ``raw_input`` on Python 2 and ``input`` on Python 3.

    Returns:
        The first answer that parses as an integer greater than zero.
    """
    if readLine is None:
        try:
            readLine = raw_input  # Python 2
        except NameError:
            readLine = input  # Python 3
    while True:
        # Only a malformed number triggers a retry; Ctrl-C and end-of-input
        # propagate instead of being swallowed into an endless loop.
        try:
            num = int(readLine(prompt))
        except ValueError:
            continue
        if num > 0:
            return num


def writeHighFrequencyRows(rows, outfile, lowerBound):
    """Copy the rows whose count is >= ``lowerBound`` to ``outfile``.

    Args:
        rows: Iterable of ``word count`` lines (an open file works).
        outfile: Object with a ``write`` method receiving the kept rows
            unchanged.
        lowerBound: Smallest count that is kept.

    Returns:
        The number of rows written.

    Raises:
        ValueError: If the last field of a non-blank row is not an integer.
    """
    written = 0
    for row in rows:
        fields = row.split()
        if not fields:
            # Blank row: nothing to compare.
            continue
        # The count is the last field. Reading it via split() keeps every
        # digit even when the final row has no trailing newline.
        if int(fields[-1]) >= lowerBound:
            outfile.write(row)
            written += 1
    return written


def main():
    """Ask for the bound, filter keyWords.txt and report how many rows matched."""
    # use the file generated by LibriWordCounter.py
    # Opened before prompting so a missing input file is reported right away.
    with open("keyWords.txt") as kw:
        num = askLowerBound()
        highFFilename = "keywords-noless-" + str(num) + "-times.txt"
        # read in and write down qualified keywords
        with open(highFFilename, "w") as highFkw:
            ctr = writeHighFrequencyRows(kw, highFkw, num)
    print("Finished, there's totally " + str(ctr) + " records written into the file")


if __name__ == "__main__":
    main()

0 0