统计词频

来源:互联网 发布:剑三大师捏脸数据 编辑:程序博客网 时间:2024/05/16 09:25

具体操作步骤如下:

# 统计词频
# 1. 输入文章
# 2. 建立统计词频的空字典
# 3. 对文本的每一行统计词频
# 4. 获取字典数据对
# 5. 对数据对进行排序
# 6. 输出结果
# 7. 用 Turtle 库绘制词频结果图表
实现效果如下:
实现代码如下:
"""Count word frequencies in a text file and plot the top words with turtle.

Steps: read the file line by line, count every word, rank the
(frequency, word) pairs, print the most frequent ones and draw them as a
bar chart.
"""

# ---- Global settings -------------------------------------------------
count = 10    # how many of the most frequent words are printed and plotted
data = []     # frequencies of the top words, filled in by main()
words = []    # the top words themselves, parallel to `data`
yScale = 6    # vertical pixels per occurrence
xScale = 30   # horizontal pixels between neighbouring bars

# Characters treated as word separators.  The original literal ended in
# `\'"""`, i.e. an implicit concatenation with an empty string, so the
# double quote was never stripped; `!` was missing as well.
_PUNCTUATION = "~!@#$%^&*()_-+=<>?/,.:;{}[]|'\""
_PUNCTUATION_TABLE = str.maketrans(_PUNCTUATION, " " * len(_PUNCTUATION))


# ---- Turtle drawing helpers ------------------------------------------
def drawLine(t, x1, y1, x2, y2):
    """Draw a straight segment from (x1, y1) to (x2, y2) with turtle *t*."""
    t.penup()
    t.goto(x1, y1)
    t.pendown()
    t.goto(x2, y2)


def drawText(t, x, y, text):
    """Write *text* at position (x, y) with turtle *t*."""
    t.penup()
    t.goto(x, y)
    t.pendown()
    t.write(text)


def drawGraph(t):
    """Draw the axes, the labels and the bars for ``words`` / ``data``.

    At most ``count`` entries are drawn; fewer if the text contained
    fewer distinct words.
    """
    # x axis and y axis
    drawLine(t, 0, 0, 360, 0)
    drawLine(t, 0, 300, 0, 0)
    # Slots start at 1 so the first bar is shifted away from the origin.
    for slot, (word, freq) in enumerate(zip(words[:count], data), start=1):
        label_x = slot * xScale - 4
        drawText(t, label_x, -20, word)                 # word under the axis
        drawText(t, label_x, freq * yScale + 10, freq)  # value above the bar
    drawBar(t)


def drawRectangle(t, x, y):
    """Draw one bar: *x* is the slot number, *y* the word frequency."""
    x = x * xScale
    y = y * yScale
    left, right = x - 5, x + 5
    drawLine(t, left, 0, left, y)
    drawLine(t, left, y, right, y)
    drawLine(t, right, y, right, 0)
    drawLine(t, right, 0, left, 0)


def drawBar(t):
    """Draw one bar per entry of ``data`` (at most ``count`` bars)."""
    for slot, freq in enumerate(data[:count], start=1):
        drawRectangle(t, slot, freq)


# ---- Word counting ---------------------------------------------------
def processLine(line, wordCounts):
    """Add the words of *line* to the *wordCounts* dict (word -> count).

    Punctuation is turned into spaces first, then the line is split on
    whitespace.  *wordCounts* is updated in place.
    """
    for word in replacePunctuations(line).split():
        wordCounts[word] = wordCounts.get(word, 0) + 1


def replacePunctuations(line):
    """Return *line* with every punctuation character replaced by a space."""
    return line.translate(_PUNCTUATION_TABLE)


def _top_words(wordCounts, limit):
    """Return up to *limit* ``(word, freq)`` pairs, most frequent first.

    Ties are ordered by word in descending order, which is what sorting
    the (freq, word) pairs ascending and reading them backwards gives.
    """
    ranked = sorted(
        ((freq, word) for word, freq in wordCounts.items()), reverse=True
    )
    return [(word, freq) for freq, word in ranked[:max(limit, 0)]]


def main():
    """Ask for a file name, print the top words and draw the bar chart."""
    # `raw_input` only exists on Python 2; the rest of the script is Python 3.
    filename = input("enter a filename: ").strip()

    wordCounts = {}
    with open(filename, "r") as infile:  # closed even if counting fails
        for line in infile:
            processLine(line.lower(), wordCounts)

    # Never index past the number of distinct words actually found.
    top = _top_words(wordCounts, count)
    # Replace the contents in place so a repeated call does not accumulate.
    words[:] = [word for word, _ in top]
    data[:] = [freq for _, freq in top]
    for word, freq in top:
        print(f"{word}\t{freq}")

    # Imported here rather than at module level so the counting helpers
    # can be imported on machines without Tk.
    import turtle

    turtle.title("词频结果柱状图")
    turtle.setup(900, 750, 0, 0)
    t = turtle.Turtle()
    t.hideturtle()
    t.width(3)
    drawGraph(t)
    turtle.done()


if __name__ == '__main__':
    main()

原创粉丝点击