用Python和Shell结合进行词频统计

来源：互联网　发布：轩辕剑腰甲进阶数据　编辑：程序博客网　时间：2024/06/07 17:49

1、示例测试文本
/Users/nisj/PycharmProjects/EsDataProc/word.txt

foo|-X-|foo|-X-|quux|-X-|iio|-X-|oo|-X-|pp|-X-|pp|-X-|oosee|-X-|you|-X-|you|-X-|again|-X-|welcome|-X-|testtest|-X-|ddd|-X-|gggg|-X-|gggacc|-X-|aaa|-X-|ddddbbb|-X-|ddd|-X-|cccddd|-X-|ccc|-X-|aaawo|-X-|ni|-X-|tawho|-X-|am|-X-|iisds|wew|ww|-X-|kkxcx|-X-|xcxcxcxcxc

2、进行数据Map操作的Python
/Users/nisj/PycharmProjects/EsDataProc/wc_mapper.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Map step of a streaming word count.

Reads text lines from standard input, splits each line on the ``|-X-|``
delimiter and writes one ``word<TAB>1`` record per token to standard
output, ready to be sorted and fed to the reducer.
"""
import sys

# Token separator used by the sample input file (not whitespace).
DELIMITER = '|-X-|'


def iter_pairs(lines, delimiter=DELIMITER):
    """Yield a ``(word, 1)`` pair for every non-empty token in *lines*.

    Each line is stripped of leading/trailing whitespace before being
    split on *delimiter*. Empty tokens (blank lines, doubled or trailing
    delimiters) are skipped: emitting them would produce a record with an
    empty key, which the reducer cannot parse.
    """
    for line in lines:
        for word in line.strip().split(delimiter):
            if word:
                yield word, 1


def main(stdin=None, stdout=None):
    """Run the mapper from *stdin* to *stdout*.

    Both streams default to the process's standard streams; they are
    parameters only so the mapper can be driven with in-memory streams.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word, count in iter_pairs(stdin):
        # write() instead of print so the script runs on Python 2 and 3.
        stdout.write('%s\t%d\n' % (word, count))


if __name__ == '__main__':
    main()

3、进行数据Reduce操作的Python
/Users/nisj/PycharmProjects/EsDataProc/wc_reducer.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Reduce step of a streaming word count.

Reads ``word<TAB>count`` records from standard input and writes one
``word<TAB>total`` record per run of consecutive identical words to
standard output. The input must already be sorted by word (e.g. piped
through ``sort``) so that equal words are adjacent.
"""
import sys
from itertools import groupby
from operator import itemgetter


def _parse_pairs(lines):
    """Yield ``(word, count)`` for every well-formed record in *lines*.

    Records without a tab separator or with a non-integer count are
    skipped instead of aborting the whole run.
    """
    for line in lines:
        word, sep, count = line.strip().partition('\t')
        if not sep:
            # No tab at all (blank line, or an empty key whose leading
            # tab was removed by strip()): nothing to count.
            continue
        try:
            count = int(count)
        except ValueError:
            # Count field is not an integer: ignore this record.
            continue
        yield word, count


def _sum_runs(pairs):
    """Yield ``(word, total)`` for each run of consecutive equal words."""
    for word, run in groupby(pairs, key=itemgetter(0)):
        yield word, sum(count for _, count in run)


def main(stdin=None, stdout=None):
    """Run the reducer from *stdin* to *stdout*.

    Both streams default to the process's standard streams; they are
    parameters only so the reducer can be driven with in-memory streams.
    Empty input produces no output.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word, total in _sum_runs(_parse_pairs(stdin)):
        # write() instead of print so the script runs on Python 2 and 3.
        stdout.write('%s\t%s\n' % (word, total))


if __name__ == '__main__':
    main()

4、测试的Shell
/Users/nisj/PycharmProjects/EsDataProc/wc_batch.sh

#!/bin/bash
# Smoke test for the word-count pipeline: mapper | sort | reducer.

# Stop on the first failing command, including failures inside a pipeline.
set -euo pipefail

# Resolve ./wc_mapper.py, ./wc_reducer.py and ./word.txt relative to this
# script so it can be launched from any working directory.
cd "$(dirname "$0")"

# NOTE: this sample is space-separated, but wc_mapper.py splits on '|-X-|',
# so the whole sentence is treated as a single token in steps 1 and 2.

# 1. Mapper output only.
echo "foo foo quux labs foo bar quux" | python ./wc_mapper.py

# 2. Full pipeline on the inline sample. The reducer only merges adjacent
#    equal keys, so sort on the tab-delimited first field in byte order
#    (LC_ALL=C) to keep the grouping independent of the user's locale.
echo "foo foo quux labs foo bar quux" | python ./wc_mapper.py | LC_ALL=C sort -t $'\t' -k1,1 | python ./wc_reducer.py

# 3. Full pipeline on the sample file.
python ./wc_mapper.py < ./word.txt | LC_ALL=C sort -t $'\t' -k1,1 | python ./wc_reducer.py

0 0