hadoop python streaming 特殊文本解析

来源:互联网 发布:淘宝产品如何上架 编辑:程序博客网 时间:2024/05/05 18:47
#!/usr/bin/env python
"""Hadoop streaming mapper that projects selected keys out of key/value records.

Usage: the first command-line argument is a comma-separated list of keys.
Each line on stdin is split on tabs; the first column is skipped and every
remaining column is expected to look like ``key\\x01value``.  For each input
line, one output line is written containing the values of the requested keys
(in the requested order) joined by tabs; keys absent from the record yield an
empty string.
"""

import sys


def parse_fields(line):
    """Return a dict of the ``key\\x01value`` columns found in *line*.

    The trailing newline is removed first so it cannot leak into the value of
    the last column.  The first tab-separated column is skipped.  Columns that
    do not contain the ``\\x01`` separator are ignored instead of aborting the
    whole task.  When a key repeats, the last occurrence wins.
    """
    fields = {}
    for column in line.rstrip('\n').split('\t')[1:]:
        parts = column.split('\x01')
        if len(parts) < 2:
            # Malformed column (no separator): skip it rather than crash.
            continue
        fields[parts[0]] = parts[1]
    return fields


def select_values(line, keys):
    """Return the values of *keys* from *line*, tab-joined, '' when missing."""
    fields = parse_fields(line)
    return '\t'.join(fields.get(key, '') for key in keys)


def main(argv=None):
    """Run the mapper over stdin; return a process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s key1,key2,...\n'
                         % (argv[0] if argv else 'mapper'))
        return 2
    keys = argv[1].split(',')
    for line in sys.stdin:
        # sys.stdout.write works identically on Python 2 and Python 3.
        sys.stdout.write(select_values(line, keys) + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())



# Launch the mapper above as a Hadoop streaming job:
#   -input   /2.txt   path read as job input
#   -output  /out1/   path the job writes its results to
#   -mapper  runs 1.py with the comma-separated key list as its first argument
#   -reducer cat      passes the mapper output through unchanged
#   -file    submits the local script /home/hadoop/1.py along with the job
# NOTE(review): the mapper is invoked by name as '1.py', so the script
# presumably needs its shebang line and execute permission — confirm.
hadoop jar /home/hadoop/opt/hadoop-0.20.2-cdh3u2/contrib/streaming/hadoop-streaming-0.20.2-cdh3u2.jar -input /2.txt -output /out1/ -mapper '1.py cookie_aa_ad_gid,trackid,prereferer' -reducer cat  -file /home/hadoop/1.py