字段占空比和特征字段占空比

来源：互联网发布：在app里找淘宝官方客服编辑：程序博客网时间：2024/05/01 05:57

测试文本：

hello|nice|chx|||hhh|yiyi|12345
hello2|nice2|chx2|5|heh2|hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5||heh5|hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
hello|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
hello|nice|chx|||hhh|yiyi|12345
hello2|nice2|chx2|5||hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5|||hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
hello|nice|chx||heh|hhh|yiyi|12345
hello2||chx2|5|heh2|hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5||heh5|hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
hello|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347

各个字段的非占空比：

mapper:

#!/usr/bin/env python
"""Hadoop-streaming mapper for the per-field non-empty ratio.

Reads '|'-separated records from stdin and, for each of the first
FIELD_COUNT fields of every record, writes "<index> TAB <tag> TAB" to
stdout, where <tag> is NULL for an empty field and NONULL otherwise.
"""
import sys

# Number of leading fields inspected in every record.
FIELD_COUNT = 8


def read_input(file, separator):
    """Yield every line of *file*, stripped and split on *separator*."""
    for line in file:
        yield line.strip().split(separator)


def tag_fields(words, field_count=FIELD_COUNT):
    """Return ``[(index, tag), ...]`` for the first *field_count* fields.

    A field is tagged "NULL" when it is empty after stripping whitespace,
    or when the record is too short to contain it; otherwise "NONULL".
    """
    tagged = []
    for index in range(field_count):
        # Short records are padded with empty fields instead of raising
        # IndexError and failing the whole map task.
        word = words[index].strip() if index < len(words) else ''
        tagged.append((index, "NONULL" if word else "NULL"))
    return tagged


def main(separator='|'):
    """Tag the fields of every stdin record and print one line per field."""
    for words in read_input(sys.stdin, separator):
        if words == ['']:
            # Blank input line: there is no record to tag.
            continue
        for index, tag in tag_fields(words):
            # A single parenthesised argument prints identically on
            # Python 2 and Python 3.
            print("%s\t%s\t" % (index, tag))


if __name__ == "__main__":
    main()

reducer:

#!/usr/bin/env python
"""Hadoop-streaming reducer for the per-field non-empty ratio.

Reads the mapper output ("<index> TAB <tag> TAB" per line) from stdin,
counts every (index, tag) pair, prints the raw counts and then prints
the non-empty ratio NONULL / (NULL + NONULL) for each field index.
"""
from __future__ import division
from operator import itemgetter
from itertools import groupby
import sys

# Number of field indices reported by get_result().
FIELD_COUNT = 8


def read_mapper_output(file, separator='\t'):
    """Yield ``[index, tag]`` for every line of *file*.

    Trailing whitespace (including the mapper's trailing tab) is removed
    before the line is split once on *separator*.
    """
    for line in file:
        yield line.rstrip().split(separator, 1)


def get_ff(data):
    """Count the records of *data* per "<index>-<tag>" key.

    Records with fewer than two fields (for example blank lines) are
    skipped instead of raising IndexError.
    """
    ff = {}
    for words in data:
        if len(words) < 2:
            continue
        key = words[0] + "-" + words[1]
        ff[key] = ff.get(key, 0) + 1
    return ff


def get_result(ff):
    """Print and return the non-empty ratio of every field index.

    Returns a dict mapping field index to its ratio. A field with no NULL
    records keeps the original result of 1 (this also covers a field with
    no records at all); a field with only NULL records now yields 0
    instead of being reported as fully populated.
    """
    fff = {}
    for i in range(FIELD_COUNT):
        null_count = ff.get("%s-NULL" % i, 0)
        non_null_count = ff.get("%s-NONULL" % i, 0)
        if null_count == 0:
            fff[i] = 1
        else:
            fff[i] = non_null_count / (null_count + non_null_count)
        print("%s\t%s" % (i, fff[i]))
    return fff


def get_count(ff):
    """Print every "<index>-<tag>" counter, sorted by key for stable output."""
    for key in sorted(ff):
        print("%s\t%s" % (key, ff[key]))


def main(separator='\t'):
    """Aggregate the mapper output on stdin and print counts and ratios."""
    data = read_mapper_output(sys.stdin, separator=separator)
    ff = get_ff(data)
    get_count(ff)
    get_result(ff)


if __name__ == '__main__':
    main()

按关键字段分组的各字段非占空比（以下标为 2 的字段，即第三列，作为分类字段）

mapper:

#!/usr/bin/env python
"""Hadoop-streaming mapper for the non-empty ratio grouped by a key field.

Reads '|'-separated records from stdin and, for each of the first
FIELD_COUNT fields of every record, writes
"<index> TAB <tag> TAB <key value> TAB" to stdout, where <tag> is NULL for
an empty field and NONULL otherwise, and <key value> is the record's field
at position KEY_INDEX.
"""
import sys

# Number of leading fields inspected in every record.
FIELD_COUNT = 8
# Position of the field whose value is attached to every output line.
KEY_INDEX = 2


def read_input(file, separator):
    """Yield every line of *file*, stripped and split on *separator*."""
    for line in file:
        yield line.strip().split(separator)


def tag_fields(words, field_count=FIELD_COUNT):
    """Return ``[(index, tag), ...]`` for the first *field_count* fields.

    A field is tagged "NULL" when it is empty after stripping whitespace,
    or when the record is too short to contain it; otherwise "NONULL".
    """
    tagged = []
    for index in range(field_count):
        # Short records are padded with empty fields instead of raising
        # IndexError and failing the whole map task.
        word = words[index].strip() if index < len(words) else ''
        tagged.append((index, "NONULL" if word else "NULL"))
    return tagged


def main(separator='|'):
    """Tag the fields of every stdin record, labelled with the key value."""
    for words in read_input(sys.stdin, separator):
        if words == ['']:
            # Blank input line: there is no record to tag.
            continue
        # Records too short to hold the key field get an empty key value.
        req = words[KEY_INDEX] if len(words) > KEY_INDEX else ''
        for index, tag in tag_fields(words):
            # A single parenthesised argument prints identically on
            # Python 2 and Python 3.
            print("%s\t%s\t%s\t" % (index, tag, req))


if __name__ == "__main__":
    main()

对应的reducer:

#!/usr/bin/env python
"""Hadoop-streaming reducer for the non-empty ratio grouped by a key field.

Reads the mapper output ("<index> TAB <tag> TAB <key value> TAB" per line)
from stdin, counts every (index, tag, key value) triple, prints the raw
counts and then prints, per key value, the non-empty ratio
NONULL / (NULL + NONULL) of every field except the key field itself.
"""
from __future__ import division
from operator import itemgetter
from itertools import groupby
import sys


def read_mapper_output(file, separator='\t'):
    """Yield ``[index, tag, key value]`` for every line of *file*.

    Only the line terminator is removed before splitting, so an empty key
    value (a line ending in two separators) still yields three fields.
    Trailing whitespace is then stripped from the last field, which drops
    the mapper's trailing tab.
    """
    for line in file:
        fields = line.rstrip('\r\n').split(separator, 2)
        fields[-1] = fields[-1].rstrip()
        yield fields


def get_ff(data):
    """Count the records of *data* per "<index>-<tag>-<key value>" key.

    Returns ``(ff, lis)``: the counter dict and the list of distinct key
    values in first-seen order. Records with fewer than two fields (for
    example blank lines) are skipped; a record without a third field is
    counted under the empty key value.
    """
    ff = {}
    # lis keeps the distinct key values in first-seen order; the set makes
    # the membership test O(1) per record.
    lis = []
    seen = set()
    for words in data:
        if len(words) < 2:
            continue
        no = words[0]
        tag = words[1]
        # Value of the key (feature) field for this record.
        req = words[2] if len(words) > 2 else ''
        if req not in seen:
            seen.add(req)
            lis.append(req)
        key = no + "-" + tag + "-" + req
        ff[key] = ff.get(key, 0) + 1
    return ff, lis


def get_result(ff, lis, field_count=8, key_index=2):
    """Print and return the non-empty ratio per field and key value.

    The list of key values is printed first, then one
    "<index> TAB <key value> TAB <ratio>" line for every key value in *lis*
    and every field index below *field_count* except *key_index*.

    Returns a dict mapping ``(index, key value)`` to the ratio. A field
    with no NULL records keeps the original result of 1 (this also covers
    a field with no records at all); a field with only NULL records now
    yields 0 instead of being reported as fully populated.
    """
    fff = {}
    print(lis)
    indices = [i for i in range(field_count) if i != key_index]
    for req in lis:
        for i in indices:
            null_count = ff.get("%s-NULL-%s" % (i, req), 0)
            non_null_count = ff.get("%s-NONULL-%s" % (i, req), 0)
            if null_count == 0:
                ratio = 1
            else:
                ratio = non_null_count / (null_count + non_null_count)
            fff[(i, req)] = ratio
            print("%s\t%s\t%s" % (i, req, ratio))
    return fff


def get_count(ff):
    """Print every counter of *ff*, sorted by key for stable output."""
    for key in sorted(ff):
        print("%s\t%s" % (key, ff[key]))


def main(separator='\t'):
    """Aggregate the mapper output on stdin and print counts and ratios."""
    data = read_mapper_output(sys.stdin, separator=separator)
    ff, lis = get_ff(data)
    get_count(ff)
    get_result(ff, lis)


if __name__ == '__main__':
    main()

最后放入hadoop集群中，通过streaming运行~

0 0