Hadoop 实例

来源:互联网 发布:sql游标的用法 编辑:程序博客网 时间:2024/06/05 14:54
#!/bin/bash
#
# Submits a Hadoop Streaming job that runs a Python mapper and a Python
# reducer over ${hdp_input} and writes the result to ${hdp_output}.
#
# Variables read but not assigned in this script (presumably exported by the
# environment or by the sourced ../yew_functions.sh -- TODO confirm):
#   HADOOP_HOME, HADOOP_STREAMING_HOME, STREAMING_JAR, PYTHON_LIB

source "../yew_functions.sh"

hdp_input="/file/stat.bz2"
hdp_output="/user/out"
hadoop="${HADOOP_HOME}/bin/hadoop"

# Clear the output path left by a previous run. The exit status is
# deliberately not checked, so a missing path does not stop the script.
"${hadoop}" fs -rmr "${hdp_output}"

# Local scripts shipped to the job with -file.
mapper_file="parseuv_mapper.py"
reducer_file="reducer.py"

# The commands are derived from the shipped file names so the script a task
# executes is always one that was actually shipped. (Previously the commands
# hard-coded "mapper.py" / "parseuv_reducer.py", which did not match the
# files passed to -file.) Only the base name is used because the command
# refers to the shipped copy by its file name.
mapper_cmd="python27/bin/python ${mapper_file##*/}"
reducer_cmd="python27/bin/python ${reducer_file##*/}"

"${hadoop}" jar "${HADOOP_STREAMING_HOME}/${STREAMING_JAR}" \
  -D mapred.job.name="[test]" \
  -D mapred.reduce.tasks="1" \
  -cacheArchive "${PYTHON_LIB}/python27.tar.gz#python27" \
  -mapper "${mapper_cmd}" \
  -reducer "${reducer_cmd}" \
  -input "${hdp_input}" \
  -output "${hdp_output}" \
  -file "${mapper_file}" \
  -file "${reducer_file}"

# coding:utf8
"""Streaming mapper: emit the uid found in each input line.

Each input line is split on whitespace. The first token that starts with
"id" is split on ":"; when that yields exactly two parts, the second part is
written on its own output line. Lines with no such token, or whose first
"id" token does not split into exactly two parts, produce no output.
"""
import sys


def extract_uid(line):
    """Return the uid carried by ``line``, or None when nothing is emitted.

    Only the first token starting with "id" is examined; later tokens are
    ignored even if the first one is rejected.
    """
    for token in line.strip().split():
        if not token.startswith("id"):
            continue
        parts = token.split(":")
        if len(parts) != 2:
            # Malformed first "id" token: give up on the whole line.
            return None
        # May be an empty string (e.g. token "id:"); it is still emitted.
        return parts[1]
    return None


def main(lines=None, out=None):
    """Read ``lines`` (default: stdin) and write one uid per line to ``out``
    (default: stdout)."""
    if lines is None:
        lines = sys.stdin
    if out is None:
        out = sys.stdout
    for line in lines:
        uid = extract_uid(line)
        if uid is not None:
            out.write(uid + "\n")


if __name__ == "__main__":
    main()

# coding:utf8
"""Streaming reducer: count runs of identical consecutive input lines.

Every time a stripped input line differs from the previous one, the counter
is incremented. On input where equal lines are adjacent, this is the number
of distinct lines. The total is written as "Number of records:<count>".
"""
import sys


def count_runs(lines):
    """Return how many times the stripped line value changes across ``lines``.

    The comparison starts from an empty string, so empty lines at the very
    start of the input are not counted.
    """
    count = 0
    previous = ""
    for line in lines:
        line = line.strip()
        if line != previous:
            count += 1
            previous = line
    return count


def main(lines=None, out=None):
    """Count runs in ``lines`` (default: stdin) and write the summary line to
    ``out`` (default: stdout)."""
    if lines is None:
        lines = sys.stdin
    if out is None:
        out = sys.stdout
    out.write("Number of records:%s\n" % count_runs(lines))


if __name__ == "__main__":
    main()

0 0
原创粉丝点击