hadoop 之 InputFormat类 --- KeyValueTextInputFormat 实例

来源：互联网发布：全国最好的python 编辑：程序博客网时间：2024/06/05 06:22

KeyValueTextInputFormat 介绍

文本被任务读取时，需要按某种格式读入。KeyValueTextInputFormat 是 InputFormat 类的一个具体子类，它定义的读取格式是这样的：

一行是一条记录;
读取后按照（key,value）对表示一条记录；
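一行中可能被分隔符分成多个区域（分隔符默认是制表符，也可以通过 mapreduce.input.keyvaluelinerecordreader.key.value.separator 配置成逗号或者其他字符）；以分隔符第一次出现的位置为界，第一个区域作为key，其后的全部内容（包括其他区域及其间的分隔符）作为value；如果一行中没有分隔符，则整行作为key，value为空。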

应用实例

1.要处理的数据，tradeinfoIn文件

zhangsan@163.com    6000    0   2014-02-20
lisi@163.com    2000    0   2014-02-20
lisi@163.com    0   100 2014-02-20
zhangsan@163.com    3000    0   2014-02-20
wangwu@126.com  9000    0   2014-02-20
wangwu@126.com  0   200     2014-02-20

2.被Job任务读入后的格式:

<zhangsan@163.com,  6000    0   2014-02-20>
<lisi@163.com,2000  0   2014-02-20>
<lisi@163.com,0 100 2014-02-20>
<zhangsan@163.com,3000  0   2014-02-20>
<wangwu@126.com,9000    0   2014-02-20>
<wangwu@126.com,0   200     2014-02-20>

3.代码
代码中关于KeyValueTextInputFormat的关键代码

job.setInputFormatClass(KeyValueTextInputFormat.class);

来设置文件被Job读入时的格式。

import java.io.IOException;import java.util.HashMap;import java.util.Map;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Partitioner;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import mapreduce.bean.InfoBeanMy;public class SumStepByTool extends Configured implements Tool{    public static class SumStepByToolMapper extends Mapper<Text, Text, Text, InfoBeanMy>{        private InfoBeanMy outBean = new InfoBeanMy();        private Text k = new Text();        @Override        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException{            String line = value.toString();            String[] fields = line.split("\t");            String account = key.toString();            double income = Double.parseDouble(fields[0]);            double expense = Double.parseDouble(fields[1]);            outBean.setFields(account, income, expense);            k.set(account);            context.write(key, outBean);        }    }    public static class SumStepByToolReducer extends Reducer<Text, InfoBeanMy, Text, InfoBeanMy>{        private InfoBeanMy outBean = new InfoBeanMy();        @Override        protected void reduce(Text key, Iterable<InfoBeanMy> values, Context context) throws IOException, InterruptedException{            double income_sum = 0;            double expense_sum = 0;            for(InfoBeanMy infoBeanMy : values)            {                income_sum += infoBeanMy.getIncome();                expense_sum += infoBeanMy.getExpense();            }     
       outBean.setFields("", income_sum, expense_sum);            context.write(key, outBean);        }    }    public static class SumStepByToolPartitioner extends Partitioner<Text, InfoBeanMy>{        private static Map<String, Integer> accountMap = new HashMap<String, Integer>();         static {            accountMap.put("zhangsan", 1);            accountMap.put("lisi", 2);            accountMap.put("wangwu", 3);        }        @Override        public int getPartition(Text key, InfoBeanMy value, int numPartitions) {            String keyString = key.toString();            String name = keyString.substring(0, keyString.indexOf("@"));            Integer part = accountMap.get(name);            if (part == null )            {                part = 0;            }            return part;        }    }    public int run(String[] args) throws Exception {        Configuration conf = getConf();        Job job = Job.getInstance(conf);        job.setJarByClass(this.getClass());        job.setJobName("SumStepByTool");        //job.setInputFormatClass(TextInputFormat.class); //这个是默认的输入格式        job.setInputFormatClass(KeyValueTextInputFormat.class); //这个把一行记录的第一个区域当做key，其他区域作为value        job.setMapperClass(SumStepByToolMapper.class);        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(InfoBeanMy.class);        job.setReducerClass(SumStepByToolReducer.class);        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(InfoBeanMy.class);        job.setNumReduceTasks(3);        FileInputFormat.setInputPaths(job, new Path(args[0]));        FileOutputFormat.setOutputPath(job, new Path(args[1]));        return job.waitForCompletion(true) ? 0:-1;    }    public static void main(String[] args) throws Exception {        int exitCode = ToolRunner.run(new SumStepByTool(),args);        System.exit(exitCode);    }}

注意

跟默认的格式（TextInputFormat）不一样的地方在于，key不再是该行在文件中的字节偏移量（LongWritable 类型），而是该行第一个分隔符之前的文本（Text 类型）；

0 0