MapReduce Input Formats: NLineInputFormat

With TextInputFormat and KeyValueTextInputFormat, each mapper receives a different number of input lines, depending on the split size and the length of the lines. If you want every mapper to receive a fixed number of lines, use NLineInputFormat as the InputFormat. As with TextInputFormat, the key is the byte offset of the line within the file and the value is the line itself.
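As a quick preview of the driver settings used in the full example below, there are two equivalent ways to configure how many lines each mapper receives; both the static helper and the configuration key belong to org.apache.hadoop.mapreduce.lib.input.NLineInputFormat, and job here stands for the Job instance built in step 3:

// tell the job to use NLineInputFormat
job.setInputFormatClass(NLineInputFormat.class);
// option 1: the static helper
NLineInputFormat.setNumLinesPerSplit(job, 3);
// option 2: the underlying configuration key, LINES_PER_MAP,
// i.e. "mapreduce.input.lineinputformat.linespermap"
job.getConfiguration().setInt(NLineInputFormat.LINES_PER_MAP, 3);

The full example, run in local mode, follows.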
1. WordCountMapper
package hadoop.mr.input.nline;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WordCountMapper: emits (word, 1) for every word on an input line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Called once per line; with NLineInputFormat the key is still the
     * line's byte offset in the file and the value is the line itself.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");

        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
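Given the input line "hello world hello", for example, this mapper emits (hello, 1), (world, 1) and (hello, 1); aggregation is deferred entirely to the reducer. Note that splitting on a single space is fragile for real data, where line.split("\\s+") would be more robust.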


2. WordCountReducer

package hadoop.mr.input.nline;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCountReducer: sums the 1s emitted for each word.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
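Because this reduce function just sums its values, the same class could optionally be registered as a combiner in the driver to cut shuffle traffic; this one-liner is not part of the original example:

// optional: pre-aggregate map output on the map side before the shuffle
job.setCombinerClass(WordCountReducer.class);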
3. App

package hadoop.mr.input.nline;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver: runs the word count locally with NLineInputFormat.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/6.txt", "d:/java/mr/out"};
        Configuration conf = new Configuration();
        // run against the local file system in local (non-YARN) mode
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);

        job.setJobName("WordCount");
        job.setJarByClass(App.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // use the N-line input format
        job.setInputFormatClass(NLineInputFormat.class);
        // set the number of lines per split (N)
        job.getConfiguration().setInt(NLineInputFormat.LINES_PER_MAP, 3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(2);

        job.waitForCompletion(true);
    }
}
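Assuming 6.txt contains six lines (as its name suggests), NLineInputFormat with LINES_PER_MAP set to 3 produces two input splits and therefore two map tasks, regardless of the file's byte size; and with setNumReduceTasks(2) the final counts are spread across part-r-00000 and part-r-00001 under d:/java/mr/out.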
