MapReduce Input Formats: NLineInputFormat

With TextInputFormat and KeyValueTextInputFormat, each mapper receives a different number of input lines, depending on the split size and the length of the lines. If you want every mapper to receive a fixed number of lines, use NLineInputFormat as the InputFormat. As with TextInputFormat, the key is the byte offset of the line within the file and the value is the line itself.
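As a quick preview of the driver settings used in the full example below, there are two equivalent ways to configure how many lines each mapper receives; both the static helper and the configuration key belong to org.apache.hadoop.mapreduce.lib.input.NLineInputFormat, and job here stands for the Job instance built in step 3:

// tell the job to use NLineInputFormat
job.setInputFormatClass(NLineInputFormat.class);
// option 1: the static helper
NLineInputFormat.setNumLinesPerSplit(job, 3);
// option 2: the underlying configuration key, LINES_PER_MAP,
// i.e. "mapreduce.input.lineinputformat.linespermap"
job.getConfiguration().setInt(NLineInputFormat.LINES_PER_MAP, 3);

The full example, run in local mode, follows.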
1. WordCountMapper
package hadoop.mr.input.nline;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WordCountMapper: emits (word, 1) for every word on an input line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Called once per line; with NLineInputFormat the key is still the
     * line's byte offset in the file and the value is the line itself.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");

        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
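Given the input line "hello world hello", for example, this mapper emits (hello, 1), (world, 1) and (hello, 1); aggregation is deferred entirely to the reducer. Note that splitting on a single space is fragile for real data, where line.split("\\s+") would be more robust.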


2. WordCountReducer

package hadoop.mr.input.nline;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCountReducer: sums the 1s emitted for each word.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
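Because this reduce function just sums its values, the same class could optionally be registered as a combiner in the driver to cut shuffle traffic; this one-liner is not part of the original example:

// optional: pre-aggregate map output on the map side before the shuffle
job.setCombinerClass(WordCountReducer.class);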
3. App

package hadoop.mr.input.nline;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver: runs the word count locally with NLineInputFormat.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/6.txt", "d:/java/mr/out"};
        Configuration conf = new Configuration();
        // run against the local file system in local (non-YARN) mode
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);

        job.setJobName("WordCount");
        job.setJarByClass(App.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // use the N-line input format
        job.setInputFormatClass(NLineInputFormat.class);
        // set the number of lines per split (N)
        job.getConfiguration().setInt(NLineInputFormat.LINES_PER_MAP, 3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(2);

        job.waitForCompletion(true);
    }
}
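Assuming 6.txt contains six lines (as its name suggests), NLineInputFormat with LINES_PER_MAP set to 3 produces two input splits and therefore two map tasks, regardless of the file's byte size; and with setNumReduceTasks(2) the final counts are spread across part-r-00000 and part-r-00001 under d:/java/mr/out.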
