MapReduce: implementing custom multi-file output


In a project of mine I ran into a problem while processing logs: I needed more than one key. For example, a log line looks like domain sip minf h b

During processing, the map phase has to emit both key: domain+minf, value: h+"|"+b and key: sip+minf, value: h+"|"+b, and some logic has to be applied on top of that, e.g. the values of identical keys must be accumulated.
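For illustration only (the field values below are made up, and I am using just the five fields named above, not the full 17/18-field lines the real mapper checks for), a single line such as

    www.example.com    172.20.20.37    20120101    100    2048

should contribute two map outputs:

    key: www.example.com+20120101     value: 100|2048
    key: 172.20.20.37+20120101        value: 100|2048

and if another line arrives with the same domain and minf, its h and b are added onto the existing 100 and 2048 instead of being emitted as a separate record.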

With an ordinary MR job, the results come out as multiple part-000* files, one per reducer, so there is no way to tell which output ended up in which file. That makes it awkward to feed the job's results into any further processing.

Below is how I handled it. Enough talk, here is the code:

ComplexKey is a class I wrote that implements the WritableComparable interface so the keys can be sorted; the point of sorting is that identical keys get grouped together and handled by the same reduce call (the partitioner below then keeps them on the same reducer).

package sina.dip.logfilter.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class ComplexKey implements WritableComparable<ComplexKey> {

    private Text name;
    private Text value;
    private Text minf;

    public ComplexKey() {
        this.name = new Text();
        this.value = new Text();
        this.minf = new Text();
    }

    public ComplexKey(String name, String value, String minf) {
        this.name = new Text(name);
        this.value = new Text(value);
        this.minf = new Text(minf);
    }

    public Text getName() {
        return name;
    }

    public void setName(Text name) {
        this.name = name;
    }

    public Text getValue() {
        return value;
    }

    public void setValue(Text value) {
        this.value = value;
    }

    public Text getMinf() {
        return minf;
    }

    public void setMinf(Text minf) {
        this.minf = minf;
    }

    @Override
    public int compareTo(ComplexKey c) {
        // Order by name, then value, then minf.
        int compare = name.compareTo(c.name);
        if (compare != 0) {
            return compare;
        }
        compare = value.compareTo(c.value);
        if (compare != 0) {
            return compare;
        }
        return minf.compareTo(c.minf);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        name.readFields(in);
        value.readFields(in);
        minf.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        name.write(out);
        value.write(out);
        minf.write(out);
    }
}
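As a quick sanity check (not part of the original project; the class name ComplexKeyDemo and the sample field values are mine), the key can be exercised like this. It sorts by name, then value, then minf, and round-trips through the Writable serialization MapReduce uses between map and reduce:

package sina.dip.logfilter.mr;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class ComplexKeyDemo {
    public static void main(String[] args) throws Exception {
        ComplexKey a = new ComplexKey("domain", "www.example.com", "20120101");
        ComplexKey b = new ComplexKey("sip", "172.20.20.37", "20120101");

        // Negative: "domain" sorts before "sip".
        System.out.println(a.compareTo(b));

        // Serialize and deserialize the key, as the framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        a.write(new DataOutputStream(bytes));
        ComplexKey copy = new ComplexKey();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.getValue()); // www.example.com
    }
}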
The partitioner class:

package sina.dip.logfilter.mr;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ComplexKeyPartitioner extends Partitioner<ComplexKey, Text> {

    @Override
    public int getPartition(ComplexKey key, Text value, int numPartitions) {
        // Partition on the "value" part of the key so identical keys land on the same reducer.
        return Math.abs(key.getValue().hashCode()) % numPartitions;
    }
}
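One small caveat worth noting (my own remark, not from the original post): Math.abs(Integer.MIN_VALUE) is still negative, so an unlucky hashCode could in theory produce a negative partition number. Hadoop's own HashPartitioner avoids this by masking the sign bit instead, and the same trick works here:

    @Override
    public int getPartition(ComplexKey key, Text value, int numPartitions) {
        // Masking the sign bit keeps the result non-negative even for Integer.MIN_VALUE.
        return (key.getValue().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }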




The map phase:

package sina.dip.logfilter.mr;

import java.io.IOException;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * All map-side results are collected in mapoutput and emitted together in cleanup()
 * (in-map aggregation). The accumulation logic lives on the map side to take pressure
 * off the reducers: with the accumulation in reduce, 100 GB of data took roughly ten-odd
 * minutes while the map phase alone took about one minute; after moving it into the map,
 * the whole job finished in under two minutes.
 */
public class AnalysisMapper extends Mapper<LongWritable, Text, ComplexKey, Text> {

    // Not written to in this version of the mapper; kept for map-side side outputs if needed.
    private MultipleOutputs<ComplexKey, Text> outputs;
    // In-map aggregation buffer: "name|value|minf" -> "h|b"
    private Map<String, String> mapoutput = new HashMap<String, String>();
    private Set<String> outputkeyset;
    private String[] mapkey;
    private String[] mapvalue;

    protected void setup(Context context) throws IOException, InterruptedException {
        outputs = new MultipleOutputs<ComplexKey, Text>(context);
    }

    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        if (line == null || line.isEmpty()) {
            return;
        }
        String[] words = line.split("\t");
        if (words.length != 17 && words.length != 18) {
            return;
        }

        BigInteger hit, bit;
        if (words[1].equals("172.20.20.37")) {
            if (words.length == 17) {
                // h is words[14], b is words[16]
                if (null != mapoutput.get("domain" + "|" + words[2] + "|" + words[0])) {
                    // domain|minf already present in the buffer: accumulate h and b
                    mapvalue = mapoutput.get("domain" + "|" + words[2] + "|" + words[0]).split("\\|");
                    hit = new BigInteger(mapvalue[0]);
                    bit = new BigInteger(mapvalue[1]);
                    hit = hit.add(new BigInteger(words[14]));
                    bit = bit.add(new BigInteger(words[16]));
                    mapoutput.put("domain" + "|" + words[2] + "|" + words[0], hit + "|" + bit);
                } else {
                    mapoutput.put("domain" + "|" + words[2] + "|" + words[0], words[14] + "|" + words[16]);
                }
                if (null != mapoutput.get("sip" + "|" + words[1] + "|" + words[0])) {
                    // sip|minf already present in the buffer: accumulate h and b
                    mapvalue = mapoutput.get("sip" + "|" + words[1] + "|" + words[0]).split("\\|");
                    hit = new BigInteger(mapvalue[0]);
                    bit = new BigInteger(mapvalue[1]);
                    hit = hit.add(new BigInteger(words[14]));
                    bit = bit.add(new BigInteger(words[16]));
                    mapoutput.put("sip" + "|" + words[1] + "|" + words[0], hit + "|" + bit);
                } else {
                    mapoutput.put("sip" + "|" + words[1] + "|" + words[0], words[14] + "|" + words[16]);
                }
            } else if (words.length == 18) {
                // h is words[15], b is words[17]
                if (null != mapoutput.get("domain" + "|" + words[2] + "|" + words[0])) {
                    mapvalue = mapoutput.get("domain" + "|" + words[2] + "|" + words[0]).split("\\|");
                    hit = new BigInteger(mapvalue[0]);
                    bit = new BigInteger(mapvalue[1]);
                    hit = hit.add(new BigInteger(words[15]));
                    bit = bit.add(new BigInteger(words[17]));
                    mapoutput.put("domain" + "|" + words[2] + "|" + words[0], hit + "|" + bit);
                } else {
                    mapoutput.put("domain" + "|" + words[2] + "|" + words[0], words[15] + "|" + words[17]);
                }
                if (null != mapoutput.get("sip" + "|" + words[1] + "|" + words[0])) {
                    mapvalue = mapoutput.get("sip" + "|" + words[1] + "|" + words[0]).split("\\|");
                    hit = new BigInteger(mapvalue[0]);
                    bit = new BigInteger(mapvalue[1]);
                    hit = hit.add(new BigInteger(words[15]));
                    bit = bit.add(new BigInteger(words[17]));
                    mapoutput.put("sip" + "|" + words[1] + "|" + words[0], hit + "|" + bit);
                } else {
                    mapoutput.put("sip" + "|" + words[1] + "|" + words[0], words[15] + "|" + words[17]);
                }
            }
        }
    }

    // Several kinds of output, each with a different key name.
    protected void cleanup(Context context) throws IOException, InterruptedException {
        outputkeyset = mapoutput.keySet();
        for (String outputkey : outputkeyset) {
            mapkey = outputkey.split("\\|");
            if (mapkey[0].equals("domain")) {
                mapvalue = mapoutput.get(outputkey).split("\\|");
                ComplexKey domain = new ComplexKey("domain", mapkey[1], mapkey[2]);
                Text hb = new Text(mapvalue[0] + "|" + mapvalue[1]);
                context.write(domain, hb);
            } else if (mapkey[0].equals("sip")) {
                mapvalue = mapoutput.get(outputkey).split("\\|");
                ComplexKey sip = new ComplexKey("sip", mapkey[1], mapkey[2]);
                Text hb = new Text(mapvalue[0] + "|" + mapvalue[1]);
                context.write(sip, hb);
            }
        }
        outputs.close();
    }
}
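The driver further down keeps a commented-out job.setCombinerClass(AnalysisReducer.class) line. Reusing the reducer as a combiner would not type-check, because a combiner has to emit the map output types (ComplexKey, Text) while the reducer emits (Text, Text). With the in-map aggregation above, a combiner adds little, but for reference a compatible one could look roughly like this (AnalysisCombiner is my own illustration, not part of the original code):

package sina.dip.logfilter.mr;

import java.io.IOException;
import java.math.BigInteger;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical combiner: sums the "h|b" values of identical ComplexKeys between map and reduce.
// Assumes every value has the h|b form emitted by AnalysisMapper (domain and sip keys).
public class AnalysisCombiner extends Reducer<ComplexKey, Text, ComplexKey, Text> {

    @Override
    protected void reduce(ComplexKey key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BigInteger h = BigInteger.ZERO, b = BigInteger.ZERO;
        for (Text value : values) {
            String[] words = value.toString().split("\\|");
            h = h.add(new BigInteger(words[0]));
            b = b.add(new BigInteger(words[1]));
        }
        context.write(key, new Text(h + "|" + b));
    }
}

It would be enabled with job.setCombinerClass(AnalysisCombiner.class) in the driver.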


The reduce phase:

package sina.dip.logfilter.mr;

import java.io.IOException;
import java.math.BigInteger;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * Applies a different aggregation depending on the key name, then writes the result
 * to the output directory that corresponds to that key.
 */
public class AnalysisReducer extends Reducer<ComplexKey, Text, Text, Text> {

    private MultipleOutputs<Text, Text> outputs;

    protected void setup(Context context) throws IOException, InterruptedException {
        outputs = new MultipleOutputs<Text, Text>(context);
    }

    // Different logic per key name, written to a different directory per key name.
    protected void reduce(ComplexKey key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        Text oKey = null, oValue = null;
        BigInteger h = new BigInteger("0"), b = new BigInteger("0");

        if (key.getName().toString().equals("sip") || key.getName().toString().equals("domain")) {
            for (Text value : values) {
                String[] words = value.toString().split("\\|");
                h = h.add(new BigInteger(words[0]));
                b = b.add(new BigInteger(words[1]));
            }
            oKey = new Text(key.getValue() + "\t" + key.getMinf());
            oValue = new Text(h + "\t" + b);
        } else if (key.getName().toString().equals("httpcode")) {
            for (Text value : values) {
                h = h.add(new BigInteger(value.toString()));
            }
            oKey = new Text(key.getValue() + "\t" + key.getMinf());
            oValue = new Text(String.valueOf(h));
        }

        // Route the record by key name: a "domain" key ends up in outputpath/domain/domain-part-000x.
        // Using outputs.write(oKey, oValue, key.getName().toString()) instead would produce
        // outputpath/domain-part-000x.
        outputs.write(oKey, oValue, key.getName().toString() + "/" + key.getName().toString());
    }

    protected void cleanup(Context context) throws IOException, InterruptedException {
        outputs.close();
    }
}

Finally, the driver that configures and submits the job:

package sina.dip.logfilter.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import sina.dip.logfilter.config.LogConfig;
import sina.dip.logfilter.config.ServerConfig;

public class AnalysisLoader {

    /**
     * @param conf      job configuration
     * @param inputPath comma-separated list of input paths
     * @param outPath   output path
     * @param category  used in the job name
     * @return true if the job completed successfully
     * @throws Exception
     */
    public boolean run(Configuration conf, String inputPath, String outPath, String category)
            throws Exception {
        Job job = new Job(conf, "DIP_DIPLOGFILTER-" + category);
        // Resolves the third-party jar dependency problem; covered in another article.
        DistributedCache.addFileToClassPath(
                new Path("/libs/hbase-0.92.1-cdh4.0.0-security.jar"), job.getConfiguration());

        job.setJarByClass(AnalysisLoader.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(AnalysisMapper.class);
        job.setMapOutputKeyClass(ComplexKey.class);
        job.setMapOutputValueClass(Text.class);
        job.setPartitionerClass(ComplexKeyPartitioner.class);
        //job.setCombinerClass(AnalysisReducer.class);
        job.setReducerClass(AnalysisReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(LogConfig.reduceCount);

        String hdfs = ServerConfig.getHDFS();
        String[] inputPaths = inputPath.split(",");
        for (String p : inputPaths) {
            if (!p.startsWith(hdfs)) {
                p = hdfs + p;
            }
            MultipleInputs.addInputPath(job, new Path(p), TextInputFormat.class, AnalysisMapper.class);
        }
        FileOutputFormat.setOutputPath(job, new Path(outPath));

        return job.waitForCompletion(true);
    }
}
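One optional tweak, not in the original driver: because every record in AnalysisReducer goes through MultipleOutputs rather than the normal context.write, the default part-r-000x files under outPath will typically be created empty. If that is unwanted, Hadoop's LazyOutputFormat defers creating them until something is actually written to the default output; a sketch of the change in AnalysisLoader.run (untested against this exact CDH version) would be:

    // Requires: import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
    // Replaces job.setOutputFormatClass(TextOutputFormat.class):
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);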

