Hadoop 0.20+ custom MultipleOutputFormat



In Hadoop 0.20.2 the new API provides no MultipleOutputFormat, the class for writing output to multiple files. The old 0.19.2 class, org.apache.hadoop.mapred.lib.MultipleOutputFormat, can still be used under 0.20.2, but everything under org.apache.hadoop.mapred is marked deprecated and may stop working in a future Hadoop release. Hadoop 0.20.2 recommends Configuration in place of JobConf, yet the old org.apache.hadoop.mapred.lib.MultipleOutputFormat is still built on JobConf; in other words, there is no replacement for it in the new API yet.

Moreover, Hadoop 0.20.2 is only an interim release: not every API has been migrated to the new interface, so whatever is missing has to be written by hand.
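
For context, here is a rough sketch of the two driver styles side by side. The class and job names are made up for illustration; the Hadoop types themselves are real:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;

// Illustration only: contrasts the deprecated JobConf-based setup with the
// Configuration/Job style that 0.20 recommends.
public class ApiContrast {

    // 0.19-era style: everything hangs off the deprecated JobConf, and the
    // old multi-file formats (e.g. MultipleTextOutputFormat) plug in here.
    static JobConf oldStyle() {
        JobConf conf = new JobConf(ApiContrast.class);
        conf.setJobName("old-api-job");
        conf.setOutputFormat(org.apache.hadoop.mapred.lib.MultipleTextOutputFormat.class);
        return conf;
    }

    // 0.20-era style: a plain Configuration wrapped in a Job. There is no
    // MultipleOutputFormat counterpart here yet, hence the classes below.
    static Job newStyle() throws Exception {
        Configuration conf = new Configuration();
        return new Job(conf, "new-api-job");
    }
}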

 

Rewriting MultipleOutputFormat requires two classes:

LineRecordWriter

MultipleOutputFormat

 

PartitionByFilenameOutputFormat is the custom subclass needed in this experiment; it writes each file's results to its own output file.

 

LineRecordWriter:

 

package cn.xmu.dm;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// A standalone copy of TextOutputFormat.LineRecordWriter (which is a
// protected inner class and therefore not reusable directly): writes
// key<separator>value lines to the given stream.
public class LineRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8
                    + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "\t"); // tab-separated by default
    }

    // Write a Text's raw bytes directly; fall back to toString() for other types.
    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }

    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write("\r\n".getBytes(utf8)); // CRLF record delimiter
    }

    public synchronized void close(TaskAttemptContext context)
            throws IOException {
        out.close();
    }
}
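
The writer can be exercised on its own against any DataOutputStream. A quick hypothetical check of the line format (the demo class, file name, and values are invented here):

import java.io.DataOutputStream;
import java.io.FileOutputStream;
import org.apache.hadoop.io.Text;
import cn.xmu.dm.LineRecordWriter;

public class LineRecordWriterDemo {
    public static void main(String[] args) throws Exception {
        LineRecordWriter<Text, Text> w = new LineRecordWriter<Text, Text>(
                new DataOutputStream(new FileOutputStream("demo.txt")), ",");
        w.write(new Text("key"), new Text("value")); // writes "key,value\r\n"
        w.close(null); // close() ignores its TaskAttemptContext argument
    }
}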

 

 

MultipleOutputFormat:

 

 

package cn.xmu.dm;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

// New-API (org.apache.hadoop.mapreduce) replacement for the deprecated
// org.apache.hadoop.mapred.lib.MultipleOutputFormat: each record is routed
// to an output file whose name is derived from the key/value pair.
public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> {

    private MultiRecordWriter writer = null;

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }

    // Prefer the committer's work path so files are promoted to the final
    // output directory only when the task commits; fall back to the job
    // output path for committers that do not stage output.
    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    // Subclasses decide which file a given key/value pair is written to.
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);

    public class MultiRecordWriter extends RecordWriter<K, V> {

        // One underlying LineRecordWriter per distinct file name, created lazily.
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        private Path workPath = null;

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        // Create the real writer for one output file, honoring the job's
        // compression settings (gzip by default when compression is on).
        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job,
                        GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(new DataOutputStream(
                        codec.createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }
}

 

 

PartitionByFilenameOutputFormat:

 

package cn.xmu.dm;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

// Concrete subclass used in the experiment: the output file name is the
// value's prefix up to the first tab, i.e. the original file name. Note
// this assumes every value contains a tab; substring() would throw otherwise.
public class PartitionByFilenameOutputFormat extends MultipleOutputFormat<Text, Text> {

    @Override
    protected String generateFileNameForKeyValue(Text key, Text value,
            Configuration conf) {
        return value.toString().substring(0, value.toString().indexOf("\t"));
    }
}
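
With the new API, plugging the custom format into a job is a single setOutputFormatClass call. Below is a minimal driver sketch; the class name PartitionDriver and the omitted mapper/reducer are placeholders, everything else is standard Hadoop 0.20 API:

package cn.xmu.dm;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver showing how PartitionByFilenameOutputFormat is wired in.
public class PartitionDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "partition by filename");
        job.setJarByClass(PartitionDriver.class);
        // set your Mapper/Reducer classes here; they must emit <Text, Text>
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // route each record to its own file instead of part-r-NNNNN
        job.setOutputFormatClass(PartitionByFilenameOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}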

Source: http://irwenqiang.iteye.com/blog/1535275

