Hadoop MapReduce: Processing Small Compressed Files with CombineFileInputFormat


This code targets small compressed files, but it also works when the files are not compressed; in that case each chunk of a combined split is simply one small file read as-is.

Without further introduction, three complete Java files are provided below. Only the map, reduce, and main (driver) code is omitted here; those pieces were already given in the previous posts (a minimal driver/mapper sketch is also appended at the end of this post).

CompressedCombineFileInputFormat.java

package compressedCombineFile;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CompressedCombineFileInputFormat
    extends CombineFileInputFormat<CompressedCombineFileWritable, Text> {

  public CompressedCombineFileInputFormat() {
    super();
  }

  @Override
  public RecordReader<CompressedCombineFileWritable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) throws IOException {
    // CombineFileRecordReader instantiates one CompressedCombineFileRecordReader
    // per file chunk contained in the combined split.
    return new CombineFileRecordReader<CompressedCombineFileWritable, Text>(
        (CombineFileSplit) split, context,
        CompressedCombineFileRecordReader.class);
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    // Each (compressed) small file is read as a whole; never split it further.
    return false;
  }
}
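
One practical note, not part of the original three files: if no maximum split size is configured, CombineFileInputFormat may pack all files processed on a node into a single split handled by one map task. Below is a minimal sketch of one way to cap the combined split size; the subclass name and the 128 MB figure are illustrative assumptions (the limit can reportedly also be set through the mapreduce.input.fileinputformat.split.maxsize property).

package compressedCombineFile;

// Sketch only: cap each combined split at roughly 128 MB so a single map
// task does not receive every small file on a node. The class name and the
// size are illustrative, not from the original post.
public class CappedCompressedCombineFileInputFormat
    extends CompressedCombineFileInputFormat {

  public CappedCompressedCombineFileInputFormat() {
    super();
    // Protected setter inherited from CombineFileInputFormat.
    setMaxSplitSize(128 * 1024 * 1024L);
  }
}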


CompressedCombineFileRecordReader.java: defines how records are read from each chunk

package compressedCombineFile;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;


/**
 * RecordReader responsible for extracting records from one chunk (one file)
 * of the CombineFileSplit. A compressed file is first decompressed to a
 * temporary copy in HDFS, and lines are then read from that copy.
 */
public class CompressedCombineFileRecordReader 
  extends RecordReader<CompressedCombineFileWritable, Text> {

  private long startOffset;
  private long end;
  private long pos;
  private FileSystem fs;
  private Path path;    // original (possibly compressed) file
  private Path dPath;   // decompressed file that is actually read
  private CompressedCombineFileWritable key = new CompressedCombineFileWritable();
  private Text value;
  private long rlength; // length of the decompressed file
  private FSDataInputStream fileIn;
  private LineReader reader;

  public CompressedCombineFileRecordReader(CombineFileSplit split,
      TaskAttemptContext context, Integer index) throws IOException {

    Configuration currentConf = context.getConfiguration();
    this.path = split.getPath(index);
    boolean isCompressed = findCodec(currentConf, path);
    if (isCompressed) {
      // Decompresses the file and sets dPath and rlength.
      codecWiseDecompress(currentConf);
    }

    fs = this.path.getFileSystem(currentConf);
    this.startOffset = split.getOffset(index);

    if (isCompressed) {
      this.end = startOffset + rlength;
    } else {
      this.end = startOffset + split.getLength(index);
      dPath = path;
    }

    boolean skipFirstLine = false;
    fileIn = fs.open(dPath);

    if (isCompressed) {
      // Remove the temporary decompressed copy when the task JVM exits.
      fs.deleteOnExit(dPath);
    }

    if (startOffset != 0) {
      skipFirstLine = true;
      --startOffset;
      fileIn.seek(startOffset);
    }
    reader = new LineReader(fileIn);
    if (skipFirstLine) {
      // Skip the partial first line; it belongs to the previous chunk's reader.
      startOffset += reader.readLine(new Text(), 0,
          (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));
    }
    this.pos = startOffset;
  }

  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Nothing to do: all setup happens in the constructor, which is how
    // CombineFileRecordReader drives its per-chunk readers.
  }

  public void close() throws IOException {
    // Close the line reader (and with it the underlying input stream).
    if (reader != null) {
      reader.close();
    }
  }

  public float getProgress() throws IOException {
    if (startOffset == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
    }
  }

  public boolean nextKeyValue() throws IOException {
    if (key.fileName == null) {
      key = new CompressedCombineFileWritable();
      key.fileName = dPath.getName();
    }
    // The key carries the file name and the byte offset of the current line.
    key.offset = pos;
    if (value == null) {
      value = new Text();
    }
    int newSize = 0;
    if (pos < end) {
      newSize = reader.readLine(value);
      pos += newSize;
    }
    if (newSize == 0) {
      // Nothing read: this chunk is exhausted.
      key = null;
      value = null;
      return false;
    } else {
      return true;
    }
  }

  public CompressedCombineFileWritable getCurrentKey()
      throws IOException, InterruptedException {
    return key;
  }

  public Text getCurrentValue() throws IOException, InterruptedException {
    return value;
  }
   
  private void codecWiseDecompress(Configuration conf) throws IOException {

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(path);

    if (codec == null) {
      throw new IOException("No codec found for " + path);
    }

    // The decompressed copy keeps the original path minus the codec
    // suffix (e.g. ".gz").
    String outputUri = CompressionCodecFactory.removeSuffix(
        path.toString(), codec.getDefaultExtension());
    dPath = new Path(outputUri);

    InputStream in = null;
    OutputStream out = null;
    fs = this.path.getFileSystem(conf);

    try {
      in = codec.createInputStream(fs.open(path));
      out = fs.create(dPath);
      IOUtils.copyBytes(in, out, conf);
    } finally {
      IOUtils.closeStream(in);
      IOUtils.closeStream(out);
      rlength = fs.getFileStatus(dPath).getLen();
    }
  }

  private boolean findCodec(Configuration conf, Path p) {
    // A non-null codec means the file name carries a known compression suffix.
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    return factory.getCodec(p) != null;
  }
  
}

CompressedCombineFileWritable.java

package compressedCombineFile;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * This record keeps filename,offset pairs.
 */

@SuppressWarnings("rawtypes")
public class CompressedCombineFileWritable implements WritableComparable {
    public long offset;
    public String fileName;
    
    public CompressedCombineFileWritable() {
      super();
    }

    public CompressedCombineFileWritable(long offset, String fileName) {
      super();
      this.offset = offset;
      this.fileName = fileName;
    }

public void readFields(DataInput in) throws IOException {
      this.offset = in.readLong();
      this.fileName = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException {
      out.writeLong(offset);
      Text.writeString(out, fileName);
    }
    
    public int compareTo(Object o) {
      CompressedCombineFileWritable that = (CompressedCombineFileWritable)o;


      int f = this.fileName.compareTo(that.fileName);
      if(f == 0) {
        return (int)Math.signum((double)(this.offset - that.offset));
      }
      return f;
    }
    @Override
    public boolean equals(Object obj) {
      if(obj instanceof CompressedCombineFileWritable)
        return this.compareTo(obj) == 0;
      return false;
    }
    @Override
    public int hashCode() {
      final int hashPrime = 47;
      int hash = 13;
      hash = hashPrime * hash + (this.fileName != null ? this.fileName.hashCode() : 0);
      // Fold the long offset into an int using the standard high/low-word XOR.
      hash = hashPrime * hash + (int) (this.offset ^ (this.offset >>> 32));
      return hash;
    }
    @Override
    public String toString(){
    return this.fileName+"-"+this.offset;
    }

  }
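
CompressedCombineFileDriver.java (an illustrative sketch, not from the original posts)

For completeness, here is a minimal sketch of a driver and mapper wired to this input format, assuming Hadoop 2.x and the new mapreduce API. It is an illustration only, not the map/reduce/main code from the earlier posts: the class names CompressedCombineFileDriver and LineCountMapper, and the per-file line-count logic, are assumptions made up for this example.

package compressedCombineFile;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

// Sketch only: counts lines per input file to show how the key/value types
// produced by CompressedCombineFileInputFormat are consumed.
public class CompressedCombineFileDriver {

  public static class LineCountMapper
      extends Mapper<CompressedCombineFileWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text outKey = new Text();

    @Override
    protected void map(CompressedCombineFileWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // key holds (fileName, offset); value is one line of the decompressed file.
      outKey.set(key.fileName);
      context.write(outKey, ONE);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "compressed combine file example");
    job.setJarByClass(CompressedCombineFileDriver.class);

    job.setInputFormatClass(CompressedCombineFileInputFormat.class);
    job.setMapperClass(LineCountMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}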

