Hadoop: Charset Conversion for MapReduce Output


If you have ever looked at how Hadoop handles charsets for text files, you know that TextOutputFormat hard-codes the output encoding to UTF-8. Here is the source:

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataOutputStream;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.*;

/** An {@link OutputFormat} that writes plain text files. */
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> { // TextOutputFormat is the default output format

  protected static class LineRecordWriter<K, V>          // the default RecordWriter
    extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";          // the output charset is hard-coded to UTF-8 here
    private static final byte[] newline;                  // line terminator
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;               // separator between key and value, "\t" by default

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) { // constructor: initialize the output stream and the separator
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public LineRecordWriter(DataOutputStream out) {       // default separator
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException { // records are written line by line: key keyValueSeparator value \n
      if (o instanceof Text) {                              // if o is a Text instance, write its bytes directly
        Text to = (Text) o;
        out.write(to.getBytes(), 0, to.getLength());
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }

    public synchronized void write(K key, V value)         // synchronized: writes are mutually exclusive
      throws IOException {
      // check whether key and value are null (or NullWritable)
      boolean nullKey = key == null || key instanceof NullWritable;   // a neat one-liner for null / NullWritable
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }

    public synchronized
    void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V>
         getRecordWriter(TaskAttemptContext job            // obtain a writer instance
                         ) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator",
                                        "\t");
    CompressionCodec codec = null;                          // compression codec
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);         // default work file path and name, implemented in FileOutputFormat
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(new DataOutputStream
                                        (codec.createOutputStream(fileOut)),
                                        keyValueSeparator);
    }
  }
}
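To see concretely why the hard-coded constant matters: Hadoop's Text type stores its payload as UTF-8 bytes, so writeObject() above always copies UTF-8 to the output stream, and producing any other encoding means going back through a String. A minimal demonstration (assuming hadoop-common is on the classpath; the class name is made up for illustration):

import java.nio.charset.Charset;
import java.util.Arrays;

import org.apache.hadoop.io.Text;

// Illustration only: Text keeps its content as UTF-8 bytes, so the byte count
// differs from the GBK encoding of the same characters.
public class TextCharsetDemo {
  public static void main(String[] args) {
    Text t = new Text("中文");
    byte[] utf8 = Arrays.copyOf(t.getBytes(), t.getLength());    // 6 bytes: UTF-8 encoding
    byte[] gbk = t.toString().getBytes(Charset.forName("GBK"));  // 4 bytes: GBK encoding
    System.out.println("UTF-8: " + utf8.length + " bytes, GBK: " + gbk.length + " bytes");
  }
}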
In production, however, the input and output charsets are not always UTF-8. Processed text may need to be written out as GBK, BIG5, and so on to serve as input for the next program; in banking in particular, log files are usually GBK. You could simply replace UTF-8 in the source above with GBK, but there are many charsets, and a big-data platform product aimed at customers worldwide cannot afford a hard-coded output encoding: it is not automated and serves only a narrow audience. Being able to specify the charset in the MR job's own settings would fit the messy requirements of production far better.
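Before walking through the modified output format, here is a minimal sketch of the usage the change is aiming for: the destination charset becomes an ordinary job setting. EncodingOutputFormat and the ark.dsccodec key are introduced in the code below; the helper class name is illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.huateng.hadoop.mapred.transcoding.format.EncodingOutputFormat;

// Sketch only: the output charset is a per-job setting rather than a constant
// baked into the output format class.
public class EncodingJobSetup {
  public static Job newTranscodingJob(String dstCharset) throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.getConfiguration().set("ark.dsccodec", dstCharset);  // e.g. "GBK" or "BIG5"
    job.setOutputFormatClass(EncodingOutputFormat.class);    // the modified format below
    return job;
  }
}

With that goal in mind, the modified source is as follows: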

package com.huateng.hadoop.mapred.transcoding.format;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class EncodingOutputFormat<K, V> extends FileOutputFormat<K, V> {

  public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

  protected static class LineRecordWriter<K, V>
    extends RecordWriter<K, V> {
    private String charset;   // manually specified parameter: any output charset can be chosen here
    private byte[] newline;

    protected DataOutputStream out;
    private final byte[] keyValueSeparator; // written into every output record below.
                                            // Where does the specified charset come in? It has to take
                                            // effect while the MR job runs, so it is assigned through
                                            // the constructor.

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) {
      this.out = out;
      charset = dsc_charset;
      try {
        newline = "\n".getBytes(charset);
        this.keyValueSeparator = keyValueSeparator.getBytes(charset);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + charset + " encoding");
      }
    }

    /**
     * Write the object to the byte stream, handling Text as a special case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
//      if (o instanceof Text) {
//        Text to = (Text) o;
//        out.write(to.getBytes(), 0, to.getLength());
//      } else {
      out.write(o.toString().getBytes(charset));   // re-encode everything with the chosen charset
//      }
    }

    public synchronized void write(K key, V value)
      throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write("\n".getBytes());   // note: uses the JVM default charset for the newline
    }

    public synchronized
    void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V>
         getRecordWriter(TaskAttemptContext job
                         ) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    String dst_charset = job.getConfiguration().get("ark.dsccodec"); // the parameter is passed in here:
                                                                     // read from the MR job's Configuration
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPERATOR, "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
//    setOutputName(job, "transform");
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator, dst_charset);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                                        keyValueSeparator, dst_charset);
    }
  }
}
The key changes are in the nested LineRecordWriter:

protected static class LineRecordWriter<K, V>
    extends RecordWriter<K, V> {

  // manually specified parameter: any output charset can be chosen here
  private String charset;

  // The separator and charset are written into every output record. Where does the
  // specified charset come in? It has to take effect while the MR job runs, so it is
  // assigned through the constructor:
  public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) { ... }
The specified charset is read in getRecordWriter(), which returns the writer as its parent type RecordWriter and hands the charset to the LineRecordWriter when constructing it:

public RecordWriter<K, V>
       getRecordWriter(TaskAttemptContext job
                       ) throws IOException, InterruptedException {
  Configuration conf = job.getConfiguration();
  // the parameter is passed in right here: we read it from the MR job's Configuration
  String dst_charset = job.getConfiguration().get("ark.dsccodec");
  // ... (remainder identical to the full listing above)
}
The Configuration behind job.getConfiguration().get("ark.dsccodec") is populated when the Job instance is created, and with that the change is complete. I have tested this personally in my production environment (a Huawei FI cluster): every charset supported by the JDK can be converted. The driver (main entry point) sets the parameters as follows:
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.google.common.base.Preconditions;
import com.huateng.hadoop.mapred.MapRedAdapter;
import com.huateng.hadoop.mapred.transcoding.format.EncodingOutputFormat;
//import com.huateng.hadoop.mapred.transcoding.format.GB2312OutputFormat;
//import com.huateng.hadoop.mapred.transcoding.format.GBKOutputFormat;
import com.huateng.hdfs.common.HDFSClient;
import com.huateng.util.common.StringUtils;

/*
 * @author canMao
 */
public class TranscodingJob {

  String other_code = null;
  private Job internalJob;

  public TranscodingJob(String in_path, String src_charset,
                        String out_path, String dst_charset) throws Exception {
    Preconditions.checkArgument(
        !StringUtils.hasNullOrEmpty(new String[]{src_charset, dst_charset}),
        " source_encoding and destination_encoding is null at least one");

    Job job = MapRedAdapter.createJob();
    job.getConfiguration().set("ark.codec", src_charset);     // source charset
    job.getConfiguration().set("ark.dsccodec", dst_charset);  // destination charset
    job.setJarByClass(TranscodingJob.class);
    job.setMapperClass(TranscodingMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    if (dst_charset.equals("UTF-8")) {
      job.setOutputFormatClass(TextOutputFormat.class);       // UTF-8: the stock format is enough
    } else {
      job.setOutputFormatClass(EncodingOutputFormat.class);   // anything else: the modified format
    }

    FileInputFormat.setInputPaths(job, new Path(in_path));
    if (HDFSClient.getFileSystem().exists(new Path(out_path))) {
      HDFSClient.getFileSystem().delete(new Path(out_path), true);
    }
    FileOutputFormat.setOutputPath(job, new Path(out_path));
    internalJob = job;
  }

  public boolean submit() throws ClassNotFoundException, IOException, InterruptedException {
//    float progress = 0.0f;
//    internalJob.submit();
//    while (true) {
//      internalJob.mapProgress();
//    }
    return internalJob.waitForCompletion(true);
//    internalJob.submit();
  }
}
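The driver references a TranscodingMapper that the post does not show. Below is a hypothetical, minimal sketch of what such a mapper could look like, assuming TextInputFormat delivers each line's raw bytes in the Text value without decoding them, so the mapper re-decodes with the source charset stored under ark.codec. The class and field names here are illustrative, not the author's actual code.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch: re-decode each raw input line with the source charset,
// then emit it as a normal (UTF-8 backed) Text; EncodingOutputFormat re-encodes
// it with the destination charset on the way out.
public class TranscodingMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

  private String srcCharset;
  private final Text outKey = new Text();

  @Override
  protected void setup(Context context) {
    srcCharset = context.getConfiguration().get("ark.codec", "UTF-8");
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // value holds the raw bytes of one input line; decode them with the real source charset
    String line = new String(value.getBytes(), 0, value.getLength(), srcCharset);
    outKey.set(line);
    context.write(outKey, NullWritable.get());
  }
}

With the pieces in place, a caller would run something along the lines of new TranscodingJob(inputPath, "GBK", outputPath, "UTF-8").submit(), where the paths and charsets are whatever the job at hand requires.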






