Hadoop MR Charset Conversion
If you have looked at how Hadoop handles the charset of its text-file input and output, you know that TextOutputFormat hardcodes the output charset to UTF-8. The source is as follows:
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.*;

/** An {@link OutputFormat} that writes plain text files. */
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> { // TextOutputFormat is the default output format

  protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> { // the default writer
    private static final String utf8 = "UTF-8"; // the output charset is hardcoded to UTF-8 right here
    private static final byte[] newline; // the line terminator
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator; // separator between key and value; the default is a tab

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) { // constructor: initializes the output stream and the separator
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public LineRecordWriter(DataOutputStream out) { // uses the default separator
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException { // records are written line by line: key, separator, value, newline
      if (o instanceof Text) { // if o is an instance of Text
        Text to = (Text) o;
        out.write(to.getBytes(), 0, to.getLength()); // write the raw bytes
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }

    public synchronized void write(K key, V value) throws IOException { // synchronized: writes are mutually exclusive
      // check whether key and value are null (or NullWritable)
      boolean nullKey = key == null || key instanceof NullWritable; // a nice one-liner
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) // obtain the writer instance
      throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    CompressionCodec codec = null; // the compression codec, if any
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension); // the default work-file path and name; implemented in FileOutputFormat
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(
          new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
    }
  }
}

In production, however, the input and output charsets are not always UTF-8: the processed text may be required in GBK, BIG5, or some other encoding to serve as input for the next program. In the banking industry in particular, logs are usually GBK. For a single fixed target you could simply replace UTF-8 in the source above with GBK. But there are many character encodings, and a big-data platform product faces customers all over the world; hardcoding one output charset is anything but automated and serves a very narrow audience. If we could specify the charset in the MR job's settings instead, that would fit the complex requirements of production environments perfectly.
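To make the stakes concrete, here is a minimal plain-JDK sketch (not part of the Hadoop source; the string is just an example) showing that the same text yields different byte sequences under UTF-8 and GBK, so a GBK consumer reading UTF-8 output sees mojibake:

import java.nio.charset.Charset;

public class CharsetDemo {
  public static void main(String[] args) {
    String s = "交易日志"; // "transaction log"
    byte[] utf8Bytes = s.getBytes(Charset.forName("UTF-8")); // 12 bytes: 3 per CJK character
    byte[] gbkBytes = s.getBytes(Charset.forName("GBK"));    // 8 bytes: 2 per character
    System.out.println(utf8Bytes.length + " vs " + gbkBytes.length); // prints: 12 vs 8
    // decoding the UTF-8 bytes as GBK garbles the text:
    System.out.println(new String(utf8Bytes, Charset.forName("GBK")));
  }
}

Hence the source is modified as follows: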
package com.huateng.hadoop.mapred.transcoding.format;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class EncodingOutputFormat<K, V> extends FileOutputFormat<K, V> {

  public static String SEPARATOR = "mapreduce.output.textoutputformat.separator";

  protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> {
    private String charset; // the manually specified parameter: any output charset can be set here
    private byte[] newline;
    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    // The charset is written into every output line below. Where does it come from?
    // It must take effect while the MR job runs, so it is injected through the constructor.
    public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) {
      this.out = out;
      this.charset = dsc_charset;
      try {
        newline = "\n".getBytes(charset);
        this.keyValueSeparator = keyValueSeparator.getBytes(charset);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + charset + " encoding");
      }
    }

    /**
     * Write the object to the byte stream, handling Text as a special case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      // The Text fast path is disabled on purpose, so Text values are
      // re-encoded with the target charset instead of written as raw UTF-8 bytes.
      // if (o instanceof Text) {
      //   Text to = (Text) o;
      //   out.write(to.getBytes(), 0, to.getLength());
      // } else {
      out.write(o.toString().getBytes(charset));
      // }
    }

    public synchronized void write(K key, V value) throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline); // use the charset-encoded newline (the draft wrote "\n".getBytes(), which falls back to the platform default)
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
      throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    // the charset parameter enters here: we read it from the MR job's configuration
    String dst_charset = job.getConfiguration().get("ark.dsccodec");
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPARATOR, "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    // setOutputName(job, "transform");
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator, dst_charset);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(
          new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator, dst_charset);
    }
  }
}
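Before walking through the key pieces, here is a minimal usage sketch (assuming the stock Job.getInstance API; the full driver, which uses the project's own MapRedAdapter wrapper, appears further down). A job only needs to set the target charset in its configuration and select this output format:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class EncodingJobSetup {
  public static Job newGbkJob() throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.getConfiguration().set("ark.dsccodec", "GBK"); // any charset the JDK supports
    job.setOutputFormatClass(EncodingOutputFormat.class);
    return job;
  }
}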
Two pieces deserve a closer look. First, the charset field and the constructor parameter that carries it in. The value is written into every output line, and since it must take effect at MR run time, it is injected through the constructor:

private String charset; // the manually specified parameter: any output charset can be set here

public LineRecordWriter(DataOutputStream out, String keyValueSeparator, String dsc_charset) { ... }
<pre name="code" class="html">protected static class LineRecordWriter<K, V> extends RecordWriter<K, V>在LineRecordWriter的父类RecordWriter中得到指定的字符集
<pre name="code" class="html">public RecordWriter<K, V> getRecordWriter( TaskAttemptContext job ) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); String dst_charset = job.getConfiguration().get("ark.dsccodec");
//参数就是从这里传进来的,我们得到MR的job的参数,进行指定}
<pre name="code" class="html">job.getConfiguration().get("ark.dsccodec")这个设定Configuration则来源于创建Job实例的进行指定,至此完成。本人生产环境(华为FI集群)亲测,JDK支持的字符集都可
转换。具体main方法设定参数如下:
<pre name="code" class="html">import java.io.IOException;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import com.google.common.base.Preconditions;import com.huateng.hadoop.mapred.MapRedAdapter;import com.huateng.hadoop.mapred.transcoding.format.EncodingOutputFormat;//import com.huateng.hadoop.mapred.transcoding.format.GB2312OutputFormat;//import com.huateng.hadoop.mapred.transcoding.format.GBKOutputFormat;import com.huateng.hdfs.common.HDFSClient;import com.huateng.util.common.StringUtils;/* * @author canMao */public class TranscodingJob {String other_code=null;private Job internalJob;public TranscodingJob(String in_path,String src_charset,String out_path,String dst_charset)throws Exception{Preconditions.checkArgument(!StringUtils.hasNullOrEmpty(new String[]{src_charset, dst_charset})," source_encoding and destination_encoding is null at least one");Job job = MapRedAdapter.createJob();job.getConfiguration().set("ark.codec", src_charset);job.getConfiguration().set("ark.dsccodec", dst_charset);job.setJarByClass(TranscodingJob.class);job.setMapperClass(TranscodingMapper.class);job.setNumReduceTasks(0);job.setOutputKeyClass(Text.class);job.setOutputValueClass(NullWritable.class);if (dst_charset.equals("UTF-8")) {job.setOutputFormatClass(TextOutputFormat.class);}else{job.setOutputFormatClass(EncodingOutputFormat.class);}FileInputFormat.setInputPaths(job, new Path(in_path));if (HDFSClient.getFileSystem().exists(new Path(out_path))) {HDFSClient.getFileSystem().delete(new Path(out_path),true);}FileOutputFormat.setOutputPath(job, new Path(out_path));internalJob = job;}public boolean submit() throws ClassNotFoundException, IOException, InterruptedException{//float progress=0.0f;//internalJob.submit();//while(true){//internalJob.mapProgress();//}return internalJob.waitForCompletion(true);//internalJob.submit();}}