Fixing garbled Chinese output in Hadoop MapReduce results


By default, Hadoop writes text output as UTF-8 without a BOM. On Windows, however, the default encoding for Chinese text is GBK, so when a file such as a CSV written in BOM-less UTF-8 is opened in Excel, the Chinese characters come out garbled; the file displays correctly only in editors such as UltraEdit or Notepad. Changing Hadoop's default output encoding to GBK is therefore a very common requirement.
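To see why Excel misreads such a file, compare the raw bytes directly. The following standalone sketch (a hypothetical demo class, not part of the job code) encodes the same Chinese string as UTF-8 and as GBK; the byte sequences differ, so a program that assumes GBK cannot correctly decode UTF-8 bytes:

import java.nio.charset.Charset;
import java.util.Arrays;

// Hypothetical demo: the same string yields different bytes under UTF-8
// and GBK, which is why a GBK-assuming reader garbles UTF-8 output.
public class EncodingDemo {
  public static void main(String[] args) {
    String s = "中文";
    byte[] utf8 = s.getBytes(Charset.forName("UTF-8"));  // hex: e4 b8 ad e6 96 87
    byte[] gbk = s.getBytes(Charset.forName("GBK"));     // hex: d6 d0 ce c4
    System.out.println("UTF-8: " + Arrays.toString(utf8));
    System.out.println("GBK:   " + Arrays.toString(gbk));
    // Decoding the UTF-8 bytes as GBK produces mojibake:
    System.out.println(new String(utf8, Charset.forName("GBK")));
  }
}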
Customizing TextOutputFormat.class

package com.ljt.hdfs;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * <p>Title: Fixing garbled Chinese in Hadoop MapReduce output</p>
 * <p>Description: the stock TextOutputFormat, annotated with the lines to change for GBK output</p>
 * <p>Company: adteach</p>
 *
 * @author 刘建涛
 * @date 2017-07-19 16:37:41
 * @version 1.0
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
  public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

  protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";  // change "UTF-8" to "GBK" for GBK output
    private static final byte[] newline;
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public LineRecordWriter(DataOutputStream out) {
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      if (o instanceof Text) {                        // comment out this line
        Text to = (Text) o;                           // comment out this line
        out.write(to.getBytes(), 0, to.getLength());  // comment out this line
      } else {                                        // comment out this line
        out.write(o.toString().getBytes(utf8));
      }                                               // comment out this line
    }

    public synchronized void write(K key, V value) throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
      throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPERATOR, "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(
          new DataOutputStream(codec.createOutputStream(fileOut)),
          keyValueSeparator);
    }
  }
}
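The Text branch in writeObject above matters because Text stores its contents as UTF-8 internally, and getBytes() exposes that raw UTF-8 buffer without any re-encoding. A small illustrative sketch (the class name is hypothetical):

import org.apache.hadoop.io.Text;

// Hypothetical illustration: Text always holds UTF-8 bytes internally,
// so the Text branch of writeObject emits UTF-8 regardless of the
// encoding we actually want in the output file.
public class TextBytesDemo {
  public static void main(String[] args) {
    Text t = new Text("中文");
    // getBytes() returns the internal UTF-8 buffer; only the first
    // getLength() bytes are valid, as the backing array may be larger.
    System.out.println("valid bytes: " + t.getLength());  // 6: three per character in UTF-8
  }
}

Commenting out that branch, as annotated above, forces every key and value through o.toString().getBytes(utf8), which performs the re-encode.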
By default, the MapReduce driver selects this output format (and with it the UTF-8 encoding) via:
job.setOutputFormatClass(TextOutputFormat.class);

The statement private static final String utf8 = "UTF-8" in the listing above shows that Hadoop hard-codes this output format to UTF-8. To change the encoding of Hadoop's text output, it is therefore enough to define a class GbkOutputFormat that mirrors TextOutputFormat and likewise extends FileOutputFormat (note: the new-API org.apache.hadoop.mapreduce.lib.output.FileOutputFormat), as in the following code:

package com.ljt.hdfs;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * <p>Title: GbkOutputFormat</p>
 * <p>Description: Hadoop hard-codes TextOutputFormat to UTF-8, so to change
 * the encoding of Hadoop's text output we define GbkOutputFormat, identical
 * to TextOutputFormat and likewise extending FileOutputFormat
 * (note: org.apache.hadoop.mapreduce.lib.output.FileOutputFormat).</p>
 * <p>Company: adteach</p>
 *
 * @author 刘建涛
 * @date 2017-07-19 16:42:05
 * @version 1.0
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {
  public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

  protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "GBK";  // field name kept from TextOutputFormat; it now holds GBK
    private static final byte[] newline;
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public LineRecordWriter(DataOutputStream out) {
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream. The Text special case is
     * commented out in full (including the if line, so that non-Text keys
     * and values are still written) and everything is re-encoded as GBK.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
//      if (o instanceof Text) {
//        Text to = (Text) o;
//        out.write(to.getBytes(), 0, to.getLength());
//      } else {
        out.write(o.toString().getBytes(utf8));
//      }
    }

    public synchronized void write(K key, V value) throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
      throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get(SEPERATOR, "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new LineRecordWriter<K, V>(
          new DataOutputStream(codec.createOutputStream(fileOut)),
          keyValueSeparator);
    }
  }
}
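As a quick local sanity check (a hypothetical test class; it must live in the same com.ljt.hdfs package to see the protected LineRecordWriter), you can route one record through the writer and decode the buffered bytes as GBK:

package com.ljt.hdfs;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.Text;

// Hypothetical sanity check: write one key/value pair through the GBK
// LineRecordWriter and confirm the bytes decode correctly as GBK.
public class GbkWriterCheck {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    GbkOutputFormat.LineRecordWriter<Text, Text> writer =
        new GbkOutputFormat.LineRecordWriter<Text, Text>(new DataOutputStream(buf));
    writer.write(new Text("城市"), new Text("北京"));
    writer.close(null);  // close() only calls out.close(), so a null context is safe here
    System.out.println(new String(buf.toByteArray(), "GBK"));  // prints: 城市<TAB>北京
  }
}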

Finally, set the job's output format to GbkOutputFormat.class instead:
job.setOutputFormatClass(GbkOutputFormat.class);
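Putting it all together, here is a minimal pass-through driver sketch (the class name and paths are placeholders; no mapper or reducer is set, so Hadoop's identity defaults apply):

package com.ljt.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver skeleton: a standard job in every respect except the
// last setOutputFormatClass call, which swaps in the GBK output format.
public class GbkJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "gbk output demo");
    job.setJarByClass(GbkJobDriver.class);
    // No mapper/reducer set: the identity Mapper and Reducer are used, so
    // input records pass straight through to the output files.
    job.setOutputKeyClass(LongWritable.class);  // TextInputFormat's key type (byte offset)
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(GbkOutputFormat.class);  // GBK instead of the default UTF-8
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}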
