Controlling the Number and Format of MapReduce Output Files


Controlling multi-file output from MapReduce

By default, when a MapReduce job finishes, each reduce task produces one output file with a name like part-xxxxx. Sometimes, to simplify downstream processing of these files, for example importing them into different Hive partitions based on the file name, we need to control the file names the reduce output produces so that records with the same reduce key are written to the same file. To do this, extend MultipleOutputFormat and override generateFileNameForKeyValue to customize the OutputFormat.

// K is assumed to be a custom key class exposing getDt() and getHour().
public abstract static class MyMultipleOutputFormat<K, V> extends MultipleOutputFormat<K, V> {

    @Override
    protected String generateFileNameForKeyValue(K key, V value, String name) {
        // Records with the same day and hour go to the same output file, e.g. "20160101_09".
        return String.format("%s_%s", key.getDt(), key.getHour());
    }

    // Inherited abstract method: concrete subclasses return a RecordWriter
    // for the desired output format (see the writers below).
    protected abstract RecordWriter<K, V> getBaseRecordWriter(FileSystem fs,
            JobConf job, String name, Progressable progress) throws IOException;
}

This way, logs from the same day and the same hour are collected into one file, which makes it easy to import them into the corresponding hourly partition.
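
To wire this in, the driver sets the custom format as the job's OutputFormat (MultipleOutputFormat belongs to the old mapred API, so the job is configured through JobConf). Below is a minimal sketch, not the article's actual driver: HourlyTextOutputFormat is a hypothetical concrete subclass of MyMultipleOutputFormat that implements getBaseRecordWriter, and the output path is illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class HourlyJobDriver {
    static JobConf configure() {
        JobConf conf = new JobConf(HourlyJobDriver.class);
        // HourlyTextOutputFormat is a hypothetical concrete subclass of
        // MyMultipleOutputFormat that implements getBaseRecordWriter.
        conf.setOutputFormat(HourlyTextOutputFormat.class);
        // Illustrative output directory; the files inside it are named
        // "<dt>_<hour>" by generateFileNameForKeyValue.
        FileOutputFormat.setOutputPath(conf, new Path("/bip/logs/hourly"));
        return conf;
    }
}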

Controlling the MapReduce output format

Different reduce output file formats differ in storage and read efficiency. You can override getBaseRecordWriter to return a writer for the format you need.

SequenceFileWriter

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

public class ValSequenceFileWriter<K, V> extends RecordWriter<K, V> {

    private SequenceFile.Writer out = null;
    private FSDataOutputStream fsout;

    public ValSequenceFileWriter(TaskAttemptContext context, Class<?> codecClass,
            FSDataOutputStream fsout) throws IOException {
        Configuration conf = context.getConfiguration();
        CompressionCodec codec = (CompressionCodec)
                ReflectionUtils.newInstance(codecClass, conf);
        this.fsout = fsout;
        out = SequenceFile.createWriter(
            conf, fsout,
            NullWritable.class,
            context.getOutputValueClass(),
            CompressionType.BLOCK,
            codec);
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException,
            InterruptedException {
        out.close();
        fsout.close();
    }

    @Override
    public synchronized void write(K key, V value) throws IOException, InterruptedException {
        out.append(NullWritable.get(), value);
    }
}
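
Since ValSequenceFileWriter extends the new-API org.apache.hadoop.mapreduce.RecordWriter, one way to hand it out is from a new-API OutputFormat's getRecordWriter. The following is a minimal sketch, not the original project's code: the class name, the DefaultCodec choice, and the empty file extension are illustrative.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ValSequenceFileOutputFormat<K, V> extends FileOutputFormat<K, V> {

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // One work file per task attempt; the output committer promotes it on commit.
        Path file = getDefaultWorkFile(context, "");
        FSDataOutputStream fsout =
                file.getFileSystem(context.getConfiguration()).create(file, false);
        return new ValSequenceFileWriter<K, V>(context, DefaultCodec.class, fsout);
    }
}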

RCFileWriter

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

import com.pplive.bip.metadata.hive.HiveTableInfo;

public class ValRCFileWriter<K, V> extends RecordWriter<K, V> {

    protected RCFile.Writer out;

    /*
     * buffered output array for output
     */
    private final BytesRefArrayWritable array;
    private final int numColumns;

    /**
     * construct
     * @param conf
     * @param codecClass
     * @param path
     * @throws IOException
     */
    public ValRCFileWriter(Configuration conf,
            Class<?> codecClass, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = (CompressionCodec)
            ReflectionUtils.newInstance(codecClass, conf);
        this.out = new RCFile.Writer(fs, conf, path, null, codec);
        numColumns = conf.getInt(RCFile.COLUMN_NUMBER_CONF_STR, 0);
        this.array = new BytesRefArrayWritable(numColumns);
    }

    @Override
    public synchronized void write(K key, V value) throws IOException {
        String[] fields = value.toString().split(HiveTableInfo.FIELD_DELIMITED, -1);
        for (int i = 0; i < fields.length && i < this.numColumns; i++) {
            array.set(i, new BytesRefWritable(fields[i].getBytes("UTF-8")));
        }
        out.append(array);
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException,
            InterruptedException {
        out.close();
    }
}
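
ValRCFileWriter takes its column count from the job configuration, so the driver has to set RCFile.COLUMN_NUMBER_CONF_STR before the job is submitted. A small sketch follows; the helper class name is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.RCFile;

public class RcFileJobSetup {
    // "columns" must match the number of columns of the target Hive table;
    // otherwise trailing fields are silently dropped in write().
    static void setColumnNumber(Configuration conf, int columns) {
        conf.setInt(RCFile.COLUMN_NUMBER_CONF_STR, columns);
    }
}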

ORCFileWriter

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.*;

import com.google.common.base.Throwables;
import com.pplive.bip.metadata.hive.HiveColumnInfo;
import com.pplive.bip.metadata.hive.HiveColumnType;
import com.pplive.bip.metadata.hive.HiveTableInfo;
import com.pplive.bip.metadata.log.LogInfo;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.*;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.*;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class ValORCFileWriter<K, V> extends org.apache.hadoop.mapreduce.RecordWriter<K, V> {

    private final static String MapTypeString = "MAP<STRING,STRING>";
    private final static String ArrayTypeString = "ARRAY<STRING>";

    private OrcSerde orcSerde;
    private RecordWriter writer;
    private final SettableStructObjectInspector tableInspector;
    private final List<StructField> structFields;
    private final Object orcRow;
    private final int numColumns;

    private static Constructor<? extends RecordWriter> getOrcWriterConstructor() {
        try {
            String writerClassName = OrcOutputFormat.class.getName() + "$OrcRecordWriter";
            Constructor<? extends RecordWriter> constructor = OrcOutputFormat.class.getClassLoader()
                    .loadClass(writerClassName).asSubclass(RecordWriter.class)
                    .getDeclaredConstructor(Path.class, OrcFile.WriterOptions.class);
            constructor.setAccessible(true);
            return constructor;
        } catch (ReflectiveOperationException e) {
            throw Throwables.propagate(e);
        }
    }

    private static String getHiveType(HiveColumnInfo column) {
        String hiveType;
        HiveColumnType columnType = column.getHiveType();
        if (columnType == HiveColumnType.Map)
            hiveType = MapTypeString;
        else if (columnType == HiveColumnType.Complex)
            hiveType = column.getColumnTypeValue();
        else if (columnType == HiveColumnType.Array)
            hiveType = ArrayTypeString;
        else
            hiveType = column.getHiveType().toString();
        return hiveType.toLowerCase();
    }

    private static ObjectInspector getObjectInspector(String hiveType) {
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(hiveType.toLowerCase());
        return TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    }

    /**
     * construct
     * @param conf
     * @param path
     * @throws IOException
     */
    public ValORCFileWriter(Configuration conf, LogInfo logInfo, Path path) throws IOException {
        HiveTableInfo hiveTableInfo = logInfo.getHiveTable();
        List<String> columnNames = new ArrayList<String>();
        List<String> hiveTypeNames = new ArrayList<String>();
        List<ObjectInspector> columnInspectors = new ArrayList<ObjectInspector>();
        for (HiveColumnInfo columnInfo : hiveTableInfo.getHiveColumns()) {
            columnNames.add(columnInfo.getName());
            String hiveType = getHiveType(columnInfo);
            hiveTypeNames.add(hiveType);
            columnInspectors.add(getObjectInspector(hiveType));
        }
        this.tableInspector =
                ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnInspectors);
        this.structFields = (List<StructField>) tableInspector.getAllStructFieldRefs();
        Constructor<? extends RecordWriter> writerConstructor = getOrcWriterConstructor();
        try {
            this.writer = writerConstructor.newInstance(path, OrcFile.writerOptions(conf).inspector(this.tableInspector));
        } catch (ReflectiveOperationException e) {
            throw new RuntimeException("Failed to create writer", e);
        }
        this.orcSerde = new OrcSerde();
        Properties properties = new Properties();
        properties.setProperty(hive_metastoreConstants.META_TABLE_COLUMNS, StringUtils.join(columnNames, ','));
        properties.setProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, StringUtils.join(hiveTypeNames, ':'));
        this.orcSerde.initialize(conf, properties);
        this.orcRow = tableInspector.create();
        this.numColumns = hiveTableInfo.getHiveColumnCount();
    }

    @Override
    public synchronized void write(K key, V value) throws IOException {
        String[] fields = value.toString().split(HiveTableInfo.FIELD_DELIMITED, -1);
        for (int i = 0; i < fields.length && i < this.numColumns; i++) {
            StructField sf = structFields.get(i);
            tableInspector.setStructFieldData(orcRow, sf, getJavaObject(fields[i], sf.getFieldObjectInspector()));
        }
        this.writer.write(this.orcSerde.serialize(orcRow, tableInspector));
    }

    private Object getJavaObject(String value, ObjectInspector oi) {
        Class clazz = oi.getClass();
        if (value.isEmpty()) {
            return null;
        }
        Object o;
        try {
            if (clazz == JavaShortObjectInspector.class) {
                o = new Short(value);
            } else if (clazz == JavaIntObjectInspector.class) {
                o = new Integer(value);
            } else if (clazz == JavaLongObjectInspector.class) {
                o = new Long(value);
            } else if (clazz == JavaStringObjectInspector.class) {
                o = value;
            } else if (clazz == JavaFloatObjectInspector.class) {
                o = new Float(value);
            } else if (clazz == JavaDoubleObjectInspector.class) {
                o = new Double(value);
            } else if (clazz == StandardListObjectInspector.class) {
                ObjectInspector elementObjectInspector = ((StandardListObjectInspector) oi).getListElementObjectInspector();
                String[] vs = value.split(",");
                List l = new ArrayList();
                for (String v : vs) {
                    l.add(getJavaObject(v, elementObjectInspector));
                }
                o = l;
            } else if (clazz == StandardMapObjectInspector.class) {
                ObjectInspector keyObjectInspector = ((StandardMapObjectInspector) oi).getMapKeyObjectInspector();
                ObjectInspector valueObjectInspector = ((StandardMapObjectInspector) oi).getMapValueObjectInspector();
                Map m = new HashMap();
                if (!value.isEmpty()) {
                    String[] vs = value.split(",");
                    for (String v : vs) {
                        String[] kv = v.split(":");
                        if (kv.length == 2) {
                            m.put(getJavaObject(kv[0], keyObjectInspector), getJavaObject(kv[1], valueObjectInspector));
                        }
                    }
                }
                o = m;
            } else if (clazz == StandardStructObjectInspector.class) {
                StandardStructObjectInspector soi = (StandardStructObjectInspector) oi;
                List<StructField> fields = (List<StructField>) soi.getAllStructFieldRefs();
                ArrayList result = (ArrayList) soi.create();
                String[] vs = value.split(":");
                if (vs.length == fields.size()) {
                    for (int i = 0; i < fields.size(); i++) {
                        StructField structField = fields.get(i);
                        soi.setStructFieldData(result, structField, getJavaObject(vs[i], structField.getFieldObjectInspector()));
                    }
                }
                o = result;
            } else {
                throw new RuntimeException("invalid object ObjectInspector" + oi.toString());
            }
        } catch (NumberFormatException e) {
            o = null;
        }
        return o;
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException, InterruptedException {
        writer.close(false);
    }
}
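
For reference, write() expects each value to be a single delimited line whose fields line up with the table's Hive columns, and getJavaObject then parses array elements separated by "," and map entries written as "key:value" pairs separated by ",". Below is an illustrative record only: the tab stands in for HiveTableInfo.FIELD_DELIMITED (an assumption) and the column layout (string, bigint, array<string>, map<string,string>) is made up for the example.

import org.apache.commons.lang3.StringUtils;

public class OrcRecordExample {
    static String sampleRecord() {
        return StringUtils.join(new String[] {
                "2016-01-01",     // string column
                "1024",           // bigint column, parsed via JavaLongObjectInspector
                "a,b,c",          // array<string>: elements separated by ","
                "k1:v1,k2:v2"     // map<string,string>: "key:value" entries separated by ","
        }, '\t');
    }
}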

Query efficiency over the data produced by these three writers increases in the order presented; ORC is the most commonly used and also the most efficient.