Controlling the Number and Format of MapReduce Output Files
Controlling MapReduce multi-file output
By default, when a MapReduce job finishes, each reduce task produces one output file with a name like part-xxxxx. Sometimes, to make downstream processing of these files easier, for example importing them into different Hive partitions based on the file name, we need to control the names of the files the reducers produce, so that records with the same reduce key are written to the same file. To do this, extend MultipleOutputFormat and override generateFileNameForKeyValue to customize the OutputFormat.
public abstract static class MyMultipleOutputFormat<K, V> extends MultipleOutputFormat<K, V> {

    // K is assumed to be the job's composite key class, which exposes getDt() and getHour()
    @Override
    protected String generateFileNameForKeyValue(K key, V value, String name) {
        return String.format("%s_%s", key.getDt(), key.getHour());
    }

    // inherited from MultipleOutputFormat and still to be implemented,
    // e.g. with one of the RecordWriters shown below
    protected abstract RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
            String name, Progressable progress) throws IOException;
}
This way, logs from the same hour of the same day are gathered into the same file, which makes it easy to import each file into the corresponding hourly partition.
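For context, here is a minimal sketch of plugging such a format into a job driver with the old mapred API. HourlyOutputFormat stands for a concrete subclass of MyMultipleOutputFormat that implements getBaseRecordWriter; the driver class and output path are placeholders as well.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

JobConf conf = new JobConf(LogImportDriver.class);    // placeholder driver class
conf.setOutputFormat(HourlyOutputFormat.class);       // concrete subclass of MyMultipleOutputFormat
FileOutputFormat.setOutputPath(conf, new Path("/tmp/hourly-output"));  // placeholder path
JobClient.runJob(conf);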
Controlling the MapReduce output format
Reduce output files in different formats differ in storage and read efficiency; you can override getBaseRecordWriter to return a writer for a different format.
SequenceFileWriter
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

public class ValSequenceFileWriter<K, V> extends RecordWriter<K, V> {

    private SequenceFile.Writer out = null;
    private FSDataOutputStream fsout;

    public ValSequenceFileWriter(TaskAttemptContext context, Class<?> codecClass,
            FSDataOutputStream fsout) throws IOException {
        Configuration conf = context.getConfiguration();
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        this.fsout = fsout;
        out = SequenceFile.createWriter(conf, fsout,
                NullWritable.class, context.getOutputValueClass(),
                CompressionType.BLOCK, codec);
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException, InterruptedException {
        out.close();
        fsout.close();
    }

    @Override
    public synchronized void write(K key, V value) throws IOException, InterruptedException {
        out.append(NullWritable.get(), value);
    }
}
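This writer is written against the new mapreduce API, so one way to use it is from a FileOutputFormat subclass. The following is only a minimal sketch; the format class name and the choice of GzipCodec are assumptions.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ValSequenceFileOutputFormat<K, V> extends FileOutputFormat<K, V> {
    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // open this task's part file and hand the raw stream to the writer
        Path file = getDefaultWorkFile(context, "");
        FSDataOutputStream fsout = file.getFileSystem(context.getConfiguration()).create(file, false);
        return new ValSequenceFileWriter<K, V>(context, GzipCodec.class, fsout);
    }
}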
RCFileWriter
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

import com.pplive.bip.metadata.hive.HiveTableInfo;

public class ValRCFileWriter<K, V> extends RecordWriter<K, V> {

    protected RCFile.Writer out;

    /*
     * buffered output array for output
     */
    private final BytesRefArrayWritable array;
    private final int numColumns;

    /**
     * construct
     * @param conf
     * @param codecClass
     * @param path
     * @throws IOException
     */
    public ValRCFileWriter(Configuration conf, Class<?> codecClass, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        this.out = new RCFile.Writer(fs, conf, path, null, codec);
        numColumns = conf.getInt(RCFile.COLUMN_NUMBER_CONF_STR, 0);
        this.array = new BytesRefArrayWritable(numColumns);
    }

    @Override
    public synchronized void write(K key, V value) throws IOException {
        String[] fields = value.toString().split(HiveTableInfo.FIELD_DELIMITED, -1);
        for (int i = 0; i < fields.length && i < this.numColumns; i++) {
            array.set(i, new BytesRefWritable(fields[i].getBytes("UTF-8")));
        }
        out.append(array);
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException, InterruptedException {
        out.close();
    }
}
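Note that the constructor reads the column count from RCFile.COLUMN_NUMBER_CONF_STR, so the job has to set that property before the writer is created. A minimal sketch, where the column count of 12 is only an example:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.RCFile;

Configuration conf = job.getConfiguration();
// must match the number of columns of the target Hive table; 12 is only an example
conf.setInt(RCFile.COLUMN_NUMBER_CONF_STR, 12);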
ORCFileWriter
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.*;

import com.google.common.base.Throwables;
import com.pplive.bip.metadata.hive.HiveColumnInfo;
import com.pplive.bip.metadata.hive.HiveColumnType;
import com.pplive.bip.metadata.log.LogInfo;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.*;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.*;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.pplive.bip.metadata.hive.HiveTableInfo;

public class ValORCFileWriter<K, V> extends org.apache.hadoop.mapreduce.RecordWriter<K, V> {

    private final static String MapTypeString = "MAP<STRING,STRING>";
    private final static String ArrayTypeString = "ARRAY<STRING>";

    private OrcSerde orcSerde;
    private RecordWriter writer;
    private final SettableStructObjectInspector tableInspector;
    private final List<StructField> structFields;
    private final Object orcRow;
    private final int numColumns;

    private static Constructor<? extends RecordWriter> getOrcWriterConstructor() {
        try {
            String writerClassName = OrcOutputFormat.class.getName() + "$OrcRecordWriter";
            Constructor<? extends RecordWriter> constructor = OrcOutputFormat.class.getClassLoader()
                    .loadClass(writerClassName).asSubclass(RecordWriter.class)
                    .getDeclaredConstructor(Path.class, OrcFile.WriterOptions.class);
            constructor.setAccessible(true);
            return constructor;
        } catch (ReflectiveOperationException e) {
            throw Throwables.propagate(e);
        }
    }

    private static String getHiveType(HiveColumnInfo column) {
        String hiveType;
        HiveColumnType columnType = column.getHiveType();
        if (columnType == HiveColumnType.Map)
            hiveType = MapTypeString;
        else if (columnType == HiveColumnType.Complex)
            hiveType = column.getColumnTypeValue();
        else if (columnType == HiveColumnType.Array)
            hiveType = ArrayTypeString;
        else
            hiveType = column.getHiveType().toString();
        return hiveType.toLowerCase();
    }

    private static ObjectInspector getObjectInspector(String hiveType) {
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(hiveType.toLowerCase());
        return TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    }

    /**
     * construct
     * @param conf
     * @param path
     * @throws IOException
     */
    public ValORCFileWriter(Configuration conf, LogInfo logInfo, Path path) throws IOException {
        HiveTableInfo hiveTableInfo = logInfo.getHiveTable();
        List<String> columnNames = new ArrayList<String>();
        List<String> hiveTypeNames = new ArrayList<String>();
        List<ObjectInspector> columnInspectors = new ArrayList<ObjectInspector>();
        for (HiveColumnInfo columnInfo : hiveTableInfo.getHiveColumns()) {
            columnNames.add(columnInfo.getName());
            String hiveType = getHiveType(columnInfo);
            hiveTypeNames.add(hiveType);
            columnInspectors.add(getObjectInspector(hiveType));
        }
        this.tableInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnInspectors);
        this.structFields = (List<StructField>) tableInspector.getAllStructFieldRefs();

        Constructor<? extends RecordWriter> writerConstructor = getOrcWriterConstructor();
        try {
            this.writer = writerConstructor.newInstance(path,
                    OrcFile.writerOptions(conf).inspector(this.tableInspector));
        } catch (ReflectiveOperationException e) {
            throw new RuntimeException("Failed to create writer", e);
        }

        this.orcSerde = new OrcSerde();
        Properties properties = new Properties();
        properties.setProperty(hive_metastoreConstants.META_TABLE_COLUMNS, StringUtils.join(columnNames, ','));
        properties.setProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, StringUtils.join(hiveTypeNames, ':'));
        this.orcSerde.initialize(conf, properties);

        this.orcRow = tableInspector.create();
        this.numColumns = hiveTableInfo.getHiveColumnCount();
    }

    @Override
    public synchronized void write(K key, V value) throws IOException {
        String[] fields = value.toString().split(HiveTableInfo.FIELD_DELIMITED, -1);
        for (int i = 0; i < fields.length && i < this.numColumns; i++) {
            StructField sf = structFields.get(i);
            tableInspector.setStructFieldData(orcRow, sf, getJavaObject(fields[i], sf.getFieldObjectInspector()));
        }
        this.writer.write(this.orcSerde.serialize(orcRow, tableInspector));
    }

    private Object getJavaObject(String value, ObjectInspector oi) {
        Class clazz = oi.getClass();
        if (value.isEmpty()) {
            return null;
        }
        Object o;
        try {
            if (clazz == JavaShortObjectInspector.class) {
                o = new Short(value);
            } else if (clazz == JavaIntObjectInspector.class) {
                o = new Integer(value);
            } else if (clazz == JavaLongObjectInspector.class) {
                o = new Long(value);
            } else if (clazz == JavaStringObjectInspector.class) {
                o = value;
            } else if (clazz == JavaFloatObjectInspector.class) {
                o = new Float(value);
            } else if (clazz == JavaDoubleObjectInspector.class) {
                o = new Double(value);
            } else if (clazz == StandardListObjectInspector.class) {
                ObjectInspector elementObjectInspector =
                        ((StandardListObjectInspector) oi).getListElementObjectInspector();
                String[] vs = value.split(",");
                List l = new ArrayList();
                for (String v : vs) {
                    l.add(getJavaObject(v, elementObjectInspector));
                }
                o = l;
            } else if (clazz == StandardMapObjectInspector.class) {
                ObjectInspector keyObjectInspector = ((StandardMapObjectInspector) oi).getMapKeyObjectInspector();
                ObjectInspector valueObjectInspector = ((StandardMapObjectInspector) oi).getMapValueObjectInspector();
                Map m = new HashMap();
                if (!value.isEmpty()) {
                    String[] vs = value.split(",");
                    for (String v : vs) {
                        String[] kv = v.split(":");
                        if (kv.length == 2) {
                            m.put(getJavaObject(kv[0], keyObjectInspector),
                                    getJavaObject(kv[1], valueObjectInspector));
                        }
                    }
                }
                o = m;
            } else if (clazz == StandardStructObjectInspector.class) {
                StandardStructObjectInspector soi = (StandardStructObjectInspector) oi;
                List<StructField> fields = (List<StructField>) soi.getAllStructFieldRefs();
                ArrayList result = (ArrayList) soi.create();
                String[] vs = value.split(":");
                if (vs.length == fields.size()) {
                    for (int i = 0; i < fields.size(); i++) {
                        StructField structField = fields.get(i);
                        soi.setStructFieldData(result, structField,
                                getJavaObject(vs[i], structField.getFieldObjectInspector()));
                    }
                }
                o = result;
            } else {
                throw new RuntimeException("invalid object ObjectInspector" + oi.toString());
            }
        } catch (NumberFormatException e) {
            o = null;
        }
        return o;
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException, InterruptedException {
        writer.close(false);
    }
}
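The writer options passed to the reflected OrcRecordWriter can also carry tuning settings beyond the object inspector. A minimal sketch of what the constructor above could pass instead of the defaults; the specific compression kind, stripe size, and row index stride are only illustrative values.

import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;

// would replace OrcFile.writerOptions(conf).inspector(this.tableInspector) in the constructor
OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
        .inspector(this.tableInspector)
        .compress(CompressionKind.ZLIB)   // codec for ORC stripe compression
        .stripeSize(64L * 1024 * 1024)    // 64 MB stripes
        .rowIndexStride(10000);           // one row-index entry every 10,000 rows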
Query efficiency over the data written by these three writers increases in the order shown; ORC is the most commonly used and also the most efficient.