Hadoop: reading and writing images with a custom InputFormat and OutputFormat


   The data input format (InputFormat) describes the input specification of a MapReduce job. The framework relies on it to validate the input (for example, to check the input directories), to divide the data files into input splits (InputSplit), and to read records from each split one by one, converting them into the key/value pairs consumed by the map phase. Hadoop lets us implement a custom InputFormat and thereby run our own kind of MapReduce computation: a custom InputFormat class gives finer control over the input data and supports specialized or application-specific input file formats. An implementation should extend the abstract class org.apache.hadoop.mapreduce.InputFormat and override the createRecordReader() and getSplits() methods.
 Below we implement a custom InputFormat based on FileInputFormat, namely ImageFileInputFormat, together with an ImageRecordReader.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ImageFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false; // never split a single image file across splits
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new ImageRecordReader();
    }
}

The ImageRecordReader class stores the image file name as a Text key and the raw image bytes as a BytesWritable value; in other words, it turns each image into a key/value pair for the map phase.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class ImageRecordReader extends RecordReader<Text, BytesWritable> {

    private Text key = null;
    private BytesWritable value = null;
    private FSDataInputStream fileStream = null;
    private FileSplit filesplit;
    private boolean processed = false;
    private Configuration conf;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        filesplit = (FileSplit) split;
        conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            Path filePath = filesplit.getPath();
            FileSystem fs = filePath.getFileSystem(conf);
            this.fileStream = fs.open(filePath);
            this.key = new Text(filePath.getName());
            byte[] bytes = new byte[(int) filesplit.getLength()];
            IOUtils.readFully(this.fileStream, bytes, 0, bytes.length);
            this.value = new BytesWritable(bytes);
            IOUtils.closeStream(fileStream);
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
    }
}

The ImageMapper class.

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.imageio.ImageIO;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ImageMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        ByteArrayInputStream image =
                new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
        // The BufferedImage gives us access to all the image information we need,
        // e.g. its height and width.
        BufferedImage bi = ImageIO.read(image);
        int height = bi.getHeight();
        int width = bi.getWidth();
        // Pass the image through unchanged; any per-image computation would go here.
        context.write(key, value);
    }
}

Next we write images to HDFS. Hadoop's OutputFormat defines the storage format, storage location, and organization of a MapReduce job's output: it prepares the output location and provides a RecordWriter implementation that performs the actual serialization and storage of the data. Below are the ImageFileOutputFormat and ImageRecordWriter classes.

import java.awt.image.BufferedImage;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ImageFileOutputFormat extends FileOutputFormat<Text, BufferedImage> {

    @Override
    public RecordWriter<Text, BufferedImage> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        Path file = getDefaultWorkFile(job, "");
        FileSystem fs = file.getFileSystem(conf);
        return new ImageRecordWriter(file, fs);
    }
}

The ImageRecordWriter class.

import java.awt.image.BufferedImage;
import java.io.IOException;
import javax.imageio.ImageIO;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class ImageRecordWriter extends RecordWriter<Text, BufferedImage> {

    private Path file;
    private FileSystem fs;
    private FSDataOutputStream fileStream;

    public ImageRecordWriter(Path file, FileSystem fs) {
        this.file = file;
        this.fs = fs;
    }

    @Override
    public void write(Text key, BufferedImage value) throws IOException, InterruptedException {
        // Write each image as its own file, named after the key, under the task's work path.
        String name = key.toString();
        Path filePath = new Path(file, name);
        fileStream = fs.create(filePath, false);
        ImageIO.write(value, "BMP", fileStream);
        fileStream.close(); // close per image so every file is fully flushed to HDFS
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        // Each output stream is already closed in write().
    }
}

The ImageReducer class.

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.imageio.ImageIO;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ImageReducer extends Reducer<Text, BytesWritable, Text, BufferedImage> {

    @Override
    public void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {
        // Decode each image's bytes back into a BufferedImage and emit it for the writer.
        for (BytesWritable val : values) {
            ByteArrayInputStream image =
                    new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
            BufferedImage bi = ImageIO.read(image);
            context.write(key, bi);
        }
    }
}

Finally, we write a driver class that wires the custom input and output formats into a job.

import java.awt.image.BufferedImage;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ImageDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ImageDriver");
        job.setJarByClass(ImageDriver.class);

        job.setInputFormatClass(ImageFileInputFormat.class);
        job.setMapperClass(ImageMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setReducerClass(ImageReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BufferedImage.class);
        job.setOutputFormatClass(ImageFileOutputFormat.class);

        // Input and output paths are directories, not individual files.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        if (!job.waitForCompletion(true))
            return;
    }
}
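Assuming the classes above are packaged into a job jar (the jar name and HDFS paths below are only placeholders), the job is submitted in the usual way, with an input directory of image files and a not-yet-existing output directory:

hadoop jar image-job.jar ImageDriver /user/hadoop/images/in /user/hadoop/images/out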

The program above only reads and writes images and performs no computation; actual processing can be added inside the Mapper and Reducer classes, as in the sketch below.
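As an illustration, here is a minimal sketch (not part of the original code; the class name GrayscaleImageMapper is just an example) of one such computation: the mapper decodes the image, converts it to grayscale with the standard java.awt and javax.imageio APIs, and re-encodes it to bytes, so the existing Text/BytesWritable types, reducer, and output format remain unchanged.

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import javax.imageio.ImageIO;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class GrayscaleImageMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Decode the incoming image bytes.
        BufferedImage in = ImageIO.read(
                new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
        // Draw the source image into a grayscale buffer of the same size.
        BufferedImage gray = new BufferedImage(in.getWidth(), in.getHeight(),
                BufferedImage.TYPE_BYTE_GRAY);
        gray.getGraphics().drawImage(in, 0, 0, null);
        // Re-encode as BMP bytes so the downstream reducer/writer logic is unchanged.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ImageIO.write(gray, "BMP", out);
        context.write(key, new BytesWritable(out.toByteArray()));
    }
}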
