MapReduce高级编程——自定义InputFormat

来源:互联网 发布:手机哼歌识曲的软件 编辑:程序博客网 时间:2024/05/01 08:43

http://irwenqiang.iteye.com/blog/1448164

0、测试集样例

Java代码  收藏代码
  1. ball, 3.5, 12.7, 9.0
  2. car, 15, 23.76, 42.23
  3. device, 0.0, 12.4, -67.1
 

1、测试Point3D InputFormat

Java代码  收藏代码
  1. import java.io.IOException;  
  2. import java.net.URI;  
  3.   
  4. import javax.xml.soap.Text;  
  5.   
  6. import org.apache.hadoop.conf.Configuration;  
  7. import org.apache.hadoop.fs.FileSystem;  
  8. import org.apache.hadoop.fs.Path;  
  9. import org.apache.hadoop.mapreduce.Job;  
  10. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  11. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  12.   
  13. /** 
  14.  * desc:Custom Data Types <code>TestPoint3DInputFormat</code> 
  15.  *  
  16.  * @author chenwq 
  17.  */  
  18. public class TestPoint3DInputFormat {  
  19.      /** 
  20.      * @param args 
  21.      * @throws IOException  
  22.      * @throws ClassNotFoundException  
  23.      * @throws InterruptedException  
  24.      */  
  25.     public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {  
  26.         // TODO Auto-generated method stub  
  27.         System.out.println("hello,chenwq!");  
  28.         Job job=new Job();  
  29.         Configuration conf=new Configuration();  
  30.         FileSystem fs=FileSystem.get(URI.create(args[1]), conf);  
  31.         fs.delete(new Path(args[1]));  
  32.         job.setJobName("测试MyInputFormat程序。。。。。");  
  33.         FileInputFormat.addInputPath(job, new Path(args[0]));  
  34.         FileOutputFormat.setOutputPath(job, new Path(args[1]));  
  35.         job.setInputFormatClass(Point3DinputFormat.class);  
  36.         job.setMapOutputKeyClass(Text.class);  
  37.         job.setMapOutputValueClass(Point3D.class);  
  38.         job.setMapperClass(Point3DMapper.class);  
  39.         job.setNumReduceTasks(0);  
  40.         job.waitForCompletion(false);  
  41.     }  
  42. }  

 

2、自定义类型Point3D必须实现WritableComparable接口,才能在Hadoop环境中传输

Java代码  收藏代码
  1. import java.io.DataInput;  
  2. import java.io.DataOutput;  
  3. import java.io.IOException;  
  4.   
  5. import org.apache.hadoop.io.WritableComparable;  
  6.   
  7. /** 
  8.  * desc:Custom Data Types <code>Point</code> 
  9.  *  
  10.  * @author chenwq 
  11.  */  
  12. public class Point3D implements WritableComparable {  
  13.     public float x;  
  14.     public float y;  
  15.     public float z;  
  16.   
  17.     public Point3D(float x, float y, float z) {  
  18.         this.x = x;  
  19.         this.y = y;  
  20.         this.z = z;  
  21.     }  
  22.   
  23.     public Point3D() {  
  24.         this(0.0f, 0.0f, 0.0f);  
  25.     }  
  26.   
  27.     public void set(float x, float y, float z) {  
  28.         this.x = x;  
  29.         this.y = y;  
  30.         this.z = z;  
  31.     }  
  32.   
  33.     public void write(DataOutput out) throws IOException {  
  34.         out.writeFloat(x);  
  35.         out.writeFloat(y);  
  36.         out.writeFloat(z);  
  37.     }  
  38.   
  39.     public void readFields(DataInput in) throws IOException {  
  40.         x = in.readFloat();  
  41.         y = in.readFloat();  
  42.         z = in.readFloat();  
  43.     }  
  44.   
  45.     public String toString() {  
  46.         return Float.toString(x) + ", " + Float.toString(y) + ", "  
  47.                 + Float.toString(z);  
  48.     }  
  49.   
  50.     public float distanceFromOrigin() {  
  51.         return (float) Math.sqrt(x * x + y * y + z * z);  
  52.     }  
  53.   
  54.     public int compareTo(Object other) {  
  55.         float myDistance = this.distanceFromOrigin();  
  56.         float otherDistance = ((Point3D) other).distanceFromOrigin();  
  57.   
  58.         return Float.compare(myDistance, otherDistance);  
  59.     }  
  60.   
  61.     public boolean equals(Object o) {  
  62.         Point3D other = (Point3D) o;  
  63.         if (!(other instanceof Point3D)) {  
  64.             return false;  
  65.         }  
  66.   
  67.         return this.x == other.x && this.y == other.y && this.z == other.z;  
  68.     }  
  69.   
  70.     public int hashCode() {  
  71.         return Float.floatToIntBits(x) ^ Float.floatToIntBits(y)  
  72.                 ^ Float.floatToIntBits(z);  
  73.     }  
  74.   
  75. }  

 3、自定义Point3DInputFormat类型,供MapReduce编程模型使用

Java代码  收藏代码
  1. import java.io.IOException;  
  2.   
  3. import java.util.StringTokenizer;  
  4.   
  5. import org.apache.hadoop.conf.Configuration;  
  6. import org.apache.hadoop.fs.FSDataInputStream;  
  7. import org.apache.hadoop.fs.FileSystem;  
  8. import org.apache.hadoop.fs.Path;  
  9. import org.apache.hadoop.io.Text;  
  10. import org.apache.hadoop.mapreduce.InputSplit;  
  11. import org.apache.hadoop.mapreduce.JobContext;  
  12. import org.apache.hadoop.mapreduce.RecordReader;  
  13. import org.apache.hadoop.mapreduce.TaskAttemptContext;  
  14. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  15. import org.apache.hadoop.mapreduce.lib.input.FileSplit;  
  16. import org.apache.hadoop.util.LineReader;  
  17.   
  18. public class Point3DinputFormat extends FileInputFormat<Text, Point3D> {  
  19.       
  20.     @Override  
  21.     protected boolean isSplitable(JobContext context, Path filename) {  
  22.         // TODO Auto-generated method stub  
  23.         return false;  
  24.     }  
  25.     @Override  
  26.     public RecordReader<Text, Point3D> createRecordReader(InputSplit inputsplit,  
  27.             TaskAttemptContext context) throws IOException, InterruptedException {  
  28.         // TODO Auto-generated method stub  
  29.         return new objPosRecordReader();  
  30.     }  
  31.     public static class objPosRecordReader extends RecordReader<Text,Point3D>{  
  32.   
  33.         public LineReader in;  
  34.         public Text lineKey;  
  35.         public Point3D lineValue;  
  36.         public StringTokenizer token=null;  
  37.           
  38.         public Text line;  
  39.         
  40.         @Override  
  41.         public void close() throws IOException {  
  42.             // TODO Auto-generated method stub  
  43.               
  44.         }  
  45.   
  46.         @Override  
  47.         public Text getCurrentKey() throws IOException, InterruptedException {  
  48.             //lineKey.set(token.nextToken());  
  49.             return lineKey;  
  50.         }  
  51.   
  52.         @Override  
  53.         public Point3D getCurrentValue() throws IOException,  
  54.                 InterruptedException {  
  55.             // TODO Auto-generated method stub  
  56.             return lineValue;  
  57.         }  
  58.   
  59.         @Override  
  60.         public float getProgress() throws IOException, InterruptedException {  
  61.             // TODO Auto-generated method stub  
  62.             return 0;  
  63.         }  
  64.   
  65.         @Override  
  66.         public void initialize(InputSplit input, TaskAttemptContext context)  
  67.                 throws IOException, InterruptedException {  
  68.             // TODO Auto-generated method stub  
  69.             FileSplit split=(FileSplit)input;  
  70.             Configuration job=context.getConfiguration();  
  71.             Path file=split.getPath();  
  72.             FileSystem fs=file.getFileSystem(job);  
  73.               
  74.             FSDataInputStream filein=fs.open(file);  
  75.             in=new LineReader(filein,job);  
  76.               
  77.             line=new Text();  
  78.             lineKey=new Text();  
  79.             lineValue=new Point3D();  
  80.         }  
  81.   
  82.         @Override  
  83.         public boolean nextKeyValue() throws IOException, InterruptedException {  
  84.             // TODO Auto-generated method stub  
  85.             int linesize=in.readLine(line);  
  86.             if(linesize==0)  
  87.                 return false;  
  88.               
  89.             String[] pieces = line.toString().split(",");  
  90.             if(pieces.length != 4){  
  91.                 throw new IOException("Invalid record received");  
  92.             }  
  93.               
  94.             // try to parse floating point components of value  
  95.             float fx, fy, fz;  
  96.             try{  
  97.                 fx = Float.parseFloat(pieces[1].trim());  
  98.                 fy = Float.parseFloat(pieces[2].trim());  
  99.                 fz = Float.parseFloat(pieces[3].trim());  
  100.             }catch(NumberFormatException nfe){  
  101.                 throw new IOException("Error parsing floating poing value in record");  
  102.             }  
  103.             lineKey.set(pieces[0]);  
  104.               
  105.             lineValue.set(fx, fy, fz);  
  106.               
  107.             return true;  
  108.         }  
  109.     }  
  110. }  

 

4、编写Mapper类,这里仅仅测试自定义类型Point3D的InputFormat,不需要Reducer

Java代码  收藏代码
  1. import java.io.IOException;  
  2.   
  3. import org.apache.hadoop.io.Text;  
  4. import org.apache.hadoop.mapreduce.Mapper;  
  5.   
  6.   
  7. public class Point3DMapper extends Mapper<Text, Point3D, Text, Point3D>{  
  8.     protected void map(Text key, Point3D value, Context context) throws IOException, InterruptedException{  
  9.         context.write(key, value);  
  10.     }  
  11. }  

上面 RecordReader<Text, Point3D> 中的 <key, value> 类型都可以自行定义,但 key 必须实现 WritableComparable 接口,而 value 必须实现 Writable 接口。

0 0