完全分布式Hadoop 调用DFS上so文件做缓存分发跑mr

来源：互联网发布：安卓java模拟器星星编辑：程序博客网时间：2024/05/21 17:40

1、前提是你得搭建一个完全分布式环境，不然测试缓存分发没有意义，搭建教程：点击打开链接

2、使用JNA调用so：点击打开链接

3、这里为了只演示so文件的缓存分发，调用到的JNA jar包我就直接跟WordCount一起打包成一个jar包了。

4、测试程序如下，其中libtest.so是linux下编译的C程序动态链接库文件，拷贝到dfs上：

package com.busymonkey;

import java.io.IOException;
import java.net.URI;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.sun.jna.Library;
import com.sun.jna.Native;

/**
 * Word-count job that additionally calls into a native shared library
 * ({@code libtest.so}) shipped to the task nodes through the distributed cache.
 *
 * <p>This variant registers the cache file with the older
 * {@link DistributedCache} static helpers.
 */
public class WordCount {

    /** Emits {@code (lower-cased token, 1)} for every whitespace-separated token of a line. */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        // Reused for every emitted key to avoid allocating a Text per token.
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Demonstration of the native calls. This runs for every input record,
            // so the line below is printed once per record.
            TestDll2.INSTANCE.test();
            int c = TestDll2.INSTANCE.addTest(10, 20);
            System.out.println("=================" + c);

            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                // Locale.ROOT keeps the keys identical on every node regardless of
                // the node's default locale (e.g. Turkish dotless-i lower-casing).
                word.set(itr.nextToken().toLowerCase(Locale.ROOT));
                context.write(word, ONE);
            }
        }
    }

    /** Sums the counts of one key; also used as the combiner. */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value holder.
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /** JNA binding for the functions called from the native library named "test". */
    public interface TestDll2 extends Library {

        /** Library handle obtained from JNA for the library name "test". */
        TestDll2 INSTANCE = (TestDll2) Native.loadLibrary("test", TestDll2.class);

        void test();

        int addTest(int a, int b);
    }

    /**
     * Configures and submits the job.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     * @throws Exception if the job cannot be configured or fails while running
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Older DistributedCache API: request symlinks, then register the HDFS file.
        // The "#libtest.so" fragment is the symlink name for the cached file.
        DistributedCache.createSymlink(conf);
        DistributedCache.addCacheFile(new URI("/apps/icps/houTest/libtest.so#libtest.so"), conf);

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

5、对于文本、压缩包、jar包的缓存分发（通过符号链接的方式，这样可以省去复杂的文件操作）：

// Driver side: register the HDFS file in the distributed cache.
Configuration conf = new Configuration();
DistributedCache.createSymlink(conf); // create the symlink
// Add the file to the distributed cache; "myfile" (the URI fragment) is the symlink name.
DistributedCache.addCacheFile(new URI("/user/tinfo/zhangguochen/file1#myfile"), conf);

// Task side (the article's step 2, "use it in mapreduce"):
public void setup(Context context) {
    // The cached file can be used directly here through the symlink "myfile".
    File myfile = new File("myfile");
}

6、关于Hadoop DistributedCache 的简介：

DistributedCache是Hadoop提供的文件缓存工具，它能够自动将指定的文件分发到各个节点上，缓存到本地，供用户程序读取使用。它具有以下几个特点：缓存的文件是只读的，修改这些文件内容没有意义；用户可以调整文件可见范围（比如只能用户自己使用，所有用户都可以使用等），进而防止重复拷贝现象；按需拷贝，文件是通过HDFS作为共享数据中心分发到各节点的，且只发给任务被调度到的节点。本文将介绍DistributedCache在Hadoop 1.0和2.0中的使用方法及实现原理。

Hadoop DistributedCache有以下几种典型的应用场景：1）分发字典文件，一些情况下Mapper或者Reducer需要用到一些外部字典，比如黑白名单、词表等；2）map-side join：当多表连接时，一种场景是一个表很大，一个表很小，小到足以加载到内存中，这时可以使用DistributedCache将小表分发到各个节点上，以供Mapper加载使用；3）自动化软件部署：有些情况下，MapReduce需依赖于特定版本的库，比如依赖于某个版本的PHP解释器，一种做法是让集群管理员把这个版本的PHP装到各个机器上，这通常比较麻烦，另一种方法是使用DistributedCache分发到各个节点上，程序运行完后，Hadoop自动将其删除。

7、对于新版本的 hadoop api，这里提供新的缓存分发调用方式：

package com.busymonkey;

import java.io.IOException;
import java.net.URI;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.sun.jna.Library;
import com.sun.jna.Native;

/**
 * Word-count job that additionally calls into a native shared library
 * ({@code libtest.so}) shipped to the task nodes as a job cache file.
 *
 * <p>This variant registers the cache file through {@link Job#addCacheFile}.
 */
public class WordCount {

    /** Emits {@code (lower-cased token, 1)} for every whitespace-separated token of a line. */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        // Reused for every emitted key to avoid allocating a Text per token.
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Demonstration of the native calls. This runs for every input record,
            // so the line below is printed once per record.
            TestDll2.INSTANCE.test();
            int c = TestDll2.INSTANCE.addTest(10, 20);
            System.out.println("=================" + c);

            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                // Locale.ROOT keeps the keys identical on every node regardless of
                // the node's default locale (e.g. Turkish dotless-i lower-casing).
                word.set(itr.nextToken().toLowerCase(Locale.ROOT));
                context.write(word, ONE);
            }
        }
    }

    /** Sums the counts of one key; also used as the combiner. */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value holder.
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /** JNA binding for the functions called from the native library named "test". */
    public interface TestDll2 extends Library {

        /** Library handle obtained from JNA for the library name "test". */
        TestDll2 INSTANCE = (TestDll2) Native.loadLibrary("test", TestDll2.class);

        void test();

        int addTest(int a, int b);
    }

    /**
     * Configures and submits the job.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     * @throws Exception if the job cannot be configured or fails while running
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "word count");

        // New-API way of registering the HDFS file as a cache file.
        // The "#libtest.so" fragment is the symlink name for the cached file.
        job.addCacheFile(new URI("/apps/icps/houTest/libtest.so#libtest.so"));

        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

8、这里如果要引用jar包，同样的也是先拷贝到hdfs上，然后加入：

        // Adds the JNA jar to the job's classpath; the jar must already have been copied to HDFS at this path.
        job.addFileToClassPath(new Path("/apps/icps/houTest/jna-3.5.1.jar"));

9、其实分发的本质就是把文件从dfs上拷贝到本地节点（只要用到的节点都会拷贝），对于引用 so 文件，或者jar包，因为在实际运行的时候程序是跑在本地的，所以本地的环境变量需要能找到这些库文件和包的位置才行，对于so文件，可以在环境变量里面设置：

# Set this to the local directory that holds the .so files; the value is left blank in the original article.
export LD_LIBRARY_PATH=

对于jar，通常是设置classpath，但是如果使用hadoop命令行来运行程序，它所用的classpath跟本地系统的classpath不是一回事，需要在hadoop的配置文件中进行配置；比如执行 hadoop classpath 命令就能打印出其中包含的目录。只要能找到本地缓存下来的so和jar的位置，程序就能正常运行，否则就会提示找不到类。

这里再注意一下，如果hadoop的 mapred-site.xml文件没有配置缓存目录，那么它就会自动在默认的目录下缓存。这个目录可以在调用分发方法之后的日志里看到，一般是系统本地的 /tmp/hadoop-root/mapred/local 目录。

0 0

完全分布式Hadoop 调用DFS上so文件做缓存分发 跑mr

完全分布式Hadoop 调用DFS上so文件做缓存分发跑mr