Introduction to DistributedCache and Its Applications

DistributedCache is Hadoop's distributed file-cache class. It lets you share files across a distributed cluster, and when executing join operations, placing the small table in the cache can improve join efficiency.

In Hadoop, there are several ways to share variables or files:
1. Use Configuration's set method; this only suits scenarios where the data is fairly small (see the sketch after this list).
2. Put the shared file on HDFS and read it on every access; this is relatively inefficient.
3. Put the shared file in the DistributedCache; after a one-time initialization in setup, it can be used repeatedly.
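
As a minimal sketch of option 1 (the key name and value here are illustrative assumptions, not part of the original article), a small value can be passed through the job Configuration in the driver and read back in the Mapper's setup:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ConfShareSketch {

    // Driver side: stash a small value in the job configuration
    // before the Job object is created.
    //   Configuration conf = new Configuration();
    //   conf.set("wordcount.skip.word", "the");   // hypothetical key/value

    public static class ConfAwareMap extends Mapper<Object, Text, Text, IntWritable> {
        private String skipWord;

        @Override
        public void setup(Context context) {
            // The task-side configuration carries everything set in the driver.
            skipWord = context.getConfiguration().get("wordcount.skip.word", "");
        }
    }
}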


Files cached by DistributedCache are read-only; they can be text files, compressed archives, jar files, and so on.
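
The old-style org.apache.hadoop.filecache.DistributedCache API has separate registration calls for each kind of content; a brief sketch (the HDFS paths here are illustrative assumptions):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;

public class CacheRegistration {
    public static void register(Configuration conf) throws Exception {
        // Plain file: localized to the task node as-is.
        DistributedCache.addCacheFile(new URI("hdfs://127.0.0.1:8020/cache/dict.txt"), conf);
        // Archive (zip/tar/tgz): unpacked automatically on the task node.
        DistributedCache.addCacheArchive(new URI("hdfs://127.0.0.1:8020/cache/dict.zip"), conf);
        // Jar: placed on the classpath of the task JVM.
        DistributedCache.addFileToClassPath(new Path("/cache/lib.jar"), conf);
    }
}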

After the job is launched and before any task starts, the MapReduce framework copies the files to be cached to the local disk of each node that will execute a task.

DistributedCache's file-sharing mode can only be used in a cluster environment.


The following example filters out the words contained in the DistributedCache file and then counts the remaining words.

The code is as follows:

package org.cy.pack;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;

import org.apache.commons.lang.text.StrTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @author caiyong
 * DistributedCache only runs in cluster mode; otherwise a file-not-found
 * error is thrown: the console output shows the directory where the cache
 * lives, but the concrete file cannot be found there.
 */
public class DistridCache {

    public static class MyMap extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private HashSet<String> keyword;
        private Path[] localFiles = null;

        @Override
        public void setup(Context context) throws IOException {
            keyword = new HashSet<String>();
            // Use the configuration carried by the task context (not a fresh
            // Configuration), so the cache files registered in main() are visible.
            Configuration conf = context.getConfiguration();
            localFiles = DistributedCache.getLocalCacheFiles(conf);
            System.out.println("Localized cache path: " + localFiles[0].toString());

            // Read every line of every cached file into the filter set.
            for (int i = 0; i < localFiles.length; i++) {
                BufferedReader br = new BufferedReader(new FileReader(localFiles[i].toString()));
                String akeyword;
                while ((akeyword = br.readLine()) != null) {
                    keyword.add(akeyword);
                }
                br.close();
            }
        }

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StrTokenizer itr = new StrTokenizer(value.toString());
            while (itr.hasNext()) {
                String aword = itr.nextToken();
                // Skip any word listed in the cached filter file.
                if (keyword.contains(aword)) {
                    continue;
                }
                word.set(aword);
                context.write(word, one);
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();

        Path in = new Path("hdfs://127.0.0.1:8020/home/Distritestdata");
        Path out = new Path("hdfs://127.0.0.1:8020/home/Distritestdata/out");

        // Register the file to be cached; this must happen before the Job is created.
        DistributedCache.addCacheFile(new URI("hdfs://127.0.0.1:8020/home/DIstriCachedata/filterdata.txt"), conf);

        Job job = new Job(conf, "DistridCache");
        job.setJarByClass(DistridCache.class);

        // Delete a leftover output directory so the job can be rerun.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
            System.out.println("Output path already existed and has been deleted.");
        }

        job.setMapperClass(MyMap.class);
        job.setCombinerClass(MyReduce.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
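
As a side note, on Hadoop 2.x the org.apache.hadoop.filecache.DistributedCache class used above is deprecated; the equivalent calls live on Job and the task Context. A minimal sketch of the newer style, reusing the same HDFS path as above:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class NewStyleCache {
    public static Job buildJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "DistridCache");
        // Replaces DistributedCache.addCacheFile(uri, conf):
        job.addCacheFile(new URI("hdfs://127.0.0.1:8020/home/DIstriCachedata/filterdata.txt"));
        return job;
    }
    // In the Mapper, the registered URIs are available via
    // context.getCacheFiles().
}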




