Word segmentation in Hadoop: loading a custom dictionary with DistributedCache

  1. Since the word segmentation has to run on Hadoop and needs a custom dictionary, the dictionary file must be readable from within Hadoop.
    But how can a map task segment the data it is processing and read the dictionary at the same time?
    Hadoop provides DistributedCache for this: a facility of the MapReduce framework that distributes and caches files an application needs (plain text files, archives, jar files, and so on) on the nodes that run the tasks. A minimal sketch of reading such a cached file inside a Mapper follows; the full program is listed further below.
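
As a rough sketch of the task side (not part of the program listed later), a Mapper can locate the cached files with DistributedCache.getLocalCacheFiles() in its setup() method and load them into memory. The class name, the one-entry-per-line dictionary format, and the lookup in map() are illustrative assumptions, not the author's code:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.hadoop.filecache.DistributedCache;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class DictLookupMapperSketch extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Set<String> dict = new HashSet<String>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Local paths of every file registered in the driver with DistributedCache.addCacheFile(...).
            Path[] cachedFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (cachedFiles == null) {
                return;
            }
            for (Path cached : cachedFiles) {
                BufferedReader reader = new BufferedReader(new FileReader(cached.toString()));
                try {
                    String entry;
                    while ((entry = reader.readLine()) != null) {
                        dict.add(entry.trim());   // assumed format: one dictionary entry per line
                    }
                } finally {
                    reader.close();
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Illustrative use of the cached dictionary: count input lines that match an entry.
            if (dict.contains(value.toString().trim())) {
                context.write(value, new IntWritable(1));
            }
        }
    }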

  2. The MapReduce framework already defines hooks for this, notably the setup() and cleanup() methods of the Mapper class.
    - setup()
    Called by the framework exactly once, before the map task starts processing input, to perform one-time initialization of variables and resources.
    If that initialization were placed in map(), it would be repeated for every input line the Mapper parses, which wastes work and slows the program down.
    - cleanup()
    Called by the framework exactly once, after the map task has finished, to release variables and resources.
    If the release were placed in map(), resources would be freed after every line and re-initialized before the next one, again wasting work and hurting efficiency.

So resource initialization and release belong in setup() and cleanup() respectively.
Following this principle, the dictionary is loaded in the Mapper's setup() method; a bare skeleton of the setup()/map()/cleanup() lifecycle is sketched right below, before the actual program.
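
A minimal lifecycle skeleton under these assumptions (the class name and the per-task counter are purely illustrative and not part of the program below):

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class LifecycleMapperSketch extends Mapper<LongWritable, Text, Text, IntWritable> {

        private long recordCount;   // per-task state, initialized once

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Runs exactly once per map task, before the first map() call: put initialization here.
            recordCount = 0;
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Runs once per input record: only per-record work, no initialization or release.
            recordCount++;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Runs exactly once per map task, after the last map() call: emit per-task results
            // and release resources here.
            context.write(new Text("records"), new IntWritable((int) recordCount));
        }
    }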

My Hadoop MapReduce program consists of three files:

// Main driver: CacheMain.java
public class CacheMain {

    // Register a file on HDFS with the DistributedCache and attach a symlink name ("#label"),
    // so that tasks can open it by that short name in their working directory.
    private static void DistibuteCacheFile(Configuration conf, String path, String label) throws URISyntaxException {
        Path filePath = new Path(path);
        String uriWithLink = filePath.toUri().toString() + "#" + label;
        System.out.println("uriWithLink:" + uriWithLink);
        DistributedCache.addCacheFile(new URI(uriWithLink), conf);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // Ask the framework to create symlinks to the cached files in each task's working directory.
        DistributedCache.createSymlink(conf);
        String pathSation = "/output/middle/station_label.txt";
        String labelSation = "station.txt";
        DistibuteCacheFile(conf, pathSation, labelSation);

        Job job = new Job(conf, "CacheDemo");
        job.setJarByClass(CacheMain.class);
        job.setMapperClass(CacheMapper.class);
        job.setReducerClass(CacheReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

        // Delete the output directory if it already exists, then submit the job and wait.
        Path outDir = new Path(otherArgs[1]);
        FileSystem fstm = FileSystem.get(conf);
        fstm.delete(outDir, true);
        FileOutputFormat.setOutputPath(job, outDir);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
// Mapper: CacheMapper.java
public class CacheMapper extends Mapper<Object, Text, Text, IntWritable> {

    private static final Log LOG = LogFactory.getLog(CacheMapper.class);
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    // Load the user dictionary into the Ansj segmenter through the symlink "station.txt"
    // that the DistributedCache created in the task's working directory.
    public static void UseDistributedCacheBySymbolicLink() throws Exception {
        UserDefineLibrary.loadLibrary(UserDefineLibrary.FOREST, "station.txt");
    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        LOG.info("Now, use the distributed cache and symlink");
        try {
            UseDistributedCacheBySymbolicLink();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        try {
            // The original field-extraction code was elided; the input is assumed here to be
            // tab-separated with the MAC address in column 0 and the text to segment in column 1.
            String[] linelist = line.split("\t");
            if (linelist.length < 2) {
                return;
            }
            String mac = linelist[0];
            String text = linelist[1];
            List<Term> parse = DicAnalysis.parse(text).getTerms(); // word segmentation
            for (int i = 0; i < parse.size(); i++) {
                // Terms tagged with the custom nature "station" come from the loaded dictionary.
                if (parse.get(i).getNatureStr().contains("station")) {
                    StringBuilder sb = new StringBuilder();
                    sb.append(mac);
                    sb.append('\t').append("TV").append("\t").append("换台"); // "换台" = channel switch
                    word.set(sb.toString());
                    context.write(word, one);
                }
            }
        } catch (Exception e) {
            LOG.warn("failed to process line: " + line, e);
        }
    }
}
// Reducer: CacheReducer.java
public class CacheReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
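
For reference, the cached file /output/middle/station_label.txt is assumed to be an Ansj user dictionary whose entries carry the custom nature "station"; that is what the getNatureStr().contains("station") check in the Mapper relies on. In Ansj's user-dictionary format each line typically holds the word, its nature, and a frequency separated by tabs; the entries below are made-up examples only:

    CCTV-1	station	1000
    湖南卫视	station	1000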

References:
http://dongxicheng.org/mapreduce-nextgen/hadoop-distributedcache-details/
http://blog.csdn.net/a_step_further/article/details/50333961
http://hpuxtbjvip0.blog.163.com/blog/static/3674131320132794940734/
http://www.cnblogs.com/quchunhui/articles/5460860.html