Hadoop 0.20.2 Bloom Filter Usage Example


1. Introduction

    See Hadoop in Action, p. 102, and 《Hadoop实战(第2版)》 (陆嘉恒), p. 69.


2. Example

    Most write-ups found online simply reproduce the sample code from Hadoop in Action. The example here targets Hadoop 0.20.2, which already ships with a BloomFilter implementation (org.apache.hadoop.util.bloom.BloomFilter), so there is no need to hand-roll one.
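
    For readers who have not used the built-in class, here is a minimal standalone sketch of the org.apache.hadoop.util.bloom.BloomFilter API, with the same constructor parameters as the mapper in section 3 (a 10000-bit vector, 6 hash functions, MurmurHash). The class name BloomFilterDemo is just an illustrative placeholder:

    import org.apache.hadoop.util.bloom.BloomFilter;
    import org.apache.hadoop.util.bloom.Key;
    import org.apache.hadoop.util.hash.Hash;

    public class BloomFilterDemo {
        public static void main(String[] args) {
            // 10000-bit vector, 6 hash functions, MurmurHash
            BloomFilter filter = new BloomFilter(10000, 6, Hash.MURMUR_HASH);
            filter.add(new Key("3".getBytes()));
            // "3" was added, so this prints true
            System.out.println(filter.membershipTest(new Key("3".getBytes())));
            // "9" was never added: normally prints false, but a Bloom filter
            // can return false positives (never false negatives)
            System.out.println(filter.membershipTest(new Key("9".getBytes())));
        }
    }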

    The example input files are as follows:

    customers.txt

    1,Stephanie Leung,555-555-5555
    2,Edward Kim,123-456-7890
    3,Jose Madriz,281-330-8004
    4,David Stork,408-555-0000

    -----------------------------------------------------------------

    orders.txt

    3,A,12.95,02-Jun-2008
    1,B,88.25,20-May-2008
    2,C,32.00,30-Nov-2007
    3,D,25.02,22-Jan-2009
    5,E,34.59,05-Jan-2010
    6,F,28.67,16-Jan-2008
    7,G,49.82,24-Jan-2009

    The two files are joined on the customer ID, the first field of each line.
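
    For these inputs, a successful run should produce output along the following lines (assuming the default TextOutputFormat, which separates key and value with a tab): orders from customers 5, 6, and 7 are dropped in the mapper because their IDs are not in the Bloom filter (barring false positives), and customer 4 yields nothing because it has no orders.

    1	Stephanie Leung,555-555-5555,B,88.25,20-May-2008
    2	Edward Kim,123-456-7890,C,32.00,30-Nov-2007
    3	Jose Madriz,281-330-8004,A,12.95,02-Jun-2008
    3	Jose Madriz,281-330-8004,D,25.02,22-Jan-2009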

3. Code

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomMRMain {

    public static class BloomMapper extends Mapper<Object, Text, Text, Text> {
        // 10000-bit vector, 6 hash functions, MurmurHash
        BloomFilter bloomFilter = new BloomFilter(10000, 6, Hash.MURMUR_HASH);

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Build the filter from the customer IDs before any map() call.
            // Note: the customers.txt path is hard-coded; adjust it to your environment.
            Configuration conf = context.getConfiguration();
            String path = "hdfs://localhost:9000/user/hezhixue/input/customers.txt";
            Path file = new Path(path);
            FileSystem hdfs = FileSystem.get(conf);
            FSDataInputStream dis = hdfs.open(file);
            BufferedReader reader = new BufferedReader(new InputStreamReader(dis));
            try {
                String temp;
                while ((temp = reader.readLine()) != null) {
                    String[] tokens = temp.split(",");
                    if (tokens.length > 0) {
                        bloomFilter.add(new Key(tokens[0].getBytes()));
                    }
                }
            } finally {
                reader.close();
            }
        }

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Get the input file path to tell the two source files apart.
            String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
            if (pathName.contains("customers")) {
                String[] tokens = value.toString().split(",");
                if (tokens.length == 3) {
                    // Tag customer records with "0".
                    String outKey = tokens[0];
                    String outVal = "0" + ":" + tokens[1] + "," + tokens[2];
                    context.write(new Text(outKey), new Text(outVal));
                }
            } else if (pathName.contains("orders")) {
                String[] tokens = value.toString().split(",");
                if (tokens.length == 4) {
                    String outKey = tokens[0];
                    System.out.println("in map and outKey:" + outKey);
                    // Only emit orders whose customer ID may be in the filter.
                    if (bloomFilter.membershipTest(new Key(outKey.getBytes()))) {
                        // Tag order records with "1".
                        String outVal = "1" + ":" + tokens[1] + "," + tokens[2] + "," + tokens[3];
                        context.write(new Text(outKey), new Text(outVal));
                    }
                }
            }
        }
    }

    public static class BloomReducer extends Reducer<Text, Text, Text, Text> {
        ArrayList<Text> leftTable = new ArrayList<Text>();
        ArrayList<Text> rightTable = new ArrayList<Text>();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            leftTable.clear();
            rightTable.clear();
            // Split the tagged values back into customer ("0") and order ("1") records.
            for (Text val : values) {
                String outVal = val.toString();
                System.out.println("key: " + key.toString() + " : " + outVal);
                int index = outVal.indexOf(":");
                String flag = outVal.substring(0, index);
                if ("0".equals(flag)) {
                    leftTable.add(new Text(outVal.substring(index + 1)));
                } else if ("1".equals(flag)) {
                    rightTable.add(new Text(outVal.substring(index + 1)));
                }
            }
            // Emit the cross product: one output line per customer/order pair.
            if (leftTable.size() > 0 && rightTable.size() > 0) {
                for (Text left : leftTable) {
                    for (Text right : rightTable) {
                        context.write(key, new Text(left.toString() + "," + right.toString()));
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: BloomMRMain <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "BloomMRMain");
        job.setJarByClass(BloomMRMain.class);

        job.setMapperClass(BloomMapper.class);
        job.setReducerClass(BloomReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
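
    One way to compile and run the job against Hadoop 0.20.2 is sketched below. The jar name bloommr.jar and the classes directory are placeholders, and the HDFS paths follow the location hard-coded in setup(), so adjust both to your environment:

    mkdir classes
    javac -classpath $HADOOP_HOME/hadoop-0.20.2-core.jar -d classes BloomMRMain.java
    jar cvf bloommr.jar -C classes .
    hadoop fs -mkdir /user/hezhixue/input
    hadoop fs -put customers.txt orders.txt /user/hezhixue/input
    hadoop jar bloommr.jar BloomMRMain /user/hezhixue/input /user/hezhixue/output
    hadoop fs -cat /user/hezhixue/output/part-r-00000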

