Counting word occurrences with MapReduce

The job is made up of four classes: a driver (wordCount) that configures and submits the job, a mapper (myMapper) that emits prefixed keys, a custom partitioner (myPartitioner) that routes "loglevel::" keys and "logresource::" keys to two different reducers, and a reducer (myReducer) that sums the counts for each key.

wordCount.java

package wordCount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class wordCount {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        FileSystem fs = FileSystem.get(configuration);

        Job job = new Job(configuration);
        job.setJarByClass(wordCount.class);

        // Input directory on HDFS
        FileInputFormat.addInputPath(job, new Path("/file"));

        // Delete the output directory if it already exists, otherwise the job would fail
        if (fs.exists(new Path("/result"))) {
            System.out.println("The output path already exists; it will be deleted and recreated!");
            fs.delete(new Path("/result"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("/result"));

        job.setMapperClass(myMapper.class);
        job.setReducerClass(myReducer.class);

        // Two reduce tasks, one per key prefix (see myPartitioner)
        job.setNumReduceTasks(2);
        job.setPartitionerClass(myPartitioner.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
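As a side note, the new Job(Configuration) constructor is deprecated in Hadoop 2.x. Below is a minimal sketch of the same driver using Job.getInstance and taking the paths from the command line; the class name wordCountDriver and the args[0]/args[1] convention are assumptions for illustration, not part of the original code.

package wordCount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical variant of the driver above, using the non-deprecated Job.getInstance API
public class wordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path(args[0]);   // assumption: input path passed on the command line
        Path output = new Path(args[1]);  // assumption: output path passed on the command line

        // Remove a stale output directory so the job can run
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(wordCountDriver.class);
        job.setMapperClass(myMapper.class);
        job.setReducerClass(myReducer.class);
        job.setNumReduceTasks(2);
        job.setPartitionerClass(myPartitioner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, input);
        FileOutputFormat.setOutputPath(job, output);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}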

myMapper.java

package wordCount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class myMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on spaces: field 1 is treated as the log level, field 2 as the log resource
        String[] str = value.toString().split(" ");
        if (str.length < 3) {
            return; // skip lines too short to contain both fields
        }

        Text ss = new Text();

        // Emit the log level with a "loglevel::" prefix so the partitioner can route it
        ss.set("loglevel::" + str[1]);
        context.write(ss, new IntWritable(1));

        // Emit the resource with a "logresource::" prefix
        ss.set("logresource::" + str[2]);
        context.write(ss, new IntWritable(1));

        // Alternative (left commented out in the original): count every token as a word
        // for (String word : str) {
        //     if (word != null && !word.isEmpty()) {
        //         context.write(new Text(word), new IntWritable(1));
        //     }
        // }
    }
}
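To see what the mapper emits, here is a small hypothetical stand-alone check (not part of the original post) that reproduces the split logic on one sample line; the exact line format is an assumption based on the indexes the mapper reads.

package wordCount;

// Hypothetical helper: prints the key/value pairs the mapper would emit for one sample line.
// The sample line format is assumed from the str[1]/str[2] usage above.
public class mapperDemo {
    public static void main(String[] args) {
        String line = "2017-06-05 INFO org.apache.hadoop.mapreduce.Job some message";
        String[] str = line.split(" ");
        System.out.println("loglevel::" + str[1] + "\t1");     // prints: loglevel::INFO	1
        System.out.println("logresource::" + str[2] + "\t1");  // prints: logresource::org.apache.hadoop.mapreduce.Job	1
    }
}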

myPartitioner.java

package wordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class myPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Route "loglevel::" keys to reducer 0 and "logresource::" keys to reducer 1
        if (key.toString().startsWith("loglevel::"))
            return 0;
        if (key.toString().startsWith("logresource::"))
            return 1;
        return 0;
    }
}
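The routing can be sanity-checked without a cluster by calling getPartition directly; the sketch below is an assumption about how one might test it locally (it only needs the Hadoop client jars on the classpath) and is not part of the original post.

package wordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical check: with 2 reduce tasks, "loglevel::" keys should land in
// partition 0 and "logresource::" keys in partition 1.
public class partitionerDemo {
    public static void main(String[] args) {
        myPartitioner p = new myPartitioner();
        System.out.println(p.getPartition(new Text("loglevel::INFO"), new IntWritable(1), 2));          // prints 0
        System.out.println(p.getPartition(new Text("logresource::FileSystem"), new IntWritable(1), 2)); // prints 1
    }
}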

myReducer.java

package wordCount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class myReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the 1s emitted by the mapper for this key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
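Because this reduce function is just a sum, it is associative and commutative, so it could presumably also be reused as a combiner to shrink the map output before the shuffle. That would be a one-line addition in the driver's main(); it is a suggestion, not part of the original code.

// Optional (assumption, not in the original driver): run the reducer as a combiner on the map side
job.setCombinerClass(myReducer.class);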