Computing Word Weights with MapReduce
Goal: compute the weight of each word in each weibo post (the classic TF-IDF weighting).
Approach:
Formula: weight = TF * ln(N / DF)
TF: the number of times the word occurs in the current post
N: the total number of posts
DF: the number of posts in which the word occurs
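For example, with the 9-post test data below, N = 9; the word 九阳 shows up in 3 posts, so in a post where it appears once its weight is 1 * ln(9/3) ≈ 1.0986. A minimal sketch of the arithmetic (WeightDemo is a throwaway class name; the DF of 3 assumes IK segments 九阳 as a single token):

public class WeightDemo {
    public static void main(String[] args) {
        double tf = 1; // 九阳 appears once in post 3823890419856548
        double n = 9;  // 9 posts in the test data
        double df = 3; // 九阳 occurs in 3 of the 9 posts
        System.out.println(tf * Math.log(n / df)); // ln(3) ≈ 1.0986
    }
}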
When coding, be especially careful not to import the wrong packages; otherwise all kinds of strange errors appear:
1. Test data

Each record is a weibo ID and the post content, separated by a tab:

3823890335901756	今天是今年最暖和的一天,果断出来逛街!
3823890364788305	春天来了,约好友一起出去去踏青,去赏花!
3823890369489295	我在平湖,让你开挂练九阳真经,走火入魔毁了三叉神经了吧,改练九阴真经吧小子。 (免费下载 )
3823890373686361	约了小伙伴一起去理发!
3823890378201539	今天约了姐妹去逛街吃美食,周末玩得很开心啊!
3823890382081678	这几天一直在约,因为感冒发烧了,所以和老公约好了陪我去打针,求九阳安慰,我想喝豆浆,药好苦的
3823890399188850	和吃货的约会么就是吃
3823890419856548	全国包邮!九阳
3823890436963972	我亲爱的
2. Code
FirstMapper:
import java.io.IOException;
import java.io.StringReader;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

// Counts how many times each word occurs in each post (the TF in the formula)
// and also emits the records used to compute N (the total number of posts).
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] v = value.toString().split("\t");
        if (v.length >= 2) {
            String id = v[0].trim();
            String content = v[1].trim();
            StringReader sr = new StringReader(content);
            // true = smart mode: prefer the coarsest-grained segmentation
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                // Key is word_postId, so the reducer sums TF per word per post
                context.write(new Text(w + "_" + id), new IntWritable(1));
            }
            sr.close();
            // One "count" record per post; summed later, this gives N
            context.write(new Text("count"), new IntWritable(1));
        } else {
            System.out.println(value.toString() + "------------------------------");
        }
    }
}
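To sanity-check what the mapper will emit, it helps to run the segmenter locally first. A minimal sketch (IKDemo is a scratch class, assuming only that the IK Analyzer jar is on the classpath):

import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKDemo {
    public static void main(String[] args) throws Exception {
        // Segment the first test record's content and print each token
        String content = "今天是今年最暖和的一天,果断出来逛街!";
        IKSegmenter seg = new IKSegmenter(new StringReader(content), true);
        Lexeme word;
        while ((word = seg.next()) != null) {
            System.out.println(word.getLexemeText());
        }
    }
}

For each token w the mapper then writes (w_3823890335901756, 1), plus one ("count", 1) per record.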
FirstReduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the 1s for each key. For word_postId keys the sum is that word's TF;
// for the "count" key the sum is N. Because addition is associative, the
// same class can safely be reused as the combiner.
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        if (key.equals(new Text("count"))) {
            System.out.println(key.toString() + "__________" + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}
FirstPartition:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
// Be careful to import the mapreduce HashPartitioner; accidentally importing
// org.apache.hadoop.mapred.lib.HashPartitioner instead caused several test
// runs to produce no output at all.
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class FirstPartition extends HashPartitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        if (key.equals(new Text("count"))) {
            // Route the single N counter to the last reducer (part-r-00003)
            return 3;
        } else {
            // Default HashPartitioner behavior: hash modulo the remaining
            // reducer count, so word keys spread over partitions 0-2
            return super.getPartition(key, value, numReduceTasks - 1);
        }
    }
}
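The routing can be checked locally without a cluster. A quick sketch (PartitionCheck is a hypothetical scratch class), assuming the 4 reduce tasks configured in FirstJob below:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        FirstPartition p = new FirstPartition();
        // The special counter key is pinned to partition 3
        System.out.println(p.getPartition(new Text("count"), new IntWritable(1), 4));
        // Ordinary word keys hash into partitions 0, 1 or 2
        System.out.println(p.getPartition(new Text("今天_3823890335901756"), new IntWritable(1), 4));
    }
}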
FirstJob:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstJob {

    public static void main(String[] args) throws ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo1");
            job.setJarByClass(FirstJob.class);
            // Key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Four reducers: three for word keys, one reserved for "count"
            job.setNumReduceTasks(4);
            // Accidentally importing SecondarySort.FirstPartitioner here once
            // caused the job to write no results; make sure this is our class
            job.setPartitionerClass(FirstPartition.class);
            job.setMapperClass(FirstMapper.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);
            // HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/input/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo1"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 1 succeeded");
                TwoJob.mainJob();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
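If the job succeeds, /output/weibo1 holds part-r-00000 through part-r-00002 with the per-post TF of each word, and part-r-00003 with nothing but the post counter. A sketch of the contents for the test data above (assuming every record parses and that IK emits 逛街 as one token):

逛街_3823890335901756	1    (in one of part-r-00000..00002)
count	9                    (alone in part-r-00003)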
TwoMapper:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Computes each word's DF: for every word_postId line from job 1, emit
// (word, 1); the reducer then counts how many posts the word appeared in.
public class TwoMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Skip part-r-00003, which holds only the "count" record
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    // Each word_postId line means the word appeared in one more post
                    context.write(new Text(w), new IntWritable(1));
                }
            } else {
                System.out.println(value.toString() + "---------------");
            }
        }
    }
}
TwoReduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the 1s per word, giving that word's DF.
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
TwoJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoJob {

    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo2");
            job.setJarByClass(TwoJob.class);
            // Key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(TwoMapper.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);
            // Job 2 reads job 1's output directory
            FileInputFormat.addInputPath(job, new Path("/output/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo2"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 2 succeeded");
                LastJob.mainJob();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
LastMapper:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Combines TF (job 1 output), N (the "count" record) and DF (job 2 output)
// into the final weight TF * ln(N / DF) for each word in each post.
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {

    public static Map<String, Integer> cmap = null; // holds N
    public static Map<String, Integer> df = null;   // word -> DF

    // Runs once per task before map(); loads the two small files shipped via
    // the distributed cache. Cache files are symlinked into the task's
    // working directory, so opening them by name with FileReader works.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {
            URI[] ss = context.getCacheFiles();
            if (ss != null) {
                for (int i = 0; i < ss.length; i++) {
                    URI uri = ss[i];
                    if (uri.getPath().endsWith("part-r-00003")) {
                        // Single line: count<TAB>N
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = br.readLine();
                        if (line.startsWith("count")) {
                            String[] ls = line.split("\t");
                            cmap = new HashMap<String, Integer>();
                            cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    } else if (uri.getPath().endsWith("part-r-00000")) {
                        // One line per word: word<TAB>DF
                        df = new HashMap<String, Integer>();
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = null;
                        while ((line = br.readLine()) != null) {
                            String[] ls = line.split("\t");
                            df.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    }
                }
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Skip the "count" partition; only word_postId TF lines are wanted
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                int tf = Integer.parseInt(v[1].trim());
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    String id = ss[1];
                    // Cast to double so N / DF is not integer division,
                    // which would silently truncate the weights
                    double s = tf * Math.log((double) cmap.get("count") / df.get(w));
                    NumberFormat nf = NumberFormat.getInstance();
                    nf.setMaximumFractionDigits(5);
                    context.write(new Text(id), new Text(w + ":" + nf.format(s)));
                }
            } else {
                System.out.println(value.toString() + "-----------------");
            }
        }
    }
}
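One detail worth pausing on in LastMapper: cmap.get("count") and df.get(w) unbox to int, so without the cast the quotient is truncated before Math.log ever sees it. A tiny illustration with made-up values:

public class DivisionPitfall {
    public static void main(String[] args) {
        int n = 9, df = 4; // made-up N and DF
        System.out.println(Math.log(n / df));          // int division: ln(2) ≈ 0.693
        System.out.println(Math.log((double) n / df)); // correct: ln(2.25) ≈ 0.811
    }
}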
LastReduce:
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Concatenates all word:weight pairs for one post into a single output line.
public class LastReduce extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text text : value) {
            sb.append(text.toString() + "\t");
        }
        context.write(key, new Text(sb.toString()));
    }
}
LastJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LastJob {

    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo3");
            job.setJarByClass(LastJob.class);
            // Ship the small N and DF files to every mapper via the
            // distributed cache
            job.addCacheFile(new Path("/output/weibo1/part-r-00003").toUri());
            job.addCacheFile(new Path("/output/weibo2/part-r-00000").toUri());
            // Key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setCombinerClass(LastReduce.class);
            job.setReducerClass(LastReduce.class);
            // Job 3 reads job 1's TF output again
            FileInputFormat.addInputPath(job, new Path("/output/weibo1/"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo3"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 3 succeeded");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Result:
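Each line of /output/weibo3/part-r-00000 is a weibo ID followed by tab-separated word:weight pairs. The exact tokens and values depend on IK's segmentation, so the line below is only illustrative (a word occurring in a single post out of 9 would get weight 1 * ln(9/1) ≈ 2.19722):

3823890373686361	小伙伴:2.19722	理发:2.19722	...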