Computing Word Weights with MapReduce
Goal: compute the weight of each word in each weibo post (the classic TF-IDF weighting).
Approach:
Formula: weight = TF * ln(N / DF)
TF: the number of times the word occurs in the current post
N: the total number of posts
DF: the number of posts in which the word occurs
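For example, with the 9-post test data below, N = 9; the word 九阳 shows up in 3 posts, so in a post where it appears once its weight is 1 * ln(9/3) ≈ 1.0986. A minimal sketch of the arithmetic (WeightDemo is a throwaway class name; the DF of 3 assumes IK segments 九阳 as a single token):

public class WeightDemo {
    public static void main(String[] args) {
        double tf = 1; // 九阳 appears once in post 3823890419856548
        double n = 9;  // 9 posts in the test data
        double df = 3; // 九阳 occurs in 3 of the 9 posts
        System.out.println(tf * Math.log(n / df)); // ln(3) ≈ 1.0986
    }
}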
When coding, be especially careful not to import the wrong packages; otherwise all kinds of strange errors appear:
1. Test data

Each record is a weibo ID and the post content, separated by a tab:

3823890335901756	今天是今年最暖和的一天,果断出来逛街!
3823890364788305	春天来了,约好友一起出去去踏青,去赏花!
3823890369489295	我在平湖,让你开挂练九阳真经,走火入魔毁了三叉神经了吧,改练九阴真经吧小子。 (免费下载 )
3823890373686361	约了小伙伴一起去理发!
3823890378201539	今天约了姐妹去逛街吃美食,周末玩得很开心啊!
3823890382081678	这几天一直在约,因为感冒发烧了,所以和老公约好了陪我去打针,求九阳安慰,我想喝豆浆,药好苦的
3823890399188850	和吃货的约会么就是吃
3823890419856548	全国包邮!九阳
3823890436963972	我亲爱的
2. Code
FirstMapper:
import java.io.IOException;
import java.io.StringReader;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

// Counts how many times each word occurs in each post (the TF in the formula)
// and also emits the records used to compute N (the total number of posts).
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] v = value.toString().split("\t");
        if (v.length >= 2) {
            String id = v[0].trim();
            String content = v[1].trim();
            StringReader sr = new StringReader(content);
            // true = smart mode: prefer the coarsest-grained segmentation
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                // Key is word_postId, so the reducer sums TF per word per post
                context.write(new Text(w + "_" + id), new IntWritable(1));
            }
            sr.close();
            // One "count" record per post; summed later, this gives N
            context.write(new Text("count"), new IntWritable(1));
        } else {
            System.out.println(value.toString() + "------------------------------");
        }
    }
}
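To sanity-check what the mapper will emit, it helps to run the segmenter locally first. A minimal sketch (IKDemo is a scratch class, assuming only that the IK Analyzer jar is on the classpath):

import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKDemo {
    public static void main(String[] args) throws Exception {
        // Segment the first test record's content and print each token
        String content = "今天是今年最暖和的一天,果断出来逛街!";
        IKSegmenter seg = new IKSegmenter(new StringReader(content), true);
        Lexeme word;
        while ((word = seg.next()) != null) {
            System.out.println(word.getLexemeText());
        }
    }
}

For each token w the mapper then writes (w_3823890335901756, 1), plus one ("count", 1) per record.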
FirstReduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the 1s for each key. For word_postId keys the sum is that word's TF;
// for the "count" key the sum is N. Because addition is associative, the
// same class can safely be reused as the combiner.
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        if (key.equals(new Text("count"))) {
            System.out.println(key.toString() + "__________" + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}
FirstPartition:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
// Be careful to import the mapreduce HashPartitioner; accidentally importing
// org.apache.hadoop.mapred.lib.HashPartitioner instead caused several test
// runs to produce no output at all.
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class FirstPartition extends HashPartitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        if (key.equals(new Text("count"))) {
            // Route the single N counter to the last reducer (part-r-00003)
            return 3;
        } else {
            // Default HashPartitioner behavior: hash modulo the remaining
            // reducer count, so word keys spread over partitions 0-2
            return super.getPartition(key, value, numReduceTasks - 1);
        }
    }
}
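The routing can be checked locally without a cluster. A quick sketch (PartitionCheck is a hypothetical scratch class), assuming the 4 reduce tasks configured in FirstJob below:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        FirstPartition p = new FirstPartition();
        // The special counter key is pinned to partition 3
        System.out.println(p.getPartition(new Text("count"), new IntWritable(1), 4));
        // Ordinary word keys hash into partitions 0, 1 or 2
        System.out.println(p.getPartition(new Text("今天_3823890335901756"), new IntWritable(1), 4));
    }
}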
FirstJob:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstJob {

    public static void main(String[] args) throws ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo1");
            job.setJarByClass(FirstJob.class);
            // Key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Four reducers: three for word keys, one reserved for "count"
            job.setNumReduceTasks(4);
            // Accidentally importing SecondarySort.FirstPartitioner here once
            // caused the job to write no results; make sure this is our class
            job.setPartitionerClass(FirstPartition.class);
            job.setMapperClass(FirstMapper.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);
            // HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/input/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo1"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 1 succeeded");
                TwoJob.mainJob();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
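If the job succeeds, /output/weibo1 holds part-r-00000 through part-r-00002 with the per-post TF of each word, and part-r-00003 with nothing but the post counter. A sketch of the contents for the test data above (assuming every record parses and that IK emits 逛街 as one token):

逛街_3823890335901756	1    (in one of part-r-00000..00002)
count	9                    (alone in part-r-00003)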
TwoMapper:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Computes each word's DF: for every word_postId line from job 1, emit
// (word, 1); the reducer then counts how many posts the word appeared in.
public class TwoMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Skip part-r-00003, which holds only the "count" record
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    // Each word_postId line means the word appeared in one more post
                    context.write(new Text(w), new IntWritable(1));
                }
            } else {
                System.out.println(value.toString() + "---------------");
            }
        }
    }
}
TwoReduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the 1s per word, giving that word's DF.
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
TwoJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoJob {

    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo2");
            job.setJarByClass(TwoJob.class);
            // Key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(TwoMapper.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);
            // Job 2 reads job 1's output directory
            FileInputFormat.addInputPath(job, new Path("/output/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo2"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 2 succeeded");
                LastJob.mainJob();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
LastMapper:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Combines TF (job 1 output), N (the "count" record) and DF (job 2 output)
// into the final weight TF * ln(N / DF) for each word in each post.
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {

    public static Map<String, Integer> cmap = null; // holds N
    public static Map<String, Integer> df = null;   // word -> DF

    // Runs once per task before map(); loads the two small files shipped via
    // the distributed cache. Cache files are symlinked into the task's
    // working directory, so opening them by name with FileReader works.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {
            URI[] ss = context.getCacheFiles();
            if (ss != null) {
                for (int i = 0; i < ss.length; i++) {
                    URI uri = ss[i];
                    if (uri.getPath().endsWith("part-r-00003")) {
                        // Single line: count<TAB>N
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = br.readLine();
                        if (line.startsWith("count")) {
                            String[] ls = line.split("\t");
                            cmap = new HashMap<String, Integer>();
                            cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    } else if (uri.getPath().endsWith("part-r-00000")) {
                        // One line per word: word<TAB>DF
                        df = new HashMap<String, Integer>();
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = null;
                        while ((line = br.readLine()) != null) {
                            String[] ls = line.split("\t");
                            df.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    }
                }
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Skip the "count" partition; only word_postId TF lines are wanted
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                int tf = Integer.parseInt(v[1].trim());
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    String id = ss[1];
                    // Cast to double so N / DF is not integer division,
                    // which would silently truncate the weights
                    double s = tf * Math.log((double) cmap.get("count") / df.get(w));
                    NumberFormat nf = NumberFormat.getInstance();
                    nf.setMaximumFractionDigits(5);
                    context.write(new Text(id), new Text(w + ":" + nf.format(s)));
                }
            } else {
                System.out.println(value.toString() + "-----------------");
            }
        }
    }
}
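One detail worth pausing on in LastMapper: cmap.get("count") and df.get(w) unbox to int, so without the cast the quotient is truncated before Math.log ever sees it. A tiny illustration with made-up values:

public class DivisionPitfall {
    public static void main(String[] args) {
        int n = 9, df = 4; // made-up N and DF
        System.out.println(Math.log(n / df));          // int division: ln(2) ≈ 0.693
        System.out.println(Math.log((double) n / df)); // correct: ln(2.25) ≈ 0.811
    }
}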
LastReduce:
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Concatenates all word:weight pairs for one post into a single output line.
public class LastReduce extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text text : value) {
            sb.append(text.toString() + "\t");
        }
        context.write(key, new Text(sb.toString()));
    }
}
LastJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LastJob {

    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo3");
            job.setJarByClass(LastJob.class);
            // Ship the small N and DF files to every mapper via the
            // distributed cache
            job.addCacheFile(new Path("/output/weibo1/part-r-00003").toUri());
            job.addCacheFile(new Path("/output/weibo2/part-r-00000").toUri());
            // Key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setCombinerClass(LastReduce.class);
            job.setReducerClass(LastReduce.class);
            // Job 3 reads job 1's TF output again
            FileInputFormat.addInputPath(job, new Path("/output/weibo1/"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo3"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 3 succeeded");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Result:
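Each line of /output/weibo3/part-r-00000 is a weibo ID followed by tab-separated word:weight pairs. The exact tokens and values depend on IK's segmentation, so the line below is only illustrative (a word occurring in a single post out of 9 would get weight 1 * ln(9/1) ≈ 2.19722):

3823890373686361	小伙伴:2.19722	理发:2.19722	...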