MapReduce 编程模型在日志分析方面的应用
来源:互联网 发布:手机改ip软件 编辑:程序博客网 时间:2024/06/06 19:02
public class TFCal extends Configured implements Tool, Mapper,Reducer{ public void map(Text usr, Text url, OutputCollector output, Reporter reporter)throws IOException { Text[] words = callCrawl(url); // 调用爬虫程序 for(Text word: words) // 每个词进行输出 output.collect(usr + url + word, new IntWritable(1)); } public void reduce(Text key, Iterator iter,OutputCollector<text, IntWritable> output, Reporter reporter) throws IOException { tf = iter 中包含元素的数目 ; output.collect(key, tf); } public void runCal(Path input, Path output) throws IOException { JobConf job = new JobConf(getConf(), TFCal.class); job.setInputPath(input); job.setOutputPath(output); job.setMapperClass(TFCal.class); job.setMapperClass(TFCal.class); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); JobClient.runJob(job); } }
public class TFCal2 extends Configured implements Tool, Mapper<text, text,="" IntWritable>{ public void map(Text usr, Text url, OutputCollector output, Reporter reporter)throws IOException { HashMap wordCount = new HashMap(); //HashMap 统计 TF Text[] words = callCrawl(url); // 调用爬虫程序 for(Text word: words){ // 统计词次数信息 int cnt = wordCount.get(word); wordCount.put(word,(cnt>0)?(cnt+1):1); } Iterator iter = wordCount.entrySet().iterator(); while(iter.hasNext()){ Map.Entry entry = iter.next(); // Map 输出,key 为用户 +url+ 词,value 为 TF output.collect(usr + url + entry.getKey(), entry.getValue()); } } public void runCal(Path input, Path output) throws IOException { JobConf job = new JobConf(getConf(), TFCal2.class); 设置 InputPath, outPath, MapperClass, InputFormat, OutputFormat, … job.setReduceNum(0); //Reduce 数目设为 0,不进行 Reduce 操作。 JobClient.runJob(job); } }
public class DFCal extends Configured implements Tool, Mapper<text, text,="" intwritable,="" IntWritable>,Reducer{ public void map(Text key, IntWritable url, OutputCollector output, Reporter reporter)throws IOException { 将 key 拆分成 user,url,word 三部分 output.collect(user+word, new IntWritable(1); } public void reduce(Text key, Iterator iter, OutputCollector<text, LuceneDocumentWrapper> output, Reporter reporter)throws IOException { int df = iter 中包含元素数目 ; // 建立 Lucene 索引,以 user+word 为 key,DF 作为 value,进行存储 Document doc = new Document(); doc.add(new Field("word", key.toString(), Field.Store.NO, Field.Index.UN_TOKENIZED)); doc.add(new Field("DF", df, Field.Store.YES,Field.Index.NO)); output.collect(new Text(), new LuceneDocumentWrapper(doc)); } public void runDFCal(Path input, Path output) throws IOException { JobConf job = new JobConf(getConf(), DFCal.class); 设置 InputPath, outPath, MapperClass, InputFormat, … job.setOutputFormat(LuceneOutputFormat); // 设置输出格式为 LuceneOutputFormat JobClient.runJob(job); 合并各个 reduce 生成的索引文件为一个完整索引文件(Lucene 的 IndexWriter 类提供了相应接口) } … . }
public class LuceneDocumentWrapper implements Writable { private Document doc; public LuceneDocumentWrapper(Document doc) { this.doc = doc; } public void set(Document doc_) { doc = doc_; } public Document get() { return doc; } public void readFields(DataInput in) throws IOException { // intentionally left blank } public void write(DataOutput out) throws IOException { // intentionally left blank } } public class OutputFormat extends org.apache.hadoop.mapred.FileOutputFormat { public RecordWriter getRecordWriter(final FileSystem fs,JobConf job, String name, final Progressable progress) throws IOException { final Path perm = new Path(FileOutputFormat.getOutputPath(job), name); final Path temp = job.getLocalPath("index/_" + Integer.toString( new Random().nextInt())); // 设置临时输出路径为 Reduce 节点本地局部路径 final IndexWriter writer = new IndexWriter(fs.startLocalOutput(perm, temp).toString(), new StandardAnalyzer(), true); // 初始化 IndexWriter return new RecordWriter() { public void write(WritableComparable key, LuceneDocumentWrapper value) throws IOException { // 将 document 加入到索引之中 Document doc = value.get(); writer.addDocument(doc); progress.progress(); } public void close(final Reporter reporter) throws IOException { boolean closed = false; // 标识索引是否已经输出完毕 Thread prog = new Thread() { public void run() { 如果索引未输出完毕 closed != true,保持等待,并设置 reporter 状态为 closing } }; try { prog.start(); writer.optimize(); // 索引进行优化并关闭 writer.close(); 拷贝本地输出至全局文件系统 HDFS 中 }finally{ closed = true; } } }; } }
public class KeyWordCal extends Configured implements Tool, Mapper<text, text,="" IntWritable>{ String fWeights[]; // 记录特征权重 IndexSearcher searcher = null; // 用于查询 Lucene 索引文件 public void map(Text key, Text wordInfo, OutputCollector output, Reporter reporter)throws IOException { 解析 key,从中得到 word 信息 // 查找索引文件,得到 DF Term t = new Term("word", word); Query query = new TermQuery(t); Hits hits = searcher.search(query); if (hits.length() == 1) { Document doc = hits.doc(0); String df = doc.get(“DF”); 从 wordInfo 中提取出来每个特征对应取值 , 存储在数组 val 中 weight = sum(val[i] × fWeights[i]); // 计算该词作为关键词权重 if(weight >= threshold) // 权重大于阈值的视为网页关键词 output.collect(key, new Writable(1)); // 关键词输出,key 包含用户 + 关键词,value 为 1 } } // configure 函数会在每个 Map 节点运行 Map 函数对文件按行处理之前调用,通常用来做全局操作 public void configure(JobConf job) { String fWeightPath = job.getStrings(“fWeight.path”)[0]; /// 内部获得特征权重路径 读取特征权重文件,得到特征权重列表,填入 fWeights; String dfPath = job.getStrings(“DF.path”)[0]; FsDirectory fsDirectory = new FsDirectory(FileSystem.get(getConf()),dfpath, false, getConf()); searcher = new IndexSearcher(fsDirectory); } public void runkeyWordCal(String input, String output, String DFPath){ String featureWeightFile; SCGIS(featureWeightFile); // 调用机器学习算法,计算特征权重,并将权重存储在指定文件中 JobConf job = new JobConf(getConf(),KeyWordCal.class); 设置 InputPath, outPath, MapperClass, InputFormat, OutputFormat, … job.setStrings(“fWeight.path”, featureWeightFile);// 设置参数,以传入 Map 和 configure job.setStrings(“DF.path”, DFPath); // 设置 DF 索引文件位置 JobClient.run(job); } … . }
public class UserWordCal1 extends Configured implements Tool, Reducer<text, intwritable,="" Text,Text>{ public void reduce(Text key, Iterator iter, OutputCollector output, Reporter reporter)throws IOException { 解析 key,分别得到 user 信息和 word 信息 output.collect(user, new Text(word + iter 中包含元素的个数 )); //value 为用户访问该词次数 } … . } public class UserWordCal2 extends Configured implements Tool, Reducer<text, text,="" Text>{ public void reduce(Text key, Iterator iter, OutputCollector output, Reporter reporter)throws IOException { Struct Word; // 定义一个数据结构,包含两项,分别存储 word 和次数信息 ArrayList wList; 遍历 iter,将访问词的信息填入 wList; QuickSort(wList); // 对 wList 按次数排序 Normalize(wList); // 对 wList 进行权重归一化 String wordInfo = “”; for(Word word: wList) // 将词和对应的权重信息拼接 wordInfo = wordInfo + word + word.getWeight(); output.collect(user, new Text(wordInfo)); } … . }
public class WordsCorrCal1 extends Configured implements Tool, Mapper<text, Text, Text, IntWritable>, Reducer{ public void map(Text key, Text url, OutputCollector output, Reporter reporter)throws IOException { Text[] words = callCrawlKeyWord(url); // 对网页爬虫,获取 meta 中 keyword 域的词 for(int i = 0; i < words.length(); i++) for(int j = i + 1; j < words.length(); j++) output.collect((words[i] < words[j])? ( words[i] + words[j]) : (words[j] + words[i]), new IntWriable(1)); } public void reduce(Text key, Iterator iter, OutputCollector<text, IntWritable> output, Reporter reporter)throws IOException { output.collect(key, iter 中包含元素的数目 ); //value 为两个词共现次数 } … . }
public class WordsCorrCal2 extends Configured implements Tool, Mapper<text, IntWritable, Text, IntWritable>, Reducer<text, text,="" intwritable,="" LuceneDocumentWrapper >{ public void map(Text wordPair, IntWritable cnt, OutputCollector<text, IntWritable> output, Reporter reporter)throws IOException { 将 wordPair 分为两个词 word1, word2 output.collect(word1, cnt); output.collect(word2, cnt); } public void reduce(Text key, Iterator iter, OutputCollector<text, LuceneDocumentWrapper> output, Reporter reporter)throws IOException { int wordCnt = 0; while(iter.hasNext()) wordCnt += iter.next().get(); // 建立 Lucene 索引,以 word 为 key,出现次数作为 value,进行存储 Document doc = new Document(); doc.add(new Field("word", key.toString(), Field.Store.NO, Field.Index.UN_TOKENIZED)); doc.add(new Field("count", wordCnt, Field.Store.YES,Field.Index.NO)); output.collect(new Text(), new LuceneDocumentWrapper(doc)); } … . }
public class WordsCorrCal3 extends Configured implements Tool, Mapper<text, IntWritable, Text, FloatWritable >{ public void map(Text wordPair, IntWritable cnt, OutputCollector<text, FloatWritable > output, Reporter reporter)throws IOException { 将 wordPair 分为两个词 word1, word2 查找 Lucene 索引文件 , 得到 word1 出现次数 cnt1 查找 Lucene 索引文件 , 得到 word2 出现次数 cnt2 计算 Pij。Pij = cnt/(cnt1 + cnt2 – cnt); output.collect(wordPair, new FloatWritable(Pij)); } … . }
public class WordsCorrCal_21 extends Configured implements Tool, Mapper<text, IntWritable, Text, Text>, Reducer{ public void map(Text key, IntWritable cnt, OutputCollector output, Reporter reporter)throws IOException { String[] words = key.toString.split(“[\t]”); // 如果对应的是词对的输入文件 if(words.length() == 2){ output.collect(new Text(words[0]), new Text( words[1] + ”\t” + cnt + “\t”)); output.collect(new Text(words[1]), new Text( words[0] + ”\t” + cnt + “\t”)}; ) else if(words.length() == 1) { // 如果对应的是单个词的输入文件 output.collect(key, new Text(“\t” + cnt); ) } public void reduce(Text key, Iterator iter, OutputCollector<text, IntWritable> output,Reporter reporter)throws IOException { ArrayList corrWords = new ArrayList(); int wordCnt; while(iter.hasNext()){ String val = iter.next().toString(); String[] vals = val.split(“[\t]”); if(vals.length() == 2) //val 存储的是单个词出现次数 wordCnt = Integer.parse(vals[1]); else //val 存储的是词对的信息,前两项分别是共现词及共现次数 corrWords.add(vals[0]+”\t”+vals[1]); ) for(String corrWord: corrWords){ // 输出 key 为:词 1+ 词 2+ 共现次数; //输出 value:单个词次数 String[] cor = corrWords.split(“[\t]”); output.collect((key < cor[0])?(key + “\t” + corrWord):( cor[0] + “\t” + key + cor[1]),wordCnt); } } … . }
// NOTE(review): this listing is truncated in the source article — the reduce
// body ends mid-statement at "int word1". Presumably it recombines the joined
// single-word counts and pair counts to compute Pij = cnt/(cnt1+cnt2-cnt),
// mirroring WordsCorrCal3, but the full code must be recovered before use.
public class WordsCorrCal_22 extends Configured implements Tool, Reducer<text, IntWritable,Text, FloatWritable >{ public void reduce(Text key, Iterator iter, OutputCollector output,Reporter reporter)throws IOException { int word1
public class WordsCorrCal_23 extends Configured implements Tool, Mapper<text, FloatWritable, Text, Text>, Reducer{ public void map(Text wordPair, FloatWritable corr, OutputCollector output,Reporter reporter)throws IOException { 将 key 解析成 word1,word2 output.collect(new Text(word1), new Text(word2 + “\t” + corr.get()); output.collect(new Text(word2), new Text(word1 + “\t” + corr.get()); } public void reduce(Text key, Iterator iter, OutputCollector<text, LuceneDocumentWrapper> output,Reporter reporter)throws IOException { String corrInfo = “”; while(iter.hasNext()) corrInfo = corrInfo + iter.next() + “\t”; // 建立 Lucene 索引,以 word 为 key,共现词信息作为 value,进行存储 Document doc = new Document(); doc.add(new Field("word", key.toString(), Field.Store.NO, Field.Index.UN_TOKENIZED)); doc.add(new Field("corrInfo", corrInfo, Field.Store.YES,Field.Index.NO)); output.collect(new Text(), new LuceneDocumentWrapper(doc)); } … . }
public class WordExp extends Configured implements Tool, Mapper<text, text,="" Text, Text >{ IndexSearcher searcher = null; // 用于查询 Lucene 索引文件 public void map(Text key, Text val, OutputCollector output,Reporter reporter)throws IOException { HashMap words; //key 为词,value 为用户访问该词的权重 HashMap wordNewInfo; // 存储调整后的列表信息 将 val 关键词信息进行解析,依次置入 words; 拷贝 words 中信息至 wordNewInfo 中 ; for(words 中每一个关键词 word){ float w1 = words.get(word); 查找 Lucene 索引文件,得到该词相关词列表 corrWords; for(corrWords 中每个词 corrW){ // 如果 corrW 也被用户访问,修改两个词的权重 if((float w2 = words.get(corrW)) != null){ wordsNewInfo.put(word, wordsNewInfo.get(word) + w2 * corrW.pij); wordsNewInfo.put(corrW, wordsNewInfo.get(corrW) + w1 * corrW.pij); }else{ // 如果未被访问,将词加入到用户访问列表中 wordsNewInfo.put(corrW, w1 * corrW.pij); } } } String wordListNew = “”; for(wordNewInfo 中每个元组 entry) wordListNew = wordListNew + entry.getKey() + entry.getVal(); output.collect(key, new Text(wordListNew); } // configure 函数会在每个 Map 节点运行 Map 函数对文件按行处理之前调用,通常用来做全局操作 public void configure(JobConf job) { String corListPath = job.getStrings(“corrList.path”)[0]; /// 内部获得特征权重路径 FsDirectory fsDirectory = new FsDirectory(FileSystem.get(getConf()),corListPath, false,getConf()); searcher = new IndexSearcher(fsDirectory); } public void runWordExp(String input, String output, String corPath){ JobConf job = new JobConf(getConf(),WordExp.class); 设置 InputPath, outPath, MapperClass, InputFormat, OutputFormat, … job.setStrings(“corrList.path”, corPath); // 设置相关词列表索引信息 JobClient.run(job); } … . }
0 0
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- MapReduce 编程模型在日志分析方面的应用
- Spring AOP在函数接口调用性能分析及其日志处理方面的应用
- ACE在服务器编程方面的应用
- 形状分析在植物种类鉴定方面的应用
- “天眼”系统在视频智能分析方面的实战应用
- MapReduce编程模型的要点
- MapReduce编程模型的要点
- MapReduce编程模型的认识
- MapReduce编程模型之InputFormat分析(一)
- MapReduce编程模型之InputFormat分析(二)
- 忘记虚拟机密码
- 深度测试(消隐)demo
- 音视频二次开发 IP组播技术
- 需要写的学习笔记
- iOS UIWebView中javascript与Objective-C交互、获取摄像头
- MapReduce 编程模型在日志分析方面的应用
- UIView 和 CALayer的关系
- DevExpress部分控件应用大集合(持续添加)
- iOS数据存储
- ActionBar自定义背景
- Java注解
- Apache Hadoop 2.6.0 新特性
- 程序员的笑话/漫画集锦
- UITableView缓存加载图片