hadoop实现的一个简单的Pagerank例子
来源:互联网 发布:gis基础软件平台 编辑:程序博客网 时间:2024/05/21 12:41
/** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.hadoop.examples;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.DoubleWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapred.TaskTracker;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;public class Pagerank { public static double dampFactor = 0.85; public static int iterations = 10 ; public static class PagerankMapper extends Mapper<LongWritable, Text, Text, DoubleWritable>{ public void map(LongWritable key, Text value, Context context ) throws IOException, InterruptedException { String line = value.toString(); String urls[] = line.split("\t"); int urlSize = urls.length; // TaskTracker.LOG.info("line: " + line + " urlSize: " + urlSize) ; //urls[0] is the srcUrl. 
double outLinkNum = urlSize-1; double srcPageRank = 0.25 ; double pageRank_part = srcPageRank/outLinkNum; for(int i = 1; i < urlSize; i++) { context.write(new Text(urls[i]), new DoubleWritable(pageRank_part)); } } } public static class PagerankReducer extends Reducer<Text,DoubleWritable,Text,DoubleWritable> { public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {// TaskTracker.LOG.info("Reducer key: " + key) ; double pageRank = 0; for(DoubleWritable value : values) { pageRank += value.get(); // TaskTracker.LOG.info("Reducer value: " + value) ; } pageRank = 1-dampFactor+dampFactor*pageRank; context.write(key, new DoubleWritable(pageRank)); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: pagerank <in> <out>"); System.exit(2); } Job job = new Job(conf, "pagerank"); job.setJarByClass(Pagerank.class); job.setMapperClass(PagerankMapper.class); job.setCombinerClass(PagerankReducer.class); job.setReducerClass(PagerankReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }}
输入数据格式为:
1\t2\t4 //以\t隔开,即:起始顶点\t出边目标顶点1\t出边目标顶点2...
这里有个问题就是一轮迭代结束后,顶点的边的信息无法传入下一轮中,如何改进?
批量测试脚本:
# Run the pagerank example job ten times in a row; the output directory is
# removed after every run so that the next run can create it again.
for i in $(seq 1 10)
do
    hadoop jar hadoop-examples-1.2.1.jar pagerank /test/soc-LiveJournal_final.txt /output
    hadoop fs -rmr /output
done
阅读全文
0 0
- hadoop实现的一个简单的Pagerank例子
- PageRank的一个简单实现
- spark实现简单的pagerank
- PageRank算法的简单实现.
- PageRank的php简单实现
- PageRank的简单实现(scala版)
- 简单PageRank的理解
- 一个简单的Spring实现的例子
- hadoop上的pageRank算法
- hadoop上的pageRank算法
- [Atlas]一个Atlas实现的简单例子
- 一个简单的VCard实现例子
- Laravel实现一个简单的小例子
- 一个最简单的dubbo例子实现
- 实现一个简单的工作流例子全过程
- 实现一个简单的工作流例子全过程
- 实现一个简单的工作流例子全过程
- PageRank的MapReduce实现
- 海明码
- 【一颗不甘的心】今天,说说自己 -- 2017/5/29更新
- 使用java实现面向对象编程
- 怎样在本地搭建IIS服务器
- 关于JSON与对象集合的简单转换
- hadoop实现的一个简单的Pagerank例子
- 坐标轴范围设定
- 数十种TensorFlow实现案例汇集:代码+笔记
- jsp上传文件案例(有源码)
- JS中的真与假
- 拉格朗日插值法 C语言实现
- 导入低版本的WEB项目需要修改的地方
- python3使用Scrapy
- windows下彻底卸载MySQL的方法