Hadoop-03-第二个MapReduce程序--模拟分析购物日志

来源：互联网 | 发布：php 取出字符串后几位 | 编辑：程序博客网 | 时间：2024/05/30 19:34

第二个MapReduce程序---模拟分析购物日志

在Eclipse中开发MapReduce程序的步骤见上篇文章。

这里模拟购物日志的分析，使用java程序产生了7百万条数据，每行的格式为：用户ID-邮箱-购买物品-时间戳。本例使用MapReduce来统计这7百万条数据中用户使用的邮箱。

///产生数据的代码如下：（程序员从来不会缺数据！）

在程序中略微修改一下参数，产生七个文件day1.txt,....,day7.txt,每个文件一百万行。

package com.test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Generates a simulated shopping log file.
 *
 * <p>Each line has the form {@code userId-email-goods-timestamp}, where
 * {@code userId} is the 1-based line number, {@code email} is a random number
 * below 10,000,000 followed by one of the known domain suffixes, {@code goods}
 * is a random goods category and {@code timestamp} is the value of
 * {@link System#currentTimeMillis()} when the line was produced.
 */
public class GenerateShoppingLoggingData {

    /** Domain suffixes appended to the random numeric mailbox name. */
    private static final String[] emails = new String[] { "@qq.com",
            "@sina.com", "@sohu.com", "@163.com", "@126.com", "@msn.cn",
            "@edu.cn" };

    /** Goods categories a record can reference. */
    private static final String[] goodsCategories = new String[] { "book",
            "commodity", "clothes", "trousers", "mobilephone", "shoes", "toys",
            "computer", "laptop", "tablet pc" };

    /** Number of records written when no count is given on the command line. */
    private static final int RECORDS_NUMBER = 1000000;

    /** File written when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "D:\\day1.txt";

    /**
     * Writes the log file and prints {@code done---} on success. An I/O failure
     * is reported on standard error via its stack trace.
     *
     * @param args optional arguments: {@code args[0]} is the output file path
     *             (default {@code D:\day1.txt}); {@code args[1]} is the number
     *             of records to write (default 1,000,000)
     * @throws NumberFormatException if {@code args[1]} is not a valid integer
     */
    public static void main(String[] args) {
        String outputPath = (args != null && args.length > 0)
                ? args[0] : DEFAULT_OUTPUT_PATH;
        int recordCount = (args != null && args.length > 1)
                ? Integer.parseInt(args[1]) : RECORDS_NUMBER;
        try {
            writeRecords(new File(outputPath), recordCount);
            System.out.println("done---");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Writes {@code recordCount} log lines to {@code file}, always closing the writer. */
    private static void writeRecords(File file, int recordCount) throws IOException {
        // try-with-resources closes the writer (and the wrapped FileWriter)
        // even when a write fails part-way through.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
            for (int i = 1; i <= recordCount; i++) {
                // Index bounds come from the arrays so that adding a domain or
                // category cannot silently leave it unused or overrun the array.
                bw.write(i + "-" + getRandom(10000000)
                        + emails[getRandom(emails.length)]
                        + "-" + goodsCategories[getRandom(goodsCategories.length)]
                        + "-" + System.currentTimeMillis());
                bw.newLine();
            }
        }
    }

    /** Returns a pseudo-random integer in the range {@code [0, base)}. */
    private static int getRandom(int base) {
        return (int) (Math.random() * base);
    }
}

新建MapReduce程序，其中代码如下：

//Mapper--ShoppingLogMapper

package com.shopping.mapred;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

public class ShoppingLogMapper extends

Mapper<LongWritable, Text, Text, IntWritable> {

@Override

protected void map(LongWritable key, Text value,

Mapper<LongWritable, Text, Text, IntWritable>.Context context)

throws IOException, InterruptedException {

String line = value.toString();

String[] strs = line.split("-");

String email = strs[1];

// context.write(new Text(email), new Text(email));

if (email.contains("@163.com")) {

context.write(new Text("@163.com"), new IntWritable(1));

} else if (email.contains("@126.com")) {

context.write(new Text("@126.com"), new IntWritable(1));

} else if (email.contains("@sohu.com")) {

context.write(new Text("@sodu.com"), new IntWritable(1));

} else if (email.contains("@qq.com")) {

context.write(new Text("@qq.com"), new IntWritable(1));

} else if (email.contains("@sina.com")) {

context.write(new Text("@sina.com"), new IntWritable(1));

} else if (email.contains("@msn.cn")) {

context.write(new Text("@msn.cn"), new IntWritable(1));

} else if (email.contains("@edu.cn")) {

context.write(new Text("@edu.cn"), new IntWritable(1));

}

///Reducer--ShoppingLogReducer

package com.shopping.mapred;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

public class ShoppingLogReducer extends

Reducer<Text, IntWritable, Text, IntWritable> {

@Override

protected void reduce(Text key, Iterable<IntWritable> values,

Reducer<Text, IntWritable, Text, IntWritable>.Context context)

throws IOException, InterruptedException {

int sum = 0;

for (IntWritable value : values) {

sum += value.get();

}

context.write(key, new IntWritable(sum));

}

///主函数--ShoppingLog

package com.shopping.mapred;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

public class ShoppingLog extends Configured implements Tool {

@Override

public int run(String[] args) throws Exception {

if (args.length != 2) {

System.err.println("Usage: ShoppingLog <input path> <output path>");

System.exit(-1);

}

Job job = new Job();

job.setJarByClass(ShoppingLog.class);

job.setJobName("shopping logging analysis");

FileInputFormat.addInputPath(job, new Path(args[0]));

FileOutputFormat.setOutputPath(job, new Path(args[1]));

job.setMapperClass(ShoppingLogMapper.class);

job.setReducerClass(ShoppingLogReducer.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(IntWritable.class);

return job.waitForCompletion(true) ? 0 : 1;

}

public static void main(String[] args) throws Exception {

long start = System.currentTimeMillis();

int code = ToolRunner.run(new ShoppingLog(), args);

long end = System.currentTimeMillis();

System.out.println("Time-------------------->" + (end - start) + " ms");

System.exit(code);

}

配置并运行。

在一台机器上的三台虚拟机中运行，统计这7百万条数据花费时间大概是16s。

最终产生的结果如下：

@126.com 999703

@163.com 1000948

@edu.cn 1001126

@msn.cn 1000668

@qq.com 1001226

@sina.com 998386

@sohu.com 997943

不喜欢写代码的朋友可以直接到下面链接下载测试数据。

链接：http://pan.baidu.com/s/1pJuldfP 密码：1zw1

0 0