004 Developing a MapReduce Program to Enhance User Behavior Logs


The usual way to process user behavior logs is to enhance them with MapReduce, persist the result to HDFS, and then compute the various metrics with Hive. This chapter follows that approach and focuses on the core code, which consists of three classes:


(1) AccessLogEnhanceImportHDFS: the main (driver) program.
(2) AccessLogEnhanceMapper: the mapper task, which parses each raw user behavior log line and enhances it.
(3) AccessLogEnhanceReducer: the reducer task, which writes the enhanced logs to HDFS. Note: the output directory is named in Hive partition style (access_day=YYYYMMDD), so each day's output can later be attached as a partition of the Hive table used for analysis.


1. Main program (driver) code

package com.yun.job;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.yun.domain.AccessLog;

/**
 * Enhancement job for the user behavior logs of an e-commerce site.
 *
 * Note 1: the output path ends in access_day=20150705, so the Hive table can be
 * partitioned directly by day:
 *   hdfs://mycluster:9000/user/hadoop/external/jfpc/input/20150705  ->  hdfs://mycluster:9000/user/hadoop/external/jfpc/output/access_day=20150705
 *   hdfs://mycluster:9000/user/hadoop/external/jfpc/input/20150706  ->  hdfs://mycluster:9000/user/hadoop/external/jfpc/output/access_day=20150706
 *
 * Note 2: command to run the job on the cluster:
 *   hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceImportHDFS external/jfpc/input/20150706 external/jfpc/output/access_day=20150706
 *
 * @author shenfl
 * @version V1.0
 * @date 2015-7-8
 */
public class AccessLogEnhanceImportHDFS extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        // Use the configuration injected by ToolRunner (rather than a fresh
        // new Configuration()) so generic options such as -D take effect
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, AccessLogEnhanceImportHDFS.class.getSimpleName());
        job.setJarByClass(AccessLogEnhanceImportHDFS.class);
        // Add the following line when running in cluster mode
        job.setJar("jfyun.jar");

        job.setMapperClass(AccessLogEnhanceMapper.class);
        job.setMapOutputKeyClass(AccessLog.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(AccessLogEnhanceReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(AccessLog.class);

        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Delete the output directory if it already exists so the job can be rerun
        Path outputDir = new Path(args[1]);
        deleteOutDir(conf, outputDir);
        FileOutputFormat.setOutputPath(job, outputDir);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Deletes the output directory recursively if it already exists.
     *
     * @param conf the job configuration
     * @param outputDir the output directory to delete
     * @throws IOException
     */
    public void deleteOutDir(Configuration conf, Path outputDir) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);
        }
    }

    public static void main(String[] args) {
        try {
            int run = ToolRunner.run(new Configuration(), new AccessLogEnhanceImportHDFS(), args);
            // 0 = normal exit, 1 = abnormal exit
            System.exit(run);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Troubleshooting (reference: http://hbase.apache.org/book.html#configuration)

(1) Running the job in cluster mode:

hadoop jar jfyun.jar com.yun.job.AccessLogEnhanceRunner external/jfpc/input/20150715 external/jfpc/output/20150715/access_day=20150715

(2) may fail with the following error:

15/09/21 21:48:41 INFO mapreduce.Job: Task Id : attempt_1442890056563_0002_r_000000_1, Status : FAILED
Error: java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
        at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:131)
        at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:611)
        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.reflect.InvocationTargetException
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
        at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:129)
        ... 7 more
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/HTableDescriptor
        at com.yun.job.AccessLogEnhanceReducer.<init>(AccessLogEnhanceReducer.java:23)
        ... 12 more
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.HTableDescriptor
        at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:358)

(3) Solution
3.1 Cause: hadoop jar does not put the HBase jars on the classpath at run time.
3.2 Fix: edit hadoop-env.sh and add the following HADOOP_CLASSPATH setting:

export HADOOP_CLASSPATH=/home/hadoop/app/hbase-0.98.13-hadoop2/lib/*
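
Exporting HADOOP_CLASSPATH works, but it has to be configured on every node. As an alternative, a minimal sketch (not from the original article): HBase's TableMapReduceUtil.addDependencyJars can ship the HBase jars with the job itself via the distributed cache, assuming those jars are on the classpath of the submitting JVM.

import java.io.IOException;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;

public class DependencyJarsExample {
    /**
     * Call this in run() before submitting the job. Hadoop uploads the jars
     * containing the job's key/value/IO classes (including the HBase jars) to
     * the distributed cache, so reducer nodes can resolve classes such as
     * HTableDescriptor without editing hadoop-env.sh on every node.
     */
    public static void shipHBaseJars(Job job) throws IOException {
        TableMapReduceUtil.addDependencyJars(job);
    }
}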

2. Mapper code

package com.yun.job;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.yun.domain.AccessLog;
import com.yun.util.AccessLogParser;

/**
 * Enhances the user access trail (click stream) of the e-commerce platform.
 *
 * @author shenfl
 */
public class AccessLogEnhanceMapper extends Mapper<LongWritable, Text, AccessLog, NullWritable> {

    // Reuse a single parser instance instead of allocating one per record
    AccessLogParser parser = new AccessLogParser();
    AccessLog v = null;

    /**
     * key: byte offset of the line in the input file; value: one log record
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Parse the raw line into an AccessLog object; unparsable lines return null and are dropped
        v = parser.process(value.toString().trim());
        if (v != null) {
            context.write(v, NullWritable.get());
        }
    }
}
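
The article does not show com.yun.util.AccessLogParser. Purely as a hypothetical sketch of its contract (the delimiter, field layout, and setter names are assumptions, not from the source): process parses one raw line and returns null for malformed records so the mapper can drop them.

package com.yun.util;

import com.yun.domain.AccessLog;

public class AccessLogParser {

    /**
     * Parses one raw log line into an AccessLog, or returns null if the line
     * cannot be parsed. Hypothetical: assumes tab-separated fields.
     */
    public AccessLog process(String line) {
        if (line == null || line.isEmpty()) {
            return null;
        }
        String[] fields = line.split("\t");
        if (fields.length < 3) {  // hypothetical minimum field count
            return null;
        }
        AccessLog log = new AccessLog();
        log.setIp(fields[0]);          // hypothetical field layout
        log.setAccessTime(fields[1]);
        log.setUrl(fields[2]);
        return log;
    }
}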


3. Reducer code

package com.yun.job;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import com.yun.domain.AccessLog;

/**
 * Writes each enhanced AccessLog record to HDFS. The key/value positions are
 * swapped so the output file contains only the record itself.
 *
 * @author shenfl
 */
public class AccessLogEnhanceReducer extends Reducer<AccessLog, NullWritable, NullWritable, AccessLog> {

    @Override
    public void reduce(AccessLog k2, Iterable<NullWritable> v2s, Context context)
            throws IOException, InterruptedException {
        context.write(NullWritable.get(), k2);
    }
}
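
One detail the article leaves out: com.yun.domain.AccessLog is used as the map output key, so it must implement WritableComparable, otherwise the shuffle cannot serialize or sort it. Below is a minimal sketch, reusing the hypothetical ip/accessTime/url fields from the parser sketch above (the real class surely carries more fields).

package com.yun.domain;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class AccessLog implements WritableComparable<AccessLog> {

    private String ip = "";
    private String accessTime = "";
    private String url = "";

    // Serialization used during the shuffle
    public void write(DataOutput out) throws IOException {
        out.writeUTF(ip);
        out.writeUTF(accessTime);
        out.writeUTF(url);
    }

    public void readFields(DataInput in) throws IOException {
        ip = in.readUTF();
        accessTime = in.readUTF();
        url = in.readUTF();
    }

    // Sort order of the map output keys (hypothetical: by time, then IP)
    public int compareTo(AccessLog o) {
        int c = accessTime.compareTo(o.accessTime);
        return c != 0 ? c : ip.compareTo(o.ip);
    }

    // TextOutputFormat writes value.toString(), so this defines the line
    // format of the enhanced log as it lands in HDFS
    @Override
    public String toString() {
        return ip + "\t" + accessTime + "\t" + url;
    }

    public void setIp(String ip) { this.ip = ip; }
    public void setAccessTime(String accessTime) { this.accessTime = accessTime; }
    public void setUrl(String url) { this.url = url; }
}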



With these core pieces in place, you can run the job and the logs will be enhanced; the output lands in HDFS under day-partition directories, ready for Hive analysis.
