hadoop编程实战——日志分析
来源:互联网 发布:python 微秒时间戳 编辑:程序博客网 时间:2024/04/27 14:18
上传日志文件到hadoop的dfs当中去
一、根据上述日志文件,计算该天的独立ip数,pv数(注意要筛选日志,并非每条记录都要统计),被传输页面的总字节数
1、将日志信息分为8个字段,创建指标对象KPI
import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.Date;import java.util.HashSet;import java.util.Locale;import java.util.Set;/* * KPI Object */public class KPI { private String remote_addr;// 记录客户端的ip地址 private String remote_user;// 记录客户端用户名称,忽略属性"-" private String time_local;// 记录访问时间与时区 private String request;// 记录请求的url与http协议 private String status;// 记录请求状态;成功是200 private String body_bytes_sent;// 记录发送给客户端文件主体内容大小 private String http_referer;// 用来记录从那个页面链接访问过来的 private String http_user_agent;// 记录客户浏览器的相关信息 private boolean valid = true;// 判断数据是否合法 private static KPI parser(String line) { KPI kpi = new KPI(); String[] arr = line.split(" "); if (arr.length > 11) { kpi.setRemote_addr(arr[0]); kpi.setRemote_user(arr[1]); kpi.setTime_local(arr[3].substring(1)); kpi.setRequest(arr[6]); kpi.setStatus(arr[8]); kpi.setBody_bytes_sent(arr[9]); kpi.setHttp_referer(arr[10]); if (arr.length > 12) { kpi.setHttp_user_agent(arr[11] + " " + arr[12]); } else { kpi.setHttp_user_agent(arr[11]); } try{ // 存在status没有的情况,直接pass if (Integer.parseInt(kpi.getStatus()) >= 400) {// 大于400,HTTP错误 kpi.setValid(false); } }catch(Exception e){ System.out.println(line); kpi.setValid(false); } } else { kpi.setValid(false); } return kpi; } /** * 按page的pv分类 */ public static KPI filterPVs(String line) { /*KPI kpi = parser(line); Set<String> pages = new HashSet<String>(); pages.add("/forum-46-1.html"); pages.add("/forum-58-1.html"); pages.add("/forum-61-1.html"); if (!pages.contains(kpi.getRequest())) { kpi.setValid(false); } return kpi;*/ return parser(line); } /** * 按page的独立ip分类 */ public static KPI filterIPs(String line) { /*KPI kpi = parser(line); Set<String> pages = new HashSet<String>(); pages.add("/forum-46-1.html"); pages.add("/forum-58-1.html"); pages.add("/forum-61-1.html"); if (!pages.contains(kpi.getRequest())) { kpi.setValid(false); } return kpi;*/ return parser(line); } /** * PV按浏览器分类 */ public static KPI filterBroswer(String line) { return parser(line); } /** * PV按小时分类 */ public static KPI filterTime(String line) { return parser(line); } /** * PV按访问域名分类 */ public static KPI filterDomain(String line) { return parser(line); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("valid:" + this.valid); sb.append("\nremote_addr:" + this.remote_addr); sb.append("\nremote_user:" + this.remote_user); sb.append("\ntime_local:" + this.time_local); sb.append("\nrequest:" + this.request); sb.append("\nstatus:" + this.status); sb.append("\nbody_bytes_sent:" + this.body_bytes_sent); sb.append("\nhttp_referer:" + this.http_referer); sb.append("\nhttp_user_agent:" + this.http_user_agent); return sb.toString(); } public String getRemote_addr() { return remote_addr; } public void setRemote_addr(String remote_addr) { this.remote_addr = remote_addr; } public String getRemote_user() { return remote_user; } public void setRemote_user(String remote_user) { this.remote_user = remote_user; } public String getTime_local() { return time_local; } public Date getTime_local_Date() throws ParseException { SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US); return df.parse(this.time_local); } public String getTime_local_Date_hour() throws ParseException { SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH"); return df.format(this.getTime_local_Date()); } public void setTime_local(String time_local) { this.time_local = time_local; } public String getRequest() { return request; } public void setRequest(String request) { this.request = request; } public String getStatus() { return status; } public void setStatus(String status) { this.status = status; } public String getBody_bytes_sent() { return body_bytes_sent; } public void setBody_bytes_sent(String body_bytes_sent) { this.body_bytes_sent = body_bytes_sent; } public String getHttp_referer() { return http_referer; } public String getHttp_referer_domain() { if (http_referer.length() < 8) { return http_referer; } String str = this.http_referer.replace("\"", "").replace("http://", "") .replace("https://", ""); return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str; } public void setHttp_referer(String http_referer) { this.http_referer = http_referer; } public String getHttp_user_agent() { return http_user_agent; } public void setHttp_user_agent(String http_user_agent) { this.http_user_agent = http_user_agent; } public boolean isValid() { return valid; } public void setValid(boolean valid) { this.valid = valid; } public static void main(String args[]) { String line = "112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] \"GET /data/cache/style_2_common.css?AZH HTTP/1.1\" 200 57752 \"http://f.dataguru.cn/forum-58-1.html\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\""; System.out.println(line); KPI kpi = new KPI(); String[] arr = line.split(" "); kpi.setRemote_addr(arr[0]); kpi.setRemote_user(arr[1]); kpi.setTime_local(arr[3].substring(1)); kpi.setRequest(arr[6]); kpi.setStatus(arr[8]); kpi.setBody_bytes_sent(arr[9]); kpi.setHttp_referer(arr[10]); kpi.setHttp_user_agent(arr[11] + " " + arr[12]); System.out.println(kpi); try { SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss z",Locale.US); System.out.println(df.format(kpi.getTime_local_Date())); System.out.println(kpi.getTime_local_Date_hour()); System.out.println(kpi.getHttp_referer_domain()); } catch (ParseException e) { e.printStackTrace(); } }}
2、计算独立ip、pv和总字节数代码
package week5;import java.io.IOException;import java.util.ArrayList;import java.util.HashSet;import java.util.Iterator;import java.util.List;import java.util.Set;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapred.FileInputFormat;import org.apache.hadoop.mapred.FileOutputFormat;import org.apache.hadoop.mapred.JobClient;import org.apache.hadoop.mapred.JobConf;import org.apache.hadoop.mapred.MapReduceBase;import org.apache.hadoop.mapred.Mapper;import org.apache.hadoop.mapred.OutputCollector;import org.apache.hadoop.mapred.Reducer;import org.apache.hadoop.mapred.Reporter;import org.apache.hadoop.mapred.TextInputFormat;import org.apache.hadoop.mapred.TextOutputFormat;public class KPIIPVBYTE {public static class ParseLogMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text> { @Override public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { KPI kpi = KPI.filterIPs(value.toString()); if (kpi.isValid()) { output.collect(new Text("ip"), new Text(kpi.getRemote_addr())); output.collect(new Text("pv"), new Text("1")); output.collect(new Text("ps"), new Text("".equals(kpi.getBody_bytes_sent())?"0":kpi.getBody_bytes_sent())); } }} public static class parseLogReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> { private Set<String> count = new HashSet<String>(); private int sumPv=0; private long sumPs=0; @Override public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String keys = key.toString(); List<Text> listv = new ArrayList<Text>(); if("ip".equals(keys.toLowerCase().trim())){ while (values.hasNext()) { count.add(values.next().toString()); } output.collect(new Text("IP总数:"), new Text(String.valueOf(count.size()))); }else if("pv".equals(keys.toLowerCase().trim())){ while (values.hasNext()) { sumPv+= Long.parseLong(values.next().toString()); } output.collect(new Text("PV数:"), new Text(String.valueOf(sumPv))); }else if("ps".equals(keys.toLowerCase().trim())){ while (values.hasNext()) { sumPs +=Integer.parseInt(values.next().toString()); } sumPs = sumPs/1024/1024; output.collect(new Text("总字节数:"), new Text(String.valueOf(sumPs)+"M")); } } }public static void main(String[] args) throws Exception{ String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log"; String output = "hdfs://hadoop1:9000/week5/out/"; JobConf conf = new JobConf(KPIIPVBYTE.class); conf.setJobName("IPPVPS"); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(ParseLogMapper.class); conf.setReducerClass(parseLogReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); System.exit(0); }}
结果:
IP总数: 34413总字节数: 67627MPV数: 2910085
二、统计来源网站,列出域名及带来的独立ip数
1、来源统计代码
package week5;import java.io.IOException;import java.util.HashSet;import java.util.Set;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class SourceCount { public static class MyMapper extends Mapper<Object, Text, Text, Text> { private Text state = new Text(); private Text ip = new Text(); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { KPI kpi = KPI.filterIPs(value.toString()); if (kpi.isValid()) { state.set(kpi.getHttp_referer()); ip.set(kpi.getRemote_addr()); context.write(state, ip); } } } public static class SumReducer extends Reducer<Text, Text, Text, Text> { private Text result = new Text(); public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException { Set<String> ips = new HashSet<String>(); for (Text val : values) { ips.add(val.toString()); } result.set(String.valueOf(ips.size())); context.write(key, result); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = new Job(conf); job.setJarByClass(SourceCount.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(SumReducer.class); job.setReducerClass(SumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path("hdfs://hadoop1:9000/week5/in/access.20120104.log")); FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop1:9000/week5/out3")); System.exit(job.waitForCompletion(true) ? 0 : 1); }}
结果:
太多了,取部分为证
三、统计用户使用的浏览器种类,计算出各种浏览器占的百分比
1、浏览器占比代码
package week5;import java.io.IOException;import java.util.Iterator;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.mapred.FileInputFormat;import org.apache.hadoop.mapred.FileOutputFormat;import org.apache.hadoop.mapred.JobClient;import org.apache.hadoop.mapred.JobConf;import org.apache.hadoop.mapred.MapReduceBase;import org.apache.hadoop.mapred.Mapper;import org.apache.hadoop.mapred.OutputCollector;import org.apache.hadoop.mapred.Reducer;import org.apache.hadoop.mapred.Reporter;import org.apache.hadoop.mapred.TextInputFormat;import org.apache.hadoop.mapred.TextOutputFormat;public class KPIBroswer { private static long total = 0; public static class KPIBrowserMapper extends MapReduceBase implements Mapper<Object, Text, Text, IntWritable> { private Text browserInfo = new Text(); private IntWritable one = new IntWritable(1); public void map(Object key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { KPI kpi = KPI.filterBroswer(value.toString()); if (kpi.isValid()) { browserInfo.set(kpi.getHttp_user_agent()); total++; output.collect(browserInfo, one); } } } public static class KPIBrowserReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, Text> { private Text result = new Text(); public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { long sum = 0; while (values.hasNext()) { sum += values.next().get(); } System.out.println("answer is over there"); System.out.println(sum); System.out.println(total); System.out.println(String.valueOf(((double) sum / total * 100) + "%")); result.set(String.valueOf(((double) sum / total * 100) + "%")); output.collect(key, result); } } public static void main(String[] args) throws Exception { String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log"; String output = "hdfs://hadoop1:9000/week5/out4"; JobConf conf = new JobConf(KPIBroswer.class); conf.setJobName("KPIBrowser"); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(KPIBrowserMapper.class); // conf.setCombinerClass(KPIBrowserReducer.class); conf.setReducerClass(KPIBrowserReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); System.exit(0); }}
2、结果:
2.0617954458374926E-4%"" 2.0617954458374926E-4%"(C)Nokia6700s/SymbianOS/9.1 Series60/3.0" 3.4363257430624875E-5%"-" 1.5351785257131665%"-" "-" 6.872651486124975E-5%"-" "Mozilla/4.0" 1.0308977229187463E-4%"Amoi-F90/Plat-F/WAP2.0/MIDP1.0/CLDC1.0 UP.Browser/6.2.2.6.f.1.100 6.872651486124975E-5%"AmoiE70/6.1.08/WAP2.0 Profile/MIDP2.0 3.4363257430624875E-5%"AndroidDownloadManager" 6.872651486124975E-5%"Apache-HttpClient/4.1 (java 3.4363257430624875E-5%"Apache-HttpClient/4.1.1 (java 3.4363257430624875E-5%"AppEngine-Google; (+http://code.google.com/appengine; 2.0617954458374926E-4%"Apple-PubSub/28" 0.001683799614100619%"Apple-PubSub/65" 3.4363257430624875E-5%"Apple-PubSub/65.20" 3.4363257430624875E-5%"Apple-PubSub/65.23" 6.185386337512477E-4%"Apple-PubSub/65.28 AppEngine-Google; 3.4363257430624875E-5%"Apple-PubSub/65.28" 0.015807098418087445%"BGSY bot/1.0" 0.0041235908916749855%"BaiduMobile/1.3.1 CFNetwork/485.12.7 3.4363257430624875E-5%"BaiduMobile/1.3.2 CFNetwork/548.0.4 3.4363257430624875E-5%"BaiduMobile/1.3.4 CFNetwork/485.12.7 6.872651486124975E-5%"BaiduMobile/1.3.4 CFNetwork/485.13.9 6.872651486124975E-5%"BaiduMobile/1.3.5 CFNetwork/485.12.7 3.4363257430624875E-5%"BaiduMobile/1.3.5 CFNetwork/485.13.9 2.4054280201437413E-4%"BaiduMobile/1.3.5 CFNetwork/548.0.3 6.872651486124975E-5%"BaiduMobile/1.3.5 CFNetwork/548.0.4 1.374530297224995E-4%"Baiduspider" 1.374530297224995E-4%"Baiduspider+(+http://www.baidu.com/search/spider.htm)" 0.01147732798182871%"Baiduspider-news+(+http://www.baidu.com/search/spider.htm)" 0.009003173446823718%"BlackBerry8820/2.7.0.105-4.5.0.174 Profile/MIDP-2.0 1.718162871531244E-4%"BlackBerry8900/5.2.0.96 Profile/MIDP-2.0 6.872651486124975E-5%"BlackBerry9000/5.2.0.89 Profile/MIDP-2.0 3.4363257430624875E-5%"BlackBerry9700/5.0.0.862 Profile/MIDP-2.1 3.4363257430624875E-5%"CoolPadF800/CMCC WindowsCEOS/6.0/(2009.10.30)10.01.F800/WAP2.0 1.374530297224995E-4%"Dalvik/1.2.0 (Linux; 5.49812118889998E-4%"Dalvik/1.4.0 (Linux; 3.4363257430624875E-5%"DoCoMo/2.0 N905i(c100;TB;W24H16) 3.779958317368737E-4%"DoCoMo/2.0 P900i(c100;TB;W24H11) 1.374530297224995E-4%"Domnutch-Bot/Nutch-1.0 (Domnutch; 6.872651486124975E-5%"Doubanbot/1.0 (bot@douban.com 9.278079506268717E-4%"E63/SymbianOS/9.1 Series60/3.0" 2.4054280201437413E-4%"E66/SymbianOS/9.1 Series60/3.0" 4.123590891674985E-4%"E71/SymbianOS/9.1 Series60/3.0" 6.872651486124975E-5%"FTRF: Friendly 6.185386337512477E-4%"Feed43 Proxy/1.0 3.779958317368737E-4%"FeedDemon/3.1 (http://www.feeddemon.com/; 6.872651486124975E-5%"FeedDemon/4.0 (http://www.feeddemon.com/; 0.00402050111938311%"FeedFetcher-Google-CoOp; (+http://www.google.com/coop/cse/cref)" 7.559916634737474E-4%"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)" 6.872651486124976E-4%"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 0.3223273546992614%"Feedreader 3.14 0.0035394155153543627%"GIONEE-L011/SW1.0.0/WAP2.0/MIDP2.1 Configuration/CLDC-1.1" 6.872651486124975E-5%"GoodReaderIPad/3.12.0 CFNetwork/548.0.4 3.4363257430624875E-5%"GoogleProducer" 0.0015119833269474948%"Googlebot-Image/1.0" 1.0308977229187463E-4%"Googlebot/2.1 (+http://www.google.com/bot.html)" 0.0017181628715312442%"Googlebot/2.1 (+http://www.googlebot.com/bot.html)" 2.0617954458374926E-4%"Googlebot/2.1 (http://www.googlebot.com/bot.html)" 1.0308977229187463E-4%"GreatNews/1.0" 0.012576952219608707%"HD_T8282 Mozilla/4.0 3.4363257430624875E-5%"HTCT9188_TD/1.0 WindowsMobile/6.5 1.0308977229187463E-4%"HTC_Touch_Diamond2_T5353 Mozilla/4.0 1.0308977229187463E-4%"HTC_Touch_Pro_T7272 Mozilla/4.0 3.4363257430624875E-5%"HTMLParser/1.6" 6.872651486124975E-5%"HTTP Fetcher/HTTP/1.0" 3.4363257430624875E-5%"HTTP_Request2/2.0.0 (http://pear.php.net/package/http_request2) 2.74906059444999E-4%"Holleycomm-H8800/2.0 WAP2.0 3.436325743062488E-4%"HuaweiSymantecSpider/1.0+DSE-support@huaweisymantec.com+(compatible; MSIE 0.04123590891674985%"HuaweiT5211_TD/1.0 RTKE_OS/01.00 1.718162871531244E-4%"HuaweiT8100_TD/1.0 Android/2.2 1.374530297224995E-4%"HuaweiU7520/B000 Browser/NetFront/4.1 3.4363257430624875E-5%"Huaweisymantecspider (compatible; 0.027490605944499907%"IUC(U;iOS 3.1.3;Zh-cn;320*480;)" 3.4363257430624875E-5%"IUC(U;iOS 4.1;Zh-cn;320*480;)/UCWEB8.1.0.104/41/997" 4.123590891674985E-4%...
0 0
- hadoop编程实战——日志分析
- 实战2——Hadoop的日志分析
- hadoop 实战——网站日志数据分析
- Hadoop项目实战---日志分析
- 【hive实战】使用hive分析 hadoop 日志
- HIve实战分析Hadoop的日志
- 【hive实战】使用hive分析 hadoop 日志
- Hadoop项目实战---黑马论坛日志分析
- Hadoop项目实战---黑马论坛日志分析
- hadoop学习笔记(九)——hadoop日志分析系统
- HDInsight-Hadoop实战(一)网站日志分析
- hadoop 实战———WordCount源码分析
- 《hadoop实战》笔记1—分布式编程框架
- Hadoop学习笔记—20.网站日志分析项目案例
- AWK日志分析实战
- hadoop日志分析程序
- hadoop日志简单分析
- hadoop 日志分析程序
- eclipse,struts2的简单配置及使用,返回json
- Appium在Android手机执行测试脚本
- java适配器模式
- 一 javase学习记录10
- Maven Web项目使用MyBatis_Generator_1.3.1自动生成javabean,dao,mapper.xml代码
- hadoop编程实战——日志分析
- HDOJ 2767 Proving Equivalences
- Python中*args 和**kwargs的用法
- JAVA多线程之——读写锁 ReentrantReadWriteLock
- 算法提高 金属采集
- Java之BigDecimal类使用
- 遇到的一个关于base64编码的小问题
- js object.create()、Object.defineproperty()、,Object.keys()...for..in,for..each
- 系统学习javaweb-01-java基础语法