Hadoop Programming in Practice: Log Analysis


First, upload the log file into Hadoop's HDFS.
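For example, assuming the file name and the /week5/in directory used by the jobs below, something like `hadoop fs -put access.20120104.log /week5/in/` does this from the shell.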

I. From the log file above, compute the day's number of unique IPs, the number of PVs (page views; note that the log must be filtered first, since not every record should be counted), and the total number of bytes of pages transferred

1. Split each log record into 8 fields and define the metrics class KPI
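For example, the sample line used in KPI.main() below,

112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] "GET /data/cache/style_2_common.css?AZH HTTP/1.1" 200 57752 "http://f.dataguru.cn/forum-58-1.html" "Mozilla/5.0 (iPhone; ...)"

parses into: remote_addr = 112.97.24.243, remote_user = -, time_local = 31/Jan/2012:00:14:48 (the +0800 zone token is dropped), request = /data/cache/style_2_common.css?AZH, status = 200, body_bytes_sent = 57752, http_referer = "http://f.dataguru.cn/forum-58-1.html", and http_user_agent = "Mozilla/5.0 (iPhone;. Note that only the first two tokens of the quoted User-Agent survive the naive space split.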

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/*
 * KPI object: one parsed log record.
 */
public class KPI {
    private String remote_addr;     // client IP address
    private String remote_user;     // client user name; "-" when absent
    private String time_local;      // access time (time zone token is dropped)
    private String request;         // requested URL
    private String status;          // request status; 200 means success
    private String body_bytes_sent; // size of the response body sent to the client
    private String http_referer;    // page the request was linked from
    private String http_user_agent; // client browser information
    private boolean valid = true;   // whether this record is valid

    private static KPI parser(String line) {
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        if (arr.length > 11) {
            kpi.setRemote_addr(arr[0]);
            kpi.setRemote_user(arr[1]);
            kpi.setTime_local(arr[3].substring(1));
            kpi.setRequest(arr[6]);
            kpi.setStatus(arr[8]);
            kpi.setBody_bytes_sent(arr[9]);
            kpi.setHttp_referer(arr[10]);
            if (arr.length > 12) {
                kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
            } else {
                kpi.setHttp_user_agent(arr[11]);
            }
            try {
                // Some records have no numeric status field; mark those invalid.
                if (Integer.parseInt(kpi.getStatus()) >= 400) { // status >= 400 is an HTTP error
                    kpi.setValid(false);
                }
            } catch (Exception e) {
                System.out.println(line);
                kpi.setValid(false);
            }
        } else {
            kpi.setValid(false);
        }
        return kpi;
    }

    /**
     * PV filter, optionally restricted to specific pages.
     */
    public static KPI filterPVs(String line) {
        /*KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/forum-46-1.html");
        pages.add("/forum-58-1.html");
        pages.add("/forum-61-1.html");
        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;*/
        return parser(line);
    }

    /**
     * Unique-IP filter, optionally restricted to specific pages.
     */
    public static KPI filterIPs(String line) {
        /*KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/forum-46-1.html");
        pages.add("/forum-58-1.html");
        pages.add("/forum-61-1.html");
        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;*/
        return parser(line);
    }

    /**
     * PV by browser.
     */
    public static KPI filterBrowser(String line) {
        return parser(line);
    }

    /**
     * PV by hour.
     */
    public static KPI filterTime(String line) {
        return parser(line);
    }

    /**
     * PV by referring domain.
     */
    public static KPI filterDomain(String line) {
        return parser(line);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("valid:" + this.valid);
        sb.append("\nremote_addr:" + this.remote_addr);
        sb.append("\nremote_user:" + this.remote_user);
        sb.append("\ntime_local:" + this.time_local);
        sb.append("\nrequest:" + this.request);
        sb.append("\nstatus:" + this.status);
        sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
        sb.append("\nhttp_referer:" + this.http_referer);
        sb.append("\nhttp_user_agent:" + this.http_user_agent);
        return sb.toString();
    }

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public Date getTime_local_Date() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",
                Locale.US);
        return df.parse(this.time_local);
    }

    public String getTime_local_Date_hour() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
        return df.format(this.getTime_local_Date());
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public String getHttp_referer_domain() {
        if (http_referer.length() < 8) {
            return http_referer;
        }
        String str = this.http_referer.replace("\"", "").replace("http://", "")
                .replace("https://", "");
        return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public static void main(String args[]) {
        String line = "112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] \"GET /data/cache/style_2_common.css?AZH HTTP/1.1\" 200 57752 \"http://f.dataguru.cn/forum-58-1.html\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\"";
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        kpi.setRemote_addr(arr[0]);
        kpi.setRemote_user(arr[1]);
        kpi.setTime_local(arr[3].substring(1));
        kpi.setRequest(arr[6]);
        kpi.setStatus(arr[8]);
        kpi.setBody_bytes_sent(arr[9]);
        kpi.setHttp_referer(arr[10]);
        kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
        System.out.println(kpi);
        try {
            SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss z", Locale.US);
            System.out.println(df.format(kpi.getTime_local_Date()));
            System.out.println(kpi.getTime_local_Date_hour());
            System.out.println(kpi.getHttp_referer_domain());
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
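As the field breakdown above shows, splitting on single spaces truncates quoted fields that contain spaces, the User-Agent in particular. Where the full User-Agent matters, a regex is more faithful to the log format. The following is a minimal sketch, not part of the original article, that fills the same KPI object, assuming the standard nginx/Apache combined log format:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * Hypothetical alternative parser: one regex over the whole line instead of
 * split(" "), so quoted fields may contain spaces.
 */
public class CombinedLogParser {
    // ip ident user [time] "request" status bytes "referer" "user-agent"
    private static final Pattern LOG_PATTERN = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]*)\" (\\d{3}) (\\S+) \"([^\"]*)\" \"([^\"]*)\"");

    public static KPI parse(String line) {
        KPI kpi = new KPI();
        Matcher m = LOG_PATTERN.matcher(line);
        if (!m.find()) {
            kpi.setValid(false);
            return kpi;
        }
        kpi.setRemote_addr(m.group(1));
        kpi.setRemote_user(m.group(3));
        kpi.setTime_local(m.group(4).split(" ")[0]); // drop the "+0800" zone, as parser() does
        String[] req = m.group(5).split(" ");        // "GET /path HTTP/1.1"
        kpi.setRequest(req.length > 1 ? req[1] : m.group(5));
        kpi.setStatus(m.group(6));
        kpi.setBody_bytes_sent("-".equals(m.group(7)) ? "0" : m.group(7));
        kpi.setHttp_referer(m.group(8));
        kpi.setHttp_user_agent(m.group(9));          // full User-Agent, spaces included
        kpi.setValid(Integer.parseInt(m.group(6)) < 400); // same validity rule as parser()
        return kpi;
    }
}

Usage would mirror parser(): KPI kpi = CombinedLogParser.parse(line), with the valid flag gating the MapReduce jobs exactly as before.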

2. Code to compute unique IPs, PVs, and the total byte count

package week5;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class KPIIPVBYTE {

    public static class ParseLogMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            KPI kpi = KPI.filterIPs(value.toString());
            if (kpi.isValid()) {
                // Three synthetic keys route the three metrics to the reducer.
                output.collect(new Text("ip"), new Text(kpi.getRemote_addr()));
                output.collect(new Text("pv"), new Text("1"));
                output.collect(new Text("ps"), new Text(
                        "".equals(kpi.getBody_bytes_sent()) ? "0" : kpi.getBody_bytes_sent()));
            }
        }
    }

    public static class ParseLogReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        private Set<String> count = new HashSet<String>();
        private long sumPv = 0;
        private long sumPs = 0;

        @Override
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String keys = key.toString().toLowerCase().trim();
            if ("ip".equals(keys)) {
                while (values.hasNext()) {
                    count.add(values.next().toString()); // HashSet deduplicates IPs
                }
                output.collect(new Text("Unique IPs:"), new Text(String.valueOf(count.size())));
            } else if ("pv".equals(keys)) {
                while (values.hasNext()) {
                    sumPv += Long.parseLong(values.next().toString());
                }
                output.collect(new Text("PVs:"), new Text(String.valueOf(sumPv)));
            } else if ("ps".equals(keys)) {
                while (values.hasNext()) {
                    sumPs += Long.parseLong(values.next().toString());
                }
                sumPs = sumPs / 1024 / 1024; // bytes -> MB
                output.collect(new Text("Total bytes:"), new Text(sumPs + "M"));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
        String output = "hdfs://hadoop1:9000/week5/out/";

        JobConf conf = new JobConf(KPIIPVBYTE.class);
        conf.setJobName("IPPVPS");
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(ParseLogMapper.class);
        conf.setReducerClass(ParseLogReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));
        JobClient.runJob(conf);
        System.exit(0);
    }
}
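A side note for re-running this job: Hadoop refuses to start when the output directory already exists. A minimal helper sketch, assuming the same JobConf and output string as above, clears it first; it would be called from main() before JobClient.runJob(conf):

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class OutputCleaner {
    // Hypothetical helper: delete the job's output directory if it exists,
    // so the job can be re-run without a FileAlreadyExistsException.
    public static void clearOutput(JobConf conf, String output) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(output), conf);
        Path outPath = new Path(output);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true); // true = delete recursively
        }
    }
}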

Result:

Unique IPs:	34413
Total bytes:	67627M
PVs:	2910085

II. Count the referrer sites: list each domain and the number of unique IPs it brought

1. Referrer-count code

package week5;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SourceCount {

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {
        private Text state = new Text();
        private Text ip = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterIPs(value.toString());
            if (kpi.isValid()) {
                // The task asks for domains, so key on the referrer's domain
                // rather than the full referrer URL.
                state.set(kpi.getHttp_referer_domain());
                ip.set(kpi.getRemote_addr());
                context.write(state, ip);
            }
        }
    }

    public static class SumReducer extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            Set<String> ips = new HashSet<String>();
            for (Text val : values) {
                ips.add(val.toString()); // deduplicate IPs per domain
            }
            result.set(String.valueOf(ips.size()));
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(SourceCount.class);
        job.setMapperClass(MyMapper.class);
        // Do NOT register SumReducer as a combiner: it replaces IP lists with
        // counts, and distinct counts from different map outputs cannot be
        // merged by re-running the same reducer (see the note below).
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job,
                new Path("hdfs://hadoop1:9000/week5/in/access.20120104.log"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://hadoop1:9000/week5/out3"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
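One caveat on this job: the original version also registered SumReducer as the combiner, which is removed above. A distinct-count reducer is not a valid combiner, because the combiner would replace each domain's IP list with a count, and the reducer would then be deduplicating count strings rather than IPs, yielding wrong totals. If map-side reduction is wanted, one combiner-safe pattern is to dedupe (domain, ip) pairs through the shuffle first and count per domain in a second job. A minimal sketch of stage 1, with hypothetical class names and a driver that would mirror SourceCount.main():

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Stage 1 of a hypothetical two-stage distinct count.
// Key = "domain\tip", so the shuffle itself removes duplicate pairs.
public class DistinctPairStage {

    public static class PairMapper extends Mapper<Object, Text, Text, NullWritable> {
        private Text pair = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterIPs(value.toString());
            if (kpi.isValid()) {
                pair.set(kpi.getHttp_referer_domain() + "\t" + kpi.getRemote_addr());
                context.write(pair, NullWritable.get());
            }
        }
    }

    public static class PairReducer
            extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get()); // one line per unique (domain, ip)
        }
    }
    // Stage 2 is then an ordinary word count keyed on the domain column.
}

Because PairReducer only collapses duplicate keys, it is idempotent and can safely double as the combiner for stage 1.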

Result:

(Result excerpt omitted: the output is too long to list in full, so only a portion was taken as evidence.)

III. Count the browser types visitors used and compute each browser's percentage of the total

1. Browser-share code

package week5;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class KPIBrowser {

    // Total number of valid records, incremented in the mapper and read in the
    // reducer. NOTE: sharing state through a static field only works when map
    // and reduce tasks run in the same JVM (local/standalone mode); on a real
    // cluster the reducer would see 0 (see the counter-based sketch below).
    private static long total = 0;

    public static class KPIBrowserMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, IntWritable> {
        private Text browserInfo = new Text();
        private IntWritable one = new IntWritable(1);

        public void map(Object key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            KPI kpi = KPI.filterBrowser(value.toString());
            if (kpi.isValid()) {
                browserInfo.set(kpi.getHttp_user_agent());
                total++;
                output.collect(browserInfo, one);
            }
        }
    }

    public static class KPIBrowserReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            long sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            result.set((double) sum / total * 100 + "%");
            output.collect(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
        String output = "hdfs://hadoop1:9000/week5/out4";

        JobConf conf = new JobConf(KPIBrowser.class);
        conf.setJobName("KPIBrowser");
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(KPIBrowserMapper.class);
        // A combiner would be safe for raw counts but not for the percentage
        // computation, so none is set here.
        conf.setReducerClass(KPIBrowserReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));
        JobClient.runJob(conf);
        System.exit(0);
    }
}
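As flagged in the comment on total above, a static field is shared between mapper and reducer only when both run in one JVM, i.e. in local/standalone mode; on a distributed cluster every reduce task starts with total == 0 and the division breaks. A cluster-safe sketch, with hypothetical names and a reducer that would emit raw counts instead of percentages, counts valid lines with a Hadoop counter and computes shares in the driver after the job finishes:

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class BrowserShareDriver {

    // Incremented in the mapper via reporter.incrCounter(LogCounter.VALID_LINES, 1)
    // instead of touching a static field.
    public enum LogCounter { VALID_LINES }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(BrowserShareDriver.class);
        // ... same mapper/reducer/format/path setup as in KPIBrowser.main() ...
        RunningJob job = JobClient.runJob(conf); // blocks until completion
        long total = job.getCounters().getCounter(LogCounter.VALID_LINES);
        System.out.println("valid lines: " + total);
        // Each browser's share is then 100.0 * count / total, computed in a
        // quick second pass over the small per-browser output.
    }
}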

2. Result:

Each line below is a User-Agent string followed by its share of total PVs (excerpt):

""	2.0617954458374926E-4%
"(C)Nokia6700s/SymbianOS/9.1 Series60/3.0"	3.4363257430624875E-5%
"-"	1.5351785257131665%
"-" "-"	6.872651486124975E-5%
"-" "Mozilla/4.0"	1.0308977229187463E-4%
"Amoi-F90/Plat-F/WAP2.0/MIDP1.0/CLDC1.0 UP.Browser/6.2.2.6.f.1.100	6.872651486124975E-5%
"AmoiE70/6.1.08/WAP2.0 Profile/MIDP2.0	3.4363257430624875E-5%
"AndroidDownloadManager"	6.872651486124975E-5%
"Apache-HttpClient/4.1 (java	3.4363257430624875E-5%
"Apache-HttpClient/4.1.1 (java	3.4363257430624875E-5%
"AppEngine-Google; (+http://code.google.com/appengine;	2.0617954458374926E-4%
"Apple-PubSub/28"	0.001683799614100619%
"Apple-PubSub/65"	3.4363257430624875E-5%
"Apple-PubSub/65.20"	3.4363257430624875E-5%
"Apple-PubSub/65.23"	6.185386337512477E-4%
"Apple-PubSub/65.28 AppEngine-Google;	3.4363257430624875E-5%
"Apple-PubSub/65.28"	0.015807098418087445%
"BGSY bot/1.0"	0.0041235908916749855%
"BaiduMobile/1.3.1 CFNetwork/485.12.7	3.4363257430624875E-5%
"BaiduMobile/1.3.2 CFNetwork/548.0.4	3.4363257430624875E-5%
"BaiduMobile/1.3.4 CFNetwork/485.12.7	6.872651486124975E-5%
"BaiduMobile/1.3.4 CFNetwork/485.13.9	6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/485.12.7	3.4363257430624875E-5%
"BaiduMobile/1.3.5 CFNetwork/485.13.9	2.4054280201437413E-4%
"BaiduMobile/1.3.5 CFNetwork/548.0.3	6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/548.0.4	1.374530297224995E-4%
"Baiduspider"	1.374530297224995E-4%
"Baiduspider+(+http://www.baidu.com/search/spider.htm)"	0.01147732798182871%
"Baiduspider-news+(+http://www.baidu.com/search/spider.htm)"	0.009003173446823718%
"BlackBerry8820/2.7.0.105-4.5.0.174 Profile/MIDP-2.0	1.718162871531244E-4%
"BlackBerry8900/5.2.0.96 Profile/MIDP-2.0	6.872651486124975E-5%
"BlackBerry9000/5.2.0.89 Profile/MIDP-2.0	3.4363257430624875E-5%
"BlackBerry9700/5.0.0.862 Profile/MIDP-2.1	3.4363257430624875E-5%
"CoolPadF800/CMCC WindowsCEOS/6.0/(2009.10.30)10.01.F800/WAP2.0	1.374530297224995E-4%
"Dalvik/1.2.0 (Linux;	5.49812118889998E-4%
"Dalvik/1.4.0 (Linux;	3.4363257430624875E-5%
"DoCoMo/2.0 N905i(c100;TB;W24H16)	3.779958317368737E-4%
"DoCoMo/2.0 P900i(c100;TB;W24H11)	1.374530297224995E-4%
"Domnutch-Bot/Nutch-1.0 (Domnutch;	6.872651486124975E-5%
"Doubanbot/1.0 (bot@douban.com	9.278079506268717E-4%
"E63/SymbianOS/9.1 Series60/3.0"	2.4054280201437413E-4%
"E66/SymbianOS/9.1 Series60/3.0"	4.123590891674985E-4%
"E71/SymbianOS/9.1 Series60/3.0"	6.872651486124975E-5%
"FTRF: Friendly	6.185386337512477E-4%
"Feed43 Proxy/1.0	3.779958317368737E-4%
"FeedDemon/3.1 (http://www.feeddemon.com/;	6.872651486124975E-5%
"FeedDemon/4.0 (http://www.feeddemon.com/;	0.00402050111938311%
"FeedFetcher-Google-CoOp; (+http://www.google.com/coop/cse/cref)"	7.559916634737474E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)"	6.872651486124976E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html;	0.3223273546992614%
"Feedreader 3.14	0.0035394155153543627%
"GIONEE-L011/SW1.0.0/WAP2.0/MIDP2.1 Configuration/CLDC-1.1"	6.872651486124975E-5%
"GoodReaderIPad/3.12.0 CFNetwork/548.0.4	3.4363257430624875E-5%
"GoogleProducer"	0.0015119833269474948%
"Googlebot-Image/1.0"	1.0308977229187463E-4%
"Googlebot/2.1 (+http://www.google.com/bot.html)"	0.0017181628715312442%
"Googlebot/2.1 (+http://www.googlebot.com/bot.html)"	2.0617954458374926E-4%
"Googlebot/2.1 (http://www.googlebot.com/bot.html)"	1.0308977229187463E-4%
"GreatNews/1.0"	0.012576952219608707%
"HD_T8282 Mozilla/4.0	3.4363257430624875E-5%
"HTCT9188_TD/1.0 WindowsMobile/6.5	1.0308977229187463E-4%
"HTC_Touch_Diamond2_T5353 Mozilla/4.0	1.0308977229187463E-4%
"HTC_Touch_Pro_T7272 Mozilla/4.0	3.4363257430624875E-5%
"HTMLParser/1.6"	6.872651486124975E-5%
"HTTP Fetcher/HTTP/1.0"	3.4363257430624875E-5%
"HTTP_Request2/2.0.0 (http://pear.php.net/package/http_request2)	2.74906059444999E-4%
"Holleycomm-H8800/2.0 WAP2.0	3.436325743062488E-4%
"HuaweiSymantecSpider/1.0+DSE-support@huaweisymantec.com+(compatible; MSIE	0.04123590891674985%
"HuaweiT5211_TD/1.0 RTKE_OS/01.00	1.718162871531244E-4%
"HuaweiT8100_TD/1.0 Android/2.2	1.374530297224995E-4%
"HuaweiU7520/B000 Browser/NetFront/4.1	3.4363257430624875E-5%
"Huaweisymantecspider (compatible;	0.027490605944499907%
"IUC(U;iOS 3.1.3;Zh-cn;320*480;)"	3.4363257430624875E-5%
"IUC(U;iOS 4.1;Zh-cn;320*480;)/UCWEB8.1.0.104/41/997"	4.123590891674985E-4%
...