MapReduce Web日志预处理

来源:互联网 发布:口腔医学专升本知乎 编辑:程序博客网 时间:2024/06/05 11:41

1、需求:
对web访问日志中的各字段识别切分
去除日志中不合法的记录
根据KPI统计需求,生成各类访问请求过滤数据

2、实现代码:
a) 定义一个bean,用来记录日志数据中的各数据字段
/**
 * Holds the individual fields of one web access-log record.
 *
 * <p>All fields are plain strings exactly as set by the caller; the bean does
 * no validation of its own. {@link #toString()} serialises the record as a
 * single line whose fields are separated by the {@code \001} control character.
 */
public class WebLogBean {

    /** Separator placed between fields in {@link #toString()}. */
    private static final String FIELD_SEPARATOR = "\001";

    /** Client IP address. */
    private String remote_addr;
    /** Client user name; the log uses "-" when it is absent. */
    private String remote_user;
    /** Access time and time zone. */
    private String time_local;
    /** Requested URL and HTTP protocol. */
    private String request;
    /** Request status; 200 means success. */
    private String status;
    /** Size of the body content sent to the client. */
    private String body_bytes_sent;
    /** Page from which the visit was linked. */
    private String http_referer;
    /** Information about the client's browser. */
    private String http_user_agent;
    /** Whether this record is considered valid; defaults to {@code true}. */
    private boolean valid = true;

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    /**
     * Serialises the record as one line: the valid flag first, then the eight
     * string fields in declaration order, joined by {@code \001}.
     *
     * <p>A field that was never set is rendered as the text {@code "null"},
     * because that is how {@link StringBuilder#append(String)} treats null.
     *
     * @return the {@code \001}-delimited representation of this record
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(this.valid);
        sb.append(FIELD_SEPARATOR).append(this.remote_addr);
        sb.append(FIELD_SEPARATOR).append(this.remote_user);
        sb.append(FIELD_SEPARATOR).append(this.time_local);
        sb.append(FIELD_SEPARATOR).append(this.request);
        sb.append(FIELD_SEPARATOR).append(this.status);
        sb.append(FIELD_SEPARATOR).append(this.body_bytes_sent);
        sb.append(FIELD_SEPARATOR).append(this.http_referer);
        sb.append(FIELD_SEPARATOR).append(this.http_user_agent);
        return sb.toString();
    }
}

b) 定义一个parser,用来解析并过滤web访问日志原始记录
public class WebLogParser {
public static WebLogBean parser(String line) {
WebLogBean webLogBean = new WebLogBean();
String[] arr = line.split(” “);
if (arr.length > 11) {
webLogBean.setRemote_addr(arr[0]);
webLogBean.setRemote_user(arr[1]);
webLogBean.setTime_local(arr[3].substring(1));
webLogBean.setRequest(arr[6]);
webLogBean.setStatus(arr[8]);
webLogBean.setBody_bytes_sent(arr[9]);
webLogBean.setHttp_referer(arr[10]);

        if (arr.length > 12) {            webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]);        } else {            webLogBean.setHttp_user_agent(arr[11]);        }        if (Integer.parseInt(webLogBean.getStatus()) >= 400) {// 大于400,HTTP错误            webLogBean.setValid(false);        }    } else {        webLogBean.setValid(false);    }    return webLogBean;}public static String parserTime(String time) {    time.replace("/", "-");    return time;}

}

c) MapReduce程序
public class WeblogPreProcess {

static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {    Text k = new Text();    NullWritable v = NullWritable.get();    @Override    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {        String line = value.toString();        WebLogBean webLogBean = WebLogParser.parser(line);        if (!webLogBean.isValid())            return;        k.set(webLogBean.toString());        context.write(k, v);    }}public static void main(String[] args) throws Exception {    Configuration conf = new Configuration();    Job job = Job.getInstance(conf);    job.setJarByClass(WeblogPreProcess.class);    job.setMapperClass(WeblogPreProcessMapper.class);    job.setOutputKeyClass(Text.class);    job.setOutputValueClass(NullWritable.class);    FileInputFormat.setInputPaths(job, new Path(args[0]));    FileOutputFormat.setOutputPath(job, new Path(args[1]));    job.waitForCompletion(true);}

}