Pig 学习之 日志处理

来源:互联网 发布:微软sql server 编辑:程序博客网 时间:2024/05/20 20:55

以Apache 日志为例。

创建加载器

    @Override    public Tuple getNext() throws IOException {        tuple = new ArrayList<Object>(11);        for (int i = 0; i < 11; i++) {            tuple.add(null);        }        try {            // 如果recordReader 读取到input split 的末端,则返回null            if (!in.nextKeyValue()) {                return null;            }            setTuple(in.getCurrentValue());            return factory.newTupleNoCopy(tuple);        } catch (InterruptedException e) {            int errCode = 6018;            String errMsg = "Error while reading input";            throw new ExecException(errMsg, errCode,                    PigException.REMOTE_ENVIRONMENT, e);        }    }    // 设置 tuple    private void setTuple(CommonLogWritable entry) throws IOException {        tuple.set(0, entry.getRemoteAddress());        tuple.set(1, entry.getRemoteLogname());        tuple.set(2, entry.getUserId());        tuple.set(3, entry.getTime());        tuple.set(4, entry.getRequestLine());        tuple.set(5, entry.getStatusCode());        tuple.set(6, entry.getObjSize());        tuple.set(7, entry.getMethod());        tuple.set(8, entry.getResource());        tuple.set(9, entry.getProtocol());        tuple.set(10, entry.getEpoch());    }

我们希望加载器可以指定字段的类型信息,这里需要实现LoadMetadata接口,并提供字段名称和相关信息的有序列表。

    /**
     * Declares the ordered field names and types produced by this loader so Pig
     * can type-check scripts that reference the columns by name.
     */
    @Override
    public ResourceSchema getSchema(String location, Job job) throws IOException {
        Schema logSchema = new Schema(Arrays.asList(
                new Schema.FieldSchema(CommonLogLoaderConstants.REMOTE_ADDR, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.REMOTE_LOGNAME, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.USERID, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.TIME, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.REQUEST_LINE, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.STATUS_CODE, DataType.LONG),
                new Schema.FieldSchema(CommonLogLoaderConstants.OBJ_SIZE, DataType.LONG),
                new Schema.FieldSchema(CommonLogLoaderConstants.METHOD, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.RESOURCE, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.PROTOCOL, DataType.CHARARRAY),
                new Schema.FieldSchema(CommonLogLoaderConstants.EPOCH, DataType.LONG)));
        return new ResourceSchema(logSchema);
    }

pig脚本:

REGISTER pig.jar;
REGISTER geoip-api-1.2.14.jar;

DEFINE LogLoader com.hadoop2.pig.CommonLogLoader();

logs = LOAD 'access.log' USING LogLoader;

/* Count requests per HTTP status code (e.g. to spot failures). */
grpd = GROUP logs BY statusCode;
cntd = FOREACH grpd GENERATE group,COUNT(logs);

/* Project only the fields we need to reduce downstream data volume. */
projected_logs = FOREACH logs GENERATE remoteAddr,statusCode,resource;
ip_group = GROUP projected_logs BY (remoteAddr,statusCode);
addrstatus_count = FOREACH ip_group GENERATE FLATTEN(group),COUNT(projected_logs);

/* Append the geolocation of each client address via a custom UDF. */
DEFINE GeoIP com.hadoop2.pig.PigGeolocationUDF();
countries = FOREACH addrstatus_count GENERATE *,GeoIP(remoteAddr);
dump countries;

输出结果:

(10.0.1.75,302,2)(10.0.1.91,200,12)(10.0.1.91,301,2)(10.0.5.17,200,114)(10.0.5.17,301,2)(10.0.5.17,302,2)(10.0.5.17,503,2)(10.0.5.50,200,8)(10.0.5.78,200,74)(10.0.5.91,200,26)(10.0.5.92,200,58)(10.0.5.92,503,8)(10.0.5.173,200,12)(10.0.5.211,200,62)(10.0.6.168,200,8)(10.0.6.193,200,54)(10.0.6.196,200,16)(10.0.6.197,200,636)(10.0.6.247,200,6)(172.16.1.36,200,2380)(172.16.1.36,503,64)

由于我的日志都来源于内网,所以最后一步的国家/地区统计没有得到有效结果!

0 0
原创粉丝点击