Hadoop2.8.0<Mapreduce实现多表关联>

来源:互联网 发布:java中命令行是什么 编辑:程序博客网 时间:2024/06/04 19:57

1.数据结构

fanData_WT02287.csv

数据说明:

数据来源 风机编号 时间 风机状态 风速 电机转速 桨叶转速 风向 偏航角度 齿轮箱油温 齿轮箱轴承油温 环境温度 机舱温度 发电机温度 A相电流 B相电流 C相电流 A相电压 B相电压 C相电压 电机频率 无功功率 有功功率 功率因素 总发电量 总发电时间 故障时间 备用时间 备注
taizhang.csv

数据说明

项目公司 风机编号 区域 项目公司 风场名称 风电场简称 区域 风场ID 风场接入时间 风机编号 风机名称 风机类型 机组容量 风机厂家 机组协议号

需求:

SELECT b.*,a.风场名称,a.区域,a.项目公司,a.风机名称,a.风机类型 FROM taizhang a, fanData b where a.fanNo=b.FanNo;

将两个文件上传到HDFS上

2.BasicInfo
这个类用来解析taizhang数据,生成Bean

/**
 * Bean for one turbine master-data (taizhang) record.
 * {@link #getInfo(String)} populates the fields from a comma-separated row;
 * only the columns the join needs are kept.
 */
public class BasicInfo {
    private String fanNo;
    private String area;
    private String project;
    private String fanName;
    private String shortName;
    private String farmId;
    private String farmName;
    private String fanType;

    /**
     * Parses a taizhang CSV row and stores the relevant columns.
     * Column layout (0-based): 1=fanNo, 2=area, 3=project, 4=farmName,
     * 5=shortName, 7=farmId, 10=fanName, 11=fanType.
     *
     * @param row one comma-separated line from taizhang.csv
     */
    public void getInfo(String row) {
        final String[] fields = row.split(",");
        fanNo = fields[1];
        area = fields[2];
        project = fields[3];
        farmName = fields[4];
        shortName = fields[5];
        farmId = fields[7];
        fanName = fields[10];
        fanType = fields[11];
    }

    public String getFanNo() {
        return fanNo;
    }

    public void setFanNo(String fanNo) {
        this.fanNo = fanNo;
    }

    public String getArea() {
        return area;
    }

    public void setArea(String area) {
        this.area = area;
    }

    public String getProject() {
        return project;
    }

    public void setProject(String project) {
        this.project = project;
    }

    public String getFanName() {
        return fanName;
    }

    public void setFanName(String fanName) {
        this.fanName = fanName;
    }

    public String getShortName() {
        return shortName;
    }

    public void setShortName(String shortName) {
        this.shortName = shortName;
    }

    public String getFarmId() {
        return farmId;
    }

    public void setFarmId(String farmId) {
        this.farmId = farmId;
    }

    public String getFarmName() {
        return farmName;
    }

    public void setFarmName(String farmName) {
        this.farmName = farmName;
    }

    public String getFanType() {
        return fanType;
    }

    public void setFanType(String fanType) {
        this.fanType = fanType;
    }

    @Override
    public String toString() {
        // Same text as before, assembled with a StringBuilder.
        StringBuilder sb = new StringBuilder("join.BasicInfo [fanNo=");
        sb.append(fanNo).append(", area=").append(area)
          .append(", project=").append(project)
          .append(", fanName=").append(fanName)
          .append(", shortName=").append(shortName)
          .append(", farmId=").append(farmId)
          .append(", farmName=").append(farmName)
          .append(", fanType=").append(fanType)
          .append("]");
        return sb.toString();
    }
}

3.FanData
这个类用来解析fandata生成对应的bean

/**
 * Bean for one turbine telemetry (fanData) record.
 * {@link #getInstance(String)} populates the fields from a comma-separated row;
 * only the columns of interest are retained.
 */
public class FanData {
    private String dataSource;   // data source (column 0)
    private String fanNo;        // fan number (column 1)
    private String windSpeed;    // wind speed (column 4)
    private String time;         // timestamp (column 2)
    private String rs;           // rotation speed (column 6)

    /**
     * Parses a fanData CSV row and stores the relevant columns.
     *
     * @param row one comma-separated line from fanData_*.csv
     */
    public void getInstance(String row) {
        final String[] fields = row.split(",");
        dataSource = fields[0];
        fanNo = fields[1];
        time = fields[2];
        windSpeed = fields[4];
        rs = fields[6];
    }

    public String getDataSource() {
        return dataSource;
    }

    public void setDataSource(String dataSource) {
        this.dataSource = dataSource;
    }

    public String getFanNo() {
        return fanNo;
    }

    public void setFanNo(String fanNo) {
        this.fanNo = fanNo;
    }

    public String getWindSpeed() {
        return windSpeed;
    }

    public void setWindSpeed(String windSpeed) {
        this.windSpeed = windSpeed;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getRs() {
        return rs;
    }

    public void setRs(String rs) {
        this.rs = rs;
    }

    @Override
    public String toString() {
        // NOTE: time is intentionally not part of the original string form.
        StringBuilder sb = new StringBuilder();
        sb.append("dataSource=").append(dataSource)
          .append(",fanNo=").append(fanNo)
          .append(",windSpeed=").append(windSpeed)
          .append(",rs=").append(rs);
        return sb.toString();
    }
}

4.mapreduce

public class TableJoin {    public static class TableJoinMapper extends Mapper<Object, Text, Text, Text> {        private FanData fanData = new FanData();        private BasicInfo bi = new BasicInfo();        private Text text = new Text();        @Override        protected void map(Object key, Text value, Context context)                throws IOException, InterruptedException {            InputSplit inputSplit = context.getInputSplit();            String fileName = ((FileSplit) inputSplit).getPath().getName();            if (fileName.contains("fanData")) {                fanData.getInstance(value.toString());                String fanNo = fanData.getFanNo();                //在这里给数据加上标签                text.set(value.toString() + "_fandata");                context.write(new Text(fanNo), text);            } else if (fileName.contains("taizhang")) {                bi.getInfo(value.toString());                String fanNo = bi.getFanNo();                text.set(value.toString() + "_bi");                context.write(new Text(fanNo), text);            }        }    }    public static class TableJoinReducer extends Reducer<Text, Text, Text, NullWritable> {        private BasicInfo bi = new BasicInfo();        @Override        protected void reduce(Text key, Iterable<Text> values, Context context)                throws IOException, InterruptedException {            List<String> fanDatas = new ArrayList<String>();            String row = null;            for (Text value : values) {                row = value.toString();                if (row.endsWith("fandata")) {                    fanDatas.add(row);                } else {                    bi.getInfo(value.toString());                }            }            for (String fanData : fanDatas) {                String newRow = fanData.substring(0, fanData.length() - 8) + "," + bi.getFarmName() + "," + bi.getArea() + "," + bi.getProject() + "," + bi.getFanType();                System.out.println(newRow);                
context.write(new Text(newRow), NullWritable.get());            }        }    }    public static void main(String[] args) throws Exception {        Configuration conf = new Configuration();        Job job = Job.getInstance(conf, "Table Join");        HDFSUtils hdfs = new HDFSUtils(conf);        hdfs.deleteDir(args[2]);        job.setJarByClass(TableJoin.class);        job.setMapperClass(TableJoinMapper.class);        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(Text.class);        job.setReducerClass(TableJoinReducer.class);        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(NullWritable.class);        FileInputFormat.addInputPath(job, new Path(args[1]));        FileInputFormat.addInputPath(job, new Path(args[0]));        FileOutputFormat.setOutputPath(job, new Path(args[2]));        System.exit(job.waitForCompletion(true) ? 0 : 1);    }}

大体思路是这样子的:
在map阶段首先判断输入文件是哪一个文件中的内容

InputSplit inputSplit = context.getInputSplit();String fileName = ((FileSplit) inputSplit).getPath().getName();

输出的(k,v)中,k是fanNo,v是文件中的一整行数据并追加上来源标签:fanData文件的行追加"_fandata",taizhang文件的行追加"_bi",以便reduce阶段区分两个来源。

然后在reduce阶段,相同key的只有一条basicinfo,但是有多条fandata数据,所以使用一个BasicInfo对象存储这条basicinfo数据,再使用一个ArrayList存储所有相同key的fandata,
最后是将ArrayList中的每一条数据追加上basicinfo的信息(需要注意的是,在追加的时候,将map阶段的追加的数据给删除掉,比如_fandata)

最终的结果如下:

pi10mr,WT02287,2015-09-01 00:30:00,1,3.67,0,0,349.53,0,40.36,40.16,20.32,25.88,46.13,0,0,0,0,0,0,0,0,-0.47,0.39,9.46539e+06,0,0,0,15,富饶山风电场,辽宁,沈阳龙源雄亚风力发电有限公司,GW50-750pi10mr,WT02287,2015-09-01 00:20:00,1,2.81,0,0,217.79,0,40.57,40.3,20.37,26.07,46.44,0,0,0,0,0,0,0,0,-0.5,0.31,9.46539e+06,0,0,0,24,富饶山风电场,辽宁,沈阳龙源雄亚风力发电有限公司,GW50-750pi10mr,WT02287,2015-09-01 00:10:00,1,3.22,0,0,224.85,0,40.61,40.53,20.43,26.2,46.75,0,0,0,0,0,0,0,0,-0.3,0.48,9.46539e+06,0,0,0,20,富饶山风电场,辽宁,沈阳龙源雄亚风力发电有限公司,GW50-750pi10mr,WT02287,2015-09-01 00:00:00,1,3.54,0,0,214.89,0,40.76,40.64,20.5,26.31,47.05,0,0,0,0,0,0,0,0,-0.52,0.32,9.46539e+06,0,0,0,27,富饶山风电场,辽宁,沈阳龙源雄亚风力发电有限公司,GW50-750