维度模型数据仓库(六) —— 增加列

来源:互联网 发布:淘宝做什么生意比较好 编辑:程序博客网 时间:2024/06/06 16:34
(五)进阶技术
        1. 增加列
        数据仓库最常碰到的扩展是给一个已经存在的维度表和事实表添加列。本篇先讨论如果需要增加列,模式会发生怎样的变化。然后进一步说明如何在客户维度和销售订单事实表上添加列,并在新列上应用SCD2。假设需要在客户维度中增加送货地址属性,并在销售订单事实表中增加数量度量值。

        修改数据库模式

        图(五)- 1-1 显示了修改后的模式,在它的customer_dim表和sales_order_fact表上增加了新列。customer_dim表增加的新列是shipping_address、shipping_zip_code、shipping_city和shipping_state。sales_order_fact表增加的新列是order_quantity。使用清单(五)-1-1里的SQL脚本修改数据库模式。

图(五)- 1-1

USE dw;ALTER TABLE customer_dim  ADD shipping_address VARCHAR (50) AFTER customer_state, ADD shipping_zip_code INT (5) AFTER shipping_address, ADD shipping_city VARCHAR (30) AFTER shipping_zip_code, ADD shipping_state VARCHAR (2) AFTER shipping_city ;ALTER TABLE customer_stg  ADD shipping_address VARCHAR (50) AFTER customer_state, ADD shipping_zip_code INT (5) AFTER shipping_address, ADD shipping_city VARCHAR (30) AFTER shipping_zip_code, ADD shipping_state VARCHAR (2) AFTER shipping_city ;ALTER TABLE sales_order_factADD order_quantity INT AFTER order_amount ;USE source;ALTER TABLE customer  ADD shipping_address VARCHAR (50) AFTER customer_state, ADD shipping_zip_code INT (5) AFTER shipping_address, ADD shipping_city VARCHAR (30) AFTER shipping_zip_code, ADD shipping_state VARCHAR (2) AFTER shipping_city ;ALTER TABLE sales_order  ADD order_quantity INT AFTER order_amount ;
清单(五)-1-1

        修改定期装载脚本
        修改数据库模式后,还要修改已经使用的定期装载脚本。清单(五)-1-2显示了修改后的定期装载SQL脚本。
USE dw;-- 设置SCD的截止时间和生效时间SET @pre_date = SUBDATE(CURRENT_DATE,1) ;-- 设置CDC的上限时间UPDATE cdc_time SET current_load = CURRENT_DATE ;-- 装载客户维度TRUNCATE TABLE customer_stg;INSERT INTO customer_stgSELECT   customer_number, customer_name, customer_street_address, customer_zip_code, customer_city, customer_state, shipping_address, shipping_zip_code, shipping_city, shipping_stateFROM source.customer ;/* 在所有地址列上 SCD2                           *//* 置过期                          */UPDATE customer_dim a,    customer_stg b SET     expiry_date = @pre_dateWHERE    a.customer_number = b.customer_number        AND (a.customer_street_address <> b.customer_street_address        OR a.customer_city <> b.customer_city        OR a.customer_zip_code <> b.customer_zip_code        OR a.customer_state <> b.customer_state        OR a.shipping_address <> b.shipping_address        OR a.shipping_city <> b.shipping_city        OR a.shipping_zip_code <> b.shipping_zip_code        OR a.shipping_state <> b.shipping_state        OR a.shipping_address IS NULL        OR a.shipping_city IS NULL        OR a.shipping_zip_code IS NULL        OR a.shipping_state IS NULL)        AND expiry_date = '2200-01-01';/* 加新行                          */INSERT INTO customer_dimSELECT  NULL, b.customer_number, b.customer_name, b.customer_street_address, b.customer_zip_code, b.customer_city, b.customer_state, b.shipping_address, b.shipping_zip_code, b.shipping_city, b.shipping_state, a.version + 1, @pre_date, '2200-01-01'FROM  customer_dim a, customer_stg bWHERE    a.customer_number = b.customer_numberAND ( a.customer_street_address <> b.customer_street_address     OR a.customer_city <> b.customer_city     OR a.customer_zip_code <> b.customer_zip_code     OR a.customer_state <> b.customer_state     OR a.shipping_address <> b.shipping_address     OR a.shipping_city <> b.shipping_city     OR a.shipping_zip_code <> b.shipping_zip_code     OR a.shipping_state <> b.shipping_state     OR a.shipping_address IS NULL     OR a.shipping_city IS NULL     OR a.shipping_zip_code IS NULL     OR a.shipping_state IS NULL)AND EXISTS(SELECT *FROM customer_dim xWHERE    b.customer_number=x.customer_numberAND a.expiry_date = @pre_date )AND NOT EXISTS (SELECT *FROM customer_dim yWHERE    b.customer_number = y.customer_numberAND y.expiry_date = '2200-01-01') ;/* 在 customer_name 列上 SCD1                                             */UPDATE customer_dim a, customer_stg bSET a.customer_name = b.customer_nameWHERE a.customer_number = b.customer_number      AND a.customer_name <> b.customer_name ;/* 新增的客户                                                   */INSERT INTO customer_dimSELECT  NULL, customer_number, customer_name, customer_street_address, customer_zip_code, customer_city, customer_state, shipping_address, shipping_zip_code, shipping_city, shipping_state, 1, @pre_date,'2200-01-01'FROM customer_stgWHERE customer_number NOT IN(SELECT y.customer_numberFROM customer_dim x, customer_stg yWHERE x.customer_number = y.customer_number) ;/* 装载产品维度                                           */TRUNCATE TABLE product_stg ;INSERT INTO product_stgSELECT   product_code, product_name, product_categoryFROM source.product ;/* 在 product_name 和 product_category 列上 SCD2                                    *//* 置过期                                 */UPDATE  product_dim a, product_stg bSET  expiry_date = @pre_dateWHERE    a.product_code = b.product_codeAND (   a.product_name <> b.product_name     OR a.product_category <> b.product_category)AND expiry_date = '2200-01-01';/* 加新行                                */INSERT INTO product_dimSELECT  NULL, b.product_code, b.product_name, b.product_category, a.version + 1, @pre_date,'2200-01-01'FROM  product_dim a, product_stg bWHERE    a.product_code = b.product_codeAND (   a.product_name <> b.product_name     OR a.product_category <> b.product_category)AND EXISTS(SELECT *FROM product_dim xWHERE     b.product_code = x.product_code      AND a.expiry_date = @pre_date)AND NOT EXISTS (SELECT *FROM product_dim yWHERE     b.product_code = y.product_code      AND y.expiry_date = '2200-01-01') ;/* 新增的产品                                                    */INSERT INTO product_dimSELECT  NULL, product_code, product_name, product_category, 1, @pre_date, '2200-01-01'FROM product_stgWHERE product_code NOT IN(SELECT y.product_codeFROM product_dim x, product_stg yWHERE x.product_code = y.product_code) ;-- 装载订单维度,新增前一天的订单号INSERT INTO order_dim (  order_number, effective_date, expiry_date)SELECT  order_number, order_date, '2200-01-01'FROM source.sales_order, cdc_timeWHERE entry_date >= last_load AND entry_date < current_load ;-- 装载事实表,新增前一天的订单INSERT INTO sales_order_factSELECT  order_sk, customer_sk, product_sk, date_sk, order_amount, order_quantityFROM  source.sales_order a, order_dim b, customer_dim c, product_dim d, date_dim e, cdc_time fWHERE    a.order_number = b.order_numberAND a.customer_number = c.customer_numberAND a.order_date >= c.effective_dateAND a.order_date < c.expiry_dateAND a.product_code = d.product_codeAND a.order_date >= d.effective_dateAND a.order_date < d.expiry_dateAND a.order_date = e.dateAND a.entry_date >= f.last_load AND a.entry_date < f.current_load ;-- 更新时间戳表的last_load字段UPDATE cdc_time SET last_load = current_load ;COMMIT ;
清单(五)-1-2

        假设源数据里的客户送货地址可以为空,当客户在数据源里的送货地址变为有值时,就要更新这些数据仓库中已经存在的客户。如果还想维护送货地址的历史数据,就要在送货地址列上应用SCD2。还假设数量在销售订单源数据中是有效的,并且数据仓库中已经存在的销售订单不做更新。

        测试步骤:
  1. 执行清单(五)- 1-3里的SQL脚本准备准备客户和销售订单测试数据。
  2. 设置系统日期为2015年3月3日。
  3. 执行清单(五)-1-2里的SQL脚本或Kettle步骤进行定期装载。
        说明:Kettle的修改相对于SQL来说更容易,只需要对上一篇的三个步骤进行修改,这三个步骤分别是“装载过渡表”、“装载客户维度”、“装载事实表”,把新增加的列补充上即可。如图(五)- 1-2到(五)- 1-7显示了变化的步骤。
USE source;/*** 客户数据的改变如下:更新已有八个客户的送货地址新增客户9***/UPDATE customer SET   shipping_address = customer_street_address, shipping_zip_code = customer_zip_code, shipping_city = customer_city, shipping_state = customer_state ;INSERT INTO customer (customer_name, customer_street_address, customer_zip_code, customer_city, customer_state, shipping_address, shipping_zip_code, shipping_city, shipping_state)VALUES ('Online Distributors', '2323 Louise Dr.', 17055, 'Pittsburgh', 'PA', '2323 Louise Dr.', 17055, 'Pittsburgh', 'PA') ;INSERT INTO sales_order VALUES  (38, 1, 1, '2015-03-02', '2015-03-02', 1000, 10), (39, 2, 2, '2015-03-02', '2015-03-02', 2000, 20), (40, 3, 3, '2015-03-02', '2015-03-02', 4000, 40), (41, 4, 4, '2015-03-02', '2015-03-02', 6000, 60), (42, 5, 1, '2015-03-02', '2015-03-02', 2500, 25), (43, 6, 2, '2015-03-02', '2015-03-02', 5000, 50), (44, 7, 3, '2015-03-02', '2015-03-02', 7500, 75), (45, 8, 4, '2015-03-02', '2015-03-02', 1000, 10), (46, 9, 1, '2015-03-02', '2015-03-02', 1000, 10) ;COMMIT ;
清单(五)- 1-3

图(五)- 1-2

图(五)- 1-3

图(五)- 1-4

图(五)- 1-5

图(五)- 1-6

图(五)- 1-7

验证结果应该如下所示:
mysql> select
    ->     customer_number no,
    ->     customer_name name,
    ->     shipping_city,
    ->     shipping_zip_code zip,
    ->     shipping_state st,
    ->     version ver,
    ->     effective_date eff,
    ->     expiry_date exp
    -> from
    ->     customer_dim;
+------+------------------------+---------------+-------+------+-----+------------+------------+
| no   | name                   | shipping_city | zip   | st   | ver | eff        | exp        |
+------+------------------------+---------------+-------+------+-----+------------+------------+
|    1 | Really Large Customers | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-02 |
|    2 | Small Stores           | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-02 |
|    3 | Medium Retailers       | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-02 |
|    4 | Good Companies         | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-02 |
|    5 | Wonderful Shops        | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-02 |
|    6 | Loyal Clients          | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-01 |
|    7 | Distinguished Agencies | NULL          |  NULL | NULL |   1 | 2013-03-01 | 2015-03-02 |
|    6 | Loyal Clients          | NULL          |  NULL | NULL |   2 | 2015-03-01 | 2015-03-02 |
|    8 | Subsidiaries           | NULL          |  NULL | NULL |   1 | 2015-03-01 | 2015-03-02 |
|    1 | Really Large Customers | Mechanicsburg | 17050 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    2 | Small Stores           | Pittsburgh    | 17055 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    3 | Medium Retailers       | Pittsburgh    | 17055 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    4 | Good Companies         | Mechanicsburg | 17050 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    5 | Wonderful Shops        | Mechanicsburg | 17050 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    6 | Loyal Clients          | Pittsburgh    | 17055 | PA   |   3 | 2015-03-02 | 2200-01-01 |
|    7 | Distinguished Agencies | Mechanicsburg | 17050 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    8 | Subsidiaries           | Pittsburgh    | 17055 | PA   |   2 | 2015-03-02 | 2200-01-01 |
|    9 | Online Distributors    | Pittsburgh    | 17055 | PA   |   1 | 2015-03-02 | 2200-01-01 |
+------+------------------------+---------------+-------+------+-----+------------+------------+
18 rows in set (0.00 sec)
已存在客户的新记录有了送货地址。老的(过期)记录没有。9号客户是新加的,具有运输地址。

mysql>  select
    ->     order_sk o_sk,
    ->     customer_sk c_sk,
    ->     product_sk p_sk,
    ->     order_date_sk od_sk,
    ->     order_amount amt,
    ->     order_quantity qty
    -> from
    ->     sales_order_fact;
+------+------+------+-------+---------+------+
| o_sk | c_sk | p_sk | od_sk | amt     | qty  |
+------+------+------+-------+---------+------+
|    1 |    3 |    3 |  4809 | 4000.00 | NULL |
|    2 |    4 |    1 |  4854 | 4000.00 | NULL |
|    3 |    5 |    2 |  4889 | 6000.00 | NULL |
|    4 |    6 |    3 |  4960 | 6000.00 | NULL |
|    5 |    7 |    1 |  4993 | 8000.00 | NULL |
|    6 |    1 |    2 |  5063 | 8000.00 | NULL |
|    7 |    2 |    3 |  5119 | 1000.00 | NULL |
|    8 |    3 |    1 |  5155 | 1000.00 | NULL |
|    9 |    4 |    2 |  5188 | 2000.00 | NULL |
|   10 |    5 |    3 |  5224 | 2500.00 | NULL |
|   11 |    6 |    1 |  5264 | 3000.00 | NULL |
|   12 |    7 |    2 |  5266 | 3500.00 | NULL |
|   13 |    1 |    3 |  5310 | 4000.00 | NULL |
|   14 |    2 |    1 |  5356 | 4500.00 | NULL |
|   15 |    3 |    2 |  5362 | 1000.00 | NULL |
|   16 |    4 |    3 |  5392 | 1000.00 | NULL |
|   17 |    5 |    1 |  5489 | 4000.00 | NULL |
|   18 |    6 |    2 |  5530 | 4000.00 | NULL |
|   19 |    7 |    3 |  5538 | 4000.00 | NULL |
|   20 |    1 |    1 |  5539 | 1000.00 | NULL |
|   21 |    2 |    2 |  5539 | 2000.00 | NULL |
|   22 |    3 |    4 |  5539 | 3000.00 | NULL |
|   23 |    4 |    5 |  5539 | 4000.00 | NULL |
|   24 |    5 |    2 |  5539 | 1000.00 | NULL |
|   25 |    8 |    2 |  5539 | 3000.00 | NULL |
|   26 |    7 |    4 |  5539 | 5000.00 | NULL |
|   27 |    9 |    5 |  5539 | 7000.00 | NULL |
|   28 |    1 |    1 |  5539 | 1000.00 | NULL |
|   29 |    2 |    2 |  5539 | 2000.00 | NULL |
|   30 |    3 |    4 |  5539 | 4000.00 | NULL |
|   31 |    4 |    5 |  5539 | 6000.00 | NULL |
|   32 |    5 |    1 |  5539 | 2500.00 | NULL |
|   33 |    8 |    2 |  5539 | 5000.00 | NULL |
|   34 |    7 |    4 |  5539 | 7500.00 | NULL |
|   35 |    9 |    5 |  5539 | 1000.00 | NULL |
|   36 |   10 |    1 |  5540 | 1000.00 |   10 |
|   37 |   11 |    2 |  5540 | 2000.00 |   20 |
|   38 |   12 |    4 |  5540 | 4000.00 |   40 |
|   39 |   13 |    5 |  5540 | 6000.00 |   60 |
|   40 |   14 |    1 |  5540 | 2500.00 |   25 |
|   41 |   15 |    2 |  5540 | 5000.00 |   50 |
|   42 |   16 |    4 |  5540 | 7500.00 |   75 |
|   43 |   17 |    5 |  5540 | 1000.00 |   10 |
|   44 |   18 |    1 |  5540 | 1000.00 |   10 |
+------+------+------+-------+---------+------+
44 rows in set (0.00 sec)
只有九个订单有数量,老的销售数据没有。
0 0
原创粉丝点击