lianxi

来源:互联网 发布:制作电子文档软件 编辑:程序博客网 时间:2024/05/22 06:13


#Dodger促销和上座率的预测模型library(car) # 线性回归的包library(lattice) # 绘图软件包# 读入数据,并建立数据框导入dodgers <- read.csv("/Users/lily/Documents/model_predict/Chapter_2/dodgers.csv")print(str(dodgers)) # 查看数据结构
'data.frame': 81 obs. of  12 variables:
 $ month      : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ day        : int  10 11 12 13 14 15 23 24 25 27 ...
 $ attend     : int  56000 29729 28328 31601 46549 38359 26376 44014 26345 44807 ...
 $ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7 1 ...
 $ opponent   : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11 3 3 3 10 ...
 $ temp       : int  67 58 57 54 57 65 60 63 64 66 ...
 $ skies      : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1 ...
 $ day_night  : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ...
 $ cap        : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
 $ shirt      : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
 $ fireworks  : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ...
 $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
NULL

# 定义一周七天的次序变量,以便绘图和数据小结时用dodgers$ordered_day_of_week <- with(data = dodgers,                                    ifelse((day_of_week == 'Monday'), 1,                                           ifelse((day_of_week == 'Tuesday'), 2,                                                  ifelse((day_of_week == 'Wednesday'), 3,                                                         ifelse((day_of_week == 'Thursday'), 4,                                                                ifelse((day_of_week == 'Friday'), 5,                                                                       ifelse((day_of_week == 'Saturday'), 6, 7)))))))dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, levels = 1:7,                                      labels = c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"))#以标准绘图观测数据分析,一周七天的观众人数par(family='STKaiti')with(data = dodgers, plot(ordered_day_of_week, attend/1000,xlab = "一周内的每一天",                          ylab = "出席率(单位:1000)", col = "violet", las = 1))#当dodger采用摇头娃娃促销时with(dodgers, table(bobblehead, ordered_day_of_week)) # 星期二的摇头娃娃促销
          ordered_day_of_week
bobblehead Mon Tue Wed Thur Fri Sat Sun
       NO   12   7  12    3  13  11  12
       YES   0   6   0    2   0   2   1

#定义月次序变量,绘制数据小结时用dodgers$ordered_month <- with(data = dodgers,                              ifelse((month == 'APR'), 4,                                      ifelse((month == 'MAY'), 5,                                             ifelse((month == 'JUN'), 6,                                                   ifelse((month == 'JUL'), 7,                                                          ifelse((month == 'AUG'), 8,                                                                  ifelse((month == 'SEP'), 9, 10)))))))dodgers$ordered_month <- factor(dodgers$ordered_month, levels = 4:10,                                 labels = c("April", "MAY", "June", "July", "Aug", "Sept", "Oct"))#每个月的观众人数par(family = 'STKaiti')with(dodgers, plot(ordered_month, attend/1000, xlab = "月份", ylab = "出席率(单位:1000人)",                   col = "light blue", las = 1))

#使用更多的变量来观测数据分析,观察观众人数与日场/夜场#天空是否晴朗、气温,或者是否燃放烟火的关系group.labels <- c("No Fireworks", "Fireworks")group.symbols <- c(21, 24)group.colors <- c("black", "black")group.fill <- c("black", "red")xyplot(attend/1000 ~ temp | skies + day_night,       data = dodgers,       groups = fireworks,       pch = group.symbols,       aspect = 0.5, cex = 1.5, col = group.colors, fill = group.fill,       layout = c(2, 2), type = c("p", "g"),       strip = strip.custom(strip.levels = TRUE, strip.names = FALSE, style = 1),       xlab = "Temperature (Degrees Fahrenheit)", ylab = "Attendance(thousands)",       key = list(space = "top",                  text = list(rev(group.labels), col = rev(group.colors)),                  points = list(pch = rev(group.symbols), col = rev(group.colors),                  fill = rev(group.fill))))# 观众人数与比赛对手,日场、夜场的关系group.labels <- c("Day", "Night")group.symbols <- c(1, 20)group.symbols.size <- c(2, 2.75)bwplot(opponent ~ attend/1000, data = dodgers, groups = day_night,       xlab = "Attend(thousands)", aspect = 2,       panel = function(x, y, groups, subscripts,...)         {         panel.grid(h = (length(levels(dodgers$opponent)) -1), v = -1)         panel.stripplot(x, y, groups = groups, subscripts = subscripts, cex = group.symbols.size,                          pch = group.symbols, col = "darkblue")       },       key = list(space = "top",                  text = list(group.labels, col = "black"),                  points = list(pch = group.symbols, cex = group.symbols.size, col = "darkblue"))       )#采用训练并测试的方案set.seed(1234)training_test <- c(rep(1, length = trunc((2/3) * nrow(dodgers))),rep(2, length = (nrow(dodgers) - trunc((2/3) * nrow(dodgers)))))dodgers$training_test <- sample(training_test) # 随机排列dodgers$training_test <- factor(dodgers$training_test, levels = c(1, 2), labels = c("TRAIN", "TEST"))dodgers.train <- subset(dodgers, training_test == 'TRAIN')print(str(dodgers.train)) # 查看数据框属性dodgers.test <- subset(dodgers, training_test == 'TEST')print(str(dodgers.test))