多元logit回归参数估计(多分类logit回归预测)
来源:互联网 发布:淘宝购买小号 编辑:程序博客网 时间:2024/04/28 10:01
目录
- 多元离散选择模型简介
- 样例程序
- 样例程序的问题
- 问题解决
多元离散选择模型简介
常用的离散选择模型有logit模型及probit模型,其区别就是假设不可观测量的分布不同。logit模型假设不可观测项服从Gumbel (Extreme Value Type I) Distribution 。多元logit模型是,多元离散选择模型的一种,适合于效用最大化时的分布选择。
如果决策者i在J项可供选择方案中选择了第j项,那么其效用模型为
样例程序
以下是基于R语言的一个程序
#引入所需要的包require(foreign)require(nnet)require(ggplot2)require(reshape2)##读取数据集,这里数据集可以来找csv,mysql等ml <- read.dta("http://www.ats.ucla.edu/stat/data/hsbdemo.dta")head(ml) ## 查看数据的前六行with(ml, table(ses, prog)) ## 以表格的形式统计数目##函数tapply(进行分组统计)with(ml, do.call(rbind, tapply(write, prog,function(x) c(M = mean(x), SD = sd(x)))))## 重新排列因子水平,其中以academic作为基础标准ml$prog2 <- relevel(ml$prog, ref = "academic")ml$prog2##训练模型test <- multinom(prog2 ~ ses + write, data = ml)summary(test)#获取z值z <- summary(test)$coefficients/summary(test)$standard.errorsz#获取p值p <- (1 - pnorm(abs(z), 0, 1))*2p## extract the coefficients from the model and exponentiateexp(coef(test))head(pp <- fitted(test))##测试集上检测dses <- data.frame(ses = c("low", "middle", "high"),write = mean(ml$write))#获取预测的概率predict(test, newdata = dses, "probs")dwrite <- data.frame(ses = rep(c("low", "middle", "high"), each = 41),write = rep(c(30:70), 3))## store the predicted probabilities for each value of ses and writepp.write <- cbind(dwrite, predict(test, newdata = dwrite, type = "probs", se = TRUE))## calculate the mean probabilities within each level of sesby(pp.write[, 3:5], pp.write$ses, colMeans)## melt data set to long for ggplot2lpp <- melt(pp.write, id.vars = c("ses", "write"), value.name = "probability")head(lpp) # view first few rows## plot predicted probabilities across write values for## each level of ses facetted by program typeggplot(lpp, aes(x = write, y = probability, colour = ses)) + geom_line() + facet_grid(variable ~ ., scales="free")
样例程序的问题
但是,在使用上面的样例,如下,当X的维数较大时,类别较多时,就会出现问题,其主要问题是迭代时权重的问题,为此,我们需要修改源码。
test <- multinom(trainset$car_id2 ~ ., data = trainset[,-c(1,13,14)])
以下是容易出现的问题:
Error in nnet.default(X, Y, w, mask = mask, size = 0, skip = TRUE, softmax = TRUE, : too many (11745) weights
问题解决
针对问题too many (11745) weights,这类问题,解决的办法是修改源码。即新写一个方法。如下,我修改的部分是这里面的
MaxNWts=12000 ##将程序中的MaxNWts变大就行了
##估计参数的函数nnetfunction<-function (formula, data, weights, subset, na.action, contrasts = NULL, Hess = FALSE, summ = 0, censored = FALSE, model = FALSE, ...) { class.ind <- function(cl) { n <- length(cl) x <- matrix(0, n, length(levels(cl))) x[(1L:n) + n * (as.integer(cl) - 1L)] <- 1 dimnames(x) <- list(names(cl), levels(cl)) x } summ2 <- function(X, Y) { X <- as.matrix(X) Y <- as.matrix(Y) n <- nrow(X) p <- ncol(X) q <- ncol(Y) Z <- t(cbind(X, Y)) storage.mode(Z) <- "double" z <- .C(VR_summ2, as.integer(n), as.integer(p), as.integer(q), Z = Z, na = integer(1L)) Za <- t(z$Z[, 1L:z$na, drop = FALSE]) list(X = Za[, 1L:p, drop = FALSE], Y = Za[, p + 1L:q]) } call <- match.call() m <- match.call(expand.dots = FALSE) m$summ <- m$Hess <- m$contrasts <- m$censored <- m$model <- m$... <- NULL m[[1L]] <- quote(stats::model.frame) m <- eval.parent(m) Terms <- attr(m, "terms") X <- model.matrix(Terms, m, contrasts) cons <- attr(X, "contrasts") Xr <- qr(X)$rank Y <- model.response(m) if (!is.matrix(Y)) Y <- as.factor(Y) w <- model.weights(m) if (length(w) == 0L) if (is.matrix(Y)) w <- rep(1, dim(Y)[1L]) else w <- rep(1, length(Y)) lev <- levels(Y) if (is.factor(Y)) { counts <- table(Y) if (any(counts == 0L)) { empty <- lev[counts == 0L] warning(sprintf(ngettext(length(empty), "group %s is empty", "groups %s are empty"), paste(sQuote(empty), collapse = " ")), domain = NA) Y <- factor(Y, levels = lev[counts > 0L]) lev <- lev[counts > 0L] } if (length(lev) < 2L) stop("need two or more classes to fit a multinom model") if (length(lev) == 2L) Y <- as.integer(Y) - 1 else Y <- class.ind(Y) } if (summ == 1) { Z <- cbind(X, Y) z1 <- cumprod(apply(Z, 2L, max) + 1) Z1 <- apply(Z, 1L, function(x) sum(z1 * x)) oZ <- order(Z1) Z2 <- !duplicated(Z1[oZ]) oX <- (seq_along(Z1)[oZ])[Z2] X <- X[oX, , drop = FALSE] Y <- if (is.matrix(Y)) Y[oX, , drop = FALSE] else Y[oX] w <- diff(c(0, cumsum(w))[c(Z2, TRUE)]) print(dim(X)) } if (summ == 2) { Z <- summ2(cbind(X, Y), w) X <- Z$X[, 1L:ncol(X)] Y <- Z$X[, ncol(X) + 1L:ncol(Y), drop = FALSE] w <- Z$Y print(dim(X)) } if (summ == 3) { Z <- summ2(X, Y * w) X <- Z$X Y <- Z$Y[, 1L:ncol(Y), drop = FALSE] w <- rep(1, nrow(X)) print(dim(X)) } offset <- model.offset(m) r <- ncol(X) if (is.matrix(Y)) { p <- ncol(Y) sY <- Y %*% rep(1, p) if (any(sY == 0)) stop("some case has no observations") if (!censored) { Y <- Y/matrix(sY, nrow(Y), p) w <- w * sY } if (length(offset) > 1L) { if (ncol(offset) != p) stop("ncol(offset) is wrong") mask <- c(rep(FALSE, r + 1L + p), rep(c(FALSE, rep(TRUE, r), rep(FALSE, p)), p - 1L)) X <- cbind(X, offset) Wts <- as.vector(rbind(matrix(0, r + 1L, p), diag(p))) fit <- nnet(X, Y, w, Wts = Wts, mask = mask, size = 0, skip = TRUE, softmax = TRUE, censored = censored, rang = 0,MaxNWts=12000, ...) } else { mask <- c(rep(FALSE, r + 1L), rep(c(FALSE, rep(TRUE, r)), p - 1L)) fit <- nnet(X, Y, w, mask = mask, size = 0, skip = TRUE, softmax = TRUE, censored = censored, rang = 0,MaxNWts=12000, ...) } } else { if (length(offset) <= 1L) { mask <- c(FALSE, rep(TRUE, r)) fit <- nnet(X, Y, w, mask = mask, size = 0, skip = TRUE, entropy = TRUE, rang = 0,MaxNWts=12000, ...) } else { mask <- c(FALSE, rep(TRUE, r), FALSE) Wts <- c(rep(0, r + 1L), 1) X <- cbind(X, offset) fit <- nnet(X, Y, w, Wts = Wts, mask = mask, size = 0, skip = TRUE, entropy = TRUE, rang = 0, MaxNWts=12000, ...) } } fit$formula <- attr(Terms, "formula") fit$terms <- Terms fit$call <- call fit$weights <- w fit$lev <- lev fit$deviance <- 2 * fit$value fit$rank <- Xr edf <- ifelse(length(lev) == 2L, 1, length(lev) - 1) * Xr if (is.matrix(Y)) { edf <- (ncol(Y) - 1) * Xr if (length(dn <- colnames(Y)) > 0) fit$lab <- dn else fit$lab <- 1L:ncol(Y) } fit$coefnames <- colnames(X) fit$vcoefnames <- fit$coefnames[1L:r] fit$na.action <- attr(m, "na.action") fit$contrasts <- cons fit$xlevels <- .getXlevels(Terms, m) fit$edf <- edf fit$AIC <- fit$deviance + 2 * edf if (model) fit$model <- m class(fit) <- c("multinom", "nnet") if (Hess) fit$Hessian <- multinomHess(fit, X) fit}
0 0
- 多元logit回归参数估计(多分类logit回归预测)
- 逻辑回归(Logit Regression)原理与实现
- logit
- logistic回归的一些直观理解(1.连接函数 logit probit)
- 多元线性回归的预测
- 【matlab 多元回归】matlab数值预测--多元回归算法
- probit logit
- (十二)多元回归
- 多元线性回归分析预测法概述
- Machine Learning第三讲[Logistic回归] --(三)多元分类
- 多元回归
- Python_多元回归(一元回归)
- 回归:多元线性回归
- 多元线性回归(上)
- 多元线性回归(下)
- multinomial logit model 多项 Logit 模型
- 多元线性回归 ——模型、估计、检验与预测
- 用多元线性回归预测网页访问量(R语言)
- [模板]高斯消元,异或方程组
- git分支管理
- JNDI学习总结(二)——Tomcat下使用C3P0配置JNDI数据源
- Codeforces Round #368 (Div. 2)C Pythagorean Triples
- 批处理文件批量修改文件名
- 多元logit回归参数估计(多分类logit回归预测)
- JNDI学习总结(三)——Tomcat下使用Druid配置JNDI数据源
- #206 Interval Sum
- linux查看所有用户 以及 用户(组)管理命令
- LeetCode 84. Largest Rectangle in Histogram
- ubuntu git server
- Java基础-数组的内存分配
- 做一个App前需要考虑的几件事
- shell 中的 i++