load("data/diamond_train.RData")
# formulas
formla1 <- price ~ carat + as.factor(cut) + as.factor(clarity)
formla2 <- price ~ carat + as.factor(cut) + as.factor(clarity) + depth + table + x + y + x
formla3 <- price ~ (carat + as.factor(cut) + as.factor(clarity) + depth + table + x + y + x)^2
# estimate each model
mod1 <- lm(formla1, data=diamond_train)
mod2 <- lm(formla2, data=diamond_train)
mod3 <- lm(formla3, data=diamond_train)
mod_sel <- function(formla) {
mod <- lm(formla, data=diamond_train)
r.squared <- summary(mod)$r.squared
adj.r.squared <- summary(mod)$adj.r.squared
n <- nrow(diamond_train)
uhat <- resid(mod)
ssr <- sum(uhat^2)
k <- length(coef(mod))
aic <- 2*k + n*log(ssr)
bic <- k*log(n) + n*log(ssr)
# show results
result <- tidyr::tibble(r.squared, adj.r.squared, aic, bic)
return(result)
}
res1 <- mod_sel(formla1)
res2 <- mod_sel(formla2)
res3 <- mod_sel(formla3)
round(rbind.data.frame(res1, res2, res3),3)
#> # A tibble: 3 × 4
#> r.squared adj.r.squared aic bic
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.898 0.898 1104188. 1104301.
#> 2 0.9 0.9 1103499. 1103647.
#> 3 0.928 0.928 1089372. 1090328.
# k-fold cross validation
# setup data
k <- 10
n <- nrow(diamond_train)
diamond_train$fold <- sample(1:k, size=n, replace=TRUE)
diamond_train$id <- 1:n
cv_mod_sel <- function(formla) {
u.squared <- rep(NA, n)
for (i in 1:k) {
this.train <- subset(diamond_train, fold != i)
this.test <- subset(diamond_train, fold == i)
this.id <- this.test$id
cv_reg <- lm(formla,
data=this.train)
pred <- predict(cv_reg, newdata=this.test)
u <- this.test$price - pred
u.squared[this.id] <- u^2
}
cv <- sqrt(mean(u.squared))
return(cv)
}
cv_res1 <- cv_mod_sel(formla1)
cv_res2 <- cv_mod_sel(formla2)
cv_res3 <- cv_mod_sel(formla3)
res1 <- cbind.data.frame(res1, cv=cv_res1)
res2 <- cbind.data.frame(res2, cv=cv_res2)
res3 <- cbind.data.frame(res3, cv=cv_res3)
round(rbind.data.frame(res1, res2, res3),3)
#> r.squared adj.r.squared aic bic cv
#> 1 0.898 0.898 1104188 1104301 1366.021
#> 2 0.900 0.900 1103499 1103647 1397.104
#> 3 0.928 0.928 1089372 1090328 2247.940
# lasso and ridge
library(glmnet)
library(glmnetUtils)
lasso_res <- cv.glmnet(formla3, data=diamond_train, alpha=1)
ridge_res <- cv.glmnet(formla3, data=diamond_train, alpha=0)
# out of sample predictions
load("data/diamond_test.RData")
# compute prediction errors with test data
u1 <- diamond_test$price - predict(mod1, newdata=diamond_test)
u2 <- diamond_test$price - predict(mod2, newdata=diamond_test)
u3 <- diamond_test$price - predict(mod3, newdata=diamond_test)
u_lasso <- diamond_test$price - predict(lasso_res, newdata=diamond_test)
u_ridge <- diamond_test$price - predict(ridge_res, newdata=diamond_test)
# compute root mean squared prediction errors
rmspe1 <- sqrt(mean(u1^2))
rmspe2 <- sqrt(mean(u2^2))
rmspe3 <- sqrt(mean(u3^2))
rmspe_lasso <- sqrt(mean(u_lasso^2))
rmspe_ridge <- sqrt(mean(u_ridge^2))
# report results
rmspe <- data.frame(mod1=rmspe1, mod2=rmspe2, mod3=rmspe3, lasso=rmspe_lasso, ridge=rmspe_ridge)
round(rmspe,3)
#> mod1 mod2 mod3 lasso ridge
#> 1 856.014 785.171 616.996 683.36 794.276