6.5 Lab 5: Solutions

load("data/diamond_train.RData")

# formulas
formla1 <- price ~ carat + as.factor(cut) + as.factor(clarity)
formla2 <- price ~ carat + as.factor(cut) + as.factor(clarity) + depth + table + x + y + x
formla3 <- price ~ (carat + as.factor(cut) + as.factor(clarity) + depth + table + x + y + x)^2

# estimate each model
mod1 <- lm(formla1, data=diamond_train)
mod2 <- lm(formla2, data=diamond_train)
mod3 <- lm(formla3, data=diamond_train)

mod_sel <- function(formla) {
  mod <- lm(formla, data=diamond_train)
  r.squared <- summary(mod)$r.squared
  adj.r.squared <- summary(mod)$adj.r.squared

  n <- nrow(diamond_train)
  uhat <- resid(mod)
  ssr <- sum(uhat^2)
  k <- length(coef(mod))
  aic <- 2*k + n*log(ssr)
  bic <- k*log(n) + n*log(ssr)

  # show results
  result <- tidyr::tibble(r.squared, adj.r.squared, aic, bic)
  return(result)
}

res1 <- mod_sel(formla1)
res2 <- mod_sel(formla2)
res3 <- mod_sel(formla3)

round(rbind.data.frame(res1, res2, res3),3)
#> # A tibble: 3 × 4
#>   r.squared adj.r.squared      aic      bic
#>       <dbl>         <dbl>    <dbl>    <dbl>
#> 1     0.898         0.898 1104188. 1104301.
#> 2     0.9           0.9   1103499. 1103647.
#> 3     0.928         0.928 1089372. 1090328.

# k-fold cross validation

# setup data
k <- 10
n <- nrow(diamond_train)
diamond_train$fold <- sample(1:k, size=n, replace=TRUE)
diamond_train$id <- 1:n

cv_mod_sel <- function(formla) {
  u.squared <- rep(NA, n)
  for (i in 1:k) {
    this.train <- subset(diamond_train, fold != i)
    this.test <- subset(diamond_train, fold == i)
    this.id <- this.test$id
    cv_reg <- lm(formla,
              data=this.train)
    pred <- predict(cv_reg, newdata=this.test)
    u <- this.test$price - pred
    u.squared[this.id] <- u^2
  }
  cv <- sqrt(mean(u.squared))
  return(cv)
}

cv_res1 <- cv_mod_sel(formla1)
cv_res2 <- cv_mod_sel(formla2)
cv_res3 <- cv_mod_sel(formla3)

res1 <- cbind.data.frame(res1, cv=cv_res1)
res2 <- cbind.data.frame(res2, cv=cv_res2)
res3 <- cbind.data.frame(res3, cv=cv_res3)

round(rbind.data.frame(res1, res2, res3),3)
#>   r.squared adj.r.squared     aic     bic       cv
#> 1     0.898         0.898 1104188 1104301 1366.021
#> 2     0.900         0.900 1103499 1103647 1397.104
#> 3     0.928         0.928 1089372 1090328 2247.940

# lasso and ridge
library(glmnet)
library(glmnetUtils)

lasso_res <- cv.glmnet(formla3, data=diamond_train, alpha=1)
ridge_res <- cv.glmnet(formla3, data=diamond_train, alpha=0)

# out of sample predictions

load("data/diamond_test.RData")

# compute prediction errors with test data
u1 <- diamond_test$price - predict(mod1, newdata=diamond_test)
u2 <- diamond_test$price - predict(mod2, newdata=diamond_test)
u3 <- diamond_test$price - predict(mod3, newdata=diamond_test)
u_lasso <- diamond_test$price - predict(lasso_res, newdata=diamond_test)
u_ridge <- diamond_test$price - predict(ridge_res, newdata=diamond_test)

# compute root mean squared prediction errors
rmspe1 <- sqrt(mean(u1^2))
rmspe2 <- sqrt(mean(u2^2))
rmspe3 <- sqrt(mean(u3^2))
rmspe_lasso <- sqrt(mean(u_lasso^2))
rmspe_ridge <- sqrt(mean(u_ridge^2))

# report results
rmspe <- data.frame(mod1=rmspe1, mod2=rmspe2, mod3=rmspe3, lasso=rmspe_lasso, ridge=rmspe_ridge)
round(rmspe,3)
#>      mod1    mod2    mod3  lasso   ridge
#> 1 856.014 785.171 616.996 683.36 794.276