\[\newcommand{\E}{\mathbb{E}}\]


# construct a volume regressor from the diamond's dimensions
diamond_train$volume <- diamond_train$x*diamond_train$y*diamond_train$z

# candidate models; note that a formula needs I() to include a squared
# term: a bare carat^2 in formula syntax collapses to just carat
prof <- price ~ (carat + as.factor(cut) + as.factor(clarity) + as.factor(color) + depth + table + x + y + z)^2 + I(carat^2) + I(depth^2) + I(table^2) + I(x^2) + I(y^2) + I(z^2)
out_of_state <- price ~ volume + clarity + color + cut + carat + volume*carat + clarity*volume
cobb <- price ~ carat + cut + color + clarity + depth + table
gwinnett <- price ~ carat + clarity
everybody_else <- price ~ I(carat^2) + cut + clarity + depth + x + y
rural <- price ~ I(carat^2) + cut + color + carat*clarity + depth
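
As a quick sanity check on the comment above, `model.matrix()` shows how a formula expands: a bare `carat^2` yields the same design columns as `carat` alone, while `I(carat^2)` actually adds the squared term.

colnames(model.matrix(price ~ carat^2, data=head(diamond_train)))    # "(Intercept)" "carat"
colnames(model.matrix(price ~ I(carat^2), data=head(diamond_train))) # "(Intercept)" "I(carat^2)"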

mod_prof <- lm(prof, data=diamond_train)
mod_out_of_state <- lm(out_of_state, data=diamond_train)
mod_cobb <- lm(cobb, data=diamond_train)
mod_gwinnett <- lm(gwinnett, data=diamond_train)
mod_everybody_else <- lm(everybody_else, data=diamond_train)
mod_rural <- lm(rural, data=diamond_train)

mod_sel <- function(formla) {
  mod <- lm(formla, data=diamond_train)
  smod <- summary(mod)
  r.squared <- smod$r.squared
  adj.r.squared <- smod$adj.r.squared

  n <- nrow(diamond_train)
  uhat <- resid(mod)
  ssr <- sum(uhat^2)
  k <- length(coef(mod))
  # AIC/BIC under normal errors, dropping additive constants
  # that are the same for every model fit to the same data
  aic <- 2*k + n*log(ssr/n)
  bic <- k*log(n) + n*log(ssr/n)

  # show results
  result <- tidyr::tibble(r.squared, adj.r.squared, aic, bic)
  return(result)
}
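
For reference, the criteria computed above are (up to those common constants)

\[
AIC = 2k + n \log\left(\frac{SSR}{n}\right), \qquad BIC = k\log(n) + n \log\left(\frac{SSR}{n}\right)
\]

where $k$ is the number of estimated coefficients and $SSR = \sum_{i=1}^n \hat{u}_i^2$.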

mod_sel(prof)
mod_sel(out_of_state)
mod_sel(cobb)
mod_sel(gwinnett)
mod_sel(everybody_else)
mod_sel(rural)


# cross validation
J <- 10
n <- nrow(diamond_train)
set.seed(1234) # arbitrary seed, just to make the fold assignment reproducible
diamond_train$fold <- sample(1:J, size=n, replace=TRUE)

cv_mod_sel <- function(formla) {
  cv <- 0
  for (j in 1:J) {
    this.train <- subset(diamond_train, fold != j)
    this.test <- subset(diamond_train, fold == j)

    # regression on training data
    cv_reg <- lm(formla, data=this.train)

    # prediction for the held-out fold
    Yhat <- predict(cv_reg, newdata=this.test)
    u <- this.test$price - Yhat
    cv <- cv + mean(u^2)
  }

  # average the fold MSEs, then take the square root so the
  # result is on the same scale as price
  sqrt(cv / J)
}
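
In other words, cv_mod_sel estimates the root mean squared prediction error by averaging the per-fold MSEs:

\[
CV = \sqrt{\frac{1}{J} \sum_{j=1}^{J} \frac{1}{n_j} \sum_{i \in \mathcal{F}_j} \left(Y_i - \hat{Y}_i^{(-j)}\right)^2}
\]

where $\mathcal{F}_j$ collects the $n_j$ observations in fold $j$ and $\hat{Y}_i^{(-j)}$ is the prediction from the model estimated without fold $j$.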

cv_mod_sel(prof)
cv_mod_sel(out_of_state)
cv_mod_sel(cobb)
cv_mod_sel(gwinnett)
cv_mod_sel(everybody_else)
cv_mod_sel(rural)


library(glmnetUtils)
# lasso (alpha=1 is the glmnet default), choosing the penalty by cross validation
lasso <- cv.glmnet(prof, data=diamond_train,
                   use.model.frame = TRUE)
coef(lasso)

# ridge regression corresponds to alpha=0
ridge <- cv.glmnet(prof, data=diamond_train,
                   alpha=0,
                   use.model.frame = TRUE)
coef(ridge)
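
By default, coef() on a cv.glmnet object reports the coefficients at "lambda.1se", the most penalized value within one standard error of the best; passing s = "lambda.min" instead shows the fit at the penalty that minimized the cross-validated error.

coef(lasso, s = "lambda.min")
coef(ridge, s = "lambda.min")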

# Notice: this takes in a fitted model rather than a formula;
# this is just to make it work with our lasso and ridge results too.
# Despite the name, it returns the square root of the out-of-sample MSPE,
# so it is on the same scale as price
oos_mspe <- function(mod) {
  Yhat <- predict(mod, newdata = diamond_test)
  uhat <- diamond_test$price - Yhat
  sqrt(mean(uhat^2))
}
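
This is the test-sample analogue of $\sqrt{\E[(Y - \hat{Y})^2]}$:

\[
\sqrt{\frac{1}{n_{test}} \sum_{i=1}^{n_{test}} \left(Y_i - \hat{Y}_i\right)^2}
\]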

# the test data needs the constructed volume variable too
diamond_test$volume <- diamond_test$x*diamond_test$y*diamond_test$z


oos_mspe(mod_gwinnett)
oos_mspe(mod_everybody_else)
oos_mspe(mod_cobb)
oos_mspe(mod_rural)
oos_mspe(mod_out_of_state)
oos_mspe(mod_prof)
oos_mspe(lasso)
oos_mspe(ridge)
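
Finally, a small convenience (the list names here are just labels) to see all of the out-of-sample results side by side:

models <- list(gwinnett=mod_gwinnett, everybody_else=mod_everybody_else,
               cobb=mod_cobb, rural=mod_rural,
               out_of_state=mod_out_of_state, prof=mod_prof,
               lasso=lasso, ridge=ridge)
sort(sapply(models, oos_mspe))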