\[\newcommand{\E}{\mathbb{E}}\]

Code

prof <- price ~ (carat + as.factor(cut) + as.factor(clarity) + as.factor(color) + depth + table + x + y + z)^2 + I(carat^2) + I(depth^2) + I(table^2) + I(x^2) + I(y^2) + I(z^2)
tennis <- price~ I(carat^3) + carat + depth + color + clarity + cut
+ I(color*clarity) +x +y +z
other <- price~ carat + cut + color + clarity +depth
golf <- price ~ (carat + as.factor(cut) + as.factor(clarity) + as.factor(color) + depth + table)
basketball <- price ~ carat + x + y + z

mod_prof <- lm(prof, data=diamond_train)
mod_tennis <- lm(tennis, data=diamond_train)
mod_other <- lm(other, data=diamond_train)
mod_golf <- lm(golf, data=diamond_train)
mod_basketball <- lm(basketball, data=diamond_train)

mod_sel <- function(formla) {
  mod <- lm(formla, data=diamond_train)
  r.squared <- summary(mod)$r.squared
  adj.r.squared <- summary(mod)$adj.r.squared
 
  n <- nrow(diamond_train)
  uhat <- resid(mod)
  ssr <- sum(uhat^2)
  k <- length(coef(mod))
  aic <- 2*k + n*log(ssr)
  bic <- k*log(n) + n*log(ssr)
 
  # show results
  result <- data.frame(r.squared, adj.r.squared, aic, bic)
  return(result)
}

mod_sel(prof)
mod_sel(golf)
mod_sel(other)
mod_sel(tennis)
mod_sel(basketball)


# cross validation
J <- 10
n <- nrow(diamond_train)
diamond_train$fold <- sample(1:J, size=n, replace=TRUE)

cv_mod_sel <- function(formla) {
  cv <- 0
  for (j in 1:J) {
    this.train <- subset(diamond_train, fold != j)
    this.test <- subset(diamond_train, fold == j)
   
    # regression on training data
    cv_reg <- lm(formla, data=this.train)
   
    # prediction for testing data
    Yhat <- predict(cv_reg, newdata=this.test)
    u <- this.test$price - Yhat
    cv <- cv + mean(u^2)
  }
 
  sqrt(cv)
}

cv_mod_sel(prof)
cv_mod_sel(golf)
cv_mod_sel(other)
cv_mod_sel(tennis)
cv_mod_sel(basketball)


# lasso and ridge
library(glmnetUtils)
lasso <- cv.glmnet(prof, data=diamond_train,
                   use.model.frame = TRUE)
coef(lasso)

ridge <- cv.glmnet(prof, data=diamond_train,
                   alpha=0,
                   use.model.frame = TRUE)
coef(ridge)


# out of sample prediction error
# Notice: this takes in a model rather than a formula
# this is just to make it work with our lasso and ridge results too
oos_mspe <- function(mod) {
  Yhat <- predict(mod, newdata = diamond_test)
  uhat <- diamond_test$price - Yhat
  sqrt(mean(uhat^2))
}


oos_mspe(mod_prof)
oos_mspe(mod_golf)
oos_mspe(mod_other)
oos_mspe(mod_tennis)
oos_mspe(mod_basketball)

oos_mspe(lasso)
oos_mspe(ridge)