$\newcommand{\E}{\mathbb{E}}$

# Code

prof <- price ~ (carat + as.factor(cut) + as.factor(clarity) + as.factor(color) + depth + table + x + y + z)^2 + I(carat^2) + I(depth^2) + I(table^2) + I(x^2) + I(y^2) + I(z^2)
tennis <- price~ I(carat^3) + carat + depth + color + clarity + cut
+ I(color*clarity) +x +y +z
other <- price~ carat + cut + color + clarity +depth
golf <- price ~ (carat + as.factor(cut) + as.factor(clarity) + as.factor(color) + depth + table)
basketball <- price ~ carat + x + y + z

mod_prof <- lm(prof, data=diamond_train)
mod_tennis <- lm(tennis, data=diamond_train)
mod_other <- lm(other, data=diamond_train)
mod_golf <- lm(golf, data=diamond_train)

mod_sel <- function(formla) {
mod <- lm(formla, data=diamond_train)
r.squared <- summary(mod)$r.squared adj.r.squared <- summary(mod)$adj.r.squared

n <- nrow(diamond_train)
uhat <- resid(mod)
ssr <- sum(uhat^2)
k <- length(coef(mod))
aic <- 2*k + n*log(ssr)
bic <- k*log(n) + n*log(ssr)

# show results
result <- data.frame(r.squared, adj.r.squared, aic, bic)
return(result)
}

mod_sel(prof)
mod_sel(golf)
mod_sel(other)
mod_sel(tennis)

# cross validation
J <- 10
n <- nrow(diamond_train)
diamond_train$fold <- sample(1:J, size=n, replace=TRUE) cv_mod_sel <- function(formla) { cv <- 0 for (j in 1:J) { this.train <- subset(diamond_train, fold != j) this.test <- subset(diamond_train, fold == j) # regression on training data cv_reg <- lm(formla, data=this.train) # prediction for testing data Yhat <- predict(cv_reg, newdata=this.test) u <- this.test$price - Yhat
cv <- cv + mean(u^2)
}

sqrt(cv)
}

cv_mod_sel(prof)
cv_mod_sel(golf)
cv_mod_sel(other)
cv_mod_sel(tennis)

# lasso and ridge
library(glmnetUtils)
lasso <- cv.glmnet(prof, data=diamond_train,
use.model.frame = TRUE)
coef(lasso)

ridge <- cv.glmnet(prof, data=diamond_train,
alpha=0,
use.model.frame = TRUE)
coef(ridge)

# out of sample prediction error
# Notice: this takes in a model rather than a formula
# this is just to make it work with our lasso and ridge results too
oos_mspe <- function(mod) {
Yhat <- predict(mod, newdata = diamond_test)
uhat <- diamond_test\$price - Yhat
sqrt(mean(uhat^2))
}

oos_mspe(mod_prof)
oos_mspe(mod_golf)
oos_mspe(mod_other)
oos_mspe(mod_tennis)