# construct a volume variable from the diamond's physical dimensions
diamond_train$volume <- diamond_train$x*diamond_train$y*diamond_train$z
# candidate specifications
prof <- price ~ (carat + as.factor(cut) + as.factor(clarity) + as.factor(color) + depth + table + x + y + z)^2 + I(carat^2) + I(depth^2) + I(table^2) + I(x^2) + I(y^2) + I(z^2)
out_of_state <- price ~ volume + clarity + color + cut + carat + volume*carat + clarity*volume
cobb <- price ~ carat + cut + color + clarity + depth + table
gwinnett <- price ~ carat + clarity
# note: inside a formula, carat^2 expands as an interaction and collapses back to
# carat, so the squared term must be wrapped in I(); likewise cut^2 is just cut
everybody_else <- price ~ I(carat^2) + cut + clarity + depth + x + y
rural <- price ~ I(carat^2) + cut + color + carat*clarity + depth
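To see the formula gotcha directly, compare the design-matrix columns the two versions generate:

# carat^2 collapses to carat; I(carat^2) produces the intended squared term
colnames(model.matrix(price ~ carat^2, data=diamond_train))
colnames(model.matrix(price ~ I(carat^2), data=diamond_train))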
# fit each candidate model by OLS on the training data
mod_prof <- lm(prof, data=diamond_train)
mod_out_of_state <- lm(out_of_state, data=diamond_train)
mod_cobb <- lm(cobb, data=diamond_train)
mod_gwinnett <- lm(gwinnett, data=diamond_train)
mod_everybody_else <- lm(everybody_else, data=diamond_train)
mod_rural <- lm(rural, data=diamond_train)
# compute in-sample model selection criteria for a given formula
mod_sel <- function(formla) {
  mod <- lm(formla, data=diamond_train)
  r.squared <- summary(mod)$r.squared
  adj.r.squared <- summary(mod)$adj.r.squared
  n <- nrow(diamond_train)
  uhat <- resid(mod)
  ssr <- sum(uhat^2)
  k <- length(coef(mod))
  # AIC/BIC up to additive constants that are the same for every model
  # fit to this data, so only relative comparisons are meaningful
  aic <- 2*k + n*log(ssr)
  bic <- k*log(n) + n*log(ssr)
  # show results
  result <- tidyr::tibble(r.squared, adj.r.squared, aic, bic)
  return(result)
}
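As a sanity check, R's built-in AIC() and BIC() should rank the models the same way, since the formulas above differ from the likelihood-based definitions only by constants that are common across models fit to the same data:

# built-in criteria; values differ from ours by a constant, but the ranking should not
AIC(mod_prof, mod_out_of_state, mod_cobb, mod_gwinnett, mod_everybody_else, mod_rural)
BIC(mod_prof, mod_out_of_state, mod_cobb, mod_gwinnett, mod_everybody_else, mod_rural)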
# in-sample criteria for each candidate model
mod_sel(prof)
mod_sel(out_of_state)
mod_sel(cobb)
mod_sel(gwinnett)
mod_sel(everybody_else)
mod_sel(rural)
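The six tibbles can also be stacked into a single table for easier side-by-side reading; this sketch assumes dplyr is available (only tidyr is used above):

# one row per model; .id labels each row with the list name
dplyr::bind_rows(list(prof=mod_sel(prof), out_of_state=mod_sel(out_of_state),
                      cobb=mod_sel(cobb), gwinnett=mod_sel(gwinnett),
                      everybody_else=mod_sel(everybody_else), rural=mod_sel(rural)),
                 .id="model")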
# cross validation
# randomly assign each training observation to one of J folds;
# a set.seed call here would make the assignment reproducible
J <- 10
n <- nrow(diamond_train)
diamond_train$fold <- sample(1:J, size=n, replace=TRUE)
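Sampling with replacement gives folds of only roughly equal size. An exactly balanced assignment is a minor variant (balanced_fold is just an illustrative name, not used below):

# fold sizes differ by at most one under this scheme
balanced_fold <- sample(rep(1:J, length.out = n))
table(balanced_fold)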
cv_mod_sel <- function(formla) {
  cv <- 0
  for (j in 1:J) {
    this.train <- subset(diamond_train, fold != j)
    this.test <- subset(diamond_train, fold == j)
    # regression on training data
    cv_reg <- lm(formla, data=this.train)
    # prediction for testing data
    Yhat <- predict(cv_reg, newdata=this.test)
    u <- this.test$price - Yhat
    cv <- cv + mean(u^2)
  }
  # average the per-fold MSEs and take the square root,
  # so the result is on the same scale as price
  sqrt(cv/J)
}
cv_mod_sel(prof)
cv_mod_sel(out_of_state)
cv_mod_sel(cobb)
cv_mod_sel(gwinnett)
cv_mod_sel(everybody_else)
cv_mod_sel(rural)
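The same comparisons can be collected into one named vector, which is easier to scan (the names are just labels for the formulas defined above):

sapply(list(prof=prof, out_of_state=out_of_state, cobb=cobb,
            gwinnett=gwinnett, everybody_else=everybody_else, rural=rural),
       cv_mod_sel)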
library(glmnetUtils)
# lasso (glmnet's default, alpha=1); cv.glmnet chooses the penalty by cross validation
lasso <- cv.glmnet(prof, data=diamond_train,
                   use.model.frame = TRUE)
coef(lasso)
# ridge regression corresponds to alpha=0
ridge <- cv.glmnet(prof, data=diamond_train,
                   alpha=0,
                   use.model.frame = TRUE)
coef(ridge)
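By default, coef() on a cv.glmnet fit reports the coefficients at lambda.1se, the largest penalty within one standard error of the cross-validation minimum; passing s="lambda.min" gives the fit that minimized CV error instead:

# coefficients at the CV-minimizing penalty rather than the default lambda.1se
coef(lasso, s = "lambda.min")
coef(ridge, s = "lambda.min")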
# Notice: this takes in a fitted model rather than a formula,
# so that it also works with the lasso and ridge results
oos_mspe <- function(mod) {
  Yhat <- predict(mod, newdata = diamond_test)
  uhat <- diamond_test$price - Yhat
  # root mean squared prediction error in the test data
  sqrt(mean(uhat^2))
}
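For the penalized fits, predict() evaluates at the default penalty (lambda.1se for cv.glmnet objects). A hypothetical variant, assuming glmnetUtils forwards s to glmnet's predict method, checks the lambda.min fit instead (only meaningful for the lasso and ridge objects):

oos_mspe_min <- function(mod) {
  # same as oos_mspe, but at the CV-minimizing penalty
  Yhat <- predict(mod, newdata = diamond_test, s = "lambda.min")
  sqrt(mean((diamond_test$price - Yhat)^2))
}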
# construct volume in the test data to match the training transformation
diamond_test$volume <- diamond_test$x*diamond_test$y*diamond_test$z
# out-of-sample performance of each model
oos_mspe(mod_gwinnett)
oos_mspe(mod_everybody_else)
oos_mspe(mod_cobb)
oos_mspe(mod_rural)
oos_mspe(mod_out_of_state)
oos_mspe(mod_prof)
oos_mspe(lasso)
oos_mspe(ridge)
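As with the cross-validation results, these can be gathered into a single named vector for a side-by-side comparison:

sapply(list(gwinnett=mod_gwinnett, everybody_else=mod_everybody_else,
            cobb=mod_cobb, rural=mod_rural, out_of_state=mod_out_of_state,
            prof=mod_prof, lasso=lasso, ridge=ridge),
       oos_mspe)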