# =======================================================
# Cross-Validation for Linear Regression
# =======================================================

# Load the dataset and convert to a data frame
library(MASS)
boston <- data.frame(Boston)
boston$chas <- as.factor(boston$chas)   # qualitative predictor
boston$rad  <- as.factor(boston$rad)    # qualitative predictor
str(boston)

# Target Linear Regression Problem
plot(boston$lstat, boston$medv, col = "grey", pch = 21,
     xlab = "lstat", ylab = "medv")

# Validation Set Approach
# Training Set : Validation Set = 1 : 1 (random split)
set.seed(1)
train <- sample(nrow(boston), 0.5 * nrow(boston), replace = FALSE)
points(boston[train, ]$lstat, boston[train, ]$medv, col = "turquoise", pch = 20)

# Fit a linear model on the Training Data only
glmFit <- glm(medv ~ lstat, data = boston, subset = train)
abline(coef(glmFit), lwd = 3, col = "steelblue")
errors <- boston$medv - predict(glmFit, boston)

# Compare the two MSEs
mean(errors[train]^2)    # Training MSE
mean(errors[-train]^2)   # Validation MSE

# Compute Training and Validation MSE over models of increasing complexity
# (polynomial degree 1..nModels), repeated over nTrials random splits
nModels <- 7
nTrials <- 5
trnMSE <- matrix(data = NA, nrow = nTrials, ncol = nModels)
valMSE <- matrix(data = NA, nrow = nTrials, ncol = nModels)
for (t in 1:nTrials) {
  set.seed(t)
  train <- sample(nrow(boston), 0.5 * nrow(boston), replace = FALSE)
  for (m in 1:nModels) {
    glmFit <- glm(medv ~ poly(lstat, m), data = boston, subset = train)
    errors <- boston$medv - predict(glmFit, boston)
    trnMSE[t, m] <- mean(errors[train]^2)
    valMSE[t, m] <- mean(errors[-train]^2)
  }
}

# Plot the average Training and Validation MSE for the different models
plot(1:nModels, colMeans(trnMSE), lwd = 2, col = "green", type = "l",
     ylim = c(min(trnMSE), max(valMSE)),
     xlab = "Model Complexity", ylab = "Mean Squared Error")
lines(1:nModels, colMeans(valMSE), lwd = 2, col = "red")

# Overlay the per-trial Training and Validation MSE curves
for (t in 1:nTrials) {
  lines(1:nModels, trnMSE[t, ], lwd = 0.5, col = "limegreen")
  lines(1:nModels, valMSE[t, ], lwd = 0.5, col = "pink")
}

# LOOCV (Leave-One-Out Cross-Validation) Approach
# cv.glm() with no K argument performs LOOCV; delta[1] is the raw
# CV estimate, delta[2] a bias-corrected version
library(boot)
glmFit <- glm(medv ~ lstat, data = boston)
cvMSE <- cv.glm(boston, glmFit)
cvMSE$delta[1]

# Compute the LOOCV MSE over the different models
# (LOOCV refits the model n times per degree, so this loop is slow)
nModels <- 7
cvMSE <- rep(0, nModels)
for (m in 1:nModels) {
  glmFit <- glm(medv ~ poly(lstat, m), data = boston)
  cvMSE[m] <- cv.glm(boston, glmFit)$delta[1]
}
cvMSE
lines(1:nModels, cvMSE, lwd = 2, col = "steelblue")

# k-Fold Cross-Validation Approach
# Compute the k-Fold CV MSE over the different models
nModels <- 7
k <- 10   # number of folds
kcvMSE <- rep(0, nModels)
for (m in 1:nModels) {
  glmFit <- glm(medv ~ poly(lstat, m), data = boston)
  kcvMSE[m] <- cv.glm(boston, glmFit, K = k)$delta[1]
}
kcvMSE
lines(1:nModels, kcvMSE, lwd = 2, col = "magenta")

# Using the "caret" package
library(caret)

# Split the data into Train : Test = 75% : 25%
set.seed(1)   # for a reproducible split
train <- createDataPartition(boston$medv, p = 0.75, list = FALSE)
trainData <- boston[train, ]
testData  <- boston[-train, ]

# Set the validation method (LOOCV, cv, repeatedcv, boot)
fitControl <- trainControl(method = "cv", number = 10)

# Train a linear model accordingly
glmFit <- train(medv ~ lstat, data = trainData,
                method = "glm", trControl = fitControl)
glmFit$finalModel
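
# -------------------------------------------------------
# (Added) Evaluate the trained model on the held-out Test Set
# -------------------------------------------------------
# A minimal sketch: the split above creates testData but never uses it,
# so this estimates generalization error on it. "pred" is a name
# introduced here; RMSE() and postResample() are caret helpers.
pred <- predict(glmFit, newdata = testData)
RMSE(pred, testData$medv)           # test RMSE
postResample(pred, testData$medv)   # RMSE, R-squared and MAE together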
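
# -------------------------------------------------------
# (Added) Repeated k-Fold CV with caret
# -------------------------------------------------------
# The trainControl() comment above lists "repeatedcv" as an option; this
# sketch averages 10-fold CV over 5 repeats. The fold/repeat counts, the
# quadratic formula, and the names repControl/glmFit2 are choices made
# here, not part of the original script.
repControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
glmFit2 <- train(medv ~ poly(lstat, 2), data = trainData,
                 method = "glm", trControl = repControl)
glmFit2$results   # resampled RMSE / R-squared, averaged over all repeats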
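
# -------------------------------------------------------
# (Added) Manual k-Fold CV, for illustration
# -------------------------------------------------------
# A hand-rolled version of what cv.glm(..., K = k) estimates for the
# degree-1 model, to make the mechanics explicit. The fold assignment and
# the names folds/foldMSE/heldOut are choices made here; cv.glm splits the
# data its own way, so the two numbers will only roughly agree.
set.seed(1)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(boston)))    # random fold labels
foldMSE <- rep(0, k)
for (f in 1:k) {
  heldOut <- which(folds == f)                          # indices of fold f
  fit  <- glm(medv ~ lstat, data = boston[-heldOut, ])  # fit on the other k-1 folds
  pred <- predict(fit, newdata = boston[heldOut, ])     # predict the held-out fold
  foldMSE[f] <- mean((boston$medv[heldOut] - pred)^2)
}
mean(foldMSE)   # compare with cv.glm(boston, glm(medv ~ lstat, data = boston), K = k)$delta[1]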