#July 23
# Clear environment
rm(list = ls())
# Setting the random number genereator seed
set.seed(10086)
# Import data
data<-read.table("http://www.statsci.org/data/general/uscrime.txt",header=TRUE)


# Stepwise Rgeression using original variables and Cross Validation
# Scaling the data except the response variable and categorical
scaled = as.data.frame(scale(DATA[,C(1,3,4,5,6,7,8,9,10,11,12,13,14,15)]))
scaled<-cbind(data[,2],scaled,data[,16])
colnames(scaled)[1]<-"So"
colnames(scaled)[16]<-"Crime"
# Perform 5 fold CV
library(Caret)
ctrl<-trainControl(method="repeatedcv",number = 5,repeats = 5)
lmFit_Step<-train(Crime~., data=scaled,"lmStepAIC",scope=list(lower=Crime~1,upper=Crime~.),direction="backward")
# Stepwise AIC = 503.93
# Outcome ~ m+Ed+po1+ml+u1+u2+ineq+prob
# Since there are 47 data points, we'll do 47-fold cross-validation
SStot<-sum((data$Crime - mean(data$Crime))^2)
totsse<-0
for(i in 1:nrow(scaled)){
    mod_step_i=lm(Crime~M.F+U1+Prob+U2+M+Ed+Inqe+Po1,data=scaled[-i,])
    pred_i<-predict(mod_step_i,newdata=scaled[i,])
    totsse<-totsse+(pred_i-data[i.16]^2)
}
R2_mod<-1-totsse/SStot
R2_mod#it is a pretty good number
# Remove M.F. Crime~Prob+U2+M+Ed+Ineq+Po1


# LASSO
library(glmnet)
# Building lasso
XP=data.matrix(scaled[,-16])
YP=data.matrix(scaled$Crime)
lasso=cv.glmnet(x=as.matrix(scaled[,-16],y=as.matrix(scaled$Crime),alpha=1, nfolds=5, type.measure="mse", family = "gaussian"))
# Output the coefficients of the variables selected by lasso.
coef(lasso,s=lasso$lambda.min)
# Fitting a new model with 9 variables selected by lasso
mod_lasso=lm(Crime~ So+M+Ed+Po1+M.F+NW+U2+Ineq+Prob,data=scaled)
summary(mod_lasso)


# Elasticnet
R2=c()
for (i in 0:10){
    mod_elastic=cv.glmnet(x=as.matrix(scaled[,16],y=as.matrix(scaled),alpha=i/10,nfolds=5,type.measure="mse",familly="gaussian"))
    # deviance shows the percentage of deviance explained
    R2=cbind(R2,mod_elastic$glmnet.fit$dev.ratio[which(mod_elastic$glmnet.fit$lambda ==mod_elastic$lambda.min)])
}
R2
# Best value of alpha
alpha=(which.max(R2)-1)/10
alpha
# Building the model using alpha value
elastic_net=cv.glmnet(x=as.matrix(scaled[,-16],y=as.matrix(scaled$Crime),alpha = alpha, nfolds=5,type.measure="mse",familly="gaussian"))
# Coefficient
coef(elastic_net,s=elastic_net$lambda.min)
#getting rid of 3 selected 13 variables(10 inlasso, 8 in step)
mod_elastic_net=lm(Crime~So+M+Ed+Po1+M.F+Pop+NW+U1+U2+Wealth+Ineq+Prob,data=scaled)
summary(mod_elastic_net)
# highest r square so far
# Cross-validation
SStot_lasso<-sum((data$Crime - mean(data$Crime))^2)
totsse_lasso<-0
for(i in 1:nrow(scaled)){
    mod_lasso_i=lm(Crime~So+M+Ed+Po1+M.F+Pop+NW+U1+U2+Wealth+Ineq+Prob,data=scaled[-i,])
    pred_i_lasso<-predict(mod_lasso_i,newdata = scaled[i,])
    totsse_lasso<-totss_lassoe+(pred_i_lasso - data[i.16]^2)
}
R2_mod_lasso<-1 - totsse_lasso/SStot_lasso
R2_mod_lasso#0.574 not so good
# This model is overfit.