# include packages
library(openintro)     # data
library(ggplot2)       # visualization
library(rpart)         # decision trees
library(rpart.plot)    # decision tree visualization
library(caret)         # performance evaluation
library(randomForest)  # random forests
library(e1071)         # svm + tuning
library(mlr)           # unique wrapper to allow comaparsion using DALEX 
library(DALEX)         # model intepretation: variable importance and PDP
library(ingredients)   # tool for clculation of variable importance and pdp within DALEX 

# reproducibility with RVersion "3.5.1"
RNGversion(vstr = "3.5.1")   

# data generation
data(marioKart)
str(marioKart)

# remove variables ID, shipping price and shipping speed
x <- marioKart[,c(2:5,9:12,7)]   
names(x)[9] <- "Price"  # rename Target variable
str(x)

# split into training- and test data
set.seed(42)
ids       <- sample(nrow(x))  
train     <- x[ids[1:100],]      # training data
taskdata  <- x[ids[101:143],-9]  # test data w/o target
truth     <- x[ids[101:143],9]   # target values of test data for evaluation

# create mlr task and train models 
task <- makeRegrTask(data = train[,-8], target = "Price")
lns <- listLearners()
lm.learner <- makeLearner("regr.lm")
rp.learner <- makeLearner("regr.rpart")
rf.learner <- makeLearner("regr.randomForest")

# tree
rp.model <- train(rp.learner, task)
rpart.plot(rp.model$learner.model)

# forest
set.seed(42)
rf.model <- train(rf.learner, task)

# variable selection for linear regression
sel <- NULL
nms <- nms.full <- names(train)[-c(8:9)] # Price and title
adrsqs <- rep(NA, length(nms))
names(adrsqs) <- nms
for(j in nms){
  form <- paste("Price ~ ", j)
  summary(lm(form, data = train))$adj.r.squared
  adrsqs[which(names(adrsqs) == j)] <- summary(lm(form, data = train))$adj.r.squared
}

sel <- c(sel, which.max(adrsqs))
sel
nms <- nms.full[-sel]

while(length(nms) > 0){
  adrsqs <- cbind(adrsqs, NA)
  for(v in nms){
    #x <- names(d)[i]  
    form <- "Price ~ "
    for (j in sel)   form <- paste(form, nms.full[j], "+ ")
    form <- paste(form, v)
    summary(lm(form, data = x))$adj.r.squared
    adrsqs[which(rownames(adrsqs) == v), ncol(adrsqs)] <- summary(lm(form, data = train))$adj.r.squared
  }
  adrsqs
  cat(which.max(adrsqs[,ncol(adrsqs)]), max(adrsqs[,ncol(adrsqs)], na.rm=T))
  sel <- c(sel, which.max(adrsqs[,ncol(adrsqs)]))
  sel
  nms <- nms.full[-sel]
}

colnames(adrsqs) <- paste("Step",1:ncol(adrsqs))
round(adrsqs, 3)

# linear regression model
lm.task <- makeRegrTask(data = train[,-c(5,8)], target = "Price")
lm.model <- train(lm.learner, lm.task)
summary(lm.model$learner.model)


# preprocess data for svm
train_svm <- train
levels(train_svm$cond) <- c(1,-1)
train_svm$cond <- as.numeric(as.character(train_svm$cond))
levels(train_svm$stockPhoto) <- c(-1, 1) 
train_svm$stockPhoto <- as.numeric(as.character(train_svm$stockPhoto))

# remove outliers
bp <- boxplot(train_svm$Price, plot=FALSE)
train_svm <- train_svm[-which(train_svm$Price > bp$stats[5,1]),]

train_targets <- train_svm$Price

# standardize
mean_r_1 <- apply(train_svm[,c(1:7)], 2, mean)
std_r_1 <- apply(train_svm[,c(1:7)], 2, sd)

train_svm <- scale(train_svm[,c(1:7)], center = mean_r_1, scale = std_r_1)
train_svm <- as.data.frame(train_svm)

# tune svm hyperparameters 
set.seed(42)
svm_tune <- tune.svm(Price~., data=data.frame(train_svm ,"Price" = train_targets),
                     gamma = c(0,0.001,0.0025,0.005,0.0075,0.01,0.02,0.025,0.03,0.04,
                               0.05,0.06,0.07,0.075,0.08,0.09,0.1,0.25,0.5,0.75,1,5,10), 
                     cost = c(0.1,0.5,1,5,10,15,20,25:45,50,55,60,75,100))
svm_tune

# fit svm with optimized hyperparameters on entire training data
sv.task    <- makeRegrTask(data = data.frame(train_svm, Price = train_targets), target = "Price")
sv.learner <- makeLearner("regr.svm", par.vals = list(gamma = 0.01, cost = 5))
sv.model   <- train(sv.learner, sv.task)

# for comparison: default svm 
sv0.learner <- makeLearner("regr.svm")
sv0.model   <- train(sv0.learner, sv.task)

# prepare test data 
task_svm <- taskdata
levels(task_svm$cond) <- c(1,-1)
task_svm$cond <- as.numeric(as.character(task_svm$cond))
levels(task_svm$stockPhoto) <- c(-1, 1) 
task_svm$stockPhoto <- as.numeric(as.character(task_svm$stockPhoto))
task_svm <- scale(task_svm[,c(1:7)], center = mean_r_1, scale = std_r_1)
task_svm <- as.data.frame(task_svm)


### Performance evaluation
res.test    <- NULL
res.test_wo <- NULL
res.train   <- NULL

# decision tree
pred <- predict(rp.model, newdata = taskdata[,-8])$data$response
res.test <- rbind(res.test, postResample(pred, truth)) 

pred <- predict(rp.model, newdata = taskdata[truth < 300, -8])$data$response
res.test_wo <- rbind(res.test_wo, postResample(pred, truth[truth < 300])) 

pred <- predict(rp.model, task)$data$response
res.train <- rbind(res.train, postResample(pred, train$Price)) 


# linear model
pred <- predict(lm.model, newdata = taskdata[,-c(5,8)])$data$response
res.test <- rbind(res.test, postResample(pred, truth)) 

pred <- predict(lm.model, newdata = taskdata[truth < 300,-c(5,8)])$data$response
res.test_wo <- rbind(res.test_wo, postResample(pred, truth[truth < 300])) 

pred <- predict(lm.model, lm.task)$data$response
res.train <- rbind(res.train, postResample(pred, train$Price)) 


# random forest
pred <- predict(rf.model, newdata = taskdata[,-8])$data$response
res.test <- rbind(res.test, postResample(pred, truth)) 

pred <- predict(rf.model, newdata = taskdata[truth < 300,-8])$data$response
res.test_wo <- rbind(res.test_wo, postResample(pred, truth[truth < 300])) 

pred <- predict(rf.model, task)$data$response
res.train <- rbind(res.train, postResample(pred, train$Price)) 


# svm
pred <- predict(sv.model, newdata = task_svm)$data$response
res.test <- rbind(res.test, postResample(pred, truth)) 

pred <- predict(sv.model, newdata = task_svm[truth < 300,])$data$response
res.test_wo <- rbind(res.test_wo, postResample(pred, truth[truth < 300])) 

pred <- predict(sv.model, sv.task)$data$response
res.train <- rbind(res.train, postResample(pred, train_targets)) 


# default svm
pred <- predict(sv0.model, newdata = task_svm)$data$response
res.test <- rbind(res.test, postResample(pred, truth)) 

pred <- predict(sv0.model, newdata = task_svm[truth < 300,])$data$response
res.test_wo <- rbind(res.test_wo, postResample(pred, truth[truth < 300])) 

pred <- predict(sv0.model, sv.task)$data$response
res.train <- rbind(res.train, postResample(pred, train_targets)) 

row.names(res.train) <- c("decision tree", "linear model", "random forest","tuned svm","default svm")
result <- cbind(res.test, res.test_wo, res.train)
result <- result[,-c(1,4,7)]
colnames(result) <- paste(colnames(result), rep(c("test","test_w/o","train"),each = 2), sep = "_")
result <- result[order(result[,3], decreasing = TRUE),]
result


# create dalex explainers required for computation of variable importance and pdp 
custom_predict <- function(object, newdata) return(predict(object, newdata = newdata)$data$response) # custom predict function for mlr regression models

rp.explainer  <- explain(rp.model, data = train, y = train$Price, label = "rpart", predict_function = custom_predict)
rf.explainer  <- explain(rf.model, data = train[,-8], y = train$Price, label = "rf", predict_function = custom_predict)
lm.explainer  <- explain(lm.model, data = train, y = train$Price, label = "lm", predict_function = custom_predict)
sv.explainer  <- explain(sv.model, data = train_svm, y = train_targets, label = "svm", predict_function = custom_predict)

# variable importance
library(ingredients)
set.seed(42)
rp.vi  <- feature_importance(rp.explainer)
rf.vi  <- feature_importance(rf.explainer)
lm.vi  <- feature_importance(lm.explainer)
sv.vi  <- feature_importance(sv.explainer)
plot(rf.vi, sv.vi, bar_width = 2) 

### PDP
rp.pdp   <- partial_dependency(rp.explainer,  variables = "wheels")
rf.pdp   <- partial_dependency(rf.explainer,  variables = "wheels")
lm.pdp   <- partial_dependency(lm.explainer,  variables = "wheels") 
sv.pdp   <- partial_dependency(sv.explainer,  variables = "wheels")
# re-scale standardized variable from svm model 
sv.pdp$`_x_` <- sv.pdp$`_x_` * std_r_1[7] + mean_r_1[7]  
plot(rp.pdp, rf.pdp, lm.pdp, sv.pdp)

