library(moments)
library(Hmisc)
library(caret)
library(pROC)
library(e1071)
library(klaR)
library(class)
library(randomForest)
library(rpart)
library(neuralnet)
library(C50)
library(scmamp)
library(BiocManager)
library(Rgraphviz)
rm(list = ls())
data = read.csv("xalan-2.4.csv", header = TRUE)
wmc = data['wmc']
dit = data['dit']
noc = data['noc']
cbo = data['cbo']
rfc = data['rfc']
lcom = data['lcom']
bug = data['bug']
wmc_vec = unlist(wmc)
dit_vec = unlist(dit)
noc_vec = unlist(noc)
cbo_vec = unlist(cbo)
rfc_vec = unlist(rfc)
lcom_vec = unlist(lcom)
bug_vec = unlist(bug)
datas = cbind(wmc_vec, dit_vec, noc_vec, cbo_vec, rfc_vec, lcom_vec)
mins = c(min(wmc_vec), min(dit_vec), min(noc_vec), min(cbo_vec), min(rfc_vec), min(lcom_vec))
per25 = unname(c(quantile(wmc_vec, 0.25), quantile(dit_vec, 0.25), quantile(noc_vec, 0.25), quantile(cbo_vec, 0.25), quantile(rfc_vec, 0.25), quantile(lcom_vec, 0.25)))
medians = c(median(wmc_vec), median(dit_vec), median(noc_vec), median(cbo_vec), median(rfc_vec), median(lcom_vec))
per75 = unname(c(quantile(wmc_vec, 0.75), quantile(dit_vec, 0.75), quantile(noc_vec, 0.75), quantile(cbo_vec, 0.75), quantile(rfc_vec, 0.75), quantile(lcom_vec, 0.75)))
maxs = c(max(wmc_vec), max(dit_vec), max(noc_vec), max(cbo_vec), max(rfc_vec), max(lcom_vec))
means = c(mean(wmc_vec), mean(dit_vec), mean(noc_vec), mean(cbo_vec), mean(rfc_vec), mean(lcom_vec))
skews = c(skewness(wmc_vec), skewness(dit_vec), skewness(noc_vec), skewness(cbo_vec), skewness(rfc_vec), skewness(lcom_vec))
kurs = c(kurtosis(wmc_vec), kurtosis(dit_vec), kurtosis(noc_vec), kurtosis(cbo_vec), kurtosis(rfc_vec), kurtosis(lcom_vec))
print(mins)
print(per25)
print(medians)
print(per75)
print(maxs)
print(means)
print(skews)
print(kurs)
sp_wmc = rcorr(wmc_vec, bug_vec, type="spearman")
sp_dit = rcorr(dit_vec, bug_vec, type="spearman")
sp_noc = rcorr(noc_vec, bug_vec, type="spearman")
sp_cbo = rcorr(cbo_vec, bug_vec, type="spearman")
sp_rfc = rcorr(rfc_vec, bug_vec, type="spearman")
sp_lcom = rcorr(lcom_vec, bug_vec, type="spearman")
p_wmc = rcorr(wmc_vec, bug_vec, type="pearson")
p_dit = rcorr(dit_vec, bug_vec, type="pearson")
p_noc = rcorr(noc_vec, bug_vec, type="pearson")
p_cbo = rcorr(cbo_vec, bug_vec, type="pearson")
p_rfc = rcorr(rfc_vec, bug_vec, type="pearson")
p_lcom = rcorr(lcom_vec, bug_vec, type="pearson")
print(sp_wmc$r)
print(sp_wmc$P)
print(sp_dit$r)
print(sp_dit$P)
print(sp_noc$r)
print(sp_noc$P)
print(sp_cbo$r)
print(sp_cbo$P)
print(sp_rfc$r)
print(sp_rfc$P)
print(sp_lcom$r)
print(sp_lcom$P)
print(p_wmc$r)
print(p_wmc$P)
print(p_dit$r)
print(p_dit$P)
print(p_noc$r)
print(p_noc$P)
print(p_cbo$r)
print(p_cbo$P)
print(p_rfc$r)
print(p_rfc$P)
print(p_lcom$r)
print(p_lcom$P)
set.seed(555)
bug_vec[bug_vec > 0] = 1
bug_fac = as.factor(bug_vec)
datas = data.frame(datas)
train_test = cbind(datas, bug_fac)
folds = createMultiFolds(y = train_test$bug_fac, k=10, times=100)
lg_auc_value <- as.numeric()
nb_auc_value <- as.numeric()
svm_auc_value <- as.numeric()
knn_auc_value <- as.numeric()
rf_auc_value <- as.numeric()
id3_auc_value <- as.numeric()
cart_auc_value <- as.numeric()
bp_auc_value <- as.numeric()
c50_auc_value <- as.numeric()
knn5_auc_value <- as.numeric()
lg_acc <- as.numeric()
nb_acc <- as.numeric()
svm_acc <- as.numeric()
knn_acc <- as.numeric()
rf_acc <- as.numeric()
id3_acc <- as.numeric()
cart_acc <- as.numeric()
bp_acc <- as.numeric()
c50_acc <- as.numeric()
knn5_acc <- as.numeric()
for(i in 1:100){
train <- train_test[folds[[i]],]
test <- train_test[-folds[[i]],]
standard = preProcess(train, method = c("center","scale"))
train = predict(standard, train)
test = predict(standard, test)
lg_model = glm(bug_fac ~ ., data = train, family = 'binomial')
lg_model_pre = predict(lg_model, type='response', newdata=test)
lg_roc = roc(test$bug_fac, lg_model_pre)
lg_auc = as.numeric(auc(lg_roc))
lg_auc_value = append(lg_auc_value, lg_auc)
lg_pre_fac = lg_model_pre
lg_pre_fac[lg_pre_fac >= 0.5] = 1
lg_pre_fac[lg_pre_fac < 0.5] = 0
lg_pre_fac = as.factor(lg_pre_fac)
tmp = as.numeric(caret::confusionMatrix(lg_pre_fac, test$bug_fac)$overall["Accuracy"])
lg_acc = append(lg_acc, tmp)
nb_model = NaiveBayes(bug_fac ~ ., train)
nb_model_pre = predict(nb_model, test)$posterior[, 2]
nb_roc = roc(test$bug_fac, nb_model_pre)
nb_auc = as.numeric(auc(nb_roc))
nb_auc_value <- append(nb_auc_value, nb_auc)
nb_pre_fac = nb_model_pre
nb_pre_fac[nb_pre_fac >= 0.5] = 1
nb_pre_fac[nb_pre_fac < 0.5] = 0
nb_pre_fac = as.factor(nb_pre_fac)
tmp = as.numeric(caret::confusionMatrix(nb_pre_fac, test$bug_fac)$overall["Accuracy"])
nb_acc = append(nb_acc, tmp)
svm_model = svm(bug_fac ~ ., data = train)
svm_model_pre = as.numeric(predict(svm_model, test)) - 1
svm_roc = roc(test$bug_fac, svm_model_pre)
svm_auc = as.numeric(auc(svm_roc))
svm_auc_value <- append(svm_auc_value, svm_auc)
svm_pre_fac = as.factor(svm_model_pre)
tmp = as.numeric(caret::confusionMatrix(svm_pre_fac, test$bug_fac)$overall["Accuracy"])
svm_acc = append(svm_acc, tmp)
knn_model = knn(train[, 1:6], test[, 1:6], cl = train[, 7])
knn_model_pre = as.numeric(knn_model) - 1
knn_roc = roc(test$bug_fac, knn_model_pre)
knn_auc = as.numeric(auc(knn_roc))
knn_auc_value <- append(knn_auc_value, knn_auc)
knn_pre_fac = as.factor(knn_model_pre)
tmp = as.numeric(caret::confusionMatrix(knn_pre_fac, test$bug_fac)$overall["Accuracy"])
knn_acc = append(knn_acc, tmp)
rf_model = randomForest(bug_fac ~ ., data = train, ntree = 500, mtry = 5, importance = TRUE)
rf_model_pre = as.numeric(predict(rf_model, test)) - 1
rf_roc = roc(test$bug_fac, rf_model_pre)
rf_auc = as.numeric(auc(rf_roc))
rf_auc_value <- append(rf_auc_value, rf_auc)
rf_pre_fac = as.factor(rf_model_pre)
tmp = as.numeric(caret::confusionMatrix(rf_pre_fac, test$bug_fac)$overall["Accuracy"])
rf_acc = append(rf_acc, tmp)
id3_model = rpart(bug_fac~., data = train, parms = list(split="information"))
id3_model_pre = predict(id3_model, test)[, 2]
id3_roc = roc(test$bug_fac, id3_model_pre)
id3_auc = as.numeric(auc(id3_roc))
id3_auc_value <- append(id3_auc_value, id3_auc)
id3_pre_fac = id3_model_pre
id3_pre_fac[id3_pre_fac >= 0.5] = 1
id3_pre_fac[id3_pre_fac < 0.5] = 0
id3_pre_fac = as.factor(id3_pre_fac)
tmp = as.numeric(caret::confusionMatrix(id3_pre_fac, test$bug_fac)$overall["Accuracy"])
id3_acc = append(id3_acc, tmp)
cart_model = rpart(bug_fac~., data = train, parms = list(split="gini"))
cart_model_pre = predict(cart_model, test)[, 2]
cart_roc = roc(test$bug_fac, cart_model_pre)
cart_auc = as.numeric(auc(cart_roc))
cart_auc_value <- append(cart_auc_value, cart_auc)
cart_pre_fac = cart_model_pre
cart_pre_fac[cart_pre_fac >= 0.5] = 1
cart_pre_fac[cart_pre_fac < 0.5] = 0
cart_pre_fac = as.factor(cart_pre_fac)
tmp = as.numeric(caret::confusionMatrix(cart_pre_fac, test$bug_fac)$overall["Accuracy"])
cart_acc = append(cart_acc, tmp)
bp_model = neuralnet(bug_fac ~., train, hidden=2)
bp_model_pre = compute(bp_model, test)$net.result[, 2]
bp_roc = roc(test$bug_fac, bp_model_pre)
bp_auc = as.numeric(auc(bp_roc))
bp_auc_value <- append(bp_auc_value, bp_auc)
bp_pre_fac = bp_model_pre
bp_pre_fac[bp_pre_fac >= 0.5] = 1
bp_pre_fac[bp_pre_fac < 0.5] = 0
bp_pre_fac = as.factor(bp_pre_fac)
tmp = as.numeric(caret::confusionMatrix(bp_pre_fac, test$bug_fac)$overall["Accuracy"])
bp_acc = append(bp_acc, tmp)
c50_model = C5.0(bug_fac ~., data = train)
c50_model_pre = as.numeric(predict(c50_model, test)) - 1
c50_roc = roc(test$bug_fac, c50_model_pre)
c50_auc = as.numeric(auc(c50_roc))
c50_auc_value <- append(c50_auc_value, c50_auc)
c50_pre_fac = as.factor(c50_model_pre)
tmp = as.numeric(caret::confusionMatrix(c50_pre_fac, test$bug_fac)$overall["Accuracy"])
c50_acc = append(c50_acc, tmp)
knn5_model = knn(train[, 1:6], test[, 1:6], k=5, cl = train[, 7])
knn5_model_pre = as.numeric(knn5_model) - 1
knn5_roc = roc(test$bug_fac, knn5_model_pre)
knn5_auc = as.numeric(auc(knn5_roc))
knn5_auc_value <- append(knn5_auc_value, knn5_auc)
knn5_pre_fac = as.factor(knn5_model_pre)
tmp = as.numeric(caret::confusionMatrix(knn5_pre_fac, test$bug_fac)$overall["Accuracy"])
knn5_acc = append(knn5_acc, tmp)
}
lg_ce_value = rep(0, 100)
lg_auc_max = max(lg_auc_value) - 0.5
lg_ce_value = (lg_auc_value - 0.5) / lg_auc_max
nb_ce_value = rep(0, 100)
nb_auc_max = max(nb_auc_value) - 0.5
nb_ce_value = (nb_auc_value - 0.5) / nb_auc_max
svm_ce_value = rep(0, 100)
svm_auc_max = max(svm_auc_value) - 0.5
svm_ce_value = (svm_auc_value - 0.5) / svm_auc_max
knn_ce_value = rep(0, 100)
knn_auc_max = max(knn_auc_value) - 0.5
knn_ce_value = (knn_auc_value - 0.5) / knn_auc_max
rf_ce_value = rep(0, 100)
rf_auc_max = max(rf_auc_value) - 0.5
rf_ce_value = (rf_auc_value - 0.5) / rf_auc_max
id3_ce_value = rep(0, 100)
id3_auc_max = max(id3_auc_value) - 0.5
id3_ce_value = (id3_auc_value - 0.5) / id3_auc_max
cart_ce_value = rep(0, 100)
cart_auc_max = max(cart_auc_value) - 0.5
cart_ce_value = (cart_auc_value - 0.5) / cart_auc_max
bp_ce_value = rep(0, 100)
bp_auc_max = max(bp_auc_value) - 0.5
bp_ce_value = (bp_auc_value - 0.5) / bp_auc_max
c50_ce_value = rep(0, 100)
c50_auc_max = max(c50_auc_value) - 0.5
c50_ce_value = (c50_auc_value - 0.5) / c50_auc_max
knn5_ce_value = rep(0, 100)
knn5_auc_max = max(knn5_auc_value) - 0.5
knn5_ce_value = (knn5_auc_value - 0.5) / knn5_auc_max
row_name = c(paste("auc-", 1:100, sep = ""), paste("ce-", 1:100, sep = ""))
CD_table = data.frame(row.names = row_name)
CD_table$Logistic = c(lg_auc_value, lg_ce_value)
CD_table$NaiveBayes = c(nb_auc_value, nb_ce_value)
CD_table$SVM = c(svm_auc_value, svm_ce_value)
CD_table$KNN_k1 = c(knn_auc_value, knn_ce_value)
CD_table$RandomForest = c(rf_auc_value, rf_ce_value)
CD_table$ID3 = c(id3_auc_value, id3_ce_value)
CD_table$CART = c(cart_auc_value, cart_ce_value)
CD_table$NeuralNetwork = c(bp_auc_value, bp_ce_value)
CD_table$C5.0 = c(c50_auc_value, c50_ce_value)
CD_table$KNN_k5 = c(knn5_auc_value, knn5_ce_value)
plotCD(CD_table)
row_name = c(paste("subset-", 1:100, sep = ""))
acc_table = data.frame(row.names = row_name)
acc_table$Logistic = c(lg_acc)
acc_table$NaiveBayes = c(nb_acc)
acc_table$SVM = c(svm_acc)
acc_table$KNN_k1 = c(knn_acc)
acc_table$RandomForest = c(rf_acc)
acc_table$ID3 = c(id3_acc)
acc_table$CART = c(cart_acc)
acc_table$NeuralNetwork = c(bp_acc)
acc_table$C5.0 = c(c50_acc)
pv_matrix = friedmanAlignedRanksPost(data=acc_table, control=NULL)
adjustShaffer(pv_matrix)
pv_adj <- adjustBergmannHommel(pv_matrix)
r_means <- colMeans(rankMatrix(acc_table))
acc_table$KNN_k5 = c(knn5_acc)
heatmap(t(as.matrix(acc_table)))