编辑代码

library(moments)
library(Hmisc)
library(caret)
library(pROC)
library(e1071) # svm
library(klaR) # naive Bayes
library(class) # knn
library(randomForest) # randomForest
library(rpart) # ID 3/ CART
library(neuralnet) # BP
library(C50) # C50
library(scmamp)
library(BiocManager)
library(Rgraphviz)


# /Users/guopengxiao/Downloads/NJU/课程/软件度量/Assignments/03/xalan-2.4.csv
rm(list = ls())
data = read.csv("xalan-2.4.csv", header = TRUE)

wmc = data['wmc']
dit = data['dit']
noc = data['noc']
cbo = data['cbo']
rfc = data['rfc']
lcom = data['lcom']
bug = data['bug']


wmc_vec = unlist(wmc)
dit_vec = unlist(dit)
noc_vec = unlist(noc)
cbo_vec = unlist(cbo)
rfc_vec = unlist(rfc)
lcom_vec = unlist(lcom)
bug_vec = unlist(bug)

datas = cbind(wmc_vec, dit_vec, noc_vec, cbo_vec, rfc_vec, lcom_vec)

mins = c(min(wmc_vec), min(dit_vec), min(noc_vec), min(cbo_vec), min(rfc_vec), min(lcom_vec))
per25 = unname(c(quantile(wmc_vec, 0.25), quantile(dit_vec, 0.25), quantile(noc_vec, 0.25), quantile(cbo_vec, 0.25), quantile(rfc_vec, 0.25), quantile(lcom_vec, 0.25)))
medians = c(median(wmc_vec), median(dit_vec), median(noc_vec), median(cbo_vec), median(rfc_vec), median(lcom_vec))
per75 = unname(c(quantile(wmc_vec, 0.75), quantile(dit_vec, 0.75), quantile(noc_vec, 0.75), quantile(cbo_vec, 0.75), quantile(rfc_vec, 0.75), quantile(lcom_vec, 0.75)))
maxs = c(max(wmc_vec), max(dit_vec), max(noc_vec), max(cbo_vec), max(rfc_vec), max(lcom_vec))
means = c(mean(wmc_vec), mean(dit_vec), mean(noc_vec), mean(cbo_vec), mean(rfc_vec), mean(lcom_vec))
skews = c(skewness(wmc_vec), skewness(dit_vec), skewness(noc_vec), skewness(cbo_vec), skewness(rfc_vec), skewness(lcom_vec))
kurs = c(kurtosis(wmc_vec), kurtosis(dit_vec), kurtosis(noc_vec), kurtosis(cbo_vec), kurtosis(rfc_vec), kurtosis(lcom_vec))


print(mins)
print(per25)
print(medians)
print(per75)
print(maxs)
print(means)
print(skews)
print(kurs)

sp_wmc = rcorr(wmc_vec, bug_vec, type="spearman")
sp_dit = rcorr(dit_vec, bug_vec, type="spearman")
sp_noc = rcorr(noc_vec, bug_vec, type="spearman")
sp_cbo = rcorr(cbo_vec, bug_vec, type="spearman")
sp_rfc = rcorr(rfc_vec, bug_vec, type="spearman")
sp_lcom = rcorr(lcom_vec, bug_vec, type="spearman")

p_wmc = rcorr(wmc_vec, bug_vec, type="pearson")
p_dit = rcorr(dit_vec, bug_vec, type="pearson")
p_noc = rcorr(noc_vec, bug_vec, type="pearson")
p_cbo = rcorr(cbo_vec, bug_vec, type="pearson")
p_rfc = rcorr(rfc_vec, bug_vec, type="pearson")
p_lcom = rcorr(lcom_vec, bug_vec, type="pearson")

print(sp_wmc$r)
print(sp_wmc$P)
print(sp_dit$r)
print(sp_dit$P)
print(sp_noc$r)
print(sp_noc$P)
print(sp_cbo$r)
print(sp_cbo$P)
print(sp_rfc$r)
print(sp_rfc$P)
print(sp_lcom$r)
print(sp_lcom$P)

print(p_wmc$r)
print(p_wmc$P)
print(p_dit$r)
print(p_dit$P)
print(p_noc$r)
print(p_noc$P)
print(p_cbo$r)
print(p_cbo$P)
print(p_rfc$r)
print(p_rfc$P)
print(p_lcom$r)
print(p_lcom$P)



set.seed(555)
bug_vec[bug_vec > 0] = 1
bug_fac = as.factor(bug_vec)
datas = data.frame(datas)

train_test = cbind(datas, bug_fac)
folds = createMultiFolds(y = train_test$bug_fac, k=10, times=100)

# 建一个放auc值的空向量
lg_auc_value <- as.numeric()
nb_auc_value <- as.numeric()
svm_auc_value <- as.numeric()
knn_auc_value <- as.numeric()
rf_auc_value <- as.numeric()
id3_auc_value <- as.numeric()
cart_auc_value <- as.numeric()
bp_auc_value <- as.numeric()
c50_auc_value <- as.numeric()
knn5_auc_value <- as.numeric()

lg_acc <- as.numeric()
nb_acc <- as.numeric()
svm_acc <- as.numeric()
knn_acc <- as.numeric()
rf_acc <- as.numeric()
id3_acc <- as.numeric()
cart_acc <- as.numeric()
bp_acc <- as.numeric()
c50_acc <- as.numeric()
knn5_acc <- as.numeric()


for(i in 1:100){
  train <- train_test[folds[[i]],]
  test <- train_test[-folds[[i]],]
  
  # 数据标准化
  standard = preProcess(train, method = c("center","scale"))
  train = predict(standard, train)
  test = predict(standard, test)
  
  # method_1 - Logistic 
  lg_model = glm(bug_fac ~ ., data = train, family = 'binomial')
  lg_model_pre = predict(lg_model, type='response', newdata=test)
  
  lg_roc = roc(test$bug_fac, lg_model_pre)
  lg_auc = as.numeric(auc(lg_roc))
  lg_auc_value = append(lg_auc_value, lg_auc)
  
  lg_pre_fac = lg_model_pre
  lg_pre_fac[lg_pre_fac >= 0.5] = 1
  lg_pre_fac[lg_pre_fac < 0.5] = 0
  lg_pre_fac = as.factor(lg_pre_fac)
  tmp = as.numeric(caret::confusionMatrix(lg_pre_fac, test$bug_fac)$overall["Accuracy"])
  lg_acc = append(lg_acc, tmp)

  # method_2 - Naive Bayes
  nb_model = NaiveBayes(bug_fac ~ ., train)
  nb_model_pre = predict(nb_model, test)$posterior[, 2]
  
  nb_roc = roc(test$bug_fac, nb_model_pre) # plot(nb_roc)
  nb_auc = as.numeric(auc(nb_roc))
  nb_auc_value <- append(nb_auc_value, nb_auc)
  
  nb_pre_fac = nb_model_pre
  nb_pre_fac[nb_pre_fac >= 0.5] = 1
  nb_pre_fac[nb_pre_fac < 0.5] = 0
  nb_pre_fac = as.factor(nb_pre_fac)
  tmp = as.numeric(caret::confusionMatrix(nb_pre_fac, test$bug_fac)$overall["Accuracy"])
  nb_acc = append(nb_acc, tmp)
  
  # method_3 - SVM
  svm_model = svm(bug_fac ~ ., data = train)
  svm_model_pre = as.numeric(predict(svm_model, test)) - 1
  
  svm_roc = roc(test$bug_fac, svm_model_pre) # plot(svm_roc)
  svm_auc = as.numeric(auc(svm_roc))
  svm_auc_value <- append(svm_auc_value, svm_auc)
  
  svm_pre_fac = as.factor(svm_model_pre)
  tmp = as.numeric(caret::confusionMatrix(svm_pre_fac, test$bug_fac)$overall["Accuracy"])
  svm_acc = append(svm_acc, tmp)
  
  # method_4 - KNN - 1
  knn_model = knn(train[, 1:6], test[, 1:6], cl = train[, 7])
  knn_model_pre = as.numeric(knn_model) - 1
  
  knn_roc = roc(test$bug_fac, knn_model_pre) # plot(knn_roc)
  knn_auc = as.numeric(auc(knn_roc))
  knn_auc_value <- append(knn_auc_value, knn_auc)
  
  knn_pre_fac = as.factor(knn_model_pre)
  tmp = as.numeric(caret::confusionMatrix(knn_pre_fac, test$bug_fac)$overall["Accuracy"])
  knn_acc = append(knn_acc, tmp)
  
  # method_5 - RandomForest
  rf_model = randomForest(bug_fac ~ ., data = train, ntree = 500, mtry = 5, importance = TRUE)
  rf_model_pre = as.numeric(predict(rf_model, test)) - 1
  
  rf_roc = roc(test$bug_fac, rf_model_pre) # plot(rf_roc)
  rf_auc = as.numeric(auc(rf_roc))
  rf_auc_value <- append(rf_auc_value, rf_auc)
  
  rf_pre_fac = as.factor(rf_model_pre)
  tmp = as.numeric(caret::confusionMatrix(rf_pre_fac, test$bug_fac)$overall["Accuracy"])
  rf_acc = append(rf_acc, tmp)
  
  # method_6 - ID3
  id3_model = rpart(bug_fac~., data = train, parms = list(split="information"))
  id3_model_pre = predict(id3_model, test)[, 2]
  
  id3_roc = roc(test$bug_fac, id3_model_pre) # plot(id3_roc)
  id3_auc = as.numeric(auc(id3_roc))
  id3_auc_value <- append(id3_auc_value, id3_auc)
  
  id3_pre_fac = id3_model_pre
  id3_pre_fac[id3_pre_fac >= 0.5] = 1
  id3_pre_fac[id3_pre_fac < 0.5] = 0
  id3_pre_fac = as.factor(id3_pre_fac)
  tmp = as.numeric(caret::confusionMatrix(id3_pre_fac, test$bug_fac)$overall["Accuracy"])
  id3_acc = append(id3_acc, tmp)
  
  # method_7 - CART
  cart_model = rpart(bug_fac~., data = train, parms = list(split="gini"))
  cart_model_pre = predict(cart_model, test)[, 2]
  
  cart_roc = roc(test$bug_fac, cart_model_pre) # plot(cart_roc)
  cart_auc = as.numeric(auc(cart_roc))
  cart_auc_value <- append(cart_auc_value, cart_auc)
  
  cart_pre_fac = cart_model_pre
  cart_pre_fac[cart_pre_fac >= 0.5] = 1
  cart_pre_fac[cart_pre_fac < 0.5] = 0
  cart_pre_fac = as.factor(cart_pre_fac)
  tmp = as.numeric(caret::confusionMatrix(cart_pre_fac, test$bug_fac)$overall["Accuracy"])
  cart_acc = append(cart_acc, tmp)
  
  # method_8 - BP-4
  bp_model = neuralnet(bug_fac ~., train, hidden=2)
  bp_model_pre = compute(bp_model, test)$net.result[, 2]
  
  bp_roc = roc(test$bug_fac, bp_model_pre) # plot(bp_roc)
  bp_auc = as.numeric(auc(bp_roc))
  bp_auc_value <- append(bp_auc_value, bp_auc)
  
  bp_pre_fac = bp_model_pre
  bp_pre_fac[bp_pre_fac >= 0.5] = 1
  bp_pre_fac[bp_pre_fac < 0.5] = 0
  bp_pre_fac = as.factor(bp_pre_fac)
  tmp = as.numeric(caret::confusionMatrix(bp_pre_fac, test$bug_fac)$overall["Accuracy"])
  bp_acc = append(bp_acc, tmp)
  
  # method_9 - c50
  c50_model =  C5.0(bug_fac ~., data = train)
  c50_model_pre = as.numeric(predict(c50_model, test)) - 1
  
  c50_roc = roc(test$bug_fac, c50_model_pre) # plot(c50_roc)
  c50_auc = as.numeric(auc(c50_roc))
  c50_auc_value <- append(c50_auc_value, c50_auc)
  
  c50_pre_fac = as.factor(c50_model_pre)
  tmp = as.numeric(caret::confusionMatrix(c50_pre_fac, test$bug_fac)$overall["Accuracy"])
  c50_acc = append(c50_acc, tmp)
  
  # method_10 - KNN-5
  knn5_model = knn(train[, 1:6], test[, 1:6], k=5, cl = train[, 7])
  knn5_model_pre = as.numeric(knn5_model) - 1
  
  knn5_roc = roc(test$bug_fac, knn5_model_pre) # plot(knn5_roc)
  knn5_auc = as.numeric(auc(knn5_roc))
  knn5_auc_value <- append(knn5_auc_value, knn5_auc)
  
  knn5_pre_fac = as.factor(knn5_model_pre)
  tmp = as.numeric(caret::confusionMatrix(knn5_pre_fac, test$bug_fac)$overall["Accuracy"])
  knn5_acc = append(knn5_acc, tmp)
}


lg_ce_value = rep(0, 100)
lg_auc_max = max(lg_auc_value) - 0.5
lg_ce_value = (lg_auc_value - 0.5) / lg_auc_max

nb_ce_value = rep(0, 100)
nb_auc_max = max(nb_auc_value) - 0.5
nb_ce_value = (nb_auc_value - 0.5) / nb_auc_max

svm_ce_value = rep(0, 100)
svm_auc_max = max(svm_auc_value) - 0.5
svm_ce_value = (svm_auc_value - 0.5) / svm_auc_max

knn_ce_value = rep(0, 100)
knn_auc_max = max(knn_auc_value) - 0.5
knn_ce_value = (knn_auc_value - 0.5) / knn_auc_max

rf_ce_value = rep(0, 100)
rf_auc_max = max(rf_auc_value) - 0.5
rf_ce_value = (rf_auc_value - 0.5) / rf_auc_max

id3_ce_value = rep(0, 100)
id3_auc_max = max(id3_auc_value) - 0.5
id3_ce_value = (id3_auc_value - 0.5) / id3_auc_max

cart_ce_value = rep(0, 100)
cart_auc_max = max(cart_auc_value) - 0.5
cart_ce_value = (cart_auc_value - 0.5) / cart_auc_max

bp_ce_value = rep(0, 100)
bp_auc_max = max(bp_auc_value) - 0.5
bp_ce_value = (bp_auc_value - 0.5) / bp_auc_max

c50_ce_value = rep(0, 100)
c50_auc_max = max(c50_auc_value) - 0.5
c50_ce_value = (c50_auc_value - 0.5) / c50_auc_max

knn5_ce_value = rep(0, 100)
knn5_auc_max = max(knn5_auc_value) - 0.5
knn5_ce_value = (knn5_auc_value - 0.5) / knn5_auc_max

row_name = c(paste("auc-", 1:100, sep = ""), paste("ce-", 1:100, sep = ""))
CD_table = data.frame(row.names = row_name)

CD_table$Logistic = c(lg_auc_value, lg_ce_value)
CD_table$NaiveBayes = c(nb_auc_value, nb_ce_value)
CD_table$SVM = c(svm_auc_value, svm_ce_value)
CD_table$KNN_k1 = c(knn_auc_value, knn_ce_value)
CD_table$RandomForest = c(rf_auc_value, rf_ce_value)
CD_table$ID3 = c(id3_auc_value, id3_ce_value)
CD_table$CART = c(cart_auc_value, cart_ce_value)
CD_table$NeuralNetwork = c(bp_auc_value, bp_ce_value)
CD_table$C5.0 = c(c50_auc_value, c50_ce_value)
CD_table$KNN_k5 = c(knn5_auc_value, knn5_ce_value)

plotCD(CD_table)

row_name = c(paste("subset-", 1:100, sep = ""))
acc_table = data.frame(row.names = row_name)

acc_table$Logistic = c(lg_acc)
acc_table$NaiveBayes = c(nb_acc)
acc_table$SVM = c(svm_acc)
acc_table$KNN_k1 = c(knn_acc)
acc_table$RandomForest = c(rf_acc)
acc_table$ID3 = c(id3_acc)
acc_table$CART = c(cart_acc)
acc_table$NeuralNetwork = c(bp_acc)
acc_table$C5.0 = c(c50_acc)
# acc_table$KNN_k5 = c(knn5_acc)

pv_matrix = friedmanAlignedRanksPost(data=acc_table, control=NULL)
adjustShaffer(pv_matrix)
pv_adj <- adjustBergmannHommel(pv_matrix)
r_means <- colMeans(rankMatrix(acc_table))
# drawAlgorithmGraph(pvalue.matrix=pv_adj, mean.value=r_means)


acc_table$KNN_k5 = c(knn5_acc)
heatmap(t(as.matrix(acc_table)))