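# NOTE: stray pasted UI label "编辑代码" ("Edit code") kept only as a comment; as a bare name it raised NameError.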

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw data set; the file is decoded as GBK.
df = pd.read_csv('实验3-商业银行信用评价数据.csv', encoding='gbk')

# Predictor columns used for binning / WOE encoding, and the target column
# (the code further down sums the target, so it is presumably coded 0/1).
feature_columns = [
    '信贷比率', '90日内逾期次数', '花呗逾期次数', '历史分期消费次数', '家庭负债率',
    '网贷逾期次数', '贷款额度', '定存数量', '理财产品数量',
]
X = df[feature_columns]
y = df['好坏客户']

# Chi-square (ChiMerge-style) binning
def chi2_bin(x, y, max_bins=10):
    """Merge adjacent bins of ``x`` by chi-square until at most ``max_bins`` edges remain.

    Starts with one edge per distinct value of ``x``. On every pass, each pair
    of adjacent bins is summarised as a 2x2 table (bin x outcome) and the edge
    between the pair with the smallest chi-square statistic is removed.

    Args:
        x: Feature values. Must support element-wise comparison and produce a
            boolean mask usable to index ``y`` (e.g. a pandas Series).
        y: Target aligned with ``x``. ``y[mask].sum()`` is taken as the count
            of "good" rows, so it is assumed to be coded 0/1.
        max_bins: Merging stops once no more than this many edges are left.

    Returns:
        Sorted numpy array of the retained edges. Each edge is the lower bound
        of a bin; the final bin has no upper bound.

    Notes:
        A pair whose 2x2 table contains a zero cell is never a merge candidate.
        If every pair is skipped for that reason, merging stops early and more
        than ``max_bins`` edges may be returned.
    """
    bins = np.unique(x)
    while len(bins) > max_bins:
        # Track the pair index itself: skipped pairs would otherwise shift the
        # position of the minimum relative to the edge that must be removed.
        best_pair = None
        best_chi2 = np.inf
        for i in range(len(bins) - 1):
            bin1 = (x >= bins[i]) & (x < bins[i + 1])
            if i + 2 < len(bins):
                bin2 = (x >= bins[i + 1]) & (x < bins[i + 2])
            else:
                # Last bin is open-ended on the right.
                bin2 = x >= bins[i + 1]

            good1 = y[bin1].sum()
            bad1 = bin1.sum() - good1
            good2 = y[bin2].sum()
            bad2 = bin2.sum() - good2

            # Skip tables with an empty cell to avoid zero expected frequencies.
            if good1 == 0 or bad1 == 0 or good2 == 0 or bad2 == 0:
                continue

            observed = np.array([[good1, bad1], [good2, bad2]])
            chi2, _, _, _ = chi2_contingency(observed)
            # Strict '<' keeps the first minimum, matching np.argmin tie-breaking.
            if chi2 < best_chi2:
                best_chi2 = chi2
                best_pair = i

        # No mergeable pair left: stop instead of looping forever.
        if best_pair is None:
            break

        # Removing the edge between bin i and bin i + 1 merges the two bins.
        bins = np.delete(bins, best_pair + 1)
    return bins

# Run chi-square binning on every feature; maps column name -> array of bin edges.
bin_edges = {col: chi2_bin(X[col], y) for col in X.columns}

# For each feature, draw a histogram whose bars follow the chi-square bin edges.
for col in X.columns:
    edges = bin_edges[col]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(X[col], bins=edges, edgecolor='black')
    ax.set_title(f'{col} 的卡方分箱分布')
    ax.set_xlabel(col)
    ax.set_ylabel('频数')
    plt.show()

# Weight of Evidence (WOE) and Information Value (IV)
def woe_iv(x, y, bins):
    """Compute the per-bin Weight of Evidence and the total Information Value.

    ``bins`` is read as ``len(bins) - 1`` consecutive intervals. Every interval
    is ``[bins[i], bins[i + 1])`` except the last, which is open-ended
    (``x >= bins[-2]``). This way rows at or above the final edge (e.g. the
    column maximum) are assigned to the last bin instead of being dropped,
    consistent with ``chi2_bin`` treating its final bin as unbounded above.

    Args:
        x: Feature values supporting element-wise comparison (e.g. a pandas
            Series); the resulting mask is used to index ``y``.
        y: Target aligned with ``x``. Its sum is used as the number of "good"
            rows and ``count() - sum()`` as the number of "bad" rows, so it is
            assumed to be coded 0/1.
        bins: Sorted sequence of bin edges.

    Returns:
        Tuple ``(woe, iv)`` where ``woe`` is a numpy array with one value per
        interval and ``iv`` is the sum of the per-interval IV contributions.
    """
    total_good = y.sum()
    total_bad = y.count() - total_good
    woe_values = []
    iv_values = []
    last_bin = len(bins) - 2
    for i in range(len(bins) - 1):
        if i < last_bin:
            bin_mask = (x >= bins[i]) & (x < bins[i + 1])
        else:
            # Final bin: no upper bound, so the largest values are included.
            bin_mask = x >= bins[i]
        good = y[bin_mask].sum()
        bad = bin_mask.sum() - good
        # A bin with no good (or no bad) rows gets a small floor rate so the
        # logarithm below stays finite.
        good_rate = good / total_good if good > 0 else 0.0001
        bad_rate = bad / total_bad if bad > 0 else 0.0001
        woe = np.log(good_rate / bad_rate)
        woe_values.append(woe)
        iv_values.append((good_rate - bad_rate) * woe)
    return np.array(woe_values), np.sum(iv_values)

# Compute the WOE vector and the IV of every feature, keyed by column name.
woe_dict = {}
iv_dict = {}
for col in X.columns:
    woe_dict[col], iv_dict[col] = woe_iv(X[col], y, bin_edges[col])

# For each feature, plot the WOE of every bin against the bin's left edge.
for col in X.columns:
    left_edges = bin_edges[col][:-1]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(left_edges, woe_dict[col], marker='o')
    ax.set_title(f'{col} 的 WOE 值分布')
    ax.set_xlabel(col)
    ax.set_ylabel('WOE 值')
    plt.show()

# Replace every raw feature value with the WOE of the bin it falls into.
# Cast to float up front: WOE values are floats, and writing them into an
# integer column would rely on implicit upcasting (deprecated in recent pandas).
X_woe = X.astype(float)
for col in X.columns:
    edges = bin_edges[col]
    last_bin = len(edges) - 2
    for i in range(len(edges) - 1):
        if i < last_bin:
            mask = (X[col] >= edges[i]) & (X[col] < edges[i + 1])
        else:
            # The final bin is open-ended so rows at or above the last edge
            # (e.g. the column maximum) are encoded too instead of keeping
            # their raw value.
            mask = X[col] >= edges[i]
        # Masks are built from the untouched X, so earlier replacements in
        # X_woe cannot affect later bins.
        X_woe.loc[mask, col] = woe_dict[col][i]

# Hold out 20% of the rows as a test split (fixed random_state for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X_woe, y, test_size=0.2, random_state=42
)

# Fit a logistic regression with default settings on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predicted probability from column 1 of predict_proba for each test row,
# scaled to 0-100 as a simple score.
y_pred_proba = model.predict_proba(X_test)[:, 1]
scores = 100 * y_pred_proba

# Pie chart of how many test scores fall into each of the two score bands
# cut at 0 / 50 / 100 (lower band labelled 'Good', upper band 'Bad').
labels = ['Good', 'Bad']
bins = [0, 50, 100]
categories = pd.cut(scores, bins=bins, labels=labels)
category_counts = categories.value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
ax.set_title('评分结果分布')
plt.show()

# ROC curve of the test-split probabilities and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC 曲线 (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('假正率 (FPR)')
ax.set_ylabel('真正率 (TPR)')
ax.set_title('ROC 曲线')
ax.legend(loc='lower right')
plt.show()

# Confusion matrix of predicted vs. true labels on the test split, drawn as a heatmap.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('预测标签')
ax.set_ylabel('真实标签')
ax.set_title('混淆矩阵')
plt.show()

# Bar chart of the fitted logistic-regression coefficients (signed), one per
# feature, sorted from largest to smallest.
feature_importance = pd.Series(model.coef_[0], index=X.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.plot(kind='bar', ax=ax)
ax.set_title('特征重要性')
ax.set_xlabel('特征')
ax.set_ylabel('权重')
plt.show()

# Write the test-split scores to CSV as a single '评分' column, without the row index.
result_df = pd.Series(scores, name='评分').to_frame()
result_df.to_csv('credit_scores.csv', index=False)

# Report the accuracy of the predicted labels on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('模型准确率:', accuracy)