import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
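# The feature names below are Chinese, so the plots need a CJK-capable font.
# A minimal sketch, assuming SimHei is installed (swap in any CJK font you have):
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly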
# Load the dataset (GBK-encoded); the file and column names are Chinese and
# must match the source data exactly.
df = pd.read_csv('实验3-商业银行信用评价数据.csv', encoding='gbk')
X = df[['信贷比率', '90日内逾期次数', '花呗逾期次数', '历史分期消费次数', '家庭负债率',
        '网贷逾期次数', '贷款额度', '定存数量', '理财产品数量']]
y = df['好坏客户']  # binary good/bad customer label
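# Quick sanity check (assumes a 0/1 label): credit data is typically
# imbalanced, which matters when reading the accuracy reported at the end.
print(y.value_counts(normalize=True))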
def chi2_bin(x, y, max_bins=10):
    """ChiMerge-style binning: repeatedly merge the pair of adjacent bins
    whose 2x2 class contingency table has the smallest chi-square statistic,
    until at most max_bins edges remain."""
    bins = np.unique(x)
    while len(bins) > max_bins:
        chi2_values = []  # (chi2, boundary index) pairs
        for i in range(len(bins) - 1):
            bin1 = (x >= bins[i]) & (x < bins[i + 1])
            if i + 2 < len(bins):
                bin2 = (x >= bins[i + 1]) & (x < bins[i + 2])
            else:
                bin2 = x >= bins[i + 1]  # last bin is right-inclusive
            good1 = y[bin1].sum()
            bad1 = bin1.sum() - good1
            good2 = y[bin2].sum()
            bad2 = bin2.sum() - good2
            observed = np.array([[good1, bad1], [good2, bad2]])
            if observed.sum(axis=0).min() == 0 or observed.sum(axis=1).min() == 0:
                # Degenerate table (an empty class or empty bin): the pair is
                # indistinguishable, so force a merge with chi2 = 0. Skipping
                # such pairs would misalign chi2_values with the bin edges.
                chi2_values.append((0.0, i))
                continue
            chi2, _, _, _ = chi2_contingency(observed)
            chi2_values.append((chi2, i))
        # Remove the boundary between the most similar pair of adjacent bins.
        _, min_index = min(chi2_values)
        bins = np.delete(bins, min_index + 1)
    return bins
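# Note: seeding the edges with np.unique(x) makes ChiMerge quadratic for
# high-cardinality features. A common speed-up (not applied here) is to
# pre-bin with quantiles first, e.g.:
#   coarse_edges = np.unique(np.quantile(x, np.linspace(0, 1, 101)))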
# Compute chi-square bin edges for every feature.
bin_edges = {}
for col in X.columns:
    bin_edges[col] = chi2_bin(X[col], y)
for col in X.columns:
    plt.figure(figsize=(10, 6))
    plt.hist(X[col], bins=bin_edges[col], edgecolor='black')
    plt.title(f'Chi-square binning of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()
def woe_iv(x, y, bins):
    """Per-bin WOE and total IV:
    WOE_i = ln(good_rate_i / bad_rate_i), IV = sum((good_rate_i - bad_rate_i) * WOE_i).
    'good' here simply denotes the y == 1 class; if 1 encodes bad customers
    the WOE values flip sign but the IV is unchanged."""
    woe_values = []
    iv_values = []
    total_good = y.sum()
    total_bad = y.count() - total_good
    for i in range(len(bins) - 1):
        if i == len(bins) - 2:
            # Make the last bin right-inclusive so the maximum value is binned.
            bin_mask = (x >= bins[i]) & (x <= bins[i + 1])
        else:
            bin_mask = (x >= bins[i]) & (x < bins[i + 1])
        good = y[bin_mask].sum()
        bad = bin_mask.sum() - good
        # Floor the rates to avoid division by zero / log(0) on empty cells.
        good_rate = max(good / total_good, 0.0001)
        bad_rate = max(bad / total_bad, 0.0001)
        woe = np.log(good_rate / bad_rate)
        woe_values.append(woe)
        iv_values.append((good_rate - bad_rate) * woe)
    return np.array(woe_values), np.sum(iv_values)
# Compute WOE values and IV for every feature.
woe_dict = {}
iv_dict = {}
for col in X.columns:
    woe, iv = woe_iv(X[col], y, bin_edges[col])
    woe_dict[col] = woe
    iv_dict[col] = iv
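# Rank features by IV. The thresholds are conventional rules of thumb, not
# derived from this dataset: IV < 0.02 unpredictive, 0.02-0.1 weak,
# 0.1-0.3 medium, > 0.3 strong.
print(pd.Series(iv_dict).sort_values(ascending=False))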
for col in X.columns:
    plt.figure(figsize=(10, 6))
    plt.plot(bin_edges[col][:-1], woe_dict[col], marker='o')
    plt.title(f'WOE values of {col}')
    plt.xlabel(col)
    plt.ylabel('WOE')
    plt.show()
# Replace each raw feature value with the WOE of its bin.
X_woe = X.copy().astype(float)  # float so WOE values can be assigned
for col in X.columns:
    edges = bin_edges[col]
    for i in range(len(edges) - 1):
        if i == len(edges) - 2:
            mask = (X[col] >= edges[i]) & (X[col] <= edges[i + 1])  # right-inclusive last bin
        else:
            mask = (X[col] >= edges[i]) & (X[col] < edges[i + 1])
        X_woe.loc[mask, col] = woe_dict[col][i]
X_train, X_test, y_train, y_test = train_test_split(X_woe, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)  # raise the iteration cap to be safe
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1
scores = y_pred_proba * 100  # naive 0-100 score
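# The 0-100 score above is just probability * 100. A conventional scorecard
# instead scales log-odds with a base score and PDO ("points to double the
# odds"). A minimal sketch, with assumed parameters (base score 600 at odds
# 1:1, PDO 20) and assuming class 1 encodes bad customers:
pdo_b = 20 / np.log(2)
p = np.clip(y_pred_proba, 1e-6, 1 - 1e-6)  # guard against log(0)
scaled_scores = 600 - pdo_b * np.log(p / (1 - p))  # higher score = lower risk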
# Pie chart of the score distribution. The Good/Bad split at 50 assumes
# class 1 encodes bad customers, i.e. a high probability means high risk.
plt.figure(figsize=(6, 6))
labels = ['Good', 'Bad']
bins = [0, 50, 100]
categories = pd.cut(scores, bins=bins, labels=labels, include_lowest=True)
category_counts = categories.value_counts()
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
plt.title('Score distribution')
plt.show()
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.title('ROC curve')
plt.legend(loc='lower right')
plt.show()
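# KS statistic, a standard discrimination measure in credit scoring alongside
# AUC: the maximum vertical gap between the TPR and FPR curves.
ks = np.max(tpr - fpr)
print('KS statistic:', ks)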
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')
plt.show()
# With WOE-encoded features the coefficients are on a comparable scale,
# so they can serve as a rough feature-importance measure.
feature_importance = pd.Series(model.coef_[0], index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)
plt.figure(figsize=(10, 6))
feature_importance.plot(kind='bar')
plt.title('Feature importance (logistic regression coefficients)')
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.show()
# Persist the test-set scores and report overall accuracy.
result_df = pd.DataFrame({'score': scores})
result_df.to_csv('credit_scores.csv', index=False)
accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy:', accuracy)
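# Accuracy alone can look flattering on imbalanced credit data; a per-class
# precision/recall/F1 breakdown is cheap to add:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))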