import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, 
                            confusion_matrix, roc_auc_score, roc_curve)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. 数据加载与探索
data = pd.read_csv('breast_cancer.csv')
print("数据概览:\n", data.head())
print("\n类别分布:\n", data['diagnosis'].value_counts())

# 2. 数据预处理
X = data.drop('diagnosis', axis=1)
y = data['diagnosis'].map({'M':1, 'B':0})  # 将类别转换为数值

# 处理缺失值
if X.isnull().sum().any():
    X = X.fillna(X.mean())

# 特征标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. 划分训练测试集
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# 4. 模型训练与调参
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("\n最佳参数:", grid_search.best_params_)

# 5. 模型评估
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

print("\n准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:\n", classification_report(y_test, y_pred))
print("\nAUC分数:", roc_auc_score(y_test, y_proba))

# 6. 可视化
# 混淆矩阵
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# ROC曲线
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr)
plt.plot([0,1],[0,1],'r--')
plt.title('ROC Curve')
plt.show()

# 特征重要性
importances = best_model.feature_importances_
indices = np.argsort(importances)[-10:]
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), X.columns[indices])
plt.show()