import pandas as pd
from scipy import stats
# Location of the Walmart customer purchase-behaviour workbook.
file_path = '/mnt/data/A题-沃尔玛客户购买行为数据集.xlsx'


def clean_purchase_data(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw purchase table.

    Steps, in order:
      1. Drop the row labelled ``0``.
      2. Coerce ``Purchase_Amount`` and ``Age`` to numeric; values that
         cannot be parsed become NaN.
      3. Replace NaN in those two columns with the column mean.
      4. Convert ``City`` to a categorical column.

    Args:
        raw: Table containing at least the columns ``Purchase_Amount``,
            ``Age`` and ``City``, with a row labelled ``0``.

    Returns:
        A new DataFrame; ``raw`` itself is not modified.

    Raises:
        KeyError: If a required column or the row labelled ``0`` is missing.
    """
    # NOTE(review): the original author drops the first data row because it
    # is said to hold column labels rather than data. read_excel already
    # consumes one header row, so confirm the workbook really carries a
    # second label row; otherwise this discards a genuine record.
    cleaned = raw.drop(index=0)

    for column in ('Purchase_Amount', 'Age'):
        numeric = pd.to_numeric(cleaned[column], errors='coerce')
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column selection: the chained in-place form is deprecated and
        # leaves the frame untouched once pandas Copy-on-Write is active.
        # NOTE(review): filling with the overall mean pulls every city
        # toward the grand mean; kept as in the original analysis.
        cleaned[column] = numeric.fillna(numeric.mean())

    # Categorical storage reduces memory use for the repeated city names.
    cleaned['City'] = cleaned['City'].astype('category')
    return cleaned


def group_purchases_by_city(cleaned: pd.DataFrame) -> pd.Series:
    """Collect the purchase amounts of each city into a list.

    Args:
        cleaned: Table with ``City`` and ``Purchase_Amount`` columns.

    Returns:
        A Series indexed by city whose values are lists of purchase amounts.
        Rows whose city is missing are left out by ``groupby``.
    """
    # observed=True restricts the result to cities that actually occur, so a
    # categorical column can never contribute an empty sample to the test.
    return (
        cleaned.groupby('City', observed=True)['Purchase_Amount'].apply(list)
    )


def run_city_anova(city_groups: pd.Series):
    """Run a one-way ANOVA across the per-city purchase samples.

    Args:
        city_groups: Series of per-city samples, as produced by
            ``group_purchases_by_city``.

    Returns:
        The ``scipy.stats.f_oneway`` result, which unpacks as
        ``(f_statistic, p_value)``.

    Raises:
        ValueError: If fewer than two cities are present, because the test
            needs at least two groups to compare.
    """
    if len(city_groups) < 2:
        raise ValueError(
            "ANOVA needs at least two city groups; got %d" % len(city_groups)
        )
    return stats.f_oneway(*city_groups)


if __name__ == "__main__":
    # Load the data.
    data_new = pd.read_excel(file_path)

    # Clean it: drop the label row, coerce numerics, impute, categorise.
    data_new_clean = clean_purchase_data(data_new)

    # Group the purchase amounts by city.
    city_purchase_data = group_purchases_by_city(data_new_clean)

    # Run the ANOVA test.
    f_statistic, p_value = run_city_anova(city_purchase_data)

    # Report the result.
    print("F统计量:", f_statistic)
    print("P值:", p_value)