前言
存在的问题
- 是否分离主效应和交互效应?如果关注的是单个特征自身的影响,应该把主效应和交互效应分离开,单独观察主效应。
这是不分离主效应和交互效应的代码
# Build an explainer with background data (recommended) and compute SHAP
# values on the test set; check_additivity=False skips the additivity check.
explainer_test = shap.Explainer(best_model, X_test)
shap_values_test = explainer_test(X_test, check_additivity=False)
import matplotlib.pyplot as plt
import shap
plt.figure(figsize=(10, 8), dpi=300)  # figsize controls image size, dpi controls resolution
# summary_plot draws into the current figure and returns None, so its result
# is not assigned; show=False defers rendering until after savefig.
shap.summary_plot(shap_values_test, X_test, title=y_name, max_display=25, show=False)
plt.savefig("./{}.png".format(y_name), dpi=300)
plt.show()
这是分离主效应和交互效应的代码
import shap
import numpy as np
import matplotlib.pyplot as plt

# Tree explainer for the fitted model.
explainer = shap.TreeExplainer(best_model)

# Full interaction tensor, shape (n_samples, n_features, n_features).
interaction_vals = explainer.shap_interaction_values(X_test)

# The diagonal of each per-sample matrix holds the main effects
# (interaction contributions stripped out).
shap_main = np.diagonal(interaction_vals, axis1=1, axis2=2)

# Beeswarm of the main effects only, saved at high resolution.
plt.figure(figsize=(10, 8), dpi=300)
shap.summary_plot(shap_main, X_test, max_display=25, title="SHAP Main Effects", show=False)
plt.savefig("./{}-main-effect.png".format(y_name), dpi=300)
plt.show()
-
为什么 SHAP 均值的正负和图中看起来的趋势不一致?对于取值仅为 0/1 且大部分样本为 0 的特征,summary_plot 中肉眼看到的趋势往往由少数取 1 的样本点主导,而(不取绝对值的)SHAP 均值则被大量取 0 的样本拉向另一侧,因此两者显示的影响方向可能不一致。
-
应该用训练数据还是测试数据去绘制 SHAP 图?一般情况下推荐用测试数据,但一定要保证训练、测试数据的标签分布一致;如果不一致,那么绘制出的图与训练过程中的表现会有明显出入。
代码实战
交互效应绘图代码:
# 6. 网络图可视化函数
def plot_feature_interaction_and_main_effects(
    interaction_matrix,
    main_effect_matrix,
    interaction_cmap=cm.Blues,
    main_effect_cmap=cm.Greens,
    title="Feature Interaction and Main Effects Network",
    output_path=None,
    background_circle_color='lightgray',
    node_size_scale=1000,
    edge_width_scale=5,
    min_edge_threshold=0.1
):
    """
    Draw a two-panel figure: feature main effects as a horizontal bar chart
    (left) and pairwise feature interactions as a circular network (right).

    Parameters
    ----------
    interaction_matrix : pandas.DataFrame
        Square feature-by-feature matrix of interaction strengths.
    main_effect_matrix : pandas.DataFrame
        Indexed by feature; must contain a ``main_effect`` column.
    interaction_cmap, main_effect_cmap : matplotlib colormaps
        Colors for interaction edges and for main-effect nodes/bars.
    title : str
        Overall figure title.
    output_path : str or None
        If given, the figure is saved to this path as PDF; otherwise it is
        shown interactively.
    background_circle_color : str
        Fill color of the circle drawn behind the network.
    node_size_scale : float
        Maximum node marker size after min-max normalization.
    edge_width_scale : float
        Maximum edge line width.
    min_edge_threshold : float
        Edges weaker than this fraction of the strongest interaction are hidden.
    """
    features = interaction_matrix.index.tolist()
    n_features = len(features)

    # Node size and color both encode the main effect, min-max normalized;
    # the epsilon guards against division by zero when all effects are equal.
    node_sizes = main_effect_matrix['main_effect'].values
    node_sizes_normalized = node_size_scale * (node_sizes - node_sizes.min()) / (node_sizes.max() - node_sizes.min() + 1e-8)
    node_colors_normalized = (node_sizes - node_sizes.min()) / (node_sizes.max() - node_sizes.min() + 1e-8)

    # Edge weights from the interaction matrix: drop self-loops, then hide
    # edges below the threshold fraction of the strongest interaction.
    edge_weights = interaction_matrix.values.copy()
    np.fill_diagonal(edge_weights, 0)
    edge_weights[edge_weights < min_edge_threshold * edge_weights.max()] = 0

    fig, (ax_main, ax_network) = plt.subplots(1, 2, figsize=(20, 8), gridspec_kw={'width_ratios': [1, 2]})

    # Left panel: main effects sorted strongest-first.
    sorted_idx = np.argsort(node_sizes)[::-1]
    sorted_features = [features[i] for i in sorted_idx]
    ax_main.barh(range(n_features), node_sizes[sorted_idx],
                 color=main_effect_cmap(node_colors_normalized[sorted_idx]))
    ax_main.set_yticks(range(n_features))
    ax_main.set_yticklabels(sorted_features)
    ax_main.set_xlabel('Main Effect (Absolute SHAP)')
    ax_main.set_title('Feature Main Effects')
    ax_main.invert_yaxis()  # strongest feature on top

    # Right panel: nodes evenly spaced on the unit circle.
    angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False)
    pos = {features[i]: (np.cos(angles[i]), np.sin(angles[i])) for i in range(n_features)}

    circle = plt.Circle((0, 0), 1.2, color=background_circle_color, alpha=0.3, fill=True)
    ax_network.add_patch(circle)

    # Edges: color intensity and width scale with interaction strength.
    max_edge_weight = edge_weights.max()
    for i in range(n_features):
        for j in range(i + 1, n_features):
            weight = edge_weights[i, j]
            if weight <= 0:
                continue  # filtered out by the threshold above
            color_intensity = weight / (max_edge_weight + 1e-8)
            x1, y1 = pos[features[i]]
            x2, y2 = pos[features[j]]
            ax_network.plot([x1, x2], [y1, y2],
                            color=interaction_cmap(color_intensity),
                            linewidth=edge_width_scale * color_intensity,
                            alpha=0.6)

    # Nodes drawn above the edges (zorder).
    scatter = ax_network.scatter(
        [pos[f][0] for f in features],
        [pos[f][1] for f in features],
        s=node_sizes_normalized,
        c=node_colors_normalized,
        cmap=main_effect_cmap,
        edgecolors='black',
        linewidths=1.5,
        zorder=10
    )

    # Feature labels, pushed slightly outside the unit circle to avoid overlap.
    for feature in features:
        x, y = pos[feature]
        offset = 1.15
        ax_network.text(x * offset, y * offset, feature,
                        ha='center', va='center',
                        fontsize=9, fontweight='bold')

    ax_network.set_xlim(-1.5, 1.5)
    ax_network.set_ylim(-1.5, 1.5)
    ax_network.set_aspect('equal')
    ax_network.axis('off')
    ax_network.set_title('Interaction Network')

    plt.colorbar(scatter, ax=ax_network, label='Main Effect Intensity')
    fig.suptitle(title, fontsize=16, fontweight='bold')
    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, format='pdf', bbox_inches='tight', dpi=1200)
        print(f"Plot saved to {output_path}")
    else:
        plt.show()
训练找最优模型
def run_shap_interaction_analysis(df, target_col, column_names, output_pdf):
    """
    Train a grid-searched XGBoost regressor, evaluate it on a held-out split,
    compute SHAP interaction values on the test set, and plot the
    main-effect / interaction network via plot_feature_interaction_and_main_effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the feature columns and the target column.
    target_col : str
        Name of the target column in ``df``.
    column_names : list of str
        Names of the feature columns to use as predictors.
    output_pdf : str
        Path where the interaction-network figure is saved (PDF).

    Returns
    -------
    tuple
        ``(best_model, X_train, X_test)`` — the fitted estimator and the
        train/test feature frames, handy for further SHAP plotting.
    """
    import pandas as pd
    import numpy as np
    import shap
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import warnings
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.metrics import mean_squared_error, r2_score
    from xgboost import XGBRegressor
    warnings.filterwarnings("ignore")

    # ======================
    # 1. Data preparation
    # ======================
    # Select features/target directly instead of writing a temporary
    # "label" column into the caller's DataFrame (the original mutated df).
    X = df[column_names]
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ======================
    # 2. XGBoost model + grid search
    # ======================
    xgb_model = XGBRegressor(
        random_state=42,
        eval_metric="rmse"
    )
    # NOTE: 4*5*4*3*3*3 = 2160 combinations x 5 folds — this search is expensive.
    param_grid = {
        'n_estimators': [40, 50, 100, 150],
        'max_depth': [3, 4, 5, 6, 7],
        'learning_rate': [0.001, 0.01, 0.05, 0.1],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2],
    }
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=5,
        n_jobs=16,
        scoring='neg_mean_squared_error',
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print("Best Params:", grid_search.best_params_)
    print("Best RMSE:", np.sqrt(-grid_search.best_score_))

    # ======================
    # 3. Hold-out evaluation
    # ======================
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print("Test RMSE:", rmse)
    print("Test R2:", r2)

    # ======================
    # 4. SHAP interaction values
    # ======================
    explainer = shap.TreeExplainer(best_model)
    # shape: (n_samples, n_features, n_features)
    shap_interaction_values = explainer.shap_interaction_values(X_test)

    # Main effects: diagonal of each per-sample interaction matrix,
    # mean absolute value over samples.
    main_effects = np.mean(
        np.abs(np.diagonal(shap_interaction_values, axis1=1, axis2=2)), axis=0
    )

    # Interaction effects: mean |value| per pair; zero the diagonal, then
    # double the off-diagonal terms to account for matrix symmetry.
    interaction_effects = np.mean(np.abs(shap_interaction_values), axis=0)
    np.fill_diagonal(interaction_effects, 0)
    interaction_effects = interaction_effects * 2

    feature_names = X.columns.tolist()
    main_effect_df = pd.DataFrame({
        "feature": feature_names,
        "main_effect": main_effects
    }).set_index("feature")
    interaction_df = pd.DataFrame(
        interaction_effects,
        index=feature_names,
        columns=feature_names
    )

    # ======================
    # 5. Network plot
    # ======================
    plot_feature_interaction_and_main_effects(
        interaction_matrix=interaction_df,
        main_effect_matrix=main_effect_df,
        interaction_cmap=cm.Greys,
        main_effect_cmap=cm.coolwarm,
        title=f"{target_col} Feature Interaction Network",
        output_path=output_pdf
    )

    # ======================
    # 6. Strongest interaction pairs (upper triangle only)
    # ======================
    interaction_pairs = [
        {
            "feature1": feature_names[i],
            "feature2": feature_names[j],
            "interaction": interaction_df.iloc[i, j],
        }
        for i in range(len(feature_names))
        for j in range(i + 1, len(feature_names))
    ]
    interaction_pairs_df = pd.DataFrame(interaction_pairs)\
        .sort_values("interaction", ascending=False)
    print("\nTop 10 Interaction Pairs:")
    print(interaction_pairs_df.head(10))

    # ======================
    # Return model and train/test data
    # ======================
    return best_model, X_train, X_test
调用
# Run the full pipeline: train, evaluate, compute SHAP interactions, plot.
best_model, X_train, X_test= run_shap_interaction_analysis(
df,
target_col=y_name,
column_names=column_names, # column_names is a list holding the names of the feature columns in df
output_pdf="{}.pdf".format(y_name) # save the figure to this path
)