SHAP分析

前言

存在的问题

是否分离主效应和交互效应？如果是关注单独特征的影响，应该分离主效应和交互效应去观察主效应

这是不分离主效应和交互效应的代码

explainer_test = shap.Explainer(best_model, X_test,)  # ✅ 推荐做法
shap_values_test = explainer_test(X_test,check_additivity=False)


import matplotlib.pyplot as plt
import shap
plt.figure(figsize=(10, 8), dpi=300)  # figsize 控制图像大小，dpi 控制分辨率
fig = shap.summary_plot(shap_values_test, X_test, title=y_name,max_display=25,show=False)
plt.savefig("./{}.png".format(y_name),dpi=300)
plt.show()

这是分离主效应和交互效应的结果

import shap
import numpy as np
import matplotlib.pyplot as plt

# explainer
explainer = shap.TreeExplainer(best_model)
# interaction values
shap_interaction_values = explainer.shap_interaction_values(X_test)

# main effects
shap_main = np.diagonal(shap_interaction_values, axis1=1, axis2=2)

# plot
plt.figure(figsize=(10,8), dpi=300)

shap.summary_plot(
    shap_main,
    X_test,
    max_display=25,
    title="SHAP Main Effects",
    show=False
)

plt.savefig("./{}-main-effect.png".format(y_name), dpi=300)
plt.show()

为什么 shap均值的正负和图中看起来的趋势不一致？对于特征分布为0 1 变量，且大部分分布为0，此时summary_plot与计算出的均值（不作绝对值）影响方向不一致，
应该用训练数据还是测试数据去绘制SHAP图？一般情况下推荐用测试数据，但一定要保证这训练测试数据的标签分布一致，如果不一致，那你绘制的图与训练过程的表现会有明显不一致

代码实战

交互效应绘图代码：

# 6. 网络图可视化函数
def plot_feature_interaction_and_main_effects(
    interaction_matrix,          # 交互作用矩阵
    main_effect_matrix,          # 主效应矩阵
    interaction_cmap=cm.Blues,   # 交互作用颜色映射
    main_effect_cmap=cm.Greens,  # 主效应颜色映射
    title="Feature Interaction and Main Effects Network",
    output_path=None,            # 输出路径
    background_circle_color='lightgray',  # 背景颜色
    node_size_scale=1000,        # 节点大小缩放
    edge_width_scale=5,          # 边宽度缩放
    min_edge_threshold=0.1       # 最小边显示阈值
):
    """
    绘制特征交互作用与主效应网络图
    """
    # 准备数据
    features = interaction_matrix.index.tolist()
    n_features = len(features)
    
    # 节点大小和颜色（基于主效应）
    node_sizes = main_effect_matrix['main_effect'].values
    node_colors = node_sizes
    
    # 归一化
    node_sizes_normalized = node_size_scale * (node_sizes - node_sizes.min()) / (node_sizes.max() - node_sizes.min() + 1e-8)
    node_colors_normalized = (node_colors - node_colors.min()) / (node_colors.max() - node_colors.min() + 1e-8)
    
    # 边的权重和颜色（基于交互作用）
    edge_weights = interaction_matrix.values.copy()
    np.fill_diagonal(edge_weights, 0)  # 移除自循环
    
    # 应用阈值过滤弱连接
    edge_weights[edge_weights < min_edge_threshold * edge_weights.max()] = 0
    
    # 创建图形
    fig, (ax_main, ax_network) = plt.subplots(1, 2, figsize=(20, 8), gridspec_kw={'width_ratios': [1, 2]})
    
    # 左图：主效应条形图
    sorted_idx = np.argsort(node_sizes)[::-1]
    sorted_features = [features[i] for i in sorted_idx]
    sorted_effects = node_sizes[sorted_idx]
    
    bars = ax_main.barh(range(n_features), sorted_effects, color=main_effect_cmap(node_colors_normalized[sorted_idx]))
    ax_main.set_yticks(range(n_features))
    ax_main.set_yticklabels(sorted_features)
    ax_main.set_xlabel('Main Effect (Absolute SHAP)')
    ax_main.set_title('Feature Main Effects')
    ax_main.invert_yaxis()
    
    # 右图：网络图
    # 创建圆形布局
    angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False).tolist()
    pos = {features[i]: (np.cos(angles[i]), np.sin(angles[i])) for i in range(n_features)}
    
    # 绘制背景圆形
    circle = plt.Circle((0, 0), 1.2, color=background_circle_color, alpha=0.3, fill=True)
    ax_network.add_patch(circle)
    
    # 绘制边
    edge_colors = []
    edge_widths = []
    max_edge_weight = edge_weights.max()
    
    for i in range(n_features):
        for j in range(i+1, n_features):
            if edge_weights[i, j] > 0:
                weight = edge_weights[i, j]
                # 边的颜色和宽度
                color_intensity = weight / (max_edge_weight + 1e-8)
                color = interaction_cmap(color_intensity)
                width = edge_width_scale * color_intensity
                
                # 绘制边
                x1, y1 = pos[features[i]]
                x2, y2 = pos[features[j]]
                ax_network.plot([x1, x2], [y1, y2], 
                              color=color, 
                              linewidth=width, 
                              alpha=0.6)
                
                edge_colors.append(color_intensity)
                edge_widths.append(width)
    
    # 绘制节点
    scatter = ax_network.scatter(
        [pos[f][0] for f in features],
        [pos[f][1] for f in features],
        s=node_sizes_normalized,
        c=node_colors_normalized,
        cmap=main_effect_cmap,
        edgecolors='black',
        linewidths=1.5,
        zorder=10
    )
    
    # 添加特征标签
    for feature in features:
        x, y = pos[feature]
        # 调整标签位置避免重叠
        offset = 1.15
        ax_network.text(x * offset, y * offset, feature, 
                       ha='center', va='center',
                       fontsize=9, fontweight='bold')
    
    # 设置图形属性
    ax_network.set_xlim(-1.5, 1.5)
    ax_network.set_ylim(-1.5, 1.5)
    ax_network.set_aspect('equal')
    ax_network.axis('off')
    ax_network.set_title('Interaction Network')
    
    # 添加颜色条
    plt.colorbar(scatter, ax=ax_network, label='Main Effect Intensity')
    
    # 整体标题
    fig.suptitle(title, fontsize=16, fontweight='bold')
    plt.tight_layout()
    
    # 保存或显示
    if output_path:
        plt.savefig(output_path, format='pdf', bbox_inches='tight', dpi=1200)
        print(f"Plot saved to {output_path}")
    else:
        plt.show()

训练找最优模型

def run_shap_interaction_analysis(df, target_col, column_names, output_pdf):

    import pandas as pd
    import numpy as np
    import shap
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import warnings

    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.metrics import mean_squared_error, r2_score
    from xgboost import XGBRegressor

    warnings.filterwarnings("ignore")

    # ======================
    # 1 数据准备
    # ======================
    df["label"] = df[target_col]

    X = df[column_names]
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ======================
    # 2 XGBoost模型
    # ======================
    xgb_model = XGBRegressor(
        random_state=42,
        eval_metric="rmse"
    )

    param_grid = {
        'n_estimators': [40,50,100,150],
        'max_depth': [3,4,5,6,7],
        'learning_rate': [0.001,0.01, 0.05, 0.1],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2],
    }

    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=5,
        n_jobs=16,
        scoring='neg_mean_squared_error',
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    print("Best Params:", grid_search.best_params_)
    print("Best RMSE:", np.sqrt(-grid_search.best_score_))

    # ======================
    # 3 测试集评估
    # ======================
    y_pred = best_model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Test RMSE:", rmse)
    print("Test R2:", r2)

    # ======================
    # 4 SHAP interaction
    # ======================
    explainer = shap.TreeExplainer(best_model)

    # =================================
    # 1. 计算 SHAP interaction values
    # =================================
    shap_interaction_values = explainer.shap_interaction_values(X_test)  # shape: (n_samples, n_features, n_features)
    
    # -------------------------
    # 2. 主效应（main effects）
    # -------------------------
    main_effects = np.mean(np.abs(np.diagonal(shap_interaction_values, axis1=1, axis2=2)), axis=0)
    
    # -------------------------
    # 3. 交互效应（interaction effects）
    # -------------------------
    interaction_effects = np.mean(np.abs(shap_interaction_values), axis=0)
    np.fill_diagonal(interaction_effects, 0)  # 对角线置零
    interaction_effects = interaction_effects * 2  # 补偿对称矩阵的另一半
    
    feature_names = X.columns.tolist()

    main_effect_df = pd.DataFrame({
        "feature": feature_names,
        "main_effect": main_effects
    }).set_index("feature")

    interaction_df = pd.DataFrame(
        interaction_effects,
        index=feature_names,
        columns=feature_names
    )

    # ======================
    # 5 绘图
    # ======================
    plot_feature_interaction_and_main_effects(
        interaction_matrix=interaction_df,
        main_effect_matrix=main_effect_df,
        interaction_cmap=cm.Greys,
        main_effect_cmap=cm.coolwarm,
        title=f"{target_col} Feature Interaction Network",
        output_path=output_pdf
    )

    # ======================
    # 6 Top interaction
    # ======================
    interaction_pairs = []

    for i in range(len(feature_names)):
        for j in range(i+1, len(feature_names)):
            interaction_pairs.append({
                "feature1": feature_names[i],
                "feature2": feature_names[j],
                "interaction": interaction_df.iloc[i, j]
            })

    interaction_pairs_df = pd.DataFrame(interaction_pairs)\
        .sort_values("interaction", ascending=False)

    print("\nTop 10 Interaction Pairs:")
    print(interaction_pairs_df.head(10))

    # ======================
    # 返回模型与测试数据
    # ======================
    return best_model,  X_train, X_test

调用

best_model,  X_train, X_test= run_shap_interaction_analysis(
    df,
    target_col=y_name,
    column_names=column_names, # column_names 是一个列表 存储df中的各个特征的名字
    output_pdf="{}.pdf".format(y_name) # 保存图像到指定路径
)

上篇WHO自杀预防建议

下篇Linux常用命令