Skip to content

Efficiency global

import effector
import numpy as np
import timeit
import time
import matplotlib.pyplot as plt
np.random.seed(21)
def return_predict(t):
    def predict(x):
        time.sleep(t)
        model = effector.models.DoubleConditionalInteraction()
        return model.predict(x)
    return predict

def return_jacobian(t):
    def jacobian(x):
        time.sleep(t)
        model = effector.models.DoubleConditionalInteraction()
        return model.jacobian(x)
    return jacobian
def measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=None,):
    fit_time_list, eval_time_list = [], []
    X = np.random.uniform(-1, 1, (N, D))
    xx = np.linspace(-1, 1, M)
    axis_limits = np.array([[-1] * D, [1] * D])

    method_map = {
        "pdp": effector.PDP,
        "d_pdp": effector.DerPDP,
        "ale": effector.ALE,
        "rhale": effector.RHALE,
        "shap_dp": effector.ShapDP
    }

    for _ in range(repetitions):
        # general kwargs
        method_kwargs = {"data": X, "model": model, "axis_limits": axis_limits, "nof_instances": "all"}
        fit_kwargs = {"features": features, "centering": True, "points_for_centering": K}

        # specialize kwargs per method
        if method_name in ["d_pdp", "rhale"]:
            method_kwargs["model_jac"] = model_jac
        if method_name in ["rhale", "ale"]:
            fit_kwargs["binning_method"] = effector.axis_partitioning.Fixed(nof_bins=20)

        # init
        method = method_map[method_name](**method_kwargs)

        # fit
        tic = time.time()
        method.fit(**fit_kwargs)
        fit_time_list.append(time.time() - tic)

        # eval
        tic = time.time()
        for feat in features:
            eval_kwargs = {"feature": feat, "xs": xx, "centering": True, "heterogeneity": True}
            method.eval(**eval_kwargs)
        eval_time_list.append(time.time() - tic)

    return {"fit": np.mean(fit_time_list), "eval": np.mean(eval_time_list), "total": (np.mean(fit_time_list) + np.mean(eval_time_list))}
import matplotlib.pyplot as plt

def bar_plot(xs, time_dict, methods, metric, title, xlabel, ylabel, bar_width=0.02):

    bar_width = (np.max(xs) - np.min(xs)) / 40
    method_to_label = {"ale": "ALE", "rhale": "RHALE", "pdp": "PDP", "d_pdp": "d-pdp", "shap_dp": "SHAP DP"}
    plt.figure()

    # Calculate the offsets for each bar group
    offsets = np.linspace(-2*bar_width, 2*bar_width, len(methods))

    for i, method in enumerate(methods):
        label = method_to_label[method]
        plt.bar(
            xs + offsets[i],
            [tt[metric] for tt in time_dict[method]],
            label=label,
            width=bar_width
        )

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(xs)
    plt.legend()
    plt.show()

\(T_1\): runtime vs N

For one feature

t = 0.001
N = 10_000
D = 3
K = 100
M = 100
repetitions = 2
features=[0]
method_names = ["ale", "rhale", "pdp", "d_pdp"]
vec = np.array([10_000, 25_000, 50_000])
time_dict = {method_name: [] for method_name in method_names}
for N in vec:
    model = return_predict(t)
    model_jac = return_jacobian(t)
    for method_name in method_names:
        time_dict[method_name].append(measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=model_jac))
for metric in ["total"]: # ["fit", "eval", "total"]:
    if metric in ["fit", "eval"]:
        title = "Runtime: ." + metric + "() -- single feature"
    else:
        title = "Runtime: .fit() + .eval() -- single feature"

    bar_plot(
        vec, 
        time_dict, 
        method_names,
        metric=metric,
        title=title,
        xlabel="N: number of instances",
        ylabel="time (sec)"
)

png

For all features

features=[i for i in range(D)]
method_names = ["ale", "rhale", "pdp", "d_pdp"]
vec = np.array([10_000, 25_000, 50_000])
time_dict = {method_name: [] for method_name in method_names}
for N in vec:
    model = return_predict(t)
    model_jac = return_jacobian(t)
    for method_name in method_names:
        time_dict[method_name].append(measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=model_jac))
for metric in ["total"]: # ["fit", "eval", "total"]:
    if metric in ["fit", "eval"]:
        title = "Runtime: ." + metric + "() -- single feature"
    else:
        title = "Runtime: .fit() + .eval() -- single feature"

    bar_plot(
        vec, 
        time_dict, 
        method_names,
        metric=metric,
        title=title,
        xlabel="N: number of instances",
        ylabel="time (sec)"
)

png

Conclusion

Method .fit() .eval() \(T_1\) (single feature) \(T_1\) (all features)
PDP / d-PDP \(c_1 N\) \(c_2 N\) \((c_1 + c_2) N\) \(D (c_1 + c_2) N\)
ALE \(\epsilon\) Free \(\epsilon\) \(D \epsilon \approx 0\)
RHALE \(\epsilon\) Free \(\epsilon\) \(D \epsilon \approx 0\)

Here, \(c_1\) and \(c_2\) are small but nonzero, meaning the runtime scales linearly with \(N\) but remains low. In contrast, \(\epsilon\) is extremely small, making ALE and RHALE effectively free in practice.

\(T_2\): runtime vs. \(t_f\):

To isolate the impact of \(t_f\), we reduce \(N\) to a small value. This assumes that the execution time of \(f(X)\) remains constant regardless of the dataset size \(X\). While this is not always true in general, it is a reasonable assumption for many ML models with vectorized implementations, as long as \(f(X)\) can be computed in a single pass.

For a single feature

t = 0.001
N = 1_000
D = 3
K = 100
M = 100
repetitions = 2
features=[0]
method_names = ["ale", "rhale", "pdp", "d_pdp"]
vec = np.array([.1, .5, 1.])
time_dict = {method_name: [] for method_name in method_names}
for t in vec:
    model = return_predict(t)
    model_jac = return_jacobian(t)
    for method_name in method_names:
        time_dict[method_name].append(measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=model_jac))
for metric in ["total"]: # ["fit", "eval", "total"]:
    if metric in ["fit", "eval"]:
        title = "Runtime: ." + metric + "() -- single feature"
    else:
        title = "Runtime: .fit() + .eval() -- single feature"

    bar_plot(
        vec, 
        time_dict, 
        method_names,
        metric=metric,
        title=title,
        xlabel="time (sec) to execute f(dataset)",
        ylabel="time (sec)"
)

png

For all features

t = 0.1
N = 10_000
D = 3
K = 100
M = 100
repetitions = 2
features=[i for i in range(D)]
method_names = ["ale", "rhale", "pdp", "d_pdp"]
vec = np.array([.1, .5, 1.])
time_dict = {method_name: [] for method_name in method_names}
for t in vec:
    model = return_predict(t)
    model_jac = return_jacobian(t)
    for method_name in method_names:
        time_dict[method_name].append(measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=model_jac))
for metric in ["total"]: #["fit", "eval", "total"]:
    if metric in ["fit", "eval"]:
        title = "Runtime: ." + metric + "() -- all features"
    else:
        title = "Runtime: .fit() + .eval() -- all features"

    bar_plot(
        vec, 
        time_dict, 
        method_names,
        metric=metric,
        title=title,
        xlabel="time (sec) to execute f(dataset)",
        ylabel="time (sec)"
)

png

Conclusion

Method .fit() .eval() \(T_2\) (one feature) \(T_2\) (all features)
PDP / d-PDP \(t_f\) \(t_f\) \(2t_f\) \(2Dt_f\)
ALE \(2t_f\) Free \(2t_f\) \(2Dt_f\)
RHALE \(t_f\) Free \(t_f\) \(t_f\)

Total Runtime

Adding the two parts, we have the total runtime:

Method \(T = T_1 + T_2\) (one feature) \(T = T_1 + T_2\) (all features)
PDP / d-PDP \((c_1 + c_2) N + 2 t_f\) \(D (c_1 + c_2) N + 2 D t_f\)
ALE \(2 t_f\) \(2 D t_f\)
RHALE \(t_f\) \(t_f\)

SHAP-DP

SHAP-DP is a much slower method, compared to the others. Let's see how it scales with \(N\) and \(t_f\).

t = 0.1
N = 10_000
D = 3
K = 100
M = 100
features = [0]
repetitions = 2
method_names = ["shap_dp"]
vec = np.array([10, 50, 100, 200])
time_dict = {method_name: [] for method_name in method_names}
for N in vec:
    model = return_predict(t)
    model_jac = return_jacobian(t)
    for method_name in method_names:
        time_dict[method_name].append(
            measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=model_jac))
ExactExplainer explainer: 101it [00:20,  2.49it/s]                         
ExactExplainer explainer: 101it [00:20,  2.49it/s]                         
ExactExplainer explainer: 201it [00:40,  3.73it/s]                         
ExactExplainer explainer: 201it [00:40,  3.73it/s]
plt.figure()
plt.plot(
    vec,
    [tt["total"] for tt in time_dict["shap_dp"]],
    "o-",
)
plt.title("Runtime: SHAP DP")
plt.xlabel("N: number of instances")
plt.ylabel("time (sec)")
plt.xticks(vec)
plt.show()
/tmp/ipykernel_831737/3543727158.py:11: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  plt.legend()

png

t = 0.1
N = 50
D = 3
K = 100
M = 100
features = [0]
repetitions = 2
# compare with t_f
method_names = ["shap_dp"]
vec = np.array([.01, .1, .5, 1.])
time_dict = {method_name: [] for method_name in method_names}
for t in vec:
    model = return_predict(t)
    model_jac = return_jacobian(t)
    for method_name in method_names:
        time_dict[method_name].append(
            measure_time(method_name, features, model, N, M, D, repetitions, K, model_jac=model_jac))
ExactExplainer explainer: 51it [00:49,  1.24s/it]                        
ExactExplainer explainer: 51it [00:49,  1.24s/it]                        
ExactExplainer explainer: 51it [01:39,  2.20s/it]                        
ExactExplainer explainer: 51it [01:39,  2.20s/it]
plt.figure()
plt.plot(
    vec,
    [tt["total"] for tt in time_dict["shap_dp"]],
    "o-",
)
plt.title("Runtime: SHAP DP")
plt.xlabel("time (sec) to execute f(dataset)")
plt.ylabel("time (sec)")
plt.xticks(vec)
plt.show()

png

So if we add shap-DP to the table, we have:

Method \(T = T_1 + T_2\) (one feature) \(T = T_1 + T_2\) (all features)
PDP / d-PDP \((c_{PDP}) N + 2 t_f\) \(D c_{PDP} N + 2 D t_f\)
ALE \(2 t_f\) \(2 D t_f\)
RHALE \(t_f\) \(t_f\)
SHAP-DP \(c_{SHAP-DP} N t_f\) \(c_{SHAP-DP} D N t_f\)

But \(c_{SHAP-DP}\) is a large constant $c_{SHAP-DP} \approx 2 $. In contrast, \(c_{PDP} \approx 10^{-5}\).