
API reference

Feature Effect Methods

effector.global_effect_ale.ALEBase

Bases: GlobalEffectBase

Source code in effector/global_effect_ale.py
class ALEBase(GlobalEffectBase):

    def __init__(
            self,
            data: np.ndarray,
            model: callable,
            nof_instances: Union[int, str] = "all",
            axis_limits: Optional[np.ndarray] = None,
            avg_output: Optional[float] = None,
            feature_names: Optional[List] = None,
            target_name: Optional[str] = None,
            method_name: str = "ALE",
    ):
        self.method_name = method_name
        super(ALEBase, self).__init__(
            method_name,
            data,
            model,
            nof_instances,
            axis_limits,
            avg_output,
            feature_names,
            target_name
        )

    @abstractmethod
    def _fit_feature(self,
                     feature: int,
                     binning_method: typing.Union[str, bm.DynamicProgramming, bm.Greedy, bm.Fixed] = "greedy"
                     ) -> typing.Dict:
        raise NotImplementedError

    @abstractmethod
    def fit(self,
            features: typing.Union[int, str, list] = "all",
            **kwargs) -> None:
        raise NotImplementedError

    def _compute_norm_const(
        self, feature: int, method: str = "zero_integral", nof_points: int = 100
    ) -> float:
        """Compute the normalization constant."""
        assert method in ["zero_integral", "zero_start"]

        def create_partial_eval(feat):
            return lambda x: self._eval_unnorm(feat, x, heterogeneity=False)

        partial_eval = create_partial_eval(feature)
        start = self.axis_limits[0, feature]
        stop = self.axis_limits[1, feature]

        if method == "zero_integral":
            z = utils_integrate.mean_1d_linspace(partial_eval, start, stop, nof_points)
        else:
            z = partial_eval(np.array([start])).item()
        return z

    def _fit_loop(self, features, binning_method, centering):
        features = helpers.prep_features(features, self.dim)
        centering = helpers.prep_centering(centering)
        for s in features:
            # compute all information required for plotting and evaluating the feature effect
            self.feature_effect["feature_" + str(s)] = self._fit_feature(
                s, binning_method
            )

            # append the "norm_const" to the feature effect if centering is not False
            if centering is not False:
                self.feature_effect["feature_" + str(s)]["norm_const"] = self._compute_norm_const(s, method=centering)
            else:
                self.feature_effect["feature_" + str(s)]["norm_const"] = self.empty_symbol

            self.is_fitted[s] = True
            self.method_args["feature_" + str(s)] = {
                "centering": centering,
            }

    def _eval_unnorm(self, feature: int, x: np.ndarray, heterogeneity: bool = False):
        params = self.feature_effect["feature_" + str(feature)]
        y = utils.compute_accumulated_effect(
            x, limits=params["limits"], bin_effect=params["bin_effect"], dx=params["dx"]
        )
        if heterogeneity:
            std = utils.compute_accumulated_effect(
                x,
                limits=params["limits"],
                bin_effect=np.sqrt(params["bin_variance"]),
                dx=params["dx"],
            )

            return y, std
        else:
            return y

    def eval(
        self,
        feature: int,
        xs: np.ndarray,
        heterogeneity: bool = False,
        centering: typing.Union[bool, str] = False,
    ) -> typing.Union[np.ndarray, typing.Tuple[np.ndarray, np.ndarray]]:
        """Evalueate the (RH)ALE feature effect of feature `feature` at points `xs`.

        Notes:
            This is a common method inherited by both ALE and RHALE.

        Args:
            feature: index of feature of interest
            xs: the points along the s-th axis to evaluate the FE plot
              - `np.ndarray` of shape `(T, )`
            heterogeneity: whether to return heterogeneity:

                  - `False`, returns the mean effect `y` at the given `xs`
                  - `True`, returns a tuple `(y, H)` of two `ndarrays`; `y` is the mean effect and `H` is the
                  heterogeneity evaluated at `xs`

            centering: whether to center the plot:

                - `False` means no centering
                - `True` or `zero_integral` centers around the `y` axis.
                - `zero_start` starts the plot from `y=0`.
        Returns:
            the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise

        """
        centering = helpers.prep_centering(centering)

        if self.refit(feature, centering):
            self.fit(features=feature, centering=centering)

        # Check if the lower bound is less than the upper bound
        assert self.axis_limits[0, feature] < self.axis_limits[1, feature]

        # Evaluate the feature
        yy = self._eval_unnorm(feature, xs, heterogeneity=heterogeneity)
        y, std = yy if heterogeneity else (yy, None)

        # Center if asked
        y = (
            y - self.feature_effect["feature_" + str(feature)]["norm_const"]
            if centering
            else y
        )

        return (y, std) if heterogeneity is not False else y

    def plot(
            self,
            feature: int,
            heterogeneity: bool = False,
            centering: Union[bool, str] = False,
            scale_x: Optional[dict] = None,
            scale_y: Optional[dict] = None,
            show_avg_output: bool = False,
            y_limits: Optional[List] = None,
            dy_limits: Optional[List] = None
    ):
        """
        Plot the (RH)ALE feature effect of feature `feature`.

        Notes:
            This is a common method inherited by both ALE and RHALE.

        Parameters:
            feature: the feature to plot
            heterogeneity: whether to plot the heterogeneity

                  - `False`, plots only the mean effect
                  - `True`, the std of the bin-effects will be plotted using a red vertical bar

            centering: whether to center the plot:

                - `False` means no centering
                - `True` or `zero_integral` centers around the `y` axis.
                - `zero_start` starts the plot from `y=0`.

            scale_x: None or Dict with keys ['std', 'mean']

                - If set to None, no scaling will be applied.
                - If set to a dict, the x-axis will be scaled by the standard deviation and the mean.
            scale_y: None or Dict with keys ['std', 'mean']

                - If set to None, no scaling will be applied.
                - If set to a dict, the y-axis will be scaled by the standard deviation and the mean.
            show_avg_output: if True, the average output will be shown as a horizontal line.
            y_limits: None or tuple, the limits of the y-axis

                - If set to None, the limits of the y-axis are set automatically
                - If set to a tuple, the limits are manually set

            dy_limits: None or tuple, the limits of the dy-axis

                - If set to None, the limits of the dy-axis are set automatically
                - If set to a tuple, the limits are manually set
        """
        heterogeneity = helpers.prep_confidence_interval(heterogeneity)
        centering = helpers.prep_centering(centering)

        # hack to fit the feature if not fitted
        self.eval(
            feature, np.array([self.axis_limits[0, feature]]), centering=centering
        )

        if show_avg_output:
            avg_output = helpers.prep_avg_output(self.data, self.model, self.avg_output, scale_y)
        else:
            avg_output = None

        vis.ale_plot(
            self.feature_effect["feature_" + str(feature)],
            self.eval,
            feature,
            centering=centering,
            error=heterogeneity,
            scale_x=scale_x,
            scale_y=scale_y,
            title=self.method_name + " plot",
            avg_output=avg_output,
            feature_names=self.feature_names,
            target_name=self.target_name,
            y_limits=y_limits,
            dy_limits=dy_limits
        )

eval(feature, xs, heterogeneity=False, centering=False)

Evaluate the (RH)ALE feature effect of feature `feature` at points `xs`.

Notes

This is a common method inherited by both ALE and RHALE.

Parameters:

- `feature` (`int`, required): index of feature of interest
- `xs` (`np.ndarray`, required): the points along the s-th axis to evaluate the FE plot; `np.ndarray` of shape `(T, )`
- `heterogeneity` (`bool`, default `False`): whether to return heterogeneity:
  • `False` returns the mean effect `y` at the given `xs`
  • `True` returns a tuple `(y, H)` of two `ndarrays`, where `y` is the mean effect and `H` is the heterogeneity evaluated at `xs`
- `centering` (`bool` or `str`, default `False`): whether to center the plot:
  • `False` means no centering
  • `True` or `zero_integral` centers around the `y` axis
  • `zero_start` starts the plot from `y=0`

Returns:

- `np.ndarray` or `Tuple[np.ndarray, np.ndarray]`: the mean effect `y` if `heterogeneity=False` (default), or a tuple `(y, std)` otherwise
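
Example: a minimal usage sketch of `eval`, using the concrete `ALE` subclass documented below. The toy dataset and model are illustrative, not part of the API:

import numpy as np
from effector.global_effect_ale import ALE

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(1000, 2))

def model(x):
    # toy black-box model: f(x) = x_0^2 + x_0 * x_1
    return x[:, 0] ** 2 + x[:, 0] * x[:, 1]

ale = ALE(data=X, model=model)

# mean effect of feature 0 on a grid of 50 points
xs = np.linspace(-1, 1, 50)
y = ale.eval(feature=0, xs=xs, centering="zero_integral")

# mean effect together with the heterogeneity
y, std = ale.eval(feature=0, xs=xs, heterogeneity=True)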


plot(feature, heterogeneity=False, centering=False, scale_x=None, scale_y=None, show_avg_output=False, y_limits=None, dy_limits=None)

Plot the (RH)ALE feature effect of feature `feature`.

Notes

This is a common method inherited by both ALE and RHALE.

Parameters:

- `feature` (`int`, required): the feature to plot
- `heterogeneity` (`bool`, default `False`): whether to plot the heterogeneity:
  • `False` plots only the mean effect
  • `True` plots the std of the bin-effects as a red vertical bar
- `centering` (`bool` or `str`, default `False`): whether to center the plot:
  • `False` means no centering
  • `True` or `zero_integral` centers around the `y` axis
  • `zero_start` starts the plot from `y=0`
- `scale_x` (`Optional[dict]`, default `None`): `None` or a dict with keys `['std', 'mean']`:
  • if `None`, no scaling is applied
  • if a dict, the x-axis is scaled by the standard deviation and the mean
- `scale_y` (`Optional[dict]`, default `None`): `None` or a dict with keys `['std', 'mean']`:
  • if `None`, no scaling is applied
  • if a dict, the y-axis is scaled by the standard deviation and the mean
- `show_avg_output` (`bool`, default `False`): if `True`, the average output is shown as a horizontal line
- `y_limits` (`Optional[List]`, default `None`): the limits of the y-axis:
  • if `None`, the limits are set automatically
  • if a tuple, the limits are set manually
- `dy_limits` (`Optional[List]`, default `None`): the limits of the dy-axis:
  • if `None`, the limits are set automatically
  • if a tuple, the limits are set manually
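
Example: typical `plot` calls, continuing the `ale` object from the `eval` example above (the y-axis limits are illustrative):

# mean effect only
ale.plot(feature=0)

# centered plot with heterogeneity bars and manual y-axis limits
ale.plot(
    feature=0,
    heterogeneity=True,
    centering="zero_integral",
    y_limits=[-1.5, 1.5],
)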

effector.global_effect_ale.ALE

Bases: ALEBase

Source code in effector/global_effect_ale.py
class ALE(ALEBase):
    def __init__(
            self,
            data: np.ndarray,
            model: callable,
            nof_instances: Union[int, str] = "all",
            axis_limits: Optional[np.ndarray] = None,
            avg_output: Optional[float] = None,
            feature_names: Optional[List] = None,
            target_name: Optional[str] = None,
    ):
        """
        Constructor for the ALE plot.

        Definition:
            ALE is defined as:
            $$
            \hat{f}^{ALE}(x_s) = TODO
            $$

            The heterogeneity is:
            $$
            TODO
            $$

            The std of the bin-effects is:
            $$
            TODO
            $$

        Notes:
            - The required parameters are `data` and `model`. The rest are optional.

        Args:
            data: the design matrix

                - shape: `(N,D)`
            model: the black-box model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, )`

            nof_instances: the number of instances to use for the explanation

                - use an `int`, to specify the number of instances
                - use `"all"`, to use all the instances

            axis_limits: The limits of the feature effect plot along each axis

                - use a `ndarray` of shape `(2, D)`, to specify them manually
                - use `None`, to be inferred from the data

            avg_output: the average output of the model on the data

                - use a `float`, to specify it manually
                - use `None`, to be inferred as `np.mean(model(data))`

            feature_names: The names of the features

                - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
                - use `None`, to keep the default names: `["x_0", "x_1", ...]`

            target_name: The name of the target variable

                - use a `str`, to specify its name manually. For example: `"price"`
                - use `None`, to keep the default name: `"y"`
        """
        super(ALE, self).__init__(
            data, model, nof_instances, axis_limits, avg_output, feature_names, target_name, "ALE"
        )

    def _fit_feature(self, feature: int, binning_method="fixed") -> typing.Dict:

        # drop points outside of limits
        ind = np.logical_and(
            self.data[:, feature] >= self.axis_limits[0, feature],
            self.data[:, feature] <= self.axis_limits[1, feature],
        )
        data = self.data[ind, :]

        # assertion
        assert binning_method == "fixed" or isinstance(
            binning_method, bm.Fixed
        ), "ALE can work only with the fixed binning method!"

        if isinstance(binning_method, str):
            binning_method = bm.Fixed(nof_bins=20, min_points_per_bin=0)
        bin_est = bm.find_limits(data, None, feature, self.axis_limits, binning_method)
        bin_name = bin_est.__class__.__name__

        # assert bins can be computed else raise error
        assert bin_est.limits is not False, (
            "Impossible to compute bins with enough points for feature "
            + str(feature + 1)
            + " and binning strategy: "
            + bin_name
            + ". Change bin strategy or "
            "the parameters of the method"
        )

        # compute data effect on bin limits
        data_effect = utils.compute_local_effects(
            data, self.model, bin_est.limits, feature
        )

        # compute the bin effect
        dale_params = utils.compute_ale_params(
            data[:, feature], data_effect, bin_est.limits
        )
        dale_params["alg_params"] = "fixed"
        return dale_params

    def fit(
        self,
        features: typing.Union[int, str, list] = "all",
        binning_method: typing.Union[str, bm.Fixed] = "fixed",
        centering: typing.Union[bool, str] = "zero_integral",
    ) -> None:
        """Fit the ALE plot.

        Args:
            features: the features to fit. If set to "all", all the features will be fitted.

            binning_method:

                - If set to `"fixed"`, the ALE plot will be computed with the default values, which are
                `20` bins with at least `10` points per bin, and the feature is considered categorical if it has
                fewer than `15` unique values.
                - If you want to change the parameters of the method, you pass an instance of the
                class `effector.binning_methods.Fixed` with the desired parameters.
                For example: `Fixed(nof_bins=20, min_points_per_bin=0, cat_limit=10)`

            centering: whether to compute the normalization constant for centering the plot:

                - `False` means no centering
                - `True` or `zero_integral` centers around the `y` axis.
                - `zero_start` starts the plot from `y=0`.
        """
        assert binning_method == "fixed" or isinstance(
            binning_method, bm.Fixed
        ), "ALE can work only with the fixed binning method!"

        self._fit_loop(features, binning_method, centering)

__init__(data, model, nof_instances='all', axis_limits=None, avg_output=None, feature_names=None, target_name=None)

Constructor for the ALE plot.

Definition

ALE is defined as: $$ \hat{f}^{ALE}(x_s) = TODO $$

The heterogeneity is: $$ TODO $$

The std of the bin-effects is: $$ TODO $$

Notes
  • The required parameters are data and model. The rest are optional.

Parameters:

- `data` (`np.ndarray`, required): the design matrix
  • shape: `(N, D)`
- `model` (`callable`, required): the black-box model; must be a `Callable` with:
  • input: `ndarray` of shape `(N, D)`
  • output: `ndarray` of shape `(N, )`
- `nof_instances` (`int` or `str`, default `"all"`): the number of instances to use for the explanation:
  • use an `int` to specify the number of instances
  • use `"all"` to use all the instances
- `axis_limits` (`Optional[np.ndarray]`, default `None`): the limits of the feature effect plot along each axis:
  • use a `ndarray` of shape `(2, D)` to specify them manually
  • use `None` to infer them from the data
- `avg_output` (`Optional[float]`, default `None`): the average output of the model on the data:
  • use a `float` to specify it manually
  • use `None` to infer it as `np.mean(model(data))`
- `feature_names` (`Optional[List]`, default `None`): the names of the features:
  • use a `list` of `str` to specify them manually, e.g. `["age", "weight", ...]`
  • use `None` to keep the default names: `["x_0", "x_1", ...]`
- `target_name` (`Optional[str]`, default `None`): the name of the target variable:
  • use a `str` to specify its name manually, e.g. `"price"`
  • use `None` to keep the default name: `"y"`
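
Example: constructing an `ALE` object with the optional arguments set explicitly. The dataset, model, and names are illustrative:

import numpy as np
from effector.global_effect_ale import ALE

np.random.seed(0)
X = np.random.uniform(0, 1, size=(500, 3))

def model(x):
    return 2 * x[:, 0] + x[:, 1] ** 2 - x[:, 2]

ale = ALE(
    data=X,
    model=model,
    nof_instances="all",
    axis_limits=np.array([[0.0] * 3, [1.0] * 3]),  # shape (2, D): row 0 is the lower, row 1 the upper limit
    feature_names=["age", "weight", "height"],
    target_name="price",
)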

fit(features='all', binning_method='fixed', centering='zero_integral')

Fit the ALE plot.

Parameters:

- `features` (`int`, `str`, or `list`, default `"all"`): the features to fit; if set to `"all"`, all the features will be fitted
- `binning_method` (`str` or `bm.Fixed`, default `"fixed"`):
  • if set to `"fixed"`, the ALE plot will be computed with the default values: `20` bins with at least `10` points per bin, and the feature is considered categorical if it has fewer than `15` unique values
  • to change the parameters of the method, pass an instance of `effector.binning_methods.Fixed` with the desired parameters, e.g. `Fixed(nof_bins=20, min_points_per_bin=0, cat_limit=10)`
- `centering` (`bool` or `str`, default `"zero_integral"`): whether to compute the normalization constant for centering the plot:
  • `False` means no centering
  • `True` or `zero_integral` centers around the `y` axis
  • `zero_start` starts the plot from `y=0`
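
Example: fitting with the default fixed binning and with a customized `Fixed` object, continuing the `ale` object from the constructor example above (the `Fixed` call mirrors the one quoted in the docstring):

from effector import binning_methods

# default: fixed binning with 20 bins
ale.fit(features="all")

# custom fixed binning for features 0 and 1, with zero-start centering
ale.fit(
    features=[0, 1],
    binning_method=binning_methods.Fixed(nof_bins=20, min_points_per_bin=0, cat_limit=10),
    centering="zero_start",
)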

effector.global_effect_ale.RHALE

Bases: ALEBase

Source code in effector/global_effect_ale.py
class RHALE(ALEBase):
    def __init__(
            self,
            data: np.ndarray,
            model: callable,
            model_jac: typing.Union[None, callable] = None,
            nof_instances: typing.Union[int, str] = "all",
            axis_limits: typing.Optional[np.ndarray] = None,
            data_effect: typing.Optional[np.ndarray] = None,
            avg_output: typing.Optional[float] = None,
            feature_names: typing.Optional[list] = None,
            target_name: typing.Optional[str] = None,
    ):
        """
        Constructor for RHALE.

        Definition:
            RHALE is defined as:
            $$
            \hat{f}^{RHALE}(x_s) = TODO
            $$

            The heterogeneity is:
            $$
            TODO
            $$

            The std of the bin-effects is:
            $$
            TODO
            $$

        Notes:
            The required parameters are `data` and `model`. The rest are optional.

        Args:
            data: the design matrix

                - shape: `(N,D)`
            model: the black-box model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, )`

            model_jac: the Jacobian of the model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, D)`

            nof_instances: the number of instances to use for the explanation

                - use an `int`, to specify the number of instances
                - use `"all"`, to use all the instances

            axis_limits: The limits of the feature effect plot along each axis

                - use a `ndarray` of shape `(2, D)`, to specify them manually
                - use `None`, to be inferred from the data

            data_effect:
                - if np.ndarray, the model Jacobian computed on the `data`
                - if None, the Jacobian will be computed using model_jac

            avg_output: the average output of the model on the data

                - use a `float`, to specify it manually
                - use `None`, to be inferred as `np.mean(model(data))`

            feature_names: The names of the features

                - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
                - use `None`, to keep the default names: `["x_0", "x_1", ...]`

            target_name: The name of the target variable

                - use a `str`, to specify its name manually. For example: `"price"`
                - use `None`, to keep the default name: `"y"`
        """
        self.model_jac = model_jac

        # select nof_instances from the data
        nof_instances, indices = helpers.prep_nof_instances(nof_instances, data.shape[0])
        data = data[indices, :]
        data_effect = data_effect[indices, :] if data_effect is not None else None
        self.data_effect = data_effect

        super(RHALE, self).__init__(
            data, model, "all", axis_limits, avg_output, feature_names, target_name, "RHALE"
        )

    def compile(self):
        """Prepare everything for fitting, i.e., compute the gradients on data points.
        """
        if self.data_effect is None and self.model_jac is not None:
            self.data_effect = self.model_jac(self.data)
        elif self.data_effect is None and self.model_jac is None:
            self.data_effect = utils.compute_jacobian_numerically(self.model, self.data)

    def _fit_feature(
            self,
            feature: int,
            binning_method: Union[str, bm.DynamicProgramming, bm.Greedy, bm.Fixed] = "greedy"
    ) -> typing.Dict:
        if self.data_effect is None:
            self.compile()

        # drop points outside of limits
        ind = np.logical_and(
            self.data[:, feature] >= self.axis_limits[0, feature],
            self.data[:, feature] <= self.axis_limits[1, feature],
        )
        data = self.data[ind, :]
        data_effect = self.data_effect[ind, :]

        # bin estimation
        bin_est = bm.find_limits(
            data, data_effect, feature, self.axis_limits, binning_method
        )
        bin_name = bin_est.__class__.__name__

        # assert bins can be computed else raise error
        assert bin_est.limits is not False, (
            "Impossible to compute bins with enough points for feature "
            + str(feature + 1)
            + " and binning strategy: "
            + bin_name
            + ". Change bin strategy or "
            "the parameters of the method"
        )

        # compute the bin effect
        dale_params = utils.compute_ale_params(
            data[:, feature], data_effect[:, feature], bin_est.limits
        )
        dale_params["alg_params"] = binning_method
        return dale_params

    def fit(
        self,
        features: typing.Union[int, str, list] = "all",
        binning_method: typing.Union[str, bm.DynamicProgramming, bm.Greedy, bm.Fixed] = "greedy",
        centering: typing.Union[bool, str] = False,
    ) -> None:
        """Fit the model.

        Args:
            features (int, str, list): the features to fit.

                - If set to "all", all the features will be fitted.

            binning_method (str): the binning method to use.

                - Use `"greedy"` for using the Greedy binning solution with the default parameters.
                  For custom parameters initialize a `binning_methods.Greedy` object
                - Use `"dp"` for using a Dynamic Programming binning solution with the default parameters.
                  For custom parameters initialize a `binning_methods.DynamicProgramming` object
                - Use `"fixed"` for using a Fixed binning solution with the default parameters.
                  For custom parameters initialize a `binning_methods.Fixed` object

            centering: whether to compute the normalization constant for centering the plot:

                - `False` means no centering
                - `True` or `zero_integral` centers around the `y` axis
                - `zero_start` starts the plot from `y=0`
        """
        assert binning_method in [
            "greedy",
            "dynamic",
            "fixed"
        ] or isinstance(
            binning_method, bm.Greedy
        ) or isinstance(
            binning_method, bm.DynamicProgramming
        ) or isinstance(
            binning_method, bm.Fixed
        ), "Unknown binning method!"

        self._fit_loop(features, binning_method, centering)

__init__(data, model, model_jac=None, nof_instances='all', axis_limits=None, data_effect=None, avg_output=None, feature_names=None, target_name=None)

Constructor for RHALE.

Definition

RHALE is defined as: $$ \hat{f}^{RHALE}(x_s) = TODO $$

The heterogeneity is: $$ TODO $$

The std of the bin-effects is: $$ TODO $$

Notes

The required parameters are data and model. The rest are optional.

Parameters:

- `data` (`np.ndarray`, required): the design matrix
  • shape: `(N, D)`
- `model` (`callable`, required): the black-box model; must be a `Callable` with:
  • input: `ndarray` of shape `(N, D)`
  • output: `ndarray` of shape `(N, )`
- `model_jac` (`Optional[callable]`, default `None`): the Jacobian of the model; must be a `Callable` with:
  • input: `ndarray` of shape `(N, D)`
  • output: `ndarray` of shape `(N, D)`
- `nof_instances` (`int` or `str`, default `"all"`): the number of instances to use for the explanation:
  • use an `int` to specify the number of instances
  • use `"all"` to use all the instances
- `axis_limits` (`Optional[np.ndarray]`, default `None`): the limits of the feature effect plot along each axis:
  • use a `ndarray` of shape `(2, D)` to specify them manually
  • use `None` to infer them from the data
- `data_effect` (`Optional[np.ndarray]`, default `None`):
  • if `np.ndarray`, the model Jacobian computed on the `data`
  • if `None`, the Jacobian will be computed using `model_jac`
- `avg_output` (`Optional[float]`, default `None`): the average output of the model on the data:
  • use a `float` to specify it manually
  • use `None` to infer it as `np.mean(model(data))`
- `feature_names` (`Optional[list]`, default `None`): the names of the features:
  • use a `list` of `str` to specify them manually, e.g. `["age", "weight", ...]`
  • use `None` to keep the default names: `["x_0", "x_1", ...]`
- `target_name` (`Optional[str]`, default `None`): the name of the target variable:
  • use a `str` to specify its name manually, e.g. `"price"`
  • use `None` to keep the default name: `"y"`
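
Example: constructing a `RHALE` object with an analytic Jacobian. The toy model and its Jacobian are illustrative; if `model_jac` and `data_effect` are both omitted, the Jacobian is computed numerically:

import numpy as np
from effector.global_effect_ale import RHALE

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(1000, 2))

def model(x):
    return x[:, 0] ** 2 + x[:, 0] * x[:, 1]

def model_jac(x):
    # analytic Jacobian of the toy model, shape (N, D)
    return np.stack([2 * x[:, 0] + x[:, 1], x[:, 0]], axis=1)

rhale = RHALE(data=X, model=model, model_jac=model_jac)

# without model_jac, gradients are estimated numerically from `model`
rhale_numeric = RHALE(data=X, model=model)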

fit(features='all', binning_method='greedy', centering=False)

Fit the model.

Parameters:

- `features` (`int`, `str`, or `list`, default `"all"`): the features to fit; if set to `"all"`, all the features will be fitted
- `binning_method` (`str` or binning object, default `"greedy"`): the binning method to use:
  • use `"greedy"` for the Greedy binning solution with the default parameters; for custom parameters, initialize a `binning_methods.Greedy` object
  • use `"dp"` for the Dynamic Programming binning solution with the default parameters; for custom parameters, initialize a `binning_methods.DynamicProgramming` object
  • use `"fixed"` for the Fixed binning solution with the default parameters; for custom parameters, initialize a `binning_methods.Fixed` object
- `centering` (`bool` or `str`, default `False`): whether to compute the normalization constant for centering the plot:
  • `False` means no centering
  • `True` or `zero_integral` centers around the `y` axis
  • `zero_start` starts the plot from `y=0`
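
Example: fitting with the string shortcuts accepted by `fit`, continuing the `rhale` object from the constructor example above:

# greedy binning (the default)
rhale.fit(features="all", binning_method="greedy")

# fixed binning, with the plot centered so that it integrates to zero
rhale.fit(features=0, binning_method="fixed", centering="zero_integral")
rhale.plot(feature=0, heterogeneity=True)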

effector.global_effect_pdp.PDPBase

Bases: GlobalEffectBase

Source code in effector/global_effect_pdp.py
class PDPBase(GlobalEffectBase):
    def __init__(
        self,
        data: np.ndarray,
        model: Callable,
        model_jac: Optional[Callable] = None,
        axis_limits: Optional[np.ndarray] = None,
        avg_output: Optional[float] = None,
        nof_instances: Union[int, str] = 300,
        feature_names: Optional[List] = None,
        target_name: Optional[str] = None,
        method_name: str = "PDP",
    ):
        """
        Constructor of the PDPBase class.
        """

        self.model_jac = model_jac

        super(PDPBase, self).__init__(
            method_name,
            data,
            model, nof_instances, axis_limits, avg_output, feature_names, target_name
        )

    def _predict(self, data, xx, feature, use_vectorized=True):
        method = pdp_1d_vectorized if use_vectorized else pdp_1d_non_vectorized
        if self.method_name == "pdp":
            y = method(
                self.model, data, xx, feature, False, False, True
            )
        else:
            if self.model_jac is not None:
                y = method(self.model_jac, self.data, xx, feature, False, True, True)
            else:
                y = method(self.model, self.data, xx, feature, False, False, True, True)
        return y

    def _fit_feature(
        self,
        feature: int,
        centering: Union[bool, str] = False,
        points_for_centering: int = 100,
        use_vectorized: bool = True,
    ) -> dict:

        # drop points outside of limits
        self.data = self.data[self.data[:, feature] >= self.axis_limits[0, feature]]
        self.data = self.data[self.data[:, feature] <= self.axis_limits[1, feature]]
        data = self.data

        if centering is True or centering == "zero_integral":
            xx = np.linspace(
                self.axis_limits[0, feature],
                self.axis_limits[1, feature],
                points_for_centering,
            )
            y = self._predict(data, xx, feature, use_vectorized)
            norm_const = np.mean(y, axis=0)
            fe = {"norm_const": norm_const}
        elif centering == "zero_start":
            xx = self.axis_limits[0, feature, np.newaxis]
            y = self._predict(data, xx, feature, use_vectorized)
            fe = {"norm_const": y[0]}
        else:
            fe = {"norm_const": helpers.EMPTY_SYMBOL}
        return fe

    def fit(
        self,
        features: Union[int, str, list] = "all",
        centering: Union[bool, str] = True,
        points_for_centering: int = 100,
        use_vectorized: bool = True,
    ):
        """
        Fit the PDP or d-PDP.

        Notes:
            You can use `.eval` or `.plot` without calling `.fit` explicitly.
            The only thing that `.fit` does is to compute the normalization constant for centering the PDP and ICE plots.
            This will be automatically done when calling `eval` or `plot`, so there is no need to call `fit` explicitly.

        Args:
            features: the features to fit.
                - If set to "all", all the features will be fitted.

            centering: whether to center the plot:

                - `False` means no centering
                - `True` or `zero_integral` centers around the `y` axis.
                - `zero_start` starts the plot from `y=0`.

            points_for_centering: number of linspaced points along the feature axis used for centering.

                - If set to `"all"`, all the dataset points will be used.

            use_vectorized: whether to use the vectorized version of the PDP computation

        """
        centering = helpers.prep_centering(centering)
        features = helpers.prep_features(features, self.dim)

        for s in features:
            self.feature_effect["feature_" + str(s)] = self._fit_feature(
                s, centering, points_for_centering, use_vectorized
            )
            self.is_fitted[s] = True
            self.method_args["feature_" + str(s)] = {
                "centering": centering,
                "points_for_centering": points_for_centering,
            }

    def eval(
        self,
        feature: int,
        xs: np.ndarray,
        heterogeneity: bool = False,
        centering: typing.Union[bool, str] = False,
        return_all: bool = False,
        use_vectorized: bool = True,
    ) -> typing.Union[np.ndarray, typing.Tuple[np.ndarray, np.ndarray]]:
        """Evaluate the effect of the s-th feature at positions `xs`.

        Args:
            feature: index of feature of interest
            xs: the points along the s-th axis to evaluate the FE plot

              - `np.ndarray` of shape `(T, )`

            heterogeneity: whether to return the heterogeneity measures.

                  - if `heterogeneity=False`, the function returns the mean effect at the given `xs`
                  - If `heterogeneity=True`, the function returns `(y, std)` where `y` is the mean effect and `std` is the standard deviation of the mean effect

            centering: whether to center the PDP

                - If `centering` is `False`, the PDP is not centered
                - If `centering` is `True` or `zero_integral`, the PDP is centered around the `y` axis.
                - If `centering` is `zero_start`, the PDP starts from `y=0`.

            return_all: whether to return PDP and ICE plots evaluated at `xs`

                - If `return_all=False`, the function returns the mean effect at the given `xs`
                - If `return_all=True`, the function returns a `ndarray` of shape `(T, N)` with the `N` ICE plots evaluated at `xs`

            use_vectorized: whether to use the vectorized version of the PDP computation

        Returns:
            the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise

        """
        centering = helpers.prep_centering(centering)

        if self.refit(feature, centering):
            self.fit(features=feature, centering=centering, use_vectorized=use_vectorized)

        # Check if the lower bound is less than the upper bound
        assert self.axis_limits[0, feature] < self.axis_limits[1, feature]

        # new implementation
        yy = self._predict(self.data, xs, feature, use_vectorized)

        if centering:
            norm_consts = np.expand_dims(
                self.feature_effect["feature_" + str(feature)]["norm_const"], axis=0
            )
            yy = yy - norm_consts

        y_pdp = np.mean(yy, axis=1)

        if return_all:
            return yy

        if heterogeneity:
            std = np.std(yy, axis=1)
            return y_pdp, std
        else:
            return y_pdp

    def plot(
        self,
        feature: int,
        heterogeneity: Union[bool, str] = False,
        centering: Union[bool, str] = False,
        nof_points: int = 30,
        scale_x: Optional[dict] = None,
        scale_y: Optional[dict] = None,
        nof_ice: Union[int, str] = "all",
        show_avg_output: bool = False,
        y_limits: Optional[List] = None,
        use_vectorized: bool = True,
    ):
        """
        Plot the PDP or d-PDP.

        Args:
            feature: index of the plotted feature
            heterogeneity: whether to plot the heterogeneity of the PDP

                - If `heterogeneity` is `False`, no heterogeneity is plotted
                - If `heterogeneity` is `True` or `"std"`, the standard deviation of the ICE plots is plotted
                - If `heterogeneity` is `ice`, the ICE plots are plotted

            centering: whether to center the PDP

                - If `centering` is `False`, the PDP is not centered
                - If `centering` is `True` or `zero_integral`, the PDP is centered around the `y` axis.
                - If `centering` is `zero_start`, the PDP starts from `y=0`.

            nof_points: number of points to evaluate the PDP plot
            scale_x: dictionary with keys "mean" and "std" for scaling the x-axis
            scale_y: dictionary with keys "mean" and "std" for scaling the y-axis
            nof_ice: number of ICE plots to show on top of the PDP curve
            show_avg_output: whether to show the average output of the model
            y_limits: limits of the y-axis
            use_vectorized: whether to use the vectorized version of the PDP computation
        """
        heterogeneity = helpers.prep_confidence_interval(heterogeneity)
        centering = helpers.prep_centering(centering)

        x = np.linspace(
            self.axis_limits[0, feature], self.axis_limits[1, feature], nof_points
        )

        yy = self.eval(
            feature, x, heterogeneity=False, centering=centering, return_all=True, use_vectorized=use_vectorized
        )

        if show_avg_output:
            avg_output = helpers.prep_avg_output(self.data, self.model, self.avg_output, scale_y)
        else:
            avg_output = None

        title = "PDP" if self.method_name == "pdp" else "d-PDP"
        vis.plot_pdp_ice(
            x,
            feature,
            yy=yy,
            title=title,
            confidence_interval=heterogeneity,
            y_pdp_label="PDP" if self.method_name == "pdp" else "d-PDP",
            y_ice_label="ICE" if self.method_name == "pdp" else "d-ICE",
            scale_x=scale_x,
            scale_y=scale_y,
            avg_output=avg_output,
            feature_names=self.feature_names,
            target_name=self.target_name,
            nof_ice=nof_ice,
            y_limits=y_limits,
        )

fit(features='all', centering=True, points_for_centering=100, use_vectorized=True)

Fit the PDP or d-PDP.

Notes

You can use .eval or .plot without calling .fit explicitly. The only thing that .fit does is to compute the normalization constant for centering the PDP and ICE plots. This will be automatically done when calling eval or plot, so there is no need to call fit explicitly.

Parameters:

- `features` (`int`, `str`, or `list`, default `"all"`): the features to fit; if set to `"all"`, all the features will be fitted
- `centering` (`bool` or `str`, default `True`): whether to center the plot:
  • `False` means no centering
  • `True` or `zero_integral` centers around the `y` axis
  • `zero_start` starts the plot from `y=0`
- `points_for_centering` (`int`, default `100`): number of linspaced points along the feature axis used for centering
  • if set to `"all"`, all the dataset points will be used
- `use_vectorized` (`bool`, default `True`): whether to use the vectorized version of the PDP computation
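
Example: a sketch through a concrete subclass; `PDP` from the same module is assumed here, since `PDPBase` is not instantiated directly. The toy data and model are illustrative:

import numpy as np
from effector.global_effect_pdp import PDP  # assumed concrete subclass of PDPBase

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(500, 2))

def model(x):
    return x[:, 0] ** 3 + np.sin(x[:, 1])

pdp = PDP(data=X, model=model)

# precompute the centering constant for all features
pdp.fit(features="all", centering="zero_integral", points_for_centering=100)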
Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
def fit(
    self,
    features: Union[int, str, list] = "all",
    centering: Union[bool, str] = True,
    points_for_centering: int = 100,
    use_vectorized: bool = True,
):
    """
    Fit the PDP or d-PDP.

    Notes:
        You can use `.eval` or `.plot` without calling `.fit` explicitly.
        The only thing that `.fit` does is to compute the normalization constant for centering the PDP and ICE plots.
        This will be automatically done when calling `eval` or `plot`, so there is no need to call `fit` explicitly.

    Args:
        features: the features to fit.
            - If set to "all", all the features will be fitted.

        centering: whether to center the plot:

            - `False` means no centering
            - `True` or `zero_integral` centers around the `y` axis.
            - `zero_start` starts the plot from `y=0`.

        points_for_centering: number of linspaced points along the feature axis used for centering.

            - If set to `"all"`, all the dataset points will be used.

        use_vectorized: whether to use the vectorized version of the PDP computation

    """
    centering = helpers.prep_centering(centering)
    features = helpers.prep_features(features, self.dim)

    for s in features:
        self.feature_effect["feature_" + str(s)] = self._fit_feature(
            s, centering, points_for_centering, use_vectorized
        )
        self.is_fitted[s] = True
        self.method_args["feature_" + str(s)] = {
            "centering": centering,
            "points_for_centering": points_for_centering,
        }
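
A minimal usage sketch for orientation (the dataset and model below are toy assumptions, not part of the library):

import numpy as np
from effector.global_effect_pdp import PDP

# toy data and black-box model (illustrative only)
np.random.seed(0)
X = np.random.uniform(-1, 1, size=(1000, 2))
model = lambda x: x[:, 0] ** 2 + x[:, 0] * x[:, 1]

pdp = PDP(data=X, model=model)

# optional: precompute the centering constants for all features;
# .eval/.plot would otherwise trigger the same computation on first use
pdp.fit(features="all", centering=True, points_for_centering=100)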

eval(feature, xs, heterogeneity=False, centering=False, return_all=False, use_vectorized=True)

Evaluate the effect of the s-th feature at positions xs.

Parameters:

feature (int, required):
    index of the feature of interest

xs (np.ndarray, required):
    the points along the s-th axis to evaluate the FE plot
      • np.ndarray of shape (T, )

heterogeneity (bool, default False):
    whether to return the heterogeneity measures.
      • If heterogeneity=False, the function returns the mean effect at the given xs
      • If heterogeneity=True, the function returns (y, std) where y is the mean effect and std is the standard deviation of the mean effect

centering (Union[bool, str], default False):
    whether to center the PDP
      • If centering is False, the PDP is not centered
      • If centering is True or zero_integral, the PDP is centered around the y axis.
      • If centering is zero_start, the PDP starts from y=0.

return_all (bool, default False):
    whether to return the PDP and the ICE plots evaluated at xs
      • If return_all=False, the function returns the mean effect at the given xs
      • If return_all=True, the function returns a ndarray of shape (T, N) with the N ICE plots evaluated at xs

use_vectorized (bool, default True):
    whether to use the vectorized version of the PDP computation

Returns (Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]):
    the mean effect y, if heterogeneity=False (default) or a tuple (y, std) otherwise

Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
def eval(
    self,
    feature: int,
    xs: np.ndarray,
    heterogeneity: bool = False,
    centering: typing.Union[bool, str] = False,
    return_all: bool = False,
    use_vectorized: bool = True,
) -> typing.Union[np.ndarray, typing.Tuple[np.ndarray, np.ndarray]]:
    """Evaluate the effect of the s-th feature at positions `xs`.

    Args:
        feature: index of feature of interest
        xs: the points along the s-th axis to evaluate the FE plot

          - `np.ndarray` of shape `(T, )`

        heterogeneity: whether to return the heterogeneity measures.

              - if `heterogeneity=False`, the function returns the mean effect at the given `xs`
              - If `heterogeneity=True`, the function returns `(y, std)` where `y` is the mean effect and `std` is the standard deviation of the mean effect

        centering: whether to center the PDP

            - If `centering` is `False`, the PDP is not centered
            - If `centering` is `True` or `zero_integral`, the PDP is centered around the `y` axis.
            - If `centering` is `zero_start`, the PDP starts from `y=0`.

        return_all: whether to return PDP and ICE plots evaluated at `xs`

            - If `return_all=False`, the function returns the mean effect at the given `xs`
            - If `return_all=True`, the function returns a `ndarray` of shape `(T, N)` with the `N` ICE plots evaluated at `xs`

        use_vectorized: whether to use the vectorized version of the PDP computation

    Returns:
        the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise

    """
    centering = helpers.prep_centering(centering)

    if self.refit(feature, centering):
        self.fit(features=feature, centering=centering, use_vectorized=use_vectorized)

    # Check if the lower bound is less than the upper bound
    assert self.axis_limits[0, feature] < self.axis_limits[1, feature]

    # new implementation
    yy = self._predict(self.data, xs, feature, use_vectorized)

    if centering:
        norm_consts = np.expand_dims(
            self.feature_effect["feature_" + str(feature)]["norm_const"], axis=0
        )
        yy = yy - norm_consts

    y_pdp = np.mean(yy, axis=1)

    if return_all:
        return yy

    if heterogeneity:
        std = np.std(yy, axis=1)
        return y_pdp, std
    else:
        return y_pdp
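
A short sketch of the two calling modes of eval (toy data and model assumed):

import numpy as np
from effector.global_effect_pdp import PDP

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(500, 3))
model = lambda x: 3 * x[:, 0] + x[:, 1] * x[:, 2]

pdp = PDP(data=X, model=model)

xs = np.linspace(-1, 1, 50)                # positions along feature 0, shape (T,)
y = pdp.eval(feature=0, xs=xs)             # mean effect, shape (T,)
y, std = pdp.eval(feature=0, xs=xs, heterogeneity=True)  # mean effect and its std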

plot(feature, heterogeneity=False, centering=False, nof_points=30, scale_x=None, scale_y=None, nof_ice='all', show_avg_output=False, y_limits=None, use_vectorized=True)

Plot the PDP or d-PDP.

Parameters:

feature (int, required):
    index of the plotted feature

heterogeneity (Union[bool, str], default False):
    whether to plot the heterogeneity of the ICE curves
      • If heterogeneity is False, no heterogeneity is plotted
      • If heterogeneity is True or "std", the standard deviation of the ICE curves is plotted
      • If heterogeneity is "ice", the ICE plots are plotted

centering (Union[bool, str], default False):
    whether to center the PDP
      • If centering is False, the PDP is not centered
      • If centering is True or zero_integral, the PDP is centered around the y axis.
      • If centering is zero_start, the PDP starts from y=0.

nof_points (int, default 30):
    number of points at which to evaluate the PDP plot

scale_x (Optional[dict], default None):
    dictionary with keys "mean" and "std" for scaling the x-axis

scale_y (Optional[dict], default None):
    dictionary with keys "mean" and "std" for scaling the y-axis

nof_ice (Union[int, str], default 'all'):
    number of ICE curves to show on top of the PDP curve

show_avg_output (bool, default False):
    whether to show the average output of the model

y_limits (Optional[List], default None):
    limits of the y-axis

use_vectorized (bool, default True):
    whether to use the vectorized version of the PDP computation

Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
def plot(
    self,
    feature: int,
    heterogeneity: Union[bool, str] = False,
    centering: Union[bool, str] = False,
    nof_points: int = 30,
    scale_x: Optional[dict] = None,
    scale_y: Optional[dict] = None,
    nof_ice: Union[int, str] = "all",
    show_avg_output: bool = False,
    y_limits: Optional[List] = None,
    use_vectorized: bool = True,
):
    """
    Plot the PDP or d-PDP.

    Args:
        feature: index of the plotted feature
        heterogeneity: whether to plot the heterogeneity of the ICE curves

            - If `heterogeneity` is `False`, no heterogeneity is plotted
            - If `heterogeneity` is `True` or `"std"`, the standard deviation of the ICE curves is plotted
            - If `heterogeneity` is `"ice"`, the ICE plots are plotted

        centering: whether to center the PDP

            - If `centering` is `False`, the PDP is not centered
            - If `centering` is `True` or `zero_integral`, the PDP is centered around the `y` axis.
            - If `centering` is `zero_start`, the PDP starts from `y=0`.

        nof_points: number of points at which to evaluate the PDP plot
        scale_x: dictionary with keys "mean" and "std" for scaling the x-axis
        scale_y: dictionary with keys "mean" and "std" for scaling the y-axis
        nof_ice: number of ICE curves to show on top of the PDP curve
        show_avg_output: whether to show the average output of the model
        y_limits: limits of the y-axis
        use_vectorized: whether to use the vectorized version of the PDP computation
    """
    heterogeneity = helpers.prep_confidence_interval(heterogeneity)
    centering = helpers.prep_centering(centering)

    x = np.linspace(
        self.axis_limits[0, feature], self.axis_limits[1, feature], nof_points
    )

    yy = self.eval(
        feature, x, heterogeneity=False, centering=centering, return_all=True, use_vectorized=use_vectorized
    )

    if show_avg_output:
        avg_output = helpers.prep_avg_output(self.data, self.model, self.avg_output, scale_y)
    else:
        avg_output = None

    title = "PDP" if self.method_name == "pdp" else "d-PDP"
    vis.plot_pdp_ice(
        x,
        feature,
        yy=yy,
        title=title,
        confidence_interval=heterogeneity,
        y_pdp_label="PDP" if self.method_name == "pdp" else "d-PDP",
        y_ice_label="ICE" if self.method_name == "pdp" else "d-ICE",
        scale_x=scale_x,
        scale_y=scale_y,
        avg_output=avg_output,
        feature_names=self.feature_names,
        target_name=self.target_name,
        nof_ice=nof_ice,
        y_limits=y_limits,
    )
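
A plotting sketch under the same kind of toy setup (all names are illustrative):

import numpy as np
from effector.global_effect_pdp import PDP

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(500, 2))
model = lambda x: x[:, 0] ** 3 + np.sin(2 * x[:, 1])

pdp = PDP(data=X, model=model)

# centered PDP for feature 1, with the individual ICE curves overlaid
pdp.plot(feature=1, heterogeneity="ice", centering=True, nof_points=50)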

effector.global_effect_pdp.PDP

Bases: PDPBase

Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
class PDP(PDPBase):
    def __init__(
        self,
        data: np.ndarray,
        model: Callable,
        axis_limits: Optional[np.ndarray] = None,
        nof_instances: Union[int, str] = 300,
        avg_output: Optional[float] = None,
        feature_names: Optional[List] = None,
        target_name: Optional[str] = None,
    ):
        """
        Constructor of the PDP class.

        Definition:
            PDP is defined as:
            $$
            \hat{f}^{PDP}(x_s) = {1 \over N} \sum_{i=1}^N f(x_s, x_C^{(i)})
            $$

            The ICE plots are:
            $$
            \hat{f}^{(i)}(x_s) = f(x_s, x_C^{(i)}), \quad i=1, \dots, N
            $$

            The heterogeneity is:
            $$
            \mathcal{H}^{PDP}(x_s) = \sqrt {{1 \over N} \sum_{i=1}^N ( \hat{f}^{(i)}(x_s) - \hat{f}^{PDP}(x_s) )^2}
            $$

        Notes:
            The required parameters are `data` and `model`. The rest are optional.

        Args:
            data: the design matrix

                - shape: `(N,D)`
            model: the black-box model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, )`

            axis_limits: The limits of the feature effect plot along each axis

                - use a `ndarray` of shape `(2, D)`, to specify them manually
                - use `None`, to be inferred from the data

            nof_instances: maximum number of instances to be used for PDP.

                - use "all", for using all instances.
                - use an `int`, for using `nof_instances` instances.

            avg_output: The average output of the model.

                - use a `float`, to specify it manually
                - use `None`, to be inferred as `np.mean(model(data))`

            feature_names: The names of the features

                - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
                - use `None`, to keep the default names: `["x_0", "x_1", ...]`

            target_name: The name of the target variable

                - use a `str`, to specify its name manually. For example: `"price"`
                - use `None`, to keep the default name: `"y"`
        """

        super(PDP, self).__init__(
            data, model, None, axis_limits, avg_output, nof_instances, feature_names, target_name, method_name="PDP"
        )

__init__(data, model, axis_limits=None, nof_instances=300, avg_output=None, feature_names=None, target_name=None)

Constructor of the PDP class.

Definition

PDP is defined as: $$ \hat{f}^{PDP}(x_s) = {1 \over N} \sum_{i=1}^N f(x_s, x_C^{(i)}) $$

The ICE plots are: $$ \hat{f}^{(i)}(x_s) = f(x_s, x_C^{(i)}), \quad i=1, \dots, N $$

The heterogeneity is: $$ \mathcal{H}^{PDP}(x_s) = \sqrt {{1 \over N} \sum_{i=1}^N ( \hat{f}^{(i)}(x_s) - \hat{f}^{PDP}(x_s) )^2} $$

Notes

The required parameters are data and model. The rest are optional.

Parameters:

data (np.ndarray, required):
    the design matrix
      • shape: (N, D)

model (Callable, required):
    the black-box model. Must be a Callable with:
      • input: ndarray of shape (N, D)
      • output: ndarray of shape (N, )

axis_limits (Optional[np.ndarray], default None):
    the limits of the feature effect plot along each axis
      • use a ndarray of shape (2, D), to specify them manually
      • use None, to be inferred from the data

nof_instances (Union[int, str], default 300):
    maximum number of instances to be used for PDP.
      • use "all", for using all instances.
      • use an int, for using nof_instances instances.

avg_output (Optional[float], default None):
    the average output of the model.
      • use a float, to specify it manually
      • use None, to be inferred as np.mean(model(data))

feature_names (Optional[List], default None):
    the names of the features
      • use a list of str, to specify the names manually. For example: ["age", "weight", ...]
      • use None, to keep the default names: ["x_0", "x_1", ...]

target_name (Optional[str], default None):
    the name of the target variable
      • use a str, to specify its name manually. For example: "price"
      • use None, to keep the default name: "y"

Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
def __init__(
    self,
    data: np.ndarray,
    model: Callable,
    axis_limits: Optional[np.ndarray] = None,
    nof_instances: Union[int, str] = 300,
    avg_output: Optional[float] = None,
    feature_names: Optional[List] = None,
    target_name: Optional[str] = None,
):
    """
    Constructor of the PDP class.

    Definition:
        PDP is defined as:
        $$
        \hat{f}^{PDP}(x_s) = {1 \over N} \sum_{i=1}^N f(x_s, x_C^{(i)})
        $$

        The ICE plots are:
        $$
        \hat{f}^{(i)}(x_s) = f(x_s, x_C^{(i)}), \quad i=1, \dots, N
        $$

        The heterogeneity is:
        $$
        \mathcal{H}^{PDP}(x_s) = \sqrt {{1 \over N} \sum_{i=1}^N ( \hat{f}^{(i)}(x_s) - \hat{f}^{PDP}(x_s) )^2}
        $$

    Notes:
        The required parameters are `data` and `model`. The rest are optional.

    Args:
        data: the design matrix

            - shape: `(N,D)`
        model: the black-box model. Must be a `Callable` with:

            - input: `ndarray` of shape `(N, D)`
            - output: `ndarray` of shape `(N, )`

        axis_limits: The limits of the feature effect plot along each axis

            - use a `ndarray` of shape `(2, D)`, to specify them manually
            - use `None`, to be inferred from the data

        nof_instances: maximum number of instances to be used for PDP.

            - use "all", for using all instances.
            - use an `int`, for using `nof_instances` instances.

        avg_output: The average output of the model.

            - use a `float`, to specify it manually
            - use `None`, to be inferred as `np.mean(model(data))`

        feature_names: The names of the features

            - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
            - use `None`, to keep the default names: `["x_0", "x_1", ...]`

        target_name: The name of the target variable

            - use a `str`, to specify its name manually. For example: `"price"`
            - use `None`, to keep the default name: `"y"`
    """

    super(PDP, self).__init__(
        data, model, None, axis_limits, avg_output, nof_instances, feature_names, target_name, method_name="PDP"
    )
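
To make the constructor arguments concrete, a sketch with toy data (feature and target names are invented for illustration):

import numpy as np
from effector.global_effect_pdp import PDP

np.random.seed(0)
X = np.random.uniform(0, 1, size=(1000, 3))
model = lambda x: 2 * x[:, 0] - x[:, 1] + 0.5 * x[:, 2]

pdp = PDP(
    data=X,
    model=model,
    nof_instances=300,  # the default subsample size
    feature_names=["age", "weight", "height"],
    target_name="price",
)
pdp.plot(feature=0)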

effector.global_effect_pdp.DerPDP

Bases: PDPBase

Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
class DerPDP(PDPBase):
    def __init__(
            self,
            data: np.ndarray,
            model: Callable,
            model_jac: Optional[Callable] = None,
            axis_limits: Optional[np.ndarray] = None,
            nof_instances: Union[int, str] = 300,
            avg_output: Optional[float] = None,
            feature_names: Optional[List] = None,
            target_name: Optional[str] = None,
    ):
        """
        Constructor of the DerivativePDP class.

        Definition:
            d-PDP is defined as:
            $$
            \hat{f}^{d-PDP}(x_s) = {1 \over N} \sum_{i=1}^N {df \over d x_s} (x_s, x_C^i)
            $$

            The d-ICE plots are:
            $$
            \hat{f}^i(x_s) = {df \over d x_s}(x_s, x_C^i), \quad i=1, \dots, N
            $$

            The heterogeneity is:
            $$
            \mathcal{H}^{d-PDP}(x_s) = \sqrt {{1 \over N} \sum_{i=1}^N ( \hat{f}^i(x_s) - \hat{f}^{d-PDP}(x_s) )^2}
            $$

        Notes:
            - The required parameters are `data` and `model`. The rest are optional.
            - The `model_jac` is the Jacobian of the model. If `None`, the Jacobian will be computed numerically.

        Args:
            data: the design matrix

                - shape: `(N,D)`
            model: the black-box model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, )`

            model_jac: the black-box model Jacobian. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, D)`

            axis_limits: The limits of the feature effect plot along each axis

                - use a `ndarray` of shape `(2, D)`, to specify them manually
                - use `None`, to be inferred from the data

            nof_instances: maximum number of instances to be used for PDP.

                - use "all", for using all instances.
                - use an `int`, for using `nof_instances` instances.

            avg_output: The average output of the model.

                - use a `float`, to specify it manually
                - use `None`, to be inferred as `np.mean(model(data))`

            feature_names: The names of the features

                - use a `list` of `str`, to specify the name manually. For example: `["age", "weight", ...]`
                - use `None`, to keep the default names: `["x_0", "x_1", ...]`

            target_name: The name of the target variable

                - use a `str`, to specify its name manually. For example: `"price"`
                - use `None`, to keep the default name: `"y"`
        """

        super(DerPDP, self).__init__(
            data, model, model_jac, axis_limits, avg_output, nof_instances, feature_names, target_name, method_name="d-PDP"
        )

__init__(data, model, model_jac=None, axis_limits=None, nof_instances=300, avg_output=None, feature_names=None, target_name=None)

Constructor of the DerivativePDP class.

Definition

d-PDP is defined as: $$ \hat{f}^{d-PDP}(x_s) = {1 \over N} \sum_{i=1}^N {df \over d x_s} (x_s, x_C^i) $$

The d-ICE plots are: $$ \hat{f}^i(x_s) = {df \over d x_s}(x_s, x_C^i), \quad i=1, \dots, N $$

The heterogeneity is: $$ \mathcal{H}^{d-PDP}(x_s) = \sqrt {{1 \over N} \sum_{i=1}^N ( \hat{f}^i(x_s) - \hat{f}^{d-PDP}(x_s) )^2} $$
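
As a quick worked check (not from the source): for the toy model \(f(x) = 3 x_0 - 2 x_1\), \(df / dx_0 = 3\) everywhere, so every d-ICE curve of feature \(0\) is the constant \(3\), the d-PDP equals \(3\), and the heterogeneity \(\mathcal{H}^{d-PDP}(x_0)\) is identically zero.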

Notes
  • The required parameters are data and model. The rest are optional.
  • The model_jac is the Jacobian of the model. If None, the Jacobian will be computed numerically.

Parameters:

data (np.ndarray, required):
    the design matrix
      • shape: (N, D)

model (Callable, required):
    the black-box model. Must be a Callable with:
      • input: ndarray of shape (N, D)
      • output: ndarray of shape (N, )

model_jac (Optional[Callable], default None):
    the black-box model Jacobian. Must be a Callable with:
      • input: ndarray of shape (N, D)
      • output: ndarray of shape (N, D)

axis_limits (Optional[np.ndarray], default None):
    the limits of the feature effect plot along each axis
      • use a ndarray of shape (2, D), to specify them manually
      • use None, to be inferred from the data

nof_instances (Union[int, str], default 300):
    maximum number of instances to be used for PDP.
      • use "all", for using all instances.
      • use an int, for using nof_instances instances.

avg_output (Optional[float], default None):
    the average output of the model.
      • use a float, to specify it manually
      • use None, to be inferred as np.mean(model(data))

feature_names (Optional[List], default None):
    the names of the features
      • use a list of str, to specify the names manually. For example: ["age", "weight", ...]
      • use None, to keep the default names: ["x_0", "x_1", ...]

target_name (Optional[str], default None):
    the name of the target variable
      • use a str, to specify its name manually. For example: "price"
      • use None, to keep the default name: "y"

Source code in /home/runner/work/effector/effector/effector/global_effect_pdp.py
def __init__(
        self,
        data: np.ndarray,
        model: Callable,
        model_jac: Optional[Callable] = None,
        axis_limits: Optional[np.ndarray] = None,
        nof_instances: Union[int, str] = 300,
        avg_output: Optional[float] = None,
        feature_names: Optional[List] = None,
        target_name: Optional[str] = None,
):
    """
    Constructor of the DerivativePDP class.

    Definition:
        d-PDP is defined as:
        $$
        \hat{f}^{d-PDP}(x_s) = {1 \over N} \sum_{i=1}^N {df \over d x_s} (x_s, x_C^i)
        $$

        The d-ICE plots are:
        $$
        \hat{f}^i(x_s) = {df \over d x_s}(x_s, x_C^i), \quad i=1, \dots, N
        $$

        The heterogeneity is:
        $$
        \mathcal{H}^{d-PDP}(x_s) = \sqrt {{1 \over N} \sum_{i=1}^N ( \hat{f}^i(x_s) - \hat{f}^{d-PDP}(x_s) )^2}
        $$

    Notes:
        - The required parameters are `data` and `model`. The rest are optional.
        - The `model_jac` is the Jacobian of the model. If `None`, the Jacobian will be computed numerically.

    Args:
        data: the design matrix

            - shape: `(N,D)`
        model: the black-box model. Must be a `Callable` with:

            - input: `ndarray` of shape `(N, D)`
            - output: `ndarray` of shape `(N, )`

        model_jac: the black-box model Jacobian. Must be a `Callable` with:

            - input: `ndarray` of shape `(N, D)`
            - output: `ndarray` of shape `(N, D)`

        axis_limits: The limits of the feature effect plot along each axis

            - use a `ndarray` of shape `(2, D)`, to specify them manually
            - use `None`, to be inferred from the data

        nof_instances: maximum number of instances to be used for PDP.

            - use "all", for using all instances.
            - use an `int`, for using `nof_instances` instances.

        avg_output: The average output of the model.

            - use a `float`, to specify it manually
            - use `None`, to be inferred as `np.mean(model(data))`

        feature_names: The names of the features

            - use a `list` of `str`, to specify the name manually. For example: `["age", "weight", ...]`
            - use `None`, to keep the default names: `["x_0", "x_1", ...]`

        target_name: The name of the target variable

            - use a `str`, to specify its name manually. For example: `"price"`
            - use `None`, to keep the default name: `"y"`
    """

    super(DerPDP, self).__init__(
        data, model, model_jac, axis_limits, avg_output, nof_instances, feature_names, target_name, method_name="d-PDP"
    )
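
A sketch with an analytic Jacobian, using the same toy linear model as the worked check above (all names are illustrative):

import numpy as np
from effector.global_effect_pdp import DerPDP

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(500, 2))
model = lambda x: 3 * x[:, 0] - 2 * x[:, 1]

def model_jac(x):
    # df/dx for the linear model above: every row is [3, -2]
    return np.tile(np.array([3.0, -2.0]), (x.shape[0], 1))

d_pdp = DerPDP(data=X, model=model, model_jac=model_jac)
d_pdp.plot(feature=0)  # flat at 3 for this model, with zero heterogeneity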

effector.global_effect_shap.ShapDP

Bases: GlobalEffectBase

Source code in /home/runner/work/effector/effector/effector/global_effect_shap.py
class ShapDP(GlobalEffectBase):
    def __init__(
            self,
            data: np.ndarray,
            model: Callable,
            axis_limits: Optional[np.ndarray] = None,
            nof_instances: Union[int, str] = 100,
            avg_output: Optional[float] = None,
            feature_names: Optional[List[str]] = None,
            target_name: Optional[str] = None,
    ):
        """
        Constructor of the SHAPDependence class.

        Definition:
            The value of a coalition of $S$ features is estimated as:
            $$
            \hat{v}(S) = {1 \over N} \sum_{i=1}^N  f(x_S \cup x_C^i) - f(x^i)
            $$
            The value of a coalition $S$ quantifies what the values $\mathbf{x}_S$ of the features in $S$ contribute to the output of the model. It
            is the average (over all instances) difference on the output between setting features in $S$ to be $x_S$, i.e., $\mathbf{x} = (\mathbf{x}_S, \mathbf{x}_C^i)$ and leaving the instance as it is, i.e., $\mathbf{x}^i = (\mathbf{x}_S^i, \mathbf{x}_C^i)$.

            The contribution of a feature $j$ added to a coalition $S$ is estimated as:
            $$
            \hat{\Delta}_{S, j} = \hat{v}(S \cup \{j\}) - \hat{v}(S)
            $$

            The SHAP value of a feature $j$ with value $x_j$ is the average contribution of feature $j$ across all possible coalitions with a weight $w_{S, j}$:

            $$
            \hat{\phi}_j(x_j) = {1 \over N} \sum_{S \subseteq \{1, \dots, D\} \setminus \{j\}} w_{S, j} \hat{\Delta}_{S, j}
            $$

            where $w_{S, j}$ assures that the contribution of feature $j$ is the same for all coalitions of the same size. For example, there are $D-1$ ways for $x_j$ to enter a coalition of $|S| = 1$ feature, so $w_{S, j} = {1 \over D (D-1)}$ for each of them. In contrast, there is only one way for $x_j$ to enter a coalition of $|S|=0$ (to be the first specified feature), so $w_{S, j} = {1 \over D}$.

            The SHAP Dependence Plot (SHAP-DP) is a spline $\hat{f}^{SDP}_j(x_j)$ fit to the dataset $\{(x_j^i, \hat{\phi}_j(x_j^i))\}_{i=1}^N$ using the `UnivariateSpline` function from `scipy.interpolate`.

        Notes:
            * The required parameters are `data` and `model`. The rest are optional.
            * SHAP values are computed using the `shap` package, using the class `Explainer`.
            * SHAP values are centered by default, i.e., the average SHAP value is subtracted from the SHAP values.
            * More details on the SHAP values can be found in the [original paper](https://arxiv.org/abs/1705.07874) and in the book [Interpreting Machine Learning Models with SHAP](https://christophmolnar.com/books/shap/)

        Args:
            data: the design matrix

                - shape: `(N,D)`
            model: the black-box model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N,)`

            axis_limits: The limits of the feature effect plot along each axis

                - use a `ndarray` of shape `(2, D)`, to specify them manually
                - use `None`, to be inferred from the data

            nof_instances: maximum number of instances to be used for SHAP estimation.

                - use "all", for using all instances.
                - use an `int`, for using `nof_instances` instances.

            avg_output: The average output of the model.

                - use a `float`, to specify it manually
                - use `None`, to be inferred as `np.mean(model(data))`

            feature_names: The names of the features

                - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
                - use `None`, to keep the default names: `["x_0", "x_1", ...]`

            target_name: The name of the target variable

                - use a `str`, to specify its name manually. For example: `"price"`
                - use `None`, to keep the default name: `"y"`
        """
        self.nof_instances, self.indices = helpers.prep_nof_instances(
            nof_instances, data.shape[0]
        )
        data = data[self.indices, :]

        super(ShapDP, self).__init__(
            "SHAP DP", data, model, nof_instances, axis_limits, avg_output, feature_names, target_name
        )

    def _fit_feature(
        self,
        feature: int,
        centering: typing.Union[bool, str] = False,
        points_for_centering: int = 100,
    ) -> typing.Dict:

        # drop points outside of limits
        self.data = self.data[self.data[:, feature] >= self.axis_limits[0, feature]]
        self.data = self.data[self.data[:, feature] <= self.axis_limits[1, feature]]

        # compute shap values
        data = self.data
        shap_explainer = shap.Explainer(self.model, data)
        explanation = shap_explainer(data)

        # extract x and y pairs
        yy = explanation.values[:, feature]
        xx = data[:, feature]

        # make xx monotonic
        idx = np.argsort(xx)
        xx = xx[idx]
        yy = yy[idx]

        # fit spline_mean to xx, yy pairs
        spline_mean = UnivariateSpline(xx, yy)

        # fit spline_mean to the sqrt of the residuals
        yy_std = np.abs(yy - spline_mean(xx))
        spline_std = UnivariateSpline(xx, yy_std)

        # compute norm constant
        if centering == "zero_integral":
            x_norm = np.linspace(xx[0], xx[-1], points_for_centering)
            y_norm = spline_mean(x_norm)
            norm_const = np.trapz(y_norm, x_norm) / (xx[-1] - xx[0])
        elif centering == "zero_start":
            norm_const = spline_mean(xx[0])
        else:
            norm_const = helpers.EMPTY_SYMBOL

        ret_dict = {
            "spline_mean": spline_mean,
            "spline_std": spline_std,
            "xx": xx,
            "yy": yy,
            "norm_const": norm_const,
        }
        return ret_dict

    def fit(
            self,
            features: Union[int, str, List] = "all",
            centering: Union[bool, str] = False,
            points_for_centering: Union[int, str] = 100,
    ) -> None:
        """Fit the SHAP Dependence Plot to the data.

        Notes:
            The SHAP Dependence Plot (SDP) $\hat{f}^{SDP}_j(x_j)$ is a spline fit to
            the dataset $\{(x_j^i, \hat{\phi}_j(x_j^i))\}_{i=1}^N$
            using the `UnivariateSpline` function from `scipy.interpolate`.

            The SHAP standard deviation, $\hat{\sigma}^{SDP}_j(x_j)$, is a spline fit to the absolute value of the residuals, i.e., to the dataset $\{(x_j^i, |\hat{\phi}_j(x_j^i) - \hat{f}^{SDP}_j(x_j^i)|)\}_{i=1}^N$, using the `UnivariateSpline` function from `scipy.interpolate`.

        Args:
            features: the features to fit.
                - If set to "all", all the features will be fitted.
            centering:
                - If set to False, no centering will be applied.
                - If set to "zero_integral" or True, the integral of the feature effect will be set to zero.
                - If set to "zero_mean", the mean of the feature effect will be set to zero.

            points_for_centering: number of linspaced points along the feature axis used for centering.

                - If set to `all`, all the dataset points will be used.

        Notes:
            SHAP values are by default centered, i.e., $\sum_{i=1}^N \hat{\phi}_j(x_j^i) = 0$. This does not mean that the SHAP _curve_ is centered around zero; this happens only if the $s$-th feature of the dataset instances, i.e., the set $\{x_s^i\}_{i=1}^N$ is uniformly distributed along the $s$-th axis. So, use:

            * `centering=False`, to leave the SHAP values as they are.
            * `centering=True` or `centering=zero_integral`, to center the SHAP curve around the `y` axis.
            * `centering=zero_start`, to start the SHAP curve from `y=0`.

            SHAP values are expensive to compute.
            To speed up the computation consider using a subset of the dataset
            points for computing the SHAP values and for centering the spline.
            The default values (`nof_instances=100`
            and `points_for_centering=100`) are a moderate choice.
        """
        centering = helpers.prep_centering(centering)
        features = helpers.prep_features(features, self.dim)

        # new implementation
        for s in features:
            self.feature_effect["feature_" + str(s)] = self._fit_feature(
                s, centering, points_for_centering
            )
            self.is_fitted[s] = True
            self.method_args["feature_" + str(s)] = {
                "centering": centering,
                "points_for_centering": points_for_centering,
            }

    def eval(
        self,
        feature: int,
        xs: np.ndarray,
        heterogeneity: bool = False,
        centering: typing.Union[bool, str] = False,
    ) -> typing.Union[np.ndarray, typing.Tuple[np.ndarray, np.ndarray]]:
        """Evaluate the effect of the s-th feature at positions `xs`.

        Args:
            feature: index of feature of interest
            xs: the points along the s-th axis to evaluate the FE plot

              - `np.ndarray` of shape `(T,)`
            heterogeneity: whether to return the heterogeneity measures.

                  - if `heterogeneity=False`, the function returns the mean effect at the given `xs`
                  - If `heterogeneity=True`, the function returns `(y, std)` where `y` is the mean effect and `std` is the standard deviation of the mean effect

            centering: whether to center the plot

                - If `centering` is `False`, the SHAP curve is not centered
                - If `centering` is `True` or `zero_integral`, the SHAP curve is centered around the `y` axis.
                - If `centering` is `zero_start`, the SHAP curve starts from `y=0`.

        Returns:
            the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise
        """
        centering = helpers.prep_centering(centering)

        if self.refit(feature, centering):
            self.fit(features=feature, centering=centering)

        # Check if the lower bound is less than the upper bound
        assert self.axis_limits[0, feature] < self.axis_limits[1, feature]

        yy = self.feature_effect["feature_" + str(feature)]["spline_mean"](xs)

        if centering is not False:
            norm_const = self.feature_effect["feature_" + str(feature)]["norm_const"]
            yy = yy - norm_const

        if heterogeneity:
            yy_std = self.feature_effect["feature_" + str(feature)]["spline_std"](xs)
            return yy, yy_std
        else:
            return yy

    def plot(
        self,
        feature: int,
        heterogeneity: Union[bool, str] = False,
        centering: Union[bool, str] = False,
        nof_points: int = 30,
        scale_x: Optional[dict] = None,
        scale_y: Optional[dict] = None,
        nof_shap_values: Union[int, str] = "all",
        show_avg_output: bool = False,
        y_limits: Optional[List] = None,
    ) -> None:
        """
        Plot the SHAP Dependence Plot (SDP) of the s-th feature.

        Args:
            feature: index of the plotted feature
            heterogeneity: whether to output the heterogeneity of the SHAP values

                - If `heterogeneity` is `False`, no heterogeneity is plotted
                - If `heterogeneity` is `True` or `"std"`, the standard deviation of the shap values is plotted
                - If `heterogeneity` is `"shap_values"`, the shap values are scattered on top of the SHAP curve

            centering: whether to center the SDP

                - If `centering` is `False`, the SHAP curve is not centered
                - If `centering` is `True` or `zero_integral`, the SHAP curve is centered around the `y` axis.
                - If `centering` is `zero_start`, the SHAP curve starts from `y=0`.

            nof_points: number of points to evaluate the SDP plot
            scale_x: dictionary with keys "mean" and "std" for scaling the x-axis
            scale_y: dictionary with keys "mean" and "std" for scaling the y-axis
            nof_shap_values: number of shap values to show on top of the SHAP curve
            show_avg_output: whether to show the average output of the model
            y_limits: limits of the y-axis
        """
        heterogeneity = helpers.prep_confidence_interval(heterogeneity)

        x = np.linspace(
            self.axis_limits[0, feature], self.axis_limits[1, feature], nof_points
        )

        # get the SHAP curve
        y = self.eval(feature, x, heterogeneity=False, centering=centering)
        y_std = (
            self.feature_effect["feature_" + str(feature)]["spline_std"](x)
            if heterogeneity == "std" or True
            else None
        )

        # get some SHAP values
        _, ind = helpers.prep_nof_instances(nof_shap_values, self.data.shape[0])
        yy = (
            self.feature_effect["feature_" + str(feature)]["yy"][ind]
            if heterogeneity == "shap_values"
            else None
        )
        if yy is not None and centering is not False:
            yy = yy - self.feature_effect["feature_" + str(feature)]["norm_const"]
        xx = (
            self.feature_effect["feature_" + str(feature)]["xx"][ind]
            if heterogeneity == "shap_values"
            else None
        )

        if show_avg_output:
            avg_output = helpers.prep_avg_output(self.data, self.model, self.avg_output, scale_y)
        else:
            avg_output = None

        vis.plot_shap(
            x,
            y,
            xx,
            yy,
            y_std,
            feature,
            heterogeneity=heterogeneity,
            scale_x=scale_x,
            scale_y=scale_y,
            avg_output=avg_output,
            feature_names=self.feature_names,
            target_name=self.target_name,
            y_limits=y_limits
        )

__init__(data, model, axis_limits=None, nof_instances=100, avg_output=None, feature_names=None, target_name=None)

Constructor of the SHAPDependence class.

Definition

The value of a coalition \(S\) of features is estimated as: $$ \hat{v}(S) = {1 \over N} \sum_{i=1}^N f(x_S \cup x_C^i) - f(x^i) $$ The value of a coalition \(S\) quantifies what the values \(\mathbf{x}_S\) of the features in \(S\) contribute to the output of the model. It is the average (over all instances) difference in the model output between setting the features in \(S\) to \(x_S\), i.e., \(\mathbf{x} = (\mathbf{x}_S, \mathbf{x}_C^i)\), and leaving the instance as it is, i.e., \(\mathbf{x}^i = (\mathbf{x}_S^i, \mathbf{x}_C^i)\).

The contribution of a feature \(j\) added to a coalition \(S\) is estimated as: $$ \hat{\Delta}_{S, j} = \hat{v}(S \cup \{j\}) - \hat{v}(S) $$

The SHAP value of a feature \(j\) with value \(x_j\) is the average contribution of feature \(j\) across all possible coalitions with a weight \(w_{S, j}\):

\[ \hat{\phi}_j(x_j) = {1 \over N} \sum_{S \subseteq \{1, \dots, D\} \setminus \{j\}} w_{S, j} \hat{\Delta}_{S, j} \]

where \(w_{S, j}\) assures that the contribution of feature \(j\) is the same for all coalitions of the same size. For example, there are \(D-1\) ways for \(x_j\) to enter a coalition of \(|S| = 1\) feature, so \(w_{S, j} = {1 \over D (D-1)}\) for each of them. In contrast, there is only one way for \(x_j\) to enter a coalition of \(|S|=0\) (to be the first specified feature), so \(w_{S, j} = {1 \over D}\).

The SHAP Dependence Plot (SHAP-DP) is a spline \(\hat{f}^{SDP}_j(x_j)\) fit to the dataset \(\{(x_j^i, \hat{\phi}_j(x_j^i))\}_{i=1}^N\) using the UnivariateSpline function from scipy.interpolate.
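
As a sanity check of the weights (a worked example, not part of the source): for \(D = 3\), \(x_j\) can enter the empty coalition (\(|S| = 0\)) in one way with weight \(1/3\), each of the two coalitions with \(|S| = 1\) with weight \(1/(3 \cdot 2) = 1/6\), and the single coalition with \(|S| = 2\) with weight \(1/3\); the weights sum to \(1/3 + 2 \cdot 1/6 + 1/3 = 1\), so the SHAP value is a proper weighted average of the contributions \(\hat{\Delta}_{S, j}\).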

Notes
  • The required parameters are data and model. The rest are optional.
  • SHAP values are computed using the shap package, using the class Explainer.
  • SHAP values are centered by default, i.e., the average SHAP value is subtracted from the SHAP values.
  • More details on the SHAP values can be found in the original paper and in the book Interpreting Machine Learning Models with SHAP

Parameters:

data (np.ndarray, required):
    the design matrix
      • shape: (N, D)

model (Callable, required):
    the black-box model. Must be a Callable with:
      • input: ndarray of shape (N, D)
      • output: ndarray of shape (N,)

axis_limits (Optional[np.ndarray], default None):
    the limits of the feature effect plot along each axis
      • use a ndarray of shape (2, D), to specify them manually
      • use None, to be inferred from the data

nof_instances (Union[int, str], default 100):
    maximum number of instances to be used for SHAP estimation.
      • use "all", for using all instances.
      • use an int, for using nof_instances instances.

avg_output (Optional[float], default None):
    the average output of the model.
      • use a float, to specify it manually
      • use None, to be inferred as np.mean(model(data))

feature_names (Optional[List[str]], default None):
    the names of the features
      • use a list of str, to specify the names manually. For example: ["age", "weight", ...]
      • use None, to keep the default names: ["x_0", "x_1", ...]

target_name (Optional[str], default None):
    the name of the target variable
      • use a str, to specify its name manually. For example: "price"
      • use None, to keep the default name: "y"

Source code in /home/runner/work/effector/effector/effector/global_effect_shap.py
def __init__(
        self,
        data: np.ndarray,
        model: Callable,
        axis_limits: Optional[np.ndarray] = None,
        nof_instances: Union[int, str] = 100,
        avg_output: Optional[float] = None,
        feature_names: Optional[List[str]] = None,
        target_name: Optional[str] = None,
):
    """
    Constructor of the SHAPDependence class.

    Definition:
        The value of a coalition of $S$ features is estimated as:
        $$
        \hat{v}(S) = {1 \over N} \sum_{i=1}^N  f(x_S \cup x_C^i) - f(x^i)
        $$
        The value of a coalition $S$ quantifies what the values $\mathbf{x}_S$ of the features in $S$ contribute to the output of the model. It
        is the average (over all instances) difference on the output between setting features in $S$ to be $x_S$, i.e., $\mathbf{x} = (\mathbf{x}_S, \mathbf{x}_C^i)$ and leaving the instance as it is, i.e., $\mathbf{x}^i = (\mathbf{x}_S^i, \mathbf{x}_C^i)$.

        The contribution of a feature $j$ added to a coalition $S$ is estimated as:
        $$
        \hat{\Delta}_{S, j} = \hat{v}(S \cup \{j\}) - \hat{v}(S)
        $$

        The SHAP value of a feature $j$ with value $x_j$ is the average contribution of feature $j$ across all possible coalitions with a weight $w_{S, j}$:

        $$
        \hat{\phi}_j(x_j) = {1 \over N} \sum_{S \subseteq \{1, \dots, D\} \setminus \{j\}} w_{S, j} \hat{\Delta}_{S, j}
        $$

        where $w_{S, j}$ assures that the contribution of feature $j$ is the same for all coalitions of the same size. For example, there are $D-1$ ways for $x_j$ to enter a coalition of $|S| = 1$ feature, so $w_{S, j} = {1 \over D (D-1)}$ for each of them. In contrast, there is only one way for $x_j$ to enter a coalition of $|S|=0$ (to be the first specified feature), so $w_{S, j} = {1 \over D}$.

        The SHAP Dependence Plot (SHAP-DP) is a spline $\hat{f}^{SDP}_j(x_j)$ fit to the dataset $\{(x_j^i, \hat{\phi}_j(x_j^i))\}_{i=1}^N$ using the `UnivariateSpline` function from `scipy.interpolate`.

    Notes:
        * The required parameters are `data` and `model`. The rest are optional.
        * SHAP values are computed using the `shap` package, using the class `Explainer`.
        * SHAP values are centered by default, i.e., the average SHAP value is subtracted from the SHAP values.
        * More details on the SHAP values can be found in the [original paper](https://arxiv.org/abs/1705.07874) and in the book [Interpreting Machine Learning Models with SHAP](https://christophmolnar.com/books/shap/)

    Args:
        data: the design matrix

            - shape: `(N,D)`
        model: the black-box model. Must be a `Callable` with:

            - input: `ndarray` of shape `(N, D)`
            - output: `ndarray` of shape `(N,)`

        axis_limits: The limits of the feature effect plot along each axis

            - use a `ndarray` of shape `(2, D)`, to specify them manually
            - use `None`, to be inferred from the data

        nof_instances: maximum number of instances to be used for SHAP estimation.

            - use "all", for using all instances.
            - use an `int`, for using `nof_instances` instances.

        avg_output: The average output of the model.

            - use a `float`, to specify it manually
            - use `None`, to be inferred as `np.mean(model(data))`

        feature_names: The names of the features

            - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
            - use `None`, to keep the default names: `["x_0", "x_1", ...]`

        target_name: The name of the target variable

            - use a `str`, to specify its name manually. For example: `"price"`
            - use `None`, to keep the default name: `"y"`
    """
    self.nof_instances, self.indices = helpers.prep_nof_instances(
        nof_instances, data.shape[0]
    )
    data = data[self.indices, :]

    super(ShapDP, self).__init__(
        "SHAP DP", data, model, nof_instances, axis_limits, avg_output, feature_names, target_name
    )
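
A minimal construction sketch (toy data and model; the shap package must be installed, as the Notes above state):

import numpy as np
from effector.global_effect_shap import ShapDP

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(200, 3))
model = lambda x: x[:, 0] + x[:, 1] ** 2

# SHAP values are expensive; keep nof_instances small (100 is the default)
shap_dp = ShapDP(data=X, model=model, nof_instances=100)
shap_dp.plot(feature=0, heterogeneity="shap_values")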

fit(features='all', centering=False, points_for_centering=100)

Fit the SHAP Dependence Plot to the data.

Notes

The SHAP Dependence Plot (SDP) \(\hat{f}^{SDP}_j(x_j)\) is a spline fit to the dataset \(\{(x_j^i, \hat{\phi}_j(x_j^i))\}_{i=1}^N\) using the UnivariateSpline function from scipy.interpolate.

The SHAP standard deviation, \(\hat{\sigma}^{SDP}_j(x_j)\), is a spline fit to the absolute value of the residuals, i.e., to the dataset \(\{(x_j^i, |\hat{\phi}_j(x_j^i) - \hat{f}^{SDP}_j(x_j^i)|)\}_{i=1}^N\), using the UnivariateSpline function from scipy.interpolate.

Parameters:

features (Union[int, str, List], default 'all'):
    the features to fit.
      • If set to "all", all the features will be fitted.

centering (Union[bool, str], default False):
      • If set to False, no centering will be applied.
      • If set to "zero_integral" or True, the integral of the feature effect will be set to zero.
      • If set to "zero_start", the curve will start from y=0.

points_for_centering (Union[int, str], default 100):
    number of linspaced points along the feature axis used for centering.
      • If set to "all", all the dataset points will be used.

Notes

SHAP values are by default centered, i.e., \(\sum_{i=1}^N \hat{\phi}_j(x_j^i) = 0\). This does not mean that the SHAP curve is centered around zero; this happens only if the \(s\)-th feature of the dataset instances, i.e., the set \(\{x_s^i\}_{i=1}^N\) is uniformly distributed along the \(s\)-th axis. So, use:

  • centering=False, to leave the SHAP values as they are.
  • centering=True or centering=zero_integral, to center the SHAP curve around the y axis.
  • centering=zero_start, to start the SHAP curve from y=0.

SHAP values are expensive to compute. To speed up the computation consider using a subset of the dataset points for computing the SHAP values and for centering the spline. The default values (nof_instances=100 and points_for_centering=100) are a moderate choice.

Source code in /home/runner/work/effector/effector/effector/global_effect_shap.py
def fit(
        self,
        features: Union[int, str, List] = "all",
        centering: Union[bool, str] = False,
        points_for_centering: Union[int, str] = 100,
) -> None:
    """Fit the SHAP Dependence Plot to the data.

    Notes:
        The SHAP Dependence Plot (SDP) $\hat{f}^{SDP}_j(x_j)$ is a spline fit to
        the dataset $\{(x_j^i, \hat{\phi}_j(x_j^i))\}_{i=1}^N$
        using the `UnivariateSpline` function from `scipy.interpolate`.

        The SHAP standard deviation, $\hat{\sigma}^{SDP}_j(x_j)$, is a spline fit to the absolute value of the residuals, i.e., to the dataset $\{(x_j^i, |\hat{\phi}_j(x_j^i) - \hat{f}^{SDP}_j(x_j^i)|)\}_{i=1}^N$, using the `UnivariateSpline` function from `scipy.interpolate`.

    Args:
        features: the features to fit.
            - If set to "all", all the features will be fitted.
        centering:
            - If set to False, no centering will be applied.
            - If set to "zero_integral" or True, the integral of the feature effect will be set to zero.
            - If set to "zero_mean", the mean of the feature effect will be set to zero.

        points_for_centering: number of linspaced points along the feature axis used for centering.

            - If set to `all`, all the dataset points will be used.

    Notes:
        SHAP values are by default centered, i.e., $\sum_{i=1}^N \hat{\phi}_j(x_j^i) = 0$. This does not mean that the SHAP _curve_ is centered around zero; this happens only if the $s$-th feature of the dataset instances, i.e., the set $\{x_s^i\}_{i=1}^N$ is uniformly distributed along the $s$-th axis. So, use:

        * `centering=False`, to leave the SHAP values as they are.
        * `centering=True` or `centering=zero_integral`, to center the SHAP curve around the `y` axis.
        * `centering=zero_start`, to start the SHAP curve from `y=0`.

        SHAP values are expensive to compute.
        To speed up the computation consider using a subset of the dataset
        points for computing the SHAP values and for centering the spline.
        The default values (`nof_instances=100`
        and `points_for_centering=100`) are a moderate choice.
    """
    centering = helpers.prep_centering(centering)
    features = helpers.prep_features(features, self.dim)

    # new implementation
    for s in features:
        self.feature_effect["feature_" + str(s)] = self._fit_feature(
            s, centering, points_for_centering
        )
        self.is_fitted[s] = True
        self.method_args["feature_" + str(s)] = {
            "centering": centering,
            "points_for_centering": points_for_centering,
        }
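
A fitting sketch under toy assumptions:

import numpy as np
from effector.global_effect_shap import ShapDP

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(200, 2))
model = lambda x: np.sin(3 * x[:, 0]) + x[:, 1]

shap_dp = ShapDP(data=X, model=model)

# fit the SHAP-DP spline of feature 0 so that the curve starts from y = 0
shap_dp.fit(features=0, centering="zero_start")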

eval(feature, xs, heterogeneity=False, centering=False)

Evaluate the effect of the s-th feature at positions xs.

Parameters:

feature (int, required):
    index of the feature of interest

xs (np.ndarray, required):
    the points along the s-th axis to evaluate the FE plot
      • np.ndarray of shape (T,)

heterogeneity (bool, default False):
    whether to return the heterogeneity measures.
      • If heterogeneity=False, the function returns the mean effect at the given xs
      • If heterogeneity=True, the function returns (y, std) where y is the mean effect and std is the standard deviation of the mean effect

centering (Union[bool, str], default False):
    whether to center the plot
      • If centering is False, the SHAP curve is not centered
      • If centering is True or zero_integral, the SHAP curve is centered around the y axis.
      • If centering is zero_start, the SHAP curve starts from y=0.

Returns (Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]):
    the mean effect y, if heterogeneity=False (default) or a tuple (y, std) otherwise

Source code in /home/runner/work/effector/effector/effector/global_effect_shap.py
def eval(
    self,
    feature: int,
    xs: np.ndarray,
    heterogeneity: bool = False,
    centering: typing.Union[bool, str] = False,
) -> typing.Union[np.ndarray, typing.Tuple[np.ndarray, np.ndarray]]:
    """Evaluate the effect of the s-th feature at positions `xs`.

    Args:
        feature: index of feature of interest
        xs: the points along the s-th axis to evaluate the FE plot

          - `np.ndarray` of shape `(T,)`
        heterogeneity: whether to return the heterogeneity measures.

              - if `heterogeneity=False`, the function returns the mean effect at the given `xs`
              - If `heterogeneity=True`, the function returns `(y, std)` where `y` is the mean effect and `std` is the standard deviation of the mean effect

        centering: whether to center the plot

            - If `centering` is `False`, the SHAP curve is not centered
            - If `centering` is `True` or `zero_integral`, the SHAP curve is centered around the `y` axis.
            - If `centering` is `zero_start`, the SHAP curve starts from `y=0`.

    Returns:
        the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise
    """
    centering = helpers.prep_centering(centering)

    if self.refit(feature, centering):
        self.fit(features=feature, centering=centering)

    # Check if the lower bound is less than the upper bound
    assert self.axis_limits[0, feature] < self.axis_limits[1, feature]

    yy = self.feature_effect["feature_" + str(feature)]["spline_mean"](xs)

    if centering is not False:
        norm_const = self.feature_effect["feature_" + str(feature)]["norm_const"]
        yy = yy - norm_const

    if heterogeneity:
        yy_std = self.feature_effect["feature_" + str(feature)]["spline_std"](xs)
        return yy, yy_std
    else:
        return yy
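
Continuing the sketch above (assuming the shap_dp object fitted earlier), eval returns the mean SHAP curve on a grid, optionally together with its heterogeneity:

xs = np.linspace(-1, 1, 30)

# mean effect only
y = shap_dp.eval(feature=0, xs=xs, centering=True)

# mean effect plus the standard deviation of the SHAP values
y, y_std = shap_dp.eval(feature=0, xs=xs, heterogeneity=True, centering=True)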

plot(feature, heterogeneity=False, centering=False, nof_points=30, scale_x=None, scale_y=None, nof_shap_values='all', show_avg_output=False, y_limits=None)

Plot the SHAP Dependence Plot (SDP) of the s-th feature.

Parameters:

- feature (int, required): index of the plotted feature
- heterogeneity (Union[bool, str], default: False): whether to output the heterogeneity of the SHAP values
  • If heterogeneity is False, no heterogeneity is plotted
  • If heterogeneity is True or "std", the standard deviation of the SHAP values is plotted
  • If heterogeneity is "shap_values", the SHAP values are scattered on top of the SHAP curve
- centering (Union[bool, str], default: False): whether to center the SDP
  • If centering is False, the SHAP curve is not centered
  • If centering is True or zero_integral, the SHAP curve is centered so that it has zero mean over the feature axis.
  • If centering is zero_start, the SHAP curve starts from y=0.
- nof_points (int, default: 30): number of points to evaluate the SDP plot
- scale_x (Optional[dict], default: None): dictionary with keys "mean" and "std" for scaling the x-axis
- scale_y (Optional[dict], default: None): dictionary with keys "mean" and "std" for scaling the y-axis
- nof_shap_values (Union[int, str], default: 'all'): number of SHAP values to show on top of the SHAP curve
- show_avg_output (bool, default: False): whether to show the average output of the model
- y_limits (Optional[List], default: None): limits of the y-axis
Source code in /home/runner/work/effector/effector/effector/global_effect_shap.py
def plot(
    self,
    feature: int,
    heterogeneity: Union[bool, str] = False,
    centering: Union[bool, str] = False,
    nof_points: int = 30,
    scale_x: Optional[dict] = None,
    scale_y: Optional[dict] = None,
    nof_shap_values: Union[int, str] = "all",
    show_avg_output: bool = False,
    y_limits: Optional[List] = None,
) -> None:
    """
    Plot the SHAP Dependence Plot (SDP) of the s-th feature.

    Args:
        feature: index of the plotted feature
        heterogeneity: whether to output the heterogeneity of the SHAP values

            - If `heterogeneity` is `False`, no heterogeneity is plotted
            - If `heterogeneity` is `True` or `"std"`, the standard deviation of the shap values is plotted
            - If `heterogeneity` is `"shap_values"`, the shap values are scattered on top of the SHAP curve

        centering: whether to center the SDP

            - If `centering` is `False`, the SHAP curve is not centered
            - If `centering` is `True` or `zero_integral`, the SHAP curve is centered so that it has zero mean over the feature axis.
            - If `centering` is `zero_start`, the SHAP curve starts from `y=0`.

        nof_points: number of points to evaluate the SDP plot
        scale_x: dictionary with keys "mean" and "std" for scaling the x-axis
        scale_y: dictionary with keys "mean" and "std" for scaling the y-axis
        nof_shap_values: number of shap values to show on top of the SHAP curve
        show_avg_output: whether to show the average output of the model
        y_limits: limits of the y-axis
    """
    heterogeneity = helpers.prep_confidence_interval(heterogeneity)

    x = np.linspace(
        self.axis_limits[0, feature], self.axis_limits[1, feature], nof_points
    )

    # get the SHAP curve
    y = self.eval(feature, x, heterogeneity=False, centering=centering)
    y_std = (
        self.feature_effect["feature_" + str(feature)]["spline_std"](x)
        if heterogeneity == "std"
        else None
    )

    # get some SHAP values
    _, ind = helpers.prep_nof_instances(nof_shap_values, self.data.shape[0])
    yy = (
        self.feature_effect["feature_" + str(feature)]["yy"][ind]
        if heterogeneity == "shap_values"
        else None
    )
    if yy is not None and centering is not False:
        yy = yy - self.feature_effect["feature_" + str(feature)]["norm_const"]
    xx = (
        self.feature_effect["feature_" + str(feature)]["xx"][ind]
        if heterogeneity == "shap_values"
        else None
    )

    if show_avg_output:
        avg_output = helpers.prep_avg_output(self.data, self.model, self.avg_output, scale_y)
    else:
        avg_output = None

    vis.plot_shap(
        x,
        y,
        xx,
        yy,
        y_std,
        feature,
        heterogeneity=heterogeneity,
        scale_x=scale_x,
        scale_y=scale_y,
        avg_output=avg_output,
        feature_names=self.feature_names,
        target_name=self.target_name,
        y_limits=y_limits
    )
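
A short usage sketch for plot, continuing the toy example above (the argument values are illustrative):

# plot the SDP of feature 0, scatter 200 SHAP values on top of the curve,
# and mark the average model output
shap_dp.plot(
    feature=0,
    heterogeneity="shap_values",
    centering=True,
    nof_shap_values=200,
    show_avg_output=True,
)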

Regional Effect Methods

effector.regional_effect.RegionalEffectBase

Source code in /home/runner/work/effector/effector/effector/regional_effect.py
class RegionalEffectBase:
    empty_symbol = helpers.EMPTY_SYMBOL

    def __init__(
        self,
        method_name: str,
        data: np.ndarray,
        model: Callable,
        model_jac: Optional[Callable] = None,
        data_effect: Optional[np.ndarray] = None,
        nof_instances: Union[int, str] = 100,
        axis_limits: Optional[np.ndarray] = None,
        feature_types: Optional[List] = None,
        cat_limit: Optional[int] = 10,
        feature_names: Optional[List] = None,
        target_name: Optional[str] = None,
    ) -> None:
        """
        Constructor for the RegionalEffect class.
        """
        self.method_name = method_name.lower()
        self.model = model
        self.model_jac = model_jac

        # select nof_instances from the data
        self.nof_instances, self.indices = helpers.prep_nof_instances(
            nof_instances, data.shape[0]
        )
        self.data = data[self.indices, :]
        self.instance_effects = data_effect[self.indices, :] if data_effect is not None else None
        self.dim = self.data.shape[1]

        # set axis_limits
        axis_limits = (
            helpers.axis_limits_from_data(data) if axis_limits is None else axis_limits
        )
        self.axis_limits: np.ndarray = axis_limits

        # set feature types
        self.cat_limit = cat_limit
        feature_types = (
            utils.get_feature_types(data, cat_limit)
            if feature_types is None
            else feature_types
        )
        self.feature_types: list = feature_types

        # set feature names
        feature_names: list[str] = (
            helpers.get_feature_names(axis_limits.shape[1])
            if feature_names is None
            else feature_names
        )
        self.feature_names: list = feature_names

        # set target name
        self.target_name = "y" if target_name is None else target_name

        # state variables
        self.is_fitted: np.ndarray = np.ones([self.dim]) < 0

        # parameters used when fitting the regional effect
        self.method_args: typing.Dict = {}

        # dictionary with all the information required for plotting or evaluating the regional effects
        self.partitioners: typing.Dict[str, Regions] = {}
        self.tree_full: typing.Dict[str, Tree] = {}
        self.tree_pruned: typing.Dict[str, Tree] = {}
        self.tree_full_scaled: typing.Dict[str, Tree] = {}
        self.tree_pruned_scaled: typing.Dict[str, Tree] = {}

    def _fit_feature(
        self,
        feature: int,
        heter_func: Callable,
        heter_pcg_drop_thres: float = 0.1,
        heter_small_enough: float = 0.1,
        max_split_levels: int = 2,
        candidate_positions_for_numerical: int = 20,
        min_points_per_subregion: int = 10,
        candidate_foc: Union[str, List] = "all",
        split_categorical_features: bool = False,
    ):
        """
        Find the subregions for a single feature.
        """
        # init Region Extractor
        regions = Regions(
            feature,
            heter_func,
            self.data,
            self.instance_effects,
            self.feature_types,
            self.feature_names,
            self.target_name,
            self.cat_limit,
            candidate_foc,
            min_points_per_subregion,
            candidate_positions_for_numerical,
            max_split_levels,
            heter_pcg_drop_thres,
            heter_small_enough,
            split_categorical_features,
        )

        # apply partitioning
        regions.search_all_splits()
        regions.choose_important_splits()
        self.tree_full["feature_{}".format(feature)] = regions.splits_to_tree()
        self.tree_pruned["feature_{}".format(feature)] = regions.splits_to_tree(True)

        # store the partitioning object
        self.partitioners["feature_{}".format(feature)] = regions

        # update state
        self.is_fitted[feature] = True

    def refit(self, feature):
        if not self.is_fitted[feature]:
            self.fit(feature)

    def get_node_info(self, feature, node_idx):
        assert self.is_fitted[feature], "Feature {} has not been fitted yet".format(feature)
        assert self.tree_pruned["feature_{}".format(feature)] is not None, "Feature {} has no splits".format(feature)

        if self.tree_pruned_scaled is not None and "feature_{}".format(feature) in self.tree_pruned_scaled.keys():
            tree = self.tree_pruned_scaled["feature_{}".format(feature)]
        else:
            tree = self.tree_pruned["feature_{}".format(feature)]

        # assert node id exists
        assert node_idx in [node.idx for node in tree.nodes], "Node {} does not exist".format(node_idx)

        # find the node
        node = [node for node in tree.nodes if node.idx == node_idx][0]

        # get data
        data = node.data["data"]
        data_effect = node.data["data_effect"]
        name = node.name
        return data, data_effect, name

    def _create_fe_object(self, data, data_effect, feature_names):
        if self.method_name == "rhale":
            return RHALE(data, self.model, self.model_jac, data_effect=data_effect, feature_names=feature_names, target_name=self.target_name)
        elif self.method_name == "ale":
            return ALE(data, self.model, feature_names=feature_names, target_name=self.target_name)
        elif self.method_name == "shap":
            return ShapDP(data, self.model, feature_names=feature_names, target_name=self.target_name)
        elif self.method_name == "pdp":
            return PDP(data, self.model, feature_names=feature_names, target_name=self.target_name)
        elif self.method_name == "d-pdp":
            return DerPDP(data, self.model, self.model_jac, feature_names=feature_names, target_name=self.target_name)
        else:
            raise NotImplementedError

    def eval(self, feature, node_idx, xs, heterogeneity=False, centering=False):
        """
        Evaluate the regional effect for a given feature and node.

        Args:
            feature: the feature to evaluate
            node_idx: the node corresponding to the subregion to evaluate
            xs: the points at which to evaluate the regional effect
            heterogeneity: whether to return the heterogeneity.

                  - if `heterogeneity=False`, the function returns the mean effect at the given `xs`
                  - If `heterogeneity=True`, the function returns `(y, std)` where `y` is the mean effect and `std` is the standard deviation of the mean effect

            centering: whether to center the regional effect. The following options are available:

                - If `centering` is `False`, the regional effect is not centered
                - If `centering` is `True` or `zero_integral`, the regional effect is centered so that it has zero mean over the feature axis.
                - If `centering` is `zero_start`, the regional effect starts from `y=0`.

        Returns:
            the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise

        """
        self.refit(feature)
        centering = helpers.prep_centering(centering)
        data, data_effect, _ = self.get_node_info(feature, node_idx)
        fe_method = self._create_fe_object(data, data_effect, None)
        return fe_method.eval(feature, xs, heterogeneity, centering)

    def fit(self, *args, **kwargs):
        raise NotImplementedError

    def plot(self,
             feature,
             node_idx,
             heterogeneity=False,
             centering=False,
             scale_x_list=None,
             scale_y=None,
             y_limits=None):

        self.refit(feature)

        if scale_x_list is not None:
            self.tree_full_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(False, scale_x_list)
            self.tree_pruned_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(True, scale_x_list)

        data, data_effect, name = self.get_node_info(feature, node_idx)
        feature_names = copy.deepcopy(self.feature_names)
        feature_names[feature] = name
        fe_method = self._create_fe_object(data, data_effect, feature_names)

        return fe_method.plot(
            feature=feature,
            heterogeneity=heterogeneity,
            centering=centering,
            scale_x=scale_x_list[feature] if scale_x_list is not None else None,
            scale_y=scale_y,
            y_limits=y_limits
            )

    def show_partitioning(self, features, only_important=True, scale_x_list=None):
        features = helpers.prep_features(features, self.dim)

        for feat in features:
            self.refit(feat)

            if scale_x_list is not None:
                tree_full_scaled = self.partitioners["feature_{}".format(feat)].splits_to_tree(False, scale_x_list)
                tree_pruned_scaled = self.partitioners["feature_{}".format(feat)].splits_to_tree(True, scale_x_list)
                tree_dict = tree_pruned_scaled if only_important else tree_full_scaled
            else:
                tree_dict = self.tree_pruned["feature_{}".format(feat)] if only_important else self.tree_full["feature_{}".format(feat)]

            print("Feature {} - Full partition tree:".format(feat))

            if tree_dict is None:
                print("No splits found for feature {}".format(feat))
            else:
                tree_dict.show_full_tree()

            print("-" * 50)
            print("Feature {} - Statistics per tree level:".format(feat))

            if tree_dict is None:
                print("No splits found for feature {}".format(feat))
            else:
                tree_dict.show_level_stats()

    def describe_subregions(
        self,
        features,
        only_important=True,
        scale_x_list: typing.Union[None, typing.List[dict]] = None,
    ):
        features = helpers.prep_features(features, self.dim)
        for feature in features:
            self.refit(feature)

            # a None tree means the feature was not split (e.g., it is categorical)
            if self.tree_full["feature_{}".format(feature)] is None:
                continue

            feature_name = self.feature_names[feature]
            if only_important:
                tree = self.tree_pruned["feature_{}".format(feature)]
                if len(tree.nodes) == 1:
                    print("No important splits found for feature {}".format(feature))
                    continue
                else:
                    print("Important splits for feature {}".format(feature_name))
            else:
                print("All splits for feature {}".format(feature_name))
                tree = self.tree_full["feature_{}".format(feature)]

            max_level = max([node.level for node in tree.nodes])
            for level in range(1, max_level+1):
                previous_level_nodes = tree.get_level_nodes(level-1)
                level_nodes = tree.get_level_nodes(level)
                type_of_split_feature = level_nodes[0].data["feature_type"]
                foc_name = self.feature_names[level_nodes[0].data["feature"]]
                print("- On feature {} ({})".format(foc_name, type_of_split_feature))

                position_split_formatted = (
                    "{:.2f}".format(level_nodes[0].data["position"])
                    if scale_x_list is None
                    else "{:.2f}".format(
                        level_nodes[0].data["position"] * scale_x_list[level_nodes[0].data["feature"]]["std"]
                        + scale_x_list[level_nodes[0].data["feature"]]["mean"]
                    )
                )
                print("  - Position of split: {}".format(position_split_formatted))

                weight_heter_before = np.sum([node.data["weight"] * node.data["heterogeneity"] for node in previous_level_nodes])
                print("  - Heterogeneity before split: {:.2f}".format(weight_heter_before))

                weight_heter = np.sum([node.data["weight"] * node.data["heterogeneity"] for node in level_nodes])
                print("  - Heterogeneity after split: {:.2f}".format(weight_heter))
                weight_heter_drop = weight_heter_before - weight_heter
                print("  - Heterogeneity drop: {:.2f} ({:.2f} %)".format(
                    weight_heter_drop, weight_heter_drop / weight_heter_before * 100)
                )

                nof_instances_before = [nod.data["nof_instances"] for nod in previous_level_nodes]
                print("  - Number of instances before split: {}".format(nof_instances_before))
                nof_instances = [nod.data["nof_instances"] for nod in level_nodes]
                print("  - Number of instances after split: {}".format(nof_instances))

eval(feature, node_idx, xs, heterogeneity=False, centering=False)

Evaluate the regional effect for a given feature and node.

Parameters:

- feature (required): the feature to evaluate
- node_idx (required): the node corresponding to the subregion to evaluate
- xs (required): the points at which to evaluate the regional effect
- heterogeneity (default: False): whether to return the heterogeneity.
  • if heterogeneity=False, the function returns the mean effect at the given xs
  • If heterogeneity=True, the function returns (y, std) where y is the mean effect and std is the standard deviation of the mean effect
- centering (default: False): whether to center the regional effect. The following options are available:
  • If centering is False, the regional effect is not centered
  • If centering is True or zero_integral, the regional effect is centered so that it has zero mean over the feature axis.
  • If centering is zero_start, the regional effect starts from y=0.

Returns:

- the mean effect y, if heterogeneity=False (default) or a tuple (y, std) otherwise

Source code in /home/runner/work/effector/effector/effector/regional_effect.py
def eval(self, feature, node_idx, xs, heterogeneity=False, centering=False):
    """
    Evaluate the regional effect for a given feature and node.

    Args:
        feature: the feature to evaluate
        node_idx: the node corresponding to the subregion to evaluate
        xs: the points at which to evaluate the regional effect
        heterogeneity: whether to return the heterogeneity.

              - if `heterogeneity=False`, the function returns the mean effect at the given `xs`
              - If `heterogeneity=True`, the function returns `(y, std)` where `y` is the mean effect and `std` is the standard deviation of the mean effect

        centering: whether to center the regional effect. The following options are available:

            - If `centering` is `False`, the regional effect is not centered
            - If `centering` is `True` or `zero_integral`, the regional effect is centered so that it has zero mean over the feature axis.
            - If `centering` is `zero_start`, the regional effect starts from `y=0`.

    Returns:
        the mean effect `y`, if `heterogeneity=False` (default) or a tuple `(y, std)` otherwise

    """
    self.refit(feature)
    centering = helpers.prep_centering(centering)
    data, data_effect, _ = self.get_node_info(feature, node_idx)
    fe_method = self._create_fe_object(data, data_effect, None)
    return fe_method.eval(feature, xs, heterogeneity, centering)
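
A usage sketch continuing the regional example above (node indices refer to the partition tree printed by show_partitioning; node 1 is assumed to exist here, so inspect the printed tree first):

xs = np.linspace(-1, 1, 30)

# mean regional effect of feature 0 inside the subregion of node 1
y = reg_pdp.eval(feature=0, node_idx=1, xs=xs, centering=True)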

plot(feature, node_idx, heterogeneity=False, centering=False, scale_x_list=None, scale_y=None, y_limits=None)

Source code in /home/runner/work/effector/effector/effector/regional_effect.py
def plot(self,
         feature,
         node_idx,
         heterogeneity=False,
         centering=False,
         scale_x_list=None,
         scale_y=None,
         y_limits=None):

    self.refit(feature)

    if scale_x_list is not None:
        self.tree_full_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(False, scale_x_list)
        self.tree_pruned_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(True, scale_x_list)

    data, data_effect, name = self.get_node_info(feature, node_idx)
    feature_names = copy.deepcopy(self.feature_names)
    feature_names[feature] = name
    fe_method = self._create_fe_object(data, data_effect, feature_names)

    return fe_method.plot(
        feature=feature,
        heterogeneity=heterogeneity,
        centering=centering,
        scale_x=scale_x_list[feature] if scale_x_list is not None else None,
        scale_y=scale_y,
        y_limits=y_limits
        )

get_node_info(feature, node_idx)

Source code in /home/runner/work/effector/effector/effector/regional_effect.py
def get_node_info(self, feature, node_idx):
    assert self.is_fitted[feature], "Feature {} has not been fitted yet".format(feature)
    assert self.tree_pruned["feature_{}".format(feature)] is not None, "Feature {} has no splits".format(feature)

    if self.tree_pruned_scaled is not None and "feature_{}".format(feature) in self.tree_pruned_scaled.keys():
        tree = self.tree_pruned_scaled["feature_{}".format(feature)]
    else:
        tree = self.tree_pruned["feature_{}".format(feature)]

    # assert node id exists
    assert node_idx in [node.idx for node in tree.nodes], "Node {} does not exist".format(node_idx)

    # find the node
    node = [node for node in tree.nodes if node.idx == node_idx][0]

    # get data
    data = node.data["data"]
    data_effect = node.data["data_effect"]
    name = node.name
    return data, data_effect, name
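
A short sketch (assumes the fitted reg_pdp from the earlier example): get_node_info returns the data subset, the instance effects, and the human-readable name of a node:

data, data_effect, name = reg_pdp.get_node_info(feature=0, node_idx=1)
print(name)        # e.g. a condition describing the subregion
print(data.shape)  # instances that fall into the subregion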

effector.regional_effect_ale.RegionalALE

Bases: RegionalEffectBase

Source code in /home/runner/work/effector/effector/effector/regional_effect_ale.py
class RegionalALE(RegionalEffectBase):
    def __init__(
        self,
        data: np.ndarray,
        model: callable,
        nof_instances: typing.Union[int, str] = "all",
        axis_limits: typing.Union[None, np.ndarray] = None,
        feature_types: typing.Union[list, None] = None,
        cat_limit: typing.Union[int, None] = 10,
        feature_names: typing.Union[list, None] = None,
        target_name: typing.Union[str, None] = None,
    ):
        """
        Regional ALE constructor.

        Args:
            data: X matrix (N,D).
            model: the black-box model (N,D) -> (N, )
            axis_limits: axis limits for the FE plot [2, D] or None. If None, axis limits are computed from the data.
            feature_types: list of feature types (categorical or numerical)
            cat_limit: the minimum number of unique values for a feature to be considered categorical
            feature_names: list of feature names
        """
        super(RegionalALE, self).__init__(
            "ale",
            data,
            model,
            None,
            None,
            nof_instances,
            axis_limits,
            feature_types,
            cat_limit,
            feature_names,
            target_name
        )

    def _create_heterogeneity_function(self, foi, binning_method, min_points, centering):
        binning_method = prep_binning_method(binning_method)
        assert isinstance(binning_method, binning_methods.Fixed), "RegionalALE requires a Fixed binning method"

        def heter(data, instance_effects=None) -> float:
            if data.shape[0] < min_points:
                return BIG_M

            ale = ALE(data, self.model, "all", None, instance_effects)
            try:
                ale.fit(features=foi, binning_method=binning_method, centering=centering)
            except Exception:
                return BIG_M

            # heterogeneity is the accumulated std at the end of the curve
            axis_limits = helpers.axis_limits_from_data(data)
            stop = np.array([axis_limits[:, foi][1]])
            _, z = ale.eval(feature=foi, xs=stop, heterogeneity=True)
            return z.item()

        return heter

    def fit(
        self,
        features: typing.Union[int, str, list],
        heter_pcg_drop_thres: float = 0.1,
        heter_small_enough: float = 0.1,
        max_depth: int = 1,
        nof_candidate_splits_for_numerical: int = 20,
        min_points_per_subregion: int = 10,
        candidate_conditioning_features: typing.Union["str", list] = "all",
        split_categorical_features: bool = False,
        binning_method: typing.Union[str, binning_methods.Fixed] = binning_methods.Fixed(nof_bins=20, min_points_per_bin=0),
        centering: typing.Union[bool, str] = False,
    ):
        """
        Find the Regional ALE for a list of features.

        Args:
            features: list of features to fit
            heter_pcg_drop_thres: heterogeneity drop threshold for a split to be considered important
            heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
            binning_method: binning method to use
            max_depth: maximum number of splits to perform (depth of the tree)
            nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
            min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
            candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
        """

        assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
        features = helpers.prep_features(features, self.dim)
        for feat in tqdm(features):
            heter = self._create_heterogeneity_function(
                feat, binning_method, min_points_per_subregion, centering
            )

            self._fit_feature(
                feat,
                heter,
                heter_pcg_drop_thres,
                heter_small_enough,
                max_depth,
                nof_candidate_splits_for_numerical,
                min_points_per_subregion,
                candidate_conditioning_features,
                split_categorical_features,
            )

            self.method_args["feature_" + str(feat)] = {
                "heter_pcg_drop_thres": heter_pcg_drop_thres,
                "heter_small_enough": heter_small_enough,
                "max_depth": max_depth,
                "nof_candidate_splits_for_numerical": nof_candidate_splits_for_numerical,
                "min_points_per_subregion": min_points_per_subregion,
                "candidate_conditioning_features": candidate_conditioning_features,
                "split_categorical_features": split_categorical_features,
                "binning_method": binning_method,
                "centering": centering,
            }

    def plot(self,
             feature,
             node_idx,
             heterogeneity=False,
             centering=False,
             scale_x_list=None,
             scale_y=None,
             y_limits=None,
             dy_limits=None):

        # get data from the node
        self.refit(feature)

        if scale_x_list is not None:
            self.tree_full_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(False, scale_x_list)
            self.tree_pruned_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(True, scale_x_list)

        data, data_effect, name = self.get_node_info(feature, node_idx)
        feature_names = copy.deepcopy(self.feature_names)
        feature_names[feature] = name

        # define the method and fit
        self.method_args["feature_" + str(feature)]["heterogeneity"] = heterogeneity
        rhale = RHALE(data, self.model, self.model_jac, self.nof_instances, None, data_effect, feature_names=feature_names)
        binning_method = prep_binning_method(self.method_args["feature_" + str(feature)]["binning_method"])
        rhale.fit(features=feature, binning_method=binning_method, centering=centering)
        scale_x = scale_x_list[feature] if scale_x_list is not None else None
        rhale.plot(feature=feature, heterogeneity=heterogeneity, centering=centering, scale_x=scale_x, scale_y=scale_y, y_limits=y_limits, dy_limits=dy_limits)
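
A minimal usage sketch for RegionalALE (illustrative; it assumes binning_methods is importable from effector, as in the signatures above):

import numpy as np
import effector
from effector import binning_methods

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(1000, 2))
model = lambda x: np.where(x[:, 1] > 0, x[:, 0], -x[:, 0])

reg_ale = effector.RegionalALE(X, model)
reg_ale.fit(
    features=0,
    binning_method=binning_methods.Fixed(nof_bins=20, min_points_per_bin=0),
)
reg_ale.plot(feature=0, node_idx=0, heterogeneity=True, centering=True)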

__init__(data, model, nof_instances='all', axis_limits=None, feature_types=None, cat_limit=10, feature_names=None, target_name=None)

Regional ALE constructor.

Parameters:

- data (np.ndarray, required): X matrix (N,D).
- model (callable, required): the black-box model (N,D) -> (N, )
- axis_limits (typing.Union[None, np.ndarray], default: None): axis limits for the FE plot [2, D] or None. If None, axis limits are computed from the data.
- feature_types (typing.Union[list, None], default: None): list of feature types (categorical or numerical)
- cat_limit (typing.Union[int, None], default: 10): the minimum number of unique values for a feature to be considered categorical
- feature_names (typing.Union[list, None], default: None): list of feature names
Source code in /home/runner/work/effector/effector/effector/regional_effect_ale.py
def __init__(
    self,
    data: np.ndarray,
    model: callable,
    nof_instances: typing.Union[int, str] = "all",
    axis_limits: typing.Union[None, np.ndarray] = None,
    feature_types: typing.Union[list, None] = None,
    cat_limit: typing.Union[int, None] = 10,
    feature_names: typing.Union[list, None] = None,
    target_name: typing.Union[str, None] = None,
):
    """
    Regional ALE constructor.

    Args:
        data: X matrix (N,D).
        model: the black-box model (N,D) -> (N, )
        axis_limits: axis limits for the FE plot [2, D] or None. If None, axis limits are computed from the data.
        feature_types: list of feature types (categorical or numerical)
        cat_limit: the minimum number of unique values for a feature to be considered categorical
        feature_names: list of feature names
    """
    super(RegionalALE, self).__init__(
        "ale",
        data,
        model,
        None,
        None,
        nof_instances,
        axis_limits,
        feature_types,
        cat_limit,
        feature_names,
        target_name
    )

fit(features, heter_pcg_drop_thres=0.1, heter_small_enough=0.1, max_depth=1, nof_candidate_splits_for_numerical=20, min_points_per_subregion=10, candidate_conditioning_features='all', split_categorical_features=False, binning_method=binning_methods.Fixed(nof_bins=20, min_points_per_bin=0), centering=False)

Find the Regional ALE for a list of features.

Parameters:

- features (typing.Union[int, str, list], required): list of features to fit
- heter_pcg_drop_thres (float, default: 0.1): heterogeneity drop threshold for a split to be considered important
- heter_small_enough (float, default: 0.1): heterogeneity threshold for a region to be considered homogeneous (splitting stops)
- binning_method (typing.Union[str, binning_methods.Fixed], default: binning_methods.Fixed(nof_bins=20, min_points_per_bin=0)): binning method to use
- max_depth (int, default: 1): maximum number of splits to perform (depth of the tree)
- nof_candidate_splits_for_numerical (int, default: 20): number of candidate splits to consider for numerical features
- min_points_per_subregion (int, default: 10): minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
- candidate_conditioning_features (typing.Union[str, list], default: 'all'): list of features to consider as conditioning features for the candidate splits
Source code in /home/runner/work/effector/effector/effector/regional_effect_ale.py
def fit(
    self,
    features: typing.Union[int, str, list],
    heter_pcg_drop_thres: float = 0.1,
    heter_small_enough: float = 0.1,
    max_depth: int = 1,
    nof_candidate_splits_for_numerical: int = 20,
    min_points_per_subregion: int = 10,
    candidate_conditioning_features: typing.Union["str", list] = "all",
    split_categorical_features: bool = False,
    binning_method: typing.Union[str, binning_methods.Fixed] = binning_methods.Fixed(nof_bins=20, min_points_per_bin=0),
    centering: typing.Union[bool, str] = False,
):
    """
    Find the Regional ALE for a list of features.

    Args:
        features: list of features to fit
        heter_pcg_drop_thres: heterogeneity drop threshold for a split to be considered important
        heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
        binning_method: binning method to use
        max_depth: maximum number of splits to perform (depth of the tree)
        nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
        min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
        candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
    """

    assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
    features = helpers.prep_features(features, self.dim)
    for feat in tqdm(features):
        heter = self._create_heterogeneity_function(
            feat, binning_method, min_points_per_subregion, centering
        )

        self._fit_feature(
            feat,
            heter,
            heter_pcg_drop_thres,
            heter_small_enough,
            max_depth,
            nof_candidate_splits_for_numerical,
            min_points_per_subregion,
            candidate_conditioning_features,
            split_categorical_features,
        )

        self.method_args["feature_" + str(feat)] = {
            "heter_pcg_drop_thres": heter_pcg_drop_thres,
            "heter_small_enough": heter_small_enough,
            "max_depth": max_depth,
            "nof_candidate_splits_for_numerical": nof_candidate_splits_for_numerical,
            "min_points_per_subregion": min_points_per_subregion,
            "candidate_conditioning_features": candidate_conditioning_features,
            "split_categorical_features": split_categorical_features,
            "binning_method": binning_method,
            "centering": centering,
        }

effector.regional_effect_ale.RegionalRHALE

Bases: RegionalEffectBase

Source code in /home/runner/work/effector/effector/effector/regional_effect_ale.py
class RegionalRHALE(RegionalEffectBase):
    def __init__(
        self,
        data: np.ndarray,
        model: Callable,
        model_jac: Optional[Callable] = None,
        instance_effects: Optional[np.ndarray] = None,
        nof_instances: Union[int, str] = "all",
        axis_limits: Optional[np.ndarray] = None,
        feature_types: Optional[List] = None,
        cat_limit: Optional[int] = 10,
        feature_names: Optional[List] = None,
        target_name: Optional[str] = None,
    ):
        """
        Regional RHALE constructor.

        Args:
            data: X matrix (N,D).
            model: the black-box model (N,D) -> (N, )
            model_jac: the black-box model Jacobian (N,D) -> (N,D)
            instance_effects: precomputed Jacobian of the model on data (N,D); if None, it is computed from model_jac or approximated numerically
            axis_limits: axis limits for the FE plot [2, D] or None. If None, axis limits are computed from the data.
            feature_types: list of feature types (categorical or numerical)
            cat_limit: the minimum number of unique values for a feature to be considered categorical
            feature_names: list of feature names
        """

        if instance_effects is None:
            if model_jac is not None:
                instance_effects = model_jac(data)
            else:
                instance_effects = utils.compute_jacobian_numerically(model, data)


        super(RegionalRHALE, self).__init__(
            "rhale",
            data,
            model,
            model_jac,
            instance_effects,
            nof_instances,
            axis_limits,
            feature_types,
            cat_limit,
            feature_names,
            target_name
        )

    def _create_heterogeneity_function(self, foi, binning_method, min_points, centering):
        binning_method = prep_binning_method(binning_method)

        def heter(data, instance_effects=None) -> float:
            if data.shape[0] < min_points:
                return BIG_M

            rhale = RHALE(data, self.model, self.model_jac, "all", None, instance_effects)
            try:
                rhale.fit(features=foi, binning_method=binning_method, centering=centering)
            except Exception:
                return BIG_M

            # heterogeneity is the accumulated std at the end of the curve
            axis_limits = helpers.axis_limits_from_data(data)
            stop = np.array([axis_limits[:, foi][1]])
            _, z = rhale.eval(feature=foi, xs=stop, heterogeneity=True)
            return z.item()

        return heter

    def fit(
        self,
        features: typing.Union[int, str, list] = "all",
        heter_pcg_drop_thres: float = 0.1,
        heter_small_enough: float = 0.1,
        max_depth: int = 1,
        nof_candidate_splits_for_numerical: int = 20,
        min_points_per_subregion: int = 10,
        candidate_conditioning_features: typing.Union["str", list] = "all",
        split_categorical_features: bool = False,
        binning_method: typing.Union[
                str,
                binning_methods.Fixed,
                binning_methods.DynamicProgramming,
                binning_methods.Greedy,
        ] = "greedy",
        centering: typing.Union[bool, str] = False,
    ):
        """
        Find the Regional RHALE for a list of features.

        Args:
            features: list of features to fit
            heter_pcg_drop_thres: heterogeneity drop threshold for a split to be considered important
            heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
            binning_method: binning method to use
            max_depth: maximum number of splits to perform (depth of the tree)
            nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
            min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
            candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
        """

        assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
        features = helpers.prep_features(features, self.dim)
        for feat in tqdm(features):
            heter = self._create_heterogeneity_function(
                feat, binning_method, min_points_per_subregion, centering
            )

            self._fit_feature(
                feat,
                heter,
                heter_pcg_drop_thres,
                heter_small_enough,
                max_depth,
                nof_candidate_splits_for_numerical,
                min_points_per_subregion,
                candidate_conditioning_features,
                split_categorical_features,
            )

            self.method_args["feature_" + str(feat)] = {
                "heter_pcg_drop_thres": heter_pcg_drop_thres,
                "heter_small_enough": heter_small_enough,
                "max_depth": max_depth,
                "nof_candidate_splits_for_numerical": nof_candidate_splits_for_numerical,
                "min_points_per_subregion": min_points_per_subregion,
                "candidate_conditioning_features": candidate_conditioning_features,
                "split_categorical_features": split_categorical_features,
                "binning_method": binning_method,
            }

    def plot(self,
             feature,
             node_idx,
             heterogeneity=False,
             centering=False,
             scale_x_list=None,
             scale_y=None,
             y_limits=None,
             dy_limits=None):

        # get data from the node
        self.refit(feature)

        if scale_x_list is not None:
            self.tree_full_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(False, scale_x_list)
            self.tree_pruned_scaled["feature_{}".format(feature)] = self.partitioners["feature_{}".format(feature)].splits_to_tree(True, scale_x_list)

        data, data_effect, name = self.get_node_info(feature, node_idx)
        feature_names = copy.deepcopy(self.feature_names)
        feature_names[feature] = name

        # define the method and fit
        self.method_args["feature_" + str(feature)]["heterogeneity"] = heterogeneity
        rhale = RHALE(data, self.model, self.model_jac, self.nof_instances, None, data_effect, feature_names=feature_names)
        binning_method = prep_binning_method(self.method_args["feature_" + str(feature)]["binning_method"])
        rhale.fit(features=feature, binning_method=binning_method, centering=centering)
        scale_x = scale_x_list[feature] if scale_x_list is not None else None
        rhale.plot(feature=feature, heterogeneity=heterogeneity, centering=centering, scale_x=scale_x, scale_y=scale_y, y_limits=y_limits, dy_limits=dy_limits)
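
A usage sketch for RegionalRHALE (illustrative): passing an analytical Jacobian avoids the numerical approximation that the constructor falls back to when model_jac is omitted.

import numpy as np
import effector

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(1000, 2))
model = lambda x: np.where(x[:, 1] > 0, x[:, 0], -x[:, 0])

def model_jac(x):
    # closed-form Jacobian of the toy model above
    jac = np.zeros_like(x)
    jac[:, 0] = np.where(x[:, 1] > 0, 1.0, -1.0)
    return jac

reg_rhale = effector.RegionalRHALE(X, model, model_jac)
reg_rhale.fit(features=0, binning_method="greedy")
reg_rhale.plot(feature=0, node_idx=0, heterogeneity=True, centering=True)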

__init__(data, model, model_jac=None, instance_effects=None, nof_instances='all', axis_limits=None, feature_types=None, cat_limit=10, feature_names=None, target_name=None)

Regional RHALE constructor.

Parameters:

- data (np.ndarray, required): X matrix (N,D).
- model (Callable, required): the black-box model (N,D) -> (N, )
- model_jac (Optional[Callable], default: None): the black-box model Jacobian (N,D) -> (N,D)
- instance_effects (Optional[np.ndarray], default: None): precomputed Jacobian of the model on data (N,D); if None, it is computed from model_jac or approximated numerically
- axis_limits (Optional[np.ndarray], default: None): axis limits for the FE plot [2, D] or None. If None, axis limits are computed from the data.
- feature_types (Optional[List], default: None): list of feature types (categorical or numerical)
- cat_limit (Optional[int], default: 10): the minimum number of unique values for a feature to be considered categorical
- feature_names (Optional[List], default: None): list of feature names
Source code in /home/runner/work/effector/effector/effector/regional_effect_ale.py
def __init__(
    self,
    data: np.ndarray,
    model: Callable,
    model_jac: Optional[Callable] = None,
    instance_effects: Optional[np.ndarray] = None,
    nof_instances: Union[int, str] = "all",
    axis_limits: Optional[np.ndarray] = None,
    feature_types: Optional[List] = None,
    cat_limit: Optional[int] = 10,
    feature_names: Optional[List] = None,
    target_name: Optional[str] = None,
):
    """
    Regional RHALE constructor.

    Args:
        data: X matrix (N,D).
        model: the black-box model (N,D) -> (N, )
        model_jac: the black-box model Jacobian (N,D) -> (N,D)
        instance_effects: precomputed Jacobian of the model on data (N,D); if None, it is computed from model_jac or approximated numerically
        axis_limits: axis limits for the FE plot [2, D] or None. If None, axis limits are computed from the data.
        feature_types: list of feature types (categorical or numerical)
        cat_limit: the minimum number of unique values for a feature to be considered categorical
        feature_names: list of feature names
    """

    if instance_effects is None:
        if model_jac is not None:
            instance_effects = model_jac(data)
        else:
            instance_effects = utils.compute_jacobian_numerically(model, data)


    super(RegionalRHALE, self).__init__(
        "rhale",
        data,
        model,
        model_jac,
        instance_effects,
        nof_instances,
        axis_limits,
        feature_types,
        cat_limit,
        feature_names,
        target_name
    )

fit(features='all', heter_pcg_drop_thres=0.1, heter_small_enough=0.1, max_depth=1, nof_candidate_splits_for_numerical=20, min_points_per_subregion=10, candidate_conditioning_features='all', split_categorical_features=False, binning_method='greedy', centering=False)

Find the Regional RHALE for a list of features.

Parameters:

- features (typing.Union[int, str, list], default: 'all'): list of features to fit
- heter_pcg_drop_thres (float, default: 0.1): heterogeneity drop threshold for a split to be considered important
- heter_small_enough (float, default: 0.1): heterogeneity threshold for a region to be considered homogeneous (splitting stops)
- binning_method (typing.Union[str, binning_methods.Fixed, binning_methods.DynamicProgramming, binning_methods.Greedy], default: 'greedy'): binning method to use
- max_depth (int, default: 1): maximum number of splits to perform (depth of the tree)
- nof_candidate_splits_for_numerical (int, default: 20): number of candidate splits to consider for numerical features
- min_points_per_subregion (int, default: 10): minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
- candidate_conditioning_features (typing.Union[str, list], default: 'all'): list of features to consider as conditioning features for the candidate splits
Source code in /home/runner/work/effector/effector/effector/regional_effect_ale.py
def fit(
    self,
    features: typing.Union[int, str, list] = "all",
    heter_pcg_drop_thres: float = 0.1,
    heter_small_enough: float = 0.1,
    max_depth: int = 1,
    nof_candidate_splits_for_numerical: int = 20,
    min_points_per_subregion: int = 10,
    candidate_conditioning_features: typing.Union["str", list] = "all",
    split_categorical_features: bool = False,
    binning_method: typing.Union[
            str,
            binning_methods.Fixed,
            binning_methods.DynamicProgramming,
            binning_methods.Greedy,
    ] = "greedy",
    centering: typing.Union[bool, str] = False,
):
    """
    Find the Regional RHALE for a list of features.

    Args:
        features: list of features to fit
        heter_pcg_drop_thres: heterogeneity drop threshold for a split to be considered important
        heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
        binning_method: binning method to use
        max_depth: maximum number of splits to perform (depth of the tree)
        nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
        min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
        candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
    """

    assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
    features = helpers.prep_features(features, self.dim)
    for feat in tqdm(features):
        heter = self._create_heterogeneity_function(
            feat, binning_method, min_points_per_subregion, centering
        )

        self._fit_feature(
            feat,
            heter,
            heter_pcg_drop_thres,
            heter_small_enough,
            max_depth,
            nof_candidate_splits_for_numerical,
            min_points_per_subregion,
            candidate_conditioning_features,
            split_categorical_features,
        )

        self.method_args["feature_" + str(feat)] = {
            "heter_pcg_drop_thres": heter_pcg_drop_thres,
            "heter_small_enough": heter_small_enough,
            "max_depth": max_depth,
            "nof_candidate_splits_for_numerical": nof_candidate_splits_for_numerical,
            "min_points_per_subregion": min_points_per_subregion,
            "candidate_conditioning_features": candidate_conditioning_features,
            "split_categorical_features": split_categorical_features,
            "binning_method": binning_method,
        }

effector.regional_effect_pdp.RegionalPDPBase

Bases: RegionalEffectBase

Source code in /home/runner/work/effector/effector/effector/regional_effect_pdp.py
class RegionalPDPBase(RegionalEffectBase):
    def __init__(
        self,
        method_name: str,
        data: np.ndarray,
        model: callable,
        model_jac: typing.Union[None, callable] = None,
        nof_instances: typing.Union[int, str] = 100,
        axis_limits: typing.Union[None, np.ndarray] = None,
        feature_types: typing.Union[list, None] = None,
        cat_limit: typing.Union[int, None] = 10,
        feature_names: typing.Union[list, None] = None,
        target_name: typing.Union[str, None] = None,
    ):
        super(RegionalPDPBase, self).__init__(
            method_name,
            data,
            model,
            model_jac,
            None,
            nof_instances,
            axis_limits,
            feature_types,
            cat_limit,
            feature_names,
            target_name)

    def _create_heterogeneity_function(self, foi, min_points, centering, nof_instances, points_for_centering, use_vectorized=True):
        def heter(data) -> float:
            if data.shape[0] < min_points:
                return BIG_M

            if self.method_name == "pdp":
                pdp = PDP(data, self.model, self.axis_limits, nof_instances=nof_instances)
            else:
                pdp = DerPDP(data, self.model, self.model_jac, self.axis_limits, nof_instances=nof_instances)

            try:
                pdp.fit(features=foi, centering=centering, points_for_centering=points_for_centering, use_vectorized=use_vectorized)
            except Exception:
                return BIG_M

            # heterogeneity is the mean heterogeneity over the curve
            axis_limits = helpers.axis_limits_from_data(data)

            xx = np.linspace(axis_limits[:, foi][0], axis_limits[:, foi][1], 10)
            try:
                _, z = pdp.eval(feature=foi, xs=xx, heterogeneity=True, use_vectorized=use_vectorized)
            except Exception:
                return BIG_M
            return np.mean(z)

        return heter

    def fit(
        self,
        features: typing.Union[int, str, list] = "all",
        heter_pcg_drop_thres: float = 0.1,
        heter_small_enough: float = 0.1,
        max_depth: int = 1,
        nof_candidate_splits_for_numerical: int = 20,
        min_points_per_subregion: int = 10,
        candidate_conditioning_features: typing.Union["str", list] = "all",
        split_categorical_features: bool = False,
        centering: typing.Union[bool, str] = False,
        nof_instances: typing.Union[int, str] = "all",
        points_for_centering: int = 100,
        use_vectorized: bool = True,
    ):
        """
        Find the Regional PDP for a list of features.

        Args:
            features: list of features to fit
            heter_pcg_drop_thres: heterogeneity drop threshold for a split to be considered important
            heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
            max_depth: maximum number of splits to perform (depth of the tree)
            nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
            min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
            candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
            split_categorical_features: whether to allow splits on categorical features
        """

        assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
        features = helpers.prep_features(features, self.dim)
        for feat in tqdm(features):
            heter = self._create_heterogeneity_function(feat, min_points_per_subregion, centering, nof_instances, points_for_centering, use_vectorized)

            self._fit_feature(
                feat,
                heter,
                heter_pcg_drop_thres,
                heter_small_enough,
                max_depth,
                nof_candidate_splits_for_numerical,
                min_points_per_subregion,
                candidate_conditioning_features,
                split_categorical_features,
            )
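
A short sketch (illustrative, reusing the toy X and model from the earlier examples): use_vectorized controls whether the underlying PDP evaluates the model on one large batched matrix or in a loop, which can matter for models that cannot handle large batches.

reg_pdp = effector.RegionalPDP(X, model, nof_instances=1000)
reg_pdp.fit(features="all", centering=True, use_vectorized=False)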

fit(features='all', heter_pcg_drop_thres=0.1, heter_small_enough=0.1, max_depth=1, nof_candidate_splits_for_numerical=20, min_points_per_subregion=10, candidate_conditioning_features='all', split_categorical_features=False, centering=False, nof_instances='all', points_for_centering=100, use_vectorized=True)

Find the Regional PDP for a list of features.

Parameters:

- `features` (`typing.Union[int, str, list]`, default `'all'`): list of features to fit
- `heter_pcg_drop_thres` (`float`, default `0.1`): heterogeneity drop threshold for a split to be considered important
- `heter_small_enough` (`float`, default `0.1`): heterogeneity threshold for a region to be considered homogeneous (splitting stops)
- `max_depth` (`int`, default `1`): maximum number of splits to perform (depth of the tree)
- `nof_candidate_splits_for_numerical` (`int`, default `20`): number of candidate splits to consider for numerical features
- `min_points_per_subregion` (`int`, default `10`): minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
- `candidate_conditioning_features` (`typing.Union[str, list]`, default `'all'`): list of features to consider as conditioning features for the candidate splits
- `split_categorical_features` (`bool`, default `False`): whether to search for subregions in categorical features
- `centering` (`typing.Union[bool, str]`, default `False`): whether to center the PDP before estimating the heterogeneity
- `nof_instances` (`typing.Union[int, str]`, default `'all'`): maximum number of instances to be used
- `points_for_centering` (`int`, default `100`): number of points to use for centering
- `use_vectorized` (`bool`, default `True`): whether to use the vectorized implementation of PDP
Source code in /home/runner/work/effector/effector/effector/regional_effect_pdp.py
def fit(
    self,
    features: typing.Union[int, str, list] = "all",
    heter_pcg_drop_thres: float = 0.1,
    heter_small_enough: float = 0.1,
    max_depth: int = 1,
    nof_candidate_splits_for_numerical: int = 20,
    min_points_per_subregion: int = 10,
    candidate_conditioning_features: typing.Union[str, list] = "all",
    split_categorical_features: bool = False,
    centering: typing.Union[bool, str] = False,
    nof_instances: typing.Union[int, str] = "all",
    points_for_centering: int = 100,
    use_vectorized: bool = True,
):
    """
    Find the Regional PDP for a list of features.

    Args:
        features: list of features to fit
        heter_pcg_drop_thres: heterogeneity drop threshold for a split to be considered important
        heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
        max_depth: maximum number of splits to perform (depth of the tree)
        nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
        min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
        candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
        split_categorical_features: whether to search for subregions in categorical features
        centering: whether to center the PDP before estimating the heterogeneity
        nof_instances: maximum number of instances to be used
        points_for_centering: number of points to use for centering
        use_vectorized: whether to use the vectorized implementation of PDP
    """

    assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
    features = helpers.prep_features(features, self.dim)
    for feat in tqdm(features):
        heter = self._create_heterogeneity_function(feat, min_points_per_subregion, centering, nof_instances, points_for_centering, use_vectorized)

        self._fit_feature(
            feat,
            heter,
            heter_pcg_drop_thres,
            heter_small_enough,
            max_depth,
            nof_candidate_splits_for_numerical,
            min_points_per_subregion,
            candidate_conditioning_features,
            split_categorical_features,
        )

effector.regional_effect_pdp.RegionalPDP

Bases: RegionalPDPBase

Source code in /home/runner/work/effector/effector/effector/regional_effect_pdp.py
class RegionalPDP(RegionalPDPBase):
    def __init__(
        self,
        data: np.ndarray,
        model: callable,
        nof_instances: typing.Union[int, str] = 1000,
        axis_limits: typing.Union[None, np.ndarray] = None,
        feature_types: typing.Union[list, None] = None,
        cat_limit: typing.Union[int, None] = 10,
        feature_names: typing.Union[list, None] = None,
        target_name: typing.Union[str, None] = None,
    ):
        super(RegionalPDP, self).__init__(
            "pdp",
            data,
            model,
            None,
            nof_instances,
            axis_limits,
            feature_types,
            cat_limit,
            feature_names,
            target_name)

__init__(data, model, nof_instances=1000, axis_limits=None, feature_types=None, cat_limit=10, feature_names=None, target_name=None)

Source code in /home/runner/work/effector/effector/effector/regional_effect_pdp.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def __init__(
    self,
    data: np.ndarray,
    model: callable,
    nof_instances: typing.Union[int, str] = 1000,
    axis_limits: typing.Union[None, np.ndarray] = None,
    feature_types: typing.Union[list, None] = None,
    cat_limit: typing.Union[int, None] = 10,
    feature_names: typing.Union[list, None] = None,
    target_name: typing.Union[str, None] = None,
):
    super(RegionalPDP, self).__init__(
        "pdp",
        data,
        model,
        None,
        nof_instances,
        axis_limits,
        feature_types,
        cat_limit,
        feature_names,
        target_name)
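
A minimal usage sketch (not part of the reference): the toy data and model below are illustrative assumptions; only the documented constructor and fit arguments come from the API above.

>>> import numpy as np
>>> from effector.regional_effect_pdp import RegionalPDP
>>> np.random.seed(0)
>>> X = np.random.uniform(-1, 1, size=(500, 2))   # toy design matrix, (N, D)
>>> model = lambda x: x[:, 0] * (x[:, 1] > 0)     # toy black-box, (N, D) -> (N, )
>>> reg_pdp = RegionalPDP(X, model)
>>> reg_pdp.fit(features=0, heter_pcg_drop_thres=0.1, max_depth=1)

Here the effect of x_0 depends on the sign of x_1, so the heterogeneity-driven search should favour a split that conditions on x_1.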

effector.regional_effect_pdp.RegionalDerPDP

Bases: RegionalPDPBase

Source code in /home/runner/work/effector/effector/effector/regional_effect_pdp.py
class RegionalDerPDP(RegionalPDPBase):
    def __init__(
        self,
        data: np.ndarray,
        model: callable,
        model_jac: typing.Union[None, callable] = None,
        nof_instances: typing.Union[int, str] = 1000,
        axis_limits: typing.Union[None, np.ndarray] = None,
        feature_types: typing.Union[list, None] = None,
        cat_limit: typing.Union[int, None] = 10,
        feature_names: typing.Union[list, None] = None,
        target_name: typing.Union[str, None] = None,
    ):
        super(RegionalDerPDP, self).__init__(
            "d-pdp",
            data,
            model,
            model_jac,
            nof_instances,
            axis_limits,
            feature_types,
            cat_limit,
            feature_names,
            target_name)

__init__(data, model, model_jac=None, nof_instances=1000, axis_limits=None, feature_types=None, cat_limit=10, feature_names=None, target_name=None)

Source code in /home/runner/work/effector/effector/effector/regional_effect_pdp.py
def __init__(
    self,
    data: np.ndarray,
    model: callable,
    model_jac: typing.Union[None, callable] = None,
    nof_instances: typing.Union[int, str] = 1000,
    axis_limits: typing.Union[None, np.ndarray] = None,
    feature_types: typing.Union[list, None] = None,
    cat_limit: typing.Union[int, None] = 10,
    feature_names: typing.Union[list, None] = None,
    target_name: typing.Union[str, None] = None,
):
    super(RegionalDerPDP, self).__init__(
        "d-pdp",
        data,
        model,
        model_jac,
        nof_instances,
        axis_limits,
        feature_types,
        cat_limit,
        feature_names,
        target_name)
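
A minimal sketch for the derivative-PDP variant, assuming a differentiable toy model whose Jacobian is available analytically; the data, model and model_jac below are illustrative assumptions.

>>> import numpy as np
>>> from effector.regional_effect_pdp import RegionalDerPDP
>>> X = np.random.uniform(-1, 1, size=(500, 2))
>>> model = lambda x: x[:, 0] ** 2 + x[:, 1]                                    # (N, D) -> (N, )
>>> model_jac = lambda x: np.stack([2 * x[:, 0], np.ones(x.shape[0])], axis=1)  # (N, D) -> (N, D)
>>> reg_dpdp = RegionalDerPDP(X, model, model_jac)
>>> reg_dpdp.fit(features="all")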

effector.regional_effect_shap.RegionalShapDP

Bases: RegionalEffectBase

Source code in /home/runner/work/effector/effector/effector/regional_effect_shap.py
class RegionalShapDP(RegionalEffectBase):
    big_m = helpers.BIG_M

    def __init__(
        self,
        data: np.ndarray,
        model: Callable,
        axis_limits: Optional[np.ndarray] = None,
        nof_instances: Union[int, str] = 100,
        feature_types: Optional[List[str]] = None,
        cat_limit: Optional[int] = 10,
        feature_names: Optional[List[str]] = None,
        target_name: Optional[str] = None,
    ):
        """
        Regional SHAP constructor.

        Args:
            data: the design matrix

                - shape: `(N,D)`
            model: the black-box model. Must be a `Callable` with:

                - input: `ndarray` of shape `(N, D)`
                - output: `ndarray` of shape `(N, )`

            axis_limits: The limits of the feature effect plot along each axis

                - use a `ndarray` of shape `(2, D)`, to specify them manually
                - use `None`, to be inferred from the data

            nof_instances: maximum number of instances to be used for SHAP estimation.

                - use "all", for using all instances.
                - use an `int`, for using `nof_instances` instances.

            feature_types: The feature types.

                - use `None`, to infer them from the data; a feature is considered categorical
                if it has fewer than `cat_limit` unique values, otherwise numerical.
                - use a list with elements `"cat"` or `"numerical"`, to specify them manually.

            cat_limit: the maximum number of unique values for a feature to be considered categorical

            feature_names: The names of the features

                - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
                - use `None`, to keep the default names: `["x_0", "x_1", ...]`

            target_name: The name of the target variable

                - use a `str`, to specify its name manually. For example: `"price"`
                - use `None`, to keep the default name: `"y"`
        """
        super(RegionalShapDP, self).__init__(
            "shap",
            data,
            model,
            None,
            None,
            nof_instances,
            axis_limits,
            feature_types,
            cat_limit,
            feature_names,
            target_name
        )

    def _create_heterogeneity_function(self, foi, min_points, centering, points_for_centering):

        def heterogeneity_function(data) -> float:
            if data.shape[0] < min_points:
                return self.big_m

            axis_limits = helpers.axis_limits_from_data(data)
            xx = np.linspace(axis_limits[:, foi][0], axis_limits[:, foi][1], 10)

            shap = ShapDP(data, self.model, None, self.nof_instances)
            shap.fit(foi, centering, points_for_centering)
            _, z = shap.eval(foi, xx, heterogeneity=True)
            return np.mean(z)

        return heterogeneity_function

    def fit(
            self,
            features: typing.Union[int, str, list],
            heter_pcg_drop_thres: float = 0.1,
            heter_small_enough: float = 0.1,
            max_depth: int = 1,
            nof_candidate_splits_for_numerical: int = 20,
            min_points_per_subregion: int = 10,
            candidate_conditioning_features: typing.Union[str, list] = "all",
            split_categorical_features: bool = False,
            centering: typing.Union[bool, str] = False,
            points_for_centering: int = 100,
    ):
        """
        Fit the regional SHAP.

        Args:
            features: the features to fit.
                - If set to "all", all the features will be fitted.

            heter_pcg_drop_thres: threshold for the percentage drop in heterogeneity to consider a split valid
            heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
            max_depth: maximum number of splits to perform (depth of the tree)
            nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
            min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
            candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
            split_categorical_features: whether to search for subregions in categorical features
            centering: whether to center the SHAP dependence plots before estimating the heterogeneity
            points_for_centering: number of points to use for centering
        """
        assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
        features = helpers.prep_features(features, self.dim)
        for feat in tqdm(features):
            heter = self._create_heterogeneity_function(
                feat, min_points_per_subregion, centering, points_for_centering
            )

            self._fit_feature(
                feat,
                heter,
                heter_pcg_drop_thres,
                heter_small_enough,
                max_depth,
                nof_candidate_splits_for_numerical,
                min_points_per_subregion,
                candidate_conditioning_features,
                split_categorical_features,
            )

            self.method_args["feature_" + str(feat)] = {
                "heter_pcg_drop_thres": heter_pcg_drop_thres,
                "heter_small_enough": heter_small_enough,
                "max_depth": max_depth,
                "nof_candidate_splits_for_numerical": nof_candidate_splits_for_numerical,
                "min_points_per_subregion": min_points_per_subregion,
                "candidate_conditioning_features": candidate_conditioning_features,
                "split_categorical_features": split_categorical_features,
                "centering": centering,
                "points_for_centering": points_for_centering,
            }

__init__(data, model, axis_limits=None, nof_instances=100, feature_types=None, cat_limit=10, feature_names=None, target_name=None)

Regional SHAP constructor.

Parameters:

- `data` (`np.ndarray`, required): the design matrix
    - shape: `(N, D)`
- `model` (`Callable`, required): the black-box model. Must be a `Callable` with:
    - input: `ndarray` of shape `(N, D)`
    - output: `ndarray` of shape `(N, )`
- `axis_limits` (`Optional[np.ndarray]`, default `None`): the limits of the feature effect plot along each axis
    - use a `ndarray` of shape `(2, D)`, to specify them manually
    - use `None`, to be inferred from the data
- `nof_instances` (`Union[int, str]`, default `100`): maximum number of instances to be used for SHAP estimation
    - use `"all"`, for using all instances
    - use an `int`, for using `nof_instances` instances
- `feature_types` (`Optional[List[str]]`, default `None`): the feature types
    - use `None`, to infer them from the data; a feature is considered categorical if it has fewer than `cat_limit` unique values, otherwise numerical
    - use a list with elements `"cat"` or `"numerical"`, to specify them manually
- `cat_limit` (`Optional[int]`, default `10`): the maximum number of unique values for a feature to be considered categorical
- `feature_names` (`Optional[List[str]]`, default `None`): the names of the features
    - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
    - use `None`, to keep the default names: `["x_0", "x_1", ...]`
- `target_name` (`Optional[str]`, default `None`): the name of the target variable
    - use a `str`, to specify its name manually. For example: `"price"`
    - use `None`, to keep the default name: `"y"`
Source code in /home/runner/work/effector/effector/effector/regional_effect_shap.py
def __init__(
    self,
    data: np.ndarray,
    model: Callable,
    axis_limits: Optional[np.ndarray] = None,
    nof_instances: Union[int, str] = 100,
    feature_types: Optional[List[str]] = None,
    cat_limit: Optional[int] = 10,
    feature_names: Optional[List[str]] = None,
    target_name: Optional[str] = None,
):
    """
    Regional SHAP constructor.

    Args:
        data: the design matrix

            - shape: `(N,D)`
        model: the black-box model. Must be a `Callable` with:

            - input: `ndarray` of shape `(N, D)`
            - output: `ndarray` of shape `(N, )`

        axis_limits: The limits of the feature effect plot along each axis

            - use a `ndarray` of shape `(2, D)`, to specify them manually
            - use `None`, to be inferred from the data

        nof_instances: maximum number of instances to be used for SHAP estimation.

            - use "all", for using all instances.
            - use an `int`, for using `nof_instances` instances.

        feature_types: The feature types.

            - use `None`, to infer them from the data; a feature is considered categorical
            if it has fewer than `cat_limit` unique values, otherwise numerical.
            - use a list with elements `"cat"` or `"numerical"`, to specify them manually.

        cat_limit: the maximum number of unique values for a feature to be considered categorical

        feature_names: The names of the features

            - use a `list` of `str`, to specify the names manually. For example: `["age", "weight", ...]`
            - use `None`, to keep the default names: `["x_0", "x_1", ...]`

        target_name: The name of the target variable

            - use a `str`, to specify its name manually. For example: `"price"`
            - use `None`, to keep the default name: `"y"`
    """
    super(RegionalShapDP, self).__init__(
        "shap",
        data,
        model,
        None,
        None,
        nof_instances,
        axis_limits,
        feature_types,
        cat_limit,
        feature_names,
        target_name
    )

fit(features, heter_pcg_drop_thres=0.1, heter_small_enough=0.1, max_depth=1, nof_candidate_splits_for_numerical=20, min_points_per_subregion=10, candidate_conditioning_features='all', split_categorical_features=False, centering=False, points_for_centering=100)

Fit the regional SHAP.

Parameters:

- `features` (`typing.Union[int, str, list]`, required): the features to fit; if set to `"all"`, all the features will be fitted
- `heter_pcg_drop_thres` (`float`, default `0.1`): threshold for the percentage drop in heterogeneity to consider a split valid
- `heter_small_enough` (`float`, default `0.1`): heterogeneity threshold for a region to be considered homogeneous (splitting stops)
- `max_depth` (`int`, default `1`): maximum number of splits to perform (depth of the tree)
- `nof_candidate_splits_for_numerical` (`int`, default `20`): number of candidate splits to consider for numerical features
- `min_points_per_subregion` (`int`, default `10`): minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
- `candidate_conditioning_features` (`typing.Union[str, list]`, default `'all'`): list of features to consider as conditioning features for the candidate splits
- `split_categorical_features` (`bool`, default `False`): whether to search for subregions in categorical features
- `centering` (`typing.Union[bool, str]`, default `False`): whether to center the SHAP dependence plots before estimating the heterogeneity
- `points_for_centering` (`int`, default `100`): number of points to use for centering
Source code in /home/runner/work/effector/effector/effector/regional_effect_shap.py
def fit(
        self,
        features: typing.Union[int, str, list],
        heter_pcg_drop_thres: float = 0.1,
        heter_small_enough: float = 0.1,
        max_depth: int = 1,
        nof_candidate_splits_for_numerical: int = 20,
        min_points_per_subregion: int = 10,
        candidate_conditioning_features: typing.Union[str, list] = "all",
        split_categorical_features: bool = False,
        centering: typing.Union[bool, str] = False,
        points_for_centering: int = 100,
):
    """
    Fit the regional SHAP.

    Args:
        features: the features to fit.
            - If set to "all", all the features will be fitted.

        heter_pcg_drop_thres: threshold for the percentage drop in heterogeneity to consider a split valid
        heter_small_enough: heterogeneity threshold for a region to be considered homogeneous (splitting stops)
        max_depth: maximum number of splits to perform (depth of the tree)
        nof_candidate_splits_for_numerical: number of candidate splits to consider for numerical features
        min_points_per_subregion: minimum allowed number of points in a subregion (otherwise the split is not considered as valid)
        candidate_conditioning_features: list of features to consider as conditioning features for the candidate splits
        split_categorical_features: whether to search for subregions in categorical features
        centering: whether to center the SHAP dependence plots before estimating the heterogeneity
        points_for_centering: number of points to use for centering
    """
    assert min_points_per_subregion >= 2, "min_points_per_subregion must be >= 2"
    features = helpers.prep_features(features, self.dim)
    for feat in tqdm(features):
        heter = self._create_heterogeneity_function(
            feat, min_points_per_subregion, centering, points_for_centering
        )

        self._fit_feature(
            feat,
            heter,
            heter_pcg_drop_thres,
            heter_small_enough,
            max_depth,
            nof_candidate_splits_for_numerical,
            min_points_per_subregion,
            candidate_conditioning_features,
            split_categorical_features,
        )

        self.method_args["feature_" + str(feat)] = {
            "heter_pcg_drop_thres": heter_pcg_drop_thres,
            "heter_small_enough": heter_small_enough,
            "max_depth": max_depth,
            "nof_candidate_splits_for_numerical": nof_candidate_splits_for_numerical,
            "min_points_per_subregion": min_points_per_subregion,
            "candidate_conditioning_features": candidate_conditioning_features,
            "split_categorical_features": split_categorical_features,
            "centering": centering,
            "points_for_centering": points_for_centering,
        }
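
As with the PDP variants, a short usage sketch may help; the toy data and model are illustrative assumptions. Note that features has no default here, so it must be passed explicitly.

>>> import numpy as np
>>> from effector.regional_effect_shap import RegionalShapDP
>>> X = np.random.uniform(-1, 1, size=(200, 2))
>>> model = lambda x: x[:, 0] * x[:, 1]           # toy model with a feature interaction
>>> reg_shap = RegionalShapDP(X, model, nof_instances=100)
>>> reg_shap.fit(features=0)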

Binning Methods

effector.binning_methods.Fixed

Source code in /home/runner/work/effector/effector/effector/binning_methods.py
class Fixed:
    def __init__(self,
                 nof_bins: int = 100,
                 min_points_per_bin: int = 10,
                 cat_limit: int = 15
                 ):
        self.nof_bins = nof_bins
        self.min_points_per_bin = min_points_per_bin
        self.cat_limit = cat_limit

__init__(nof_bins=100, min_points_per_bin=10, cat_limit=15)

Source code in /home/runner/work/effector/effector/effector/binning_methods.py
def __init__(self,
             nof_bins: int = 100,
             min_points_per_bin: int = 10,
             cat_limit: int = 15
             ):
    self.nof_bins = nof_bins
    self.min_points_per_bin = min_points_per_bin
    self.cat_limit = cat_limit

effector.binning_methods.Greedy

Source code in /home/runner/work/effector/effector/effector/binning_methods.py
class Greedy:
    def __init__(self,
                 init_nof_bins: int = 100,
                 min_points_per_bin: int = 10,
                 discount: float = 0.3,
                 cat_limit: int = 15
                 ):
        self.max_nof_bins = init_nof_bins
        self.min_points_per_bin = min_points_per_bin
        self.discount = discount
        self.cat_limit = cat_limit

__init__(init_nof_bins=100, min_points_per_bin=10, discount=0.3, cat_limit=15)

Source code in /home/runner/work/effector/effector/effector/binning_methods.py
def __init__(self,
             init_nof_bins: int = 100,
             min_points_per_bin: int = 10,
             discount: float = 0.3,
             cat_limit: int = 15
             ):
    self.max_nof_bins = init_nof_bins
    self.min_points_per_bin = min_points_per_bin
    self.discount = discount
    self.cat_limit = cat_limit

effector.binning_methods.DynamicProgramming

Source code in /home/runner/work/effector/effector/effector/binning_methods.py
class DynamicProgramming:
    def __init__(self,
                 max_nof_bins: int = 20,
                 min_points_per_bin: int = 10,
                 discount: float = 0.3,
                 cat_limit: int = 15):
        self.max_nof_bins = max_nof_bins
        self.min_points_per_bin = min_points_per_bin
        self.discount = discount
        self.cat_limit = cat_limit

__init__(max_nof_bins=20, min_points_per_bin=10, discount=0.3, cat_limit=15)

Source code in /home/runner/work/effector/effector/effector/binning_methods.py
def __init__(self,
             max_nof_bins: int = 20,
             min_points_per_bin: int = 10,
             discount: float = 0.3,
             cat_limit: int = 15):
    self.max_nof_bins = max_nof_bins
    self.min_points_per_bin = min_points_per_bin
    self.discount = discount
    self.cat_limit = cat_limit
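
The three classes above are plain configuration objects: each constructor simply stores its arguments. A short sketch of building them follows; that such an object is then passed wherever a binning method is expected (e.g. as a binning_method argument when fitting an ALE-based effect) is an assumption of this sketch.

>>> from effector import binning_methods
>>> fixed = binning_methods.Fixed(nof_bins=50, min_points_per_bin=10)
>>> greedy = binning_methods.Greedy(init_nof_bins=100, discount=0.3)
>>> dp = binning_methods.DynamicProgramming(max_nof_bins=20, min_points_per_bin=10)
>>> greedy.max_nof_bins, greedy.discount  # note: Greedy stores init_nof_bins as max_nof_bins
(100, 0.3)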

Utils

compute_accumulated_effect(x, limits, bin_effect, dx, square=False)

Compute the accumulated effect at each point x.

Notes

The function implements the following formula:

\[ \mathtt{dx}[i] = \mathtt{limits}[i+1] - \mathtt{limits}[i] \]
\[ \mathtt{full\_bin\_acc} = \sum_{i=0}^{k_x - 1} \mathtt{dx}[i] * \mathtt{bin\_effect}[i] \]
\[ \mathtt{remainder} = (x - \mathtt{limits}[k_x-1])* \mathtt{bin\_effect}[k_x] \]
\[ f(x) = \mathtt{full\_bin\_acc} + \mathtt{remainder} \]
Notes

If `square=True`, then the formula is:

\[ \mathtt{full\_bin\_acc} = \sum_{i=0}^{k_x - 1} \mathtt{dx}^2[i] * \mathtt{bin\_effect}[i] \]
\[ \mathtt{remainder} = (x - \mathtt{limits}[k_x-1])^2 * \mathtt{bin\_effect}[k_x] \]

Examples:

>>> x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0])
>>> limits = np.array([0, 1.5, 2.0])
>>> bin_effect = np.array([1.0, -1.0])
>>> dx = np.array([1.5, 0.5])
>>> compute_accumulated_effect(x, limits, bin_effect, dx)
array([0. , 0. , 0. , 0.5, 1. , 1.5, 1. , 1. , 1. ])
>>> x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0])
>>> limits = np.array([0, 1.5, 2.0])
>>> bin_effect = np.array([1.0, 1.0])
>>> dx = np.array([1.5, 0.5])
>>> compute_accumulated_effect(x, limits, bin_effect, dx)
array([0. , 0. , 0. , 0.5, 1. , 1.5, 2. , 2. , 2. ])

Parameters:

- `x` (`np.ndarray`, required): The points we want to evaluate at, (T)
- `limits` (`np.ndarray`, required): The bin limits, (K+1)
- `bin_effect` (`np.ndarray`, required): The effect in each bin, (K)
- `dx` (`np.ndarray`, required): The bin-widths, (K)
- `square` (`bool`, default `False`): Whether to square the width. If true, the effect is `bin_effect * dx^2`, otherwise `bin_effect * dx`

Returns:

- `y` (`np.ndarray`): The accumulated effect at each point, (T)

Source code in /home/runner/work/effector/effector/effector/utils.py
def compute_accumulated_effect(
    x: np.ndarray,
    limits: np.ndarray,
    bin_effect: np.ndarray,
    dx: np.ndarray,
    square: bool = False,
) -> np.ndarray:
    """Compute the accumulated effect at each point `x`.

    Notes:
        The function implements the following formula:

        $$
        \mathtt{dx}[i] = \mathtt{limits}[i+1] - \mathtt{limits}[i]
        $$

        $$
        \mathtt{full\_bin\_acc} = \sum_{i=0}^{k_x - 1} \mathtt{dx}[i] * \mathtt{bin\_effect}[i]
        $$

        $$
        \mathtt{remainder} = (x - \mathtt{limits}[k_x-1])* \mathtt{bin\_effect}[k_x]
        $$

        $$
        f(x) =  \mathtt{full\_bin\_acc} + \mathtt{remainder}
        $$

    Notes:
        if `square=True`, then the formula is:
        $$
        \mathtt{full\_bin\_acc} = \sum_{i=0}^{k_x - 1} \mathtt{dx}^2[i] * \mathtt{bin\_effect}[i]
        $$

        $$
        \mathtt{remainder} = (x - \mathtt{limits}[k_x-1])^2* \mathtt{bin\_effect}[k_x]
        $$

    Examples:
        >>> x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0])
        >>> limits = np.array([0, 1.5, 2.0])
        >>> bin_effect = np.array([1.0, -1.0])
        >>> dx = np.array([1.5, 0.5])
        >>> compute_accumulated_effect(x, limits, bin_effect, dx)
        array([0. , 0. , 0. , 0.5, 1. , 1.5, 1. , 1. , 1. ])

        >>> x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0])
        >>> limits = np.array([0, 1.5, 2.0])
        >>> bin_effect = np.array([1.0, 1.0])
        >>> dx = np.array([1.5, 0.5])
        >>> compute_accumulated_effect(x, limits, bin_effect, dx)
        array([0. , 0. , 0. , 0.5, 1. , 1.5, 2. , 2. , 2. ])



    Parameters:
        x: The points we want to evaluate at, (T)
        limits: The bin limits, (K+1)
        bin_effect: The effect in each bin, (K)
        dx: The bin-widths, (K)
        square: Whether to square the width. If true, the effect is bin_effect * dx^2, otherwise bin_effect * dx

    Returns:
        y: The accumulated effect at each point, (T)


    """
    # find where each point belongs to
    ind = np.digitize(x, limits)

    # for each point, find the accumulated full-bin effect
    x_cumsum = (bin_effect * dx**2).cumsum() if square else (bin_effect * dx).cumsum()
    tmp = np.concatenate([[0, 0], x_cumsum])
    full_bin_effect = tmp[ind]

    # for each point, find the remaining effect
    tmp = np.concatenate([[limits[0]], limits[:-1], [BIG_M]])
    deltas = x - tmp[ind]
    deltas[deltas < 0] = 0  # if xs < left_limit => delta = 0
    deltas = deltas**2 if square else deltas
    tmp = np.concatenate([[0.0], bin_effect, [bin_effect[-1]]])
    remaining_effect = deltas * tmp[ind]

    # final effect
    y = full_bin_effect + remaining_effect
    return y

compute_ale_params(xs, df_dxs, limits)

Compute all important parameters for the ALE plot.

Examples:

>>> # Example without interpolation
>>> xs = np.array([0.5, 1.2, 2, 2.3])
>>> df_dxs = np.array([30, 34, 15, 17])
>>> limits = np.array([0, 1.5, 3.])
>>> compute_ale_params(xs, df_dxs, limits)
{'limits': array([0. , 1.5, 3. ]), 'dx': array([1.5, 1.5]), 'points_per_bin': array([2, 2]), 'bin_effect': array([32., 16.]), 'bin_variance': array([4., 1.]), 'bin_estimator_variance': array([2. , 0.5])}
>>> # Example with interpolation
>>> xs = np.array([1, 2, 2.8, 4])
>>> df_dxs = np.array([31, 34, 37, 40])
>>> limits = np.array([1, 3, 4])
>>> compute_ale_params(xs, df_dxs, limits)
{'limits': array([1, 3, 4]), 'dx': array([2, 1]), 'points_per_bin': array([3, 1]), 'bin_effect': array([34., 40.]), 'bin_variance': array([6., 6.]), 'bin_estimator_variance': array([2., 2.])}

Parameters:

- `xs` (`np.ndarray`, required): The values of the s-th feature, (N)
- `df_dxs` (`np.ndarray`, required): The effect wrt the s-th feature, (N)
- `limits` (`np.ndarray`, required): The bin limits, (K+1)

Returns:

- `parameters` (`dict`): a dict with keys `limits`, `dx`, `points_per_bin`, `bin_effect`, `bin_variance` and `bin_estimator_variance`

Source code in /home/runner/work/effector/effector/effector/utils.py
def compute_ale_params(xs: np.ndarray, df_dxs: np.ndarray, limits: np.ndarray) -> dict:
    """
    Compute all important parameters for the ALE plot.

    Examples:
        >>> # Example without interpolation
        >>> xs = np.array([0.5, 1.2, 2, 2.3])
        >>> df_dxs = np.array([30, 34, 15, 17])
        >>> limits = np.array([0, 1.5, 3.])
        >>> compute_ale_params(xs, df_dxs, limits)
        {'limits': array([0. , 1.5, 3. ]), 'dx': array([1.5, 1.5]), 'points_per_bin': array([2, 2]), 'bin_effect': array([32., 16.]), 'bin_variance': array([4., 1.]), 'bin_estimator_variance': array([2. , 0.5])}

        >>> # Example with interpolation
        >>> xs = np.array([1, 2, 2.8, 4])
        >>> df_dxs = np.array([31, 34, 37, 40])
        >>> limits = np.array([1, 3, 4])
        >>> compute_ale_params(xs, df_dxs, limits)
        {'limits': array([1, 3, 4]), 'dx': array([2, 1]), 'points_per_bin': array([3, 1]), 'bin_effect': array([34., 40.]), 'bin_variance': array([6., 6.]), 'bin_estimator_variance': array([2., 2.])}

    Args:
        xs: The values of s-th feature, (N)
        df_dxs: The effect wrt the s-th feature, (N)
        limits: The bin limits, (K+1)

    Returns:
        parameters: dict

    """
    # compute bin-widths
    dx = np.array([limits[i + 1] - limits[i] for i in range(len(limits) - 1)])

    # compute mean effect on each bin
    bin_effect_nans, points_per_bin = compute_bin_effect(xs, df_dxs, limits)

    # compute effect variance in each bin
    bin_variance_nans, bin_estimator_variance_nans = compute_bin_variance(
        xs, df_dxs, limits, bin_effect_nans
    )

    # interpolate NaNs
    bin_effect = fill_nans(bin_effect_nans)
    bin_variance = fill_nans(bin_variance_nans)
    bin_estimator_variance = fill_nans(bin_estimator_variance_nans)

    parameters = {
        "limits": limits,
        "dx": dx,
        "points_per_bin": points_per_bin,
        "bin_effect": bin_effect,
        "bin_variance": bin_variance,
        "bin_estimator_variance": bin_estimator_variance,
    }
    return parameters

compute_bin_effect(xs, df_dxs, limits)

Compute the mean effect in each bin.

Notes

The function (a) allocates the instances in the bins and (b) aggregates the instance-level effects to compute the average bin-effect. If no instances lie in a bin, then the bin effect is NaN.

\[ \mathtt{bin\_effect}_k = {1 \over |i \in bin_k|} \sum_{i \in bin_k} \mathtt{effect}_i \]

Examples:

>>> n = 100
>>> xs = np.ones([n]) - 0.5
>>> df_dxs = np.ones_like(xs) * 10
>>> limits = np.array([0., 1., 2.0])
>>> bin_effects, ppb = compute_bin_effect(xs, df_dxs, limits)
>>> bin_effects
array([10., nan])
>>> ppb
array([100,   0])

Parameters:

- `xs` (`np.ndarray`, required): The s-th feature of the instances, (N)
- `df_dxs` (`np.ndarray`, required): The effect wrt the s-th feature for each instance, (N)
- `limits` (`np.ndarray`, required): The bin limits, (K+1)

Returns:

- `bin_effects` (`np.ndarray`): The average effect per bin, (K)
- `points_per_bin` (`np.ndarray`): The number of points per bin, (K)

Source code in /home/runner/work/effector/effector/effector/utils.py
def compute_bin_effect(
    xs: np.ndarray, df_dxs: np.ndarray, limits: np.ndarray
) -> typing.Tuple[np.ndarray, np.ndarray]:
    """Compute the mean effect in each bin.

    Notes:
        The function (a) allocates the instances in the bins and (b) aggregates the instance-level effects to compute
        the average bin-effect. If no instances lie in a bin, then the bin effect is NaN.

        $$
        \mathtt{bin\_effect}_k = {1 \over |i \in bin_k|} \sum_{i \in bin_k} \mathtt{effect}_i
        $$

    Examples:
        >>> n = 100
        >>> xs = np.ones([n]) - 0.5
        >>> df_dxs = np.ones_like(xs) * 10
        >>> limits = np.array([0., 1., 2.0])
        >>> bin_effects, ppb = compute_bin_effect(xs, df_dxs, limits)
        >>> bin_effects
        array([10., nan])
        >>> ppb
        array([100,   0])

    Parameters:
        xs: The s-th feature of the instances, (N)
        df_dxs: The effect wrt the s-th feature for each instance, (N)
        limits: The bin limits, (K+1)

    Returns:
        bin_effects: The average effect per bin, (K)
        points_per_bin: The number of points per bin, (K)
    """
    empty_symbol = np.NaN

    # find bin-index of points
    limits_enh = copy.deepcopy(limits).astype(float)
    limits_enh[-1] += EPS
    ind = np.digitize(xs, limits_enh)
    # assert np.alltrue(ind > 0)

    # bin effect is the mean of all points that lie in the bin
    nof_bins = limits.shape[0] - 1
    aggregated_effect = np.bincount(ind - 1, df_dxs, minlength=nof_bins)
    points_per_bin = np.bincount(ind - 1, minlength=nof_bins)

    # if no point lies in a bin, store Nan
    bin_effect_mean = np.divide(
        aggregated_effect,
        points_per_bin,
        out=np.ones(aggregated_effect.shape, dtype=float) * empty_symbol,
        where=points_per_bin != 0,
    )
    return bin_effect_mean, points_per_bin

compute_bin_variance(xs, df_dxs, limits, bin_effect_mean)

Compute the variance of the effect in each bin.

Notes

The function (a) allocates the points in the bins and (b) computes the variance and the variance/nof points. If less than two points in a bin, NaN is passed.

\[ \mathtt{bin\_variance}_k = {1 \over |i \in bin_k|} \sum_{i \in bin_k} (\mathtt{effect}_i - \mathtt{bin\_effect}_k)^2 \]
\[ \mathtt{bin\_estimator\_variance_k} = {\mathtt{bin\_variance}_k \over |i \in bin_k|} \]

Examples:

>>> n = 100
>>> xs = np.ones([n]) - 0.5
>>> df_dxs = np.ones_like(xs) * 10
>>> limits = np.array([0., 1., 2.0])
>>> bin_effect_mean, ppb = compute_bin_effect(xs, df_dxs, limits)
>>> bin_variance, bin_estimator_variance = compute_bin_variance(xs, df_dxs, limits, bin_effect_mean)
>>> bin_variance
array([ 0., nan])
>>> bin_estimator_variance
array([ 0., nan])
>>> xs = np.ones(4) * 0.5
>>> df_dxs = np.array([1.0, 3.0, 3.0, 5.0])
>>> limits = np.array([0, 1, 2.0])
>>> bin_effect_mean = np.array([np.mean(df_dxs), np.NaN])
>>> compute_bin_variance(xs, df_dxs, limits, bin_effect_mean)
(array([ 2., nan]), array([0.5, nan]))

Parameters:

- `xs` (`np.ndarray`, required): The points we evaluate, (N)
- `df_dxs` (`np.ndarray`, required): The effect of each point, (N, )
- `limits` (`np.ndarray`, required): The bin limits, (K+1)
- `bin_effect_mean` (`np.ndarray`, required): Mean effect in each bin, (K)

Returns:

- `bin_variance` (`np.ndarray`): The variance in each bin, (K, )
- `bin_estimator_variance` (`np.ndarray`): The variance of the estimator in each bin, (K, )

Source code in /home/runner/work/effector/effector/effector/utils.py
def compute_bin_variance(
    xs: np.ndarray, df_dxs: np.ndarray, limits: np.ndarray, bin_effect_mean: np.ndarray
) -> typing.Tuple[np.ndarray, np.ndarray]:
    """
    Compute the variance of the effect in each bin.

    Notes:
        The function (a) allocates the points in the bins and (b) computes the variance and the variance/nof points.
        If less than two points in a bin, NaN is passed.

        $$
        \mathtt{bin\_variance}_k = {1 \over |i \in bin_k|} \sum_{i \in bin_k}
        (\mathtt{effect}_i - \mathtt{bin\_effect}_k)^2
        $$

        $$
        \mathtt{bin\_estimator\_variance_k} = {\mathtt{bin\_variance}_k \over |i \in bin_k|}
        $$

    Examples:
        >>> n = 100
        >>> xs = np.ones([n]) - 0.5
        >>> df_dxs = np.ones_like(xs) * 10
        >>> limits = np.array([0., 1., 2.0])
        >>> bin_effect_mean, ppb = compute_bin_effect(xs, df_dxs, limits)
        >>> bin_variance, bin_estimator_variance = compute_bin_variance(xs, df_dxs, limits, bin_effect_mean)
        >>> bin_variance
        array([ 0., nan])
        >>> bin_estimator_variance
        array([ 0., nan])

        >>> xs = np.ones(4) * 0.5
        >>> df_dxs = np.array([1.0, 3.0, 3.0, 5.0])
        >>> limits = np.array([0, 1, 2.0])
        >>> bin_effect_mean = np.array([np.mean(df_dxs), np.NaN])
        >>> compute_bin_variance(xs, df_dxs, limits, bin_effect_mean)
        (array([ 2., nan]), array([0.5, nan]))

    Parameters:
        xs: The points we evaluate, (N)
        df_dxs: The effect of each point, (N, )
        limits: The bin limits (K+1)
        bin_effect_mean: Mean effect in each bin, (K)

    Returns:
        bin_variance: The variance in each bin, (K, )
        bin_estimator_variance: The variance of the estimator in each bin, (K, )

    """
    empty_symbol = np.NaN

    # find bin-index of points
    eps = 1e-8
    limits_enh = copy.deepcopy(limits).astype(float)
    limits_enh[-1] += eps
    ind = np.digitize(xs, limits_enh)
    # assert np.alltrue(ind > 0)

    # variance of the effect in each bin
    variance_per_point = (df_dxs - bin_effect_mean[ind - 1]) ** 2
    nof_bins = limits.shape[0] - 1
    aggregated_variance_per_bin = np.bincount(
        ind - 1, variance_per_point, minlength=nof_bins
    )
    points_per_bin = np.bincount(ind - 1, minlength=nof_bins)

    # if less than two points in a bin, store Nan
    bin_variance = np.divide(
        aggregated_variance_per_bin,
        points_per_bin,
        out=np.ones(aggregated_variance_per_bin.shape, dtype=float) * empty_symbol,
        where=points_per_bin > 1,
    )

    # the variance of the estimator
    bin_estimator_variance = np.divide(
        bin_variance,
        points_per_bin,
        out=np.ones(aggregated_variance_per_bin.shape, dtype=float) * empty_symbol,
        where=points_per_bin > 1,
    )
    return bin_variance, bin_estimator_variance

compute_jacobian_numerically(model, data, eps=1e-08)

Compute the Jacobian of the model using finite differences.

Notes

The function computes the Jacobian of the model using finite differences. The formula is:

\[ \mathtt{J} = {\mathtt{model}(x + \mathtt{eps}) - \mathtt{model}(x) \over \mathtt{eps}} \]

Examples:

>>> data = np.array([[1, 2], [2, 3.0]])
>>> model = lambda x: np.sum(x, axis=1)
>>> compute_jacobian_numerically(model, data)
array([[1., 1.],
       [1., 1.]])

Parameters:

- `data` (`np.ndarray`, required): The dataset, (N, D)
- `model` (`typing.Callable`, required): The black-box model ((N, D) -> (N))
- `eps` (`float`, default `1e-08`): The finite difference step

Returns:

- `jacobian` (`np.ndarray`): The Jacobian of the model, (N, D)

Source code in /home/runner/work/effector/effector/effector/utils.py
def compute_jacobian_numerically(
    model: typing.Callable, data: np.ndarray, eps: float = 1e-8
) -> np.ndarray:
    """Compute the Jacobian of the model using finite differences.

    Notes:
        The function computes the Jacobian of the model using finite differences. The formula is:

        $$
        \mathtt{J} = {\mathtt{model}(x + \mathtt{eps}) - \mathtt{model}(x) \over \mathtt{eps}}
        $$

    Examples:
        >>> data = np.array([[1, 2], [2, 3.0]])
        >>> model = lambda x: np.sum(x, axis=1)
        >>> compute_jacobian_numerically(model, data)
        array([[1., 1.],
               [1., 1.]])

    Args:
        data: The dataset, (N, D)
        model: The black-box model ((N, D) -> (N))
        eps: The finite difference step

    Returns:
        jacobian: The Jacobian of the model, (N, D)

    """
    assert data.ndim == 2
    jacobian = np.zeros_like(data)
    for f in range(data.shape[1]):
        data_plus = copy.deepcopy(data)
        data_plus[:, f] += eps
        jacobian[:, f] = (model(data_plus) - model(data)) / eps
    return jacobian

compute_local_effects(data, model, limits, feature)

Compute the local effects, permuting the feature of interest using the bin limits.

Notes

The function (a) allocates the points in the bins based on the feature of interest (foi) and (b) computes the effect as the difference when evaluating the output setting the foi at the right and the left limit of the bin.

Given that the bins are defined as a list [l_0, l_1, ..., l_k], and x_s of the i-th point belongs to the k-th bin:

\[ {df \over dx_s}(x^i) = {f(x_0^i, ... ,x_s=l_k, ..., x_D^i) - f(x_0^i, ... ,x_s=l_{k-1}, ..., x_D^i) \over l_k - l_{k-1}} \]

Examples:

>>> data = np.array([[1, 2], [2, 3.0]])
>>> model = lambda x: np.sum(x, axis=1)
>>> limits = np.array([1.0, 2.0])
>>> data_effect = compute_local_effects(data, model, limits, feature=0)
>>> data_effect
array([1., 1.])

Parameters:

- `data` (`np.ndarray`, required): The training set, (N, D)
- `model` (`typing.Callable`, required): The black-box model ((N, D) -> (N))
- `limits` (`np.ndarray`, required): The bin limits, (K+1)
- `feature` (`int`, required): Index of the feature-of-interest

Returns:

- `data_effect` (`np.ndarray`): The local effect of each data point, (N)

Source code in /home/runner/work/effector/effector/effector/utils.py
def compute_local_effects(
    data: np.ndarray, model: typing.Callable, limits: np.ndarray, feature: int
) -> np.ndarray:
    """Compute the local effects, permuting the feature of interest using the bin limits.

    Notes:
        The function (a) allocates the points in the bins based on the feature of interest (foi)
        and (b) computes the effect as the difference when evaluating the output setting the foi at the right and the
        left limit of the bin.

        Given that the bins are defined as a list [l_0, l_1, ..., l_k], and x_s of the i-th point belongs to the k-th bin:

        $$
        {df \over dx_s}(x^i) = {f(x_0^i, ... ,x_s=l_k, ..., x_D^i) - f(x_0^i, ... ,x_s=l_{k-1}, ..., x_D^i)
         \over l_k - l_{k-1}}
        $$


    Examples:
        >>> data = np.array([[1, 2], [2, 3.0]])
        >>> model = lambda x: np.sum(x, axis=1)
        >>> limits = np.array([1.0, 2.0])
        >>> data_effect = compute_local_effects(data, model, limits, feature=0)
        >>> data_effect
        array([1., 1.])

    Args:
        data: The training set, (N, D)
        model: The black-box model ((N, D) -> (N))
        limits: The bin limits, (K+1)
        feature: Index of the feature-of-interest

    Returns:
        data_effect: The local effect of each data point, (N)

    """
    assert data.ndim == 2

    # check that limits cover all data points
    assert limits[0] <= np.min(data[:, feature])
    assert limits[-1] >= np.max(data[:, feature])

    # for each point, find the bin-index it belongs to
    limits[-1] += EPS
    ind = np.digitize(data[:, feature], limits)
    assert np.alltrue(ind > 0)

    # compute effect
    right_lim = copy.deepcopy(data)
    left_lim = copy.deepcopy(data)
    right_lim[:, feature] = limits[ind]
    left_lim[:, feature] = limits[ind - 1]
    dx = limits[1] - limits[0]
    data_effect = model(right_lim) - model(left_lim)
    return np.squeeze(data_effect) / dx

fill_nans(x)

Replace NaNs with interpolated values.

Examples:

>>> x = np.array([1.0, np.NaN, 2.0])
>>> fill_nans(x)
array([1. , 1.5, 2. ])
>>> x = np.array([1.0, np.NaN, np.NaN, np.NaN, 2.0])
>>> fill_nans(x)
array([1.  , 1.25, 1.5 , 1.75, 2.  ])
>>> x = np.array([0.5, 1.0, np.NaN, np.NaN, np.NaN])
>>> fill_nans(x)
array([0.5, 1. , 1. , 1. , 1. ])

Parameters:

- `x` (`np.ndarray`, required): Time-series with NaNs, (T)

Returns:

- `x` (`np.ndarray`): Time-series values without NaNs, (T)

Source code in /home/runner/work/effector/effector/effector/utils.py
def fill_nans(x: np.ndarray) -> np.ndarray:
    """Replace NaNs with interpolated values.

    Examples:
        >>> x = np.array([1.0, np.NaN, 2.0])
        >>> fill_nans(x)
        array([1. , 1.5, 2. ])

        >>> x = np.array([1.0, np.NaN, np.NaN, np.NaN, 2.0])
        >>> fill_nans(x)
        array([1.  , 1.25, 1.5 , 1.75, 2.  ])

        >>> x = np.array([0.5, 1.0, np.NaN, np.NaN, np.NaN])
        >>> fill_nans(x)
        array([0.5, 1. , 1. , 1. , 1. ])

    Parameters:
        x: Time-series with NaNs, (T)

    Returns:
        x: Time-series values without NaNs, (T)
    """
    bin_effect_1 = copy.deepcopy(x)

    def nan_helper(y):
        return np.isnan(y), lambda z: z.nonzero()[0]

    nans, x = nan_helper(bin_effect_1)
    bin_effect_1[nans] = np.interp(x(nans), x(~nans), bin_effect_1[~nans])
    return bin_effect_1

filter_points_in_bin(xs, df_dxs, limits)

Filter the points inside the bin defined by the limits.

Notes

Filtering depends on whether xs lies in the interval [limits[0], limits[1]], not df_dxs.

Examples:

>>> xs = np.array([1, 2, 3])
>>> df_dxs = np.array([32, 34, 36])
>>> limits = np.array([1, 2])
>>> xs, df_dxs = filter_points_in_bin(xs, df_dxs, limits)
>>> xs
array([1, 2])
>>> df_dxs
array([32, 34])

Parameters:

- `xs` (`np.ndarray`, required): The instances, (N)
- `df_dxs` (`typing.Union[np.ndarray, None]`, required): The instance-effects, (N), or None
- `limits` (`np.ndarray`, required): [Start, Stop] of the bin

Returns:

- `xs` (`np.ndarray`): The instances in the bin, (nof_points_in_bin, )
- `df_dxs` (`typing.Union[np.ndarray, None]`): The instance-effects in the bin, (nof_points_in_bin, ), or None

Source code in /home/runner/work/effector/effector/effector/utils.py
def filter_points_in_bin(
    xs: np.ndarray, df_dxs: typing.Union[np.ndarray, None], limits: np.ndarray
) -> typing.Tuple[np.ndarray, typing.Union[np.ndarray, None]]:
    """
    Filter the points inside the bin defined by the `limits`.

    Notes:
        Filtering depends on whether `xs` lies in the interval [limits[0], limits[1]], not `df_dxs`.

    Examples:
        >>> xs = np.array([1, 2, 3])
        >>> df_dxs = np.array([32, 34, 36])
        >>> limits = np.array([1, 2])
        >>> xs, df_dxs = filter_points_in_bin(xs, df_dxs, limits)
        >>> xs
        array([1, 2])
        >>> df_dxs
        array([32, 34])

    Args:
        xs: The instances, (N)
        df_dxs: The instance-effects (N) or None
        limits: [Start, Stop] of the bin

    Returns:
        xs: The instances in the bin, (nof_points_in_bin, )
        df_dxs: The instance-effects in the bin, (nof_points_in_bin, ) or None

    """
    filt = np.logical_and(limits[0] <= xs, xs <= limits[1])

    # return data
    xs = xs[filt]

    # return data effect if not None
    if df_dxs is not None:
        df_dxs = df_dxs[filt]
    return xs, df_dxs

get_feature_types(data, categorical_limit=10)

Determine the type of each feature.

Notes

A feature is considered categorical if it has fewer than categorical_limit unique values.

Parameters:

- `data` (`np.ndarray`, required): The dataset, (N, D)
- `categorical_limit` (`int`, default `10`): Maximum unique values for a feature to be considered as categorical

Returns:

- `types` (`typing.List[str]`): A list of strings, where each string is either `"cat"` or `"cont"`

Source code in /home/runner/work/effector/effector/effector/utils.py
def get_feature_types(data: np.ndarray, categorical_limit: int = 10) -> typing.List[str]:
    """Determine the type of each feature.

    Notes:
        A feature is considered categorical if it has fewer than `categorical_limit` unique values.

    Args:
        data: The dataset, (N, D)
        categorical_limit: Maximum unique values for a feature to be considered as categorical


    Returns:
        types: A list of strings, where each string is either `"cat"` or `"cont"`

    """

    types = [
        "cat" if len(np.unique(data[:, f])) < categorical_limit else "cont"
        for f in range(data.shape[1])
    ]
    return types
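
get_feature_types ships without a doctest, so here is a small sketch; the toy data is an illustrative assumption. The first column has ~100 unique continuous values, the second only three.

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> data = np.column_stack([rng.normal(size=100), rng.integers(0, 3, size=100)])
>>> get_feature_types(data, categorical_limit=10)
['cont', 'cat']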