
"""Decision tree data descriptors"""
from __future__ import annotations

from typing import Callable

from sklearn.ensemble import IsolationForest

from frlearn.base import DataDescriptor


class EIF(DataDescriptor):
    """
    Wrapper for the Extended Isolation Forest (EIF) data descriptor [1]_.
    Expresses the effort required to isolate a query instance from the target data
    by separating instances with random hyperplanes.
    Requires the eif library, which is not automatically installed.

    Parameters
    ----------
    psi : int or (int -> int) = 256
        Sub-sampling size. Number of training instances to use for each random tree.
        Should be either a positive integer,
        or a function that takes the size of the target class and returns such an integer.
        If the size of the target class is a smaller number, that will be used instead.

    t : int = 100
        Number of random trees.

    random_state : int = 0
        Random state to use.

    preprocessors : iterable = ()
        Preprocessors to apply.

    eif_params
        Additional keyword parameters will be passed on as-is to eif's iForest constructor.

    Notes
    -----
    Scores are the complement of the anomaly scores in [1]_.
    `psi` and `t` are two hyperparameters that can potentially be tuned,
    but the default values should be good enough [2]_.

    References
    ----------
    .. [1] `Hariri S, Carrasco Kind M, Brunner RJ (2021).
       Extended Isolation Forest.
       IEEE Transactions on Knowledge and Data Engineering, vol 33, no 4, pp 1479–1489.
       doi: 10.1109/TKDE.2019.2947676
       <https://ieeexplore.ieee.org/document/8888179>`_
    .. [2] `Liu FT, Ting KM, Zhou Z-H (2008).
       Isolation Forest.
       ICDM 2008: Proceedings of the Eighth IEEE International Conference on Data Mining, pp 413–422.
       IEEE.
       doi: 10.1109/ICDM.2008.17
       <https://ieeexplore.ieee.org/document/4781136>`_
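
    Examples
    --------
    A minimal usage sketch on synthetic data. It assumes that the optional
    eif library is installed and that, as elsewhere in frlearn, calling the
    descriptor on the target data constructs a model, which can then be
    queried for normality scores:

    >>> import numpy as np
    >>> from frlearn.trees.data_descriptors import EIF
    >>> X = np.random.default_rng(0).normal(size=(200, 4))
    >>> model = EIF(psi=64, t=50)(X)
    >>> scores = model(X[:5])  # higher values: more typical of the target data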
    """

    def __init__(
            self,
            psi: int | Callable[[int], int] = 256,
            t: int = 100,
            random_state: int = 0,
            preprocessors=(),
            **eif_params
    ):
        super().__init__(preprocessors=preprocessors)
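        # Import check only: fail fast at construction time if the optional
        # eif dependency is missing, rather than later when fitting.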
        try:
            import eif
        except ImportError:
            raise ImportError('EIF data descriptor requires the eif library.') from None
        self.psi = psi
        self.t = t
        self.random_state = random_state
        self.eif_params = eif_params

    def _construct(self, X):
        import eif
        model = super()._construct(X)
        # Resolve `psi`: apply it to the target class size if it is a function,
        # and never use more instances than are available.
        psi = self.psi(X.shape[0]) if callable(self.psi) else self.psi
        model.psi = min(psi, X.shape[0])
        model.t = self.t
        model.random_state = self.random_state
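        # ExtensionLevel = n_features - 1 corresponds to fully extended
        # isolation trees: splitting hyperplanes with unrestricted slopes [1]_.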
        model.forest = eif.iForest(
            X,
            ntrees=model.t, sample_size=model.psi, seed=model.random_state,
            ExtensionLevel=X.shape[1] - 1, **self.eif_params
        )
        return model

    class Model(DataDescriptor.Model):

        psi: int
        t: int
        random_state: int
        forest: ...

        def _query(self, X):
            # convert anomaly scores to normality scores
            return 1 - self.forest.compute_paths(X_in=X)


class IF(DataDescriptor):
    """
    Wrapper for the Isolation Forest (IF) data descriptor [1]_ implemented in scikit-learn.
    Expresses the effort required to isolate a query instance from the target data
    by random splits on attribute values.

    Parameters
    ----------
    psi : int or (int -> int) = 256
        Sub-sampling size. Number of training instances to use for each random tree.
        Should be either a positive integer,
        or a function that takes the size of the target class and returns such an integer.
        If the size of the target class is a smaller number, that will be used instead.

    t : int = 100
        Number of random trees.

    random_state : int = 0
        Random state to use.

    preprocessors : iterable = ()
        Preprocessors to apply.

    sklearn_params
        Additional keyword parameters will be passed on as-is to scikit-learn's
        IsolationForest constructor.

    Notes
    -----
    Scores are the complement of the anomaly scores in [1]_.
    `psi` and `t` are two hyperparameters that can potentially be tuned,
    but the default values should be good enough [1]_.

    References
    ----------
    .. [1] `Liu FT, Ting KM, Zhou Z-H (2008).
       Isolation Forest.
       ICDM 2008: Proceedings of the Eighth IEEE International Conference on Data Mining, pp 413–422.
       IEEE.
       doi: 10.1109/ICDM.2008.17
       <https://ieeexplore.ieee.org/document/4781136>`_
    """

    def __init__(
            self,
            psi: int | Callable[[int], int] = 256,
            t: int = 100,
            random_state: int = 0,
            preprocessors=(),
            **sklearn_params,
    ):
        super().__init__(preprocessors=preprocessors)
        self.psi = psi
        self.t = t
        self.random_state = random_state
        self.sklearn_params = sklearn_params

    def _construct(self, X):
        model = super()._construct(X)
        # Resolve `psi`: apply it to the target class size if it is a function,
        # and never use more instances than are available.
        psi = self.psi(X.shape[0]) if callable(self.psi) else self.psi
        model.psi = min(psi, X.shape[0])
        model.t = self.t
        model.random_state = self.random_state
        model.forest = IsolationForest(
            max_samples=model.psi, n_estimators=model.t,
            random_state=self.random_state, **self.sklearn_params
        ).fit(X)
        return model
    class Model(DataDescriptor.Model):

        psi: int
        t: int
        random_state: int
        forest: IsolationForest

        def _query(self, X):
            # map scikit-learn's scores from [-1, 0] to normality scores in [0, 1]
            return 1 + self.forest.score_samples(X)
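

# A minimal end-to-end sketch (not part of the library API): it assumes
# frlearn's construct-then-query convention, where calling a data descriptor
# on target data returns a model and calling that model on query data returns
# normality scores. The dataset below is synthetic and purely illustrative.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.default_rng(0)
    X_target = rng.normal(size=(200, 4))          # target class
    X_query = np.vstack([
        rng.normal(size=(5, 4)),                  # typical instances
        rng.normal(loc=5.0, size=(5, 4)),         # shifted instances
    ])

    model = IF(psi=64, t=50)(X_target)
    scores = model(X_query)
    # Higher scores indicate instances that are harder to isolate from the
    # target data; the shifted instances should receive lower scores.
    print(scores)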