"""Decision tree data descriptors"""
from __future__ import annotations
from typing import Callable
from sklearn.ensemble import IsolationForest
from frlearn.base import DataDescriptor
class EIF(DataDescriptor):
    """
    Wrapper for the Extended Isolation Forest (EIF) data descriptor [1]_.
    Requires the eif library, which is not automatically installed.

    Expresses the effort required to isolate a query instance from the target data
    by separating instances with random hyperplanes.

    Parameters
    ----------
    psi : int or (int -> int) = 256
        Sub-sampling size. Number of training instances to use for each random tree.
        Should be either a positive integer,
        or a function that takes the size of the target class and returns such an integer.
        If the size of the target class is a smaller number, that will be used instead.

    t : int = 100
        Number of random trees.

    random_state : int = 0
        Random state to use.

    eif_params
        Additional keyword parameters will be passed on as-is to eif's iForest constructor.

    preprocessors : iterable = ()
        Preprocessors to apply.

    Raises
    ------
    ImportError
        On construction, if the eif library cannot be imported.

    Notes
    -----
    Scores are the complement of the anomaly scores in [1]_.

    `psi` and `t` are two hyperparameters that can potentially be tuned,
    but the default values should be good enough [2]_.

    References
    ----------
    .. [1] `Hariri S, Carrasco Kind M, Brunner RJ (2021).
       Extended Isolation Forest.
       IEEE Transactions on Knowledge and Data Engineering, vol 33, no 4, pp 1479–1489.
       doi: 10.1109/TKDE.2019.2947676
       <https://ieeexplore.ieee.org/document/8888179>`_
    .. [2] `Liu FT, Ting KM, Zhou Z-H (2008).
       Isolation Forest.
       ICDM 2008: Proceedings of the Eighth IEEE International Conference on Data Mining, pp 413–422.
       IEEE.
       doi: 10.1109/ICDM.2008.17
       <https://ieeexplore.ieee.org/document/4781136>`_
    """

    def __init__(
            self,
            psi: int | Callable[[int], int] = 256,
            t: int = 100,
            random_state: int = 0,
            preprocessors=(),
            **eif_params
    ):
        super().__init__(preprocessors=preprocessors)
        # Probe for the optional dependency now, so that a missing library is
        # reported at instantiation time rather than at the first construction.
        try:
            import eif  # noqa: F401
        except ImportError:
            raise ImportError('EIF data descriptor requires the eif library.') from None
        self.psi = psi
        self.t = t
        self.random_state = random_state
        self.eif_params = eif_params

    def _construct(self, X):
        """Build a model holding an eif forest constructed from the target data `X`."""
        import eif

        model = super()._construct(X)

        # `psi` may be a callable of the target class size; resolve it to an
        # integer first, then cap it at the number of available instances.
        n_instances = X.shape[0]
        psi = self.psi(n_instances) if callable(self.psi) else self.psi
        model.psi = min(psi, n_instances)
        model.t = self.t
        model.random_state = self.random_state

        # The extension level is set to the number of columns of X minus one.
        model.forest = eif.iForest(
            X,
            ntrees=model.t,
            sample_size=model.psi,
            seed=model.random_state,
            ExtensionLevel=X.shape[1] - 1,
            **self.eif_params
        )
        return model

    class Model(DataDescriptor.Model):
        """Constructed model: resolved hyperparameters and the eif forest."""

        psi: int
        t: int
        random_state: int
        forest: ...

        def _query(self, X):
            """Return one minus the value of `forest.compute_paths` for `X`."""
            # convert anomaly scores to normality scores
            return 1 - self.forest.compute_paths(X_in=X)
class IF(DataDescriptor):
    """
    Wrapper for the Isolation Forest (IF) data descriptor [1]_ implemented in scikit-learn.

    Expresses the effort required to isolate a query instance from the target data
    by random splits on attribute values.

    Parameters
    ----------
    psi : int or (int -> int) = 256
        Sub-sampling size. Number of training instances to use for each random tree.
        Should be either a positive integer,
        or a function that takes the size of the target class and returns such an integer.
        If the size of the target class is a smaller number, that will be used instead.

    t : int = 100
        Number of random trees.

    random_state : int = 0
        Random state to use.

    preprocessors : iterable = ()
        Preprocessors to apply.

    sklearn_params
        Additional keyword parameters will be passed on as-is to scikit-learn's IsolationForest constructor.

    Notes
    -----
    Scores are the complement of the anomaly scores in [1]_.

    `psi` and `t` are two hyperparameters that can potentially be tuned,
    but the default values should be good enough [1]_.

    References
    ----------
    .. [1] `Liu FT, Ting KM, Zhou Z-H (2008).
       Isolation Forest.
       ICDM 2008: Proceedings of the Eighth IEEE International Conference on Data Mining, pp 413–422.
       IEEE.
       doi: 10.1109/ICDM.2008.17
       <https://ieeexplore.ieee.org/document/4781136>`_
    """

    def __init__(
            self,
            psi: int | Callable[[int], int] = 256,
            t: int = 100,
            random_state: int = 0,
            preprocessors=(),
            **sklearn_params,
    ):
        super().__init__(preprocessors=preprocessors)
        self.psi = psi
        self.t = t
        self.random_state = random_state
        self.sklearn_params = sklearn_params

    def _construct(self, X):
        """Build a model holding a scikit-learn IsolationForest fitted on the target data `X`."""
        model = super()._construct(X)

        # `psi` may be a callable of the target class size; resolve it to an
        # integer first, then cap it at the number of available instances.
        n_instances = X.shape[0]
        psi = self.psi(n_instances) if callable(self.psi) else self.psi
        model.psi = min(psi, n_instances)
        model.t = self.t
        model.random_state = self.random_state

        model.forest = IsolationForest(
            max_samples=model.psi,
            n_estimators=model.t,
            random_state=model.random_state,
            **self.sklearn_params
        ).fit(X)
        return model

    class Model(DataDescriptor.Model):
        """Constructed model: resolved hyperparameters and the fitted IsolationForest."""

        psi: int
        t: int
        random_state: int
        forest: IsolationForest

        def _query(self, X):
            """Return `forest.score_samples(X)` shifted up by one."""
            # map from [-1, 0] to [0, 1]
            return 1 + self.forest.score_samples(X)