# Source code for frlearn.neighbours.data_descriptors

"""Nearest neighbour data descriptors"""
from __future__ import annotations

from abc import abstractmethod
from typing import Callable

import numpy as np

from frlearn.array_functions import div_or, soft_head, soft_max
from frlearn.base import DataDescriptor
from frlearn.feature_preprocessors import IQRNormaliser
from frlearn.neighbour_search_methods import NeighbourSearchMethod, KDTree
from frlearn.neighbours.utilities import resolve_k
from frlearn.parametrisations import log_multiple
from frlearn.transformations import shifted_reciprocal
from frlearn.weights import LinearWeights
from frlearn.uncategorised.utilities import resolve_dissimilarity

# TODO: consider implementing NNDescriptor as addition of NeighbourSearchMethod to preprocessors,
# but have to handle k somehow (especially for ALP which also has l)

class NNDataDescriptor(DataDescriptor):
    """
    Base class for data descriptors that score query instances
    on the basis of their nearest neighbours among the target data.

    Concrete descriptors subclass both this class and its nested `Model`,
    and implement `Model._query`.

    Parameters
    ----------
    dissimilarity: str
        The dissimilarity measure to use, in any form accepted by
        `resolve_dissimilarity`.

    k: int or (int -> int) or None
        Number of nearest neighbours to query.
        Resolved against the number of target records when a model is
        constructed (see `Model._resolve_k`).

    nn_search: NeighbourSearchMethod
        Nearest neighbour search method, applied to the target data and the
        resolved dissimilarity to obtain the neighbour model.

    localised: bool = False
        Whether `k` also has to be valid for target records,
        while excluding these from being their own nearest neighbour.

    preprocessors: iterable = ()
        Preprocessors to apply. Forwarded to the parent constructor.
    """

    def __init__(
            self,
            dissimilarity: str,
            k: int | Callable[[int], int] | None,
            nn_search: NeighbourSearchMethod,
            localised: bool = False,
            preprocessors=(),
    ):
        # NOTE(review): `self`, `localised`, `preprocessors` and the call to the
        # parent constructor were missing from this copy of the file. They are
        # reconstructed from the keyword arguments every subclass passes to
        # `super().__init__` -- confirm against `DataDescriptor.__init__`.
        super().__init__(preprocessors=preprocessors)
        self.dissimilarity = resolve_dissimilarity(dissimilarity)
        self.nn_search = nn_search
        self.k = k
        self.localised = localised

    def _construct(self, X) -> Model:
        """
        Build a model on the target data `X`: create the neighbour model
        and resolve `k` against the number of target records.
        """
        model: NNDataDescriptor.Model = super()._construct(X)
        nn_model = self.nn_search(X, self.dissimilarity)
        model.nn_model = nn_model
        # TODO: is this the right way to resolve k?
        model.k = model._resolve_k(self.k, localised=self.localised)
        return model

    class Model(DataDescriptor.Model):
        """
        Fitted nearest neighbour data descriptor.
        Holds the neighbour model and the resolved number of neighbours `k`.
        """

        nn_model: NeighbourSearchMethod.Model
        k: int

        def __call__(self, X):
            """
            Score the query instances `X`: apply the preprocessing models,
            query the `k` nearest neighbours and delegate to `_query`.
            """
            # TODO: inherit from super
            for preprocessing_model in self.preprocessing_models:
                X = preprocessing_model(X)
            q_neighbours, q_distances = self.nn_model(X, self.k)
            return self._query(q_neighbours, q_distances)

        # NOTE(review): the method body was missing from this copy of the file;
        # `abstractmethod` is imported at module level and every subclass in
        # this module overrides `_query`, so it is presumed abstract.
        @abstractmethod
        def _query(self, q_neighbours, q_distances):
            """
            Turn the neighbour indices `q_neighbours` and neighbour distances
            `q_distances` of the query instances into scores.
            To be implemented by subclasses.
            """

        def _resolve_k(self, k: float | Callable[[int], float] | None, localised: bool = False):
            """
            Helper method to obtain a valid number of neighbours
            from a parameter `k` given `n` target records,
            where `k` may be defined in terms of `n`.
            The maximum number of neighbours `k_max` is `n`,
            unless `localised` is `True`, in which case it is `n - 1`.

            Parameters
            ----------
            k: float or (int -> float) or None
                Parameter value to resolve. Can be a float,
                a callable that takes `n` and returns a float,
                or None.

            localised: bool = False
                Whether `k` also has to be valid for target records,
                while excluding these from being their own nearest neighbour.
                If so, then `k_max` is `n - 1`.

            Returns
            -------
            k: int
               If `k` is a float in [1, k_max]: `k`;
               If `k` is None: `k_max`;
               If `k` is callable, the output of `k` applied to `n`,
               rounded to the nearest integer in `[1, k_max]`.

            Raises
            ------
            ValueError
                If `k` is a float not in [1, k_max].
                (Raised by `resolve_k`; the exception type is presumed --
                TODO confirm.)
            """
            n = len(self)
            k_max = n - 1 if localised else n
            return resolve_k(k, n, k_max)

class ALP(NNDataDescriptor):
    """
    Implementation of the Average Localised Proximity (ALP) data descriptor [1]_.
    Expresses the proximity of a query instance to the target data,
    by localising its nearest neighbour distances
    against the local nearest neighbour distances in the target data.

    Parameters
    ----------
    dissimilarity: str or float or (np.array -> float) or ((np.array, np.array) -> float) = 'boscovich'
        The dissimilarity measure to use.

        A vector size measure `np.array -> float` induces a dissimilarity measure through application to `y - x`.
        A float is interpreted as Minkowski size with the corresponding value for `p`.
        For convenience, a number of popular measures can be referred to by name.

        The default is the Boscovich norm (also known as cityblock, Manhattan or taxicab norm).

    k : int or (int -> float) or None = 5.5 * log n
        How many nearest neighbour distances / localised proximities to consider.
        Corresponds to the scale at which proximity is evaluated.
        Should be either a positive integer,
        or a function that takes the target class size `n` and returns a float,
        or None, which is resolved as `n`.
        All such values are rounded to the nearest integer in `[1, n]`.

    l : int or (int -> float) or None = 6 * log n
        How many nearest neighbours to use for determining the local ith nearest neighbour distance, for each `i <= k`.
        Lower values correspond to more localisation.
        Should be either a positive integer,
        or a function that takes the target class size `n` and returns a float,
        or None, which is resolved as `n`.
        All such values are rounded to the nearest integer in `[1, n]`.

    scale_weights : (int -> np.array) or None = LinearWeights()
        Weights to use for calculating the soft maximum of localised proximities.
        Determines to which extent scales with high localised proximity are emphasised.

    localisation_weights : (int -> np.array) or None = LinearWeights()
        Weights to use for calculating the local ith nearest neighbour distance, for each `i <= k`.
        Determines to which extent nearer neighbours dominate.

    nn_search : NeighbourSearchMethod = KDTree()
        Nearest neighbour search method to use.

    max_array_size : int = 2**26
        Maximum array size to use. For a query set of size `q`,
        calculating local distances requires an array of size `[q, l, k]`,
        which can be too large to fit in memory.
        If the size of this array is larger than `max_array_size`,
        a query set is batch-processed, which is slower.
        TODO: determine maximum array size dynamically, investigate lowering float precision

    preprocessors : iterable = (IQRNormaliser(), )
        Preprocessors to apply. The default interquartile range normaliser
        rescales all features to ensure that they all have the same interquartile range.

    Notes
    -----
    `k` and `l` are the two principal hyperparameters that can be tuned to increase performance.
    Their default values are based on the empirical evaluation in [1]_.

    References
    ----------
    .. [1] `Lenz OU, Peralta D, Cornelis C (2021).
       Average Localised Proximity: A new data descriptor with good default one-class classification performance.
       Pattern Recognition, vol 118, no 107991.
       doi: 10.1016/j.patcog.2021.107991
       <https://doi.org/10.1016/j.patcog.2021.107991>`_
    """

    def __init__(
            self,
            dissimilarity: str | float | Callable[[np.array], float] | Callable[[np.array, np.array], float] = 'boscovich',
            k: int | Callable[[int], float] | None = log_multiple(5.5),
            l: int | Callable[[int], float] | None = log_multiple(6),
            scale_weights: Callable[[int], np.array] | None = LinearWeights(),
            localisation_weights: Callable[[int], np.array] | None = LinearWeights(),
            nn_search: NeighbourSearchMethod = KDTree(),
            max_array_size: int = 2**26,
            preprocessors=(IQRNormaliser(), )
    ):
        super().__init__(
            dissimilarity=dissimilarity,
            k=k,
            nn_search=nn_search,
            localised=True,
            preprocessors=preprocessors,
        )
        self.l = l
        self.scale_weights = scale_weights
        self.localisation_weights = localisation_weights
        self.max_array_size = max_array_size

    def _construct(self, X) -> Model:
        """
        Build a model on the target data `X`: resolve `l`, store the `k`
        nearest neighbour distances of the target records among themselves,
        and copy the weights and the array size limit onto the model.
        """
        model: ALP.Model = super()._construct(X)
        model.l = model._resolve_k(self.l, localised=False)
        # A single neighbour query has to serve both `k` and `l`.
        model._kl = max(model.k, model.l)
        _, model.distances = model.nn_model.query_self(model.k)
        model.scale_weights = self.scale_weights
        model.localisation_weights = self.localisation_weights
        # Previously the limit never reached the model and `_query` used a
        # hard-coded 2**26 instead of the value passed to the constructor.
        model.max_array_size = self.max_array_size
        return model

    class Model(NNDataDescriptor.Model):
        """
        Fitted ALP data descriptor.
        """

        l: int
        _kl: int
        distances: np.ndarray
        scale_weights: Callable[[int], np.array]
        localisation_weights: Callable[[int], np.array]
        # Class-level default, so that a model that was never assigned a limit
        # keeps the historical value.
        max_array_size: int = 2**26

        def __call__(self, X):
            """
            Score the query instances `X`: apply the preprocessing models,
            query `max(k, l)` neighbours once, then pass the first `l`
            neighbours and the first `k` distances to `_query`.
            """
            # TODO: inherit from super
            for preprocessing_model in self.preprocessing_models:
                X = preprocessing_model(X)
            q_neighbours, q_distances = self.nn_model(X, self._kl)
            return self._query(q_neighbours[..., :self.l], q_distances[..., :self.k])

        def _query(self, q_neighbours, q_distances):
            """
            Localise the query distances `q_distances` against the stored
            distances of the target records indexed by `q_neighbours`,
            and aggregate the resulting proximities with a soft maximum.
            """
            # Number of query rows per batch such that the gathered
            # [batch, l, k] array stays within `max_array_size`.
            # Clamped to 1: a step of 0 would make `range` raise ValueError.
            batch_size = max(1, self.max_array_size // (self.k * self.l))
            local_distances = np.concatenate([
                soft_head(
                    self.distances[q_neighbours[start:start + batch_size]],
                    self.localisation_weights,
                    self.l,
                    axis=-2,
                )
                for start in range(0, q_neighbours.shape[0], batch_size)
            ], axis=0)

            # if both distances are zero, default to 1
            localised_distances = div_or(q_distances, local_distances, 1)
            localised_proximities = shifted_reciprocal(localised_distances)
            return soft_max(localised_proximities, self.scale_weights, self.k)
class LNND(NNDataDescriptor):
    """
    Implementation of the Localised Nearest Neighbour Distance (LNND) data descriptor [1]_ [2]_.

    Parameters
    ----------
    dissimilarity: str or float or (np.array -> float) or ((np.array, np.array) -> float) = 'boscovich'
        The dissimilarity measure to use.

        A vector size measure `np.array -> float` induces a dissimilarity measure through application to `y - x`.
        A float is interpreted as Minkowski size with the corresponding value for `p`.
        For convenience, a number of popular measures can be referred to by name.

        The default is the Boscovich norm (also known as cityblock, Manhattan or taxicab norm).

    k : int or (int -> float) or None = 3.4 * log n
        Which nearest neighbour to consider.
        Should be either a positive integer,
        or a function that takes the target class size `n` and returns a float,
        or None, which is resolved as `n`.
        All such values are rounded to the nearest integer in `[1, n]`.

    nn_search : NeighbourSearchMethod = KDTree()
        Nearest neighbour search method to use.

    preprocessors : iterable = (IQRNormaliser(), )
        Preprocessors to apply. The default interquartile range normaliser
        rescales all features to ensure that they all have the same interquartile range.

    Notes
    -----
    The scores are derived with 1/(1 + l_distances).

    `k` is the principal hyperparameter that can be tuned to increase performance.
    Its default value is based on the empirical evaluation in [3]_.

    References
    ----------
    .. [1] de Ridder D, Tax DMJ, Duin RPW (1998).
       An experimental comparison of one-class classification methods.
       ASCI`98: Proceedings of the Fourth Annual Conference of the Advanced School for Computing and Imaging,
       213–218. ASCI.
    .. [2] `Tax DMJ, Duin RPW (1998).
       Outlier detection using classifier instability.
       SSPR/SPR 1998: Joint IAPR International Workshops on Statistical Techniques in Pattern Recognition
       and Structural and Syntactic Pattern Recognition, 593--601. Springer.
       doi: 10.1007/BFb0033283
       <https://doi.org/10.1007/BFb0033283>`_
    .. [3] `Lenz OU, Peralta D, Cornelis C (2021).
       Average Localised Proximity: A new data descriptor with good default one-class classification performance.
       Pattern Recognition, vol 118, no 107991.
       doi: 10.1016/j.patcog.2021.107991
       <https://doi.org/10.1016/j.patcog.2021.107991>`_
    """

    def __init__(
            self,
            dissimilarity: str or float or Callable[[np.array], float] or Callable[[np.array, np.array], float] = 'boscovich',
            k: int or Callable[[int], float] or None = log_multiple(3.4),
            nn_search: NeighbourSearchMethod = KDTree(),
            preprocessors=(IQRNormaliser(), )
    ):
        super().__init__(
            dissimilarity=dissimilarity,
            k=k,
            nn_search=nn_search,
            localised=True,
            preprocessors=preprocessors,
        )

    def _construct(self, X) -> Model:
        """
        Build a model on the target data `X` and store, for every target
        record, the last of its `k` self-query neighbour distances.
        """
        model: LNND.Model = super()._construct(X)
        self_query = model.nn_model.query_self(model.k)
        # Only the distances are needed; keep their final column.
        model.distances = self_query[1][:, -1]
        return model

    class Model(NNDataDescriptor.Model):
        """
        Fitted LNND data descriptor.
        """

        distances: np.ndarray

        def _query(self, q_neighbours, q_distances):
            """
            Divide the kth neighbour distance of each query instance by the
            stored distance of that kth neighbour, and map the ratio to a score.
            """
            kth = self.k - 1
            query_kth_distance = q_distances[:, kth]
            neighbour_kth_distance = self.distances[q_neighbours[:, kth]]
            # When numerator and denominator are both zero, the ratio defaults to 1.
            ratio = div_or(query_kth_distance, neighbour_kth_distance, 1)
            # Infinities become very large finite numbers; nans are kept on
            # purpose (they should not occur) so that problems remain visible.
            ratio = np.nan_to_num(ratio, nan=np.nan)
            return shifted_reciprocal(ratio)
class LOF(NNDataDescriptor):
    """
    Implementation of the Local Outlier Factor (LOF) data descriptor [1]_.

    Parameters
    ----------
    dissimilarity: str or float or (np.array -> float) or ((np.array, np.array) -> float) = 'boscovich'
        The dissimilarity measure to use.

        A vector size measure `np.array -> float` induces a dissimilarity measure through application to `y - x`.
        A float is interpreted as Minkowski size with the corresponding value for `p`.
        For convenience, a number of popular measures can be referred to by name.

        The default is the Boscovich norm (also known as cityblock, Manhattan or taxicab norm).

    k : int or (int -> float) or None = 2.5 * log n
        How many nearest neighbours to consider.
        Should be either a positive integer,
        or a function that takes the target class size `n` and returns a float,
        or None, which is resolved as `n`.
        All such values are rounded to the nearest integer in `[1, n]`.

    nn_search : NeighbourSearchMethod = KDTree()
        Nearest neighbour search method to use.

    preprocessors : iterable = (IQRNormaliser(), )
        Preprocessors to apply. The default interquartile range normaliser
        rescales all features to ensure that they all have the same interquartile range.

    Notes
    -----
    The scores are derived with 1/(1 + lof).

    `k` is the principal hyperparameter that can be tuned to increase performance.
    Its default value is based on the empirical evaluation in [2]_.

    References
    ----------
    .. [1] `Breunig MM, Kriegel H-P, Ng RT, Sander J (2000).
       LOF: identifying density-based local outliers.
       SIGMOD 2000: ACM international conference on Management of data, 93–104. ACM.
       doi: 10.1145/342009.335388
       <https://doi.org/10.1145/342009.335388>`_
    .. [2] `Lenz OU, Peralta D, Cornelis C (2021).
       Average Localised Proximity: A new data descriptor with good default one-class classification performance.
       Pattern Recognition, vol 118, no 107991.
       doi: 10.1016/j.patcog.2021.107991
       <https://doi.org/10.1016/j.patcog.2021.107991>`_
    """

    def __init__(
            self,
            dissimilarity: str or float or Callable[[np.array], float] or Callable[[np.array, np.array], float] = 'boscovich',
            k: int or Callable[[int], float] or None = log_multiple(2.5),
            nn_search: NeighbourSearchMethod = KDTree(),
            preprocessors=(IQRNormaliser(), )
    ):
        super().__init__(
            dissimilarity=dissimilarity,
            k=k,
            nn_search=nn_search,
            localised=True,
            preprocessors=preprocessors,
        )

    def _construct(self, X) -> Model:
        """
        Build a model on the target data `X`, storing for every target record
        the last of its `k` self-query neighbour distances and its local
        reachability density.
        """
        model: LOF.Model = super()._construct(X)
        self_neighbours, self_distances = model.nn_model.query_self(model.k)
        # `distances` has to be assigned first: `_get_lrd` reads it.
        model.distances = self_distances[:, -1]
        model.lrd = model._get_lrd(self_neighbours, self_distances)
        return model

    class Model(NNDataDescriptor.Model):
        """
        Fitted LOF data descriptor.
        """

        distances: np.ndarray
        lrd: np.ndarray

        def _get_lrd(self, q_neighbours, q_distances):
            """
            Local reachability density: the reciprocal of the mean, over the
            neighbours, of the larger of the distance to a neighbour and that
            neighbour's own stored distance.
            """
            neighbour_distances = self.distances[q_neighbours]
            reach_distances = np.maximum(q_distances, neighbour_distances)
            mean_reach_distance = np.mean(reach_distances, axis=-1)
            return 1/mean_reach_distance

        def _query(self, q_neighbours, q_distances):
            """
            Compute the local outlier factor of the query instances as the
            mean density of their neighbours over their own density,
            and map it to a score.
            """
            own_density = self._get_lrd(q_neighbours, q_distances)
            neighbour_density = np.mean(self.lrd[q_neighbours], axis=-1)
            lof = neighbour_density / own_density
            # inf/inf yields nan; treat those cases as a factor of 1
            undefined = np.isnan(lof)
            lof[undefined] = 1
            return shifted_reciprocal(lof)
class NND(NNDataDescriptor):
    """
    Implementation of the Nearest Neighbour Distance (NND) data descriptor,
    which goes back to at least [1]_.
    It has also been used to calculate upper and lower approximations of fuzzy rough sets,
    where the addition of aggregation with OWA operators is due to [2]_.

    Parameters
    ----------
    dissimilarity: str or float or (np.array -> float) or ((np.array, np.array) -> float) = 'boscovich'
        The dissimilarity measure to use.

        A vector size measure `np.array -> float` induces a dissimilarity measure through application to `y - x`.
        A float is interpreted as Minkowski size with the corresponding value for `p`.
        For convenience, a number of popular measures can be referred to by name.

        The default is the Boscovich norm (also known as cityblock, Manhattan or taxicab norm).

    k : int or (int -> float) or None = 1
        Which nearest neighbour(s) to consider.
        Should be either a positive integer,
        or a function that takes the target class size `n` and returns a float,
        or None, which is resolved as `n`.
        All such values are rounded to the nearest integer in `[1, n]`.
        If `weights = None`, only the kth neighbour is used,
        otherwise closer neighbours are also taken into account.

    weights : (int -> np.array) or None = None
        How to aggregate the proximity values from the `k` nearest neighbours.
        The default is to only consider the kth nearest neighbour distance.

    proximity : float -> float = shifted_reciprocal
        The function used to convert distance values to proximity values.
        It should be an order-reversing map from `[0, ∞)` to `[0, 1]`.

    nn_search : NeighbourSearchMethod = KDTree()
        Nearest neighbour search method to use.

    preprocessors : iterable = (IQRNormaliser(), )
        Preprocessors to apply. The default interquartile range normaliser
        rescales all features to ensure that they all have the same interquartile range.

    Notes
    -----
    `k` is the principal hyperparameter that can be tuned to increase performance.
    Its default value is based on the empirical evaluation in [3]_.

    References
    ----------
    .. [1] Knorr EM, Ng RT (1997).
       A Unified Notion of Outliers: Properties and Computation.
       KDD-97: Proceedings of the Third International Conference on Knowledge Discovery and Data Mining,
       pp 219–222. AAAI.
       doi: 10.5555/3001392.3001438
    .. [2] `Cornelis C, Verbiest N, Jensen R (2010).
       Ordered weighted average based fuzzy rough sets.
       RSKT 2010: Proceedings of the 5th International Conference on Rough Set and Knowledge Technology,
       pp 78--85. Springer, Lecture Notes in Artificial Intelligence 6401.
       doi: 10.1007/978-3-642-16248-0_16
       <https://doi.org/10.1007/978-3-642-16248-0_16>`_
    .. [3] `Lenz OU, Peralta D, Cornelis C (2021).
       Average Localised Proximity: A new data descriptor with good default one-class classification performance.
       Pattern Recognition, vol 118, no 107991.
       doi: 10.1016/j.patcog.2021.107991
       <https://doi.org/10.1016/j.patcog.2021.107991>`_
    """

    def __init__(
            self,
            dissimilarity: str or float or Callable[[np.array], float] or Callable[[np.array, np.array], float] = 'boscovich',
            k: int or Callable[[int], float] or None = 1,
            weights: Callable[[int], np.array] | None = None,
            proximity: Callable[[float], float] = shifted_reciprocal,
            nn_search: NeighbourSearchMethod = KDTree(),
            preprocessors=(IQRNormaliser(), )
    ):
        super().__init__(
            dissimilarity=dissimilarity,
            k=k,
            nn_search=nn_search,
            preprocessors=preprocessors,
        )
        self.proximity = proximity
        self.weights = weights

    def _construct(self, X) -> Model:
        """
        Build a model on the target data `X` and hand it the aggregation
        weights and the distance-to-proximity function.
        """
        model: NND.Model = super()._construct(X)
        model.weights = self.weights
        model.proximity = self.proximity
        return model

    class Model(NNDataDescriptor.Model):
        """
        Fitted NND data descriptor.
        """

        proximity: Callable[[float], float]
        weights: Callable[[int], np.array] | None

        def _query(self, q_neighbours, q_distances):
            """
            Convert the neighbour distances of the query instances to
            proximities and aggregate them with a soft maximum over `k` values.
            """
            return soft_max(self.proximity(q_distances), self.weights, self.k)