# Source code for frlearn.neighbours.regressors

from __future__ import annotations

from typing import Callable

import numpy as np

from frlearn.base import Regressor
from frlearn.feature_preprocessors import RangeNormaliser
from frlearn.neighbours.utilities import resolve_k
from frlearn.parametrisations import at_most
from frlearn.uncategorised.utilities import apply_dissimilarity, resolve_dissimilarity


class FRNN(Regressor):
    """
    Implementation of the Fuzzy Rough Nearest Neighbour (FRNN) regressor [1]_.

    Predicts an output value of a test instance `y` on the basis of the output
    values of the `k` nearest neighbours of `y`, similar to KNN regression.
    The difference is that the output value is calculated as a weighted mean,
    with the weights corresponding to membership degrees in the upper and
    lower approximations of the tolerance sets of the output values of the
    neighbours.

    Parameters
    ----------
    k : int or (int -> float) = at_most(10)
        Number of neighbours to consider.
        Should be either a positive integer,
        or a function that takes the training set size `n` and returns a float.
        All such values are rounded to the nearest integer in `[1, n]`.
        Due to the computational complexity of this algorithm,
        `k` should not be chosen too large.

    dissimilarity : str or float or (np.ndarray -> float) or ((np.ndarray, np.ndarray) -> float) = 'chebyshev'
        The dissimilarity measure to use.
        The similarity between two instances is calculated as 1 minus their
        dissimilarity.
        A vector size measure `np.ndarray -> float` induces a dissimilarity
        measure through application to `y - x`.
        A float is interpreted as Minkowski size with the corresponding value
        for `p`.
        For convenience, a number of popular measures can be referred to by
        name.
        When a float or string is passed, the corresponding dissimilarity
        measure is automatically scaled to ensure that the dissimilarity of
        `[1, 1, ..., 1]` with `[0, 0, ..., 0]` is 1.
        For the default Chebyshev norm, this is already the case, since it
        assigns the maximum of the per-attribute differences, but e.g. the
        Boscovich norm normally amounts to the sum of the per-attribute
        differences.
        In this case, the scaling step divides by the number of dimensions,
        and we obtain a dissimilarity that is the mean of the per-attribute
        differences.
        This can be prevented by explicitly passing a dissimilarity measure
        without scaling.

    preprocessors : iterable = (RangeNormaliser(), )
        Preprocessors to apply.
        The default range normaliser ensures that all features have range 1.

    Notes
    -----
    Although proposed in the same paper [1]_, FRNN regression and FRNN
    classification are different algorithms.

    [1]_ does not recommend any specific value for `k`, but seems to use
    `k = 10` for its experiments.

    References
    ----------
    .. [1] `Jensen R, Cornelis C (2011).
       Fuzzy-rough nearest neighbour classification and prediction.
       Theoretical Computer Science, vol 412, pp 5871–5884.
       doi: 10.1016/j.tcs.2011.05.040
       <https://www.sciencedirect.com/science/article/pii/S0304397511004580>`_
    """

    def __init__(
            self,
            # NOTE: `X | Y` unions are safe here despite old-Python targets,
            # because `from __future__ import annotations` makes all
            # annotations lazy (PEP 563). The previous `X or Y` form was not a
            # valid type expression.
            k: int = at_most(10),
            dissimilarity: str | float | Callable[[np.ndarray], float] | Callable[[np.ndarray, np.ndarray], float] = 'chebyshev',
            preprocessors=(RangeNormaliser(), )
    ):
        super().__init__(preprocessors=preprocessors)
        self.k = k
        # Resolve names/floats to a callable measure, scaled so that the
        # dissimilarity of [1, ..., 1] with [0, ..., 0] is 1 (see docstring).
        self.dissimilarity = resolve_dissimilarity(dissimilarity, scale_by_dimensionality=True)

    def _construct(self, X, y) -> Model:
        """Fit on training data `X` (instances) and `y` (output values)."""
        model: FRNN.Model = super()._construct(X, y)
        # Round/clamp `k` to an integer in [1, n] (may be a function of n).
        model.k = resolve_k(self.k, model.n)
        model.dissimilarity = self.dissimilarity
        model.X = X
        # Range of the training outputs, used to normalise output-value
        # differences into [0, 1] similarities at query time.
        model.y_range = np.max(y) - np.min(y)
        model.y = y
        return model

    class Model(Regressor.Model):

        k: int
        dissimilarity: Callable[[np.ndarray], float] | Callable[[np.ndarray, np.ndarray], float]
        X: np.ndarray
        y: np.ndarray
        y_range: float

        def _query(self, X):
            """Predict output values for the query instances in `X`."""
            # Pairwise dissimilarities between query and training instances.
            distances = apply_dissimilarity(X, self.X, self.dissimilarity)
            # Indices of the k nearest training instances per query
            # (argpartition: O(n) selection, neighbours not sorted among
            # themselves — order is irrelevant for the weighted mean below).
            neighbour_indices = np.argpartition(distances, kth=self.k - 1, axis=-1)[:, :self.k]
            neighbour_vals = self.y[neighbour_indices]
            # Similarity between each neighbour's output value and every
            # training output value, normalised by the training output range.
            neighbour_vals_sims = 1 - np.abs((neighbour_vals/self.y_range)[..., None] - self.y / self.y_range)
            # Membership of each query in the lower/upper approximation of
            # each neighbour's output tolerance set (min-max / max-min over
            # all training instances, with distance = 1 - similarity).
            lower_approx_vals = np.min(np.maximum(neighbour_vals_sims, distances[:, None, :]), axis=-1)
            upper_approx_vals = np.max(np.minimum(neighbour_vals_sims, 1 - distances[:, None, :]), axis=-1)
            combined_vals = (lower_approx_vals + upper_approx_vals)/2
            # Weighted mean of neighbour output values, weights being the
            # combined approximation memberships.
            return np.sum(combined_vals*neighbour_vals, axis=-1)/np.sum(combined_vals, axis=-1)