# Source code for frlearn.neighbours.regressors

from __future__ import annotations

from typing import Callable

import numpy as np

from frlearn.base import Regressor
from frlearn.feature_preprocessors import RangeNormaliser
from frlearn.neighbours.utilities import resolve_k
from frlearn.parametrisations import at_most
from frlearn.uncategorised.utilities import apply_dissimilarity, resolve_dissimilarity


class FRNN(Regressor):
    """
    Implementation of the Fuzzy Rough Nearest Neighbour (FRNN) regressor [1]_.

    Predicts an output value of a test instance `y` on the basis of the output
    values of the `k` nearest neighbours of `y`, similar to KNN regression.
    The difference is that the output value is calculated as a weighted mean,
    with the weights corresponding to membership degrees in the upper and
    lower approximations of the tolerance sets of the output values of the
    neighbours.

    Parameters
    ----------
    k : int or (int -> float) = at_most(10)
        Number of neighbours to consider.
        Should be either a positive integer,
        or a function that takes the training set size `n` and returns a float.
        All such values are rounded to the nearest integer in `[1, n]`.
        Due to the computational complexity of this algorithm,
        `k` should not be chosen too large.

    dissimilarity : str or float or (np.ndarray -> float) or ((np.ndarray, np.ndarray) -> float) = 'chebyshev'
        The dissimilarity measure to use.
        The similarity between two instances is calculated as 1 minus their
        dissimilarity.
        A vector size measure `np.ndarray -> float` induces a dissimilarity
        measure through application to `y - x`.
        A float is interpreted as Minkowski size with the corresponding value
        for `p`.
        For convenience, a number of popular measures can be referred to by
        name.
        When a float or string is passed, the corresponding dissimilarity
        measure is automatically scaled to ensure that the dissimilarity of
        `[1, 1, ..., 1]` with `[0, 0, ..., 0]` is 1.
        For the default Chebyshev norm, this is already the case, since it
        assigns the maximum of the per-attribute differences, but e.g. the
        Boscovich norm normally amounts to the sum of the per-attribute
        differences.
        In this case, the scaling step divides by the number of dimensions,
        and we obtain a dissimilarity that is the mean of the per-attribute
        differences.
        This can be prevented by explicitly passing a dissimilarity measure
        without scaling.

    preprocessors : iterable = (RangeNormaliser(), )
        Preprocessors to apply.
        The default range normaliser ensures that all features have range 1.

    Notes
    -----
    Although proposed in the same paper [1]_, FRNN regression and FRNN
    classification are different algorithms.

    [1]_ does not recommend any specific value for `k`, but seems to use
    `k = 10` for its experiments.

    References
    ----------
    .. [1] `Jensen R, Cornelis C (2011).
       Fuzzy-rough nearest neighbour classification and prediction.
       Theoretical Computer Science, vol 412, pp 5871–5884.
       doi: 10.1016/j.tcs.2011.05.040
       <https://www.sciencedirect.com/science/article/pii/S0304397511004580>`_
    """

    def __init__(
            self,
            # NOTE: `X | Y` unions are safe here despite old-Python targets,
            # because `from __future__ import annotations` makes all
            # annotations lazy (PEP 563). The previous `X or Y` form was not a
            # valid type expression.
            k: int = at_most(10),
            dissimilarity: str | float | Callable[[np.ndarray], float] | Callable[[np.ndarray, np.ndarray], float] = 'chebyshev',
            preprocessors=(RangeNormaliser(), )
    ):
        super().__init__(preprocessors=preprocessors)
        self.k = k
        # Resolve names/floats to a callable measure, scaled so that the
        # dissimilarity of [1, ..., 1] with [0, ..., 0] is 1 (see docstring).
        self.dissimilarity = resolve_dissimilarity(dissimilarity, scale_by_dimensionality=True)

    def _construct(self, X, y) -> Model:
        """Fit on training data `X` (instances) and `y` (output values)."""
        model: FRNN.Model = super()._construct(X, y)
        # Round/clamp `k` to an integer in [1, n] (may be a function of n).
        model.k = resolve_k(self.k, model.n)
        model.dissimilarity = self.dissimilarity
        model.X = X
        # Range of the training outputs, used to normalise output-value
        # differences into [0, 1] similarities at query time.
        model.y_range = np.max(y) - np.min(y)
        model.y = y
        return model

    class Model(Regressor.Model):

        k: int
        dissimilarity: Callable[[np.ndarray], float] | Callable[[np.ndarray, np.ndarray], float]
        X: np.ndarray
        y: np.ndarray
        y_range: float

        def _query(self, X):
            """Predict output values for the query instances in `X`."""
            # Pairwise dissimilarities between query and training instances.
            distances = apply_dissimilarity(X, self.X, self.dissimilarity)
            # Indices of the k nearest training instances per query
            # (argpartition: O(n) selection, neighbours not sorted among
            # themselves — order is irrelevant for the weighted mean below).
            neighbour_indices = np.argpartition(distances, kth=self.k - 1, axis=-1)[:, :self.k]
            neighbour_vals = self.y[neighbour_indices]
            # Similarity between each neighbour's output value and every
            # training output value, normalised by the training output range.
            neighbour_vals_sims = 1 - np.abs((neighbour_vals/self.y_range)[..., None] - self.y / self.y_range)
            # Membership of each query in the lower/upper approximation of
            # each neighbour's output tolerance set (min-max / max-min over
            # all training instances, with distance = 1 - similarity).
            lower_approx_vals = np.min(np.maximum(neighbour_vals_sims, distances[:, None, :]), axis=-1)
            upper_approx_vals = np.max(np.minimum(neighbour_vals_sims, 1 - distances[:, None, :]), axis=-1)
            combined_vals = (lower_approx_vals + upper_approx_vals)/2
            # Weighted mean of neighbour output values, weights being the
            # combined approximation memberships.
            return np.sum(combined_vals*neighbour_vals, axis=-1)/np.sum(combined_vals, axis=-1)