# Source code for frlearn.uncategorised.feature_preprocessors

"""Uncategorised preprocessors"""
from __future__ import annotations

from typing import Callable

import numpy as np

from frlearn.array_functions import div_or
from frlearn.base import FeaturePreprocessor, Unsupervised
from frlearn.uncategorised.utilities import resolve_dissimilarity


class VectorSizeNormaliser(Unsupervised, FeaturePreprocessor):
    """
    Rescales each instance (seen as a vector) to a fixed size.

    Typically used on datasets of frequency counts,
    when only the relative frequencies are considered important,
    e.g. token counts of texts in NLP.

    Parameters
    ----------
    measure: str or float or (np.ndarray -> float) = 'boscovich'
        The vector size measure to use. Must be positively homogeneous.
        A float is interpreted as Minkowski size
        with the corresponding value for `p`.
        For convenience, a number of popular measures can be referred to by name.

    target_size: float = 0.5
        The size that all vectors will be rescaled to.
        The default, 0.5, ensures that for Minkowski sizes,
        the maximum distance in the resulting dataset is 1.
        A more typical choice is to set this value to 1,
        so that all instances end up on the unit hypersphere.

    Notes
    -----
    If the size of an instance is 0, it will be left unscaled.
    If the size of an instance is ∞, it will be scaled to 0.
    """

    # TODO: this doesn't need to be a ModelFactory

    def __init__(
            self,
            measure: str | float | Callable[[np.ndarray], float] = 'boscovich',
            target_size: float = 0.5,
    ):
        super().__init__()
        # TODO: resolve vector size measures separately
        # The user-supplied specification (name, float or callable) is turned
        # into a callable once, here, so the model only ever sees a callable.
        self.measure = resolve_dissimilarity(measure)
        self.target_size = target_size

    def _construct(self, X) -> Model:
        """
        Build the model via the parent classes and copy the settings onto it.

        Only `measure` and `target_size` are transferred here; whatever use is
        made of `X` happens in `super()._construct`.
        """
        model = super()._construct(X)
        model.measure = self.measure
        model.target_size = self.target_size
        return model

    class Model(Unsupervised.Model, FeaturePreprocessor.Model):
        """
        Carries the resolved size measure and the target size, and applies
        the rescaling to queried instances.
        """

        # Resolved size measure, as assigned in `_construct`.
        measure: Callable[[np.ndarray], float]
        # Size that every instance is rescaled to.
        target_size: float

        def _query(self, X):
            """
            Return `X` with each row divided by its size and multiplied
            by `target_size`.
            """
            # `measure(X)` is indexed with `[:, None]`, i.e. it is expected to
            # produce one size per row; the added axis turns it into a column
            # so the division broadcasts across each row of `X`.
            sizes = self.measure(X)[:, None]
            # `X` is passed as the third argument of `div_or`, presumably the
            # fallback where the division is undefined -- confirm against
            # `frlearn.array_functions.div_or`.
            # NOTE(review): fallback rows are still multiplied by `target_size`
            # below, which equals "left unscaled" only when such rows are
            # all-zero vectors -- confirm this is intended.
            return self.target_size * div_or(X, sizes, X)