# Source code for frlearn.uncategorised.feature_preprocessors

"""Uncategorised preprocessors"""
from __future__ import annotations

from typing import Callable

import numpy as np

from frlearn.array_functions import div_or
from frlearn.base import FeaturePreprocessor, Unsupervised
from frlearn.uncategorised.utilities import resolve_dissimilarity


class VectorSizeNormaliser(Unsupervised, FeaturePreprocessor):
    """
    Rescales each instance (seen as a vector) to a fixed size.
    Typically used on datasets of frequency counts,
    when only the relative frequencies are considered important,
    e.g. token counts of texts in NLP.

    Parameters
    ----------
    measure: str or float or (np.ndarray -> float) = 'boscovich'
        The vector size measure to use. Must be positively homogeneous.
        A float is interpreted as Minkowski size with the corresponding value for `p`.
        For convenience, a number of popular measures can be referred to by name.

    target_size: float = 0.5
        The size that all vectors will be rescaled to.
        The default, 0.5, ensures that for Minkowski sizes,
        the maximum distance in the resulting dataset is 1.
        A more typical choice is to set this value to 1,
        so that all instances end up on the unit hypersphere.

    Notes
    -----
    If the size of an instance is 0, it will be left unscaled.
    If the size of an instance is ∞, it will be scaled to 0.

    """
    # TODO: this doesn't need to be a ModelFactory

    def __init__(
            self,
            measure: str | float | Callable[[np.ndarray], float] = 'boscovich',
            target_size: float = 0.5,
    ):
        """
        Store the resolved size measure and the target size.

        `measure` is passed through `resolve_dissimilarity`, and the result
        (not the raw argument) is what is kept on the instance.
        """
        super().__init__()
        # TODO: resolve vector size measures separately
        self.measure = resolve_dissimilarity(measure)
        self.target_size = target_size

    def _construct(self, X) -> Model:
        """
        Build the model through the parent implementation, then copy the
        configured measure and target size onto it.

        `X` is forwarded unchanged to `super()._construct`; it is not
        otherwise inspected here.
        """
        model = super()._construct(X)
        model.measure = self.measure
        model.target_size = self.target_size
        return model

    class Model(Unsupervised.Model, FeaturePreprocessor.Model):
        """Model that rescales query instances using `measure` and `target_size`."""

        # The resolved measure is called on the whole query array and its
        # result is subscripted with `[:, None]` in `_query`, so it has to
        # return an array rather than a scalar.
        measure: Callable[[np.ndarray], np.ndarray]
        # Size that every instance is rescaled to.
        target_size: float

        def _query(self, X):
            """
            Return `X` with every instance rescaled to `target_size`.

            The sizes returned by `measure` are turned into a column so that
            they line up with the instances of `X` for the division.
            """
            # NOTE(review): presumably `div_or` returns its third argument
            # where the division is undefined (size 0), matching the class
            # Notes -- confirm against frlearn.array_functions.
            sizes = self.measure(X)[:, None]
            return self.target_size * div_or(X, sizes, X)