# Source code for frlearn.uncategorised.vector_size_measures

from dataclasses import dataclass

import numpy as np


@dataclass(frozen=True)
class MinkowskiSize:
    """
    Family of vector size measures of the form
    `(|x1|**p + |x2|**p + ... + |xm|**p)**(1/p)` (if `unrooted = False`), or
    `(|x1|**p + |x2|**p + ... + |xm|**p)` (if `unrooted = True`),
    for `0 < p < ∞`, and their limits in 0 and ∞.

    For `p = 0`, the rooted variant evaluates to ∞ if there is more than one non-zero coefficient,
    to 0 if all coefficients are zero, and to the absolute value of the only non-zero coefficient otherwise.
    The unrooted variant is equal to the number of non-zero coefficients.

    For `p = ∞`, the rooted variant is the maximum of the absolute values of all coefficients.
    The unrooted variant evaluates to ∞ if there is at least one coefficient with absolute value larger than 1,
    and to the number of coefficients with absolute value equal to 1 otherwise.

    Parameters
    ----------
    p: float
        Exponent to use. Must be in `[0, ∞]`.

    unrooted: bool = False
        Whether to omit the root `**(1/p)` from the formula.
        For `p = 0`, this gives Hamming size.
        For `p = 2`, this gives squared Euclidean size.

    scale_by_dimensionality: bool = False
        If `True`, values are scaled linearly such that the vector `[1, 1, ..., 1]` has size 1.
        This can be used to ensure that the range of dissimilarity values in the unit hypercube is `[0, 1]`,
        which can be useful when working with features scaled to `[0, 1]`.
        No scaling is applied when `p = ∞`.

    Raises
    ------
    ValueError
        On construction, if `p` is negative.

    Notes
    -----
    The most used parameter combinations have their own name.

    * Hamming size is unrooted `p = 0`.
    * The Boscovich norm is `p = 1`. Also known as cityblock, Manhattan or Taxicab norm.
    * The Euclidean norm is rooted `p = 2`. Also known as Pythagorean norm.
    * Squared Euclidean size is unrooted `p = 2`.
    * The Chebyshev norm is rooted `p = ∞`. Also known as chessboard or maximum norm.
    """

    p: float
    unrooted: bool = False
    scale_by_dimensionality: bool = False

    def __post_init__(self):
        if self.p < 0:
            raise ValueError('`p` must be in `[0, ∞]`')

    def __call__(self, u, axis=-1):
        """
        Calculate the size of `u` along `axis`.

        Parameters
        ----------
        u: array
            Values to measure. Must expose `shape` when `scale_by_dimensionality` is set.

        axis: int = -1
            Axis of `u` that is reduced.

        Returns
        -------
        Size of `u` along `axis`, with that axis removed.
        """
        # Sizes depend only on the magnitude of each coefficient, never on its sign.
        magnitudes = np.abs(u)
        if self.p == 0:
            nonzero_count = np.count_nonzero(u, axis=axis)
            if self.unrooted:
                result = nonzero_count
            else:
                # With at most one non-zero coefficient the sum of magnitudes is that coefficient (or 0).
                result = np.where(nonzero_count <= 1, np.sum(magnitudes, axis=axis), np.inf)
        elif self.p == 1:
            # The root is the identity for p = 1, so rooted and unrooted coincide.
            result = np.sum(magnitudes, axis=axis)
        elif self.p == np.inf:
            if self.unrooted:
                # Limit of |x|**p: 0 below 1, 1 at exactly 1, ∞ above 1.
                result = np.sum(np.where(magnitudes < 1, 0, np.where(magnitudes > 1, np.inf, 1)), axis=axis)
            else:
                # Maximum of the magnitudes, so that negative coefficients count like in every other branch.
                result = np.max(magnitudes, axis=axis)
        else:
            result = np.sum(magnitudes ** self.p, axis=axis)
            if not self.unrooted:
                result = result**(1/self.p)
        if self.scale_by_dimensionality and self.p < np.inf:
            if self.unrooted:
                result = result / u.shape[axis]
            else:
                # NOTE(review): for rooted `p = 0` this evaluates `1/self.p` and raises ZeroDivisionError;
                # the intended scaling for that combination is not defined here and is left unchanged.
                result = result / (u.shape[axis]**(1/self.p))
        return result