Source code for kiez.hubness_reduction.dis_sim

# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from typing import Tuple, TypeVar

import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

# Generic placeholder used in the return annotation of ``DisSimLocal.transform``.
T = TypeVar("T")

# Value that a ``p`` attribute of the nearest-neighbour algorithm is compared
# against in ``DisSimLocal.__init__``; any other value raises a ValueError.
_DESIRED_P_VALUE = 2
# Threshold used in ``DisSimLocal.transform``: if the smallest hubness-reduced
# distance is below this value, all distances are shifted upwards.
_MINIMUM_DIST = 0.0

# torch is an optional dependency: the name is bound to None when the import
# fails, so the module can still be imported without it.
try:
    import torch
except ImportError:
    torch = None


class DisSimLocal(HubnessReduction):
    """Hubness reduction with DisSimLocal.

    Uses the formula presented in [1]_.

    Parameters
    ----------
    squared: bool, default = True
        DisSimLocal operates on squared Euclidean distances.
        If True, return (quasi) squared Euclidean distances;
        if False, return (quasi) Euclidean distances.
        Note that the value is overwritten in ``__init__`` according to the
        metric of the nearest-neighbour algorithm: ``"euclidean"`` and
        ``"minkowski"`` force ``squared=False``, ``"sqeuclidean"`` forces
        ``squared=True``.
    **kwargs
        Forwarded unchanged to ``HubnessReduction.__init__``.

    Raises
    ------
    ValueError
        If the metric of the nearest-neighbour algorithm is not one of
        ``"euclidean"``, ``"minkowski"`` or ``"sqeuclidean"``, or if the
        algorithm exposes a ``p`` attribute that is not 2 while using the
        ``"euclidean"``/``"minkowski"`` metric.

    References
    ----------
    .. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016)
           Flattening the density gradient for eliminating spatial centrality to reduce hubness.
           In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659-1665.
           https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055
    """

    def __init__(self, squared: bool = True, **kwargs):
        super().__init__(**kwargs)
        self.squared = squared
        # ``self.nn_algo`` is not set in this class; presumably the base class
        # constructor provides it -- verify against HubnessReduction.
        metric = self.nn_algo.metric
        if metric in ["euclidean", "minkowski"]:
            # The neighbour search yields plain Euclidean distances, so the
            # transformed distances are returned on the same scale.
            self.squared = False
            if hasattr(self.nn_algo, "p") and self.nn_algo.p != _DESIRED_P_VALUE:
                raise ValueError(
                    "DisSimLocal only supports squared Euclidean distances. If"
                    " the provided NNAlgorithm has a `p` parameter it must be"
                    f" set to p=2. Now it is p={self.nn_algo.p}"
                )
        elif metric in ["sqeuclidean"]:
            self.squared = True
        else:
            raise ValueError(
                "DisSimLocal only supports squared Euclidean distances, not"
                f" metric={self.nn_algo.metric}."
            )

    def __repr__(self):
        return f"{self.__class__.__name__}(squared = {self.squared})"

    def _fit(
        self,
        neigh_dist,
        neigh_ind,
        source,
        target,
    ) -> "DisSimLocal":
        """Fit the model using target, neigh_dist, and neigh_ind as training data.

        Parameters
        ----------
        neigh_dist: shape (n_samples, n_neighbors)
            Distance matrix of training objects (rows) against their
            individual k nearest neighbors (columns).
            Not used by this method.
        neigh_ind: shape (n_samples, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist.
        source: shape (n_samples, n_features)
            source embedding, where n_samples is the number of vectors,
            and n_features their dimensionality (number of features).
        target: shape (n_samples, n_features)
            Target embedding, where n_samples is the number of vectors,
            and n_features their dimensionality (number of features).

        Returns
        -------
        DisSimLocal
            Fitted DisSimLocal
        """
        # Local neighborhood centroid of every target point: the mean of the
        # source vectors selected by its neighbor indices.
        centroids = source[neigh_ind].mean(axis=1)
        offset = target - centroids
        if self._use_torch:
            # Row-wise squared norm, mirroring sklearn's row_norms, see
            # https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/extmath.py#L87C21-L87C48
            dist_to_cent = torch.einsum("ij,ij->i", offset, offset)
        else:
            dist_to_cent = row_norms(offset, squared=True)

        self.source_ = source
        self.target_ = target
        self.target_centroids_ = centroids
        self.target_dist_to_centroids_ = dist_to_cent
        return self

    def transform(
        self,
        neigh_dist,
        neigh_ind,
        query,
    ) -> Tuple[T, T]:
        """Transform distance between test and training data with DisSimLocal.

        Parameters
        ----------
        neigh_dist: shape (n_query, n_neighbors)
            Distance matrix of test objects (rows) against their individual
            k nearest neighbors among the training data (columns).
        neigh_ind: shape (n_query, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist
        query: shape (n_query, n_features)
            Query entities that were used to obtain neighbors.
            If None is passed, the source that was provided in the fit step
            is used instead.

        Returns
        -------
        hub_reduced_dist, neigh_ind
            DisSimLocal distances, and corresponding neighbor indices

        Notes
        -----
        The returned distances are NOT sorted! If you use this class directly,
        you will need to sort the returned matrices according to hub_reduced_dist.
        """
        check_is_fitted(
            self,
            ["target_", "target_centroids_", "target_dist_to_centroids_"],
        )
        if query is None:
            # Documented fallback: reuse the source embedding stored by _fit.
            query = self.source_

        neighbors = self.target_[neigh_ind]
        # Squared Euclidean distance between each query vector and its neighbors
        if self._use_torch:
            # unsqueeze to enable batching: cdist yields (n_query, 1, n_neighbors).
            # Only the helper axis is squeezed; a bare squeeze() would also drop
            # a genuine axis of length one (single query or single neighbor)
            # and break the in-place broadcasting below.
            hub_reduced_dist = (
                torch.cdist(torch.unsqueeze(query, 1), neighbors).pow(2).squeeze(1)
            )
        else:
            hub_reduced_dist = np.empty_like(neigh_dist)
            for i, ind in enumerate(neigh_ind):
                hub_reduced_dist[i, :] = euclidean_distances(
                    query[i].reshape(1, -1), self.target_[ind], squared=True
                )

        # Local neighborhood centroids for the query objects among target objects
        centroids = neighbors.mean(axis=1)
        source_minus_centroids = query - centroids
        source_minus_centroids **= 2
        source_dist_to_centroids = source_minus_centroids.sum(axis=1)
        target_dist_to_centroids = self.target_dist_to_centroids_[neigh_ind]

        hub_reduced_dist -= source_dist_to_centroids.reshape(-1, 1)
        hub_reduced_dist -= target_dist_to_centroids

        # DisSimLocal can yield negative dissimilarities, which can cause problems with
        # certain scikit-learn routines (e.g. in metric='precomputed' usages).
        # We, therefore, shift dissimilarities to non-negative values, if necessary.
        min_dist = hub_reduced_dist.min()
        if min_dist < _MINIMUM_DIST:
            hub_reduced_dist += -min_dist

        # Return Euclidean or squared Euclidean distances?
        if not self.squared:
            hub_reduced_dist **= 1 / 2

        # Return the hubness reduced distances
        # These must be sorted downstream
        return hub_reduced_dist, neigh_ind