Source code for kiez.hubness_reduction.dis_sim

# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from typing import Tuple, TypeVar

import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")

_DESIRED_P_VALUE = 2
_MINIMUM_DIST = 0.0

try:
    import torch
except ImportError:
    torch = None


class DisSimLocal(HubnessReduction):
    """Hubness reduction with DisSimLocal.

    Uses the formula presented in [1]_.

    Parameters
    ----------
    squared: bool, default = True
        DisSimLocal operates on squared Euclidean distances.
        If True, return (quasi) squared Euclidean distances;
        if False, return (quasi) Euclidean distances.
        Note that the constructor overwrites this value based on the metric
        of the neighbor algorithm: ``"euclidean"``/``"minkowski"`` force
        ``squared=False`` and ``"sqeuclidean"`` forces ``squared=True``.
    **kwargs
        Forwarded unchanged to ``HubnessReduction.__init__``.

    Raises
    ------
    ValueError
        If the neighbor algorithm's metric is not one of ``"euclidean"``,
        ``"minkowski"`` or ``"sqeuclidean"``, or if the metric is
        ``"euclidean"``/``"minkowski"`` and the algorithm exposes a ``p``
        attribute different from 2.

    References
    ----------
    .. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016)
           Flattening the density gradient for eliminating spatial centrality
           to reduce hubness. In: Proceedings of the 30th AAAI conference on
           artificial intelligence, pp 1659-1665.
           https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055
    """

    def __init__(self, squared: bool = True, **kwargs):
        super().__init__(**kwargs)
        # NOTE(review): `self.nn_algo` is presumably set by the base-class
        # constructor from **kwargs -- confirm against HubnessReduction.
        self.squared = squared
        metric = self.nn_algo.metric
        if metric in ["euclidean", "minkowski"]:
            # The neighbor algorithm returns plain Euclidean distances, so the
            # transformed distances are returned un-squared as well.
            self.squared = False
            if hasattr(self.nn_algo, "p") and self.nn_algo.p != _DESIRED_P_VALUE:
                raise ValueError(
                    "DisSimLocal only supports squared Euclidean distances. If"
                    " the provided NNAlgorithm has a `p` parameter it must be"
                    f" set to p=2. Now it is p={self.nn_algo.p}"
                )
        elif metric in ["sqeuclidean"]:
            self.squared = True
        else:
            raise ValueError(
                "DisSimLocal only supports squared Euclidean distances, not"
                f" metric={self.nn_algo.metric}."
            )

    def __repr__(self):
        return f"{self.__class__.__name__}(squared = {self.squared})"

    def _fit(
        self,
        neigh_dist,
        neigh_ind,
        source,
        target,
    ) -> "DisSimLocal":
        """Fit the model using target, neigh_dist, and neigh_ind as training data.

        Parameters
        ----------
        neigh_dist: shape (n_samples, n_neighbors)
            Distance matrix of training objects (rows) against their
            individual k nearest neighbors (columns).
            Not used by this method; accepted for interface compatibility.
        neigh_ind: shape (n_samples, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist.
        source: shape (n_samples, n_features)
            source embedding, where n_samples is the number of vectors,
            and n_features their dimensionality (number of features).
        target: shape (n_samples, n_features)
            Target embedding, where n_samples is the number of vectors,
            and n_features their dimensionality (number of features).

        Returns
        -------
        DisSimLocal
            Fitted DisSimLocal
        """
        # Local neighborhood centroid of every target point: the mean of the
        # source vectors selected by its neighbor indices.
        centroids = source[neigh_ind].mean(axis=1)
        offset = target - centroids
        if self._use_torch:
            # Row-wise squared norm, mirroring sklearn's row_norms, see
            # https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/extmath.py#L87C21-L87C48
            dist_to_cent = torch.einsum("ij,ij->i", offset, offset)
        else:
            dist_to_cent = row_norms(offset, squared=True)

        self.source_ = source
        self.target_ = target
        self.target_centroids_ = centroids
        self.target_dist_to_centroids_ = dist_to_cent
        return self

    def transform(
        self,
        neigh_dist,
        neigh_ind,
        query,
    ) -> Tuple[T, T]:
        """Transform distance between test and training data with DisSimLocal.

        Parameters
        ----------
        neigh_dist: shape (n_query, n_neighbors)
            Distance matrix of test objects (rows) against their individual
            k nearest neighbors among the training data (columns).
            Only used as a template (shape/dtype) for the output in the
            non-torch code path.
        neigh_ind: shape (n_query, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist
        query: shape (n_query, n_features)
            Query entities that were used to obtain neighbors
            If None is provided use source that was provided in fit step

        Returns
        -------
        hub_reduced_dist, neigh_ind
            DisSimLocal distances, and corresponding neighbor indices

        Notes
        -----
        The returned distances are NOT sorted! If you use this class directly,
        you will need to sort the returned matrices according to
        hub_reduced_dist.
        """
        check_is_fitted(
            self,
            ["target_", "target_centroids_", "target_dist_to_centroids_"],
        )

        # Honor the documented fallback: without an explicit query, reuse the
        # source embedding stored during fit.
        if query is None:
            query = self.source_

        # Gather the neighbor vectors once, shape (n_query, n_neighbors,
        # n_features); they are needed for the distances and the centroids.
        neighbors = self.target_[neigh_ind]

        if self._use_torch:
            # Pairwise squared Euclidean distance between each query vector
            # and its neighbors; unsqueeze to enable batching.
            # Squeeze ONLY the batching axis: a bare ``squeeze()`` would also
            # drop the query/neighbor axes when n_query == 1 or
            # n_neighbors == 1 and break the broadcasting below.
            hub_reduced_dist = (
                torch.cdist(torch.unsqueeze(query, 1), neighbors).pow(2).squeeze(1)
            )
        else:
            hub_reduced_dist = np.empty_like(neigh_dist)
            for i, neighbor_vectors in enumerate(neighbors):
                hub_reduced_dist[i, :] = euclidean_distances(
                    query[i].reshape(1, -1), neighbor_vectors, squared=True
                )

        # Local neighborhood centroids for the query objects among the
        # target objects, and squared distance of each query to its centroid.
        centroids = neighbors.mean(axis=1)
        source_minus_centroids = query - centroids
        source_minus_centroids **= 2
        source_dist_to_centroids = source_minus_centroids.sum(axis=1)
        target_dist_to_centroids = self.target_dist_to_centroids_[neigh_ind]

        hub_reduced_dist -= source_dist_to_centroids.reshape(-1, 1)
        hub_reduced_dist -= target_dist_to_centroids

        # DisSimLocal can yield negative dissimilarities, which can cause
        # problems with certain scikit-learn routines (e.g. in
        # metric='precomputed' usages). We, therefore, shift dissimilarities
        # to non-negative values, if necessary.
        min_dist = hub_reduced_dist.min()
        if min_dist < _MINIMUM_DIST:
            hub_reduced_dist += -min_dist

        # Return Euclidean or squared Euclidean distances?
        if not self.squared:
            hub_reduced_dist **= 1 / 2

        # Return the hubness reduced distances
        # These must be sorted downstream
        return hub_reduced_dist, neigh_ind