# Source code for kiez.hubness_reduction.local_scaling
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/
from typing import Tuple, TypeVar
import numpy as np
from sklearn.utils.validation import check_is_fitted
from .base import HubnessReduction
T = TypeVar("T")
# torch is an optional dependency: bind the name to None when it is not
# installed so that this module stays importable without it.
try:
    import torch
except ImportError:
    torch = None
class LocalScaling(HubnessReduction):
    """Hubness reduction with Local Scaling.

    Rescales neighbor distances with the formulas presented in [1]_, using
    per-point scale statistics derived from the neighbor distance matrices.

    Parameters
    ----------
    method: 'standard' or 'nicdm', default = 'standard'
        Perform local scaling with the specified variant (case-insensitive):

        - 'standard' or 'ls' rescale distances using the distance to the
          k-th neighbor (the last column of the neighbor distances)
        - 'nicdm' rescales distances using the mean over the distances to
          the k neighbors
    **kwargs
        Passed on unchanged to the ``HubnessReduction`` constructor.
        NOTE(review): earlier documentation listed ``k`` (int, default 5,
        number of neighbors to consider for the rescaling) and ``verbose``
        (int, default 0, show progress bar if > 0) as parameters. This class
        only reads ``self.verbose`` (in ``__repr__``) and takes the
        neighborhood size from the width of the distance matrices it is
        given -- confirm against the base class which of these it accepts.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871-2902.
    """

    def __init__(self, method: str = "standard", **kwargs):
        """Initialize the reducer and validate the requested variant.

        Parameters
        ----------
        method: str, default = 'standard'
            One of 'ls', 'standard' or 'nicdm'; lower-cased before it is stored.
        **kwargs
            Forwarded to the base class constructor.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported variants.
        """
        super().__init__(**kwargs)
        self.method = method.lower()
        if self.method not in ["ls", "standard", "nicdm"]:
            raise self._invalid_method_error()

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(method = {self.method}, verbose ="
            f" {self.verbose})"
        )

    def _invalid_method_error(self) -> ValueError:
        """Build the error raised whenever ``self.method`` is unsupported."""
        return ValueError(
            f"Internal: Invalid method {self.method}. Try 'ls' or 'nicdm'."
        )

    def _fit(
        self,
        neigh_dist,
        neigh_ind,
        source,
        target,
    ) -> "LocalScaling":
        """Fit the model using neigh_dist and neigh_ind as training data.

        The arrays are stored as given; all rescaling happens in
        :meth:`transform`.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
            Distance matrix of training objects (rows) against their
            individual k nearest neighbors (columns).
        neigh_ind: np.ndarray, shape (n_samples, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist.
        source
            Ignored
        target
            Ignored

        Returns
        -------
        LocalScaling
            Fitted LocalScaling
        """
        self.r_dist_t_to_s_ = neigh_dist
        self.r_ind_t_to_s_ = neigh_ind
        return self

    def _exp(self, inner_exp):
        """Element-wise exponential using torch or numpy, per ``self._use_torch``."""
        # ``_use_torch`` is not set in this class; presumably the base class
        # provides it -- verify against HubnessReduction.
        if self._use_torch:
            return torch.exp(inner_exp)
        return np.exp(inner_exp)

    def _sqrt(self, value):
        """Element-wise square root using torch or numpy, per ``self._use_torch``."""
        if self._use_torch:
            return torch.sqrt(value)
        return np.sqrt(value)

    def transform(
        self,
        neigh_dist,
        neigh_ind,
        query=None,
    ) -> Tuple[T, T]:
        """Transform distance between test and training data with Local Scaling.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_query, n_neighbors)
            Distance matrix of test objects (rows) against their individual
            k nearest neighbors among the training data (columns).
        neigh_ind: np.ndarray, shape (n_query, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist
        query
            Ignored

        Returns
        -------
        hub_reduced_dist, neigh_ind
            Local scaling distances, and corresponding neighbor indices

        Raises
        ------
        ValueError
            If wrong self.method was supplied

        Notes
        -----
        The returned distances are NOT sorted! If you use this class directly,
        you will need to sort the returned matrices according to hub_reduced_dist.
        """
        check_is_fitted(self, "r_dist_t_to_s_")

        # Find distances to the k-th neighbor (standard LS) or the k neighbors (NICDM)
        r_dist_s_to_t = neigh_dist

        # Perform standard local scaling...
        if self.method in ["ls", "standard"]:
            # Scale of every fitted point: distance to its last (k-th) neighbor.
            r_t_to_s = self.r_dist_t_to_s_[:, -1]
            # Scale of every query point, as a column so that it broadcasts
            # across that query's neighbors.
            r_s_to_t = r_dist_s_to_t[:, -1].reshape(-1, 1)
            # r_t_to_s[neigh_ind] looks up the fitted scale of each neighbor.
            inner_exp = -1 * neigh_dist**2 / (r_s_to_t * r_t_to_s[neigh_ind])
            exp = self._exp(inner_exp)
            hub_reduced_dist = 1.0 - exp
        # ...or use non-iterative contextual dissimilarity measure
        elif self.method == "nicdm":
            # Same layout as above, with the mean neighbor distance as scale.
            r_t_to_s = self.r_dist_t_to_s_.mean(axis=1)
            r_s_to_t = r_dist_s_to_t.mean(axis=1).reshape(-1, 1)
            inner_sqrt = r_s_to_t * r_t_to_s[neigh_ind]
            sqrt = self._sqrt(inner_sqrt)
            hub_reduced_dist = neigh_dist / sqrt
        else:
            # ``method`` is a plain attribute and may have been changed after
            # construction; fail with the documented error instead of hitting
            # an unbound local variable below.
            raise self._invalid_method_error()

        # Return the hubness reduced distances
        # These must be sorted downstream
        return hub_reduced_dist, neigh_ind
