Source code for kiez.hubness_reduction.csls
from typing import Tuple, TypeVar
from sklearn.utils.validation import check_is_fitted
from .base import HubnessReduction
# Placeholder for the array type returned by ``CSLS.transform``; both members
# of its ``Tuple[T, T]`` return annotation share this type variable.
T = TypeVar("T")
class CSLS(HubnessReduction):
    """Cross-domain similarity local scaling (CSLS) for hubness reduction.

    Every query-to-neighbor distance is rescaled with the mean k-nearest-
    neighbor distance of the query object and of the neighbor it points to,
    following the formula presented in [1]_.

    References
    ----------
    .. [1] Lample, G., Conneau, A., Ranzato, M., Denoyer, L., & Jégou, H. (2018)
           Word translation without parallel data
           In: 6th International Conference on Learning Representations,
           ICLR 2018 - Conference Track Proceedings.
           https://openreview.net/forum?id=H196sainb
    """

    def __repr__(self):
        class_name = self.__class__.__name__
        return f"{class_name}(verbose = {self.verbose})"

    def _fit(
        self,
        neigh_dist,
        neigh_ind,
        source=None,
        target=None,
    ) -> "CSLS":
        """Remember the k-nearest-neighbor graph of the training data.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
            Distances of each training object (rows) to its individual
            k nearest neighbors (columns).
        neigh_ind: np.ndarray, shape (n_samples, n_neighbors)
            Neighbor indices belonging to the entries of neigh_dist.
        source
            Ignored.
        target
            Ignored.

        Returns
        -------
        CSLS
            This instance, with ``r_dist_train_`` and ``r_ind_train_`` set.
        """
        # Stored untouched; the per-object means are only computed in transform.
        self.r_ind_train_ = neigh_ind
        self.r_dist_train_ = neigh_dist
        return self

    def transform(
        self,
        neigh_dist,
        neigh_ind,
        query,
    ) -> Tuple[T, T]:
        """Apply CSLS to the distances between query and training objects.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_query, n_neighbors)
            Distances of each query object (rows) to its individual
            k nearest neighbors among the training data (columns).
        neigh_ind: np.ndarray, shape (n_query, n_neighbors)
            Neighbor indices belonging to the entries of neigh_dist.
        query
            Ignored.

        Returns
        -------
        hub_reduced_dist, neigh_ind
            The CSLS distances together with the unchanged neighbor indices.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            Raised by ``check_is_fitted`` when ``r_dist_train_`` has not
            been set by a previous fit.

        Notes
        -----
        The result is NOT sorted. When this class is used directly, callers
        have to reorder both returned matrices by hub_reduced_dist themselves.
        """
        check_is_fitted(self, "r_dist_train_")

        # Mean k-NN distance of every training object; indexed with neigh_ind
        # below to pick the value belonging to each returned neighbor.
        train_mean = self.r_dist_train_.mean(axis=1)
        # Mean k-NN distance of every query object, kept as a column so that
        # it is subtracted from each entry of the matching row.
        query_mean = neigh_dist.mean(axis=1).reshape(-1, 1)

        doubled = 2 * neigh_dist
        hub_reduced = doubled - query_mean - train_mean[neigh_ind]

        # Ordering is left to the caller (see Notes).
        return hub_reduced, neigh_ind
