import json
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, TypeVar, Union, overload
import numpy as np
from class_resolver import HintOrType
from kiez.hubness_reduction import (
hubness_reduction_resolver,
)
from kiez.hubness_reduction.base import HubnessReduction
from kiez.neighbors import NNAlgorithm, nn_algorithm_resolver
from kiez.neighbors.util import available_nn_algorithms
# Generic placeholder for the embedding container type: it annotates `source`
# and `target` in `Kiez.fit` and the values returned by `Kiez.kneighbors`.
T = TypeVar("T")
class Kiez:
    """Performs hubness reduced nearest neighbor search for entity alignment.

    Use the given algorithm to :meth:`fit` the data and calculate the
    :meth:`kneighbors`.

    Parameters
    ----------
    n_candidates : int, default=10
        number of nearest neighbors used for candidate search
    algorithm : :obj:`~kiez.neighbors.NNAlgorithm`, default = None
        initialised `NNAlgorithm` object that will be used for neighbor search
        If no algorithm is provided :obj:`~kiez.neighbors.Faiss` is used if available else
        :obj:`~kiez.neighbors.SklearnNN` is used with default values
    algorithm_kwargs :
        A dictionary of keyword arguments to pass to the :obj:`~kiez.neighbors.NNAlgorithm`
        if given as a class in the ``algorithm`` argument.
        ``n_candidates`` is added unless the dictionary already contains that key.
        The passed dictionary itself is never modified.
    hubness :
        Either an instance of a :obj:`~kiez.hubness_reduction.base.HubnessReduction`,
        the class for a :obj:`~kiez.hubness_reduction.base.HubnessReduction` that should
        be instantiated, the name of the hubness reduction method, or if None, defaults
        to no hubness reduction.
    hubness_kwargs :
        A dictionary of keyword arguments to pass to the :obj:`~kiez.hubness_reduction.base.HubnessReduction`
        if given as a class in the ``hubness`` argument.
        The resolved algorithm is always added under the ``nn_algo`` key.
        The passed dictionary itself is never modified.

    Raises
    ------
    TypeError
        If ``n_candidates`` is not an integer.
    ValueError
        If ``n_candidates`` is not greater than zero.

    Examples
    --------
    >>> from kiez import Kiez
    >>> import numpy as np
    >>> # create example data
    >>> rng = np.random.RandomState(0)
    >>> source = rng.rand(100,50)
    >>> target = rng.rand(100,50)
    >>> # fit and get neighbors
    >>> k_inst = Kiez()
    >>> k_inst.fit(source, target)
    >>> nn_dist, nn_ind = k_inst.kneighbors(5)

    Using a specific algorithm and hubness reduction

    >>> from kiez import Kiez
    >>> import numpy as np
    >>> # create example data
    >>> rng = np.random.RandomState(0)
    >>> source = rng.rand(100,50)
    >>> target = rng.rand(100,50)
    >>> # prepare algorithm and hubness reduction
    >>> k_inst = Kiez(n_candidates=10, algorithm="Faiss", hubness="CSLS")
    >>> # fit and get neighbors
    >>> k_inst.fit(source, target)
    >>> nn_dist, nn_ind = k_inst.kneighbors(5)

    You can investigate which NN algos are installed and which hubness methods are implemented with:

    >>> Kiez.show_hubness_options()
    >>> Kiez.show_algorithm_options()

    Beginning with version 0.5.0 torch can be used, when using `Faiss` as NN algorithm:

    >>> from kiez import Kiez
    >>> import torch
    >>> source = torch.randn((100,10))
    >>> target = torch.randn((200,10))
    >>> k_inst = Kiez(algorithm="Faiss", hubness="CSLS")
    >>> k_inst.fit(source, target)
    >>> nn_dist, nn_ind = k_inst.kneighbors()

    You can also utilize tensors and NN calculations on the GPU:

    >>> k_inst = Kiez(algorithm="Faiss", algorithm_kwargs={"use_gpu":True}, hubness="CSLS")
    >>> k_inst.fit(source.cuda(), target.cuda())
    >>> nn_dist, nn_ind = k_inst.kneighbors()

    You can also initialize Kiez via a json file

    >>> kiez = Kiez.from_path("tests/example_conf.json")
    """

    def __init__(
        self,
        n_candidates: int = 10,
        algorithm: HintOrType[NNAlgorithm] = None,
        algorithm_kwargs: Optional[Dict[str, Any]] = None,
        hubness: HintOrType[HubnessReduction] = None,
        hubness_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # Validate n_candidates before anything is resolved.
        if not np.issubdtype(type(n_candidates), np.integer):
            raise TypeError(
                f"n_candidates does not take {type(n_candidates)} value, enter"
                " integer value"
            )
        if n_candidates <= 0:
            raise ValueError(f"Expected n_candidates > 0. Got {n_candidates}")

        # Build a fresh dict so the caller's dictionary is left untouched.
        # An explicit "n_candidates" entry in algorithm_kwargs takes
        # precedence over the n_candidates argument.
        algorithm_kwargs = {"n_candidates": n_candidates, **(algorithm_kwargs or {})}

        if algorithm is None:
            # Default: try Faiss first and fall back to SklearnNN when
            # creating the Faiss algorithm raises ImportError.
            try:
                algorithm = nn_algorithm_resolver.make("Faiss", algorithm_kwargs)
            except ImportError:
                algorithm = nn_algorithm_resolver.make("SklearnNN", algorithm_kwargs)
        else:
            algorithm = nn_algorithm_resolver.make(algorithm, algorithm_kwargs)
        # NOTE(review): presumably a sanity/type-narrowing check; it is
        # skipped when Python runs with -O.
        assert algorithm

        # Copy here as well: the hubness reduction object owns the algorithm,
        # which is handed over via the "nn_algo" keyword.
        hubness_kwargs = {**(hubness_kwargs or {}), "nn_algo": algorithm}
        self.hubness = hubness_reduction_resolver.make(hubness, hubness_kwargs)

    @staticmethod
    def show_algorithm_options() -> List[str]:
        """Return the result of ``available_nn_algorithms(as_string=True)``."""
        return available_nn_algorithms(as_string=True)

    @staticmethod
    def show_hubness_options() -> List[str]:
        """Return the options known to the hubness reduction resolver as a list."""
        return list(hubness_reduction_resolver.options)

    @property
    def algorithm(self):
        """The nearest neighbor algorithm, stored on the hubness reduction object."""
        return self.hubness.nn_algo

    @algorithm.setter
    def algorithm(self, value):
        self.hubness.nn_algo = value

    def __repr__(self):
        """Describe the algorithm, the hubness reduction and the algorithm's fit state."""
        return (
            f"Kiez(algorithm: {self.algorithm},"
            f" hubness: {self.hubness})"
            f" {self.algorithm._describe_source_target_fitted()}"
        )

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "Kiez":
        """Load a Kiez instance from configuration in a JSON file, based on its path.

        Parameters
        ----------
        path : Union[str, Path]
            Location of the JSON file. Its top-level keys are passed as
            keyword arguments to the constructor.

        Returns
        -------
        Kiez
            Instance created from the configuration
        """
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, encoding="utf-8") as file:
            return cls(**json.load(file))

    def fit(self, source: T, target: Optional[T] = None) -> "Kiez":
        """Fits the algorithm and hubness reduction method.

        Parameters
        ----------
        source : matrix of shape (n_samples, n_features)
            embeddings of source entities
        target : matrix of shape (m_samples, n_features)
            embeddings of target entities. If none given, uses the source.

        Returns
        -------
        Kiez
            Fitted kiez instance
        """
        self.hubness.fit(source, target)
        return self

    @overload
    def kneighbors(
        self,
        k: Optional[int] = None,
        return_distance: Literal[True] = True,
    ) -> Tuple[T, T]:
        ...

    @overload
    def kneighbors(
        self,
        k: Optional[int] = None,
        return_distance: Literal[False] = False,
    ) -> Any:
        ...

    def kneighbors(
        self,
        k: Optional[int] = None,
        return_distance: bool = True,
    ) -> Union[T, Tuple[T, T]]:
        """Retrieve the k-nearest neighbors using the supplied nearest neighbor algorithm and hubness reduction method.

        Parameters
        ----------
        k : Optional[int], default = None
            k-nearest neighbors, if None is set to number of n_candidates
        return_distance : bool, default = True
            Whether to return distances
            If `False` only indices are returned

        Returns
        -------
        neigh_dist : ndarray of shape (n_queries, n_neighbors)
            Array representing the distance between source and target entities
            only present if return_distance=True.
        neigh_ind : ndarray of shape (n_queries, n_neighbors)
            Indices of the nearest points in the population matrix.
        """
        # The hubness reduction always yields both values; distances are
        # simply dropped when the caller does not ask for them.
        hubness_reduced_query_dist, query_ind = self.hubness.kneighbors(k)
        if return_distance:
            return hubness_reduced_query_dist, query_ind
        return query_ind