Source code for kiez.neighbors.approximate.random_projection_trees

# SPDX-License-Identifier: BSD-3-Clause
# Author: Tom Dupre la Tour (original work)
#         Roman Feldbauer (adaptions for scikit-hubness)
#         Daniel Obraczka (adaptions for kiez)
# PEP 563: Postponed Evaluation of Annotations

from __future__ import annotations

import logging
from typing import Tuple

import numpy as np
from tqdm.auto import tqdm

from kiez.io.temp_file_handling import create_tempfile_preferably_in_dir
from kiez.neighbors.neighbor_algorithm_base import NNAlgorithmWithJoblib

try:
    import annoy

except ImportError:  # pragma: no cover
    annoy = None


[docs]class Annoy(NNAlgorithmWithJoblib):
    """Wrapper for Spotify's approximate nearest neighbor library.

    Parameters
    ----------
    n_candidates: int
        number of nearest neighbors used in search
    metric: str, default = 'euclidean'
        distance measure used in search
        possible measures are found in :obj:`Annoy.valid_metrics`
    n_trees: int, default = 10
        Build a forest of n_trees trees. More trees gives higher precision when querying,
        but are more expensive in terms of build time and index size.
    search_k: int, default = -1
        Query will inspect search_k nodes. A larger value will give more accurate results,
        but will take longer time.
    mmap_dir: str, default = 'auto'
        Memory-map the index to the given directory.
        This is required to make the the class pickleable.
        If None, keep everything in main memory (NON pickleable index),
        if mmap_dir is a string, it is interpreted as a directory to store the index into,
        if 'auto', create a temp dir for the index, preferably in /dev/shm on Linux.
    n_jobs: int, default = 1
        Number of parallel jobs
    verbose: int, default = 0
        Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying.

    Notes
    -----
    See more details in the annoy documentation: https://github.com/spotify/annoy#full-python-api
    """

    valid_metrics = (
        "angular",
        "euclidean",
        "manhattan",
        "hamming",
        "dot",
        "minkowski",
    )

[docs]    def __init__(
        self,
        n_candidates: int = 5,
        metric: str = "euclidean",
        n_trees: int = 10,
        search_k: int = -1,
        mmap_dir: str = "auto",
        n_jobs: int = 1,
        verbose: int = 0,
    ):
        if annoy is None:  # pragma: no cover
            raise ImportError(
                "Please install the `annoy` package, before using this class.\n"
                "$ pip install annoy"
            ) from None

        if metric not in self.__class__.valid_metrics:
            raise ValueError(
                f"Unknown metric please try one of {self.__class__.valid_metrics}"
            )
        if metric == "minkowski":  # for compatibility
            metric = "euclidean"
        super().__init__(
            n_candidates=n_candidates,
            metric=metric,
            n_jobs=n_jobs,
        )
        self.effective_metric_ = metric
        self.verbose = verbose
        self.n_trees = n_trees
        self.search_k = search_k
        self.mmap_dir = mmap_dir
        self.index_path_source = None
        self.index_path_target = None

    def __repr__(self):
        ret_str = (
            f"{self.__class__.__name__}(n_candidates={self.n_candidates},"
            + f"effective_metric_ = {self.effective_metric_},"
            + f"verbose = {self.verbose},"
            + f"n_trees = {self.n_trees},"
            + f"search_k = {self.search_k},"
            + f"mmap_dir = {self.mmap_dir},"
            + f"n_jobs = {self.n_jobs}"
        )
        if self.index_path_source is not None:
            ret_str += (
                f" source index path={self.index_path_source} and target index"
                f" path={self.index_path_target}"
            )
        ret_str += ")"
        return ret_str

    def _fit(self, data, is_source: bool):
        if is_source:
            self._source = data
            prefix = "kiez_source"
        else:
            self._target = data
            prefix = "kiez_target"
        suffix = ".annoy"
        annoy_index = annoy.AnnoyIndex(data.shape[1], metric=self.effective_metric_)

        index_path = None
        if self.mmap_dir == "auto":
            index_path = create_tempfile_preferably_in_dir(
                prefix=prefix, suffix=suffix, directory="/dev/shm"
            )
            logging.warning(
                f"The index will be stored in {index_path}. It will NOT be deleted"
                " automatically, when this instance is destructed."
            )
        elif isinstance(self.mmap_dir, str):
            index_path = create_tempfile_preferably_in_dir(
                prefix=prefix, suffix=suffix, directory=self.mmap_dir
            )
        else:  # e.g. None
            self.mmap_dir = None

        for i, x in tqdm(
            enumerate(data),
            desc="Build Annoy",
            disable=not self.verbose,
        ):
            annoy_index.add_item(i, x.tolist())
        annoy_index.build(self.n_trees)

        if index_path is not None:
            if is_source:
                self.index_path_source = index_path
            else:
                self.index_path_target = index_path
            annoy_index.save(index_path)
            return index_path, data.shape[1]

        return annoy_index

    def _kneighbors_part(self, k, query, index, return_distance, is_self_querying):
        n_query = query.shape[0]
        query_dtype = query.dtype

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        if is_self_querying:
            k = k + 1
            start = 1
        else:
            start = 0

        # If fewer candidates than required are found for a query,
        # we save index=-1 and distance=NaN
        neigh_ind = -np.ones((n_query, k), dtype=np.int32)
        neigh_dist = np.empty_like(neigh_ind, dtype=query_dtype) * np.nan

        annoy_index = None
        if isinstance(index, Tuple):
            err_msg = (
                "Internal error: annoy index must be either annoy.AnnoyIndex"
                + f"or Tuple[str,int], but found {index}"
            )
            try:
                index, n_features = index
                if not isinstance(index, str) or not isinstance(n_features, int):
                    raise ValueError(err_msg)
            except ValueError as exc:
                raise ValueError(err_msg) from exc
            # Load memory-mapped annoy.Index, unless it's already in main memory
            if isinstance(index, str):
                annoy_index = annoy.AnnoyIndex(
                    n_features, metric=self.effective_metric_
                )
                annoy_index.load(index)
        elif isinstance(index, annoy.AnnoyIndex):
            annoy_index = index
        assert isinstance(
            annoy_index, annoy.AnnoyIndex
        ), f"Internal error: unexpected type ({type(index)} for annoy index"

        disable_tqdm = not self.verbose
        if is_self_querying:
            n_items = annoy_index.get_n_items()

            for i in tqdm(
                range(n_items),
                desc="Query Annoy",
                disable=disable_tqdm,
            ):
                ind, dist = annoy_index.get_nns_by_item(
                    i,
                    k,
                    self.search_k,
                    include_distances=True,
                )
                ind = ind[start:]
                dist = dist[start:]
                neigh_ind[i, : len(ind)] = ind
                neigh_dist[i, : len(dist)] = dist
        else:  # if query was provided
            for i, x in tqdm(
                enumerate(query),
                desc="Query Annoy",
                disable=disable_tqdm,
            ):
                ind, dist = annoy_index.get_nns_by_vector(
                    x.tolist(),
                    k,
                    self.search_k,
                    include_distances=True,
                )
                ind = ind[start:]
                dist = dist[start:]
                neigh_ind[i, : len(ind)] = ind
                neigh_dist[i, : len(dist)] = dist

        if return_distance:
            return neigh_dist, neigh_ind
        return neigh_ind