# Source code for kiez.io.data_loading

"""Convenience methods for loading entity embeddings from knowledge graph embeddings."""
import os
from typing import Dict, Tuple

import numpy as np


def _read_kg_ids(path) -> Dict[int, str]:
    """Read a tab-separated entity id file into a row-index -> entity-id dict.

    For every non-blank line the first tab-separated field is used as the
    entity id and the second field is parsed as the integer row index.

    Parameters
    ----------
    path
        Path of the file to read

    Returns
    -------
    Dict[int, str]
        Entity id keyed by integer row index

    Raises
    ------
    IndexError
        If a non-blank line has no second tab-separated field
    ValueError
        If the second field is not an integer
    """
    ids = {}
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(path, encoding="utf-8") as in_file:
        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry no entry.
                continue
            fields = stripped.split("\t")
            ids[int(fields[1])] = fields[0]
    return ids


def _read_ent_links(path) -> Dict[str, str]:
    """Read a tab-separated entity link file into a dict.

    For every non-blank line the first tab-separated field becomes the key
    and the second field becomes the value.

    Parameters
    ----------
    path
        Path of the file to read

    Returns
    -------
    Dict[str, str]
        Second field of each line keyed by its first field

    Raises
    ------
    IndexError
        If a non-blank line has no second tab-separated field
    """
    links = {}
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(path, encoding="utf-8") as in_file:
        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry no link.
                continue
            fields = stripped.split("\t")
            links[fields[0]] = fields[1]
    return links


def _split_emb(emb, kg_ids):
    """Select the embedding rows listed in ``kg_ids``.

    Parameters
    ----------
    emb
        Embeddings, indexed by row along the first axis
    kg_ids
        Mapping from row index in ``emb`` to entity id; keys that do not
        address an existing row are ignored

    Returns
    -------
    new_emb: np.ndarray
        The rows of ``emb`` whose index is a key of ``kg_ids``, in ascending
        row order
    new_ids: dict
        Entity id -> position of that entity's row in ``new_emb``
    """
    emb = np.asarray(emb)
    # Walk the row indices in order so positions in the result follow row order.
    rows = [idx for idx in range(len(emb)) if idx in kg_ids]
    new_ids = {kg_ids[idx]: pos for pos, idx in enumerate(rows)}
    # Indexing with an explicit integer array copies the selected rows and keeps
    # the trailing dimensions even when no row is selected.
    return emb[np.asarray(rows, dtype=np.intp)], new_ids


def _read_openea_files(emb_dir_path, kg_path):
    """Load the embedding array, both entity id files and the entity links.

    Parameters
    ----------
    emb_dir_path
        Folder containing ``ent_embeds.npy``, ``kg1_ent_ids`` and ``kg2_ent_ids``
    kg_path
        Folder containing ``ent_links``

    Returns
    -------
    emb, kg1_ids, kg2_ids, ent_links
        loaded embedding array
        contents of both entity id files
        contents of the entity link file
    """
    embedding_file = os.path.join(emb_dir_path, "ent_embeds.npy")
    first_ids_file = os.path.join(emb_dir_path, "kg1_ent_ids")
    second_ids_file = os.path.join(emb_dir_path, "kg2_ent_ids")
    links_file = os.path.join(kg_path, "ent_links")

    # Files are read in this order: embeddings, ids of graph 1, ids of graph 2, links.
    return (
        np.load(embedding_file),
        _read_kg_ids(first_ids_file),
        _read_kg_ids(second_ids_file),
        _read_ent_links(links_file),
    )


def _seperate_common_embedding(
    emb: np.ndarray,
    kg1_ids: Dict[int, str],
    kg2_ids: Dict[int, str],
    ent_links: Dict[str, str],
) -> Tuple[np.ndarray, np.ndarray, Dict[str, int], Dict[str, int], Dict[int, int]]:
    """Separate a single embedding array into two arrays split by knowledge graph.

    Parameters
    ----------
    emb: np.ndarray
        embedding array
    kg1_ids: Dict[int,str]
        row index and entity ids of knowledge graph 1
    kg2_ids: Dict[int,str]
        row index and entity ids of knowledge graph 2
    ent_links: Dict[str,str]
        linked entities

    Returns
    -------
    emb1, emb2, kg1_ids_new, kg2_ids_new, ent_links_new
        embeddings of both knowledge graphs
        entity id -> row position in the respective new embedding array,
        for both knowledge graphs
        entity links expressed as row position in emb1 -> row position in emb2

    Raises
    ------
    KeyError
        If an entity id in ``ent_links`` has no row in the corresponding
        split embedding
    """
    emb1, kg1_ids_new = _split_emb(emb, kg1_ids)
    emb2, kg2_ids_new = _split_emb(emb, kg2_ids)
    # Translate links from entity ids to row positions in the split arrays.
    ent_links_new = {
        kg1_ids_new[left]: kg2_ids_new[right] for left, right in ent_links.items()
    }
    return emb1, emb2, kg1_ids_new, kg2_ids_new, ent_links_new


def from_openea(
    emb_dir_path: str, kg_path: str
) -> Tuple[np.ndarray, np.ndarray, Dict[str, int], Dict[str, int], Dict[int, int]]:
    """Load OpenEA-type data.

    Parameters
    ----------
    emb_dir_path: str
        Path to folder containing embedding
    kg_path: str
        Path to folder containing kg info

    Returns
    -------
    emb1, emb2, kg1_ids_new, kg2_ids_new, ent_links_new
        embeddings of both knowledge graphs
        entity id -> row position in the respective embedding array,
        for both knowledge graphs
        entity links expressed as row position in emb1 -> row position in emb2

    Notes
    -----
    See here for more information on the dataset structure
    https://github.com/nju-websoft/OpenEA#dataset-description
    """
    emb, kg1_ids, kg2_ids, ent_links = _read_openea_files(emb_dir_path, kg_path)
    return _seperate_common_embedding(emb, kg1_ids, kg2_ids, ent_links)