Source code for oaklib.implementations.translator.translator_implementation

"""
Adapter for NCATS Biomedical Translator endpoints (experimental).

Provides wrappers for

- Node Normalization
- Name Resolution

Examples:

Name resolution:

    .. code-block:: bash

        runoak -i translator: info "citrate"
        CHEBI:16947 ! citrate(3-)
        CHEBI:31602 ! FENTANYL CITRATE
        CHEBI:30769 ! Citric acid
        CHEBI:64733 ! Potassium citrate
        CHEBI:131391 ! Magnesium citrate
        UNII:LXN6S3999X ! MAROPITANT CITRATE
        CHEBI:190513 ! Calcium citrate
        CHEBI:71197 ! TOFACITINIB CITRATE
        CHEBI:9139 ! Sildenafil
        CHEBI:3752 ! Clomifene


Aliases:

    .. code-block:: bash

        runoak -i translator: aliases "CHEBI:16947"
        curie	pred	alias
        CHEBI:16947	oio:hasRelatedSynonym	cit
        CHEBI:16947	oio:hasRelatedSynonym	Citrate
        CHEBI:16947	oio:hasRelatedSynonym	citrate
        CHEBI:16947	oio:hasRelatedSynonym	cit(3-)
        CHEBI:16947	oio:hasRelatedSynonym	citrate(3-)
        ...

Mappings:

    .. code-block:: bash

        runoak -i translator: mappings "CHEBI:16947" -O sssom
        # curie_map:
        #   CAS: http://w3id.org/sssom/unknown_prefix/cas/
        #   CHEBI: http://purl.obolibrary.org/obo/CHEBI_
        #   INCHIKEY: http://w3id.org/sssom/unknown_prefix/inchikey/
        #   PUBCHEM.COMPOUND: http://w3id.org/sssom/unknown_prefix/pubchem.compound/
        #   UNII: http://w3id.org/sssom/unknown_prefix/unii/
        #   owl: http://www.w3.org/2002/07/owl#
        #   rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
        #   rdfs: http://www.w3.org/2000/01/rdf-schema#
        #   semapv: https://w3id.org/semapv/vocab/
        #   skos: http://www.w3.org/2004/02/skos/core#
        #   sssom: https://w3id.org/sssom/
        # license: https://w3id.org/sssom/license/unspecified
        # mapping_set_id: https://w3id.org/sssom/mappings/6b8c0caf-98d7-4c08-b499-8922be3405db
        subject_id	subject_label	predicate_id	object_id
        CHEBI:16947	citrate(3-)	skos:exactMatch	CAS:126-44-3
        CHEBI:16947	citrate(3-)	skos:exactMatch	CHEBI:16947
        CHEBI:16947	citrate(3-)	skos:exactMatch	INCHIKEY:KRKNYBCHXYNGOX-UHFFFAOYSA-K
        CHEBI:16947	citrate(3-)	skos:exactMatch	PUBCHEM.COMPOUND:31348
        CHEBI:16947	citrate(3-)	skos:exactMatch	UNII:664CCH53PI

Term categories:

    .. code-block:: bash

        runoak -i translator: term-categories PUBCHEM.COMPOUND:31348
        curie	subset
        PUBCHEM.COMPOUND:31348	biolink:SmallMolecule
        PUBCHEM.COMPOUND:31348	biolink:MolecularEntity
        PUBCHEM.COMPOUND:31348	biolink:ChemicalEntity
        PUBCHEM.COMPOUND:31348	biolink:PhysicalEssence
        PUBCHEM.COMPOUND:31348	biolink:ChemicalOrDrugOrTreatment
        PUBCHEM.COMPOUND:31348	biolink:ChemicalEntityOrGeneOrGeneProduct
        PUBCHEM.COMPOUND:31348	biolink:ChemicalEntityOrProteinOrPolypeptide
        PUBCHEM.COMPOUND:31348	biolink:NamedThing
        PUBCHEM.COMPOUND:31348	biolink:PhysicalEssenceOrOccurrent


"""

import logging
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List, Mapping, Optional, Tuple, Union

import requests
import sssom_schema.datamodel.sssom_schema as sssom

from oaklib.constants import TIMEOUT_SECONDS
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.vocabulary import (
    HAS_RELATED_SYNONYM,
    RDFS_LABEL,
    SEMAPV,
    SKOS_CLOSE_MATCH,
    SKOS_EXACT_MATCH,
)
from oaklib.interfaces import SearchInterface
from oaklib.interfaces.basic_ontology_interface import ALIAS_MAP, LANGUAGE_TAG
from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface
from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface
from oaklib.types import CATEGORY_CURIE, CURIE, PRED_CURIE

__all__ = [
    "TranslatorImplementation",
]

from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources

# Full URL of the Node Normalization operation; queried with ``curie`` and
# ``conflate`` query parameters by the category, information-content and
# mapping methods of TranslatorImplementation.
NODE_NORMALIZER_ENDPOINT = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"
# Base URL of the Name Resolution service; ``/lookup`` and ``/reverse_lookup``
# are appended to it by the search, label and alias methods.
NAME_RESOLUTION_ENDPOINT = "https://name-resolution-sri.renci.org"
# ARS submit URL; only referenced from the commented-out ``relationships``
# draft at the end of this module.
ARS_SUBMIT_ENDPOINT = "https://ars-prod.transltr.io/ars/api/submit"


@dataclass
class TranslatorImplementation(
    MappingProviderInterface,
    SearchInterface,
    SemanticSimilarityInterface,
):
    """
    Wraps Translator SRI endpoints.

    Node Normalization (``NODE_NORMALIZER_ENDPOINT``) backs
    :meth:`terms_categories`, :meth:`information_content_scores` and
    :meth:`sssom_mappings`; Name Resolution (``NAME_RESOLUTION_ENDPOINT``)
    backs :meth:`basic_search`, :meth:`label` and :meth:`entity_aliases`.
    Every request is issued with ``requests.get`` and ``TIMEOUT_SECONDS``.
    """

    @staticmethod
    def _as_curie_list(curies: Union[CURIE, Iterable[CURIE]]) -> List[CURIE]:
        """Return ``curies`` as a list, wrapping a single CURIE in a one-element list."""
        if isinstance(curies, CURIE):
            return [curies]
        return list(curies)

    @staticmethod
    def _fetch_normalized_nodes(curies: List[CURIE], conflate: bool) -> Optional[dict]:
        """
        Query the node normalizer and return its JSON payload keyed by CURIE.

        :param curies: CURIEs to normalize
        :param conflate: sent to the service as the ``conflate`` query parameter
        :return: the decoded payload, or None when the service answered with a
            ``detail`` message instead of per-CURIE results
        """
        r = requests.get(
            NODE_NORMALIZER_ENDPOINT,
            params={"curie": curies, "conflate": "true" if conflate else "false"},
            timeout=TIMEOUT_SECONDS,
        )
        # NOTE(review): raise_for_status() is deliberately not called here;
        # presumably the "Not found." detail arrives with a non-2xx status
        # and must stay a silent empty result - confirm against the service.
        results = r.json()
        if "detail" in results:
            detail = results["detail"]
            if detail != "Not found.":
                # Any other detail is a service-side message, not node data;
                # iterating it as if it were results would fail on ``.get``.
                logging.warning(f"Node normalizer returned no results: {detail}")
            return None
        return results

    @staticmethod
    def _fetch_reverse_lookup(curie: CURIE) -> Optional[dict]:
        """
        Return the name-resolution ``reverse_lookup`` entry for ``curie``.

        :param curie: CURIE to look up
        :return: the entry for ``curie``, or None if the response has none
        :raises requests.HTTPError: if the service responds with an error status
        """
        r = requests.get(
            f"{NAME_RESOLUTION_ENDPOINT}/reverse_lookup",
            params={"curies": curie},
            timeout=TIMEOUT_SECONDS,
        )
        r.raise_for_status()
        return r.json().get(curie)

    def terms_categories(self, curies: Iterable[CURIE]) -> Iterable[Tuple[CURIE, CATEGORY_CURIE]]:
        """
        Yield (curie, category) pairs from the node normalizer ``type`` field.

        :param curies: a single CURIE or an iterable of CURIEs
        :return: iterator of (curie, category) tuples; CURIEs for which the
            service returns no entry are skipped
        """
        results = self._fetch_normalized_nodes(self._as_curie_list(curies), conflate=False)
        if results is None:
            return
        for curie, data in results.items():
            if not data:
                # Empty/null entry for this CURIE: nothing to report.
                continue
            for category in data.get("type") or []:
                yield curie, category

    def information_content_scores(
        self,
        curies: Optional[Iterable[CURIE]] = None,
        predicates: Optional[List[PRED_CURIE]] = None,
        object_closure_predicates: Optional[List[PRED_CURIE]] = None,
        use_associations: Optional[bool] = None,
        term_to_entities_map: Optional[Dict[CURIE, List[CURIE]]] = None,
        **kwargs,
    ) -> Iterator[Tuple[CURIE, float]]:
        """
        Yield (curie, information_content) pairs reported by the node normalizer.

        Only ``curies`` is used; the remaining parameters are accepted but
        ignored by this implementation.

        :param curies: a single CURIE or an iterable of CURIEs (``None`` is not
            supported and fails when converted to a list)
        :return: iterator of (curie, score) tuples for entries that carry an
            ``information_content`` value
        """
        results = self._fetch_normalized_nodes(self._as_curie_list(curies), conflate=False)
        if results is None:
            return
        for curie, data in results.items():
            if not data:
                # Empty/null entry for this CURIE: no score available.
                continue
            ic = data.get("information_content", None)
            if ic is not None:
                yield curie, ic

    def sssom_mappings(
        self, curies: Optional[Union[CURIE, Iterable[CURIE]]] = None, source: Optional[str] = None
    ) -> Iterable[sssom.Mapping]:
        """
        Yield SSSOM mappings built from node normalizer equivalent identifiers.

        The normalizer is queried twice, without and with conflation. An
        equivalent identifier that is also present in the non-conflated answer
        is emitted as ``skos:exactMatch``; one that only appears after
        conflation is emitted as ``skos:closeMatch``.

        :param curies: a single CURIE or an iterable of CURIEs (``None`` is not
            supported and fails when converted to a list)
        :param source: if set, only mappings whose ``object_source`` equals
            this value are yielded
        :return: iterator of ``sssom.Mapping`` objects
        """
        curies = self._as_curie_list(curies)
        non_conflated_results = self._fetch_normalized_nodes(curies, conflate=False) or {}
        results = self._fetch_normalized_nodes(curies, conflate=True)
        if results is None:
            return
        subjects = set()
        for curie, data in results.items():
            if not data:
                logging.info(f"No results for {curie} in {curies}")
                continue
            # The non-conflated entry may be missing or null for this CURIE.
            nc_data = non_conflated_results.get(curie) or {}
            # Built once per CURIE so each predicate decision is a set lookup.
            exact_ids = {x["identifier"] for x in nc_data.get("equivalent_identifiers", [])}
            equiv_identifiers = data.get("equivalent_identifiers", [])
            label = None
            for x in equiv_identifiers:
                if x["identifier"] == curie:
                    label = x.get("label", None)
            for x in equiv_identifiers:
                object_id = x["identifier"]
                pred = SKOS_EXACT_MATCH if object_id in exact_ids else SKOS_CLOSE_MATCH
                m = sssom.Mapping(
                    subject_id=curie,
                    subject_label=label,
                    predicate_id=pred,
                    object_id=object_id,
                    object_label=x.get("label", None),
                    mapping_justification=str(SEMAPV.ManualMappingCuration.value),
                )
                inject_mapping_sources(m)
                if source and m.object_source != source:
                    continue
                yield m
                # Only CURIEs that actually produced a mapping count as found.
                subjects.add(curie)
        for curie in curies:
            if curie not in subjects:
                logging.warning(f"Could not find any mappings for {curie}")

    def inject_mapping_labels(self, mappings: Iterable[sssom.Mapping]) -> None:
        """Do nothing; :meth:`sssom_mappings` already sets labels when it builds mappings."""
        return

    def basic_search(
        self, search_term: str, config: Optional[SearchConfiguration] = None
    ) -> Iterable[CURIE]:
        """
        Yield CURIEs returned by the name-resolution ``lookup`` operation.

        The label of each hit is stored in ``self.property_cache`` so a later
        :meth:`label` call can be answered without another request.

        :param search_term: text to look up (sent with ``autocomplete=true``)
        :param config: accepted but not used by this implementation
        :return: iterator of matching CURIEs, in the order returned
        :raises requests.HTTPError: if the service responds with an error status
        """
        r = requests.get(
            f"{NAME_RESOLUTION_ENDPOINT}/lookup",
            params={"string": search_term, "autocomplete": "true"},
            timeout=TIMEOUT_SECONDS,
        )
        r.raise_for_status()
        for result in r.json():
            curie = result["curie"]
            self.property_cache.add(curie, RDFS_LABEL, result["label"])
            yield curie

    def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
        """
        Return the preferred name of ``curie``.

        :param curie: CURIE to look up
        :param lang: language tags are not supported
        :return: the cached label if present, else the ``preferred_name`` from
            ``reverse_lookup``, or None if the service has no entry
        :raises NotImplementedError: if ``lang`` is given
        :raises requests.HTTPError: if the service responds with an error status
        """
        if lang:
            raise NotImplementedError
        if self.property_cache.contains(curie, RDFS_LABEL):
            return self.property_cache.get(curie, RDFS_LABEL)
        entry = self._fetch_reverse_lookup(curie)
        if entry is None:
            return None
        return entry["preferred_name"]

    def entity_aliases(self, curie: CURIE) -> List[str]:
        """
        Return the ``names`` listed for ``curie`` by ``reverse_lookup``.

        :param curie: CURIE to look up
        :return: list of names, or an empty list if the service has no entry
        :raises requests.HTTPError: if the service responds with an error status
        """
        entry = self._fetch_reverse_lookup(curie)
        if entry is None:
            return []
        return entry["names"]

    def entity_alias_map(self, curie: CURIE) -> ALIAS_MAP:
        """Return all aliases of ``curie`` keyed under ``HAS_RELATED_SYNONYM``."""
        return {HAS_RELATED_SYNONYM: self.entity_aliases(curie)}
# def relationships( # self, # subjects: Iterable[CURIE] = None, # predicates: Iterable[PRED_CURIE] = None, # objects: Iterable[CURIE] = None, # include_tbox: bool = True, # include_abox: bool = True, # include_entailed: bool = False, # exclude_blank: bool = True, # ) -> Iterator[RELATIONSHIP]: # query = { # "message": { # "query_graph": { # "edges": { # "e00": { # "subject": "n00", # "object": "n01", # "predicates": ["biolink:entity_negatively_regulates_entity"] # }, # "e01": { # "subject": "n01", # "object": "n02", # "predicates": ["biolink:related_to"] # } # }, # "nodes": { # "n00": { # "ids": ["PUBCHEM.COMPOUND:644073"], # "categories": ["biolink:ChemicalEntity"] # }, # "n01": { # "categories": ["biolink:BiologicalProcessOrActivity", "biolink:Gene", "biolink:Pathway"] # }, # "n02": { # "ids": ["HP:0000217"], # "categories": ["biolink:DiseaseOrPhenotypicFeature"] # } # } # } # } # } # r = requests.post(ARS_SUBMIT_ENDPOINT, json=query, timeout=TIMEOUT_SECONDS) # pk = r.get('pk') # import yaml # print(yaml.dump(r.json()))