Source code for oaklib.implementations.gilda

"""A text annotator based on Gilda."""

import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterator

from oaklib.datamodels.text_annotator import TextAnnotation, TextAnnotationConfiguration
from oaklib.interfaces import TextAnnotatorInterface
from oaklib.interfaces.text_annotator_interface import TEXT, nen_annotation

if TYPE_CHECKING:
    import gilda

# Public API of this module: only the Gilda-backed annotator class is exported.
__all__ = [
    "GildaImplementation",
]


@dataclass
class GildaImplementation(TextAnnotatorInterface):
    """
    Perform named entity normalization on text strings with Gilda [gyori2021]_.

    .. [gyori2021] Benjamin M Gyori, Charles Tapley Hoyt, Albert Steppi (2021)
        `Gilda: biomedical entity text normalization with machine-learned disambiguation
        as a service <https://doi.org/10.1093/bioadv/vbac034>`_, *Bioinformatics Advances*,
        Volume 2, Issue 1, 2022, vbac034.
    """

    grounder: "gilda.Grounder" = None
    """A grounder used by Gilda.

    This is instantiated in one of the following ways:

    1. It can be passed directly during instantiation of the
       :class:`GildaImplementation` class.
    2. If not passed and this implementation's ``slug`` attribute is set to a path
       to a gzipped term TSV file, it gets instantiated with the custom index
    3. Otherwise, it gets instantiated with the default Gilda term index
    """

    def __post_init__(self):
        """
        Build :attr:`grounder` when the caller did not supply one.

        The ``gilda`` import is deferred to this point; at module level it is
        only imported under ``TYPE_CHECKING``.
        """
        if self.grounder is not None:
            return

        from gilda.grounder import Grounder

        # The slug corresponds to the path to a gzipped terms TSV
        # when parsed from a descriptor like ``gilda:<path>`` via
        # :func:`get_resource_from_shorthand`. If no <path> was
        # given, then this will default to the default Gilda index
        #
        # Only the attribute lookup is guarded: an AttributeError raised
        # while Gilda itself loads the term file must propagate instead of
        # silently falling back to the default index.
        try:
            terms = self.resource.slug
        except AttributeError:  # i.e., there's no slug
            logging.warning("Gilda grounder will use default term index.")
            self.grounder = Grounder()
        else:
            self.grounder = Grounder(terms=terms)

    def annotate_text(
        self, text: TEXT, configuration: TextAnnotationConfiguration = None
    ) -> Iterator[TextAnnotation]:
        """
        Implements annotate_text from text_annotator_interface using Gilda.

        When ``configuration.matches_whole_text`` is set, the whole string is
        grounded as a single entity; otherwise Gilda's NER ``annotate``
        function is used to find entity mentions inside the text.

        :param text: Text to be annotated.
        :param configuration: Text annotation configuration.
        :yield: One :class:`TextAnnotation` per match.
        :raises NotImplementedError: if no configuration is given. Because this
            method is a generator, the error surfaces when iteration starts,
            not when the method is called.
        """
        if not configuration:
            raise NotImplementedError("Missing text annotation configuration")
        if configuration.matches_whole_text:
            yield from self._ground(text)
        else:
            yield from self._gilda_annotate(text)

    def _gilda_annotate(self, text: str) -> Iterator[TextAnnotation]:
        """
        Yield an annotation for every entity mention Gilda finds in ``text``.

        :param text: Text to scan for entity mentions.
        :yield: Annotations carrying the matched span, its offsets and the
            CURIE and entry name of the matched term.
        """
        from gilda.ner import annotate

        for match_text, match, start, end in annotate(text, grounder=self.grounder):
            yield TextAnnotation(
                subject_start=start,
                subject_end=end,
                subject_label=match_text,
                object_id=match.term.get_curie(),
                object_label=match.term.entry_name,
                # The span counts as a whole-text match only if it covers
                # the input from its first to its last character.
                matches_whole_text=start == 0 and end == len(text),
            )

    def _ground(self, text: str) -> Iterator[TextAnnotation]:
        """
        Ground ``text`` as a single entity and yield one annotation per match.

        :param text: The string to ground as a whole.
        :yield: Annotations built with :func:`nen_annotation` for each match
            returned by the grounder.
        """
        for match in self.grounder.ground(text):
            yield nen_annotation(
                text=text,
                object_id=match.term.get_curie(),
                object_label=match.term.entry_name,
            )