Source code for oaklib.implementations.gilda

"""A text annotator based on Gilda."""

import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterator

from oaklib.datamodels.text_annotator import TextAnnotation, TextAnnotationConfiguration
from oaklib.interfaces import TextAnnotatorInterface
from oaklib.interfaces.text_annotator_interface import TEXT, nen_annotation

if TYPE_CHECKING:
    import gilda

__all__ = [
    "GildaImplementation",
]



[docs]
@dataclass
class GildaImplementation(TextAnnotatorInterface):
    """
    Perform named entity normalization on text strings with Gilda [gyori2021]_.

    Matching is delegated to the :attr:`grounder` held by the instance; see
    :meth:`annotate_text` for the two supported matching modes.

    .. [gyori2021] Benjamin M Gyori, Charles Tapley Hoyt, Albert Steppi (2021)
        `Gilda: biomedical entity text normalization with machine-learned
        disambiguation as a service <https://doi.org/10.1093/bioadv/vbac034>`_,
        *Bioinformatics Advances*, Volume 2, Issue 1, 2022, vbac034.
    """

    grounder: "gilda.Grounder" = None
    """A grounder used by Gilda.

    This is instantiated in one of the following ways:

    1. It can be passed directly during instantiation of the
       :class:`GildaImplementation` class.
    2. If not passed and this implementation's ``slug`` attribute is set
       to a path to a gzipped term TSV file, it gets instantiated with the
       custom index.
    3. Otherwise, it gets instantiated with the default Gilda term index.
    """

    def __post_init__(self):
        """
        Instantiate :attr:`grounder` unless one was passed to the constructor.

        The terms for the grounder are read from ``self.resource.slug``. If
        that attribute cannot be read, a warning is logged and a grounder
        with the default Gilda term index is created instead.
        """
        if self.grounder is not None:
            return

        # Imported lazily so that Gilda is only required when a grounder
        # actually has to be built here.
        from gilda.grounder import Grounder

        # The slug corresponds to the path to a gzipped terms TSV
        # when parsed from a descriptor like ``gilda:<path>`` via
        # :func:`get_resource_from_shorthand`. If no <path> was
        # given, then this will default to the default Gilda index.
        #
        # Only the attribute lookup is guarded: an AttributeError raised
        # while Gilda loads a custom index is a real failure and must not be
        # masked by a silent fallback to the default index.
        try:
            terms = self.resource.slug
        except AttributeError:  # i.e., there's no slug
            logging.warning("Gilda grounder will use default term index.")
            self.grounder = Grounder()
        else:
            self.grounder = Grounder(terms=terms)

    def annotate_text(
        self, text: TEXT, configuration: TextAnnotationConfiguration = None
    ) -> Iterator[TextAnnotation]:
        """
        Annotate text with the Gilda grounder held by this implementation.

        If ``configuration.matches_whole_text`` is truthy, the whole text is
        grounded as a single entity string. Otherwise the text is scanned
        with :func:`gilda.ner.annotate` and each matching span is reported.

        :param text: Text to be annotated.
        :param configuration: Text annotation configuration. If omitted, the
            text is scanned for matching spans, i.e. it is handled the same
            way as a configuration whose ``matches_whole_text`` is not set.
        :yield: One text annotation per match.
        """
        # A missing configuration used to raise NotImplementedError even
        # though ``None`` is the declared default; treat it as "no whole-text
        # constraint" so the method is usable with its own default.
        if configuration and configuration.matches_whole_text:
            yield from self._ground(text)
        else:
            yield from self._gilda_annotate(text)

    def _gilda_annotate(self, text: str) -> Iterator[TextAnnotation]:
        """
        Yield an annotation for each span that Gilda recognizes in the text.

        :param text: Text to scan for entity mentions.
        :yield: One text annotation per result of :func:`gilda.ner.annotate`,
            flagged as a whole-text match when the span covers all of ``text``.
        """
        from gilda.ner import annotate

        # NOTE(review): this unpacking assumes ``annotate`` returns 4-tuples
        # of (matched text, match, start, end); newer Gilda releases may
        # return annotation objects instead - confirm against the pinned
        # Gilda version.
        for match_text, match, start, end in annotate(text, grounder=self.grounder):
            yield TextAnnotation(
                subject_start=start,
                subject_end=end,
                subject_label=match_text,
                object_id=match.term.get_curie(),
                object_label=match.term.entry_name,
                matches_whole_text=start == 0 and end == len(text),
            )

    def _ground(self, text: str) -> Iterator[TextAnnotation]:
        """
        Yield an annotation for each match found by grounding the whole text.

        :param text: Entity string passed as-is to the grounder.
        :yield: One annotation, built with :func:`nen_annotation`, per match
            returned by ``self.grounder.ground``.
        """
        for match in self.grounder.ground(text):
            yield nen_annotation(
                text=text,
                object_id=match.term.get_curie(),
                object_label=match.term.entry_name,
            )