Source code for oaklib.interfaces.association_provider_interface

import logging
from abc import ABC
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple

from oaklib.datamodels.association import Association, PairwiseCoAssociation
from oaklib.datamodels.similarity import TermSetPairwiseSimilarity
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces.basic_ontology_interface import BasicOntologyInterface
from oaklib.interfaces.obograph_interface import OboGraphInterface
from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface
from oaklib.types import CURIE, PRED_CURIE, SUBSET_CURIE
from oaklib.utilities.associations.association_index import AssociationIndex
from oaklib.utilities.iterator_utils import chunk

ASSOCIATION_CORRELATION = Tuple[
    CURIE, CURIE, int, Optional[List[CURIE]], Optional[List[Association]]
]


def associations_subjects(associations: Iterable[Association]) -> Iterator[CURIE]:
    """
    Yields distinct subjects for all associations.

    :param associations:
    :return:
    """
    seen = set()
    for a in associations:
        if a.subject in seen:
            continue
        yield a.subject
        seen.add(a.subject)


def associations_objects(associations: Iterable[Association]) -> Iterator[CURIE]:
    """
    Yields distinct subjects for all associations.

    :param associations:
    :return:
    """
    seen = set()
    for a in associations:
        if a.object in seen:
            continue
        yield a.object
        seen.add(a.object)


@dataclass
class EntityNormalizer:
    """
    Describes how identifier fields should be normalized
    """

    adapter: MappingProviderInterface

    query_time: bool = field(default=False)
    strict: bool = field(default=False)

    source_prefixes: List[str] = field(default_factory=list)
    target_prefixes: List[str] = field(default_factory=list)

    prefix_alias_map: Dict[str, str] = field(default_factory=dict)

    slots: List[str] = field(default_factory=list)


[docs] @dataclass class AssociationProviderInterface(BasicOntologyInterface, ABC): """ An ontology provider that provides associations. Associations (also known as annotations) connect data elements or entities to an ontology class. Examples of associations include: - Gene associations to terms in ontologies like GO, Mondo, Uberon, HPO, MPO, CL - Associations between spans of text and ontology entities Data models and file formats include: - The GO GAF and GPAD formats. - The HPOA association file format. - KGX (Knowledge Graph Exchange). - The W3 `Open Annotation <https://www.w3.org/TR/annotation-model/>`_ (OA) data model The OA datamodel considers an annotation to be between a *body* and a *target*: .. image:: https://www.w3.org/TR/annotation-vocab/images/examples/annotation.png .. warning:: the signature of some methods are subject to change while we decide on the best patterns to define here. Note that most ontology sources are not themselves providers of associations; there is no agreed upon way to represent associations in OWL, and associations are typically distributed separately from ontologies. See the section :ref:`associations` in the OAK guide for more details. OAK provides a number of ways to augment an ontology adapter with associations. The most expressive way is to provide an :ref:`InputSpecification`: >>> from oaklib import get_adapter >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml") This combines an ontology source with one or more association sources. Another approach is to use an adapter that directly supports associations. An example of this is the :ref:`amigo_implementation`: >>> from oaklib import get_adapter >>> amigo = get_adapter("amigo:NCBITaxon:10090") # mouse Command Line Use ---------------- .. code:: runoak -i foo.db associations UBERON:0002101 """ _association_index: AssociationIndex = None """In-memory index of associations""" normalizers: List[EntityNormalizer] = field(default_factory=list)
[docs] def associations( self, subjects: Iterable[CURIE] = None, predicates: Iterable[PRED_CURIE] = None, objects: Iterable[CURIE] = None, property_filter: Dict[PRED_CURIE, Any] = None, subject_closure_predicates: Optional[List[PRED_CURIE]] = None, predicate_closure_predicates: Optional[List[PRED_CURIE]] = None, object_closure_predicates: Optional[List[PRED_CURIE]] = None, include_modified: bool = False, add_closure_fields: bool = False, **kwargs, ) -> Iterator[Association]: """ Yield all matching associations. To query by subject (e.g. genes): >>> from oaklib import get_adapter >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml") >>> genes = ["PomBase:SPAC1142.02c", "PomBase:SPAC3H1.05", "PomBase:SPAC1142.06", "PomBase:SPAC4G8.02c"] >>> for assoc in adapter.associations(genes): ... print(f"{assoc.object} {adapter.label(assoc.object)}") <BLANKLINE> ... GO:0006620 post-translational protein targeting to endoplasmic reticulum membrane ... To query by object (e.g. descriptor terms): >>> from oaklib import get_adapter >>> from oaklib.datamodels.vocabulary import IS_A, PART_OF >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml") >>> for assoc in adapter.associations(objects=["GO:0006620"], object_closure_predicates=[IS_A, PART_OF]): ... print(f"{assoc.subject} {assoc.subject_label}") <BLANKLINE> ... PomBase:SPAC1142.02c sgt2 ... When determining a match on `objects`, the predicates in ``object_closure_predicates`` is used. We recommend you always explicitly provide this. A good choice is typically IS_A and PART_OF for ontologies like GO, Uberon, CL, ENVO. :param subjects: constrain to these subjects (e.g. genes in a gene association) :param predicates: constrain to these predicates (e.g. involved-in for a gene to pathway association) :param objects: constrain to these objects (e.g. terms) :param property_filter: generic query filter :param subject_closure_predicates: subjects is treated as descendant via these predicates :param predicate_closure_predicates: predicates is treated as descendant via these predicates :param object_closure_predicates: object is treated as descendant via these predicates :param add_closure_fields: add subject and object closure fields to the association :param include_modified: :return: """ # Default implementation: this may be overridden for efficiency if objects and object_closure_predicates: if isinstance(self, OboGraphInterface): # TODO: use more efficient procedure when object has many descendants objects = list( self.descendants(list(objects), predicates=object_closure_predicates) ) else: raise NotImplementedError ix = self._association_index if ix is None: logging.warning("No association index") return yield from ix.lookup(subjects, predicates, objects)
def _inject_subject_labels(self, association_iterator: Iterable[Association]): for assoc_it in chunk(association_iterator): associations = list(assoc_it) subjects = {a.subject for a in associations} label_map = {s: name for s, name in self.labels(subjects)} logging.info(f"LABEL MAP: {label_map} for {subjects}") for association in associations: if association.subject in label_map: association.subject_label = label_map[association.subject] yield from associations
[docs] def associations_subjects(self, *args, **kwargs) -> Iterator[CURIE]: """ Yields all distinct subjects. >>> from oaklib import get_adapter >>> from oaklib.datamodels.vocabulary import IS_A, PART_OF >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml") >>> preds = [IS_A, PART_OF] >>> for gene in adapter.associations_subjects(objects=["GO:0045047"], object_closure_predicates=preds): ... print(gene) <BLANKLINE> ... PomBase:SPBC1271.05c ... :param kwargs: same arguments as for :ref:`associations` :return: """ # individual implementations should override this to be more efficient yielded = set() for a in self.associations(*args, **kwargs): s = a.subject if s in yielded: continue yield s yielded.add(s)
[docs] def association_pairwise_coassociations( self, curies1: Iterable[CURIE], curies2: Iterable[CURIE], inputs_are_subjects=False, include_reciprocals=False, include_diagonal=True, include_entities=True, **kwargs, ) -> Iterator[PairwiseCoAssociation]: """ Find co-associations. >>> from oaklib import get_adapter >>> from oaklib.datamodels.vocabulary import IS_A, PART_OF >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml") >>> terms = ["GO:0000910", "GO:0006281", "GO:0006412"] >>> preds = [IS_A, PART_OF] >>> for coassoc in adapter.association_pairwise_coassociations(curies1=terms, ... curies2=terms, ... object_closure_predicates=preds): ... print(coassoc.object1, coassoc.object2, coassoc.number_subjects_in_common) <BLANKLINE> ... GO:0006281 GO:0000910 0 ... :param curies1: :param curies2: :param inputs_are_subjects: :param kwargs: :return: """ if inputs_are_subjects: raise NotImplementedError curies1 = list(curies1) curies2 = list(curies2) logging.info(f"Finding co-associations between {curies1} and {curies2}") assocmap = { c: list(self.associations(objects=[c], **kwargs)) for c in set(curies1 + curies2) } assocmap1 = {c: assocmap[c] for c in curies1} assocmap2 = {c: assocmap[c] for c in curies2} for c1 in curies1: for c2 in curies2: if c2 > c1 and not include_reciprocals: continue if c1 == c2 and not include_diagonal: continue assocs1 = assocmap1[c1] assocs2 = assocmap2[c2] elements1 = {a.subject for a in assocs1} elements2 = {a.subject for a in assocs2} common = elements1.intersection(elements2) assocs_to_common = [a for a in assocs1 + assocs2 if a.subject in common] coassoc = PairwiseCoAssociation( object1=c1, object2=c2, number_subjects_in_common=len(common), number_subjects_in_union=len(elements1.union(elements2)), number_subject_unique_to_entity1=len(elements1.difference(elements2)), number_subject_unique_to_entity2=len(elements2.difference(elements1)), ) if coassoc.number_subjects_in_union: coassoc.proportion_subjects_in_common = ( coassoc.number_subjects_in_common / coassoc.number_subjects_in_union ) if include_entities: coassoc.subjects_in_common = list(common) coassoc.associations_for_subjects_in_common = assocs_to_common yield coassoc
[docs] def add_associations( self, associations: Iterable[Association], normalizers: List[EntityNormalizer] = None, **kwargs, ) -> bool: """ Store a collection of associations for later retrievals. :param associations: :param normalizers: :return: """ if not normalizers and self.normalizers: normalizers = self.normalizers if normalizers: associations = list(associations) associations = self.normalize_associations(associations, normalizers) if self._association_index is None: logging.info("Creating association index") self._association_index = AssociationIndex() self._association_index.create() logging.info("Populating associations to index") self._association_index.populate(associations) return True
[docs] def association_counts( self, subjects: Iterable[CURIE] = None, predicates: Iterable[PRED_CURIE] = None, property_filter: Dict[PRED_CURIE, Any] = None, subject_closure_predicates: Optional[List[PRED_CURIE]] = None, predicate_closure_predicates: Optional[List[PRED_CURIE]] = None, object_closure_predicates: Optional[List[PRED_CURIE]] = None, include_modified: bool = False, group_by: Optional[str] = "object", limit: Optional[int] = None, **kwargs, ) -> Iterator[Tuple[CURIE, int]]: """ Yield objects together with the number of distinct associations. :param subjects: :param predicates: :param property_filter: :param subject_closure_predicates: :param predicate_closure_predicates: :param object_closure_predicates: :param include_modified: :param group_by: :param limit: :param kwargs: :return: """ association_it = self.associations( subjects=subjects, predicates=predicates, property_filter=property_filter, subject_closure_predicates=subject_closure_predicates, predicate_closure_predicates=predicate_closure_predicates, include_modified=include_modified, **kwargs, ) assoc_map = defaultdict(list) cached = {} if isinstance(self, OboGraphInterface): for association in association_it: if group_by == "object": grp = association.object if grp not in cached: grps = list(self.ancestors([grp], predicates=object_closure_predicates)) cached[grp] = grps else: grps = cached[grp] elif group_by == "subject": grps = [association.subject] else: raise ValueError(f"Unknown group_by: {group_by}") for grp in grps: assoc_map[grp].append(association) for k, v in assoc_map.items(): yield k, len(v)
[docs] def association_subject_counts( self, subjects: Iterable[CURIE] = None, predicates: Iterable[PRED_CURIE] = None, property_filter: Dict[PRED_CURIE, Any] = None, subject_closure_predicates: Optional[List[PRED_CURIE]] = None, predicate_closure_predicates: Optional[List[PRED_CURIE]] = None, object_closure_predicates: Optional[List[PRED_CURIE]] = None, include_modified: bool = False, **kwargs, ) -> Iterator[Tuple[CURIE, int]]: """ Yield objects together with the number of distinct associated subjects. Here objects are typically nodes from ontologies and subjects are annotated entities such as genes. >>> from oaklib import get_adapter >>> from oaklib.datamodels.vocabulary import IS_A, PART_OF >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml") >>> genes = ["PomBase:SPAC1142.02c", "PomBase:SPAC3H1.05", "PomBase:SPAC1142.06"] >>> preds = [IS_A, PART_OF] >>> for term, num in adapter.association_subject_counts(genes, object_closure_predicates=preds): ... print(term, num) <BLANKLINE> ... GO:0051668 3 ... This shows that GO:0051668 (localization within membrane) is used for all 3 input subjects. If subjects is empty, this is calculated for all subjects in the association set. :param subjects: constrain to these subjects (e.g. genes in a gene association) :param predicates: constrain to these predicates (e.g. involved-in for a gene to pathway association) :param property_filter: generic filter :param subject_closure_predicates: subjects is treated as descendant via these predicates :param predicate_closure_predicates: predicates is treated as descendant via these predicates :param object_closure_predicates: object is treated as descendant via these predicates :param include_modified: include modified associations :param kwargs: additional arguments :return: """ association_it = self.associations( subjects=subjects, predicates=predicates, property_filter=property_filter, subject_closure_predicates=subject_closure_predicates, predicate_closure_predicates=predicate_closure_predicates, include_modified=include_modified, **kwargs, ) object_to_subject_map = defaultdict(set) cached = {} if isinstance(self, OboGraphInterface): for association in association_it: subject = association.subject obj = association.object if obj not in cached: ancs = list(self.ancestors([obj], predicates=object_closure_predicates)) cached[obj] = ancs else: ancs = cached[obj] for anc in ancs: object_to_subject_map[anc].add(subject) for k, v in object_to_subject_map.items(): yield k, len(v)
[docs] def map_associations( self, subjects: Iterable[CURIE] = None, predicates: Iterable[PRED_CURIE] = None, objects: Iterable[CURIE] = None, subset: SUBSET_CURIE = None, subset_entities: Iterable[CURIE] = None, property_filter: Dict[PRED_CURIE, Any] = None, subject_closure_predicates: Optional[List[PRED_CURIE]] = None, predicate_closure_predicates: Optional[List[PRED_CURIE]] = None, object_closure_predicates: Optional[List[PRED_CURIE]] = None, include_modified: bool = False, ) -> Iterator[Association]: """ Maps matching associations to a subset (map2slim, rollup). :param subjects: constrain to these subjects :param predicates: constrain to these predicates (e.g. involved-in for a gene to pathway association) :param objects: constrain to these objects (e.g. terms) :param subset: subset to map to :param subset_entities: subset entities to map to :param property_filter: generic filter :param subject_closure_predicates: subjects is treated as descendant via these predicates :param predicate_closure_predicates: predicates is treated as descendant via these predicates :param object_closure_predicates: object is treated as descendant via these predicates :param include_modified: include modified associations :return: """ # Default implementation: this may be overridden for efficiency if subset is None and subset_entities is None: raise ValueError("Must pass ONE OF subset OR subset_entities") if subset is not None and subset_entities is not None: raise ValueError("Must pass ONE OF subset OR subset_entities, not both") if subset: subset_entities = list(self.subset_members(subset)) subset_entities = set(subset_entities) association_it = self.associations( subjects=subjects, predicates=predicates, property_filter=property_filter, subject_closure_predicates=subject_closure_predicates, predicate_closure_predicates=predicate_closure_predicates, include_modified=include_modified, ) if isinstance(self, OboGraphInterface): for association in association_it: obj = association.object ancs = list(self.ancestors([obj], predicates=object_closure_predicates)) for mapped_obj in subset_entities.intersection(ancs): association = Association( association.subject, association.predicate, mapped_obj, property_values=association.property_values, ) yield association
[docs] def normalize_associations( self, associations: Iterable[Association], normalizers: Optional[List[EntityNormalizer]] = None, ) -> Iterator[Association]: """ Normalize associations. :param associations: :param normalizers: :return: """ if not normalizers: return associations associations = list(associations) subject_ids = set([association.subject for association in associations]) object_ids = set([association.object for association in associations]) for normalizer in normalizers: if not normalizer.slots or "subject" in normalizer.slots: logging.info(f"Creating subject normalization map for {len(subject_ids)}") subject_nmap = normalizer.adapter.create_normalization_map( subject_ids, source_prefixes=normalizer.source_prefixes, target_prefixes=normalizer.target_prefixes, prefix_alias_map=normalizer.prefix_alias_map, ) logging.info(f"Created subject normalization => {len(subject_nmap)}") else: subject_nmap = {} if not normalizer.slots or "object" in normalizer.slots: logging.info(f"Creating object normalization map for {len(object_ids)}") object_nmap = normalizer.adapter.create_normalization_map( object_ids, source_prefixes=normalizer.source_prefixes, target_prefixes=normalizer.target_prefixes, ) else: object_nmap = {} if subject_nmap or object_nmap: logging.info( f"Normalizing associations using s: {len(subject_nmap)} o: {len(object_nmap)}" ) for association in associations: if association.subject in subject_nmap: association.original_subject = association.subject association.subject = subject_nmap[association.subject] if association.object in object_nmap: association.original_object = association.object association.object = object_nmap[association.object] logging.info(f"Yielding {len(associations)} associations") yield from associations
[docs] def normalize_association( self, association: Association, normalizers: Optional[List[EntityNormalizer]] = None, ) -> Association: """ Normalize identifiers in an association. :param association: :param normalizers: :return: """ if normalizers is None: return association for normalizer in normalizers: subject_prefix = association.subject.split(":")[0] object_prefix = association.object.split(":")[0] if subject_prefix in normalizer.source_prefixes: association.subject = normalizer.adapter.normalize( association.subject, target_prefixes=normalizer.target_prefixes, strict=normalizer.strict, ) if object_prefix in normalizer.source_prefixes: association.object = normalizer.adapter.normalize( association.object, target_prefixes=normalizer.target_prefixes, strict=normalizer.strict, ) return association