GO Annotation Analysis

Analyzes the associations between genes and GO terms in the Gene Ontology (GO) database.

In particular, categorizes associations based on:

  • whether they have been “retracted” (i.e., removed from the database)

  • whether they have been reviewed and accepted or rejected via IBA

[1]:
import pandas as pd

from tests.test_implementations.test_robot_template import adapter

# a recent release plus an older one that may have retracted entries

RELEASES = [
    "2024-11-03",
    "2024-06-10",
    "2020-01-01",
]

LATEST = RELEASES[0]
PREVIOUS = RELEASES[1:]
assert all(r < LATEST for r in PREVIOUS)
[2]:
NEW_CUTOFF = "2024-06-01"
[3]:
# taxa to analyze

TAXA = [
    ("human", "goa_human", 9606),
    ("Arabidopsis thaliana", "tair", 3702),
    ("yeast", "sgd", 559292),
]
[4]:
GAF_URL_TEMPLATE = "https://release.geneontology.org/{date}/annotations/{name}.gaf.gz"
[5]:
from oaklib.datamodels.vocabulary import IS_A, PART_OF

Create an OAK adapter for the GO ontology

[6]:
from oaklib import get_adapter

go = get_adapter("sqlite:obo:go")
[7]:
obsoletes = set(go.obsoletes())
[ ]:

[8]:
binding_terms = set(go.descendants("GO:0005488", predicates=[IS_A]))
[9]:
antislim_terms = set(go.subset_members("gocheck_do_not_annotate")).union(go.subset_members("gocheck_obsoletion_candidate"))
non_informative = binding_terms.union(antislim_terms)

[96]:
mf_terms = set(go.descendants("GO:0003674", predicates=[IS_A]))
bp_terms = set(go.descendants("GO:0008150", predicates=[IS_A]))
cc_terms = set(go.descendants("GO:0005575", predicates=[IS_A]))

Load annotations from the archive

[10]:
from oaklib.parsers import GafAssociationParser
gaf_parser = GafAssociationParser()
[11]:
import requests_cache

session = requests_cache.CachedSession(
    cache_name='gaf_cache',
    backend='sqlite',  # or 'memory' for in-memory cache
    expire_after=24*60*60,  # Cache expiration in seconds
    allowable_codes=[200],  # Only cache successful responses
)
[12]:
from oaklib.datamodels.association import ParserConfiguration, NegatedAssociation
import io
import gzip

def get_gaf(release, name):
    config = ParserConfiguration(preserve_negated_associations=True)
    url = GAF_URL_TEMPLATE.format(date=release, name=name)
    # open the URL as a file object using requests
    with session.get(url, stream=True) as response:
        # Decompress the gzipped content and create a text stream
        decompressed = gzip.decompress(response.content)
        text_stream = io.TextIOWrapper(io.BytesIO(decompressed))
        print(f"Reading {url} using {config}")
        return list(gaf_parser.parse(text_stream, configuration=config))
[ ]:

Load all annotations into a cache

[13]:
from collections import defaultdict

db = defaultdict(dict)
for r in RELEASES:
    for name, grp, tax_id in TAXA:
        print(f"Loading {r} {name}")
        assocs = get_gaf(r, grp)
        print(f"Loaded {len(assocs)} associations")
        neg_assocs = [x for x in assocs if x.negated]
        print(f"  {len(neg_assocs)} negated associations")
        db[grp][r] = assocs
Loading 2024-11-03 human
Reading https://release.geneontology.org/2024-11-03/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 782823 associations
  1494 negated associations
Loading 2024-11-03 Arabidopsis thaliana
Reading https://release.geneontology.org/2024-11-03/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 235371 associations
  1374 negated associations
Loading 2024-11-03 yeast
Reading https://release.geneontology.org/2024-11-03/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 120823 associations
  6 negated associations
Loading 2024-06-10 human
Reading https://release.geneontology.org/2024-06-10/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 707168 associations
  1308 negated associations
Loading 2024-06-10 Arabidopsis thaliana
Reading https://release.geneontology.org/2024-06-10/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 235504 associations
  1373 negated associations
Loading 2024-06-10 yeast
Reading https://release.geneontology.org/2024-06-10/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 117290 associations
  7 negated associations
Loading 2020-01-01 human
Reading https://release.geneontology.org/2020-01-01/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 495361 associations
  1244 negated associations
Loading 2020-01-01 Arabidopsis thaliana
Reading https://release.geneontology.org/2020-01-01/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 236821 associations
  1364 negated associations
Loading 2020-01-01 yeast
Reading https://release.geneontology.org/2020-01-01/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 120916 associations
  28 negated associations
[14]:
db["goa_human"][LATEST][0]
[14]:
Association(subject='UniProtKB:A0A024RBG1', predicate='enables', object='GO:0003723', property_values=[], subject_label='NUDT4B', predicate_label=None, object_label=None, negated=None, publications=['GO_REF:0000043'], evidence_type='IEA', supporting_objects=[], primary_knowledge_source='infores:UniProt', aggregator_knowledge_source=None, subject_closure=[], subject_closure_label=[], object_closure=[], object_closure_label=[], comments=[])
[15]:
len([x for x in db["goa_human"][LATEST] if x.negated])
[15]:
1494
[ ]:
# reload modules

[30]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[53]:
%autoreload 0

Diffs by terms

[ ]:

[32]:
from oaklib.utilities.associations.association_differ import AssociationDiffer


differ = AssociationDiffer(adapter=go)
[33]:
cache = {}
[73]:
len(db["goa_human"].keys())
[73]:
3
[74]:
list(db["goa_human"].keys())
[74]:
['2024-11-03', '2024-06-10', '2020-01-01']
[89]:
#ix = differ.changes_by_terms(db["goa_human"][LATEST], db["goa_human"][PREVIOUS[0]], min_num_entities_changes=10, cache={})
grp = "sgd"
ix = differ.changes_by_terms(db[grp][PREVIOUS[0]], db[grp][LATEST], min_num_entities_changes=2, cache={})

[90]:
len(ix)
[90]:
847
[91]:
for k in list(ix.keys())[0:5]:
    print(k, go.label(k))
GO:1904688 regulation of cytoplasmic translational initiation
GO:0170039 proteinogenic amino acid metabolic process
GO:0005980 glycogen catabolic process
GO:0045937 positive regulation of phosphate metabolic process
GO:0042762 regulation of sulfur metabolic process
[92]:
pubmed_adapter = get_adapter("pubmed:")
WARNING:eutils._internal.queryservice:No NCBI API key provided; throttling to 3 requests/second; see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
[93]:
from functools import lru_cache


@lru_cache
def pub_date(pmid):
    m = pubmed_adapter.entity_metadata_map(pmid)
    if m:
        return m.get("year")

[110]:
from typing import Optional
from oaklib.utilities.associations.association_differ import TermComparison


def score_term_comparison(term: str, ix: TermComparison, max_date_inclusive=None, max_genes=20) -> Optional[dict]:
    """
    We are interested in genes for which all evidence was from previously known.

    :param term:
    :param ix:
    :return:
    """
    #all_genes = set(ix.old_associations_by_entity.keys()).union(ix.new_associations_by_entity.keys())
    #gene_diff = all_genes - set(ix.old_associations_by_entity.keys()).intersection(ix.new_associations_by_entity.keys())
    if len(ix.new_associations_by_entity) > max_genes:
        return None
    new_genes = set(ix.new_associations_by_entity.keys()) - set(ix.old_associations_by_entity.keys())
    gene_id_to_label_map = {}
    for g, assocs in ix.new_associations_by_entity.items():
        gene_id_to_label_map[g] = assocs[0].subject_label
    for g, assocs in ix.old_associations_by_entity.items():
        gene_id_to_label_map[g] = assocs[0].subject_label
    filtered_new_genes = {}
    for gene in new_genes:
        all_before_cutoff = True
        for a in ix.new_associations_by_entity[gene]:
            pmids = [x for x in a.publications if x.startswith("PMID")]
            if max_date_inclusive is not None:
                # print(f"{term} {gene}, pmids={pmids}")
                if not pmids:
                    all_before_cutoff = False
                else:
                    pmid = pmids[0]
                    date = pub_date(pmid)
                    if date is None or date > max_date_inclusive:
                        all_before_cutoff = False
        if all_before_cutoff:
            filtered_new_genes[gene] = ix.new_associations_by_entity[gene]
    #gene_id_diff = len(gene_diff)
    term_lbl = go.label(term)
    if term in mf_terms:
        ann_pred = "that are capable of"
    elif term in bp_terms:
        ann_pred = "involved in"
    elif term in cc_terms:
        ann_pred = "localized to"
    else:
        return
    def as_genes_list(amap):
        return [gene_id_to_label_map[g] for g in amap]
    def as_str_list(amap):
        return [str(g) for g in amap]
    case = {
        "input": f"List all genes {ann_pred} {term_lbl}",
        "ideal": "; ".join(as_genes_list(ix.new_associations_by_entity)),
        "original_input": {
            "term": str(term),
            "genes_current": as_str_list(ix.new_associations_by_entity),
            "genes_previous": as_str_list(ix.old_associations_by_entity),
            "genes_added": as_genes_list(new_genes),
            "genes_added_prior_to_cutoff": as_genes_list(filtered_new_genes),
            "num_genes_added_prior_to_cutoff": len(filtered_new_genes),
            "date_cutoff": max_date_inclusive,
        }
    }
    return case
[111]:
import yaml

n = 0
for k in list(ix.keys()):
    lbl = go.label(k)
    if "regulation" in lbl:
        continue
    if "response to" in lbl:
        continue
    case = score_term_comparison(k, ix[k], max_date_inclusive="2022")
    if not case:
        continue
    if case["original_input"]["num_genes_added_prior_to_cutoff"] > 2:
        print(yaml.dump(case, sort_keys=False))
        n += 1
    if n > 40:
        break
input: List all genes localized to respiratory chain complex IV
ideal: COX6; COX12; COX5B; COX9; COX5A; COX4; COX8; MTC3; COX13; COX7; COX1; COX2;
  COX3; COX26; AI4; AI5_ALPHA; AI3
original_input:
  term: GO:0045277
  genes_current:
  - SGD:S000001093
  - SGD:S000004028
  - SGD:S000001373
  - SGD:S000002225
  - SGD:S000004997
  - SGD:S000003155
  - SGD:S000004387
  - SGD:S000003195
  - SGD:S000003159
  - SGD:S000004869
  - SGD:S000007260
  - SGD:S000007281
  - SGD:S000007283
  - SGD:S000113555
  - SGD:S000007264
  - SGD:S000007265
  - SGD:S000007263
  genes_previous:
  - SGD:S000004387
  - SGD:S000004028
  - SGD:S000004857
  - SGD:S000007260
  genes_added:
  - COX26
  - COX13
  - AI4
  - COX4
  - COX9
  - AI3
  - MTC3
  - COX6
  - COX3
  - AI5_ALPHA
  - COX5A
  - COX5B
  - COX7
  - COX2
  genes_added_prior_to_cutoff:
  - COX26
  - COX9
  - COX3
  - COX7
  - COX2
  num_genes_added_prior_to_cutoff: 5
  date_cutoff: '2022'

input: List all genes localized to TTT Hsp90 cochaperone complex
ideal: TTI2; RVB2; TRA1; TTI1; RVB1; ASA1; TEL2
original_input:
  term: GO:0110078
  genes_current:
  - SGD:S000003897
  - SGD:S000006156
  - SGD:S000001141
  - SGD:S000001516
  - SGD:S000002598
  - SGD:S000006289
  - SGD:S000003331
  genes_previous:
  - SGD:S000003897
  genes_added:
  - RVB1
  - TEL2
  - TTI1
  - ASA1
  - RVB2
  - TRA1
  genes_added_prior_to_cutoff:
  - RVB1
  - TEL2
  - TTI1
  - ASA1
  - RVB2
  - TRA1
  num_genes_added_prior_to_cutoff: 6
  date_cutoff: '2022'

input: List all genes localized to respiratory chain complex III
ideal: COR1; RIP1; QCR6; QCR8; QCR7; QCR9; QCR10; CYT1; QCR2; COB
original_input:
  term: GO:0045275
  genes_current:
  - SGD:S000000141
  - SGD:S000000750
  - SGD:S000001929
  - SGD:S000003702
  - SGD:S000002937
  - SGD:S000003415
  - SGD:S000003529
  - SGD:S000005591
  - SGD:S000006395
  - SGD:S000007270
  genes_previous:
  - SGD:S000007270
  genes_added:
  - QCR10
  - COR1
  - CYT1
  - QCR2
  - QCR9
  - QCR8
  - QCR6
  - QCR7
  - RIP1
  genes_added_prior_to_cutoff:
  - QCR10
  - COR1
  - QCR2
  num_genes_added_prior_to_cutoff: 3
  date_cutoff: '2022'

input: List all genes that are capable of alpha-1,4-glucosidase activity
ideal: MAL62; MAL42; MAL22; MAL32; GTB1; MAL12; IMA1; IMA2; IMA3; IMA4; IMA5
original_input:
  term: GO:0004558
  genes_current:
  - SGD:S000029690
  - SGD:S000029687
  - SGD:S000029682
  - SGD:S000000503
  - SGD:S000002629
  - SGD:S000003524
  - SGD:S000003519
  - SGD:S000005517
  - SGD:S000001434
  - SGD:S000003757
  - SGD:S000003752
  genes_previous:
  - SGD:S000002629
  genes_added:
  - MAL42
  - IMA1
  - IMA3
  - IMA4
  - IMA5
  - IMA2
  - MAL22
  - MAL32
  - MAL62
  - MAL12
  genes_added_prior_to_cutoff:
  - MAL42
  - MAL22
  - MAL62
  num_genes_added_prior_to_cutoff: 3
  date_cutoff: '2022'

input: List all genes that are capable of G-quadruplex DNA binding
ideal: RAP1; MGS1; SUB1; DNA2; NSR1; VID22; MSS116; XRS2; SLX9; PIF1; MRE11; DBP2;
  RAD50; DED1; DBP1; RRM3
original_input:
  term: GO:0051880
  genes_current:
  - SGD:S000005160
  - SGD:S000005162
  - SGD:S000004642
  - SGD:S000001207
  - SGD:S000003391
  - SGD:S000004365
  - SGD:S000002602
  - SGD:S000002777
  - SGD:S000003313
  - SGD:S000004526
  - SGD:S000004837
  - SGD:S000005056
  - SGD:S000005194
  - SGD:S000005730
  - SGD:S000006040
  - SGD:S000001073
  genes_previous:
  - SGD:S000004526
  - SGD:S000005160
  - SGD:S000005194
  - SGD:S000004837
  - SGD:S000002777
  - SGD:S000005162
  - SGD:S000004642
  - SGD:S000001207
  - SGD:S000003391
  - SGD:S000003313
  - SGD:S000001073
  - SGD:S000004365
  genes_added:
  - DBP2
  - MSS116
  - DED1
  - DBP1
  genes_added_prior_to_cutoff:
  - DBP2
  - MSS116
  - DED1
  - DBP1
  num_genes_added_prior_to_cutoff: 4
  date_cutoff: '2022'

input: List all genes that are capable of alpha-glucosidase activity
ideal: MAL62; MAL42; MAL22; ROT2; MAL32; SGA1; IMA4; IMA3; GTB1; CWH41; MAL12; IMA1;
  IMA5; GDB1; IMA2; YMR196W; STA1; CPX-417; SUC2
original_input:
  term: GO:0090599
  genes_current:
  - SGD:S000029690
  - SGD:S000029687
  - SGD:S000029682
  - SGD:S000000433
  - SGD:S000000503
  - SGD:S000001361
  - SGD:S000003757
  - SGD:S000001434
  - SGD:S000002629
  - SGD:S000002995
  - SGD:S000003524
  - SGD:S000003519
  - SGD:S000003752
  - SGD:S000006388
  - SGD:S000005517
  - SGD:S000004809
  - SGD:S000029522
  - SGD:S000217621
  - SGD:S000001424
  genes_previous:
  - SGD:S000001361
  - SGD:S000001434
  - SGD:S000003757
  - SGD:S000000433
  - SGD:S000000503
  - SGD:S000003752
  - SGD:S000002995
  - SGD:S000003519
  - SGD:S000003524
  - SGD:S000002629
  - SGD:S000006388
  - SGD:S000005517
  - SGD:S000004809
  - SGD:S000029522
  - SGD:S000217621
  - SGD:S000001424
  genes_added:
  - MAL42
  - MAL22
  - MAL62
  genes_added_prior_to_cutoff:
  - MAL42
  - MAL22
  - MAL62
  num_genes_added_prior_to_cutoff: 3
  date_cutoff: '2022'

[ ]:

[ ]:

[ ]:

[ ]:

OLD ANALYSIS BELOW

[ ]:

[289]:
from functools import lru_cache



@lru_cache
def lineage(t: str):
    up = set(go.ancestors(t, predicates=[IS_A, PART_OF], reflexive=True))
    dn = set(go.descendants(t, predicates=[IS_A, PART_OF]))
    return up.union(dn)

#len(lineage("GO:0005737"))

@lru_cache
def ancs(t: str):
    return set(go.ancestors(t, predicates=[IS_A, PART_OF], reflexive=True))
[290]:
import pandas as pd

def pmid(a):
    pubs = [p for p in a.publications if p.startswith("PMID")]
    if pubs:
        if len(pubs) > 1:
            raise ValueError(f"Multiple PMIDs: {pubs}")
        return pubs[0]
    return None

@lru_cache
def lbl(t: str):
    return go.label(t)

def assocs_to_df(assocs: list, release: str):

    df = pd.DataFrame([{
        "subject": a.subject,
        "subject_label": a.subject_label,
        "predicate": a.predicate,
        "object": a.object,
        "object_label": lbl(a.object),
        "object_obsoletes": a.object in obsoletes,
        "object_uninformative": a.object in non_informative,
        "object_closure": ancs(a.object),
        "object_closure_redundant": ancs(a.object) - {a.object},
        "evidence": a.evidence_type,
        "is_iba": a.evidence_type == "IBA",
        "negated": a.negated,
        "pmid": pmid(a),
        "pubs": a.publications,
        "release": release,
    } for a in assocs])
    return df
[291]:
human_df = assocs_to_df(db["goa_human"][LATEST], LATEST)
tair_df = assocs_to_df(db["tair"][LATEST], LATEST)
sgd_df = assocs_to_df(db["sgd"][LATEST], LATEST)
df = sgd_df
[292]:
prev_df = assocs_to_df(db["sgd"][PREVIOUS[-1]], PREVIOUS[-1])
[293]:
df
[293]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence is_iba negated pmid pubs release
0 SGD:S000003381 GPC1 acts_upstream_of_or_within GO:0090640 phosphatidylcholine biosynthesis from sn-glyce... False False {GO:0019637, GO:0008152, BFO:0000015, GO:00066... {GO:0019637, GO:0008152, GO:0006796, BFO:00000... IGI False None PMID:30514764 [PMID:30514764] 2024-11-03
1 SGD:S000005701 ALE1 acts_upstream_of_or_within GO:0090640 phosphatidylcholine biosynthesis from sn-glyce... False False {GO:0019637, GO:0008152, BFO:0000015, GO:00066... {GO:0019637, GO:0008152, GO:0006796, BFO:00000... IGI False None PMID:30514764 [PMID:30514764] 2024-11-03
2 SGD:S000003381 GPC1 acts_upstream_of_or_within GO:0036151 phosphatidylcholine acyl-chain remodeling False False {GO:0019637, GO:0008152, GO:0006796, BFO:00000... {GO:0019637, GO:0008152, GO:0006796, BFO:00000... IMP False None PMID:30514764 [PMID:30514764] 2024-11-03
3 SGD:S000004492 RCF1 acts_upstream_of_or_within GO:0033617 mitochondrial cytochrome c oxidase assembly False False {GO:0043933, GO:0044085, GO:0065003, GO:001604... {GO:0043933, GO:0044085, GO:0022607, GO:007184... IMP False None PMID:29746825 [PMID:29746825] 2024-11-03
4 SGD:S000004977 SIW14 enables GO:0052845 inositol-5-diphosphate-1,2,3,4,6-pentakisphosp... False False {GO:0016817, GO:0016818, GO:0052842, GO:000382... {GO:0016817, GO:0016818, GO:0052842, GO:000382... IDA False None PMID:26828065 [PMID:26828065] 2024-11-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
120818 SGD:S000003241 SEC9 involved_in GO:0006906 vesicle fusion False False {GO:0048284, GO:0051234, GO:0090174, GO:001604... {GO:0061024, BFO:0000015, GO:0009987, GO:00160... IBA True None None [GO_REF:0000033] 2024-11-03
120819 SGD:S000004826 CEF1 part_of GO:0000974 Prp19 complex False False {GO:0000974, BFO:0000004, BFO:0000040, BFO:000... {BFO:0000004, BFO:0000040, BFO:0000002, GO:003... IBA True None None [GO_REF:0000033] 2024-11-03
120820 SGD:S000002551 MKC7 involved_in GO:0031505 fungal-type cell wall organization False False {GO:0071554, GO:0016043, GO:0045229, GO:003150... {GO:0071554, GO:0016043, GO:0045229, GO:000998... IBA True None None [GO_REF:0000033] 2024-11-03
120821 SGD:S000003008 HEM2 is_active_in GO:0005829 cytosol False False {CARO:0030000, UBERON:0000061, CARO:0000003, G... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IBA True None None [GO_REF:0000033] 2024-11-03
120822 SGD:S000001122 LAM4 involved_in GO:0032366 intracellular sterol transport False False {GO:0006869, GO:0015850, GO:0032365, GO:005123... {GO:0032365, GO:0015918, GO:0051649, GO:000998... IBA True None None [GO_REF:0000033] 2024-11-03

120823 rows × 15 columns

[294]:
from typing import Dict, List, Optional


def repair_assocs_df(assocs: pd.DataFrame):
    """
    Ensures that IDs are normalized.

    :param assocs:
    :return:
    """
    # may not be 1:1
    subject_label_to_ids: Dict[str, List[str]]
    subject_label_to_ids = assocs.groupby("subject_label")["subject"].aggregate(lambda x: list(set(x))).to_dict()
    labels_with_multiple_ids = {k: v for k, v in subject_label_to_ids.items() if len(v) > 1}
    if labels_with_multiple_ids:
        print(f"Multiple IDs for {len(labels_with_multiple_ids)} labels")
        print(list(labels_with_multiple_ids.items())[:5])
    labels_to_canonical = {k: sorted(list(v))[0] for k, v in subject_label_to_ids.items()}
    assocs['subject'] = assocs['subject_label'].map(labels_to_canonical)
[295]:
test_df = tair_df.copy()
repair_assocs_df(test_df)
test_df
Multiple IDs for 5258 labels
[('4CL1', ['TAIR:locus:2017602', 'AGI_LocusCode:AT1G51680']), ('4CL2', ['TAIR:locus:2094716', 'AGI_LocusCode:AT3G21240']), ('4CL3', ['TAIR:locus:2015003', 'AGI_LocusCode:AT1G65060']), ('AAC1', ['TAIR:locus:2077778', 'AGI_LocusCode:AT3G08580']), ('AAC2', ['AGI_LocusCode:AT5G13490', 'TAIR:locus:2185041'])]
[295]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence is_iba negated pmid pubs release
0 AGI_LocusCode:AT1G11880 AT1G11880 enables GO:0000009 alpha-1,6-mannosyltransferase activity False False {GO:0000030, GO:0003824, GO:0016740, BFO:00000... {GO:0000030, GO:0003824, GO:0016740, BFO:00000... IEA False None None [TAIR:AnalysisReference:501756966] 2024-11-03
1 AGI_LocusCode:AT1G80420 ATXRCC1 involved_in GO:0000012 single strand break repair False False {GO:0043170, GO:0033554, GO:0008152, GO:000613... {GO:0043170, GO:0008152, BFO:0000015, GO:00099... IEA False None None [TAIR:AnalysisReference:501756966] 2024-11-03
2 AGI_LocusCode:AT1G74030 ENO1 part_of GO:0000015 phosphopyruvate hydratase complex False False {GO:0005829, GO:0110165, BFO:0000002, GO:00329... {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... IEA False None None [TAIR:AnalysisReference:501756966] 2024-11-03
3 AGI_LocusCode:AT2G29560 ENOC part_of GO:0000015 phosphopyruvate hydratase complex False False {GO:0005829, GO:0110165, BFO:0000002, GO:00329... {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... IEA False None None [TAIR:AnalysisReference:501756966] 2024-11-03
4 AGI_LocusCode:AT2G36530 LOS2 part_of GO:0000015 phosphopyruvate hydratase complex False False {GO:0005829, GO:0110165, BFO:0000002, GO:00329... {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... IEA False None None [TAIR:AnalysisReference:501756966] 2024-11-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
235366 TAIR:locus:2058630 At2g23210 enables GO:0010294 abscisic acid glucosyltransferase activity False False {GO:0035251, GO:0003824, GO:0016740, BFO:00000... {GO:0035251, GO:0003824, GO:0016740, BFO:00000... IBA True None None [GO_REF:0000033] 2024-11-03
235367 AGI_LocusCode:AT2G15820 OTP51 involved_in GO:0045292 mRNA cis splicing, via spliceosome False False {GO:0006397, GO:0008152, GO:0000375, BFO:00000... {GO:0009059, GO:0043170, GO:0006397, GO:000815... IBA True None None [GO_REF:0000033] 2024-11-03
235368 TAIR:locus:2143196 At5g15750 involved_in GO:0042274 ribosomal small subunit biogenesis False False {GO:0042274, GO:0044085, GO:0009987, BFO:00000... {GO:0044085, GO:0009987, BFO:0000015, GO:00226... IBA True None None [GO_REF:0000033] 2024-11-03
235369 AGI_LocusCode:AT4G14730 LFG1 is_active_in GO:0016020 membrane False False {CARO:0030000, UBERON:0000061, CARO:0000003, G... {CARO:0030000, UBERON:0000061, CARO:0000003, G... IBA True None None [GO_REF:0000033] 2024-11-03
235370 TAIR:locus:2116525 SD25 enables GO:0004672 protein kinase activity False False {GO:0140096, GO:0003824, GO:0016740, BFO:00000... {GO:0140096, GO:0003824, GO:0016740, BFO:00000... IBA True None None [GO_REF:0000033] 2024-11-03

235371 rows × 15 columns

[296]:
test_df[test_df['subject_label'] == "GALT6"]
[296]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence is_iba negated pmid pubs release
56871 AGI_LocusCode:AT5G62620 GALT6 located_in GO:0005794 Golgi apparatus False False {GO:0110165, GO:0043231, BFO:0000002, GO:00057... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... ISM False None None [TAIR:AnalysisReference:501780126] 2024-11-03
56872 AGI_LocusCode:AT5G62620 GALT6 located_in GO:0005794 Golgi apparatus False False {GO:0110165, GO:0043231, BFO:0000002, GO:00057... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... ISM False None None [TAIR:AnalysisReference:501780126] 2024-11-03
61144 AGI_LocusCode:AT5G62620 GALT6 involved_in GO:0006486 protein glycosylation False False {GO:0008152, BFO:0000015, GO:0009100, GO:00434... {GO:0009059, GO:0043170, GO:0070085, GO:000815... IEA False None None [TAIR:AnalysisReference:501757242] 2024-11-03
85487 AGI_LocusCode:AT5G62620 GALT6 enables GO:0030246 carbohydrate binding False True {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IEA False None None [TAIR:AnalysisReference:501756966] 2024-11-03
165846 AGI_LocusCode:AT5G62620 GALT6 located_in GO:0005794 Golgi apparatus False False {GO:0110165, GO:0043231, BFO:0000002, GO:00057... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA False None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] 2024-11-03
165851 AGI_LocusCode:AT5G62620 GALT6 involved_in GO:0010405 arabinogalactan protein metabolic process False False {GO:0043170, GO:0044036, GO:0010384, GO:007155... {GO:0043170, GO:0044036, GO:0071554, GO:000815... IMP False None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] 2024-11-03
165856 AGI_LocusCode:AT5G62620 GALT6 involved_in GO:0018258 protein O-linked glycosylation via hydroxyproline False False {GO:0006493, GO:0008152, BFO:0000015, GO:00091... {GO:0009059, GO:0006493, GO:0043170, GO:007008... IDA False None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] 2024-11-03
165858 AGI_LocusCode:AT5G62620 GALT6 acts_upstream_of_or_within GO:0048354 mucilage biosynthetic process involved in seed... False False {GO:0032501, GO:0008152, GO:0048359, BFO:00000... {GO:0032501, GO:0010192, GO:0008152, GO:004835... IMP False None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] 2024-11-03
165863 AGI_LocusCode:AT5G62620 GALT6 acts_upstream_of_or_within GO:1900056 negative regulation of leaf senescence False False {GO:0065007, BFO:0000015, GO:1900055, GO:00485... {GO:0065007, BFO:0000015, GO:1900055, GO:00485... IMP False None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] 2024-11-03
165869 AGI_LocusCode:AT5G62620 GALT6 enables GO:1990714 hydroxyproline O-galactosyltransferase activity False False {GO:0003824, GO:0016740, BFO:0000015, GO:00083... {GO:0003824, GO:0016740, BFO:0000015, GO:00083... IDA False None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] 2024-11-03
210076 AGI_LocusCode:AT5G62620 GALT6 enables GO:1990714 hydroxyproline O-galactosyltransferase activity False False {GO:0003824, GO:0016740, BFO:0000015, GO:00083... {GO:0003824, GO:0016740, BFO:0000015, GO:00083... IBA True None None [GO_REF:0000033] 2024-11-03
217996 AGI_LocusCode:AT5G62620 GALT6 is_active_in GO:0000139 Golgi membrane False False {GO:0110165, GO:0043231, BFO:0000002, GO:00057... {CARO:0030000, GO:0005794, CARO:0000000, BFO:0... IBA True None None [GO_REF:0000033] 2024-11-03
[297]:

def create_gene_df(df: pd.DataFrame): """ creates a new dataframe, grouped by gene (subject) :param df: :return: """ gene_df = df.groupby("subject").agg({ "object": "count", "is_iba": "sum", "negated": "sum", # for object, take the union of all distinct values "object": lambda x: set(x), # for object closure, take the union of all sets "object_closure": lambda x: set.union(*x), "object_closure_redundant": lambda x: set.union(*x), "pmid": lambda x: set(x), #"pubs": lambda x: set.union(set(x)), "release": "first", }).reset_index() # the redundant closure is the set difference of object_closure and object gene_df = gene_df.rename(columns={ "object": "terms", "is_iba": "n_iba", "negated": "n_negated", "object_closure": "closure", "pmid": "n_pmid", }) return gene_df
[298]:
gene_df = create_gene_df(df)
gene_df
[298]:
subject terms n_iba n_negated closure object_closure_redundant n_pmid release
0 SGD:S000000001 {GO:0071168, GO:0005739, GO:0008301, GO:000367... 3 0 {GO:0005739, GO:0000182, GO:0008152, BFO:00000... {GO:0000182, GO:0008152, BFO:0000015, GO:00010... {PMID:2404611, PMID:18708580, PMID:2649882, PM... 2024-11-03
1 SGD:S000000002 {GO:0015031, GO:0099023, GO:0005768, GO:000662... 4 0 {GO:0032509, GO:0071985, GO:0046872, BFO:00000... {GO:0032509, GO:0071985, GO:0046872, BFO:00000... {PMID:19828734, PMID:30358795, PMID:20173035, ... 2024-11-03
2 SGD:S000000003 {GO:0005085, GO:0005737, GO:0032232, GO:000582... 3 0 {GO:0005085, GO:0030234, GO:0008152, BFO:00000... {GO:0030234, GO:0008152, BFO:0000015, GO:00900... {PMID:19545407, PMID:10409717, PMID:17925388, ... 2024-11-03
3 SGD:S000000004 {GO:0005829, GO:0072671, GO:0005576, GO:003460... 9 0 {GO:1901363, GO:0003723, GO:0008152, BFO:00000... {GO:1901363, GO:0003723, GO:0008152, BFO:00000... {PMID:18706386, PMID:9789005, PMID:26928762, P... 2024-11-03
4 SGD:S000000005 {GO:0015031, GO:0006621, GO:0005789, GO:000688... 8 0 {BFO:0000015, GO:0030135, GO:0070972, GO:01101... {BFO:0000015, GO:0030135, GO:0070972, GO:01101... {PMID:26928762, PMID:10359606, None, PMID:1115... 2024-11-03
... ... ... ... ... ... ... ... ...
6906 SGD:S000350095 {GO:0008150, GO:0003674, GO:0005575} 0 0 {BFO:0000003, BFO:0000015, BFO:0000004, GO:000... {BFO:0000004, BFO:0000040, BFO:0000003, BFO:00... {None} 2024-11-03
6907 SGD:S000350096 {GO:0008150, GO:0003674, GO:0005575} 0 0 {BFO:0000003, BFO:0000015, BFO:0000004, GO:000... {BFO:0000004, BFO:0000040, BFO:0000003, BFO:00... {None} 2024-11-03
6908 SGD:S000350097 {GO:0005575, GO:0003674, GO:0008150} 0 0 {BFO:0000004, BFO:0000002, GO:0003674, GO:0008... {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... {None} 2024-11-03
6909 SGD:S000350098 {GO:0005575, GO:0003674, GO:0008150} 0 0 {BFO:0000004, BFO:0000002, GO:0003674, GO:0008... {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... {None} 2024-11-03
6910 SGD:S000350099 {GO:0003674, GO:0005575, GO:0008150} 0 0 {BFO:0000040, BFO:0000015, BFO:0000004, GO:000... {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... {None} 2024-11-03

6911 rows × 8 columns

[299]:
def set_redundant_flag(assocs: pd.DataFrame, gene_df: pd.DataFrame):
    """
    Sets the redundant flag for each association, if the object is in object_closure_redundant for that gene.

    :param assocs:
    :param gene_df:
    :return:
    """
    # Create a mapping of subject to object_closure_redundant
    redundant_map = gene_df.set_index('subject')['object_closure_redundant'].to_dict()

    # Vectorized check for each row
    def check_redundant(row):
        if row['is_iba']:
            return False
        closure_set = redundant_map.get(row['subject'], set())
        is_redundant = row['object'] in closure_set
        return is_redundant

    # Apply the check to all rows at once
    assocs['redundant'] = assocs.apply(check_redundant, axis=1)
[300]:
df["redundant"] = False
[301]:
set_redundant_flag(df, gene_df)
[302]:
df[df['redundant']]
[302]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence is_iba negated pmid pubs release redundant
16 SGD:S000004539 FPR3 located_in GO:0005634 nucleus False False {GO:0110165, GO:0043231, BFO:0000002, GO:00432... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA False None PMID:26359986 [PMID:26359986] 2024-11-03 True
28 SGD:S000002699 HRQ1 enables GO:0043138 3'-5' DNA helicase activity False False {BFO:0000015, GO:0008150, GO:0016043, GO:00431... {GO:0140097, GO:0032508, GO:0003824, BFO:00000... IDA False None PMID:28385527 [PMID:28385527] 2024-11-03 True
33 SGD:S000002699 HRQ1 enables GO:0043138 3'-5' DNA helicase activity False False {BFO:0000015, GO:0008150, GO:0016043, GO:00431... {GO:0140097, GO:0032508, GO:0003824, BFO:00000... IDA False None PMID:24440721 [PMID:24440721] 2024-11-03 True
39 SGD:S000003245 SNU71 located_in GO:0005634 nucleus False False {GO:0110165, GO:0043231, BFO:0000002, GO:00432... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False None None [GO_REF:0000043] 2024-11-03 True
42 SGD:S000001443 DJP1 acts_upstream_of_or_within GO:0006626 protein targeting to mitochondrion False False {GO:0070585, GO:0051234, GO:0006605, GO:007072... {GO:0070585, GO:0070727, GO:0033365, BFO:00000... IMP False None PMID:30213914 [PMID:30213914] 2024-11-03 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
105624 SGD:S000006483 RDN18-2 located_in GO:0005840 ribosome False False {GO:0110165, BFO:0000002, GO:0043229, CL:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False None PMID:30502926 [PMID:30502926] 2024-11-03 True
105625 SGD:S000006502 SNR42 located_in GO:0005730 nucleolus False False {GO:0043233, GO:0043231, GO:0110165, BFO:00000... {CARO:0030000, GO:0005634, CARO:0000000, BFO:0... IEA False None PMID:30502926 [PMID:30502926] 2024-11-03 True
105627 SGD:S000007300 SNR36 located_in GO:0005730 nucleolus False False {GO:0043233, GO:0043231, GO:0110165, BFO:00000... {CARO:0030000, GO:0005634, CARO:0000000, BFO:0... IEA False None PMID:30502926 [PMID:30502926] 2024-11-03 True
105628 SGD:S000006484 RDN25-1 located_in GO:0005840 ribosome False False {GO:0110165, BFO:0000002, GO:0043229, CL:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False None PMID:30502926 [PMID:30502926] 2024-11-03 True
105629 SGD:S000006485 RDN25-2 located_in GO:0005840 ribosome False False {GO:0110165, BFO:0000002, GO:0043229, CL:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False None PMID:30502926 [PMID:30502926] 2024-11-03 True

34441 rows × 16 columns

[303]:
def annotate_new_pubs(assocs: pd.DataFrame, prev_assocs: pd.DataFrame):
    """
    Annotates each association in assocs with fresh if its pmid is not in prev_assocs.

    :param assocs
    """
    prev_pmid_map = prev_assocs.set_index('pmid')['subject'].to_dict()

    # Vectorized check for each row
    def check_new(row):
        return row['pmid'] not in prev_pmid_map

    # Apply the check to all rows in prev_assocs
    assocs['pmid_new'] = assocs.apply(check_new, axis=1)

    # annotate tuple as unique if (subject, object) is not in prev
    prev_pair_assocs = set(zip(prev_assocs['subject'], prev_assocs['object']))
    assocs['is_new'] = ~assocs.apply(lambda x: (x['subject'], x['object']) in prev_pair_assocs, axis=1)

    assocs['fresh'] = assocs['pmid_new'] & assocs['is_new']
[304]:
def annotate_redacted_pubs(assocs: pd.DataFrame, prev_assocs: pd.DataFrame):
    """
    Annotates each association in prev_assocs with retracted if its pmid is not in assocs (latest).

    :param assocs
    """
    # Create a mapping of pmid to subject
    pmid_map = assocs.set_index('pmid')['subject'].to_dict()

    # Vectorized check for each row
    def check_removed(row):
        return row['pmid'] not in pmid_map

    # Apply the check to all rows in prev_assocs
    prev_assocs['pmid_removed'] = prev_assocs.apply(check_removed, axis=1)

    # annotate tuple as unique if (subject, object) is not in latest
    pair_assocs = set(zip(assocs['subject'], assocs['object']))
    prev_assocs['unique'] = ~prev_assocs.apply(lambda x: (x['subject'], x['object']) in pair_assocs, axis=1)

    prev_assocs['redacted'] = prev_assocs['pmid_removed'] & prev_assocs['unique']
[305]:
annotate_redacted_pubs(df, prev_df)

[306]:
#prev_df['redacted'] = prev_df['pmid_removed'] & prev_df['unique']
[ ]:

[307]:
#list(set(df['subject']))[:5]
[308]:
#iba_df = df[df['is_iba']]
#iba_subjects = set(iba_df['subject'])
#list(iba_subjects)[:5]
[309]:
def set_iba_status(assocs: pd.DataFrame):
    iba_df = assocs[assocs['is_iba']]
    iba_subjects = set(iba_df['subject'])
    # set the IBA rejected as associations for which
    # (a) the subject has an IBA association
    # (b) the object is not in the closure of the object in the IBA association
    # first we set the iba closure for each subject
    iba_closure = iba_df.groupby('subject')['object_closure'].aggregate(lambda x: set.union(*x)).to_dict()
    # now we set IBA rejected for each row
    def check_iba_rejected(row):
        if row['is_iba']:
            return False
        return row['subject'] in iba_subjects and row['object'] not in iba_closure.get(row['subject'], set())
        #if row['subject'].startswith("TAIR"):
        #    print(f"Checking {row['subject']} in {list(iba_subjects)[:5]}")
        #return row['subject'] in iba_subjects
        #return True

    assocs['iba_rejected'] = assocs.apply(check_iba_rejected, axis=1)

#set_iba_status(df)
[ ]:

[310]:
human_df
[310]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence is_iba negated pmid pubs release
0 UniProtKB:A0A024RBG1 NUDT4B enables GO:0003723 RNA binding False True {GO:0097159, GO:0003723, BFO:0000015, GO:00036... {GO:0097159, BFO:0000015, GO:0003674, GO:00036... IEA False None None [GO_REF:0000043] 2024-11-03
1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 protein binding False True {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI False None PMID:33961781 [PMID:33961781] 2024-11-03
2 UniProtKB:A0A024RBG1 NUDT4B enables GO:0046872 metal ion binding False True {GO:0043169, GO:0046872, BFO:0000015, GO:00360... {GO:0043169, BFO:0000015, GO:0036094, GO:00431... IEA False None None [GO_REF:0000043] 2024-11-03
3 UniProtKB:A0A024RBG1 NUDT4B located_in GO:0005829 cytosol False False {CARO:0030000, UBERON:0000061, CARO:0000003, G... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA False None None [GO_REF:0000052] 2024-11-03
4 UniProtKB:A0A075B6H5 TRBV20OR9-2 involved_in GO:0002376 immune system process False False {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... {BFO:0000015, GO:0008150, BFO:0000003} IEA False None None [GO_REF:0000043] 2024-11-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
782818 UniProtKB:Q9NZC2 TREM2 involved_in GO:0045088 regulation of innate immune response False False {GO:0065007, GO:0002682, GO:0050776, BFO:00000... {GO:0065007, GO:0002682, GO:0050776, BFO:00000... IBA True None None [GO_REF:0000033] 2024-11-03
782819 UniProtKB:Q9Y2K2 SIK3 enables GO:0050321 tau-protein kinase activity False False {GO:0140096, GO:0003824, GO:0004674, GO:001674... {GO:0140096, GO:0003824, GO:0004674, GO:001674... IBA True None None [GO_REF:0000033] 2024-11-03
782820 UniProtKB:P43235 CTSK involved_in GO:0051603 proteolysis involved in protein catabolic process False False {GO:0043170, GO:0006508, GO:0044238, GO:000905... {GO:0043170, GO:0006508, GO:0044238, GO:000905... IBA True None None [GO_REF:0000033] 2024-11-03
782821 UniProtKB:Q07343 PDE4B enables GO:0047555 3',5'-cyclic-GMP phosphodiesterase activity False False {GO:0047555, GO:0003824, GO:0008081, BFO:00000... {GO:0003824, GO:0008081, BFO:0000015, GO:00425... IBA True None None [GO_REF:0000033] 2024-11-03
782822 UniProtKB:A6NC42 DPPA5 involved_in GO:0010468 regulation of gene expression False False {GO:0065007, GO:0060255, GO:0009889, BFO:00000... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... IBA True None None [GO_REF:0000033] 2024-11-03

782823 rows × 15 columns

[311]:
set_iba_status(test_df)
[312]:
test_df['iba_rejected'].unique()
[312]:
array([False,  True])
[313]:
#set_iba_status(df)
[ ]:

[314]:
def synthesize(grp: str) -> pd.DataFrame:
    """
    Synthesizes the above steps
    """
    assocs = assocs_to_df(db[grp][LATEST], LATEST)
    prev_df_sets = []
    for prev in PREVIOUS:
        this_prev_assocs = assocs_to_df(db[grp][prev], prev)
        prev_df_sets.append(this_prev_assocs)
    annotate_new_pubs(assocs, prev_df_sets[0])
    #prev_assocs = pd.concat(prev_df_sets).drop_duplicates()
    prev_assocs = pd.concat(prev_df_sets)
    # prev_assocs = assocs_to_df(db[grp][PREVIOUS[0]], PREVIOUS[0])
    repair_assocs_df(assocs)
    repair_assocs_df(prev_assocs)
    # Create a gene dataframe (for latest only)
    gene_df = create_gene_df(assocs)
    annotate_redacted_pubs(assocs, prev_assocs)
    new_assocs = pd.concat([assocs, prev_assocs[prev_assocs['redacted']]])
    set_redundant_flag(new_assocs, gene_df)
    set_iba_status(new_assocs)
    return new_assocs
[315]:
new_human = synthesize("goa_human")
Multiple IDs for 64 labels
[('AKAP7', ['UniProtKB:O43687', 'UniProtKB:Q9P0M2']), ('ARHGEF18', ['UniProtKB:A0A590UK10', 'UniProtKB:Q6ZSZ5']), ('BBC3', ['UniProtKB:Q9BXH1', 'UniProtKB:Q96PG8']), ('CALCA', ['UniProtKB:P06881', 'UniProtKB:P01258']), ('CDKN2A', ['UniProtKB:Q8N726', 'UniProtKB:P42771'])]
Multiple IDs for 100 labels
[('AKAP7', ['UniProtKB:O43687', 'UniProtKB:Q9P0M2']), ('AMY1A', ['UniProtKB:P04745', 'UniProtKB:P0DUB6']), ('ARHGEF18', ['UniProtKB:A0A590UK10', 'UniProtKB:Q6ZSZ5']), ('ASIC5', ['UniProtKB:A0A0G2JLG4', 'UniProtKB:Q9NY37']), ('ATP6AP2', ['UniProtKB:O75787', 'UniProtKB:A0A1C7CYW4'])]
[316]:
new_human
[316]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence ... pubs release pmid_new is_new fresh pmid_removed unique redacted redundant iba_rejected
0 UniProtKB:A0A024RBG1 NUDT4B enables GO:0003723 RNA binding False True {GO:0097159, GO:0003723, BFO:0000015, GO:00036... {GO:0097159, BFO:0000015, GO:0003674, GO:00036... IEA ... [GO_REF:0000043] 2024-11-03 False False False NaN NaN NaN False True
1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 protein binding False True {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI ... [PMID:33961781] 2024-11-03 True True True NaN NaN NaN False True
2 UniProtKB:A0A024RBG1 NUDT4B enables GO:0046872 metal ion binding False True {GO:0043169, GO:0046872, BFO:0000015, GO:00360... {GO:0043169, BFO:0000015, GO:0036094, GO:00431... IEA ... [GO_REF:0000043] 2024-11-03 False False False NaN NaN NaN False True
3 UniProtKB:A0A024RBG1 NUDT4B located_in GO:0005829 cytosol False False {CARO:0030000, UBERON:0000061, CARO:0000003, G... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA ... [GO_REF:0000052] 2024-11-03 False False False NaN NaN NaN False True
4 UniProtKB:A0A075B6H5 TRBV20OR9-2 involved_in GO:0002376 immune system process False False {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... {BFO:0000015, GO:0008150, BFO:0000003} IEA ... [GO_REF:0000043] 2024-11-03 False True False NaN NaN NaN False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
434570 UniProtKB:Q9Y6A4 CFAP20 None GO:0007275 multicellular organism development False True {GO:0032501, BFO:0000015, GO:0048856, GO:00325... {GO:0032501, BFO:0000015, GO:0048856, GO:00325... TAS ... [PMID:8688464] 2020-01-01 NaN NaN NaN True True True False True
435058 UniProtKB:Q9Y6F1 PARP3 None GO:0006281 DNA repair False False {GO:0043170, GO:0033554, GO:0008152, GO:000613... {GO:0043170, GO:0008152, BFO:0000015, GO:00099... TAS ... [PMID:7260241] 2020-01-01 NaN NaN NaN True True True True False
436209 UniProtKB:Q9Y6Q9 NCOA3 None GO:0000981 DNA-binding transcription factor activity, RNA... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00192... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... [PMID:19274049] 2020-01-01 NaN NaN NaN True True True False True
436550 UniProtKB:Q9Y6X0 SETBP1 None GO:0000981 DNA-binding transcription factor activity, RNA... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00192... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... [PMID:19274049] 2020-01-01 NaN NaN NaN True True True False True
436732 UniProtKB:Q9Y6Y1 CAMTA1 None GO:0000981 DNA-binding transcription factor activity, RNA... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00192... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... [PMID:19274049] 2020-01-01 NaN NaN NaN True True True False True

783918 rows × 23 columns

[317]:
new_human[new_human['iba_rejected']]
[317]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence ... pubs release pmid_new is_new fresh pmid_removed unique redacted redundant iba_rejected
0 UniProtKB:A0A024RBG1 NUDT4B enables GO:0003723 RNA binding False True {GO:0097159, GO:0003723, BFO:0000015, GO:00036... {GO:0097159, BFO:0000015, GO:0003674, GO:00036... IEA ... [GO_REF:0000043] 2024-11-03 False False False NaN NaN NaN False True
1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 protein binding False True {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI ... [PMID:33961781] 2024-11-03 True True True NaN NaN NaN False True
2 UniProtKB:A0A024RBG1 NUDT4B enables GO:0046872 metal ion binding False True {GO:0043169, GO:0046872, BFO:0000015, GO:00360... {GO:0043169, BFO:0000015, GO:0036094, GO:00431... IEA ... [GO_REF:0000043] 2024-11-03 False False False NaN NaN NaN False True
3 UniProtKB:A0A024RBG1 NUDT4B located_in GO:0005829 cytosol False False {CARO:0030000, UBERON:0000061, CARO:0000003, G... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA ... [GO_REF:0000052] 2024-11-03 False False False NaN NaN NaN False True
4 UniProtKB:A0A075B6H5 TRBV20OR9-2 involved_in GO:0002376 immune system process False False {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... {BFO:0000015, GO:0008150, BFO:0000003} IEA ... [GO_REF:0000043] 2024-11-03 False True False NaN NaN NaN False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
433680 UniProtKB:Q9Y5Y6 ST14 None GO:0005887 None True False {GO:0005887} {} TAS ... [PMID:10831593] 2020-01-01 NaN NaN NaN True True True False True
434570 UniProtKB:Q9Y6A4 CFAP20 None GO:0007275 multicellular organism development False True {GO:0032501, BFO:0000015, GO:0048856, GO:00325... {GO:0032501, BFO:0000015, GO:0048856, GO:00325... TAS ... [PMID:8688464] 2020-01-01 NaN NaN NaN True True True False True
436209 UniProtKB:Q9Y6Q9 NCOA3 None GO:0000981 DNA-binding transcription factor activity, RNA... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00192... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... [PMID:19274049] 2020-01-01 NaN NaN NaN True True True False True
436550 UniProtKB:Q9Y6X0 SETBP1 None GO:0000981 DNA-binding transcription factor activity, RNA... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00192... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... [PMID:19274049] 2020-01-01 NaN NaN NaN True True True False True
436732 UniProtKB:Q9Y6Y1 CAMTA1 None GO:0000981 DNA-binding transcription factor activity, RNA... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00192... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... [PMID:19274049] 2020-01-01 NaN NaN NaN True True True False True

450057 rows × 23 columns

[318]:
prev_human_df = assocs_to_df(db["goa_human"][PREVIOUS[0]], PREVIOUS[0])
#annotate_new_pubs(human_df, prev_human_df)
[319]:
annotate_new_pubs(human_df, prev_human_df)
[320]:
human_df[human_df['fresh']]
[320]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence is_iba negated pmid pubs release pmid_new is_new fresh
1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 protein binding False True {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI False None PMID:33961781 [PMID:33961781] 2024-11-03 True True True
357 UniProtKB:A0A096LP55 UQCRHL located_in GO:0005739 mitochondrion False False {GO:0005739, GO:0110165, GO:0043231, BFO:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP False None PMID:34800366 [PMID:34800366] 2024-11-03 True True True
569 UniProtKB:A0A0B4J2F0 PIGBOS1 located_in GO:0005739 mitochondrion False False {GO:0005739, GO:0110165, GO:0043231, BFO:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP False None PMID:34800366 [PMID:34800366] 2024-11-03 True True True
1904 UniProtKB:A1A4Y4 IRGM enables GO:1901612 cardiolipin binding False True {GO:0008289, GO:0043168, GO:1901612, BFO:00000... {GO:0008289, GO:0043168, BFO:0000015, GO:00360... IDA False None PMID:21102437 [PMID:21102437] 2024-11-03 True True True
1945 UniProtKB:A1A4Y4 IRGM involved_in GO:0090141 positive regulation of mitochondrial fission False False {GO:0010821, GO:0051130, GO:0065007, GO:003304... {GO:0010821, GO:0051130, GO:0065007, GO:003304... IDA False None PMID:21102437 [PMID:21102437] 2024-11-03 True True True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
718365 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 enables GO:0141180 dsDNA-RNA triple helix-forming chromatin adapt... False True {GO:0030674, BFO:0000015, GO:0043565, GO:00036... {GO:0003690, GO:0030674, GO:0003677, GO:007184... IDA False None PMID:27634931 [PMID:27634931] 2024-11-03 True True True
718366 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0000122 negative regulation of transcription by RNA po... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00098... {GO:0045934, GO:0009892, GO:0065007, GO:000988... IMP False None PMID:27634931 [PMID:27634931] 2024-11-03 True True True
718367 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0000512 lncRNA-mediated post-transcriptional gene sile... False False {GO:0000512, BFO:0000015, GO:0016441, GO:00081... {GO:0065007, GO:0009892, GO:0031047, GO:000988... IDA False None PMID:30720199 [PMID:30720199] 2024-11-03 True True True
718368 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0000512 lncRNA-mediated post-transcriptional gene sile... False False {GO:0000512, BFO:0000015, GO:0016441, GO:00081... {GO:0065007, GO:0009892, GO:0031047, GO:000988... IMP False None PMID:33102210 [PMID:33102210] 2024-11-03 True True True
718370 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0090399 replicative senescence False False {GO:0008152, BFO:0000015, GO:0090399, GO:00099... {GO:0008152, BFO:0000015, GO:0009987, GO:00081... IMP False None PMID:27634931 [PMID:27634931] 2024-11-03 True True True

3380 rows × 18 columns

[321]:
#pair_assocs, pair_anns = annotate_assocs("tair")
[322]:
new_human.to_csv("output/go-human-assocs-annotated.csv", index=False)
[332]:
test_cases_df = new_human[(new_human['fresh'] == True) & (new_human['object_uninformative'] == False) & (new_human['negated'] != True) & (new_human['redundant'] != True) & (new_human['iba_rejected'] != True)]
test_cases_df
[332]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence ... pubs release pmid_new is_new fresh pmid_removed unique redacted redundant iba_rejected
4876 UniProtKB:A6NNL5 C15orf61 located_in GO:0005739 mitochondrion False False {GO:0005739, GO:0110165, GO:0043231, BFO:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... [PMID:34800366] 2024-11-03 True True True NaN NaN NaN False False
5853 UniProtKB:A8MSI8 LYRM9 located_in GO:0005739 mitochondrion False False {GO:0005739, GO:0110165, GO:0043231, BFO:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... [PMID:34800366] 2024-11-03 True True True NaN NaN NaN False False
6327 UniProtKB:A8MXV4 NUDT19 located_in GO:0005739 mitochondrion False False {GO:0005739, GO:0110165, GO:0043231, BFO:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... [PMID:34800366] 2024-11-03 True True True NaN NaN NaN False False
15942 UniProtKB:O14521 SDHD part_of GO:0045273 respiratory chain complex II (succinate dehydr... False False {GO:0098796, GO:0110165, GO:0045273, BFO:00000... {CARO:0030000, GO:0098803, GO:0098796, CARO:00... IDA ... [PMID:37098072] 2024-11-03 True True True NaN NaN NaN False False
34482 UniProtKB:O43325 LYRM1 located_in GO:0005739 mitochondrion False False {GO:0005739, GO:0110165, GO:0043231, BFO:00000... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... [PMID:34800366] 2024-11-03 True True True NaN NaN NaN False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
718362 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 acts_upstream_of GO:0008284 positive regulation of cell population prolife... False False {GO:0065007, GO:0042127, GO:0048518, BFO:00000... {GO:0065007, GO:0042127, GO:0048518, BFO:00000... IMP ... [PMID:33102210] 2024-11-03 True True True NaN NaN NaN False False
718366 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0000122 negative regulation of transcription by RNA po... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00098... {GO:0045934, GO:0009892, GO:0065007, GO:000988... IMP ... [PMID:27634931] 2024-11-03 True True True NaN NaN NaN False False
718367 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0000512 lncRNA-mediated post-transcriptional gene sile... False False {GO:0000512, BFO:0000015, GO:0016441, GO:00081... {GO:0065007, GO:0009892, GO:0031047, GO:000988... IDA ... [PMID:30720199] 2024-11-03 True True True NaN NaN NaN False False
718368 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0000512 lncRNA-mediated post-transcriptional gene sile... False False {GO:0000512, BFO:0000015, GO:0016441, GO:00081... {GO:0065007, GO:0009892, GO:0031047, GO:000988... IMP ... [PMID:33102210] 2024-11-03 True True True NaN NaN NaN False False
718370 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in GO:0090399 replicative senescence False False {GO:0008152, BFO:0000015, GO:0090399, GO:00099... {GO:0008152, BFO:0000015, GO:0009987, GO:00081... IMP ... [PMID:27634931] 2024-11-03 True True True NaN NaN NaN False False

524 rows × 23 columns

[350]:
def row_to_test_case(row, answer="YES"):
    gene = row['subject_label']
    term = row['object_label']
    predicate = row['predicate']
    if not gene or not term or not predicate:
        return

    return {
        "input": f"{gene} {predicate} {term}",
        "original_input": {
            "subject": str(row['subject']),
            "predicate": str(row['predicate']),
            "object": str(row['object']),
        },
        "ideal": answer,
    }


def df_to_test_cases(df: pd.DataFrame, limit=1000):
    cases = [row_to_test_case(row) for _, row in df.iterrows()]
    cases = [x for x in cases if x is not None]
    if limit:
        cases = cases[:limit]
    return cases

df_to_test_cases(test_cases_df, limit=5)
[350]:
[{'input': 'C15orf61 located_in mitochondrion',
  'original_input': {'subject': 'UniProtKB:A6NNL5',
   'predicate': 'located_in',
   'object': 'GO:0005739'},
  'ideal': 'YES'},
 {'input': 'LYRM9 located_in mitochondrion',
  'original_input': {'subject': 'UniProtKB:A8MSI8',
   'predicate': 'located_in',
   'object': 'GO:0005739'},
  'ideal': 'YES'},
 {'input': 'NUDT19 located_in mitochondrion',
  'original_input': {'subject': 'UniProtKB:A8MXV4',
   'predicate': 'located_in',
   'object': 'GO:0005739'},
  'ideal': 'YES'},
 {'input': 'SDHD part_of respiratory chain complex II (succinate dehydrogenase)',
  'original_input': {'subject': 'UniProtKB:O14521',
   'predicate': 'part_of',
   'object': 'GO:0045273'},
  'ideal': 'YES'},
 {'input': 'LYRM1 located_in mitochondrion',
  'original_input': {'subject': 'UniProtKB:O43325',
   'predicate': 'located_in',
   'object': 'GO:0005739'},
  'ideal': 'YES'}]
[341]:
#cases = [row_to_test_case(row) for _, row in test_cases_df.iterrows()]

[343]:
import yaml

with open("output/test-cases.yaml", "w") as f:
    yaml.dump({"cases": cases}, f, sort_keys=False)
[345]:
redacted_df = new_human[(new_human['redacted'] == True) & (new_human['object_uninformative'] == False) & (new_human['negated'] != True) & (new_human['redundant'] != True) & (new_human['iba_rejected'] != True)]
redacted_df
[345]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence ... pubs release pmid_new is_new fresh pmid_removed unique redacted redundant iba_rejected
401144 UniProtKB:Q8N6R0 METTL13 involved_in GO:0000122 negative regulation of transcription by RNA po... False False {BFO:0000015, GO:0006357, GO:0008150, GO:00098... {GO:0045934, GO:0009892, GO:0065007, GO:000988... IMP ... [PMID:26763933] 2024-06-10 NaN NaN NaN True True True False False
401146 UniProtKB:Q8N6R0 METTL13 involved_in GO:1902807 negative regulation of cell cycle G1/S phase t... False False {GO:0065007, GO:1902806, GO:0010948, GO:190198... {GO:0065007, GO:0010948, GO:1901988, GO:005172... IMP ... [PMID:26763933] 2024-06-10 NaN NaN NaN True True True False False
453842 UniProtKB:Q96K19 RNF170 involved_in GO:0034140 negative regulation of toll-like receptor 3 si... False False {GO:0048585, GO:0062207, GO:1902532, GO:000996... {GO:0048585, GO:0065007, GO:0002682, GO:006220... IDA ... [PMID:31076723] 2024-06-10 NaN NaN NaN True True True False False
587451 RNAcentral:URS0000083D87_9606 URS0000083D87_9606 involved_in GO:0035195 miRNA-mediated post-transcriptional gene silen... False False {BFO:0000015, GO:0016441, GO:0008150, GO:00106... {GO:0065007, GO:0009892, GO:0031047, GO:000988... IDA ... [PMID:28640956] 2024-06-10 NaN NaN NaN True True True False False
587452 RNAcentral:URS0000083D87_9606 URS0000083D87_9606 involved_in GO:0090051 negative regulation of cell migration involved... False False {GO:0030336, BFO:0000015, GO:0008150, GO:00105... {GO:0065007, GO:0030336, BFO:0000015, GO:00485... IGI ... [PMID:28640956] 2024-06-10 NaN NaN NaN True True True False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
424250 UniProtKB:Q9Y226 SLC22A13 None GO:0015695 organic cation transport False False {GO:0051234, GO:0006810, BFO:0000015, GO:00156... {GO:0051234, GO:0006810, BFO:0000015, GO:00511... NAS ... [PMID:10072596] 2020-01-01 NaN NaN NaN True True True False False
425053 UniProtKB:Q9Y267 SLC22A14 None GO:0005887 None True False {GO:0005887} {} NAS ... [PMID:10072596] 2020-01-01 NaN NaN NaN True True True False False
425054 UniProtKB:Q9Y267 SLC22A14 None GO:0015101 organic cation transmembrane transporter activity False False {GO:0051234, GO:0055085, GO:0006810, BFO:00000... {GO:0051234, GO:0055085, GO:0006810, BFO:00000... NAS ... [PMID:10072596] 2020-01-01 NaN NaN NaN True True True False False
425055 UniProtKB:Q9Y267 SLC22A14 None GO:0015695 organic cation transport False False {GO:0051234, GO:0006810, BFO:0000015, GO:00156... {GO:0051234, GO:0006810, BFO:0000015, GO:00511... NAS ... [PMID:10072596] 2020-01-01 NaN NaN NaN True True True False False
432681 UniProtKB:Q9Y5M6 OCLM None GO:0007601 visual perception False False {GO:0032501, GO:0050953, GO:0003008, BFO:00000... {GO:0032501, GO:0050953, GO:0003008, BFO:00000... TAS ... [PMID:10362512] 2020-01-01 NaN NaN NaN True True True False False

83 rows × 23 columns

[346]:
cases = [row_to_test_case(row) for _, row in redacted_df.iterrows()]

with open("output/test-cases-redacted.yaml", "w") as f:
    yaml.dump({"cases": cases}, f, sort_keys=False)
[348]:
iba_df = new_human[(new_human['is_iba'] == True) & (new_human['object_uninformative'] == False) & (new_human['negated'] != True) & (new_human['redundant'] != True)]
iba_df
[348]:
subject subject_label predicate object object_label object_obsoletes object_uninformative object_closure object_closure_redundant evidence ... pubs release pmid_new is_new fresh pmid_removed unique redacted redundant iba_rejected
718564 UniProtKB:Q06418 TYRO3 enables GO:0004714 transmembrane receptor protein tyrosine kinase... False False {GO:0019199, GO:0140096, GO:0003824, GO:001674... {GO:0019199, GO:0003824, BFO:0000015, GO:00167... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
718565 UniProtKB:P78559 MAP1A is_active_in GO:0030425 dendrite False False {GO:0030425, GO:0120025, GO:0043005, GO:011016... {CARO:0030000, CL:0002319, CL:0000211, UBERON:... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
718566 UniProtKB:Q7L1W4 LRRC8D is_active_in GO:0005737 cytoplasm False False {CARO:0030000, UBERON:0000061, CARO:0000003, G... {CARO:0030000, UBERON:0000061, CARO:0000003, C... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
718567 UniProtKB:A3QJZ7 PRAMEF27 part_of GO:0031462 Cul2-RING ubiquitin ligase complex False False {GO:0031462, GO:0031461, GO:1990234, BFO:00000... {GO:0031461, GO:1990234, BFO:0000004, GO:01405... IBA ... [GO_REF:0000033] 2024-11-03 False True False NaN NaN NaN False False
718568 UniProtKB:Q70IA6 MOB2 is_active_in GO:0005634 nucleus False False {GO:0110165, GO:0043231, BFO:0000002, GO:00432... {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
782818 UniProtKB:Q9NZC2 TREM2 involved_in GO:0045088 regulation of innate immune response False False {GO:0065007, GO:0002682, GO:0050776, BFO:00000... {GO:0065007, GO:0002682, GO:0050776, BFO:00000... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
782819 UniProtKB:Q9Y2K2 SIK3 enables GO:0050321 tau-protein kinase activity False False {GO:0140096, GO:0003824, GO:0004674, GO:001674... {GO:0140096, GO:0003824, GO:0004674, GO:001674... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
782820 UniProtKB:P43235 CTSK involved_in GO:0051603 proteolysis involved in protein catabolic process False False {GO:0043170, GO:0006508, GO:0044238, GO:000905... {GO:0043170, GO:0006508, GO:0044238, GO:000905... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
782821 UniProtKB:Q07343 PDE4B enables GO:0047555 3',5'-cyclic-GMP phosphodiesterase activity False False {GO:0047555, GO:0003824, GO:0008081, BFO:00000... {GO:0003824, GO:0008081, BFO:0000015, GO:00425... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False
782822 UniProtKB:A6NC42 DPPA5 involved_in GO:0010468 regulation of gene expression False False {GO:0065007, GO:0060255, GO:0009889, BFO:00000... {GO:0065007, GO:0060255, GO:0009889, BFO:00000... IBA ... [GO_REF:0000033] 2024-11-03 False False False NaN NaN NaN False False

56487 rows × 23 columns

[351]:
with open("output/test-cases-iba.yaml", "w") as f:
    yaml.dump({"cases": df_to_test_cases(iba_df, limit=1000)}, f, sort_keys=False)
[ ]: