GO Annotation Analysis
Analyzes the associations between genes and GO terms in the Gene Ontology (GO) database.
In particular, it categorizes associations based on:
- whether they have been “retracted” (i.e., removed from the database)
- whether they have been reviewed and accepted or rejected via IBA
[1]:
import pandas as pd
from tests.test_implementations.test_robot_template import adapter
# Releases to compare, newest first: the latest release followed by older
# releases that may still contain entries that were retracted since.
RELEASES = ["2024-11-03", "2024-06-10", "2020-01-01"]
LATEST, *PREVIOUS = RELEASES
# ISO dates sort lexicographically, so this checks that LATEST really is the newest.
assert all(r < LATEST for r in PREVIOUS)
[2]:
NEW_CUTOFF = "2024-06-01"
[3]:
# taxa to analyze, one tuple per taxon:
#   (display name used in progress messages,
#    GAF file name substituted into GAF_URL_TEMPLATE and used as the key into db,
#    taxon id -- presumably the NCBI Taxonomy id; it is unpacked but not used by the loader)
TAXA = [
("human", "goa_human", 9606),
("Arabidopsis thaliana", "tair", 3702),
("yeast", "sgd", 559292),
]
[4]:
# URL of a gzipped GAF file in the GO release archive.
# get_gaf fills {date} with a release date from RELEASES and {name} with a GAF file name from TAXA.
GAF_URL_TEMPLATE = "https://release.geneontology.org/{date}/annotations/{name}.gaf.gz"
[5]:
from oaklib.datamodels.vocabulary import IS_A, PART_OF
Create an OAK adapter for the GO ontology
[6]:
from oaklib import get_adapter
go = get_adapter("sqlite:obo:go")
[7]:
obsoletes = set(go.obsoletes())
[ ]:
[8]:
binding_terms = set(go.descendants("GO:0005488", predicates=[IS_A]))
[9]:
# Terms in either of the two GO "gocheck" subsets queried below.
antislim_terms = set(go.subset_members("gocheck_do_not_annotate")) | set(go.subset_members("gocheck_obsoletion_candidate"))
# Terms treated as non-informative: the binding terms plus the subset terms above.
non_informative = binding_terms | antislim_terms
[96]:
# IS_A descendants of three GO terms, kept as sets for fast membership tests.
# score_term_comparison uses them to pick the prompt wording:
#   mf_terms -> "that are capable of", bp_terms -> "involved in", cc_terms -> "localized to".
# NOTE(review): presumably the molecular function / biological process / cellular
# component root terms -- confirm against the ontology.
mf_terms = set(go.descendants("GO:0003674", predicates=[IS_A]))
bp_terms = set(go.descendants("GO:0008150", predicates=[IS_A]))
cc_terms = set(go.descendants("GO:0005575", predicates=[IS_A]))
Load annotations from the archive
[10]:
from oaklib.parsers import GafAssociationParser
gaf_parser = GafAssociationParser()
[11]:
import requests_cache
# HTTP session with a response cache; get_gaf downloads every GAF file through it,
# so re-running the notebook within the expiry window does not re-download the archives.
session = requests_cache.CachedSession(
cache_name='gaf_cache',
backend='sqlite', # or 'memory' for in-memory cache
expire_after=24*60*60, # cache entries expire after 24 hours (value is in seconds)
allowable_codes=[200], # Only cache successful responses
)
[12]:
from oaklib.datamodels.association import ParserConfiguration, NegatedAssociation
import io
import gzip
def get_gaf(release, name):
    """
    Download one GAF file from the GO release archive and parse it.

    :param release: release date substituted into GAF_URL_TEMPLATE (e.g. "2024-11-03")
    :param name: GAF file name substituted into GAF_URL_TEMPLATE (e.g. "goa_human")
    :return: list of the associations yielded by the GAF parser, negated ones included
    :raises requests.HTTPError: if the archive does not answer with a success status
    """
    # Keep negated associations: they are counted separately by the loader.
    config = ParserConfiguration(preserve_negated_associations=True)
    url = GAF_URL_TEMPLATE.format(date=release, name=name)
    # open the URL as a file object using requests
    with session.get(url, stream=True) as response:
        # Fail with a clear HTTP error (e.g. a release that does not exist) instead of
        # letting gzip choke on an error page.
        response.raise_for_status()
        # Decompress the gzipped content and create a text stream
        decompressed = gzip.decompress(response.content)
    # Decode explicitly so parsing does not depend on the platform's default encoding.
    text_stream = io.TextIOWrapper(io.BytesIO(decompressed), encoding="utf-8")
    print(f"Reading {url} using {config}")
    return list(gaf_parser.parse(text_stream, configuration=config))
[ ]:
Load all annotations into a cache
[13]:
from collections import defaultdict
# Parsed associations for every (GAF file, release) pair: db[gaf_name][release] -> list of associations
db = defaultdict(dict)
for r in RELEASES:
    for name, grp, tax_id in TAXA:
        print(f"Loading {r} {name}")
        assocs = get_gaf(r, grp)
        db[grp][r] = assocs
        print(f"Loaded {len(assocs)} associations")
        # Negated associations are only counted here; they stay in the cached list.
        neg_assocs = list(filter(lambda a: a.negated, assocs))
        print(f" {len(neg_assocs)} negated associations")
Loading 2024-11-03 human
Reading https://release.geneontology.org/2024-11-03/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 782823 associations
1494 negated associations
Loading 2024-11-03 Arabidopsis thaliana
Reading https://release.geneontology.org/2024-11-03/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 235371 associations
1374 negated associations
Loading 2024-11-03 yeast
Reading https://release.geneontology.org/2024-11-03/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 120823 associations
6 negated associations
Loading 2024-06-10 human
Reading https://release.geneontology.org/2024-06-10/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 707168 associations
1308 negated associations
Loading 2024-06-10 Arabidopsis thaliana
Reading https://release.geneontology.org/2024-06-10/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 235504 associations
1373 negated associations
Loading 2024-06-10 yeast
Reading https://release.geneontology.org/2024-06-10/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 117290 associations
7 negated associations
Loading 2020-01-01 human
Reading https://release.geneontology.org/2020-01-01/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 495361 associations
1244 negated associations
Loading 2020-01-01 Arabidopsis thaliana
Reading https://release.geneontology.org/2020-01-01/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 236821 associations
1364 negated associations
Loading 2020-01-01 yeast
Reading https://release.geneontology.org/2020-01-01/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)
Loaded 120916 associations
28 negated associations
[14]:
db["goa_human"][LATEST][0]
[14]:
Association(subject='UniProtKB:A0A024RBG1', predicate='enables', object='GO:0003723', property_values=[], subject_label='NUDT4B', predicate_label=None, object_label=None, negated=None, publications=['GO_REF:0000043'], evidence_type='IEA', supporting_objects=[], primary_knowledge_source='infores:UniProt', aggregator_knowledge_source=None, subject_closure=[], subject_closure_label=[], object_closure=[], object_closure_label=[], comments=[])
[15]:
len([x for x in db["goa_human"][LATEST] if x.negated])
[15]:
1494
[ ]:
# reload modules
[30]:
%load_ext autoreload
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
[53]:
%autoreload 0
Diffs by terms
[ ]:
[32]:
from oaklib.utilities.associations.association_differ import AssociationDiffer
differ = AssociationDiffer(adapter=go)
[33]:
cache = {}
[73]:
len(db["goa_human"].keys())
[73]:
3
[74]:
list(db["goa_human"].keys())
[74]:
['2024-11-03', '2024-06-10', '2020-01-01']
[89]:
# Per-term diff of the yeast (sgd) annotations between the most recent previous
# release (PREVIOUS[0]) and the latest one; ix is keyed by GO term id.
# NOTE(review): the commented-out human variant below passes (LATEST, PREVIOUS[0]),
# i.e. the opposite argument order to the active call -- confirm which order
# changes_by_terms expects before re-enabling it.
#ix = differ.changes_by_terms(db["goa_human"][LATEST], db["goa_human"][PREVIOUS[0]], min_num_entities_changes=10, cache={})
grp = "sgd"
ix = differ.changes_by_terms(db[grp][PREVIOUS[0]], db[grp][LATEST], min_num_entities_changes=2, cache={})
[90]:
len(ix)
[90]:
847
[91]:
for k in list(ix.keys())[0:5]:
print(k, go.label(k))
GO:1904688 regulation of cytoplasmic translational initiation
GO:0170039 proteinogenic amino acid metabolic process
GO:0005980 glycogen catabolic process
GO:0045937 positive regulation of phosphate metabolic process
GO:0042762 regulation of sulfur metabolic process
[92]:
pubmed_adapter = get_adapter("pubmed:")
WARNING:eutils._internal.queryservice:No NCBI API key provided; throttling to 3 requests/second; see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
[93]:
from functools import lru_cache
@lru_cache
def pub_date(pmid):
    """
    Look up the "year" field of a publication's metadata via the PubMed adapter.

    Results are memoized, so each PMID is fetched at most once per session.

    :param pmid: publication ID passed to the PubMed adapter
    :return: the "year" entry of the metadata, or None when no metadata (or no year) is available
    """
    metadata = pubmed_adapter.entity_metadata_map(pmid)
    if not metadata:
        return None
    return metadata.get("year")
[110]:
from typing import Optional
from oaklib.utilities.associations.association_differ import TermComparison
def score_term_comparison(term: str, ix: TermComparison, max_date_inclusive=None, max_genes=20) -> Optional[dict]:
    """
    Build a "list all genes for this term" evaluation case from one term's diff.

    We are interested in genes that are newly annotated to the term but whose
    supporting publications were all already published by ``max_date_inclusive``.

    :param term: GO term ID the comparison is about
    :param ix: comparison for that term; only ``old_associations_by_entity`` and
        ``new_associations_by_entity`` (gene ID -> list of associations) are read
    :param max_date_inclusive: latest acceptable publication date, compared directly
        against the value returned by ``pub_date``; ``None`` disables the date filter
    :param max_genes: terms with more than this many currently annotated genes are skipped
    :return: dict with ``input``, ``ideal`` and ``original_input`` keys, or ``None`` when
        the term has too many genes or is in none of mf_terms / bp_terms / cc_terms
    """
    new_by_gene = ix.new_associations_by_entity
    old_by_gene = ix.old_associations_by_entity
    if len(new_by_gene) > max_genes:
        return None

    # Choose the prompt wording first, so that terms outside the three term sets
    # are rejected before any (slow, rate-limited) PubMed lookups are made.
    if term in mf_terms:
        ann_pred = "that are capable of"
    elif term in bp_terms:
        ann_pred = "involved in"
    elif term in cc_terms:
        ann_pred = "localized to"
    else:
        return None
    term_lbl = go.label(term)

    # Gene ID -> label, taken from the first association of each gene.
    # Old associations are applied last, so their label wins for genes present in both.
    gene_id_to_label_map = {g: assocs[0].subject_label for g, assocs in new_by_gene.items()}
    gene_id_to_label_map.update((g, assocs[0].subject_label) for g, assocs in old_by_gene.items())

    def all_evidence_before_cutoff(assocs) -> bool:
        """True if every association has a PMID whose publication date is within the cutoff."""
        if max_date_inclusive is None:
            return True
        for a in assocs:
            pmids = [x for x in a.publications if x.startswith("PMID")]
            if not pmids:
                return False
            # Only the first PMID of each association is checked.
            # NOTE(review): assumes pub_date returns a value comparable with
            # max_date_inclusive (e.g. both year strings) -- confirm.
            date = pub_date(pmids[0])
            if date is None or date > max_date_inclusive:
                # Stop at the first failure: further lookups cannot change the outcome.
                return False
        return True

    # Genes annotated now but not before, in the (deterministic) order of the new associations.
    new_genes = [g for g in new_by_gene if g not in old_by_gene]
    filtered_new_genes = {
        g: new_by_gene[g] for g in new_genes if all_evidence_before_cutoff(new_by_gene[g])
    }

    def as_genes_list(amap):
        return [gene_id_to_label_map[g] for g in amap]

    def as_str_list(amap):
        return [str(g) for g in amap]

    case = {
        "input": f"List all genes {ann_pred} {term_lbl}",
        "ideal": "; ".join(as_genes_list(new_by_gene)),
        "original_input": {
            "term": str(term),
            "genes_current": as_str_list(new_by_gene),
            "genes_previous": as_str_list(old_by_gene),
            "genes_added": as_genes_list(new_genes),
            "genes_added_prior_to_cutoff": as_genes_list(filtered_new_genes),
            "num_genes_added_prior_to_cutoff": len(filtered_new_genes),
            "date_cutoff": max_date_inclusive,
        }
    }
    return case
[111]:
import yaml
# Print (as YAML) the evaluation cases for terms that gained more than 2 genes whose
# evidence predates the 2022 cutoff, skipping terms whose label contains
# "regulation" or "response to".
# NOTE(review): `lbl` below is a module-level string that shares its name with the
# lbl() helper defined later in this notebook; re-running this cell after that helper
# has been defined replaces the function with a string -- consider renaming.
# NOTE(review): if go.label(k) returns None, the `in` checks raise TypeError -- confirm
# every key of ix has a label.
# NOTE(review): indentation was lost in this export, so it is unclear whether n counts
# printed cases or every scored term; either way the loop stops once n exceeds 40.
n = 0
for k in list(ix.keys()):
lbl = go.label(k)
if "regulation" in lbl:
continue
if "response to" in lbl:
continue
case = score_term_comparison(k, ix[k], max_date_inclusive="2022")
if not case:
continue
if case["original_input"]["num_genes_added_prior_to_cutoff"] > 2:
print(yaml.dump(case, sort_keys=False))
n += 1
if n > 40:
break
input: List all genes localized to respiratory chain complex IV
ideal: COX6; COX12; COX5B; COX9; COX5A; COX4; COX8; MTC3; COX13; COX7; COX1; COX2;
COX3; COX26; AI4; AI5_ALPHA; AI3
original_input:
term: GO:0045277
genes_current:
- SGD:S000001093
- SGD:S000004028
- SGD:S000001373
- SGD:S000002225
- SGD:S000004997
- SGD:S000003155
- SGD:S000004387
- SGD:S000003195
- SGD:S000003159
- SGD:S000004869
- SGD:S000007260
- SGD:S000007281
- SGD:S000007283
- SGD:S000113555
- SGD:S000007264
- SGD:S000007265
- SGD:S000007263
genes_previous:
- SGD:S000004387
- SGD:S000004028
- SGD:S000004857
- SGD:S000007260
genes_added:
- COX26
- COX13
- AI4
- COX4
- COX9
- AI3
- MTC3
- COX6
- COX3
- AI5_ALPHA
- COX5A
- COX5B
- COX7
- COX2
genes_added_prior_to_cutoff:
- COX26
- COX9
- COX3
- COX7
- COX2
num_genes_added_prior_to_cutoff: 5
date_cutoff: '2022'
input: List all genes localized to TTT Hsp90 cochaperone complex
ideal: TTI2; RVB2; TRA1; TTI1; RVB1; ASA1; TEL2
original_input:
term: GO:0110078
genes_current:
- SGD:S000003897
- SGD:S000006156
- SGD:S000001141
- SGD:S000001516
- SGD:S000002598
- SGD:S000006289
- SGD:S000003331
genes_previous:
- SGD:S000003897
genes_added:
- RVB1
- TEL2
- TTI1
- ASA1
- RVB2
- TRA1
genes_added_prior_to_cutoff:
- RVB1
- TEL2
- TTI1
- ASA1
- RVB2
- TRA1
num_genes_added_prior_to_cutoff: 6
date_cutoff: '2022'
input: List all genes localized to respiratory chain complex III
ideal: COR1; RIP1; QCR6; QCR8; QCR7; QCR9; QCR10; CYT1; QCR2; COB
original_input:
term: GO:0045275
genes_current:
- SGD:S000000141
- SGD:S000000750
- SGD:S000001929
- SGD:S000003702
- SGD:S000002937
- SGD:S000003415
- SGD:S000003529
- SGD:S000005591
- SGD:S000006395
- SGD:S000007270
genes_previous:
- SGD:S000007270
genes_added:
- QCR10
- COR1
- CYT1
- QCR2
- QCR9
- QCR8
- QCR6
- QCR7
- RIP1
genes_added_prior_to_cutoff:
- QCR10
- COR1
- QCR2
num_genes_added_prior_to_cutoff: 3
date_cutoff: '2022'
input: List all genes that are capable of alpha-1,4-glucosidase activity
ideal: MAL62; MAL42; MAL22; MAL32; GTB1; MAL12; IMA1; IMA2; IMA3; IMA4; IMA5
original_input:
term: GO:0004558
genes_current:
- SGD:S000029690
- SGD:S000029687
- SGD:S000029682
- SGD:S000000503
- SGD:S000002629
- SGD:S000003524
- SGD:S000003519
- SGD:S000005517
- SGD:S000001434
- SGD:S000003757
- SGD:S000003752
genes_previous:
- SGD:S000002629
genes_added:
- MAL42
- IMA1
- IMA3
- IMA4
- IMA5
- IMA2
- MAL22
- MAL32
- MAL62
- MAL12
genes_added_prior_to_cutoff:
- MAL42
- MAL22
- MAL62
num_genes_added_prior_to_cutoff: 3
date_cutoff: '2022'
input: List all genes that are capable of G-quadruplex DNA binding
ideal: RAP1; MGS1; SUB1; DNA2; NSR1; VID22; MSS116; XRS2; SLX9; PIF1; MRE11; DBP2;
RAD50; DED1; DBP1; RRM3
original_input:
term: GO:0051880
genes_current:
- SGD:S000005160
- SGD:S000005162
- SGD:S000004642
- SGD:S000001207
- SGD:S000003391
- SGD:S000004365
- SGD:S000002602
- SGD:S000002777
- SGD:S000003313
- SGD:S000004526
- SGD:S000004837
- SGD:S000005056
- SGD:S000005194
- SGD:S000005730
- SGD:S000006040
- SGD:S000001073
genes_previous:
- SGD:S000004526
- SGD:S000005160
- SGD:S000005194
- SGD:S000004837
- SGD:S000002777
- SGD:S000005162
- SGD:S000004642
- SGD:S000001207
- SGD:S000003391
- SGD:S000003313
- SGD:S000001073
- SGD:S000004365
genes_added:
- DBP2
- MSS116
- DED1
- DBP1
genes_added_prior_to_cutoff:
- DBP2
- MSS116
- DED1
- DBP1
num_genes_added_prior_to_cutoff: 4
date_cutoff: '2022'
input: List all genes that are capable of alpha-glucosidase activity
ideal: MAL62; MAL42; MAL22; ROT2; MAL32; SGA1; IMA4; IMA3; GTB1; CWH41; MAL12; IMA1;
IMA5; GDB1; IMA2; YMR196W; STA1; CPX-417; SUC2
original_input:
term: GO:0090599
genes_current:
- SGD:S000029690
- SGD:S000029687
- SGD:S000029682
- SGD:S000000433
- SGD:S000000503
- SGD:S000001361
- SGD:S000003757
- SGD:S000001434
- SGD:S000002629
- SGD:S000002995
- SGD:S000003524
- SGD:S000003519
- SGD:S000003752
- SGD:S000006388
- SGD:S000005517
- SGD:S000004809
- SGD:S000029522
- SGD:S000217621
- SGD:S000001424
genes_previous:
- SGD:S000001361
- SGD:S000001434
- SGD:S000003757
- SGD:S000000433
- SGD:S000000503
- SGD:S000003752
- SGD:S000002995
- SGD:S000003519
- SGD:S000003524
- SGD:S000002629
- SGD:S000006388
- SGD:S000005517
- SGD:S000004809
- SGD:S000029522
- SGD:S000217621
- SGD:S000001424
genes_added:
- MAL42
- MAL22
- MAL62
genes_added_prior_to_cutoff:
- MAL42
- MAL22
- MAL62
num_genes_added_prior_to_cutoff: 3
date_cutoff: '2022'
[ ]:
[ ]:
[ ]:
[ ]:
OLD ANALYSIS BELOW
[ ]:
[289]:
from functools import lru_cache
@lru_cache
def lineage(t: str):
    """
    Return every term on the same is_a / part_of lineage as ``t``.

    :param t: GO term ID
    :return: set holding the ancestors of ``t`` (``t`` itself included, via
        ``reflexive=True``) together with its descendants
    """
    related = set(go.ancestors(t, predicates=[IS_A, PART_OF], reflexive=True))
    related.update(go.descendants(t, predicates=[IS_A, PART_OF]))
    return related
#len(lineage("GO:0005737"))
@lru_cache
def ancs(t: str):
    """
    Return the is_a / part_of ancestors of ``t``, including ``t`` itself.

    The result is memoized, so callers share one set object per term and must not mutate it.

    :param t: GO term ID
    :return: set of ancestor term IDs
    """
    ancestors = go.ancestors(t, predicates=[IS_A, PART_OF], reflexive=True)
    return set(ancestors)
[290]:
import pandas as pd
def pmid(a):
    """
    Return the single PMID cited by an association.

    :param a: object with a ``publications`` list of reference IDs
    :return: the one entry starting with "PMID", or None if there is none
    :raises ValueError: if more than one entry starts with "PMID"
    """
    pubs = list(filter(lambda p: p.startswith("PMID"), a.publications))
    if not pubs:
        return None
    if len(pubs) > 1:
        raise ValueError(f"Multiple PMIDs: {pubs}")
    return pubs[0]
@lru_cache
def lbl(t: str):
    """
    Memoized label lookup against the GO adapter.

    :param t: GO term ID
    :return: whatever ``go.label`` returns for ``t``
    """
    label = go.label(t)
    return label
def assocs_to_df(assocs: list, release: str):
    """
    Flatten a list of associations into a DataFrame with one row per association.

    :param assocs: associations as returned by the GAF parser
    :param release: release date recorded in the ``release`` column of every row
    :return: DataFrame with the columns listed below; the columns are present even
        when ``assocs`` is empty, so downstream column access keeps working
    """
    columns = [
        "subject",
        "subject_label",
        "predicate",
        "object",
        "object_label",
        "object_obsoletes",
        "object_uninformative",
        "object_closure",
        "object_closure_redundant",
        "evidence",
        "is_iba",
        "negated",
        "pmid",
        "pubs",
        "release",
    ]
    rows = []
    for a in assocs:
        # Reflexive ancestor closure of the annotated term, looked up once per row.
        closure = ancs(a.object)
        rows.append({
            "subject": a.subject,
            "subject_label": a.subject_label,
            "predicate": a.predicate,
            "object": a.object,
            "object_label": lbl(a.object),
            "object_obsoletes": a.object in obsoletes,
            "object_uninformative": a.object in non_informative,
            "object_closure": closure,
            # Proper ancestors only: annotating a gene to any of these as well would be redundant.
            "object_closure_redundant": closure - {a.object},
            "evidence": a.evidence_type,
            "is_iba": a.evidence_type == "IBA",
            "negated": a.negated,
            "pmid": pmid(a),
            "pubs": a.publications,
            "release": release,
        })
    return pd.DataFrame(rows, columns=columns)
[291]:
# One DataFrame per taxon, all built from the latest release.
human_df = assocs_to_df(db["goa_human"][LATEST], LATEST)
tair_df = assocs_to_df(db["tair"][LATEST], LATEST)
sgd_df = assocs_to_df(db["sgd"][LATEST], LATEST)
# NOTE: df is an alias of sgd_df, not a copy -- columns added to df in later cells
# (e.g. "redundant") also appear on sgd_df.
df = sgd_df
[292]:
prev_df = assocs_to_df(db["sgd"][PREVIOUS[-1]], PREVIOUS[-1])
[293]:
df
[293]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | is_iba | negated | pmid | pubs | release | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | SGD:S000003381 | GPC1 | acts_upstream_of_or_within | GO:0090640 | phosphatidylcholine biosynthesis from sn-glyce... | False | False | {GO:0019637, GO:0008152, BFO:0000015, GO:00066... | {GO:0019637, GO:0008152, GO:0006796, BFO:00000... | IGI | False | None | PMID:30514764 | [PMID:30514764] | 2024-11-03 |
1 | SGD:S000005701 | ALE1 | acts_upstream_of_or_within | GO:0090640 | phosphatidylcholine biosynthesis from sn-glyce... | False | False | {GO:0019637, GO:0008152, BFO:0000015, GO:00066... | {GO:0019637, GO:0008152, GO:0006796, BFO:00000... | IGI | False | None | PMID:30514764 | [PMID:30514764] | 2024-11-03 |
2 | SGD:S000003381 | GPC1 | acts_upstream_of_or_within | GO:0036151 | phosphatidylcholine acyl-chain remodeling | False | False | {GO:0019637, GO:0008152, GO:0006796, BFO:00000... | {GO:0019637, GO:0008152, GO:0006796, BFO:00000... | IMP | False | None | PMID:30514764 | [PMID:30514764] | 2024-11-03 |
3 | SGD:S000004492 | RCF1 | acts_upstream_of_or_within | GO:0033617 | mitochondrial cytochrome c oxidase assembly | False | False | {GO:0043933, GO:0044085, GO:0065003, GO:001604... | {GO:0043933, GO:0044085, GO:0022607, GO:007184... | IMP | False | None | PMID:29746825 | [PMID:29746825] | 2024-11-03 |
4 | SGD:S000004977 | SIW14 | enables | GO:0052845 | inositol-5-diphosphate-1,2,3,4,6-pentakisphosp... | False | False | {GO:0016817, GO:0016818, GO:0052842, GO:000382... | {GO:0016817, GO:0016818, GO:0052842, GO:000382... | IDA | False | None | PMID:26828065 | [PMID:26828065] | 2024-11-03 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
120818 | SGD:S000003241 | SEC9 | involved_in | GO:0006906 | vesicle fusion | False | False | {GO:0048284, GO:0051234, GO:0090174, GO:001604... | {GO:0061024, BFO:0000015, GO:0009987, GO:00160... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
120819 | SGD:S000004826 | CEF1 | part_of | GO:0000974 | Prp19 complex | False | False | {GO:0000974, BFO:0000004, BFO:0000040, BFO:000... | {BFO:0000004, BFO:0000040, BFO:0000002, GO:003... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
120820 | SGD:S000002551 | MKC7 | involved_in | GO:0031505 | fungal-type cell wall organization | False | False | {GO:0071554, GO:0016043, GO:0045229, GO:003150... | {GO:0071554, GO:0016043, GO:0045229, GO:000998... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
120821 | SGD:S000003008 | HEM2 | is_active_in | GO:0005829 | cytosol | False | False | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
120822 | SGD:S000001122 | LAM4 | involved_in | GO:0032366 | intracellular sterol transport | False | False | {GO:0006869, GO:0015850, GO:0032365, GO:005123... | {GO:0032365, GO:0015918, GO:0051649, GO:000998... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
120823 rows × 15 columns
[294]:
from typing import Dict, List, Optional
def repair_assocs_df(assocs: pd.DataFrame):
    """
    Ensures that IDs are normalized: every row with the same ``subject_label`` gets the same ``subject``.

    When a label is attached to several subject IDs, the lexicographically smallest ID
    is used as the canonical one. Rows without a label keep their original subject.
    The frame is modified in place.

    :param assocs: associations frame with ``subject`` and ``subject_label`` columns
    :return: None
    """
    # may not be 1:1
    subject_label_to_ids: Dict[str, List[str]]
    # IDs are sorted so that both the canonical choice and the diagnostic output are deterministic.
    subject_label_to_ids = assocs.groupby("subject_label")["subject"].aggregate(lambda x: sorted(set(x))).to_dict()
    labels_with_multiple_ids = {k: v for k, v in subject_label_to_ids.items() if len(v) > 1}
    if labels_with_multiple_ids:
        print(f"Multiple IDs for {len(labels_with_multiple_ids)} labels")
        print(list(labels_with_multiple_ids.items())[:5])
    labels_to_canonical = {k: v[0] for k, v in subject_label_to_ids.items()}
    canonical = assocs['subject_label'].map(labels_to_canonical)
    # groupby drops missing labels, so those rows map to NaN; keep their original
    # subject instead of erasing it.
    assocs['subject'] = canonical.where(canonical.notna(), assocs['subject'])
[295]:
test_df = tair_df.copy()
repair_assocs_df(test_df)
test_df
Multiple IDs for 5258 labels
[('4CL1', ['TAIR:locus:2017602', 'AGI_LocusCode:AT1G51680']), ('4CL2', ['TAIR:locus:2094716', 'AGI_LocusCode:AT3G21240']), ('4CL3', ['TAIR:locus:2015003', 'AGI_LocusCode:AT1G65060']), ('AAC1', ['TAIR:locus:2077778', 'AGI_LocusCode:AT3G08580']), ('AAC2', ['AGI_LocusCode:AT5G13490', 'TAIR:locus:2185041'])]
[295]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | is_iba | negated | pmid | pubs | release | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AGI_LocusCode:AT1G11880 | AT1G11880 | enables | GO:0000009 | alpha-1,6-mannosyltransferase activity | False | False | {GO:0000030, GO:0003824, GO:0016740, BFO:00000... | {GO:0000030, GO:0003824, GO:0016740, BFO:00000... | IEA | False | None | None | [TAIR:AnalysisReference:501756966] | 2024-11-03 |
1 | AGI_LocusCode:AT1G80420 | ATXRCC1 | involved_in | GO:0000012 | single strand break repair | False | False | {GO:0043170, GO:0033554, GO:0008152, GO:000613... | {GO:0043170, GO:0008152, BFO:0000015, GO:00099... | IEA | False | None | None | [TAIR:AnalysisReference:501756966] | 2024-11-03 |
2 | AGI_LocusCode:AT1G74030 | ENO1 | part_of | GO:0000015 | phosphopyruvate hydratase complex | False | False | {GO:0005829, GO:0110165, BFO:0000002, GO:00329... | {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... | IEA | False | None | None | [TAIR:AnalysisReference:501756966] | 2024-11-03 |
3 | AGI_LocusCode:AT2G29560 | ENOC | part_of | GO:0000015 | phosphopyruvate hydratase complex | False | False | {GO:0005829, GO:0110165, BFO:0000002, GO:00329... | {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... | IEA | False | None | None | [TAIR:AnalysisReference:501756966] | 2024-11-03 |
4 | AGI_LocusCode:AT2G36530 | LOS2 | part_of | GO:0000015 | phosphopyruvate hydratase complex | False | False | {GO:0005829, GO:0110165, BFO:0000002, GO:00329... | {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... | IEA | False | None | None | [TAIR:AnalysisReference:501756966] | 2024-11-03 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
235366 | TAIR:locus:2058630 | At2g23210 | enables | GO:0010294 | abscisic acid glucosyltransferase activity | False | False | {GO:0035251, GO:0003824, GO:0016740, BFO:00000... | {GO:0035251, GO:0003824, GO:0016740, BFO:00000... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
235367 | AGI_LocusCode:AT2G15820 | OTP51 | involved_in | GO:0045292 | mRNA cis splicing, via spliceosome | False | False | {GO:0006397, GO:0008152, GO:0000375, BFO:00000... | {GO:0009059, GO:0043170, GO:0006397, GO:000815... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
235368 | TAIR:locus:2143196 | At5g15750 | involved_in | GO:0042274 | ribosomal small subunit biogenesis | False | False | {GO:0042274, GO:0044085, GO:0009987, BFO:00000... | {GO:0044085, GO:0009987, BFO:0000015, GO:00226... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
235369 | AGI_LocusCode:AT4G14730 | LFG1 | is_active_in | GO:0016020 | membrane | False | False | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
235370 | TAIR:locus:2116525 | SD25 | enables | GO:0004672 | protein kinase activity | False | False | {GO:0140096, GO:0003824, GO:0016740, BFO:00000... | {GO:0140096, GO:0003824, GO:0016740, BFO:00000... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
235371 rows × 15 columns
[296]:
test_df[test_df['subject_label'] == "GALT6"]
[296]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | is_iba | negated | pmid | pubs | release | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
56871 | AGI_LocusCode:AT5G62620 | GALT6 | located_in | GO:0005794 | Golgi apparatus | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00057... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | ISM | False | None | None | [TAIR:AnalysisReference:501780126] | 2024-11-03 |
56872 | AGI_LocusCode:AT5G62620 | GALT6 | located_in | GO:0005794 | Golgi apparatus | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00057... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | ISM | False | None | None | [TAIR:AnalysisReference:501780126] | 2024-11-03 |
61144 | AGI_LocusCode:AT5G62620 | GALT6 | involved_in | GO:0006486 | protein glycosylation | False | False | {GO:0008152, BFO:0000015, GO:0009100, GO:00434... | {GO:0009059, GO:0043170, GO:0070085, GO:000815... | IEA | False | None | None | [TAIR:AnalysisReference:501757242] | 2024-11-03 |
85487 | AGI_LocusCode:AT5G62620 | GALT6 | enables | GO:0030246 | carbohydrate binding | False | True | {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... | {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... | IEA | False | None | None | [TAIR:AnalysisReference:501756966] | 2024-11-03 |
165846 | AGI_LocusCode:AT5G62620 | GALT6 | located_in | GO:0005794 | Golgi apparatus | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00057... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IDA | False | None | PMID:26690932 | [TAIR:Publication:501767599, PMID:26690932] | 2024-11-03 |
165851 | AGI_LocusCode:AT5G62620 | GALT6 | involved_in | GO:0010405 | arabinogalactan protein metabolic process | False | False | {GO:0043170, GO:0044036, GO:0010384, GO:007155... | {GO:0043170, GO:0044036, GO:0071554, GO:000815... | IMP | False | None | PMID:26690932 | [TAIR:Publication:501767599, PMID:26690932] | 2024-11-03 |
165856 | AGI_LocusCode:AT5G62620 | GALT6 | involved_in | GO:0018258 | protein O-linked glycosylation via hydroxyproline | False | False | {GO:0006493, GO:0008152, BFO:0000015, GO:00091... | {GO:0009059, GO:0006493, GO:0043170, GO:007008... | IDA | False | None | PMID:26690932 | [TAIR:Publication:501767599, PMID:26690932] | 2024-11-03 |
165858 | AGI_LocusCode:AT5G62620 | GALT6 | acts_upstream_of_or_within | GO:0048354 | mucilage biosynthetic process involved in seed... | False | False | {GO:0032501, GO:0008152, GO:0048359, BFO:00000... | {GO:0032501, GO:0010192, GO:0008152, GO:004835... | IMP | False | None | PMID:26690932 | [TAIR:Publication:501767599, PMID:26690932] | 2024-11-03 |
165863 | AGI_LocusCode:AT5G62620 | GALT6 | acts_upstream_of_or_within | GO:1900056 | negative regulation of leaf senescence | False | False | {GO:0065007, BFO:0000015, GO:1900055, GO:00485... | {GO:0065007, BFO:0000015, GO:1900055, GO:00485... | IMP | False | None | PMID:26690932 | [TAIR:Publication:501767599, PMID:26690932] | 2024-11-03 |
165869 | AGI_LocusCode:AT5G62620 | GALT6 | enables | GO:1990714 | hydroxyproline O-galactosyltransferase activity | False | False | {GO:0003824, GO:0016740, BFO:0000015, GO:00083... | {GO:0003824, GO:0016740, BFO:0000015, GO:00083... | IDA | False | None | PMID:26690932 | [TAIR:Publication:501767599, PMID:26690932] | 2024-11-03 |
210076 | AGI_LocusCode:AT5G62620 | GALT6 | enables | GO:1990714 | hydroxyproline O-galactosyltransferase activity | False | False | {GO:0003824, GO:0016740, BFO:0000015, GO:00083... | {GO:0003824, GO:0016740, BFO:0000015, GO:00083... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
217996 | AGI_LocusCode:AT5G62620 | GALT6 | is_active_in | GO:0000139 | Golgi membrane | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00057... | {CARO:0030000, GO:0005794, CARO:0000000, BFO:0... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
[297]:
def create_gene_df(df: pd.DataFrame):
    """
    Creates a new dataframe, grouped by gene (subject), with one row per gene.

    :param df: associations frame as produced by ``assocs_to_df``
    :return: frame with the columns ``subject``, ``terms`` (set of annotated terms),
        ``n_iba``, ``n_negated``, ``closure`` (union of the per-row closures),
        ``object_closure_redundant`` (union of the per-row redundant closures),
        ``n_pmid`` (despite the name, the *set* of PMIDs, which may contain None),
        ``release`` and ``n_annotations`` (number of association rows for the gene)
    """
    grouped = df.groupby("subject")
    # The original dict listed "object" twice ("count" and the set lambda); Python keeps
    # only the last value for a duplicate key, so the count was silently dropped. The
    # count is now computed separately as n_annotations below.
    gene_df = grouped.agg({
        # for object, take the set of all distinct values
        "object": lambda x: set(x),
        "is_iba": "sum",
        "negated": "sum",
        # for the closures, take the union of all per-row sets
        "object_closure": lambda x: set.union(*x),
        "object_closure_redundant": lambda x: set.union(*x),
        "pmid": lambda x: set(x),
        "release": "first",
    }).reset_index()
    gene_df = gene_df.rename(columns={
        "object": "terms",
        "is_iba": "n_iba",
        "negated": "n_negated",
        "object_closure": "closure",
        "pmid": "n_pmid",
    })
    # Appended last so the existing column order is unchanged.
    gene_df["n_annotations"] = gene_df["subject"].map(grouped["object"].count())
    return gene_df
[298]:
gene_df = create_gene_df(df)
gene_df
[298]:
subject | terms | n_iba | n_negated | closure | object_closure_redundant | n_pmid | release | |
---|---|---|---|---|---|---|---|---|
0 | SGD:S000000001 | {GO:0071168, GO:0005739, GO:0008301, GO:000367... | 3 | 0 | {GO:0005739, GO:0000182, GO:0008152, BFO:00000... | {GO:0000182, GO:0008152, BFO:0000015, GO:00010... | {PMID:2404611, PMID:18708580, PMID:2649882, PM... | 2024-11-03 |
1 | SGD:S000000002 | {GO:0015031, GO:0099023, GO:0005768, GO:000662... | 4 | 0 | {GO:0032509, GO:0071985, GO:0046872, BFO:00000... | {GO:0032509, GO:0071985, GO:0046872, BFO:00000... | {PMID:19828734, PMID:30358795, PMID:20173035, ... | 2024-11-03 |
2 | SGD:S000000003 | {GO:0005085, GO:0005737, GO:0032232, GO:000582... | 3 | 0 | {GO:0005085, GO:0030234, GO:0008152, BFO:00000... | {GO:0030234, GO:0008152, BFO:0000015, GO:00900... | {PMID:19545407, PMID:10409717, PMID:17925388, ... | 2024-11-03 |
3 | SGD:S000000004 | {GO:0005829, GO:0072671, GO:0005576, GO:003460... | 9 | 0 | {GO:1901363, GO:0003723, GO:0008152, BFO:00000... | {GO:1901363, GO:0003723, GO:0008152, BFO:00000... | {PMID:18706386, PMID:9789005, PMID:26928762, P... | 2024-11-03 |
4 | SGD:S000000005 | {GO:0015031, GO:0006621, GO:0005789, GO:000688... | 8 | 0 | {BFO:0000015, GO:0030135, GO:0070972, GO:01101... | {BFO:0000015, GO:0030135, GO:0070972, GO:01101... | {PMID:26928762, PMID:10359606, None, PMID:1115... | 2024-11-03 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
6906 | SGD:S000350095 | {GO:0008150, GO:0003674, GO:0005575} | 0 | 0 | {BFO:0000003, BFO:0000015, BFO:0000004, GO:000... | {BFO:0000004, BFO:0000040, BFO:0000003, BFO:00... | {None} | 2024-11-03 |
6907 | SGD:S000350096 | {GO:0008150, GO:0003674, GO:0005575} | 0 | 0 | {BFO:0000003, BFO:0000015, BFO:0000004, GO:000... | {BFO:0000004, BFO:0000040, BFO:0000003, BFO:00... | {None} | 2024-11-03 |
6908 | SGD:S000350097 | {GO:0005575, GO:0003674, GO:0008150} | 0 | 0 | {BFO:0000004, BFO:0000002, GO:0003674, GO:0008... | {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... | {None} | 2024-11-03 |
6909 | SGD:S000350098 | {GO:0005575, GO:0003674, GO:0008150} | 0 | 0 | {BFO:0000004, BFO:0000002, GO:0003674, GO:0008... | {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... | {None} | 2024-11-03 |
6910 | SGD:S000350099 | {GO:0003674, GO:0005575, GO:0008150} | 0 | 0 | {BFO:0000040, BFO:0000015, BFO:0000004, GO:000... | {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... | {None} | 2024-11-03 |
6911 rows × 8 columns
[299]:
def set_redundant_flag(assocs: pd.DataFrame, gene_df: pd.DataFrame):
    """
    Sets the redundant flag for each association, if the object is in object_closure_redundant for that gene.

    IBA associations are never flagged. The ``redundant`` column is written to
    ``assocs`` in place.

    :param assocs: associations frame with ``subject``, ``object`` and ``is_iba`` columns
    :param gene_df: per-gene frame with ``subject`` and ``object_closure_redundant`` columns
    :return: None
    """
    # Mapping of subject -> union of the redundant closures of all its annotations
    redundant_map = gene_df.set_index('subject')['object_closure_redundant'].to_dict()
    no_terms = frozenset()
    # A single pass over the three columns replaces the row-wise DataFrame.apply,
    # which builds a Series per row and fails on an empty frame.
    flags = [
        (not is_iba) and (obj in redundant_map.get(subject, no_terms))
        for subject, obj, is_iba in zip(assocs['subject'], assocs['object'], assocs['is_iba'])
    ]
    # Assigned positionally (as an array) so the frame's own index is irrelevant.
    assocs['redundant'] = pd.Series(flags, dtype=bool).to_numpy()
[300]:
df["redundant"] = False
[301]:
set_redundant_flag(df, gene_df)
[302]:
df[df['redundant']]
[302]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | is_iba | negated | pmid | pubs | release | redundant | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16 | SGD:S000004539 | FPR3 | located_in | GO:0005634 | nucleus | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00432... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IDA | False | None | PMID:26359986 | [PMID:26359986] | 2024-11-03 | True |
28 | SGD:S000002699 | HRQ1 | enables | GO:0043138 | 3'-5' DNA helicase activity | False | False | {BFO:0000015, GO:0008150, GO:0016043, GO:00431... | {GO:0140097, GO:0032508, GO:0003824, BFO:00000... | IDA | False | None | PMID:28385527 | [PMID:28385527] | 2024-11-03 | True |
33 | SGD:S000002699 | HRQ1 | enables | GO:0043138 | 3'-5' DNA helicase activity | False | False | {BFO:0000015, GO:0008150, GO:0016043, GO:00431... | {GO:0140097, GO:0032508, GO:0003824, BFO:00000... | IDA | False | None | PMID:24440721 | [PMID:24440721] | 2024-11-03 | True |
39 | SGD:S000003245 | SNU71 | located_in | GO:0005634 | nucleus | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00432... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IEA | False | None | None | [GO_REF:0000043] | 2024-11-03 | True |
42 | SGD:S000001443 | DJP1 | acts_upstream_of_or_within | GO:0006626 | protein targeting to mitochondrion | False | False | {GO:0070585, GO:0051234, GO:0006605, GO:007072... | {GO:0070585, GO:0070727, GO:0033365, BFO:00000... | IMP | False | None | PMID:30213914 | [PMID:30213914] | 2024-11-03 | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
105624 | SGD:S000006483 | RDN18-2 | located_in | GO:0005840 | ribosome | False | False | {GO:0110165, BFO:0000002, GO:0043229, CL:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IEA | False | None | PMID:30502926 | [PMID:30502926] | 2024-11-03 | True |
105625 | SGD:S000006502 | SNR42 | located_in | GO:0005730 | nucleolus | False | False | {GO:0043233, GO:0043231, GO:0110165, BFO:00000... | {CARO:0030000, GO:0005634, CARO:0000000, BFO:0... | IEA | False | None | PMID:30502926 | [PMID:30502926] | 2024-11-03 | True |
105627 | SGD:S000007300 | SNR36 | located_in | GO:0005730 | nucleolus | False | False | {GO:0043233, GO:0043231, GO:0110165, BFO:00000... | {CARO:0030000, GO:0005634, CARO:0000000, BFO:0... | IEA | False | None | PMID:30502926 | [PMID:30502926] | 2024-11-03 | True |
105628 | SGD:S000006484 | RDN25-1 | located_in | GO:0005840 | ribosome | False | False | {GO:0110165, BFO:0000002, GO:0043229, CL:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IEA | False | None | PMID:30502926 | [PMID:30502926] | 2024-11-03 | True |
105629 | SGD:S000006485 | RDN25-2 | located_in | GO:0005840 | ribosome | False | False | {GO:0110165, BFO:0000002, GO:0043229, CL:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IEA | False | None | PMID:30502926 | [PMID:30502926] | 2024-11-03 | True |
34441 rows × 16 columns
[303]:
def annotate_new_pubs(assocs: pd.DataFrame, prev_assocs: pd.DataFrame) -> None:
    """
    Flag associations in ``assocs`` that do not occur in ``prev_assocs``.

    Three boolean columns are added to ``assocs`` in place:

    - ``pmid_new``: the row's ``pmid`` does not occur in ``prev_assocs['pmid']``
    - ``is_new``: the row's ``(subject, object)`` pair does not occur in ``prev_assocs``
    - ``fresh``: both ``pmid_new`` and ``is_new`` hold

    :param assocs: associations to annotate (needs ``subject``, ``object`` and
        ``pmid`` columns); modified in place
    :param prev_assocs: associations to compare against; not modified
    """
    # Only membership is tested, so plain sets are sufficient.
    prev_pmids = set(prev_assocs['pmid'])
    prev_pairs = set(zip(prev_assocs['subject'], prev_assocs['object']))

    # The flags are built from column iterators instead of DataFrame.apply(axis=1):
    # row-wise apply does not yield a boolean Series for an empty frame (which
    # breaks the column assignment) and is very slow on large frames.
    assocs['pmid_new'] = pd.Series(
        [pmid not in prev_pmids for pmid in assocs['pmid']],
        index=assocs.index,
        dtype=bool,
    )
    assocs['is_new'] = pd.Series(
        [pair not in prev_pairs for pair in zip(assocs['subject'], assocs['object'])],
        index=assocs.index,
        dtype=bool,
    )
    assocs['fresh'] = assocs['pmid_new'] & assocs['is_new']
[304]:
def annotate_redacted_pubs(assocs: pd.DataFrame, prev_assocs: pd.DataFrame) -> None:
    """
    Flag associations in ``prev_assocs`` that no longer occur in ``assocs``.

    Three boolean columns are added to ``prev_assocs`` in place:

    - ``pmid_removed``: the row's ``pmid`` does not occur in ``assocs['pmid']``
    - ``unique``: the row's ``(subject, object)`` pair does not occur in ``assocs``
    - ``redacted``: both ``pmid_removed`` and ``unique`` hold

    :param assocs: the associations to compare against (the latest release);
        not modified
    :param prev_assocs: older associations to annotate (needs ``subject``,
        ``object`` and ``pmid`` columns); modified in place
    """
    # Only membership is tested, so plain sets are sufficient.
    latest_pmids = set(assocs['pmid'])
    latest_pairs = set(zip(assocs['subject'], assocs['object']))

    # The flags are built from column iterators instead of DataFrame.apply(axis=1):
    # row-wise apply does not yield a boolean Series for an empty frame (which
    # breaks the column assignment) and is very slow on large frames.
    prev_assocs['pmid_removed'] = pd.Series(
        [pmid not in latest_pmids for pmid in prev_assocs['pmid']],
        index=prev_assocs.index,
        dtype=bool,
    )
    prev_assocs['unique'] = pd.Series(
        [
            pair not in latest_pairs
            for pair in zip(prev_assocs['subject'], prev_assocs['object'])
        ],
        index=prev_assocs.index,
        dtype=bool,
    )
    prev_assocs['redacted'] = prev_assocs['pmid_removed'] & prev_assocs['unique']
[305]:
annotate_redacted_pubs(df, prev_df)
[306]:
#prev_df['redacted'] = prev_df['pmid_removed'] & prev_df['unique']
[ ]:
[307]:
#list(set(df['subject']))[:5]
[308]:
#iba_df = df[df['is_iba']]
#iba_subjects = set(iba_df['subject'])
#list(iba_subjects)[:5]
[309]:
def set_iba_status(assocs: pd.DataFrame) -> None:
    """
    Add a boolean ``iba_rejected`` column to ``assocs`` in place.

    A row is flagged when all of the following hold:

    - the row itself is not an IBA row (``is_iba`` is falsy)
    - its ``subject`` has at least one IBA row in ``assocs``
    - its ``object`` is not contained in the union of ``object_closure``
      over that subject's IBA rows

    :param assocs: associations with ``subject``, ``object``, ``object_closure``
        and ``is_iba`` columns; modified in place
    """
    # Per subject, the union of the closures of all of its IBA rows. A fresh set
    # is created per subject so the sets stored in the frame are never mutated,
    # and update() accepts any iterable closure, not only ``set`` instances.
    iba_closure = {}
    for subject, closure, is_iba in zip(
        assocs['subject'], assocs['object_closure'], assocs['is_iba']
    ):
        if is_iba:
            iba_closure.setdefault(subject, set()).update(closure)

    def is_rejected(subject, obj, is_iba) -> bool:
        if is_iba:
            return False
        covered = iba_closure.get(subject)
        # Subjects without any IBA row are never flagged.
        return covered is not None and obj not in covered

    # Built from column iterators rather than DataFrame.apply(axis=1), which does
    # not return a boolean Series for an empty frame.
    assocs['iba_rejected'] = pd.Series(
        [
            is_rejected(subject, obj, is_iba)
            for subject, obj, is_iba in zip(
                assocs['subject'], assocs['object'], assocs['is_iba']
            )
        ],
        index=assocs.index,
        dtype=bool,
    )
#set_iba_status(df)
[ ]:
[310]:
human_df
[310]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | is_iba | negated | pmid | pubs | release | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0003723 | RNA binding | False | True | {GO:0097159, GO:0003723, BFO:0000015, GO:00036... | {GO:0097159, BFO:0000015, GO:0003674, GO:00036... | IEA | False | None | None | [GO_REF:0000043] | 2024-11-03 |
1 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0005515 | protein binding | False | True | {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... | {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... | IPI | False | None | PMID:33961781 | [PMID:33961781] | 2024-11-03 |
2 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0046872 | metal ion binding | False | True | {GO:0043169, GO:0046872, BFO:0000015, GO:00360... | {GO:0043169, BFO:0000015, GO:0036094, GO:00431... | IEA | False | None | None | [GO_REF:0000043] | 2024-11-03 |
3 | UniProtKB:A0A024RBG1 | NUDT4B | located_in | GO:0005829 | cytosol | False | False | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IDA | False | None | None | [GO_REF:0000052] | 2024-11-03 |
4 | UniProtKB:A0A075B6H5 | TRBV20OR9-2 | involved_in | GO:0002376 | immune system process | False | False | {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... | {BFO:0000015, GO:0008150, BFO:0000003} | IEA | False | None | None | [GO_REF:0000043] | 2024-11-03 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
782818 | UniProtKB:Q9NZC2 | TREM2 | involved_in | GO:0045088 | regulation of innate immune response | False | False | {GO:0065007, GO:0002682, GO:0050776, BFO:00000... | {GO:0065007, GO:0002682, GO:0050776, BFO:00000... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
782819 | UniProtKB:Q9Y2K2 | SIK3 | enables | GO:0050321 | tau-protein kinase activity | False | False | {GO:0140096, GO:0003824, GO:0004674, GO:001674... | {GO:0140096, GO:0003824, GO:0004674, GO:001674... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
782820 | UniProtKB:P43235 | CTSK | involved_in | GO:0051603 | proteolysis involved in protein catabolic process | False | False | {GO:0043170, GO:0006508, GO:0044238, GO:000905... | {GO:0043170, GO:0006508, GO:0044238, GO:000905... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
782821 | UniProtKB:Q07343 | PDE4B | enables | GO:0047555 | 3',5'-cyclic-GMP phosphodiesterase activity | False | False | {GO:0047555, GO:0003824, GO:0008081, BFO:00000... | {GO:0003824, GO:0008081, BFO:0000015, GO:00425... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
782822 | UniProtKB:A6NC42 | DPPA5 | involved_in | GO:0010468 | regulation of gene expression | False | False | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | IBA | True | None | None | [GO_REF:0000033] | 2024-11-03 |
782823 rows × 15 columns
[311]:
set_iba_status(test_df)
[312]:
test_df['iba_rejected'].unique()
[312]:
array([False, True])
[313]:
#set_iba_status(df)
[ ]:
[314]:
def synthesize(grp: str) -> pd.DataFrame:
    """
    Run the annotation steps defined above for one annotation group.

    The LATEST release of ``grp`` is flagged for freshness against the first
    release in PREVIOUS, compared against all PREVIOUS releases for redacted
    rows, and then combined with those redacted rows before the ``redundant``
    and ``iba_rejected`` flags are set.

    :param grp: key into ``db`` selecting the annotation group (e.g. "goa_human")
    :return: the latest associations followed by the redacted older associations
    """
    latest_df = assocs_to_df(db[grp][LATEST], LATEST)
    older_dfs = [assocs_to_df(db[grp][release], release) for release in PREVIOUS]

    # Freshness is judged against the first entry of PREVIOUS only.
    annotate_new_pubs(latest_df, older_dfs[0])

    # All older releases are stacked as-is; duplicates are not dropped.
    all_older = pd.concat(older_dfs)

    repair_assocs_df(latest_df)
    repair_assocs_df(all_older)

    # Gene-level table, derived from the latest release only.
    genes = create_gene_df(latest_df)

    annotate_redacted_pubs(latest_df, all_older)
    redacted_rows = all_older[all_older['redacted']]
    combined = pd.concat([latest_df, redacted_rows])

    set_redundant_flag(combined, genes)
    set_iba_status(combined)
    return combined
[315]:
# Build the combined, flagged association table for the "goa_human" group.
new_human = synthesize("goa_human")
Multiple IDs for 64 labels
[('AKAP7', ['UniProtKB:O43687', 'UniProtKB:Q9P0M2']), ('ARHGEF18', ['UniProtKB:A0A590UK10', 'UniProtKB:Q6ZSZ5']), ('BBC3', ['UniProtKB:Q9BXH1', 'UniProtKB:Q96PG8']), ('CALCA', ['UniProtKB:P06881', 'UniProtKB:P01258']), ('CDKN2A', ['UniProtKB:Q8N726', 'UniProtKB:P42771'])]
Multiple IDs for 100 labels
[('AKAP7', ['UniProtKB:O43687', 'UniProtKB:Q9P0M2']), ('AMY1A', ['UniProtKB:P04745', 'UniProtKB:P0DUB6']), ('ARHGEF18', ['UniProtKB:A0A590UK10', 'UniProtKB:Q6ZSZ5']), ('ASIC5', ['UniProtKB:A0A0G2JLG4', 'UniProtKB:Q9NY37']), ('ATP6AP2', ['UniProtKB:O75787', 'UniProtKB:A0A1C7CYW4'])]
[316]:
new_human
[316]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | ... | pubs | release | pmid_new | is_new | fresh | pmid_removed | unique | redacted | redundant | iba_rejected | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0003723 | RNA binding | False | True | {GO:0097159, GO:0003723, BFO:0000015, GO:00036... | {GO:0097159, BFO:0000015, GO:0003674, GO:00036... | IEA | ... | [GO_REF:0000043] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | True |
1 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0005515 | protein binding | False | True | {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... | {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... | IPI | ... | [PMID:33961781] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | True |
2 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0046872 | metal ion binding | False | True | {GO:0043169, GO:0046872, BFO:0000015, GO:00360... | {GO:0043169, BFO:0000015, GO:0036094, GO:00431... | IEA | ... | [GO_REF:0000043] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | True |
3 | UniProtKB:A0A024RBG1 | NUDT4B | located_in | GO:0005829 | cytosol | False | False | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IDA | ... | [GO_REF:0000052] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | True |
4 | UniProtKB:A0A075B6H5 | TRBV20OR9-2 | involved_in | GO:0002376 | immune system process | False | False | {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... | {BFO:0000015, GO:0008150, BFO:0000003} | IEA | ... | [GO_REF:0000043] | 2024-11-03 | False | True | False | NaN | NaN | NaN | False | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
434570 | UniProtKB:Q9Y6A4 | CFAP20 | None | GO:0007275 | multicellular organism development | False | True | {GO:0032501, BFO:0000015, GO:0048856, GO:00325... | {GO:0032501, BFO:0000015, GO:0048856, GO:00325... | TAS | ... | [PMID:8688464] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
435058 | UniProtKB:Q9Y6F1 | PARP3 | None | GO:0006281 | DNA repair | False | False | {GO:0043170, GO:0033554, GO:0008152, GO:000613... | {GO:0043170, GO:0008152, BFO:0000015, GO:00099... | TAS | ... | [PMID:7260241] | 2020-01-01 | NaN | NaN | NaN | True | True | True | True | False |
436209 | UniProtKB:Q9Y6Q9 | NCOA3 | None | GO:0000981 | DNA-binding transcription factor activity, RNA... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00192... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | ISM | ... | [PMID:19274049] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
436550 | UniProtKB:Q9Y6X0 | SETBP1 | None | GO:0000981 | DNA-binding transcription factor activity, RNA... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00192... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | ISM | ... | [PMID:19274049] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
436732 | UniProtKB:Q9Y6Y1 | CAMTA1 | None | GO:0000981 | DNA-binding transcription factor activity, RNA... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00192... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | ISM | ... | [PMID:19274049] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
783918 rows × 23 columns
[317]:
new_human[new_human['iba_rejected']]
[317]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | ... | pubs | release | pmid_new | is_new | fresh | pmid_removed | unique | redacted | redundant | iba_rejected | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0003723 | RNA binding | False | True | {GO:0097159, GO:0003723, BFO:0000015, GO:00036... | {GO:0097159, BFO:0000015, GO:0003674, GO:00036... | IEA | ... | [GO_REF:0000043] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | True |
1 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0005515 | protein binding | False | True | {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... | {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... | IPI | ... | [PMID:33961781] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | True |
2 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0046872 | metal ion binding | False | True | {GO:0043169, GO:0046872, BFO:0000015, GO:00360... | {GO:0043169, BFO:0000015, GO:0036094, GO:00431... | IEA | ... | [GO_REF:0000043] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | True |
3 | UniProtKB:A0A024RBG1 | NUDT4B | located_in | GO:0005829 | cytosol | False | False | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IDA | ... | [GO_REF:0000052] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | True |
4 | UniProtKB:A0A075B6H5 | TRBV20OR9-2 | involved_in | GO:0002376 | immune system process | False | False | {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... | {BFO:0000015, GO:0008150, BFO:0000003} | IEA | ... | [GO_REF:0000043] | 2024-11-03 | False | True | False | NaN | NaN | NaN | False | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
433680 | UniProtKB:Q9Y5Y6 | ST14 | None | GO:0005887 | None | True | False | {GO:0005887} | {} | TAS | ... | [PMID:10831593] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
434570 | UniProtKB:Q9Y6A4 | CFAP20 | None | GO:0007275 | multicellular organism development | False | True | {GO:0032501, BFO:0000015, GO:0048856, GO:00325... | {GO:0032501, BFO:0000015, GO:0048856, GO:00325... | TAS | ... | [PMID:8688464] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
436209 | UniProtKB:Q9Y6Q9 | NCOA3 | None | GO:0000981 | DNA-binding transcription factor activity, RNA... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00192... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | ISM | ... | [PMID:19274049] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
436550 | UniProtKB:Q9Y6X0 | SETBP1 | None | GO:0000981 | DNA-binding transcription factor activity, RNA... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00192... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | ISM | ... | [PMID:19274049] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
436732 | UniProtKB:Q9Y6Y1 | CAMTA1 | None | GO:0000981 | DNA-binding transcription factor activity, RNA... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00192... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | ISM | ... | [PMID:19274049] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | True |
450057 rows × 23 columns
[318]:
# goa_human associations for PREVIOUS[0], the first release listed after LATEST.
prev_human_df = assocs_to_df(db["goa_human"][PREVIOUS[0]], PREVIOUS[0])
#annotate_new_pubs(human_df, prev_human_df)
[319]:
annotate_new_pubs(human_df, prev_human_df)
[320]:
human_df[human_df['fresh']]
[320]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | is_iba | negated | pmid | pubs | release | pmid_new | is_new | fresh | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | UniProtKB:A0A024RBG1 | NUDT4B | enables | GO:0005515 | protein binding | False | True | {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... | {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... | IPI | False | None | PMID:33961781 | [PMID:33961781] | 2024-11-03 | True | True | True |
357 | UniProtKB:A0A096LP55 | UQCRHL | located_in | GO:0005739 | mitochondrion | False | False | {GO:0005739, GO:0110165, GO:0043231, BFO:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | HTP | False | None | PMID:34800366 | [PMID:34800366] | 2024-11-03 | True | True | True |
569 | UniProtKB:A0A0B4J2F0 | PIGBOS1 | located_in | GO:0005739 | mitochondrion | False | False | {GO:0005739, GO:0110165, GO:0043231, BFO:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | HTP | False | None | PMID:34800366 | [PMID:34800366] | 2024-11-03 | True | True | True |
1904 | UniProtKB:A1A4Y4 | IRGM | enables | GO:1901612 | cardiolipin binding | False | True | {GO:0008289, GO:0043168, GO:1901612, BFO:00000... | {GO:0008289, GO:0043168, BFO:0000015, GO:00360... | IDA | False | None | PMID:21102437 | [PMID:21102437] | 2024-11-03 | True | True | True |
1945 | UniProtKB:A1A4Y4 | IRGM | involved_in | GO:0090141 | positive regulation of mitochondrial fission | False | False | {GO:0010821, GO:0051130, GO:0065007, GO:003304... | {GO:0010821, GO:0051130, GO:0065007, GO:003304... | IDA | False | None | PMID:21102437 | [PMID:21102437] | 2024-11-03 | True | True | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
718365 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | enables | GO:0141180 | dsDNA-RNA triple helix-forming chromatin adapt... | False | True | {GO:0030674, BFO:0000015, GO:0043565, GO:00036... | {GO:0003690, GO:0030674, GO:0003677, GO:007184... | IDA | False | None | PMID:27634931 | [PMID:27634931] | 2024-11-03 | True | True | True |
718366 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0000122 | negative regulation of transcription by RNA po... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00098... | {GO:0045934, GO:0009892, GO:0065007, GO:000988... | IMP | False | None | PMID:27634931 | [PMID:27634931] | 2024-11-03 | True | True | True |
718367 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0000512 | lncRNA-mediated post-transcriptional gene sile... | False | False | {GO:0000512, BFO:0000015, GO:0016441, GO:00081... | {GO:0065007, GO:0009892, GO:0031047, GO:000988... | IDA | False | None | PMID:30720199 | [PMID:30720199] | 2024-11-03 | True | True | True |
718368 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0000512 | lncRNA-mediated post-transcriptional gene sile... | False | False | {GO:0000512, BFO:0000015, GO:0016441, GO:00081... | {GO:0065007, GO:0009892, GO:0031047, GO:000988... | IMP | False | None | PMID:33102210 | [PMID:33102210] | 2024-11-03 | True | True | True |
718370 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0090399 | replicative senescence | False | False | {GO:0008152, BFO:0000015, GO:0090399, GO:00099... | {GO:0008152, BFO:0000015, GO:0009987, GO:00081... | IMP | False | None | PMID:27634931 | [PMID:27634931] | 2024-11-03 | True | True | True |
3380 rows × 18 columns
[321]:
#pair_assocs, pair_anns = annotate_assocs("tair")
[322]:
# Write the annotated human associations to CSV; the DataFrame index is omitted.
new_human.to_csv("output/go-human-assocs-annotated.csv", index=False)
[332]:
# Candidate positive test cases: fresh rows whose term is informative and that are
# not negated, redundant or IBA-rejected. The explicit comparisons against
# True/False are deliberate: several flag columns hold NaN for some rows, so plain
# boolean masks or `~` would not be safe here.
test_cases_df = new_human[
    new_human['fresh'].eq(True)
    & new_human['object_uninformative'].eq(False)
    & new_human['negated'].ne(True)
    & new_human['redundant'].ne(True)
    & new_human['iba_rejected'].ne(True)
]
test_cases_df
[332]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | ... | pubs | release | pmid_new | is_new | fresh | pmid_removed | unique | redacted | redundant | iba_rejected | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4876 | UniProtKB:A6NNL5 | C15orf61 | located_in | GO:0005739 | mitochondrion | False | False | {GO:0005739, GO:0110165, GO:0043231, BFO:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | HTP | ... | [PMID:34800366] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
5853 | UniProtKB:A8MSI8 | LYRM9 | located_in | GO:0005739 | mitochondrion | False | False | {GO:0005739, GO:0110165, GO:0043231, BFO:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | HTP | ... | [PMID:34800366] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
6327 | UniProtKB:A8MXV4 | NUDT19 | located_in | GO:0005739 | mitochondrion | False | False | {GO:0005739, GO:0110165, GO:0043231, BFO:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | HTP | ... | [PMID:34800366] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
15942 | UniProtKB:O14521 | SDHD | part_of | GO:0045273 | respiratory chain complex II (succinate dehydr... | False | False | {GO:0098796, GO:0110165, GO:0045273, BFO:00000... | {CARO:0030000, GO:0098803, GO:0098796, CARO:00... | IDA | ... | [PMID:37098072] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
34482 | UniProtKB:O43325 | LYRM1 | located_in | GO:0005739 | mitochondrion | False | False | {GO:0005739, GO:0110165, GO:0043231, BFO:00000... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | HTP | ... | [PMID:34800366] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
718362 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | acts_upstream_of | GO:0008284 | positive regulation of cell population prolife... | False | False | {GO:0065007, GO:0042127, GO:0048518, BFO:00000... | {GO:0065007, GO:0042127, GO:0048518, BFO:00000... | IMP | ... | [PMID:33102210] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
718366 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0000122 | negative regulation of transcription by RNA po... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00098... | {GO:0045934, GO:0009892, GO:0065007, GO:000988... | IMP | ... | [PMID:27634931] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
718367 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0000512 | lncRNA-mediated post-transcriptional gene sile... | False | False | {GO:0000512, BFO:0000015, GO:0016441, GO:00081... | {GO:0065007, GO:0009892, GO:0031047, GO:000988... | IDA | ... | [PMID:30720199] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
718368 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0000512 | lncRNA-mediated post-transcriptional gene sile... | False | False | {GO:0000512, BFO:0000015, GO:0016441, GO:00081... | {GO:0065007, GO:0009892, GO:0031047, GO:000988... | IMP | ... | [PMID:33102210] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
718370 | RNAcentral:URS00026A23F2_9606 | URS00026A23F2_9606 | involved_in | GO:0090399 | replicative senescence | False | False | {GO:0008152, BFO:0000015, GO:0090399, GO:00099... | {GO:0008152, BFO:0000015, GO:0009987, GO:00081... | IMP | ... | [PMID:27634931] | 2024-11-03 | True | True | True | NaN | NaN | NaN | False | False |
524 rows × 23 columns
[350]:
def row_to_test_case(row, answer="YES"):
    """
    Build one test case from an association row.

    :param row: mapping-like row (a dict or a DataFrame row) providing
        ``subject``, ``subject_label``, ``predicate``, ``object`` and
        ``object_label``
    :param answer: value stored under the ``ideal`` key
    :return: the test-case dict, or None when the gene label, term label or
        predicate is missing or empty
    """
    gene = row['subject_label']
    term = row['object_label']
    predicate = row['predicate']
    # NaN is truthy, so a bare truthiness test would let values that pandas
    # stores as NaN through and yield inputs such as "nan part_of nan".
    if any(pd.isna(value) or not value for value in (gene, term, predicate)):
        return None
    return {
        "input": f"{gene} {predicate} {term}",
        "original_input": {
            "subject": str(row['subject']),
            "predicate": str(row['predicate']),
            "object": str(row['object']),
        },
        "ideal": answer,
    }
def df_to_test_cases(df: pd.DataFrame, limit=1000):
    """
    Convert the rows of ``df`` into a list of test cases.

    Rows for which ``row_to_test_case`` returns None are skipped.

    :param df: association rows to convert
    :param limit: slice bound applied to the resulting list; a falsy value
        (None or 0) returns every case
    :return: list of test-case dicts
    """
    collected = []
    for _, record in df.iterrows():
        case = row_to_test_case(record)
        if case is not None:
            collected.append(case)
    return collected[:limit] if limit else collected
df_to_test_cases(test_cases_df, limit=5)
[350]:
[{'input': 'C15orf61 located_in mitochondrion',
'original_input': {'subject': 'UniProtKB:A6NNL5',
'predicate': 'located_in',
'object': 'GO:0005739'},
'ideal': 'YES'},
{'input': 'LYRM9 located_in mitochondrion',
'original_input': {'subject': 'UniProtKB:A8MSI8',
'predicate': 'located_in',
'object': 'GO:0005739'},
'ideal': 'YES'},
{'input': 'NUDT19 located_in mitochondrion',
'original_input': {'subject': 'UniProtKB:A8MXV4',
'predicate': 'located_in',
'object': 'GO:0005739'},
'ideal': 'YES'},
{'input': 'SDHD part_of respiratory chain complex II (succinate dehydrogenase)',
'original_input': {'subject': 'UniProtKB:O14521',
'predicate': 'part_of',
'object': 'GO:0045273'},
'ideal': 'YES'},
{'input': 'LYRM1 located_in mitochondrion',
'original_input': {'subject': 'UniProtKB:O43325',
'predicate': 'located_in',
'object': 'GO:0005739'},
'ideal': 'YES'}]
[341]:
#cases = [row_to_test_case(row) for _, row in test_cases_df.iterrows()]
[343]:
import os

import yaml

# Build the cases here instead of relying on a `cases` variable left in the
# kernel by the (now commented-out) cell above, so the notebook also works on
# a clean top-to-bottom run. df_to_test_cases drops rows that cannot be
# converted (None), and limit=None keeps every remaining case.
cases = df_to_test_cases(test_cases_df, limit=None)
# open() does not create missing parent directories.
os.makedirs("output", exist_ok=True)
with open("output/test-cases.yaml", "w", encoding="utf-8") as f:
    yaml.dump({"cases": cases}, f, sort_keys=False)
[345]:
# Rows of new_human flagged `redacted` whose GO term is not marked
# uninformative, excluding rows flagged negated, redundant or IBA-rejected.
# The exclusions compare against True (not a boolean negation), so rows where
# a flag is NaN are kept by the `ne(True)` conditions.
redacted_df = new_human.loc[
    new_human['redacted'].eq(True)
    & new_human['object_uninformative'].eq(False)
    & new_human['negated'].ne(True)
    & new_human['redundant'].ne(True)
    & new_human['iba_rejected'].ne(True)
]
redacted_df
[345]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | ... | pubs | release | pmid_new | is_new | fresh | pmid_removed | unique | redacted | redundant | iba_rejected | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
401144 | UniProtKB:Q8N6R0 | METTL13 | involved_in | GO:0000122 | negative regulation of transcription by RNA po... | False | False | {BFO:0000015, GO:0006357, GO:0008150, GO:00098... | {GO:0045934, GO:0009892, GO:0065007, GO:000988... | IMP | ... | [PMID:26763933] | 2024-06-10 | NaN | NaN | NaN | True | True | True | False | False |
401146 | UniProtKB:Q8N6R0 | METTL13 | involved_in | GO:1902807 | negative regulation of cell cycle G1/S phase t... | False | False | {GO:0065007, GO:1902806, GO:0010948, GO:190198... | {GO:0065007, GO:0010948, GO:1901988, GO:005172... | IMP | ... | [PMID:26763933] | 2024-06-10 | NaN | NaN | NaN | True | True | True | False | False |
453842 | UniProtKB:Q96K19 | RNF170 | involved_in | GO:0034140 | negative regulation of toll-like receptor 3 si... | False | False | {GO:0048585, GO:0062207, GO:1902532, GO:000996... | {GO:0048585, GO:0065007, GO:0002682, GO:006220... | IDA | ... | [PMID:31076723] | 2024-06-10 | NaN | NaN | NaN | True | True | True | False | False |
587451 | RNAcentral:URS0000083D87_9606 | URS0000083D87_9606 | involved_in | GO:0035195 | miRNA-mediated post-transcriptional gene silen... | False | False | {BFO:0000015, GO:0016441, GO:0008150, GO:00106... | {GO:0065007, GO:0009892, GO:0031047, GO:000988... | IDA | ... | [PMID:28640956] | 2024-06-10 | NaN | NaN | NaN | True | True | True | False | False |
587452 | RNAcentral:URS0000083D87_9606 | URS0000083D87_9606 | involved_in | GO:0090051 | negative regulation of cell migration involved... | False | False | {GO:0030336, BFO:0000015, GO:0008150, GO:00105... | {GO:0065007, GO:0030336, BFO:0000015, GO:00485... | IGI | ... | [PMID:28640956] | 2024-06-10 | NaN | NaN | NaN | True | True | True | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
424250 | UniProtKB:Q9Y226 | SLC22A13 | None | GO:0015695 | organic cation transport | False | False | {GO:0051234, GO:0006810, BFO:0000015, GO:00156... | {GO:0051234, GO:0006810, BFO:0000015, GO:00511... | NAS | ... | [PMID:10072596] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | False |
425053 | UniProtKB:Q9Y267 | SLC22A14 | None | GO:0005887 | None | True | False | {GO:0005887} | {} | NAS | ... | [PMID:10072596] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | False |
425054 | UniProtKB:Q9Y267 | SLC22A14 | None | GO:0015101 | organic cation transmembrane transporter activity | False | False | {GO:0051234, GO:0055085, GO:0006810, BFO:00000... | {GO:0051234, GO:0055085, GO:0006810, BFO:00000... | NAS | ... | [PMID:10072596] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | False |
425055 | UniProtKB:Q9Y267 | SLC22A14 | None | GO:0015695 | organic cation transport | False | False | {GO:0051234, GO:0006810, BFO:0000015, GO:00156... | {GO:0051234, GO:0006810, BFO:0000015, GO:00511... | NAS | ... | [PMID:10072596] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | False |
432681 | UniProtKB:Q9Y5M6 | OCLM | None | GO:0007601 | visual perception | False | False | {GO:0032501, GO:0050953, GO:0003008, BFO:00000... | {GO:0032501, GO:0050953, GO:0003008, BFO:00000... | TAS | ... | [PMID:10362512] | 2020-01-01 | NaN | NaN | NaN | True | True | True | False | False |
83 rows × 23 columns
[346]:
import os

# Go through df_to_test_cases so rows that row_to_test_case rejects (it
# returns None, e.g. rows whose predicate or object label is None) are dropped
# instead of being written to the YAML as null entries. limit=None keeps all
# remaining cases.
# NOTE(review): these cases receive the default ideal "YES" even though the
# rows are flagged as redacted -- confirm whether "NO" is the intended answer.
cases = df_to_test_cases(redacted_df, limit=None)
# open() does not create missing parent directories.
os.makedirs("output", exist_ok=True)
with open("output/test-cases-redacted.yaml", "w", encoding="utf-8") as f:
    yaml.dump({"cases": cases}, f, sort_keys=False)
[348]:
# Rows of new_human flagged `is_iba` whose GO term is not marked
# uninformative, excluding rows flagged negated or redundant. Comparing with
# `ne(True)` keeps rows where those flags are NaN.
iba_df = new_human.loc[
    new_human['is_iba'].eq(True)
    & new_human['object_uninformative'].eq(False)
    & new_human['negated'].ne(True)
    & new_human['redundant'].ne(True)
]
iba_df
[348]:
subject | subject_label | predicate | object | object_label | object_obsoletes | object_uninformative | object_closure | object_closure_redundant | evidence | ... | pubs | release | pmid_new | is_new | fresh | pmid_removed | unique | redacted | redundant | iba_rejected | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
718564 | UniProtKB:Q06418 | TYRO3 | enables | GO:0004714 | transmembrane receptor protein tyrosine kinase... | False | False | {GO:0019199, GO:0140096, GO:0003824, GO:001674... | {GO:0019199, GO:0003824, BFO:0000015, GO:00167... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
718565 | UniProtKB:P78559 | MAP1A | is_active_in | GO:0030425 | dendrite | False | False | {GO:0030425, GO:0120025, GO:0043005, GO:011016... | {CARO:0030000, CL:0002319, CL:0000211, UBERON:... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
718566 | UniProtKB:Q7L1W4 | LRRC8D | is_active_in | GO:0005737 | cytoplasm | False | False | {CARO:0030000, UBERON:0000061, CARO:0000003, G... | {CARO:0030000, UBERON:0000061, CARO:0000003, C... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
718567 | UniProtKB:A3QJZ7 | PRAMEF27 | part_of | GO:0031462 | Cul2-RING ubiquitin ligase complex | False | False | {GO:0031462, GO:0031461, GO:1990234, BFO:00000... | {GO:0031461, GO:1990234, BFO:0000004, GO:01405... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | True | False | NaN | NaN | NaN | False | False |
718568 | UniProtKB:Q70IA6 | MOB2 | is_active_in | GO:0005634 | nucleus | False | False | {GO:0110165, GO:0043231, BFO:0000002, GO:00432... | {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
782818 | UniProtKB:Q9NZC2 | TREM2 | involved_in | GO:0045088 | regulation of innate immune response | False | False | {GO:0065007, GO:0002682, GO:0050776, BFO:00000... | {GO:0065007, GO:0002682, GO:0050776, BFO:00000... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
782819 | UniProtKB:Q9Y2K2 | SIK3 | enables | GO:0050321 | tau-protein kinase activity | False | False | {GO:0140096, GO:0003824, GO:0004674, GO:001674... | {GO:0140096, GO:0003824, GO:0004674, GO:001674... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
782820 | UniProtKB:P43235 | CTSK | involved_in | GO:0051603 | proteolysis involved in protein catabolic process | False | False | {GO:0043170, GO:0006508, GO:0044238, GO:000905... | {GO:0043170, GO:0006508, GO:0044238, GO:000905... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
782821 | UniProtKB:Q07343 | PDE4B | enables | GO:0047555 | 3',5'-cyclic-GMP phosphodiesterase activity | False | False | {GO:0047555, GO:0003824, GO:0008081, BFO:00000... | {GO:0003824, GO:0008081, BFO:0000015, GO:00425... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
782822 | UniProtKB:A6NC42 | DPPA5 | involved_in | GO:0010468 | regulation of gene expression | False | False | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | {GO:0065007, GO:0060255, GO:0009889, BFO:00000... | IBA | ... | [GO_REF:0000033] | 2024-11-03 | False | False | False | NaN | NaN | NaN | False | False |
56487 rows × 23 columns
[351]:
import os

# Export at most 1000 test cases built from the IBA rows.
# open() does not create missing parent directories, so make sure the target
# folder exists; the explicit encoding avoids depending on the locale default.
os.makedirs("output", exist_ok=True)
with open("output/test-cases-iba.yaml", "w", encoding="utf-8") as f:
    yaml.dump({"cases": df_to_test_cases(iba_df, limit=1000)}, f, sort_keys=False)
[ ]: