{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "# GO Annotation Analysis\n", "\n", "Analyzes the associations between genes and GO terms in the Gene Ontology (GO) database.\n", "\n", "In particular, categorizes associations based on:\n", "\n", "- whether they have been \"retracted\" (i.e., removed from the database)\n", "- whether they have been reviewed and accepted or rejected via IBA" ], "id": "e82528c7acc91898" }, { "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-01-16T01:01:52.125658Z", "start_time": "2025-01-16T01:01:52.121736Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "\n", "from tests.test_implementations.test_robot_template import adapter\n", "\n", "# a recent release plus an older one that may have retracted entries\n", "\n", "RELEASES = [\n", " \"2024-11-03\",\n", " \"2024-06-10\",\n", " \"2020-01-01\",\n", "]\n", "\n", "LATEST = RELEASES[0]\n", "PREVIOUS = RELEASES[1:]\n", "assert all(r < LATEST for r in PREVIOUS)" ], "id": "initial_id", "outputs": [], "execution_count": 1 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:01:52.132069Z", "start_time": "2025-01-16T01:01:52.129698Z" } }, "cell_type": "code", "source": "NEW_CUTOFF = \"2024-06-01\"", "id": "49978ca9b485fbb2", "outputs": [], "execution_count": 2 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:01:52.281969Z", "start_time": "2025-01-16T01:01:52.279065Z" } }, "cell_type": "code", "source": [ "# taxa to analyze\n", "\n", "TAXA = [\n", " (\"human\", \"goa_human\", 9606),\n", " (\"Arabidopsis thaliana\", \"tair\", 3702),\n", " (\"yeast\", \"sgd\", 559292),\n", "]" ], "id": "6018e587719b9375", "outputs": [], "execution_count": 3 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:01:52.292074Z", "start_time": "2025-01-16T01:01:52.289520Z" } }, "cell_type": "code", "source": "GAF_URL_TEMPLATE = \"https://release.geneontology.org/{date}/annotations/{name}.gaf.gz\"", "id": "7308ce91009184f6", "outputs": [], "execution_count": 4 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:01:54.402787Z", "start_time": "2025-01-16T01:01:52.338558Z" } }, "cell_type": "code", "source": "from oaklib.datamodels.vocabulary import IS_A, PART_OF", "id": "e5d71d36e84b2d23", "outputs": [], "execution_count": 5 }, { "metadata": {}, "cell_type": "markdown", "source": "## Create an OAK adapter for the GO ontology", "id": "aa0baac1347bdbad" }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:10.701075Z", "start_time": "2025-01-16T01:01:54.406977Z" } }, "cell_type": "code", "source": [ "from oaklib import get_adapter\n", "\n", "go = get_adapter(\"sqlite:obo:go\")" ], "id": "5ec745bd2131923d", "outputs": [ { "data": { "text/plain": [ "Downloading go.db.gz: 0.00B [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "aaf16a6ab071468590f8ec5a8690270e" } }, "metadata": {}, "output_type": "display_data" } ], "execution_count": 6 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:11.205688Z", "start_time": "2025-01-16T01:02:10.709344Z" } }, "cell_type": "code", "source": "obsoletes = set(go.obsoletes())", "id": "2639655a1230be36", "outputs": [], "execution_count": 7 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:11.219520Z", "start_time": "2025-01-16T01:02:11.217763Z" } }, "cell_type": "code", "source": "", "id": "323074c9573b3799", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:12.966565Z", "start_time": "2025-01-16T01:02:11.231132Z" } }, "cell_type": "code", "source": "binding_terms = set(go.descendants(\"GO:0005488\", predicates=[IS_A]))", "id": "a75d36046527380e", "outputs": [], "execution_count": 8 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:13.187408Z", "start_time": "2025-01-16T01:02:12.977295Z" } }, "cell_type": "code", "source": [ "antislim_terms = set(go.subset_members(\"gocheck_do_not_annotate\")).union(go.subset_members(\"gocheck_obsoletion_candidate\"))\n", "non_informative = binding_terms.union(antislim_terms)\n" ], "id": "7c29df1790c1eaa", "outputs": [], "execution_count": 9 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T07:06:14.986685Z", "start_time": "2025-01-16T07:06:09.128386Z" } }, "cell_type": "code", "source": [ "mf_terms = set(go.descendants(\"GO:0003674\", predicates=[IS_A]))\n", "bp_terms = set(go.descendants(\"GO:0008150\", predicates=[IS_A]))\n", "cc_terms = set(go.descendants(\"GO:0005575\", predicates=[IS_A]))" ], "id": "ea09b5a5d4b41a41", "outputs": [], "execution_count": 96 }, { "metadata": {}, "cell_type": "markdown", "source": "## Load annotations from the archive", "id": "90bdc6b371566c7d" }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:13.215121Z", "start_time": "2025-01-16T01:02:13.212662Z" } }, "cell_type": "code", "source": [ "from oaklib.parsers import GafAssociationParser\n", "gaf_parser = GafAssociationParser()" ], "id": "1ff5c9430db09c", "outputs": [], "execution_count": 10 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:13.232733Z", "start_time": "2025-01-16T01:02:13.228Z" } }, "cell_type": "code", "source": [ "import requests_cache\n", "\n", "session = requests_cache.CachedSession(\n", " cache_name='gaf_cache',\n", " backend='sqlite', # or 'memory' for in-memory cache\n", " expire_after=24*60*60, # Cache expiration in seconds\n", " allowable_codes=[200], # Only cache successful responses\n", ")" ], "id": "6a60e3aaac287013", "outputs": [], "execution_count": 11 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:19.564028Z", "start_time": "2025-01-16T01:02:19.559273Z" } }, "cell_type": "code", "source": [ "from oaklib.datamodels.association import ParserConfiguration, NegatedAssociation\n", "import io\n", "import gzip\n", "\n", "def get_gaf(release, name):\n", " config = ParserConfiguration(preserve_negated_associations=True)\n", " url = GAF_URL_TEMPLATE.format(date=release, name=name)\n", " # open the URL as a file object using requests\n", " with session.get(url, stream=True) as response:\n", " # Decompress the gzipped content and create a text stream\n", " decompressed = gzip.decompress(response.content)\n", " text_stream = io.TextIOWrapper(io.BytesIO(decompressed))\n", " print(f\"Reading {url} using {config}\")\n", " return list(gaf_parser.parse(text_stream, configuration=config))" ], "id": "e094a1d0a870069f", "outputs": [], "execution_count": 12 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:02:40.209582Z", "start_time": "2025-01-16T01:02:40.207433Z" } }, "cell_type": "code", "source": "", "id": "a17102634f83407", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "markdown", "source": "## Load all annotations into a cache", "id": "1eb89218612607dd" }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:05:30.983720Z", "start_time": "2025-01-16T01:02:42.258853Z" } }, "cell_type": "code", "source": [ "from collections import defaultdict\n", "\n", "db = defaultdict(dict)\n", "for r in RELEASES:\n", " for name, grp, tax_id in TAXA:\n", " print(f\"Loading {r} {name}\")\n", " assocs = get_gaf(r, grp)\n", " print(f\"Loaded {len(assocs)} associations\")\n", " neg_assocs = [x for x in assocs if x.negated]\n", " print(f\" {len(neg_assocs)} negated associations\")\n", " db[grp][r] = assocs" ], "id": "d81f4578bced71a7", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading 2024-11-03 human\n", "Reading https://release.geneontology.org/2024-11-03/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 782823 associations\n", " 1494 negated associations\n", "Loading 2024-11-03 Arabidopsis thaliana\n", "Reading https://release.geneontology.org/2024-11-03/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 235371 associations\n", " 1374 negated associations\n", "Loading 2024-11-03 yeast\n", "Reading https://release.geneontology.org/2024-11-03/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 120823 associations\n", " 6 negated associations\n", "Loading 2024-06-10 human\n", "Reading https://release.geneontology.org/2024-06-10/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 707168 associations\n", " 1308 negated associations\n", "Loading 2024-06-10 Arabidopsis thaliana\n", "Reading https://release.geneontology.org/2024-06-10/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 235504 associations\n", " 1373 negated associations\n", "Loading 2024-06-10 yeast\n", "Reading https://release.geneontology.org/2024-06-10/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 117290 associations\n", " 7 negated associations\n", "Loading 2020-01-01 human\n", "Reading https://release.geneontology.org/2020-01-01/annotations/goa_human.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 495361 associations\n", " 1244 negated associations\n", "Loading 2020-01-01 Arabidopsis thaliana\n", "Reading https://release.geneontology.org/2020-01-01/annotations/tair.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 236821 associations\n", " 1364 negated associations\n", "Loading 2020-01-01 yeast\n", "Reading https://release.geneontology.org/2020-01-01/annotations/sgd.gaf.gz using ParserConfiguration(preserve_negated_associations=True, include_association_attributes=None, primary_knowledge_source=None, aggregator_knowledge_source=None)\n", "Loaded 120916 associations\n", " 28 negated associations\n" ] } ], "execution_count": 13 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:05:31.064502Z", "start_time": "2025-01-16T01:05:31.055034Z" } }, "cell_type": "code", "source": "db[\"goa_human\"][LATEST][0]", "id": "69f2c5dd1f8ff101", "outputs": [ { "data": { "text/plain": [ "Association(subject='UniProtKB:A0A024RBG1', predicate='enables', object='GO:0003723', property_values=[], subject_label='NUDT4B', predicate_label=None, object_label=None, negated=None, publications=['GO_REF:0000043'], evidence_type='IEA', supporting_objects=[], primary_knowledge_source='infores:UniProt', aggregator_knowledge_source=None, subject_closure=[], subject_closure_label=[], object_closure=[], object_closure_label=[], comments=[])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 14 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T01:05:47.777533Z", "start_time": "2025-01-16T01:05:47.546272Z" } }, "cell_type": "code", "source": "len([x for x in db[\"goa_human\"][LATEST] if x.negated])", "id": "846c6083be22e228", "outputs": [ { "data": { "text/plain": [ "1494" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 15 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# reload modules\n", "id": "f3648e6d2ebabe76" }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T02:04:51.451900Z", "start_time": "2025-01-16T02:04:42.035195Z" } }, "cell_type": "code", "source": "%load_ext autoreload\n", "id": "4e24da22a5fb7756", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "execution_count": 30 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T03:00:50.738855Z", "start_time": "2025-01-16T03:00:50.712857Z" } }, "cell_type": "code", "source": "%autoreload 0", "id": "6c73a0e816464015", "outputs": [], "execution_count": 53 }, { "metadata": {}, "cell_type": "markdown", "source": "## Diffs by terms", "id": "9bc1bc750a447e06" }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T02:05:08.920771Z", "start_time": "2025-01-16T02:05:08.917999Z" } }, "cell_type": "code", "source": "", "id": "ec7960ee3ab0d9da", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T02:05:40.040840Z", "start_time": "2025-01-16T02:05:39.993321Z" } }, "cell_type": "code", "source": [ "from oaklib.utilities.associations.association_differ import AssociationDiffer\n", "\n", "\n", "differ = AssociationDiffer(adapter=go)" ], "id": "a349de5baadc2204", "outputs": [], "execution_count": 32 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T02:05:40.073956Z", "start_time": "2025-01-16T02:05:40.049886Z" } }, "cell_type": "code", "source": "cache = {}", "id": "f6e2c4e58aca3bde", "outputs": [], "execution_count": 33 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T03:37:40.874524Z", "start_time": "2025-01-16T03:37:40.870953Z" } }, "cell_type": "code", "source": "len(db[\"goa_human\"].keys())", "id": "b9100eeaecc2d4a2", "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 73 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T03:38:07.798450Z", "start_time": "2025-01-16T03:38:07.794257Z" } }, "cell_type": "code", "source": "list(db[\"goa_human\"].keys())", "id": "1d15de57619b4e7b", "outputs": [ { "data": { "text/plain": [ "['2024-11-03', '2024-06-10', '2020-01-01']" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 74 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T06:51:21.735705Z", "start_time": "2025-01-16T06:51:13.328317Z" } }, "cell_type": "code", "source": [ "#ix = differ.changes_by_terms(db[\"goa_human\"][LATEST], db[\"goa_human\"][PREVIOUS[0]], min_num_entities_changes=10, cache={})\n", "grp = \"sgd\"\n", "ix = differ.changes_by_terms(db[grp][PREVIOUS[0]], db[grp][LATEST], min_num_entities_changes=2, cache={})\n" ], "id": "dea16c8833b92aaf", "outputs": [], "execution_count": 89 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T06:51:21.756133Z", "start_time": "2025-01-16T06:51:21.751127Z" } }, "cell_type": "code", "source": "len(ix)", "id": "1295df44a5e49396", "outputs": [ { "data": { "text/plain": [ "847" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 90 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T06:51:38.169273Z", "start_time": "2025-01-16T06:51:38.161271Z" } }, "cell_type": "code", "source": [ "for k in list(ix.keys())[0:5]:\n", " print(k, go.label(k))" ], "id": "c52a32e82439ffb2", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GO:1904688 regulation of cytoplasmic translational initiation\n", "GO:0170039 proteinogenic amino acid metabolic process\n", "GO:0005980 glycogen catabolic process\n", "GO:0045937 positive regulation of phosphate metabolic process\n", "GO:0042762 regulation of sulfur metabolic process\n" ] } ], "execution_count": 91 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T06:51:54.175669Z", "start_time": "2025-01-16T06:51:54.171142Z" } }, "cell_type": "code", "source": "pubmed_adapter = get_adapter(\"pubmed:\")", "id": "acf63844088ab185", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:eutils._internal.queryservice:No NCBI API key provided; throttling to 3 requests/second; see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/\n" ] } ], "execution_count": 92 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T06:52:10.167483Z", "start_time": "2025-01-16T06:52:10.164456Z" } }, "cell_type": "code", "source": [ "from functools import lru_cache\n", "\n", "\n", "@lru_cache\n", "def pub_date(pmid):\n", " m = pubmed_adapter.entity_metadata_map(pmid)\n", " if m:\n", " return m.get(\"year\")\n" ], "id": "f6b5935b55ce15ca", "outputs": [], "execution_count": 93 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T08:38:03.124949Z", "start_time": "2025-01-16T08:38:03.114051Z" } }, "cell_type": "code", "source": [ "from typing import Optional\n", "from oaklib.utilities.associations.association_differ import TermComparison\n", "\n", "\n", "def score_term_comparison(term: str, ix: TermComparison, max_date_inclusive=None, max_genes=20) -> Optional[dict]:\n", " \"\"\"\n", " We are interested in genes for which all evidence was from previously known.\n", " \n", " :param term: \n", " :param ix: \n", " :return: \n", " \"\"\"\n", " #all_genes = set(ix.old_associations_by_entity.keys()).union(ix.new_associations_by_entity.keys())\n", " #gene_diff = all_genes - set(ix.old_associations_by_entity.keys()).intersection(ix.new_associations_by_entity.keys())\n", " if len(ix.new_associations_by_entity) > max_genes:\n", " return None\n", " new_genes = set(ix.new_associations_by_entity.keys()) - set(ix.old_associations_by_entity.keys())\n", " gene_id_to_label_map = {}\n", " for g, assocs in ix.new_associations_by_entity.items():\n", " gene_id_to_label_map[g] = assocs[0].subject_label\n", " for g, assocs in ix.old_associations_by_entity.items():\n", " gene_id_to_label_map[g] = assocs[0].subject_label\n", " filtered_new_genes = {}\n", " for gene in new_genes:\n", " all_before_cutoff = True\n", " for a in ix.new_associations_by_entity[gene]:\n", " pmids = [x for x in a.publications if x.startswith(\"PMID\")]\n", " if max_date_inclusive is not None:\n", " # print(f\"{term} {gene}, pmids={pmids}\")\n", " if not pmids:\n", " all_before_cutoff = False\n", " else:\n", " pmid = pmids[0]\n", " date = pub_date(pmid)\n", " if date is None or date > max_date_inclusive:\n", " all_before_cutoff = False\n", " if all_before_cutoff:\n", " filtered_new_genes[gene] = ix.new_associations_by_entity[gene]\n", " #gene_id_diff = len(gene_diff)\n", " term_lbl = go.label(term)\n", " if term in mf_terms:\n", " ann_pred = \"that are capable of\"\n", " elif term in bp_terms:\n", " ann_pred = \"involved in\"\n", " elif term in cc_terms:\n", " ann_pred = \"localized to\"\n", " else:\n", " return\n", " def as_genes_list(amap):\n", " return [gene_id_to_label_map[g] for g in amap]\n", " def as_str_list(amap):\n", " return [str(g) for g in amap]\n", " case = {\n", " \"input\": f\"List all genes {ann_pred} {term_lbl}\",\n", " \"ideal\": \"; \".join(as_genes_list(ix.new_associations_by_entity)),\n", " \"original_input\": {\n", " \"term\": str(term),\n", " \"genes_current\": as_str_list(ix.new_associations_by_entity),\n", " \"genes_previous\": as_str_list(ix.old_associations_by_entity),\n", " \"genes_added\": as_genes_list(new_genes),\n", " \"genes_added_prior_to_cutoff\": as_genes_list(filtered_new_genes),\n", " \"num_genes_added_prior_to_cutoff\": len(filtered_new_genes),\n", " \"date_cutoff\": max_date_inclusive,\n", " }\n", " }\n", " return case" ], "id": "8415d4ef59e1c228", "outputs": [], "execution_count": 110 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-16T08:38:20.387476Z", "start_time": "2025-01-16T08:38:20.116376Z" } }, "cell_type": "code", "source": [ "import yaml\n", "\n", "n = 0\n", "for k in list(ix.keys()):\n", " lbl = go.label(k)\n", " if \"regulation\" in lbl:\n", " continue\n", " if \"response to\" in lbl:\n", " continue\n", " case = score_term_comparison(k, ix[k], max_date_inclusive=\"2022\")\n", " if not case:\n", " continue\n", " if case[\"original_input\"][\"num_genes_added_prior_to_cutoff\"] > 2:\n", " print(yaml.dump(case, sort_keys=False))\n", " n += 1\n", " if n > 40:\n", " break" ], "id": "af913da8d3c2a149", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input: List all genes localized to respiratory chain complex IV\n", "ideal: COX6; COX12; COX5B; COX9; COX5A; COX4; COX8; MTC3; COX13; COX7; COX1; COX2;\n", " COX3; COX26; AI4; AI5_ALPHA; AI3\n", "original_input:\n", " term: GO:0045277\n", " genes_current:\n", " - SGD:S000001093\n", " - SGD:S000004028\n", " - SGD:S000001373\n", " - SGD:S000002225\n", " - SGD:S000004997\n", " - SGD:S000003155\n", " - SGD:S000004387\n", " - SGD:S000003195\n", " - SGD:S000003159\n", " - SGD:S000004869\n", " - SGD:S000007260\n", " - SGD:S000007281\n", " - SGD:S000007283\n", " - SGD:S000113555\n", " - SGD:S000007264\n", " - SGD:S000007265\n", " - SGD:S000007263\n", " genes_previous:\n", " - SGD:S000004387\n", " - SGD:S000004028\n", " - SGD:S000004857\n", " - SGD:S000007260\n", " genes_added:\n", " - COX26\n", " - COX13\n", " - AI4\n", " - COX4\n", " - COX9\n", " - AI3\n", " - MTC3\n", " - COX6\n", " - COX3\n", " - AI5_ALPHA\n", " - COX5A\n", " - COX5B\n", " - COX7\n", " - COX2\n", " genes_added_prior_to_cutoff:\n", " - COX26\n", " - COX9\n", " - COX3\n", " - COX7\n", " - COX2\n", " num_genes_added_prior_to_cutoff: 5\n", " date_cutoff: '2022'\n", "\n", "input: List all genes localized to TTT Hsp90 cochaperone complex\n", "ideal: TTI2; RVB2; TRA1; TTI1; RVB1; ASA1; TEL2\n", "original_input:\n", " term: GO:0110078\n", " genes_current:\n", " - SGD:S000003897\n", " - SGD:S000006156\n", " - SGD:S000001141\n", " - SGD:S000001516\n", " - SGD:S000002598\n", " - SGD:S000006289\n", " - SGD:S000003331\n", " genes_previous:\n", " - SGD:S000003897\n", " genes_added:\n", " - RVB1\n", " - TEL2\n", " - TTI1\n", " - ASA1\n", " - RVB2\n", " - TRA1\n", " genes_added_prior_to_cutoff:\n", " - RVB1\n", " - TEL2\n", " - TTI1\n", " - ASA1\n", " - RVB2\n", " - TRA1\n", " num_genes_added_prior_to_cutoff: 6\n", " date_cutoff: '2022'\n", "\n", "input: List all genes localized to respiratory chain complex III\n", "ideal: COR1; RIP1; QCR6; QCR8; QCR7; QCR9; QCR10; CYT1; QCR2; COB\n", "original_input:\n", " term: GO:0045275\n", " genes_current:\n", " - SGD:S000000141\n", " - SGD:S000000750\n", " - SGD:S000001929\n", " - SGD:S000003702\n", " - SGD:S000002937\n", " - SGD:S000003415\n", " - SGD:S000003529\n", " - SGD:S000005591\n", " - SGD:S000006395\n", " - SGD:S000007270\n", " genes_previous:\n", " - SGD:S000007270\n", " genes_added:\n", " - QCR10\n", " - COR1\n", " - CYT1\n", " - QCR2\n", " - QCR9\n", " - QCR8\n", " - QCR6\n", " - QCR7\n", " - RIP1\n", " genes_added_prior_to_cutoff:\n", " - QCR10\n", " - COR1\n", " - QCR2\n", " num_genes_added_prior_to_cutoff: 3\n", " date_cutoff: '2022'\n", "\n", "input: List all genes that are capable of alpha-1,4-glucosidase activity\n", "ideal: MAL62; MAL42; MAL22; MAL32; GTB1; MAL12; IMA1; IMA2; IMA3; IMA4; IMA5\n", "original_input:\n", " term: GO:0004558\n", " genes_current:\n", " - SGD:S000029690\n", " - SGD:S000029687\n", " - SGD:S000029682\n", " - SGD:S000000503\n", " - SGD:S000002629\n", " - SGD:S000003524\n", " - SGD:S000003519\n", " - SGD:S000005517\n", " - SGD:S000001434\n", " - SGD:S000003757\n", " - SGD:S000003752\n", " genes_previous:\n", " - SGD:S000002629\n", " genes_added:\n", " - MAL42\n", " - IMA1\n", " - IMA3\n", " - IMA4\n", " - IMA5\n", " - IMA2\n", " - MAL22\n", " - MAL32\n", " - MAL62\n", " - MAL12\n", " genes_added_prior_to_cutoff:\n", " - MAL42\n", " - MAL22\n", " - MAL62\n", " num_genes_added_prior_to_cutoff: 3\n", " date_cutoff: '2022'\n", "\n", "input: List all genes that are capable of G-quadruplex DNA binding\n", "ideal: RAP1; MGS1; SUB1; DNA2; NSR1; VID22; MSS116; XRS2; SLX9; PIF1; MRE11; DBP2;\n", " RAD50; DED1; DBP1; RRM3\n", "original_input:\n", " term: GO:0051880\n", " genes_current:\n", " - SGD:S000005160\n", " - SGD:S000005162\n", " - SGD:S000004642\n", " - SGD:S000001207\n", " - SGD:S000003391\n", " - SGD:S000004365\n", " - SGD:S000002602\n", " - SGD:S000002777\n", " - SGD:S000003313\n", " - SGD:S000004526\n", " - SGD:S000004837\n", " - SGD:S000005056\n", " - SGD:S000005194\n", " - SGD:S000005730\n", " - SGD:S000006040\n", " - SGD:S000001073\n", " genes_previous:\n", " - SGD:S000004526\n", " - SGD:S000005160\n", " - SGD:S000005194\n", " - SGD:S000004837\n", " - SGD:S000002777\n", " - SGD:S000005162\n", " - SGD:S000004642\n", " - SGD:S000001207\n", " - SGD:S000003391\n", " - SGD:S000003313\n", " - SGD:S000001073\n", " - SGD:S000004365\n", " genes_added:\n", " - DBP2\n", " - MSS116\n", " - DED1\n", " - DBP1\n", " genes_added_prior_to_cutoff:\n", " - DBP2\n", " - MSS116\n", " - DED1\n", " - DBP1\n", " num_genes_added_prior_to_cutoff: 4\n", " date_cutoff: '2022'\n", "\n", "input: List all genes that are capable of alpha-glucosidase activity\n", "ideal: MAL62; MAL42; MAL22; ROT2; MAL32; SGA1; IMA4; IMA3; GTB1; CWH41; MAL12; IMA1;\n", " IMA5; GDB1; IMA2; YMR196W; STA1; CPX-417; SUC2\n", "original_input:\n", " term: GO:0090599\n", " genes_current:\n", " - SGD:S000029690\n", " - SGD:S000029687\n", " - SGD:S000029682\n", " - SGD:S000000433\n", " - SGD:S000000503\n", " - SGD:S000001361\n", " - SGD:S000003757\n", " - SGD:S000001434\n", " - SGD:S000002629\n", " - SGD:S000002995\n", " - SGD:S000003524\n", " - SGD:S000003519\n", " - SGD:S000003752\n", " - SGD:S000006388\n", " - SGD:S000005517\n", " - SGD:S000004809\n", " - SGD:S000029522\n", " - SGD:S000217621\n", " - SGD:S000001424\n", " genes_previous:\n", " - SGD:S000001361\n", " - SGD:S000001434\n", " - SGD:S000003757\n", " - SGD:S000000433\n", " - SGD:S000000503\n", " - SGD:S000003752\n", " - SGD:S000002995\n", " - SGD:S000003519\n", " - SGD:S000003524\n", " - SGD:S000002629\n", " - SGD:S000006388\n", " - SGD:S000005517\n", " - SGD:S000004809\n", " - SGD:S000029522\n", " - SGD:S000217621\n", " - SGD:S000001424\n", " genes_added:\n", " - MAL42\n", " - MAL22\n", " - MAL62\n", " genes_added_prior_to_cutoff:\n", " - MAL42\n", " - MAL22\n", " - MAL62\n", " num_genes_added_prior_to_cutoff: 3\n", " date_cutoff: '2022'\n", "\n" ] } ], "execution_count": 111 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "e56adcfa23306224" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "2a6846e7f9900ffd" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "bd2bfcfb2ee8ba69" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "bb928d0ff338adff" }, { "metadata": {}, "cell_type": "markdown", "source": "## OLD ANALYSIS BELOW", "id": "8103fbec8c035bf5" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "f06ea434d4133a25" }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:24:34.655429Z", "start_time": "2025-01-06T01:24:34.650696Z" } }, "cell_type": "code", "source": [ "from functools import lru_cache\n", "\n", "\n", "\n", "@lru_cache\n", "def lineage(t: str):\n", " up = set(go.ancestors(t, predicates=[IS_A, PART_OF], reflexive=True))\n", " dn = set(go.descendants(t, predicates=[IS_A, PART_OF]))\n", " return up.union(dn)\n", "\n", "#len(lineage(\"GO:0005737\"))\n", "\n", "@lru_cache\n", "def ancs(t: str):\n", " return set(go.ancestors(t, predicates=[IS_A, PART_OF], reflexive=True))\n", " " ], "id": "899161951deb3a20", "outputs": [], "execution_count": 289 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:24:48.377611Z", "start_time": "2025-01-06T01:24:48.370515Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "\n", "def pmid(a):\n", " pubs = [p for p in a.publications if p.startswith(\"PMID\")]\n", " if pubs:\n", " if len(pubs) > 1:\n", " raise ValueError(f\"Multiple PMIDs: {pubs}\")\n", " return pubs[0]\n", " return None\n", "\n", "@lru_cache\n", "def lbl(t: str):\n", " return go.label(t)\n", "\n", "def assocs_to_df(assocs: list, release: str):\n", " \n", " df = pd.DataFrame([{\n", " \"subject\": a.subject,\n", " \"subject_label\": a.subject_label,\n", " \"predicate\": a.predicate,\n", " \"object\": a.object,\n", " \"object_label\": lbl(a.object),\n", " \"object_obsoletes\": a.object in obsoletes,\n", " \"object_uninformative\": a.object in non_informative,\n", " \"object_closure\": ancs(a.object),\n", " \"object_closure_redundant\": ancs(a.object) - {a.object},\n", " \"evidence\": a.evidence_type,\n", " \"is_iba\": a.evidence_type == \"IBA\",\n", " \"negated\": a.negated,\n", " \"pmid\": pmid(a),\n", " \"pubs\": a.publications,\n", " \"release\": release,\n", " } for a in assocs])\n", " return df" ], "id": "8aae919788e4f995", "outputs": [], "execution_count": 290 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:29:30.373325Z", "start_time": "2025-01-06T01:25:01.988051Z" } }, "cell_type": "code", "source": [ "human_df = assocs_to_df(db[\"goa_human\"][LATEST], LATEST)\n", "tair_df = assocs_to_df(db[\"tair\"][LATEST], LATEST)\n", "sgd_df = assocs_to_df(db[\"sgd\"][LATEST], LATEST)\n", "df = sgd_df" ], "id": "6a647db06b35a07", "outputs": [], "execution_count": 291 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:21.920341Z", "start_time": "2025-01-06T01:29:44.689297Z" } }, "cell_type": "code", "source": "prev_df = assocs_to_df(db[\"sgd\"][PREVIOUS[-1]], PREVIOUS[-1])", "id": "c14a72ef574e29ce", "outputs": [], "execution_count": 292 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:36.207477Z", "start_time": "2025-01-06T01:30:36.191544Z" } }, "cell_type": "code", "source": "df", "id": "2fbff3c4fea8f6b6", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "0 SGD:S000003381 GPC1 acts_upstream_of_or_within GO:0090640 \n", "1 SGD:S000005701 ALE1 acts_upstream_of_or_within GO:0090640 \n", "2 SGD:S000003381 GPC1 acts_upstream_of_or_within GO:0036151 \n", "3 SGD:S000004492 RCF1 acts_upstream_of_or_within GO:0033617 \n", "4 SGD:S000004977 SIW14 enables GO:0052845 \n", "... ... ... ... ... \n", "120818 SGD:S000003241 SEC9 involved_in GO:0006906 \n", "120819 SGD:S000004826 CEF1 part_of GO:0000974 \n", "120820 SGD:S000002551 MKC7 involved_in GO:0031505 \n", "120821 SGD:S000003008 HEM2 is_active_in GO:0005829 \n", "120822 SGD:S000001122 LAM4 involved_in GO:0032366 \n", "\n", " object_label object_obsoletes \\\n", "0 phosphatidylcholine biosynthesis from sn-glyce... False \n", "1 phosphatidylcholine biosynthesis from sn-glyce... False \n", "2 phosphatidylcholine acyl-chain remodeling False \n", "3 mitochondrial cytochrome c oxidase assembly False \n", "4 inositol-5-diphosphate-1,2,3,4,6-pentakisphosp... False \n", "... ... ... \n", "120818 vesicle fusion False \n", "120819 Prp19 complex False \n", "120820 fungal-type cell wall organization False \n", "120821 cytosol False \n", "120822 intracellular sterol transport False \n", "\n", " object_uninformative \\\n", "0 False \n", "1 False \n", "2 False \n", "3 False \n", "4 False \n", "... ... \n", "120818 False \n", "120819 False \n", "120820 False \n", "120821 False \n", "120822 False \n", "\n", " object_closure \\\n", "0 {GO:0019637, GO:0008152, BFO:0000015, GO:00066... \n", "1 {GO:0019637, GO:0008152, BFO:0000015, GO:00066... \n", "2 {GO:0019637, GO:0008152, GO:0006796, BFO:00000... \n", "3 {GO:0043933, GO:0044085, GO:0065003, GO:001604... \n", "4 {GO:0016817, GO:0016818, GO:0052842, GO:000382... \n", "... ... \n", "120818 {GO:0048284, GO:0051234, GO:0090174, GO:001604... \n", "120819 {GO:0000974, BFO:0000004, BFO:0000040, BFO:000... \n", "120820 {GO:0071554, GO:0016043, GO:0045229, GO:003150... \n", "120821 {CARO:0030000, UBERON:0000061, CARO:0000003, G... \n", "120822 {GO:0006869, GO:0015850, GO:0032365, GO:005123... \n", "\n", " object_closure_redundant evidence is_iba \\\n", "0 {GO:0019637, GO:0008152, GO:0006796, BFO:00000... IGI False \n", "1 {GO:0019637, GO:0008152, GO:0006796, BFO:00000... IGI False \n", "2 {GO:0019637, GO:0008152, GO:0006796, BFO:00000... IMP False \n", "3 {GO:0043933, GO:0044085, GO:0022607, GO:007184... IMP False \n", "4 {GO:0016817, GO:0016818, GO:0052842, GO:000382... IDA False \n", "... ... ... ... \n", "120818 {GO:0061024, BFO:0000015, GO:0009987, GO:00160... IBA True \n", "120819 {BFO:0000004, BFO:0000040, BFO:0000002, GO:003... IBA True \n", "120820 {GO:0071554, GO:0016043, GO:0045229, GO:000998... IBA True \n", "120821 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IBA True \n", "120822 {GO:0032365, GO:0015918, GO:0051649, GO:000998... IBA True \n", "\n", " negated pmid pubs release \n", "0 None PMID:30514764 [PMID:30514764] 2024-11-03 \n", "1 None PMID:30514764 [PMID:30514764] 2024-11-03 \n", "2 None PMID:30514764 [PMID:30514764] 2024-11-03 \n", "3 None PMID:29746825 [PMID:29746825] 2024-11-03 \n", "4 None PMID:26828065 [PMID:26828065] 2024-11-03 \n", "... ... ... ... ... \n", "120818 None None [GO_REF:0000033] 2024-11-03 \n", "120819 None None [GO_REF:0000033] 2024-11-03 \n", "120820 None None [GO_REF:0000033] 2024-11-03 \n", "120821 None None [GO_REF:0000033] 2024-11-03 \n", "120822 None None [GO_REF:0000033] 2024-11-03 \n", "\n", "[120823 rows x 15 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidenceis_ibanegatedpmidpubsrelease
0SGD:S000003381GPC1acts_upstream_of_or_withinGO:0090640phosphatidylcholine biosynthesis from sn-glyce...FalseFalse{GO:0019637, GO:0008152, BFO:0000015, GO:00066...{GO:0019637, GO:0008152, GO:0006796, BFO:00000...IGIFalseNonePMID:30514764[PMID:30514764]2024-11-03
1SGD:S000005701ALE1acts_upstream_of_or_withinGO:0090640phosphatidylcholine biosynthesis from sn-glyce...FalseFalse{GO:0019637, GO:0008152, BFO:0000015, GO:00066...{GO:0019637, GO:0008152, GO:0006796, BFO:00000...IGIFalseNonePMID:30514764[PMID:30514764]2024-11-03
2SGD:S000003381GPC1acts_upstream_of_or_withinGO:0036151phosphatidylcholine acyl-chain remodelingFalseFalse{GO:0019637, GO:0008152, GO:0006796, BFO:00000...{GO:0019637, GO:0008152, GO:0006796, BFO:00000...IMPFalseNonePMID:30514764[PMID:30514764]2024-11-03
3SGD:S000004492RCF1acts_upstream_of_or_withinGO:0033617mitochondrial cytochrome c oxidase assemblyFalseFalse{GO:0043933, GO:0044085, GO:0065003, GO:001604...{GO:0043933, GO:0044085, GO:0022607, GO:007184...IMPFalseNonePMID:29746825[PMID:29746825]2024-11-03
4SGD:S000004977SIW14enablesGO:0052845inositol-5-diphosphate-1,2,3,4,6-pentakisphosp...FalseFalse{GO:0016817, GO:0016818, GO:0052842, GO:000382...{GO:0016817, GO:0016818, GO:0052842, GO:000382...IDAFalseNonePMID:26828065[PMID:26828065]2024-11-03
................................................
120818SGD:S000003241SEC9involved_inGO:0006906vesicle fusionFalseFalse{GO:0048284, GO:0051234, GO:0090174, GO:001604...{GO:0061024, BFO:0000015, GO:0009987, GO:00160...IBATrueNoneNone[GO_REF:0000033]2024-11-03
120819SGD:S000004826CEF1part_ofGO:0000974Prp19 complexFalseFalse{GO:0000974, BFO:0000004, BFO:0000040, BFO:000...{BFO:0000004, BFO:0000040, BFO:0000002, GO:003...IBATrueNoneNone[GO_REF:0000033]2024-11-03
120820SGD:S000002551MKC7involved_inGO:0031505fungal-type cell wall organizationFalseFalse{GO:0071554, GO:0016043, GO:0045229, GO:003150...{GO:0071554, GO:0016043, GO:0045229, GO:000998...IBATrueNoneNone[GO_REF:0000033]2024-11-03
120821SGD:S000003008HEM2is_active_inGO:0005829cytosolFalseFalse{CARO:0030000, UBERON:0000061, CARO:0000003, G...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IBATrueNoneNone[GO_REF:0000033]2024-11-03
120822SGD:S000001122LAM4involved_inGO:0032366intracellular sterol transportFalseFalse{GO:0006869, GO:0015850, GO:0032365, GO:005123...{GO:0032365, GO:0015918, GO:0051649, GO:000998...IBATrueNoneNone[GO_REF:0000033]2024-11-03
\n", "

120823 rows × 15 columns

\n", "
" ] }, "execution_count": 293, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 293 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:50.059359Z", "start_time": "2025-01-06T01:30:50.054865Z" } }, "cell_type": "code", "source": [ "from typing import Dict, List, Optional\n", "\n", "\n", "def repair_assocs_df(assocs: pd.DataFrame):\n", " \"\"\"\n", " Ensures that IDs are normalized.\n", " \n", " :param assocs: \n", " :return: \n", " \"\"\"\n", " # may not be 1:1\n", " subject_label_to_ids: Dict[str, List[str]]\n", " subject_label_to_ids = assocs.groupby(\"subject_label\")[\"subject\"].aggregate(lambda x: list(set(x))).to_dict()\n", " labels_with_multiple_ids = {k: v for k, v in subject_label_to_ids.items() if len(v) > 1}\n", " if labels_with_multiple_ids:\n", " print(f\"Multiple IDs for {len(labels_with_multiple_ids)} labels\")\n", " print(list(labels_with_multiple_ids.items())[:5])\n", " labels_to_canonical = {k: sorted(list(v))[0] for k, v in subject_label_to_ids.items()}\n", " assocs['subject'] = assocs['subject_label'].map(labels_to_canonical)\n", " \n", " " ], "id": "562804d220d9bd23", "outputs": [], "execution_count": 294 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:50.956550Z", "start_time": "2025-01-06T01:30:50.106919Z" } }, "cell_type": "code", "source": [ "test_df = tair_df.copy()\n", "repair_assocs_df(test_df)\n", "test_df" ], "id": "5ce16104b3c731e6", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Multiple IDs for 5258 labels\n", "[('4CL1', ['TAIR:locus:2017602', 'AGI_LocusCode:AT1G51680']), ('4CL2', ['TAIR:locus:2094716', 'AGI_LocusCode:AT3G21240']), ('4CL3', ['TAIR:locus:2015003', 'AGI_LocusCode:AT1G65060']), ('AAC1', ['TAIR:locus:2077778', 'AGI_LocusCode:AT3G08580']), ('AAC2', ['AGI_LocusCode:AT5G13490', 'TAIR:locus:2185041'])]\n" ] }, { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "0 AGI_LocusCode:AT1G11880 AT1G11880 enables GO:0000009 \n", "1 AGI_LocusCode:AT1G80420 ATXRCC1 involved_in GO:0000012 \n", "2 AGI_LocusCode:AT1G74030 ENO1 part_of GO:0000015 \n", "3 AGI_LocusCode:AT2G29560 ENOC part_of GO:0000015 \n", "4 AGI_LocusCode:AT2G36530 LOS2 part_of GO:0000015 \n", "... ... ... ... ... \n", "235366 TAIR:locus:2058630 At2g23210 enables GO:0010294 \n", "235367 AGI_LocusCode:AT2G15820 OTP51 involved_in GO:0045292 \n", "235368 TAIR:locus:2143196 At5g15750 involved_in GO:0042274 \n", "235369 AGI_LocusCode:AT4G14730 LFG1 is_active_in GO:0016020 \n", "235370 TAIR:locus:2116525 SD25 enables GO:0004672 \n", "\n", " object_label object_obsoletes \\\n", "0 alpha-1,6-mannosyltransferase activity False \n", "1 single strand break repair False \n", "2 phosphopyruvate hydratase complex False \n", "3 phosphopyruvate hydratase complex False \n", "4 phosphopyruvate hydratase complex False \n", "... ... ... \n", "235366 abscisic acid glucosyltransferase activity False \n", "235367 mRNA cis splicing, via spliceosome False \n", "235368 ribosomal small subunit biogenesis False \n", "235369 membrane False \n", "235370 protein kinase activity False \n", "\n", " object_uninformative \\\n", "0 False \n", "1 False \n", "2 False \n", "3 False \n", "4 False \n", "... ... \n", "235366 False \n", "235367 False \n", "235368 False \n", "235369 False \n", "235370 False \n", "\n", " object_closure \\\n", "0 {GO:0000030, GO:0003824, GO:0016740, BFO:00000... \n", "1 {GO:0043170, GO:0033554, GO:0008152, GO:000613... \n", "2 {GO:0005829, GO:0110165, BFO:0000002, GO:00329... \n", "3 {GO:0005829, GO:0110165, BFO:0000002, GO:00329... \n", "4 {GO:0005829, GO:0110165, BFO:0000002, GO:00329... \n", "... ... \n", "235366 {GO:0035251, GO:0003824, GO:0016740, BFO:00000... \n", "235367 {GO:0006397, GO:0008152, GO:0000375, BFO:00000... \n", "235368 {GO:0042274, GO:0044085, GO:0009987, BFO:00000... \n", "235369 {CARO:0030000, UBERON:0000061, CARO:0000003, G... \n", "235370 {GO:0140096, GO:0003824, GO:0016740, BFO:00000... \n", "\n", " object_closure_redundant evidence is_iba \\\n", "0 {GO:0000030, GO:0003824, GO:0016740, BFO:00000... IEA False \n", "1 {GO:0043170, GO:0008152, BFO:0000015, GO:00099... IEA False \n", "2 {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... IEA False \n", "3 {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... IEA False \n", "4 {CARO:0030000, GO:0005829, BFO:0000004, CARO:0... IEA False \n", "... ... ... ... \n", "235366 {GO:0035251, GO:0003824, GO:0016740, BFO:00000... IBA True \n", "235367 {GO:0009059, GO:0043170, GO:0006397, GO:000815... IBA True \n", "235368 {GO:0044085, GO:0009987, BFO:0000015, GO:00226... IBA True \n", "235369 {CARO:0030000, UBERON:0000061, CARO:0000003, G... IBA True \n", "235370 {GO:0140096, GO:0003824, GO:0016740, BFO:00000... IBA True \n", "\n", " negated pmid pubs release \n", "0 None None [TAIR:AnalysisReference:501756966] 2024-11-03 \n", "1 None None [TAIR:AnalysisReference:501756966] 2024-11-03 \n", "2 None None [TAIR:AnalysisReference:501756966] 2024-11-03 \n", "3 None None [TAIR:AnalysisReference:501756966] 2024-11-03 \n", "4 None None [TAIR:AnalysisReference:501756966] 2024-11-03 \n", "... ... ... ... ... \n", "235366 None None [GO_REF:0000033] 2024-11-03 \n", "235367 None None [GO_REF:0000033] 2024-11-03 \n", "235368 None None [GO_REF:0000033] 2024-11-03 \n", "235369 None None [GO_REF:0000033] 2024-11-03 \n", "235370 None None [GO_REF:0000033] 2024-11-03 \n", "\n", "[235371 rows x 15 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidenceis_ibanegatedpmidpubsrelease
0AGI_LocusCode:AT1G11880AT1G11880enablesGO:0000009alpha-1,6-mannosyltransferase activityFalseFalse{GO:0000030, GO:0003824, GO:0016740, BFO:00000...{GO:0000030, GO:0003824, GO:0016740, BFO:00000...IEAFalseNoneNone[TAIR:AnalysisReference:501756966]2024-11-03
1AGI_LocusCode:AT1G80420ATXRCC1involved_inGO:0000012single strand break repairFalseFalse{GO:0043170, GO:0033554, GO:0008152, GO:000613...{GO:0043170, GO:0008152, BFO:0000015, GO:00099...IEAFalseNoneNone[TAIR:AnalysisReference:501756966]2024-11-03
2AGI_LocusCode:AT1G74030ENO1part_ofGO:0000015phosphopyruvate hydratase complexFalseFalse{GO:0005829, GO:0110165, BFO:0000002, GO:00329...{CARO:0030000, GO:0005829, BFO:0000004, CARO:0...IEAFalseNoneNone[TAIR:AnalysisReference:501756966]2024-11-03
3AGI_LocusCode:AT2G29560ENOCpart_ofGO:0000015phosphopyruvate hydratase complexFalseFalse{GO:0005829, GO:0110165, BFO:0000002, GO:00329...{CARO:0030000, GO:0005829, BFO:0000004, CARO:0...IEAFalseNoneNone[TAIR:AnalysisReference:501756966]2024-11-03
4AGI_LocusCode:AT2G36530LOS2part_ofGO:0000015phosphopyruvate hydratase complexFalseFalse{GO:0005829, GO:0110165, BFO:0000002, GO:00329...{CARO:0030000, GO:0005829, BFO:0000004, CARO:0...IEAFalseNoneNone[TAIR:AnalysisReference:501756966]2024-11-03
................................................
235366TAIR:locus:2058630At2g23210enablesGO:0010294abscisic acid glucosyltransferase activityFalseFalse{GO:0035251, GO:0003824, GO:0016740, BFO:00000...{GO:0035251, GO:0003824, GO:0016740, BFO:00000...IBATrueNoneNone[GO_REF:0000033]2024-11-03
235367AGI_LocusCode:AT2G15820OTP51involved_inGO:0045292mRNA cis splicing, via spliceosomeFalseFalse{GO:0006397, GO:0008152, GO:0000375, BFO:00000...{GO:0009059, GO:0043170, GO:0006397, GO:000815...IBATrueNoneNone[GO_REF:0000033]2024-11-03
235368TAIR:locus:2143196At5g15750involved_inGO:0042274ribosomal small subunit biogenesisFalseFalse{GO:0042274, GO:0044085, GO:0009987, BFO:00000...{GO:0044085, GO:0009987, BFO:0000015, GO:00226...IBATrueNoneNone[GO_REF:0000033]2024-11-03
235369AGI_LocusCode:AT4G14730LFG1is_active_inGO:0016020membraneFalseFalse{CARO:0030000, UBERON:0000061, CARO:0000003, G...{CARO:0030000, UBERON:0000061, CARO:0000003, G...IBATrueNoneNone[GO_REF:0000033]2024-11-03
235370TAIR:locus:2116525SD25enablesGO:0004672protein kinase activityFalseFalse{GO:0140096, GO:0003824, GO:0016740, BFO:00000...{GO:0140096, GO:0003824, GO:0016740, BFO:00000...IBATrueNoneNone[GO_REF:0000033]2024-11-03
\n", "

235371 rows × 15 columns

\n", "
" ] }, "execution_count": 295, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 295 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:51.054962Z", "start_time": "2025-01-06T01:30:50.997657Z" } }, "cell_type": "code", "source": "test_df[test_df['subject_label'] == \"GALT6\"]", "id": "cc2f1ce6ee035902", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate \\\n", "56871 AGI_LocusCode:AT5G62620 GALT6 located_in \n", "56872 AGI_LocusCode:AT5G62620 GALT6 located_in \n", "61144 AGI_LocusCode:AT5G62620 GALT6 involved_in \n", "85487 AGI_LocusCode:AT5G62620 GALT6 enables \n", "165846 AGI_LocusCode:AT5G62620 GALT6 located_in \n", "165851 AGI_LocusCode:AT5G62620 GALT6 involved_in \n", "165856 AGI_LocusCode:AT5G62620 GALT6 involved_in \n", "165858 AGI_LocusCode:AT5G62620 GALT6 acts_upstream_of_or_within \n", "165863 AGI_LocusCode:AT5G62620 GALT6 acts_upstream_of_or_within \n", "165869 AGI_LocusCode:AT5G62620 GALT6 enables \n", "210076 AGI_LocusCode:AT5G62620 GALT6 enables \n", "217996 AGI_LocusCode:AT5G62620 GALT6 is_active_in \n", "\n", " object object_label \\\n", "56871 GO:0005794 Golgi apparatus \n", "56872 GO:0005794 Golgi apparatus \n", "61144 GO:0006486 protein glycosylation \n", "85487 GO:0030246 carbohydrate binding \n", "165846 GO:0005794 Golgi apparatus \n", "165851 GO:0010405 arabinogalactan protein metabolic process \n", "165856 GO:0018258 protein O-linked glycosylation via hydroxyproline \n", "165858 GO:0048354 mucilage biosynthetic process involved in seed... \n", "165863 GO:1900056 negative regulation of leaf senescence \n", "165869 GO:1990714 hydroxyproline O-galactosyltransferase activity \n", "210076 GO:1990714 hydroxyproline O-galactosyltransferase activity \n", "217996 GO:0000139 Golgi membrane \n", "\n", " object_obsoletes object_uninformative \\\n", "56871 False False \n", "56872 False False \n", "61144 False False \n", "85487 False True \n", "165846 False False \n", "165851 False False \n", "165856 False False \n", "165858 False False \n", "165863 False False \n", "165869 False False \n", "210076 False False \n", "217996 False False \n", "\n", " object_closure \\\n", "56871 {GO:0110165, GO:0043231, BFO:0000002, GO:00057... \n", "56872 {GO:0110165, GO:0043231, BFO:0000002, GO:00057... \n", "61144 {GO:0008152, BFO:0000015, GO:0009100, GO:00434... \n", "85487 {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... \n", "165846 {GO:0110165, GO:0043231, BFO:0000002, GO:00057... \n", "165851 {GO:0043170, GO:0044036, GO:0010384, GO:007155... \n", "165856 {GO:0006493, GO:0008152, BFO:0000015, GO:00091... \n", "165858 {GO:0032501, GO:0008152, GO:0048359, BFO:00000... \n", "165863 {GO:0065007, BFO:0000015, GO:1900055, GO:00485... \n", "165869 {GO:0003824, GO:0016740, BFO:0000015, GO:00083... \n", "210076 {GO:0003824, GO:0016740, BFO:0000015, GO:00083... \n", "217996 {GO:0110165, GO:0043231, BFO:0000002, GO:00057... \n", "\n", " object_closure_redundant evidence is_iba \\\n", "56871 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... ISM False \n", "56872 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... ISM False \n", "61144 {GO:0009059, GO:0043170, GO:0070085, GO:000815... IEA False \n", "85487 {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IEA False \n", "165846 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA False \n", "165851 {GO:0043170, GO:0044036, GO:0071554, GO:000815... IMP False \n", "165856 {GO:0009059, GO:0006493, GO:0043170, GO:007008... IDA False \n", "165858 {GO:0032501, GO:0010192, GO:0008152, GO:004835... IMP False \n", "165863 {GO:0065007, BFO:0000015, GO:1900055, GO:00485... IMP False \n", "165869 {GO:0003824, GO:0016740, BFO:0000015, GO:00083... IDA False \n", "210076 {GO:0003824, GO:0016740, BFO:0000015, GO:00083... IBA True \n", "217996 {CARO:0030000, GO:0005794, CARO:0000000, BFO:0... IBA True \n", "\n", " negated pmid pubs \\\n", "56871 None None [TAIR:AnalysisReference:501780126] \n", "56872 None None [TAIR:AnalysisReference:501780126] \n", "61144 None None [TAIR:AnalysisReference:501757242] \n", "85487 None None [TAIR:AnalysisReference:501756966] \n", "165846 None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] \n", "165851 None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] \n", "165856 None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] \n", "165858 None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] \n", "165863 None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] \n", "165869 None PMID:26690932 [TAIR:Publication:501767599, PMID:26690932] \n", "210076 None None [GO_REF:0000033] \n", "217996 None None [GO_REF:0000033] \n", "\n", " release \n", "56871 2024-11-03 \n", "56872 2024-11-03 \n", "61144 2024-11-03 \n", "85487 2024-11-03 \n", "165846 2024-11-03 \n", "165851 2024-11-03 \n", "165856 2024-11-03 \n", "165858 2024-11-03 \n", "165863 2024-11-03 \n", "165869 2024-11-03 \n", "210076 2024-11-03 \n", "217996 2024-11-03 " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidenceis_ibanegatedpmidpubsrelease
56871AGI_LocusCode:AT5G62620GALT6located_inGO:0005794Golgi apparatusFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00057...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...ISMFalseNoneNone[TAIR:AnalysisReference:501780126]2024-11-03
56872AGI_LocusCode:AT5G62620GALT6located_inGO:0005794Golgi apparatusFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00057...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...ISMFalseNoneNone[TAIR:AnalysisReference:501780126]2024-11-03
61144AGI_LocusCode:AT5G62620GALT6involved_inGO:0006486protein glycosylationFalseFalse{GO:0008152, BFO:0000015, GO:0009100, GO:00434...{GO:0009059, GO:0043170, GO:0070085, GO:000815...IEAFalseNoneNone[TAIR:AnalysisReference:501757242]2024-11-03
85487AGI_LocusCode:AT5G62620GALT6enablesGO:0030246carbohydrate bindingFalseTrue{BFO:0000015, GO:0003674, GO:0005488, BFO:0000...{GO:0003674, BFO:0000015, GO:0005488, BFO:0000...IEAFalseNoneNone[TAIR:AnalysisReference:501756966]2024-11-03
165846AGI_LocusCode:AT5G62620GALT6located_inGO:0005794Golgi apparatusFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00057...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IDAFalseNonePMID:26690932[TAIR:Publication:501767599, PMID:26690932]2024-11-03
165851AGI_LocusCode:AT5G62620GALT6involved_inGO:0010405arabinogalactan protein metabolic processFalseFalse{GO:0043170, GO:0044036, GO:0010384, GO:007155...{GO:0043170, GO:0044036, GO:0071554, GO:000815...IMPFalseNonePMID:26690932[TAIR:Publication:501767599, PMID:26690932]2024-11-03
165856AGI_LocusCode:AT5G62620GALT6involved_inGO:0018258protein O-linked glycosylation via hydroxyprolineFalseFalse{GO:0006493, GO:0008152, BFO:0000015, GO:00091...{GO:0009059, GO:0006493, GO:0043170, GO:007008...IDAFalseNonePMID:26690932[TAIR:Publication:501767599, PMID:26690932]2024-11-03
165858AGI_LocusCode:AT5G62620GALT6acts_upstream_of_or_withinGO:0048354mucilage biosynthetic process involved in seed...FalseFalse{GO:0032501, GO:0008152, GO:0048359, BFO:00000...{GO:0032501, GO:0010192, GO:0008152, GO:004835...IMPFalseNonePMID:26690932[TAIR:Publication:501767599, PMID:26690932]2024-11-03
165863AGI_LocusCode:AT5G62620GALT6acts_upstream_of_or_withinGO:1900056negative regulation of leaf senescenceFalseFalse{GO:0065007, BFO:0000015, GO:1900055, GO:00485...{GO:0065007, BFO:0000015, GO:1900055, GO:00485...IMPFalseNonePMID:26690932[TAIR:Publication:501767599, PMID:26690932]2024-11-03
165869AGI_LocusCode:AT5G62620GALT6enablesGO:1990714hydroxyproline O-galactosyltransferase activityFalseFalse{GO:0003824, GO:0016740, BFO:0000015, GO:00083...{GO:0003824, GO:0016740, BFO:0000015, GO:00083...IDAFalseNonePMID:26690932[TAIR:Publication:501767599, PMID:26690932]2024-11-03
210076AGI_LocusCode:AT5G62620GALT6enablesGO:1990714hydroxyproline O-galactosyltransferase activityFalseFalse{GO:0003824, GO:0016740, BFO:0000015, GO:00083...{GO:0003824, GO:0016740, BFO:0000015, GO:00083...IBATrueNoneNone[GO_REF:0000033]2024-11-03
217996AGI_LocusCode:AT5G62620GALT6is_active_inGO:0000139Golgi membraneFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00057...{CARO:0030000, GO:0005794, CARO:0000000, BFO:0...IBATrueNoneNone[GO_REF:0000033]2024-11-03
\n", "
" ] }, "execution_count": 296, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 296 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:51.132969Z", "start_time": "2025-01-06T01:30:51.128031Z" } }, "cell_type": "code", "source": [ "\n", "def create_gene_df(df: pd.DataFrame):\n", " \"\"\"\n", " creates a new dataframe, grouped by gene (subject)\n", " \n", " :param df: \n", " :return: \n", " \"\"\"\n", " gene_df = df.groupby(\"subject\").agg({\n", " \"object\": \"count\",\n", " \"is_iba\": \"sum\",\n", " \"negated\": \"sum\",\n", " # for object, take the union of all distinct values\n", " \"object\": lambda x: set(x),\n", " # for object closure, take the union of all sets\n", " \"object_closure\": lambda x: set.union(*x),\n", " \"object_closure_redundant\": lambda x: set.union(*x),\n", " \"pmid\": lambda x: set(x),\n", " #\"pubs\": lambda x: set.union(set(x)),\n", " \"release\": \"first\",\n", " }).reset_index()\n", " # the redundant closure is the set difference of object_closure and object\n", " gene_df = gene_df.rename(columns={\n", " \"object\": \"terms\",\n", " \"is_iba\": \"n_iba\",\n", " \"negated\": \"n_negated\",\n", " \"object_closure\": \"closure\",\n", " \"pmid\": \"n_pmid\",\n", " })\n", " return gene_df" ], "id": "41a8190a165c5998", "outputs": [], "execution_count": 297 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:51.940805Z", "start_time": "2025-01-06T01:30:51.183316Z" } }, "cell_type": "code", "source": [ "gene_df = create_gene_df(df)\n", "gene_df" ], "id": "d44b299cbd15b29e", "outputs": [ { "data": { "text/plain": [ " subject terms \\\n", "0 SGD:S000000001 {GO:0071168, GO:0005739, GO:0008301, GO:000367... \n", "1 SGD:S000000002 {GO:0015031, GO:0099023, GO:0005768, GO:000662... \n", "2 SGD:S000000003 {GO:0005085, GO:0005737, GO:0032232, GO:000582... \n", "3 SGD:S000000004 {GO:0005829, GO:0072671, GO:0005576, GO:003460... \n", "4 SGD:S000000005 {GO:0015031, GO:0006621, GO:0005789, GO:000688... \n", "... ... ... \n", "6906 SGD:S000350095 {GO:0008150, GO:0003674, GO:0005575} \n", "6907 SGD:S000350096 {GO:0008150, GO:0003674, GO:0005575} \n", "6908 SGD:S000350097 {GO:0005575, GO:0003674, GO:0008150} \n", "6909 SGD:S000350098 {GO:0005575, GO:0003674, GO:0008150} \n", "6910 SGD:S000350099 {GO:0003674, GO:0005575, GO:0008150} \n", "\n", " n_iba n_negated closure \\\n", "0 3 0 {GO:0005739, GO:0000182, GO:0008152, BFO:00000... \n", "1 4 0 {GO:0032509, GO:0071985, GO:0046872, BFO:00000... \n", "2 3 0 {GO:0005085, GO:0030234, GO:0008152, BFO:00000... \n", "3 9 0 {GO:1901363, GO:0003723, GO:0008152, BFO:00000... \n", "4 8 0 {BFO:0000015, GO:0030135, GO:0070972, GO:01101... \n", "... ... ... ... \n", "6906 0 0 {BFO:0000003, BFO:0000015, BFO:0000004, GO:000... \n", "6907 0 0 {BFO:0000003, BFO:0000015, BFO:0000004, GO:000... \n", "6908 0 0 {BFO:0000004, BFO:0000002, GO:0003674, GO:0008... \n", "6909 0 0 {BFO:0000004, BFO:0000002, GO:0003674, GO:0008... \n", "6910 0 0 {BFO:0000040, BFO:0000015, BFO:0000004, GO:000... \n", "\n", " object_closure_redundant \\\n", "0 {GO:0000182, GO:0008152, BFO:0000015, GO:00010... \n", "1 {GO:0032509, GO:0071985, GO:0046872, BFO:00000... \n", "2 {GO:0030234, GO:0008152, BFO:0000015, GO:00900... \n", "3 {GO:1901363, GO:0003723, GO:0008152, BFO:00000... \n", "4 {BFO:0000015, GO:0030135, GO:0070972, GO:01101... \n", "... ... \n", "6906 {BFO:0000004, BFO:0000040, BFO:0000003, BFO:00... \n", "6907 {BFO:0000004, BFO:0000040, BFO:0000003, BFO:00... \n", "6908 {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... \n", "6909 {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... \n", "6910 {BFO:0000004, BFO:0000002, BFO:0000003, BFO:00... \n", "\n", " n_pmid release \n", "0 {PMID:2404611, PMID:18708580, PMID:2649882, PM... 2024-11-03 \n", "1 {PMID:19828734, PMID:30358795, PMID:20173035, ... 2024-11-03 \n", "2 {PMID:19545407, PMID:10409717, PMID:17925388, ... 2024-11-03 \n", "3 {PMID:18706386, PMID:9789005, PMID:26928762, P... 2024-11-03 \n", "4 {PMID:26928762, PMID:10359606, None, PMID:1115... 2024-11-03 \n", "... ... ... \n", "6906 {None} 2024-11-03 \n", "6907 {None} 2024-11-03 \n", "6908 {None} 2024-11-03 \n", "6909 {None} 2024-11-03 \n", "6910 {None} 2024-11-03 \n", "\n", "[6911 rows x 8 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjecttermsn_iban_negatedclosureobject_closure_redundantn_pmidrelease
0SGD:S000000001{GO:0071168, GO:0005739, GO:0008301, GO:000367...30{GO:0005739, GO:0000182, GO:0008152, BFO:00000...{GO:0000182, GO:0008152, BFO:0000015, GO:00010...{PMID:2404611, PMID:18708580, PMID:2649882, PM...2024-11-03
1SGD:S000000002{GO:0015031, GO:0099023, GO:0005768, GO:000662...40{GO:0032509, GO:0071985, GO:0046872, BFO:00000...{GO:0032509, GO:0071985, GO:0046872, BFO:00000...{PMID:19828734, PMID:30358795, PMID:20173035, ...2024-11-03
2SGD:S000000003{GO:0005085, GO:0005737, GO:0032232, GO:000582...30{GO:0005085, GO:0030234, GO:0008152, BFO:00000...{GO:0030234, GO:0008152, BFO:0000015, GO:00900...{PMID:19545407, PMID:10409717, PMID:17925388, ...2024-11-03
3SGD:S000000004{GO:0005829, GO:0072671, GO:0005576, GO:003460...90{GO:1901363, GO:0003723, GO:0008152, BFO:00000...{GO:1901363, GO:0003723, GO:0008152, BFO:00000...{PMID:18706386, PMID:9789005, PMID:26928762, P...2024-11-03
4SGD:S000000005{GO:0015031, GO:0006621, GO:0005789, GO:000688...80{BFO:0000015, GO:0030135, GO:0070972, GO:01101...{BFO:0000015, GO:0030135, GO:0070972, GO:01101...{PMID:26928762, PMID:10359606, None, PMID:1115...2024-11-03
...........................
6906SGD:S000350095{GO:0008150, GO:0003674, GO:0005575}00{BFO:0000003, BFO:0000015, BFO:0000004, GO:000...{BFO:0000004, BFO:0000040, BFO:0000003, BFO:00...{None}2024-11-03
6907SGD:S000350096{GO:0008150, GO:0003674, GO:0005575}00{BFO:0000003, BFO:0000015, BFO:0000004, GO:000...{BFO:0000004, BFO:0000040, BFO:0000003, BFO:00...{None}2024-11-03
6908SGD:S000350097{GO:0005575, GO:0003674, GO:0008150}00{BFO:0000004, BFO:0000002, GO:0003674, GO:0008...{BFO:0000004, BFO:0000002, BFO:0000003, BFO:00...{None}2024-11-03
6909SGD:S000350098{GO:0005575, GO:0003674, GO:0008150}00{BFO:0000004, BFO:0000002, GO:0003674, GO:0008...{BFO:0000004, BFO:0000002, BFO:0000003, BFO:00...{None}2024-11-03
6910SGD:S000350099{GO:0003674, GO:0005575, GO:0008150}00{BFO:0000040, BFO:0000015, BFO:0000004, GO:000...{BFO:0000004, BFO:0000002, BFO:0000003, BFO:00...{None}2024-11-03
\n", "

6911 rows × 8 columns

\n", "
" ] }, "execution_count": 298, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 298 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:51.980756Z", "start_time": "2025-01-06T01:30:51.976257Z" } }, "cell_type": "code", "source": [ "def set_redundant_flag(assocs: pd.DataFrame, gene_df: pd.DataFrame):\n", " \"\"\"\n", " Sets the redundant flag for each association, if the object is in object_closure_redundant for that gene.\n", " \n", " :param assocs: \n", " :param gene_df: \n", " :return: \n", " \"\"\"\n", " # Create a mapping of subject to object_closure_redundant\n", " redundant_map = gene_df.set_index('subject')['object_closure_redundant'].to_dict()\n", " \n", " # Vectorized check for each row\n", " def check_redundant(row):\n", " if row['is_iba']:\n", " return False\n", " closure_set = redundant_map.get(row['subject'], set())\n", " is_redundant = row['object'] in closure_set\n", " return is_redundant\n", " \n", " # Apply the check to all rows at once\n", " assocs['redundant'] = assocs.apply(check_redundant, axis=1)" ], "id": "14019377b3f59d2d", "outputs": [], "execution_count": 299 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:52.035318Z", "start_time": "2025-01-06T01:30:52.032340Z" } }, "cell_type": "code", "source": "df[\"redundant\"] = False", "id": "e288d1b3e5994388", "outputs": [], "execution_count": 300 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:30:53.104422Z", "start_time": "2025-01-06T01:30:52.043397Z" } }, "cell_type": "code", "source": "set_redundant_flag(df, gene_df)", "id": "3600b3b1a3c349c3", "outputs": [], "execution_count": 301 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:31:06.845590Z", "start_time": "2025-01-06T01:31:06.820270Z" } }, "cell_type": "code", "source": "df[df['redundant']]", "id": "aa7fd8e88c4dbf6", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "16 SGD:S000004539 FPR3 located_in GO:0005634 \n", "28 SGD:S000002699 HRQ1 enables GO:0043138 \n", "33 SGD:S000002699 HRQ1 enables GO:0043138 \n", "39 SGD:S000003245 SNU71 located_in GO:0005634 \n", "42 SGD:S000001443 DJP1 acts_upstream_of_or_within GO:0006626 \n", "... ... ... ... ... \n", "105624 SGD:S000006483 RDN18-2 located_in GO:0005840 \n", "105625 SGD:S000006502 SNR42 located_in GO:0005730 \n", "105627 SGD:S000007300 SNR36 located_in GO:0005730 \n", "105628 SGD:S000006484 RDN25-1 located_in GO:0005840 \n", "105629 SGD:S000006485 RDN25-2 located_in GO:0005840 \n", "\n", " object_label object_obsoletes \\\n", "16 nucleus False \n", "28 3'-5' DNA helicase activity False \n", "33 3'-5' DNA helicase activity False \n", "39 nucleus False \n", "42 protein targeting to mitochondrion False \n", "... ... ... \n", "105624 ribosome False \n", "105625 nucleolus False \n", "105627 nucleolus False \n", "105628 ribosome False \n", "105629 ribosome False \n", "\n", " object_uninformative \\\n", "16 False \n", "28 False \n", "33 False \n", "39 False \n", "42 False \n", "... ... \n", "105624 False \n", "105625 False \n", "105627 False \n", "105628 False \n", "105629 False \n", "\n", " object_closure \\\n", "16 {GO:0110165, GO:0043231, BFO:0000002, GO:00432... \n", "28 {BFO:0000015, GO:0008150, GO:0016043, GO:00431... \n", "33 {BFO:0000015, GO:0008150, GO:0016043, GO:00431... \n", "39 {GO:0110165, GO:0043231, BFO:0000002, GO:00432... \n", "42 {GO:0070585, GO:0051234, GO:0006605, GO:007072... \n", "... ... \n", "105624 {GO:0110165, BFO:0000002, GO:0043229, CL:00000... \n", "105625 {GO:0043233, GO:0043231, GO:0110165, BFO:00000... \n", "105627 {GO:0043233, GO:0043231, GO:0110165, BFO:00000... \n", "105628 {GO:0110165, BFO:0000002, GO:0043229, CL:00000... \n", "105629 {GO:0110165, BFO:0000002, GO:0043229, CL:00000... \n", "\n", " object_closure_redundant evidence is_iba \\\n", "16 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA False \n", "28 {GO:0140097, GO:0032508, GO:0003824, BFO:00000... IDA False \n", "33 {GO:0140097, GO:0032508, GO:0003824, BFO:00000... IDA False \n", "39 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False \n", "42 {GO:0070585, GO:0070727, GO:0033365, BFO:00000... IMP False \n", "... ... ... ... \n", "105624 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False \n", "105625 {CARO:0030000, GO:0005634, CARO:0000000, BFO:0... IEA False \n", "105627 {CARO:0030000, GO:0005634, CARO:0000000, BFO:0... IEA False \n", "105628 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False \n", "105629 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IEA False \n", "\n", " negated pmid pubs release redundant \n", "16 None PMID:26359986 [PMID:26359986] 2024-11-03 True \n", "28 None PMID:28385527 [PMID:28385527] 2024-11-03 True \n", "33 None PMID:24440721 [PMID:24440721] 2024-11-03 True \n", "39 None None [GO_REF:0000043] 2024-11-03 True \n", "42 None PMID:30213914 [PMID:30213914] 2024-11-03 True \n", "... ... ... ... ... ... \n", "105624 None PMID:30502926 [PMID:30502926] 2024-11-03 True \n", "105625 None PMID:30502926 [PMID:30502926] 2024-11-03 True \n", "105627 None PMID:30502926 [PMID:30502926] 2024-11-03 True \n", "105628 None PMID:30502926 [PMID:30502926] 2024-11-03 True \n", "105629 None PMID:30502926 [PMID:30502926] 2024-11-03 True \n", "\n", "[34441 rows x 16 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidenceis_ibanegatedpmidpubsreleaseredundant
16SGD:S000004539FPR3located_inGO:0005634nucleusFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00432...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IDAFalseNonePMID:26359986[PMID:26359986]2024-11-03True
28SGD:S000002699HRQ1enablesGO:00431383'-5' DNA helicase activityFalseFalse{BFO:0000015, GO:0008150, GO:0016043, GO:00431...{GO:0140097, GO:0032508, GO:0003824, BFO:00000...IDAFalseNonePMID:28385527[PMID:28385527]2024-11-03True
33SGD:S000002699HRQ1enablesGO:00431383'-5' DNA helicase activityFalseFalse{BFO:0000015, GO:0008150, GO:0016043, GO:00431...{GO:0140097, GO:0032508, GO:0003824, BFO:00000...IDAFalseNonePMID:24440721[PMID:24440721]2024-11-03True
39SGD:S000003245SNU71located_inGO:0005634nucleusFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00432...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IEAFalseNoneNone[GO_REF:0000043]2024-11-03True
42SGD:S000001443DJP1acts_upstream_of_or_withinGO:0006626protein targeting to mitochondrionFalseFalse{GO:0070585, GO:0051234, GO:0006605, GO:007072...{GO:0070585, GO:0070727, GO:0033365, BFO:00000...IMPFalseNonePMID:30213914[PMID:30213914]2024-11-03True
...................................................
105624SGD:S000006483RDN18-2located_inGO:0005840ribosomeFalseFalse{GO:0110165, BFO:0000002, GO:0043229, CL:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IEAFalseNonePMID:30502926[PMID:30502926]2024-11-03True
105625SGD:S000006502SNR42located_inGO:0005730nucleolusFalseFalse{GO:0043233, GO:0043231, GO:0110165, BFO:00000...{CARO:0030000, GO:0005634, CARO:0000000, BFO:0...IEAFalseNonePMID:30502926[PMID:30502926]2024-11-03True
105627SGD:S000007300SNR36located_inGO:0005730nucleolusFalseFalse{GO:0043233, GO:0043231, GO:0110165, BFO:00000...{CARO:0030000, GO:0005634, CARO:0000000, BFO:0...IEAFalseNonePMID:30502926[PMID:30502926]2024-11-03True
105628SGD:S000006484RDN25-1located_inGO:0005840ribosomeFalseFalse{GO:0110165, BFO:0000002, GO:0043229, CL:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IEAFalseNonePMID:30502926[PMID:30502926]2024-11-03True
105629SGD:S000006485RDN25-2located_inGO:0005840ribosomeFalseFalse{GO:0110165, BFO:0000002, GO:0043229, CL:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IEAFalseNonePMID:30502926[PMID:30502926]2024-11-03True
\n", "

34441 rows × 16 columns

\n", "
" ] }, "execution_count": 302, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 302 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:31:20.496519Z", "start_time": "2025-01-06T01:31:20.492087Z" } }, "cell_type": "code", "source": [ "def annotate_new_pubs(assocs: pd.DataFrame, prev_assocs: pd.DataFrame):\n", " \"\"\"\n", " Annotates each association in assocs with fresh if its pmid is not in prev_assocs.\n", " \n", " :param assocs\n", " \"\"\"\n", " prev_pmid_map = prev_assocs.set_index('pmid')['subject'].to_dict()\n", " \n", " # Vectorized check for each row\n", " def check_new(row):\n", " return row['pmid'] not in prev_pmid_map\n", " \n", " # Apply the check to all rows in prev_assocs\n", " assocs['pmid_new'] = assocs.apply(check_new, axis=1)\n", " \n", " # annotate tuple as unique if (subject, object) is not in prev\n", " prev_pair_assocs = set(zip(prev_assocs['subject'], prev_assocs['object']))\n", " assocs['is_new'] = ~assocs.apply(lambda x: (x['subject'], x['object']) in prev_pair_assocs, axis=1)\n", " \n", " assocs['fresh'] = assocs['pmid_new'] & assocs['is_new']\n", " \n", " " ], "id": "f2b050545c0fc3a2", "outputs": [], "execution_count": 303 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:31:20.550163Z", "start_time": "2025-01-06T01:31:20.545824Z" } }, "cell_type": "code", "source": [ "def annotate_redacted_pubs(assocs: pd.DataFrame, prev_assocs: pd.DataFrame):\n", " \"\"\"\n", " Annotates each association in prev_assocs with retracted if its pmid is not in assocs (latest).\n", " \n", " :param assocs\n", " \"\"\"\n", " # Create a mapping of pmid to subject\n", " pmid_map = assocs.set_index('pmid')['subject'].to_dict()\n", " \n", " # Vectorized check for each row\n", " def check_removed(row):\n", " return row['pmid'] not in pmid_map\n", " \n", " # Apply the check to all rows in prev_assocs\n", " prev_assocs['pmid_removed'] = prev_assocs.apply(check_removed, axis=1)\n", " \n", " # annotate tuple as unique if (subject, object) is not in latest\n", " pair_assocs = set(zip(assocs['subject'], assocs['object']))\n", " prev_assocs['unique'] = ~prev_assocs.apply(lambda x: (x['subject'], x['object']) in pair_assocs, axis=1)\n", " \n", " prev_assocs['redacted'] = prev_assocs['pmid_removed'] & prev_assocs['unique']\n", " \n", " " ], "id": "867b7a7698c36e29", "outputs": [], "execution_count": 304 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:31:22.218998Z", "start_time": "2025-01-06T01:31:20.571207Z" } }, "cell_type": "code", "source": "annotate_redacted_pubs(df, prev_df)\n", "id": "1b9be34d02fcc35a", "outputs": [], "execution_count": 305 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:31:35.780825Z", "start_time": "2025-01-06T01:31:35.778359Z" } }, "cell_type": "code", "source": "#prev_df['redacted'] = prev_df['pmid_removed'] & prev_df['unique']", "id": "aa4f921bd715ed2e", "outputs": [], "execution_count": 306 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:31:49.384082Z", "start_time": "2025-01-06T01:31:49.381860Z" } }, "cell_type": "code", "source": "", "id": "278a7973e3ec4b32", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:32:03.236009Z", "start_time": "2025-01-06T01:32:03.233785Z" } }, "cell_type": "code", "source": "#list(set(df['subject']))[:5]", "id": "cf5c19ae2c553e2f", "outputs": [], "execution_count": 307 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:32:16.839927Z", "start_time": "2025-01-06T01:32:16.837676Z" } }, "cell_type": "code", "source": [ "#iba_df = df[df['is_iba']]\n", "#iba_subjects = set(iba_df['subject'])\n", "#list(iba_subjects)[:5]" ], "id": "62ece9e9f658c81c", "outputs": [], "execution_count": 308 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:32:30.496156Z", "start_time": "2025-01-06T01:32:30.491391Z" } }, "cell_type": "code", "source": [ "def set_iba_status(assocs: pd.DataFrame):\n", " iba_df = assocs[assocs['is_iba']]\n", " iba_subjects = set(iba_df['subject'])\n", " # set the IBA rejected as associations for which\n", " # (a) the subject has an IBA association\n", " # (b) the object is not in the closure of the object in the IBA association\n", " # first we set the iba closure for each subject\n", " iba_closure = iba_df.groupby('subject')['object_closure'].aggregate(lambda x: set.union(*x)).to_dict()\n", " # now we set IBA rejected for each row\n", " def check_iba_rejected(row):\n", " if row['is_iba']:\n", " return False\n", " return row['subject'] in iba_subjects and row['object'] not in iba_closure.get(row['subject'], set())\n", " #if row['subject'].startswith(\"TAIR\"):\n", " # print(f\"Checking {row['subject']} in {list(iba_subjects)[:5]}\")\n", " #return row['subject'] in iba_subjects\n", " #return True\n", " \n", " assocs['iba_rejected'] = assocs.apply(check_iba_rejected, axis=1)\n", "\n", "#set_iba_status(df)\n", " \n", " " ], "id": "8f206c911adcb70", "outputs": [], "execution_count": 309 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:32:44.180038Z", "start_time": "2025-01-06T01:32:44.177926Z" } }, "cell_type": "code", "source": "", "id": "a0467e309db76da1", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:32:57.830022Z", "start_time": "2025-01-06T01:32:57.814205Z" } }, "cell_type": "code", "source": "human_df", "id": "1a73a4e32fb7090f", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "0 UniProtKB:A0A024RBG1 NUDT4B enables GO:0003723 \n", "1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 \n", "2 UniProtKB:A0A024RBG1 NUDT4B enables GO:0046872 \n", "3 UniProtKB:A0A024RBG1 NUDT4B located_in GO:0005829 \n", "4 UniProtKB:A0A075B6H5 TRBV20OR9-2 involved_in GO:0002376 \n", "... ... ... ... ... \n", "782818 UniProtKB:Q9NZC2 TREM2 involved_in GO:0045088 \n", "782819 UniProtKB:Q9Y2K2 SIK3 enables GO:0050321 \n", "782820 UniProtKB:P43235 CTSK involved_in GO:0051603 \n", "782821 UniProtKB:Q07343 PDE4B enables GO:0047555 \n", "782822 UniProtKB:A6NC42 DPPA5 involved_in GO:0010468 \n", "\n", " object_label object_obsoletes \\\n", "0 RNA binding False \n", "1 protein binding False \n", "2 metal ion binding False \n", "3 cytosol False \n", "4 immune system process False \n", "... ... ... \n", "782818 regulation of innate immune response False \n", "782819 tau-protein kinase activity False \n", "782820 proteolysis involved in protein catabolic process False \n", "782821 3',5'-cyclic-GMP phosphodiesterase activity False \n", "782822 regulation of gene expression False \n", "\n", " object_uninformative \\\n", "0 True \n", "1 True \n", "2 True \n", "3 False \n", "4 False \n", "... ... \n", "782818 False \n", "782819 False \n", "782820 False \n", "782821 False \n", "782822 False \n", "\n", " object_closure \\\n", "0 {GO:0097159, GO:0003723, BFO:0000015, GO:00036... \n", "1 {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... \n", "2 {GO:0043169, GO:0046872, BFO:0000015, GO:00360... \n", "3 {CARO:0030000, UBERON:0000061, CARO:0000003, G... \n", "4 {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... \n", "... ... \n", "782818 {GO:0065007, GO:0002682, GO:0050776, BFO:00000... \n", "782819 {GO:0140096, GO:0003824, GO:0004674, GO:001674... \n", "782820 {GO:0043170, GO:0006508, GO:0044238, GO:000905... \n", "782821 {GO:0047555, GO:0003824, GO:0008081, BFO:00000... \n", "782822 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... \n", "\n", " object_closure_redundant evidence is_iba \\\n", "0 {GO:0097159, BFO:0000015, GO:0003674, GO:00036... IEA False \n", "1 {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI False \n", "2 {GO:0043169, BFO:0000015, GO:0036094, GO:00431... IEA False \n", "3 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA False \n", "4 {BFO:0000015, GO:0008150, BFO:0000003} IEA False \n", "... ... ... ... \n", "782818 {GO:0065007, GO:0002682, GO:0050776, BFO:00000... IBA True \n", "782819 {GO:0140096, GO:0003824, GO:0004674, GO:001674... IBA True \n", "782820 {GO:0043170, GO:0006508, GO:0044238, GO:000905... IBA True \n", "782821 {GO:0003824, GO:0008081, BFO:0000015, GO:00425... IBA True \n", "782822 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... IBA True \n", "\n", " negated pmid pubs release \n", "0 None None [GO_REF:0000043] 2024-11-03 \n", "1 None PMID:33961781 [PMID:33961781] 2024-11-03 \n", "2 None None [GO_REF:0000043] 2024-11-03 \n", "3 None None [GO_REF:0000052] 2024-11-03 \n", "4 None None [GO_REF:0000043] 2024-11-03 \n", "... ... ... ... ... \n", "782818 None None [GO_REF:0000033] 2024-11-03 \n", "782819 None None [GO_REF:0000033] 2024-11-03 \n", "782820 None None [GO_REF:0000033] 2024-11-03 \n", "782821 None None [GO_REF:0000033] 2024-11-03 \n", "782822 None None [GO_REF:0000033] 2024-11-03 \n", "\n", "[782823 rows x 15 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidenceis_ibanegatedpmidpubsrelease
0UniProtKB:A0A024RBG1NUDT4BenablesGO:0003723RNA bindingFalseTrue{GO:0097159, GO:0003723, BFO:0000015, GO:00036...{GO:0097159, BFO:0000015, GO:0003674, GO:00036...IEAFalseNoneNone[GO_REF:0000043]2024-11-03
1UniProtKB:A0A024RBG1NUDT4BenablesGO:0005515protein bindingFalseTrue{BFO:0000015, GO:0003674, GO:0005488, BFO:0000...{GO:0003674, BFO:0000015, GO:0005488, BFO:0000...IPIFalseNonePMID:33961781[PMID:33961781]2024-11-03
2UniProtKB:A0A024RBG1NUDT4BenablesGO:0046872metal ion bindingFalseTrue{GO:0043169, GO:0046872, BFO:0000015, GO:00360...{GO:0043169, BFO:0000015, GO:0036094, GO:00431...IEAFalseNoneNone[GO_REF:0000043]2024-11-03
3UniProtKB:A0A024RBG1NUDT4Blocated_inGO:0005829cytosolFalseFalse{CARO:0030000, UBERON:0000061, CARO:0000003, G...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IDAFalseNoneNone[GO_REF:0000052]2024-11-03
4UniProtKB:A0A075B6H5TRBV20OR9-2involved_inGO:0002376immune system processFalseFalse{GO:0008150, BFO:0000015, GO:0002376, BFO:0000...{BFO:0000015, GO:0008150, BFO:0000003}IEAFalseNoneNone[GO_REF:0000043]2024-11-03
................................................
782818UniProtKB:Q9NZC2TREM2involved_inGO:0045088regulation of innate immune responseFalseFalse{GO:0065007, GO:0002682, GO:0050776, BFO:00000...{GO:0065007, GO:0002682, GO:0050776, BFO:00000...IBATrueNoneNone[GO_REF:0000033]2024-11-03
782819UniProtKB:Q9Y2K2SIK3enablesGO:0050321tau-protein kinase activityFalseFalse{GO:0140096, GO:0003824, GO:0004674, GO:001674...{GO:0140096, GO:0003824, GO:0004674, GO:001674...IBATrueNoneNone[GO_REF:0000033]2024-11-03
782820UniProtKB:P43235CTSKinvolved_inGO:0051603proteolysis involved in protein catabolic processFalseFalse{GO:0043170, GO:0006508, GO:0044238, GO:000905...{GO:0043170, GO:0006508, GO:0044238, GO:000905...IBATrueNoneNone[GO_REF:0000033]2024-11-03
782821UniProtKB:Q07343PDE4BenablesGO:00475553',5'-cyclic-GMP phosphodiesterase activityFalseFalse{GO:0047555, GO:0003824, GO:0008081, BFO:00000...{GO:0003824, GO:0008081, BFO:0000015, GO:00425...IBATrueNoneNone[GO_REF:0000033]2024-11-03
782822UniProtKB:A6NC42DPPA5involved_inGO:0010468regulation of gene expressionFalseFalse{GO:0065007, GO:0060255, GO:0009889, BFO:00000...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...IBATrueNoneNone[GO_REF:0000033]2024-11-03
\n", "

782823 rows × 15 columns

\n", "
" ] }, "execution_count": 310, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 310 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:33:13.375537Z", "start_time": "2025-01-06T01:33:11.321465Z" } }, "cell_type": "code", "source": "set_iba_status(test_df)", "id": "18c85a1e496789a1", "outputs": [], "execution_count": 311 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:33:15.209395Z", "start_time": "2025-01-06T01:33:15.204477Z" } }, "cell_type": "code", "source": "test_df['iba_rejected'].unique()", "id": "f10034b185842dc7", "outputs": [ { "data": { "text/plain": [ "array([False, True])" ] }, "execution_count": 312, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 312 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:33:28.881918Z", "start_time": "2025-01-06T01:33:28.879910Z" } }, "cell_type": "code", "source": "#set_iba_status(df)", "id": "9af1de25e37ec772", "outputs": [], "execution_count": 313 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:33:42.336419Z", "start_time": "2025-01-06T01:33:42.334842Z" } }, "cell_type": "code", "source": "", "id": "906cee0981042c2", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:33:56.131239Z", "start_time": "2025-01-06T01:33:56.126586Z" } }, "cell_type": "code", "source": [ "def synthesize(grp: str) -> pd.DataFrame:\n", " \"\"\"\n", " Synthesizes the above steps\n", " \"\"\"\n", " assocs = assocs_to_df(db[grp][LATEST], LATEST)\n", " prev_df_sets = []\n", " for prev in PREVIOUS:\n", " this_prev_assocs = assocs_to_df(db[grp][prev], prev)\n", " prev_df_sets.append(this_prev_assocs)\n", " annotate_new_pubs(assocs, prev_df_sets[0])\n", " #prev_assocs = pd.concat(prev_df_sets).drop_duplicates()\n", " prev_assocs = pd.concat(prev_df_sets)\n", " # prev_assocs = assocs_to_df(db[grp][PREVIOUS[0]], PREVIOUS[0])\n", " repair_assocs_df(assocs)\n", " repair_assocs_df(prev_assocs)\n", " # Create a gene dataframe (for latest only)\n", " gene_df = create_gene_df(assocs)\n", " annotate_redacted_pubs(assocs, prev_assocs)\n", " new_assocs = pd.concat([assocs, prev_assocs[prev_assocs['redacted']]])\n", " set_redundant_flag(new_assocs, gene_df)\n", " set_iba_status(new_assocs)\n", " return new_assocs\n", " \n", " " ], "id": "a4c3b5106e007c89", "outputs": [], "execution_count": 314 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:43:49.335421Z", "start_time": "2025-01-06T01:34:09.648173Z" } }, "cell_type": "code", "source": "new_human = synthesize(\"goa_human\")", "id": "91b0622de603be32", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Multiple IDs for 64 labels\n", "[('AKAP7', ['UniProtKB:O43687', 'UniProtKB:Q9P0M2']), ('ARHGEF18', ['UniProtKB:A0A590UK10', 'UniProtKB:Q6ZSZ5']), ('BBC3', ['UniProtKB:Q9BXH1', 'UniProtKB:Q96PG8']), ('CALCA', ['UniProtKB:P06881', 'UniProtKB:P01258']), ('CDKN2A', ['UniProtKB:Q8N726', 'UniProtKB:P42771'])]\n", "Multiple IDs for 100 labels\n", "[('AKAP7', ['UniProtKB:O43687', 'UniProtKB:Q9P0M2']), ('AMY1A', ['UniProtKB:P04745', 'UniProtKB:P0DUB6']), ('ARHGEF18', ['UniProtKB:A0A590UK10', 'UniProtKB:Q6ZSZ5']), ('ASIC5', ['UniProtKB:A0A0G2JLG4', 'UniProtKB:Q9NY37']), ('ATP6AP2', ['UniProtKB:O75787', 'UniProtKB:A0A1C7CYW4'])]\n" ] } ], "execution_count": 315 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:44:03.327073Z", "start_time": "2025-01-06T01:44:03.309006Z" } }, "cell_type": "code", "source": "new_human", "id": "8d9cf08927c02bae", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "0 UniProtKB:A0A024RBG1 NUDT4B enables GO:0003723 \n", "1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 \n", "2 UniProtKB:A0A024RBG1 NUDT4B enables GO:0046872 \n", "3 UniProtKB:A0A024RBG1 NUDT4B located_in GO:0005829 \n", "4 UniProtKB:A0A075B6H5 TRBV20OR9-2 involved_in GO:0002376 \n", "... ... ... ... ... \n", "434570 UniProtKB:Q9Y6A4 CFAP20 None GO:0007275 \n", "435058 UniProtKB:Q9Y6F1 PARP3 None GO:0006281 \n", "436209 UniProtKB:Q9Y6Q9 NCOA3 None GO:0000981 \n", "436550 UniProtKB:Q9Y6X0 SETBP1 None GO:0000981 \n", "436732 UniProtKB:Q9Y6Y1 CAMTA1 None GO:0000981 \n", "\n", " object_label object_obsoletes \\\n", "0 RNA binding False \n", "1 protein binding False \n", "2 metal ion binding False \n", "3 cytosol False \n", "4 immune system process False \n", "... ... ... \n", "434570 multicellular organism development False \n", "435058 DNA repair False \n", "436209 DNA-binding transcription factor activity, RNA... False \n", "436550 DNA-binding transcription factor activity, RNA... False \n", "436732 DNA-binding transcription factor activity, RNA... False \n", "\n", " object_uninformative \\\n", "0 True \n", "1 True \n", "2 True \n", "3 False \n", "4 False \n", "... ... \n", "434570 True \n", "435058 False \n", "436209 False \n", "436550 False \n", "436732 False \n", "\n", " object_closure \\\n", "0 {GO:0097159, GO:0003723, BFO:0000015, GO:00036... \n", "1 {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... \n", "2 {GO:0043169, GO:0046872, BFO:0000015, GO:00360... \n", "3 {CARO:0030000, UBERON:0000061, CARO:0000003, G... \n", "4 {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... \n", "... ... \n", "434570 {GO:0032501, BFO:0000015, GO:0048856, GO:00325... \n", "435058 {GO:0043170, GO:0033554, GO:0008152, GO:000613... \n", "436209 {BFO:0000015, GO:0006357, GO:0008150, GO:00192... \n", "436550 {BFO:0000015, GO:0006357, GO:0008150, GO:00192... \n", "436732 {BFO:0000015, GO:0006357, GO:0008150, GO:00192... \n", "\n", " object_closure_redundant evidence ... \\\n", "0 {GO:0097159, BFO:0000015, GO:0003674, GO:00036... IEA ... \n", "1 {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI ... \n", "2 {GO:0043169, BFO:0000015, GO:0036094, GO:00431... IEA ... \n", "3 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA ... \n", "4 {BFO:0000015, GO:0008150, BFO:0000003} IEA ... \n", "... ... ... ... \n", "434570 {GO:0032501, BFO:0000015, GO:0048856, GO:00325... TAS ... \n", "435058 {GO:0043170, GO:0008152, BFO:0000015, GO:00099... TAS ... \n", "436209 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... \n", "436550 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... \n", "436732 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... \n", "\n", " pubs release pmid_new is_new fresh pmid_removed \\\n", "0 [GO_REF:0000043] 2024-11-03 False False False NaN \n", "1 [PMID:33961781] 2024-11-03 True True True NaN \n", "2 [GO_REF:0000043] 2024-11-03 False False False NaN \n", "3 [GO_REF:0000052] 2024-11-03 False False False NaN \n", "4 [GO_REF:0000043] 2024-11-03 False True False NaN \n", "... ... ... ... ... ... ... \n", "434570 [PMID:8688464] 2020-01-01 NaN NaN NaN True \n", "435058 [PMID:7260241] 2020-01-01 NaN NaN NaN True \n", "436209 [PMID:19274049] 2020-01-01 NaN NaN NaN True \n", "436550 [PMID:19274049] 2020-01-01 NaN NaN NaN True \n", "436732 [PMID:19274049] 2020-01-01 NaN NaN NaN True \n", "\n", " unique redacted redundant iba_rejected \n", "0 NaN NaN False True \n", "1 NaN NaN False True \n", "2 NaN NaN False True \n", "3 NaN NaN False True \n", "4 NaN NaN False True \n", "... ... ... ... ... \n", "434570 True True False True \n", "435058 True True True False \n", "436209 True True False True \n", "436550 True True False True \n", "436732 True True False True \n", "\n", "[783918 rows x 23 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidence...pubsreleasepmid_newis_newfreshpmid_removeduniqueredactedredundantiba_rejected
0UniProtKB:A0A024RBG1NUDT4BenablesGO:0003723RNA bindingFalseTrue{GO:0097159, GO:0003723, BFO:0000015, GO:00036...{GO:0097159, BFO:0000015, GO:0003674, GO:00036...IEA...[GO_REF:0000043]2024-11-03FalseFalseFalseNaNNaNNaNFalseTrue
1UniProtKB:A0A024RBG1NUDT4BenablesGO:0005515protein bindingFalseTrue{BFO:0000015, GO:0003674, GO:0005488, BFO:0000...{GO:0003674, BFO:0000015, GO:0005488, BFO:0000...IPI...[PMID:33961781]2024-11-03TrueTrueTrueNaNNaNNaNFalseTrue
2UniProtKB:A0A024RBG1NUDT4BenablesGO:0046872metal ion bindingFalseTrue{GO:0043169, GO:0046872, BFO:0000015, GO:00360...{GO:0043169, BFO:0000015, GO:0036094, GO:00431...IEA...[GO_REF:0000043]2024-11-03FalseFalseFalseNaNNaNNaNFalseTrue
3UniProtKB:A0A024RBG1NUDT4Blocated_inGO:0005829cytosolFalseFalse{CARO:0030000, UBERON:0000061, CARO:0000003, G...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IDA...[GO_REF:0000052]2024-11-03FalseFalseFalseNaNNaNNaNFalseTrue
4UniProtKB:A0A075B6H5TRBV20OR9-2involved_inGO:0002376immune system processFalseFalse{GO:0008150, BFO:0000015, GO:0002376, BFO:0000...{BFO:0000015, GO:0008150, BFO:0000003}IEA...[GO_REF:0000043]2024-11-03FalseTrueFalseNaNNaNNaNFalseTrue
..................................................................
434570UniProtKB:Q9Y6A4CFAP20NoneGO:0007275multicellular organism developmentFalseTrue{GO:0032501, BFO:0000015, GO:0048856, GO:00325...{GO:0032501, BFO:0000015, GO:0048856, GO:00325...TAS...[PMID:8688464]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
435058UniProtKB:Q9Y6F1PARP3NoneGO:0006281DNA repairFalseFalse{GO:0043170, GO:0033554, GO:0008152, GO:000613...{GO:0043170, GO:0008152, BFO:0000015, GO:00099...TAS...[PMID:7260241]2020-01-01NaNNaNNaNTrueTrueTrueTrueFalse
436209UniProtKB:Q9Y6Q9NCOA3NoneGO:0000981DNA-binding transcription factor activity, RNA...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00192...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...ISM...[PMID:19274049]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
436550UniProtKB:Q9Y6X0SETBP1NoneGO:0000981DNA-binding transcription factor activity, RNA...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00192...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...ISM...[PMID:19274049]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
436732UniProtKB:Q9Y6Y1CAMTA1NoneGO:0000981DNA-binding transcription factor activity, RNA...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00192...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...ISM...[PMID:19274049]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
\n", "

783918 rows × 23 columns

\n", "
" ] }, "execution_count": 316, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 316 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:44:17.208851Z", "start_time": "2025-01-06T01:44:17.077948Z" } }, "cell_type": "code", "source": "new_human[new_human['iba_rejected']]", "id": "171c5257a470dbee", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "0 UniProtKB:A0A024RBG1 NUDT4B enables GO:0003723 \n", "1 UniProtKB:A0A024RBG1 NUDT4B enables GO:0005515 \n", "2 UniProtKB:A0A024RBG1 NUDT4B enables GO:0046872 \n", "3 UniProtKB:A0A024RBG1 NUDT4B located_in GO:0005829 \n", "4 UniProtKB:A0A075B6H5 TRBV20OR9-2 involved_in GO:0002376 \n", "... ... ... ... ... \n", "433680 UniProtKB:Q9Y5Y6 ST14 None GO:0005887 \n", "434570 UniProtKB:Q9Y6A4 CFAP20 None GO:0007275 \n", "436209 UniProtKB:Q9Y6Q9 NCOA3 None GO:0000981 \n", "436550 UniProtKB:Q9Y6X0 SETBP1 None GO:0000981 \n", "436732 UniProtKB:Q9Y6Y1 CAMTA1 None GO:0000981 \n", "\n", " object_label object_obsoletes \\\n", "0 RNA binding False \n", "1 protein binding False \n", "2 metal ion binding False \n", "3 cytosol False \n", "4 immune system process False \n", "... ... ... \n", "433680 None True \n", "434570 multicellular organism development False \n", "436209 DNA-binding transcription factor activity, RNA... False \n", "436550 DNA-binding transcription factor activity, RNA... False \n", "436732 DNA-binding transcription factor activity, RNA... False \n", "\n", " object_uninformative \\\n", "0 True \n", "1 True \n", "2 True \n", "3 False \n", "4 False \n", "... ... \n", "433680 False \n", "434570 True \n", "436209 False \n", "436550 False \n", "436732 False \n", "\n", " object_closure \\\n", "0 {GO:0097159, GO:0003723, BFO:0000015, GO:00036... \n", "1 {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... \n", "2 {GO:0043169, GO:0046872, BFO:0000015, GO:00360... \n", "3 {CARO:0030000, UBERON:0000061, CARO:0000003, G... \n", "4 {GO:0008150, BFO:0000015, GO:0002376, BFO:0000... \n", "... ... \n", "433680 {GO:0005887} \n", "434570 {GO:0032501, BFO:0000015, GO:0048856, GO:00325... \n", "436209 {BFO:0000015, GO:0006357, GO:0008150, GO:00192... \n", "436550 {BFO:0000015, GO:0006357, GO:0008150, GO:00192... \n", "436732 {BFO:0000015, GO:0006357, GO:0008150, GO:00192... \n", "\n", " object_closure_redundant evidence ... \\\n", "0 {GO:0097159, BFO:0000015, GO:0003674, GO:00036... IEA ... \n", "1 {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI ... \n", "2 {GO:0043169, BFO:0000015, GO:0036094, GO:00431... IEA ... \n", "3 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IDA ... \n", "4 {BFO:0000015, GO:0008150, BFO:0000003} IEA ... \n", "... ... ... ... \n", "433680 {} TAS ... \n", "434570 {GO:0032501, BFO:0000015, GO:0048856, GO:00325... TAS ... \n", "436209 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... \n", "436550 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... \n", "436732 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... ISM ... \n", "\n", " pubs release pmid_new is_new fresh pmid_removed \\\n", "0 [GO_REF:0000043] 2024-11-03 False False False NaN \n", "1 [PMID:33961781] 2024-11-03 True True True NaN \n", "2 [GO_REF:0000043] 2024-11-03 False False False NaN \n", "3 [GO_REF:0000052] 2024-11-03 False False False NaN \n", "4 [GO_REF:0000043] 2024-11-03 False True False NaN \n", "... ... ... ... ... ... ... \n", "433680 [PMID:10831593] 2020-01-01 NaN NaN NaN True \n", "434570 [PMID:8688464] 2020-01-01 NaN NaN NaN True \n", "436209 [PMID:19274049] 2020-01-01 NaN NaN NaN True \n", "436550 [PMID:19274049] 2020-01-01 NaN NaN NaN True \n", "436732 [PMID:19274049] 2020-01-01 NaN NaN NaN True \n", "\n", " unique redacted redundant iba_rejected \n", "0 NaN NaN False True \n", "1 NaN NaN False True \n", "2 NaN NaN False True \n", "3 NaN NaN False True \n", "4 NaN NaN False True \n", "... ... ... ... ... \n", "433680 True True False True \n", "434570 True True False True \n", "436209 True True False True \n", "436550 True True False True \n", "436732 True True False True \n", "\n", "[450057 rows x 23 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidence...pubsreleasepmid_newis_newfreshpmid_removeduniqueredactedredundantiba_rejected
0UniProtKB:A0A024RBG1NUDT4BenablesGO:0003723RNA bindingFalseTrue{GO:0097159, GO:0003723, BFO:0000015, GO:00036...{GO:0097159, BFO:0000015, GO:0003674, GO:00036...IEA...[GO_REF:0000043]2024-11-03FalseFalseFalseNaNNaNNaNFalseTrue
1UniProtKB:A0A024RBG1NUDT4BenablesGO:0005515protein bindingFalseTrue{BFO:0000015, GO:0003674, GO:0005488, BFO:0000...{GO:0003674, BFO:0000015, GO:0005488, BFO:0000...IPI...[PMID:33961781]2024-11-03TrueTrueTrueNaNNaNNaNFalseTrue
2UniProtKB:A0A024RBG1NUDT4BenablesGO:0046872metal ion bindingFalseTrue{GO:0043169, GO:0046872, BFO:0000015, GO:00360...{GO:0043169, BFO:0000015, GO:0036094, GO:00431...IEA...[GO_REF:0000043]2024-11-03FalseFalseFalseNaNNaNNaNFalseTrue
3UniProtKB:A0A024RBG1NUDT4Blocated_inGO:0005829cytosolFalseFalse{CARO:0030000, UBERON:0000061, CARO:0000003, G...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IDA...[GO_REF:0000052]2024-11-03FalseFalseFalseNaNNaNNaNFalseTrue
4UniProtKB:A0A075B6H5TRBV20OR9-2involved_inGO:0002376immune system processFalseFalse{GO:0008150, BFO:0000015, GO:0002376, BFO:0000...{BFO:0000015, GO:0008150, BFO:0000003}IEA...[GO_REF:0000043]2024-11-03FalseTrueFalseNaNNaNNaNFalseTrue
..................................................................
433680UniProtKB:Q9Y5Y6ST14NoneGO:0005887NoneTrueFalse{GO:0005887}{}TAS...[PMID:10831593]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
434570UniProtKB:Q9Y6A4CFAP20NoneGO:0007275multicellular organism developmentFalseTrue{GO:0032501, BFO:0000015, GO:0048856, GO:00325...{GO:0032501, BFO:0000015, GO:0048856, GO:00325...TAS...[PMID:8688464]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
436209UniProtKB:Q9Y6Q9NCOA3NoneGO:0000981DNA-binding transcription factor activity, RNA...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00192...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...ISM...[PMID:19274049]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
436550UniProtKB:Q9Y6X0SETBP1NoneGO:0000981DNA-binding transcription factor activity, RNA...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00192...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...ISM...[PMID:19274049]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
436732UniProtKB:Q9Y6Y1CAMTA1NoneGO:0000981DNA-binding transcription factor activity, RNA...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00192...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...ISM...[PMID:19274049]2020-01-01NaNNaNNaNTrueTrueTrueFalseTrue
\n", "

450057 rows × 23 columns

\n", "
" ] }, "execution_count": 317, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 317 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:47:20.371960Z", "start_time": "2025-01-06T01:44:26.950777Z" } }, "cell_type": "code", "source": [ "prev_human_df = assocs_to_df(db[\"goa_human\"][PREVIOUS[0]], PREVIOUS[0])\n", "#annotate_new_pubs(human_df, prev_human_df)" ], "id": "14737ac8286733a5", "outputs": [], "execution_count": 318 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:47:44.842883Z", "start_time": "2025-01-06T01:47:34.278374Z" } }, "cell_type": "code", "source": "annotate_new_pubs(human_df, prev_human_df)", "id": "3a4124cdc8a825f4", "outputs": [], "execution_count": 319 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:47:44.893482Z", "start_time": "2025-01-06T01:47:44.874646Z" } }, "cell_type": "code", "source": "human_df[human_df['fresh']]", "id": "4283533cfc17e67c", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate \\\n", "1 UniProtKB:A0A024RBG1 NUDT4B enables \n", "357 UniProtKB:A0A096LP55 UQCRHL located_in \n", "569 UniProtKB:A0A0B4J2F0 PIGBOS1 located_in \n", "1904 UniProtKB:A1A4Y4 IRGM enables \n", "1945 UniProtKB:A1A4Y4 IRGM involved_in \n", "... ... ... ... \n", "718365 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 enables \n", "718366 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "718367 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "718368 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "718370 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "\n", " object object_label \\\n", "1 GO:0005515 protein binding \n", "357 GO:0005739 mitochondrion \n", "569 GO:0005739 mitochondrion \n", "1904 GO:1901612 cardiolipin binding \n", "1945 GO:0090141 positive regulation of mitochondrial fission \n", "... ... ... \n", "718365 GO:0141180 dsDNA-RNA triple helix-forming chromatin adapt... \n", "718366 GO:0000122 negative regulation of transcription by RNA po... \n", "718367 GO:0000512 lncRNA-mediated post-transcriptional gene sile... \n", "718368 GO:0000512 lncRNA-mediated post-transcriptional gene sile... \n", "718370 GO:0090399 replicative senescence \n", "\n", " object_obsoletes object_uninformative \\\n", "1 False True \n", "357 False False \n", "569 False False \n", "1904 False True \n", "1945 False False \n", "... ... ... \n", "718365 False True \n", "718366 False False \n", "718367 False False \n", "718368 False False \n", "718370 False False \n", "\n", " object_closure \\\n", "1 {BFO:0000015, GO:0003674, GO:0005488, BFO:0000... \n", "357 {GO:0005739, GO:0110165, GO:0043231, BFO:00000... \n", "569 {GO:0005739, GO:0110165, GO:0043231, BFO:00000... \n", "1904 {GO:0008289, GO:0043168, GO:1901612, BFO:00000... \n", "1945 {GO:0010821, GO:0051130, GO:0065007, GO:003304... \n", "... ... \n", "718365 {GO:0030674, BFO:0000015, GO:0043565, GO:00036... \n", "718366 {BFO:0000015, GO:0006357, GO:0008150, GO:00098... \n", "718367 {GO:0000512, BFO:0000015, GO:0016441, GO:00081... \n", "718368 {GO:0000512, BFO:0000015, GO:0016441, GO:00081... \n", "718370 {GO:0008152, BFO:0000015, GO:0090399, GO:00099... \n", "\n", " object_closure_redundant evidence is_iba \\\n", "1 {GO:0003674, BFO:0000015, GO:0005488, BFO:0000... IPI False \n", "357 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP False \n", "569 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP False \n", "1904 {GO:0008289, GO:0043168, BFO:0000015, GO:00360... IDA False \n", "1945 {GO:0010821, GO:0051130, GO:0065007, GO:003304... IDA False \n", "... ... ... ... \n", "718365 {GO:0003690, GO:0030674, GO:0003677, GO:007184... IDA False \n", "718366 {GO:0045934, GO:0009892, GO:0065007, GO:000988... IMP False \n", "718367 {GO:0065007, GO:0009892, GO:0031047, GO:000988... IDA False \n", "718368 {GO:0065007, GO:0009892, GO:0031047, GO:000988... IMP False \n", "718370 {GO:0008152, BFO:0000015, GO:0009987, GO:00081... IMP False \n", "\n", " negated pmid pubs release pmid_new is_new \\\n", "1 None PMID:33961781 [PMID:33961781] 2024-11-03 True True \n", "357 None PMID:34800366 [PMID:34800366] 2024-11-03 True True \n", "569 None PMID:34800366 [PMID:34800366] 2024-11-03 True True \n", "1904 None PMID:21102437 [PMID:21102437] 2024-11-03 True True \n", "1945 None PMID:21102437 [PMID:21102437] 2024-11-03 True True \n", "... ... ... ... ... ... ... \n", "718365 None PMID:27634931 [PMID:27634931] 2024-11-03 True True \n", "718366 None PMID:27634931 [PMID:27634931] 2024-11-03 True True \n", "718367 None PMID:30720199 [PMID:30720199] 2024-11-03 True True \n", "718368 None PMID:33102210 [PMID:33102210] 2024-11-03 True True \n", "718370 None PMID:27634931 [PMID:27634931] 2024-11-03 True True \n", "\n", " fresh \n", "1 True \n", "357 True \n", "569 True \n", "1904 True \n", "1945 True \n", "... ... \n", "718365 True \n", "718366 True \n", "718367 True \n", "718368 True \n", "718370 True \n", "\n", "[3380 rows x 18 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidenceis_ibanegatedpmidpubsreleasepmid_newis_newfresh
1UniProtKB:A0A024RBG1NUDT4BenablesGO:0005515protein bindingFalseTrue{BFO:0000015, GO:0003674, GO:0005488, BFO:0000...{GO:0003674, BFO:0000015, GO:0005488, BFO:0000...IPIFalseNonePMID:33961781[PMID:33961781]2024-11-03TrueTrueTrue
357UniProtKB:A0A096LP55UQCRHLlocated_inGO:0005739mitochondrionFalseFalse{GO:0005739, GO:0110165, GO:0043231, BFO:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...HTPFalseNonePMID:34800366[PMID:34800366]2024-11-03TrueTrueTrue
569UniProtKB:A0A0B4J2F0PIGBOS1located_inGO:0005739mitochondrionFalseFalse{GO:0005739, GO:0110165, GO:0043231, BFO:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...HTPFalseNonePMID:34800366[PMID:34800366]2024-11-03TrueTrueTrue
1904UniProtKB:A1A4Y4IRGMenablesGO:1901612cardiolipin bindingFalseTrue{GO:0008289, GO:0043168, GO:1901612, BFO:00000...{GO:0008289, GO:0043168, BFO:0000015, GO:00360...IDAFalseNonePMID:21102437[PMID:21102437]2024-11-03TrueTrueTrue
1945UniProtKB:A1A4Y4IRGMinvolved_inGO:0090141positive regulation of mitochondrial fissionFalseFalse{GO:0010821, GO:0051130, GO:0065007, GO:003304...{GO:0010821, GO:0051130, GO:0065007, GO:003304...IDAFalseNonePMID:21102437[PMID:21102437]2024-11-03TrueTrueTrue
.........................................................
718365RNAcentral:URS00026A23F2_9606URS00026A23F2_9606enablesGO:0141180dsDNA-RNA triple helix-forming chromatin adapt...FalseTrue{GO:0030674, BFO:0000015, GO:0043565, GO:00036...{GO:0003690, GO:0030674, GO:0003677, GO:007184...IDAFalseNonePMID:27634931[PMID:27634931]2024-11-03TrueTrueTrue
718366RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0000122negative regulation of transcription by RNA po...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00098...{GO:0045934, GO:0009892, GO:0065007, GO:000988...IMPFalseNonePMID:27634931[PMID:27634931]2024-11-03TrueTrueTrue
718367RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0000512lncRNA-mediated post-transcriptional gene sile...FalseFalse{GO:0000512, BFO:0000015, GO:0016441, GO:00081...{GO:0065007, GO:0009892, GO:0031047, GO:000988...IDAFalseNonePMID:30720199[PMID:30720199]2024-11-03TrueTrueTrue
718368RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0000512lncRNA-mediated post-transcriptional gene sile...FalseFalse{GO:0000512, BFO:0000015, GO:0016441, GO:00081...{GO:0065007, GO:0009892, GO:0031047, GO:000988...IMPFalseNonePMID:33102210[PMID:33102210]2024-11-03TrueTrueTrue
718370RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0090399replicative senescenceFalseFalse{GO:0008152, BFO:0000015, GO:0090399, GO:00099...{GO:0008152, BFO:0000015, GO:0009987, GO:00081...IMPFalseNonePMID:27634931[PMID:27634931]2024-11-03TrueTrueTrue
\n", "

3380 rows × 18 columns

\n", "
" ] }, "execution_count": 320, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 320 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:47:58.760497Z", "start_time": "2025-01-06T01:47:58.758393Z" } }, "cell_type": "code", "source": "#pair_assocs, pair_anns = annotate_assocs(\"tair\")", "id": "35e6ea6ce6f45958", "outputs": [], "execution_count": 321 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T01:48:10.671121Z", "start_time": "2025-01-06T01:47:58.773655Z" } }, "cell_type": "code", "source": "new_human.to_csv(\"output/go-human-assocs-annotated.csv\", index=False)", "id": "98d63328995a6be1", "outputs": [], "execution_count": 322 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T02:28:43.837036Z", "start_time": "2025-01-06T02:28:43.768532Z" } }, "cell_type": "code", "source": [ "test_cases_df = new_human[(new_human['fresh'] == True) & (new_human['object_uninformative'] == False) & (new_human['negated'] != True) & (new_human['redundant'] != True) & (new_human['iba_rejected'] != True)]\n", "test_cases_df" ], "id": "292a3fc05eafeb5f", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate \\\n", "4876 UniProtKB:A6NNL5 C15orf61 located_in \n", "5853 UniProtKB:A8MSI8 LYRM9 located_in \n", "6327 UniProtKB:A8MXV4 NUDT19 located_in \n", "15942 UniProtKB:O14521 SDHD part_of \n", "34482 UniProtKB:O43325 LYRM1 located_in \n", "... ... ... ... \n", "718362 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 acts_upstream_of \n", "718366 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "718367 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "718368 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "718370 RNAcentral:URS00026A23F2_9606 URS00026A23F2_9606 involved_in \n", "\n", " object object_label \\\n", "4876 GO:0005739 mitochondrion \n", "5853 GO:0005739 mitochondrion \n", "6327 GO:0005739 mitochondrion \n", "15942 GO:0045273 respiratory chain complex II (succinate dehydr... \n", "34482 GO:0005739 mitochondrion \n", "... ... ... \n", "718362 GO:0008284 positive regulation of cell population prolife... \n", "718366 GO:0000122 negative regulation of transcription by RNA po... \n", "718367 GO:0000512 lncRNA-mediated post-transcriptional gene sile... \n", "718368 GO:0000512 lncRNA-mediated post-transcriptional gene sile... \n", "718370 GO:0090399 replicative senescence \n", "\n", " object_obsoletes object_uninformative \\\n", "4876 False False \n", "5853 False False \n", "6327 False False \n", "15942 False False \n", "34482 False False \n", "... ... ... \n", "718362 False False \n", "718366 False False \n", "718367 False False \n", "718368 False False \n", "718370 False False \n", "\n", " object_closure \\\n", "4876 {GO:0005739, GO:0110165, GO:0043231, BFO:00000... \n", "5853 {GO:0005739, GO:0110165, GO:0043231, BFO:00000... \n", "6327 {GO:0005739, GO:0110165, GO:0043231, BFO:00000... \n", "15942 {GO:0098796, GO:0110165, GO:0045273, BFO:00000... \n", "34482 {GO:0005739, GO:0110165, GO:0043231, BFO:00000... \n", "... ... \n", "718362 {GO:0065007, GO:0042127, GO:0048518, BFO:00000... \n", "718366 {BFO:0000015, GO:0006357, GO:0008150, GO:00098... \n", "718367 {GO:0000512, BFO:0000015, GO:0016441, GO:00081... \n", "718368 {GO:0000512, BFO:0000015, GO:0016441, GO:00081... \n", "718370 {GO:0008152, BFO:0000015, GO:0090399, GO:00099... \n", "\n", " object_closure_redundant evidence ... \\\n", "4876 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... \n", "5853 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... \n", "6327 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... \n", "15942 {CARO:0030000, GO:0098803, GO:0098796, CARO:00... IDA ... \n", "34482 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... HTP ... \n", "... ... ... ... \n", "718362 {GO:0065007, GO:0042127, GO:0048518, BFO:00000... IMP ... \n", "718366 {GO:0045934, GO:0009892, GO:0065007, GO:000988... IMP ... \n", "718367 {GO:0065007, GO:0009892, GO:0031047, GO:000988... IDA ... \n", "718368 {GO:0065007, GO:0009892, GO:0031047, GO:000988... IMP ... \n", "718370 {GO:0008152, BFO:0000015, GO:0009987, GO:00081... IMP ... \n", "\n", " pubs release pmid_new is_new fresh pmid_removed unique \\\n", "4876 [PMID:34800366] 2024-11-03 True True True NaN NaN \n", "5853 [PMID:34800366] 2024-11-03 True True True NaN NaN \n", "6327 [PMID:34800366] 2024-11-03 True True True NaN NaN \n", "15942 [PMID:37098072] 2024-11-03 True True True NaN NaN \n", "34482 [PMID:34800366] 2024-11-03 True True True NaN NaN \n", "... ... ... ... ... ... ... ... \n", "718362 [PMID:33102210] 2024-11-03 True True True NaN NaN \n", "718366 [PMID:27634931] 2024-11-03 True True True NaN NaN \n", "718367 [PMID:30720199] 2024-11-03 True True True NaN NaN \n", "718368 [PMID:33102210] 2024-11-03 True True True NaN NaN \n", "718370 [PMID:27634931] 2024-11-03 True True True NaN NaN \n", "\n", " redacted redundant iba_rejected \n", "4876 NaN False False \n", "5853 NaN False False \n", "6327 NaN False False \n", "15942 NaN False False \n", "34482 NaN False False \n", "... ... ... ... \n", "718362 NaN False False \n", "718366 NaN False False \n", "718367 NaN False False \n", "718368 NaN False False \n", "718370 NaN False False \n", "\n", "[524 rows x 23 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidence...pubsreleasepmid_newis_newfreshpmid_removeduniqueredactedredundantiba_rejected
4876UniProtKB:A6NNL5C15orf61located_inGO:0005739mitochondrionFalseFalse{GO:0005739, GO:0110165, GO:0043231, BFO:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...HTP...[PMID:34800366]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
5853UniProtKB:A8MSI8LYRM9located_inGO:0005739mitochondrionFalseFalse{GO:0005739, GO:0110165, GO:0043231, BFO:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...HTP...[PMID:34800366]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
6327UniProtKB:A8MXV4NUDT19located_inGO:0005739mitochondrionFalseFalse{GO:0005739, GO:0110165, GO:0043231, BFO:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...HTP...[PMID:34800366]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
15942UniProtKB:O14521SDHDpart_ofGO:0045273respiratory chain complex II (succinate dehydr...FalseFalse{GO:0098796, GO:0110165, GO:0045273, BFO:00000...{CARO:0030000, GO:0098803, GO:0098796, CARO:00...IDA...[PMID:37098072]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
34482UniProtKB:O43325LYRM1located_inGO:0005739mitochondrionFalseFalse{GO:0005739, GO:0110165, GO:0043231, BFO:00000...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...HTP...[PMID:34800366]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
..................................................................
718362RNAcentral:URS00026A23F2_9606URS00026A23F2_9606acts_upstream_ofGO:0008284positive regulation of cell population prolife...FalseFalse{GO:0065007, GO:0042127, GO:0048518, BFO:00000...{GO:0065007, GO:0042127, GO:0048518, BFO:00000...IMP...[PMID:33102210]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
718366RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0000122negative regulation of transcription by RNA po...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00098...{GO:0045934, GO:0009892, GO:0065007, GO:000988...IMP...[PMID:27634931]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
718367RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0000512lncRNA-mediated post-transcriptional gene sile...FalseFalse{GO:0000512, BFO:0000015, GO:0016441, GO:00081...{GO:0065007, GO:0009892, GO:0031047, GO:000988...IDA...[PMID:30720199]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
718368RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0000512lncRNA-mediated post-transcriptional gene sile...FalseFalse{GO:0000512, BFO:0000015, GO:0016441, GO:00081...{GO:0065007, GO:0009892, GO:0031047, GO:000988...IMP...[PMID:33102210]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
718370RNAcentral:URS00026A23F2_9606URS00026A23F2_9606involved_inGO:0090399replicative senescenceFalseFalse{GO:0008152, BFO:0000015, GO:0090399, GO:00099...{GO:0008152, BFO:0000015, GO:0009987, GO:00081...IMP...[PMID:27634931]2024-11-03TrueTrueTrueNaNNaNNaNFalseFalse
\n", "

524 rows × 23 columns

\n", "
" ] }, "execution_count": 332, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 332 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T16:39:16.785481Z", "start_time": "2025-01-06T16:39:16.748334Z" } }, "cell_type": "code", "source": [ "def row_to_test_case(row, answer=\"YES\"):\n", " gene = row['subject_label']\n", " term = row['object_label']\n", " predicate = row['predicate']\n", " if not gene or not term or not predicate:\n", " return\n", " \n", " return {\n", " \"input\": f\"{gene} {predicate} {term}\",\n", " \"original_input\": {\n", " \"subject\": str(row['subject']),\n", " \"predicate\": str(row['predicate']),\n", " \"object\": str(row['object']),\n", " },\n", " \"ideal\": answer,\n", " }\n", "\n", "\n", "def df_to_test_cases(df: pd.DataFrame, limit=1000):\n", " cases = [row_to_test_case(row) for _, row in df.iterrows()]\n", " cases = [x for x in cases if x is not None]\n", " if limit:\n", " cases = cases[:limit]\n", " return cases\n", "\n", "df_to_test_cases(test_cases_df, limit=5)" ], "id": "ef5c09fd349f7646", "outputs": [ { "data": { "text/plain": [ "[{'input': 'C15orf61 located_in mitochondrion',\n", " 'original_input': {'subject': 'UniProtKB:A6NNL5',\n", " 'predicate': 'located_in',\n", " 'object': 'GO:0005739'},\n", " 'ideal': 'YES'},\n", " {'input': 'LYRM9 located_in mitochondrion',\n", " 'original_input': {'subject': 'UniProtKB:A8MSI8',\n", " 'predicate': 'located_in',\n", " 'object': 'GO:0005739'},\n", " 'ideal': 'YES'},\n", " {'input': 'NUDT19 located_in mitochondrion',\n", " 'original_input': {'subject': 'UniProtKB:A8MXV4',\n", " 'predicate': 'located_in',\n", " 'object': 'GO:0005739'},\n", " 'ideal': 'YES'},\n", " {'input': 'SDHD part_of respiratory chain complex II (succinate dehydrogenase)',\n", " 'original_input': {'subject': 'UniProtKB:O14521',\n", " 'predicate': 'part_of',\n", " 'object': 'GO:0045273'},\n", " 'ideal': 'YES'},\n", " {'input': 'LYRM1 located_in mitochondrion',\n", " 'original_input': {'subject': 'UniProtKB:O43325',\n", " 'predicate': 'located_in',\n", " 'object': 'GO:0005739'},\n", " 'ideal': 'YES'}]" ] }, "execution_count": 350, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 350 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T02:34:40.733465Z", "start_time": "2025-01-06T02:34:40.706998Z" } }, "cell_type": "code", "source": "#cases = [row_to_test_case(row) for _, row in test_cases_df.iterrows()]\n", "id": "db505d632ca52fde", "outputs": [], "execution_count": 341 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T02:35:09.227224Z", "start_time": "2025-01-06T02:35:09.064306Z" } }, "cell_type": "code", "source": [ "import yaml\n", "\n", "with open(\"output/test-cases.yaml\", \"w\") as f:\n", " yaml.dump({\"cases\": cases}, f, sort_keys=False)" ], "id": "43a7fc62e139e2a8", "outputs": [], "execution_count": 343 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T04:59:05.656278Z", "start_time": "2025-01-06T04:59:05.599644Z" } }, "cell_type": "code", "source": [ "redacted_df = new_human[(new_human['redacted'] == True) & (new_human['object_uninformative'] == False) & (new_human['negated'] != True) & (new_human['redundant'] != True) & (new_human['iba_rejected'] != True)]\n", "redacted_df" ], "id": "592aef9b9d45259a", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate \\\n", "401144 UniProtKB:Q8N6R0 METTL13 involved_in \n", "401146 UniProtKB:Q8N6R0 METTL13 involved_in \n", "453842 UniProtKB:Q96K19 RNF170 involved_in \n", "587451 RNAcentral:URS0000083D87_9606 URS0000083D87_9606 involved_in \n", "587452 RNAcentral:URS0000083D87_9606 URS0000083D87_9606 involved_in \n", "... ... ... ... \n", "424250 UniProtKB:Q9Y226 SLC22A13 None \n", "425053 UniProtKB:Q9Y267 SLC22A14 None \n", "425054 UniProtKB:Q9Y267 SLC22A14 None \n", "425055 UniProtKB:Q9Y267 SLC22A14 None \n", "432681 UniProtKB:Q9Y5M6 OCLM None \n", "\n", " object object_label \\\n", "401144 GO:0000122 negative regulation of transcription by RNA po... \n", "401146 GO:1902807 negative regulation of cell cycle G1/S phase t... \n", "453842 GO:0034140 negative regulation of toll-like receptor 3 si... \n", "587451 GO:0035195 miRNA-mediated post-transcriptional gene silen... \n", "587452 GO:0090051 negative regulation of cell migration involved... \n", "... ... ... \n", "424250 GO:0015695 organic cation transport \n", "425053 GO:0005887 None \n", "425054 GO:0015101 organic cation transmembrane transporter activity \n", "425055 GO:0015695 organic cation transport \n", "432681 GO:0007601 visual perception \n", "\n", " object_obsoletes object_uninformative \\\n", "401144 False False \n", "401146 False False \n", "453842 False False \n", "587451 False False \n", "587452 False False \n", "... ... ... \n", "424250 False False \n", "425053 True False \n", "425054 False False \n", "425055 False False \n", "432681 False False \n", "\n", " object_closure \\\n", "401144 {BFO:0000015, GO:0006357, GO:0008150, GO:00098... \n", "401146 {GO:0065007, GO:1902806, GO:0010948, GO:190198... \n", "453842 {GO:0048585, GO:0062207, GO:1902532, GO:000996... \n", "587451 {BFO:0000015, GO:0016441, GO:0008150, GO:00106... \n", "587452 {GO:0030336, BFO:0000015, GO:0008150, GO:00105... \n", "... ... \n", "424250 {GO:0051234, GO:0006810, BFO:0000015, GO:00156... \n", "425053 {GO:0005887} \n", "425054 {GO:0051234, GO:0055085, GO:0006810, BFO:00000... \n", "425055 {GO:0051234, GO:0006810, BFO:0000015, GO:00156... \n", "432681 {GO:0032501, GO:0050953, GO:0003008, BFO:00000... \n", "\n", " object_closure_redundant evidence ... \\\n", "401144 {GO:0045934, GO:0009892, GO:0065007, GO:000988... IMP ... \n", "401146 {GO:0065007, GO:0010948, GO:1901988, GO:005172... IMP ... \n", "453842 {GO:0048585, GO:0065007, GO:0002682, GO:006220... IDA ... \n", "587451 {GO:0065007, GO:0009892, GO:0031047, GO:000988... IDA ... \n", "587452 {GO:0065007, GO:0030336, BFO:0000015, GO:00485... IGI ... \n", "... ... ... ... \n", "424250 {GO:0051234, GO:0006810, BFO:0000015, GO:00511... NAS ... \n", "425053 {} NAS ... \n", "425054 {GO:0051234, GO:0055085, GO:0006810, BFO:00000... NAS ... \n", "425055 {GO:0051234, GO:0006810, BFO:0000015, GO:00511... NAS ... \n", "432681 {GO:0032501, GO:0050953, GO:0003008, BFO:00000... TAS ... \n", "\n", " pubs release pmid_new is_new fresh pmid_removed unique \\\n", "401144 [PMID:26763933] 2024-06-10 NaN NaN NaN True True \n", "401146 [PMID:26763933] 2024-06-10 NaN NaN NaN True True \n", "453842 [PMID:31076723] 2024-06-10 NaN NaN NaN True True \n", "587451 [PMID:28640956] 2024-06-10 NaN NaN NaN True True \n", "587452 [PMID:28640956] 2024-06-10 NaN NaN NaN True True \n", "... ... ... ... ... ... ... ... \n", "424250 [PMID:10072596] 2020-01-01 NaN NaN NaN True True \n", "425053 [PMID:10072596] 2020-01-01 NaN NaN NaN True True \n", "425054 [PMID:10072596] 2020-01-01 NaN NaN NaN True True \n", "425055 [PMID:10072596] 2020-01-01 NaN NaN NaN True True \n", "432681 [PMID:10362512] 2020-01-01 NaN NaN NaN True True \n", "\n", " redacted redundant iba_rejected \n", "401144 True False False \n", "401146 True False False \n", "453842 True False False \n", "587451 True False False \n", "587452 True False False \n", "... ... ... ... \n", "424250 True False False \n", "425053 True False False \n", "425054 True False False \n", "425055 True False False \n", "432681 True False False \n", "\n", "[83 rows x 23 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidence...pubsreleasepmid_newis_newfreshpmid_removeduniqueredactedredundantiba_rejected
401144UniProtKB:Q8N6R0METTL13involved_inGO:0000122negative regulation of transcription by RNA po...FalseFalse{BFO:0000015, GO:0006357, GO:0008150, GO:00098...{GO:0045934, GO:0009892, GO:0065007, GO:000988...IMP...[PMID:26763933]2024-06-10NaNNaNNaNTrueTrueTrueFalseFalse
401146UniProtKB:Q8N6R0METTL13involved_inGO:1902807negative regulation of cell cycle G1/S phase t...FalseFalse{GO:0065007, GO:1902806, GO:0010948, GO:190198...{GO:0065007, GO:0010948, GO:1901988, GO:005172...IMP...[PMID:26763933]2024-06-10NaNNaNNaNTrueTrueTrueFalseFalse
453842UniProtKB:Q96K19RNF170involved_inGO:0034140negative regulation of toll-like receptor 3 si...FalseFalse{GO:0048585, GO:0062207, GO:1902532, GO:000996...{GO:0048585, GO:0065007, GO:0002682, GO:006220...IDA...[PMID:31076723]2024-06-10NaNNaNNaNTrueTrueTrueFalseFalse
587451RNAcentral:URS0000083D87_9606URS0000083D87_9606involved_inGO:0035195miRNA-mediated post-transcriptional gene silen...FalseFalse{BFO:0000015, GO:0016441, GO:0008150, GO:00106...{GO:0065007, GO:0009892, GO:0031047, GO:000988...IDA...[PMID:28640956]2024-06-10NaNNaNNaNTrueTrueTrueFalseFalse
587452RNAcentral:URS0000083D87_9606URS0000083D87_9606involved_inGO:0090051negative regulation of cell migration involved...FalseFalse{GO:0030336, BFO:0000015, GO:0008150, GO:00105...{GO:0065007, GO:0030336, BFO:0000015, GO:00485...IGI...[PMID:28640956]2024-06-10NaNNaNNaNTrueTrueTrueFalseFalse
..................................................................
424250UniProtKB:Q9Y226SLC22A13NoneGO:0015695organic cation transportFalseFalse{GO:0051234, GO:0006810, BFO:0000015, GO:00156...{GO:0051234, GO:0006810, BFO:0000015, GO:00511...NAS...[PMID:10072596]2020-01-01NaNNaNNaNTrueTrueTrueFalseFalse
425053UniProtKB:Q9Y267SLC22A14NoneGO:0005887NoneTrueFalse{GO:0005887}{}NAS...[PMID:10072596]2020-01-01NaNNaNNaNTrueTrueTrueFalseFalse
425054UniProtKB:Q9Y267SLC22A14NoneGO:0015101organic cation transmembrane transporter activityFalseFalse{GO:0051234, GO:0055085, GO:0006810, BFO:00000...{GO:0051234, GO:0055085, GO:0006810, BFO:00000...NAS...[PMID:10072596]2020-01-01NaNNaNNaNTrueTrueTrueFalseFalse
425055UniProtKB:Q9Y267SLC22A14NoneGO:0015695organic cation transportFalseFalse{GO:0051234, GO:0006810, BFO:0000015, GO:00156...{GO:0051234, GO:0006810, BFO:0000015, GO:00511...NAS...[PMID:10072596]2020-01-01NaNNaNNaNTrueTrueTrueFalseFalse
432681UniProtKB:Q9Y5M6OCLMNoneGO:0007601visual perceptionFalseFalse{GO:0032501, GO:0050953, GO:0003008, BFO:00000...{GO:0032501, GO:0050953, GO:0003008, BFO:00000...TAS...[PMID:10362512]2020-01-01NaNNaNNaNTrueTrueTrueFalseFalse
\n", "

83 rows × 23 columns

\n", "
" ] }, "execution_count": 345, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 345 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T04:59:37.662072Z", "start_time": "2025-01-06T04:59:37.623828Z" } }, "cell_type": "code", "source": [ "cases = [row_to_test_case(row) for _, row in redacted_df.iterrows()]\n", "\n", "with open(\"output/test-cases-redacted.yaml\", \"w\") as f:\n", " yaml.dump({\"cases\": cases}, f, sort_keys=False)" ], "id": "5a9d77bbe489741c", "outputs": [], "execution_count": 346 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T16:24:14.051640Z", "start_time": "2025-01-06T16:24:13.956972Z" } }, "cell_type": "code", "source": [ "iba_df = new_human[(new_human['is_iba'] == True) & (new_human['object_uninformative'] == False) & (new_human['negated'] != True) & (new_human['redundant'] != True)]\n", "iba_df" ], "id": "d1d74a1ac92d43c7", "outputs": [ { "data": { "text/plain": [ " subject subject_label predicate object \\\n", "718564 UniProtKB:Q06418 TYRO3 enables GO:0004714 \n", "718565 UniProtKB:P78559 MAP1A is_active_in GO:0030425 \n", "718566 UniProtKB:Q7L1W4 LRRC8D is_active_in GO:0005737 \n", "718567 UniProtKB:A3QJZ7 PRAMEF27 part_of GO:0031462 \n", "718568 UniProtKB:Q70IA6 MOB2 is_active_in GO:0005634 \n", "... ... ... ... ... \n", "782818 UniProtKB:Q9NZC2 TREM2 involved_in GO:0045088 \n", "782819 UniProtKB:Q9Y2K2 SIK3 enables GO:0050321 \n", "782820 UniProtKB:P43235 CTSK involved_in GO:0051603 \n", "782821 UniProtKB:Q07343 PDE4B enables GO:0047555 \n", "782822 UniProtKB:A6NC42 DPPA5 involved_in GO:0010468 \n", "\n", " object_label object_obsoletes \\\n", "718564 transmembrane receptor protein tyrosine kinase... False \n", "718565 dendrite False \n", "718566 cytoplasm False \n", "718567 Cul2-RING ubiquitin ligase complex False \n", "718568 nucleus False \n", "... ... ... \n", "782818 regulation of innate immune response False \n", "782819 tau-protein kinase activity False \n", "782820 proteolysis involved in protein catabolic process False \n", "782821 3',5'-cyclic-GMP phosphodiesterase activity False \n", "782822 regulation of gene expression False \n", "\n", " object_uninformative \\\n", "718564 False \n", "718565 False \n", "718566 False \n", "718567 False \n", "718568 False \n", "... ... \n", "782818 False \n", "782819 False \n", "782820 False \n", "782821 False \n", "782822 False \n", "\n", " object_closure \\\n", "718564 {GO:0019199, GO:0140096, GO:0003824, GO:001674... \n", "718565 {GO:0030425, GO:0120025, GO:0043005, GO:011016... \n", "718566 {CARO:0030000, UBERON:0000061, CARO:0000003, G... \n", "718567 {GO:0031462, GO:0031461, GO:1990234, BFO:00000... \n", "718568 {GO:0110165, GO:0043231, BFO:0000002, GO:00432... \n", "... ... \n", "782818 {GO:0065007, GO:0002682, GO:0050776, BFO:00000... \n", "782819 {GO:0140096, GO:0003824, GO:0004674, GO:001674... \n", "782820 {GO:0043170, GO:0006508, GO:0044238, GO:000905... \n", "782821 {GO:0047555, GO:0003824, GO:0008081, BFO:00000... \n", "782822 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... \n", "\n", " object_closure_redundant evidence ... \\\n", "718564 {GO:0019199, GO:0003824, BFO:0000015, GO:00167... IBA ... \n", "718565 {CARO:0030000, CL:0002319, CL:0000211, UBERON:... IBA ... \n", "718566 {CARO:0030000, UBERON:0000061, CARO:0000003, C... IBA ... \n", "718567 {GO:0031461, GO:1990234, BFO:0000004, GO:01405... IBA ... \n", "718568 {CARO:0030000, CARO:0000000, BFO:0000004, GO:0... IBA ... \n", "... ... ... ... \n", "782818 {GO:0065007, GO:0002682, GO:0050776, BFO:00000... IBA ... \n", "782819 {GO:0140096, GO:0003824, GO:0004674, GO:001674... IBA ... \n", "782820 {GO:0043170, GO:0006508, GO:0044238, GO:000905... IBA ... \n", "782821 {GO:0003824, GO:0008081, BFO:0000015, GO:00425... IBA ... \n", "782822 {GO:0065007, GO:0060255, GO:0009889, BFO:00000... IBA ... \n", "\n", " pubs release pmid_new is_new fresh pmid_removed \\\n", "718564 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "718565 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "718566 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "718567 [GO_REF:0000033] 2024-11-03 False True False NaN \n", "718568 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "... ... ... ... ... ... ... \n", "782818 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "782819 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "782820 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "782821 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "782822 [GO_REF:0000033] 2024-11-03 False False False NaN \n", "\n", " unique redacted redundant iba_rejected \n", "718564 NaN NaN False False \n", "718565 NaN NaN False False \n", "718566 NaN NaN False False \n", "718567 NaN NaN False False \n", "718568 NaN NaN False False \n", "... ... ... ... ... \n", "782818 NaN NaN False False \n", "782819 NaN NaN False False \n", "782820 NaN NaN False False \n", "782821 NaN NaN False False \n", "782822 NaN NaN False False \n", "\n", "[56487 rows x 23 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectsubject_labelpredicateobjectobject_labelobject_obsoletesobject_uninformativeobject_closureobject_closure_redundantevidence...pubsreleasepmid_newis_newfreshpmid_removeduniqueredactedredundantiba_rejected
718564UniProtKB:Q06418TYRO3enablesGO:0004714transmembrane receptor protein tyrosine kinase...FalseFalse{GO:0019199, GO:0140096, GO:0003824, GO:001674...{GO:0019199, GO:0003824, BFO:0000015, GO:00167...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
718565UniProtKB:P78559MAP1Ais_active_inGO:0030425dendriteFalseFalse{GO:0030425, GO:0120025, GO:0043005, GO:011016...{CARO:0030000, CL:0002319, CL:0000211, UBERON:...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
718566UniProtKB:Q7L1W4LRRC8Dis_active_inGO:0005737cytoplasmFalseFalse{CARO:0030000, UBERON:0000061, CARO:0000003, G...{CARO:0030000, UBERON:0000061, CARO:0000003, C...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
718567UniProtKB:A3QJZ7PRAMEF27part_ofGO:0031462Cul2-RING ubiquitin ligase complexFalseFalse{GO:0031462, GO:0031461, GO:1990234, BFO:00000...{GO:0031461, GO:1990234, BFO:0000004, GO:01405...IBA...[GO_REF:0000033]2024-11-03FalseTrueFalseNaNNaNNaNFalseFalse
718568UniProtKB:Q70IA6MOB2is_active_inGO:0005634nucleusFalseFalse{GO:0110165, GO:0043231, BFO:0000002, GO:00432...{CARO:0030000, CARO:0000000, BFO:0000004, GO:0...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
..................................................................
782818UniProtKB:Q9NZC2TREM2involved_inGO:0045088regulation of innate immune responseFalseFalse{GO:0065007, GO:0002682, GO:0050776, BFO:00000...{GO:0065007, GO:0002682, GO:0050776, BFO:00000...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
782819UniProtKB:Q9Y2K2SIK3enablesGO:0050321tau-protein kinase activityFalseFalse{GO:0140096, GO:0003824, GO:0004674, GO:001674...{GO:0140096, GO:0003824, GO:0004674, GO:001674...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
782820UniProtKB:P43235CTSKinvolved_inGO:0051603proteolysis involved in protein catabolic processFalseFalse{GO:0043170, GO:0006508, GO:0044238, GO:000905...{GO:0043170, GO:0006508, GO:0044238, GO:000905...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
782821UniProtKB:Q07343PDE4BenablesGO:00475553',5'-cyclic-GMP phosphodiesterase activityFalseFalse{GO:0047555, GO:0003824, GO:0008081, BFO:00000...{GO:0003824, GO:0008081, BFO:0000015, GO:00425...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
782822UniProtKB:A6NC42DPPA5involved_inGO:0010468regulation of gene expressionFalseFalse{GO:0065007, GO:0060255, GO:0009889, BFO:00000...{GO:0065007, GO:0060255, GO:0009889, BFO:00000...IBA...[GO_REF:0000033]2024-11-03FalseFalseFalseNaNNaNNaNFalseFalse
\n", "

56487 rows × 23 columns

\n", "
" ] }, "execution_count": 348, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 348 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-06T16:39:47.734345Z", "start_time": "2025-01-06T16:39:44.937046Z" } }, "cell_type": "code", "source": [ "with open(\"output/test-cases-iba.yaml\", \"w\") as f:\n", " yaml.dump({\"cases\": df_to_test_cases(iba_df, limit=1000)}, f, sort_keys=False)" ], "id": "8cd8667bbc5568c0", "outputs": [], "execution_count": 351 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "47d31b2f387a2dc0" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }