{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# CHEBI Slimmer\n",
"\n",
"Creates a simplified version of CHEBI by conflating all members of a conjugate clique."
],
"id": "882b63db7629f552"
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Initial setup\n",
"\n",
"Import the required libraries and use OAK to get an adapter to the CHEBI SQLite database."
],
"id": "74787c2d9005dc21"
},
{
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-11-16T01:30:31.668987Z",
"start_time": "2024-11-16T01:30:29.432380Z"
}
},
"cell_type": "code",
"source": [
"from typing import Optional, List\n",
"from collections import defaultdict\n",
"\n",
"import pandas as pd\n",
"\n",
"from oaklib import get_adapter\n",
"from oaklib.utilities.obograph_utils import reflexive\n",
"from tests.test_converters.test_obo_format import canonical_path\n",
"\n",
"# One shared OAK adapter for the CHEBI SQLite database; later cells use it directly\n",
"# (labels, relationships, descendants) and through `session`, the adapter's own session.\n",
"chebi = get_adapter(\"sqlite:obo:chebi\")\n",
"# session = get_adapter(\"sqlite:obo:chebi\").session\n",
"session = chebi.session"
],
"id": "initial_id",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:30:31.718018Z",
"start_time": "2024-11-16T01:30:31.710177Z"
}
},
"cell_type": "code",
"source": [
"from oaklib.datamodels.vocabulary import IS_A, HAS_PART\n",
"from oaklib.interfaces import OboGraphInterface\n",
"\n",
"assert isinstance(chebi, OboGraphInterface)\n"
],
"id": "237c5d0906ac560c",
"outputs": [],
"execution_count": 2
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Set up vocabulary constants",
"id": "64e69d5dbfbdc79b"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:30:31.795374Z",
"start_time": "2024-11-16T01:30:31.792642Z"
}
},
"cell_type": "code",
"source": [
"# Relations\n",
"CBO = \"obo:chebi#is_conjugate_base_of\"\n",
"CAO = \"obo:chebi#is_conjugate_acid_of\"\n",
"TAUTOMER_OF = \"obo:chebi#is_tautomer_of\"\n",
"ENANTIOMER_OF = \"obo:chebi#is_enantiomer_of\"\n",
"HAS_ROLE = \"RO:0000087\""
],
"id": "703ad334757e40df",
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:30:31.831712Z",
"start_time": "2024-11-16T01:30:31.829565Z"
}
},
"cell_type": "code",
"source": [
"\n",
"CHEMICAL_ENTITY = \"CHEBI:24431\""
],
"id": "4538a10374439f3b",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:30:31.926589Z",
"start_time": "2024-11-16T01:30:31.885548Z"
}
},
"cell_type": "code",
"source": [
"# modify this for testing\n",
"# NOTE(review): AMINO_ACID is only assigned in a later cell, so enabling the alternative\n",
"# below raises NameError on a fresh Restart & Run All unless that assignment is moved up.\n",
"# ROOT = AMINO_ACID\n",
"ROOT = CHEMICAL_ENTITY"
],
"id": "d9a11b7b7e44919a",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:30:32.257077Z",
"start_time": "2024-11-16T01:30:32.249103Z"
}
},
"cell_type": "code",
"source": [
"AMINO_ACID = \"CHEBI:33709\"\n",
"AMINO_ACID_ANION = \"CHEBI:37022\"\n",
"ION = \"CHEBI:24870\" \n",
"ALPHA_AMINO_ACID = \"CHEBI:33704\"\n",
"ALPHA_AMINO_ACID_ANION = \"CHEBI:33558\"\n",
"ALPHA_AMINO_ACID_ZWITTERION = \"CHEBI:78608\"\n",
"CYSTEINE_ZWITTERION = \"CHEBI:35237\"\n",
"L_CYSTEINE_ZWITTERION = \"CHEBI:35235\"\n",
"CYSTEINATE_1_MINUS = \"CHEBI:32456\"\n",
"CYSTEINIUM = \"CHEBI:32458\"\n",
"CORD_E = \"CHEBI:213754\"\n",
"AAAE = \"CHEBI:46874\"\n",
"CITRIC_ACID = \"CHEBI:30769\"\n",
"\n",
"AMMONIA=\"CHEBI:16134\"\n",
"AMMONIUM=\"CHEBI:28938\"\n",
"AZANIDE=\"CHEBI:29337\"\n",
"HYRDRIDONITRATE_2M = \"CHEBI:29340\"\n",
"PECTIN = \"CHEBI:17309\"\n",
"WATER = \"CHEBI:15377\"\n",
"\n",
"# Terms excluded from conjugate-clique detection: any conjugate/tautomer relationship\n",
"# with one of these as subject or object is dropped before the conjugate graph is built.\n",
"# NOTE(review): HYRDRIDONITRATE_2M looks like a misspelling of HYDRIDONITRATE_2M;\n",
"# the name is left unchanged here in case other cells reference it.\n",
"CONJ_EXCLUDES = {\n",
"    AMMONIA, AMMONIUM, AZANIDE, HYRDRIDONITRATE_2M, PECTIN\n",
"}\n",
"\n"
],
"id": "6b59749828d83578",
"outputs": [],
"execution_count": 6
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## All labels",
"id": "16ad1bd007f1dd6f"
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:30:51.852164Z",
"start_time": "2024-11-16T01:30:32.346045Z"
}
},
"cell_type": "code",
"source": [
"# CURIE -> label for every entity known to the adapter\n",
"labels = dict(chebi.labels(chebi.entities()))\n",
"len(labels)"
],
"id": "bb9da82d69bf53ff",
"outputs": [
{
"data": {
"text/plain": [
"200959"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Mappings",
"id": "ab80728e19185a45"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:01.875034Z",
"start_time": "2024-11-16T01:30:51.888994Z"
}
},
"cell_type": "code",
"source": [
"from semsql.sqla.semsql import Statements, HasDbxrefStatement\n",
"# Database cross-references grouped by CHEBI term; non-CHEBI subjects are ignored.\n",
"xrefs = defaultdict(list)\n",
"q = session.query(HasDbxrefStatement)\n",
"for row in q:\n",
"    if not row.subject.startswith(\"CHEBI:\"):\n",
"        continue\n",
"    xrefs[row.subject].append(row.value)\n",
"len(xrefs)"
],
"id": "9f9bba0c78606429",
"outputs": [
{
"data": {
"text/plain": [
"161158"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:03.840582Z",
"start_time": "2024-11-16T01:31:01.983823Z"
}
},
"cell_type": "code",
"source": [
"q = session.query(Statements).filter(Statements.predicate == \"obo:chebi/inchi\")\n",
"inchis = {row.subject: row.value for row in q}\n",
"len(inchis)"
],
"id": "8bb68491d1485660",
"outputs": [
{
"data": {
"text/plain": [
"177528"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:03.966190Z",
"start_time": "2024-11-16T01:31:03.962683Z"
}
},
"cell_type": "code",
"source": [
"S3H = \"CHEBI:113373\"\n",
"inchis[S3H]"
],
"id": "12371488c8dad5e6",
"outputs": [
{
"data": {
"text/plain": [
"'InChI=1S/C4H8O3.Na/c1-3(5)2-4(6)7;/h3,5H,2H2,1H3,(H,6,7);/q;+1/p-1'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 10
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Various Relationships\n",
"id": "a56da86f0437f78f"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:07.972053Z",
"start_time": "2024-11-16T01:31:04.161443Z"
}
},
"cell_type": "code",
"source": [
"# has-part / has-role / enantiomer relationships, indexed by subject as (predicate, object).\n",
"# Predicates listed in PMAP are rewritten (the CHEBI enantiomer predicate becomes RO:0018039).\n",
"PMAP = {ENANTIOMER_OF: \"RO:0018039\"}\n",
"preserved_rels = list(\n",
"    chebi.relationships(predicates=[HAS_PART, HAS_ROLE, ENANTIOMER_OF])\n",
")\n",
"preserved_rels_by_subject = defaultdict(list)\n",
"for s, p, o in preserved_rels:\n",
"    preserved_rels_by_subject[s].append((PMAP.get(p, p), o))\n",
"assert len(preserved_rels_by_subject) > 1000"
],
"id": "1b818c23743b2dd8",
"outputs": [],
"execution_count": 11
},
{
"metadata": {},
"cell_type": "markdown",
"source": "",
"id": "79ff715a10f5ee8c"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Retrieve all Charge States\n",
"id": "f7763be85ede7b0a"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:08.113257Z",
"start_time": "2024-11-16T01:31:08.111324Z"
}
},
"cell_type": "code",
"source": "",
"id": "2c57db8dcdc42d30",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:10.946952Z",
"start_time": "2024-11-16T01:31:08.254299Z"
}
},
"cell_type": "code",
"source": [
"from semsql.sqla.semsql import Statements, HasDbxrefStatement\n",
"\n",
"# Reuse the adapter opened in the setup cell instead of building a second adapter\n",
"# (and a second session) for the same CHEBI database.\n",
"session = chebi.session\n",
"# CHEBI term -> asserted charge (obo:chebi/charge); rows without a value are skipped.\n",
"q = session.query(Statements).filter(Statements.predicate == \"obo:chebi/charge\")\n",
"charges = {row.subject: int(row.value) for row in q if row.value is not None}"
],
"id": "481dce997828261a",
"outputs": [],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:11.099072Z",
"start_time": "2024-11-16T01:31:11.095765Z"
}
},
"cell_type": "code",
"source": "len(charges)",
"id": "5ee9f24a01966ac0",
"outputs": [
{
"data": {
"text/plain": [
"189127"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:11.252359Z",
"start_time": "2024-11-16T01:31:11.249651Z"
}
},
"cell_type": "code",
"source": "assert charges[L_CYSTEINE_ZWITTERION] == 0",
"id": "6c387597c60290a2",
"outputs": [],
"execution_count": 14
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:11.402898Z",
"start_time": "2024-11-16T01:31:11.400351Z"
}
},
"cell_type": "code",
"source": "assert charges[CYSTEINATE_1_MINUS] == -1",
"id": "58b92eac4c3aea65",
"outputs": [],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:11.553674Z",
"start_time": "2024-11-16T01:31:11.551180Z"
}
},
"cell_type": "code",
"source": "assert charges[CITRIC_ACID] == 0",
"id": "c55e6cb12a37ea60",
"outputs": [],
"execution_count": 16
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:11.704268Z",
"start_time": "2024-11-16T01:31:11.701723Z"
}
},
"cell_type": "code",
"source": "assert AMINO_ACID_ANION not in charges, \"X anion terms are agnostic to a SPECIFIC charge\"",
"id": "e7d7762146973a0e",
"outputs": [],
"execution_count": 17
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:12.218497Z",
"start_time": "2024-11-16T01:31:11.857704Z"
}
},
"cell_type": "code",
"source": [
"# Cross-check: derive a net charge from each InChI so it can be compared with the\n",
"# charges asserted in CHEBI.\n",
"inchi_skip = {}  # id -> charge layer (\";\" removed) of InChIs with per-component charges; not parsed\n",
"odd = []  # ids whose charge layer could not be parsed as integers\n",
"charges_by_inchi = {}\n",
"for chebi_id, inchi in inchis.items():\n",
"    toks = inchi.split(\"/\")\n",
"    # q is charge\n",
"    qtoks = [tok for tok in toks if tok.startswith(\"q\")]\n",
"    if qtoks:\n",
"        qtok = qtoks[0]\n",
"        if \";\" in qtok:\n",
"            # one charge per component: not summed here, so the whole entry is skipped\n",
"            inchi_skip[chebi_id] = qtok.replace(\";\", \"\")\n",
"            continue\n",
"        try:\n",
"            # \"m*c\" is read as a product (m components of charge c); a plain \"c\" is the\n",
"            # single-factor case of the same rule\n",
"            charge = 1\n",
"            for mpart in qtok[1:].split(\"*\"):\n",
"                charge *= int(mpart)\n",
"        except ValueError:\n",
"            odd.append(chebi_id)\n",
"            continue\n",
"        # store the result for both forms (the multiplied form used to be computed and then dropped)\n",
"        charges_by_inchi[chebi_id] = charge\n",
"    # p is protonation\n",
"    ptoks = [tok for tok in toks if tok.startswith(\"p\")]\n",
"    if ptoks:\n",
"        try:\n",
"            protons = int(ptoks[0][1:])\n",
"        except ValueError:\n",
"            # best effort: a proton layer that is not a single integer is ignored\n",
"            protons = None\n",
"        if protons is not None:\n",
"            charges_by_inchi[chebi_id] = charges_by_inchi.get(chebi_id, 0) + protons\n",
"\n",
"len(charges_by_inchi)"
],
"id": "da085426a91c079a",
"outputs": [
{
"data": {
"text/plain": [
"10701"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 18
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:12.390567Z",
"start_time": "2024-11-16T01:31:12.372527Z"
}
},
"cell_type": "code",
"source": [
"# One row per term whose InChI-derived charge is absent from, or disagrees with, the asserted charges.\n",
"errs = []\n",
"for chebi_id, inchi_charge in charges_by_inchi.items():\n",
"    if chebi_id in charges:\n",
"        asserted = charges[chebi_id]\n",
"        if asserted == inchi_charge:\n",
"            continue\n",
"        kind = \"MISMATCH\"\n",
"    else:\n",
"        asserted, kind = None, \"MISSING\"\n",
"    errs.append({\"id\": chebi_id, \"inchi_charge\": inchi_charge, \"asserted_charge\": asserted, \"type\": kind})\n",
"\n",
"cedf = pd.DataFrame(errs)\n",
"cedf"
],
"id": "b46c92961495ac6c",
"outputs": [
{
"data": {
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []"
],
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 19
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:12.576946Z",
"start_time": "2024-11-16T01:31:12.574952Z"
}
},
"cell_type": "code",
"source": "# charges_by_inchi[L_CYSTEINE_ZWITTERION]",
"id": "4b25096456c81bdf",
"outputs": [],
"execution_count": 20
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Find Conjugate Cliques",
"id": "9e690eb32a60624"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:15.101877Z",
"start_time": "2024-11-16T01:31:12.580466Z"
}
},
"cell_type": "code",
"source": [
"# Asserted conjugate-base / conjugate-acid / tautomer relationships, minus any whose\n",
"# subject or object is an excluded term.\n",
"conjrels = [\n",
"    (s, p, o)\n",
"    for s, p, o in chebi.relationships(predicates=[CBO, CAO, TAUTOMER_OF])\n",
"    if s not in CONJ_EXCLUDES and o not in CONJ_EXCLUDES\n",
"]\n",
"\n",
"# sanity checks on the expected number of relationships\n",
"assert len(conjrels) > 15000\n",
"assert sum(1 for _, p, _ in conjrels if p == CBO) > 8000\n",
"assert sum(1 for _, p, _ in conjrels if p == TAUTOMER_OF) > 1500"
],
"id": "3cb9284c0f7ba8b1",
"outputs": [],
"execution_count": 21
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:15.116942Z",
"start_time": "2024-11-16T01:31:15.106922Z"
}
},
"cell_type": "code",
"source": [
"conjrels_by_subject = defaultdict(list)\n",
"for s, p, o in conjrels:\n",
" conjrels_by_subject[s].append((p, o))"
],
"id": "73446b9fa82786fd",
"outputs": [],
"execution_count": 22
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:15.280816Z",
"start_time": "2024-11-16T01:31:15.279140Z"
}
},
"cell_type": "code",
"source": "",
"id": "12ccc4842a4a2204",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:15.446767Z",
"start_time": "2024-11-16T01:31:15.443711Z"
}
},
"cell_type": "code",
"source": "conjrels_by_subject[\"CHEBI:142854\"]",
"id": "efb8614663ba2054",
"outputs": [
{
"data": {
"text/plain": [
"[('obo:chebi#is_conjugate_acid_of', 'CHEBI:142858'),\n",
" ('obo:chebi#is_tautomer_of', 'CHEBI:142853')]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 23
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Calculate conjugate graph and strongly connected components",
"id": "13e159ee4a9c8964"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:15.833277Z",
"start_time": "2024-11-16T01:31:15.611565Z"
}
},
"cell_type": "code",
"source": [
"from typing import Tuple\n",
"import networkx as nx\n",
"# find strongly connected components using cbos\n",
"\n",
"def calculate_conj_graph(conjrels: List[Tuple[str, str, str]]) -> nx.DiGraph:\n",
"    \"\"\"\n",
"    Build a directed graph in which every relationship is an edge in both directions.\n",
"\n",
"    :param conjrels: (subject, predicate, object) triples; the predicate is ignored\n",
"    :return: graph holding subject->object and object->subject for each triple\n",
"    \"\"\"\n",
"    graph = nx.DiGraph()\n",
"    for subj, _pred, obj in conjrels:\n",
"        graph.add_edges_from(((subj, obj), (obj, subj)))\n",
"    return graph\n",
"\n",
"conj_graph = calculate_conj_graph(conjrels)\n",
"sccs = list(nx.strongly_connected_components(conj_graph))\n",
"asserted_sccs = sccs\n",
"assert len(sccs) > 8000\n",
"\n"
],
"id": "5d1bc9a29166f020",
"outputs": [],
"execution_count": 24
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:16.005371Z",
"start_time": "2024-11-16T01:31:16.002227Z"
}
},
"cell_type": "code",
"source": "len(asserted_sccs)",
"id": "a0fb3be95eeb9eec",
"outputs": [
{
"data": {
"text/plain": [
"8555"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 25
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Lexical analysis\n",
"\n",
"The CHEBI conjugate relationships are incomplete; here we aim to complete them by performing a lexical analysis of the labels.\n",
"\n",
"For example,\n",
"\n",
"- foo acid anion\n",
"- foo acid(1-)\n",
"- foo acid zwitterion\n",
"\n",
"should all be in the same clique."
],
"id": "c84b70bcca329c02"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:16.179810Z",
"start_time": "2024-11-16T01:31:16.173920Z"
}
},
"cell_type": "code",
"source": [
"# Label suffix -> expected charge: an int (exact), a (min, max) tuple (range) or None (no expectation).\n",
"# Insertion order matters: labels are tested against the suffixes in this order and the first\n",
"# match wins, so a longer suffix must come before any shorter suffix it ends with.\n",
"suffixes = {\n",
"    # odd edge cases - e.g. (2R)-glufosinate zwitterion(1-)\n",
"    \"zwitterion(1-)\": -1,\n",
"    \"zwitterion(2-)\": -2,\n",
"    \"anion(1-)\": -1,\n",
"    # standard\n",
"    \"zwitterion\": None,\n",
"    \"anion\": (-99, -1),\n",
"    \"cation\": (1, 99),\n",
"    \"ion\": None,\n",
"    \"ate\": None,\n",
"    \"acid\": None,\n",
"}\n",
"# explicit charge suffixes: (1+), (1-), (2+), (2-), ... (9+), (9-)\n",
"suffixes.update(\n",
"    {f\"({n}{sign})\": int(f\"{sign}{n}\") for n in range(1, 10) for sign in (\"+\", \"-\")}\n",
")\n",
"suffixes"
],
"id": "5fd4bf68445f7b11",
"outputs": [
{
"data": {
"text/plain": [
"{'zwitterion(1-)': -1,\n",
" 'zwitterion(2-)': -2,\n",
" 'anion(1-)': -1,\n",
" 'zwitterion': None,\n",
" 'anion': (-99, -1),\n",
" 'cation': (1, 99),\n",
" 'ion': None,\n",
" 'ate': None,\n",
" 'acid': None,\n",
" '(1+)': 1,\n",
" '(1-)': -1,\n",
" '(2+)': 2,\n",
" '(2-)': -2,\n",
" '(3+)': 3,\n",
" '(3-)': -3,\n",
" '(4+)': 4,\n",
" '(4-)': -4,\n",
" '(5+)': 5,\n",
" '(5-)': -5,\n",
" '(6+)': 6,\n",
" '(6-)': -6,\n",
" '(7+)': 7,\n",
" '(7-)': -7,\n",
" '(8+)': 8,\n",
" '(8-)': -8,\n",
" '(9+)': 9,\n",
" '(9-)': -9}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 26
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:19.602618Z",
"start_time": "2024-11-16T01:31:16.349402Z"
}
},
"cell_type": "code",
"source": "roles = list(chebi.descendants(\"CHEBI:50906\", [IS_A]))",
"id": "c10c64db4580a7ee",
"outputs": [],
"execution_count": 27
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:19.774294Z",
"start_time": "2024-11-16T01:31:19.772106Z"
}
},
"cell_type": "code",
"source": [
"# https://github.com/ebi-chebi/ChEBI/issues/4528\n",
"EXCLUDE_STEMS = [\"disulfide\", \"tartr\", \"tartar\", \"oxide\", \"oxo\"]"
],
"id": "8400b3213240d61b",
"outputs": [],
"execution_count": 28
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:25.527666Z",
"start_time": "2024-11-16T01:31:19.942542Z"
}
},
"cell_type": "code",
"source": [
"from typing import Dict\n",
"\n",
"# stem_to_chem: stem (e.g. \"L-lysine\") -> {suffix: CHEBI ID}\n",
"stem_to_chem: Dict[str, Dict[str, str]] = defaultdict(dict)\n",
"\n",
"\n",
"def _norm(label: str) -> str:\n",
"    \"\"\"Rewrite \"-acid\" as \" acid\"; CHEBI is inconsistent, e.g. \"amino-acid\" vs \"amino acid\".\"\"\"\n",
"    return label.replace(\"-acid\", \" acid\")\n",
"\n",
"def _de_acid(label: str) -> str:\n",
"    \"\"\"\n",
"    Reduce an acid / conjugate-base name to a stem the two forms share.\n",
"\n",
"    Strips, in this order and only at the end of the label, a trailing \" acid\",\n",
"    then a trailing \"ic\", then a trailing \"ate\"; e.g. \"citric acid\" and \"citrate\"\n",
"    both become \"citr\".\n",
"\n",
"    Only the suffix is removed: str.replace would also delete matching text inside\n",
"    the name (\"nicotinic\" -> \"notin\"), so the acid and its \"-ate\" form would no\n",
"    longer end up on the same stem.\n",
"\n",
"    :param label: normalized label or stem\n",
"    :return: the label with the trailing endings removed\n",
"    \"\"\"\n",
"    for ending in (\" acid\", \"ic\", \"ate\"):\n",
"        if label.endswith(ending):\n",
"            label = label[: -len(ending)]\n",
"    return label\n",
"\n",
"# `roles` is a list; a set keeps the per-label membership test constant-time.\n",
"role_ids = set(roles)\n",
"\n",
"# Pass 1: file every labelled non-role term under its stem, keyed by the first matching suffix.\n",
"for chebi_id, label in labels.items():\n",
"    if not label:\n",
"        # TODO: eliminate non-classes\n",
"        continue\n",
"    if chebi_id in role_ids:\n",
"        continue\n",
"    label = _norm(label)\n",
"    for suffix in suffixes:\n",
"        if label.endswith(suffix):\n",
"            # remove only the trailing suffix; str.replace would also delete earlier\n",
"            # occurrences of the same text inside the name\n",
"            stem = label[: -len(suffix)].strip()\n",
"            stem_to_chem[_de_acid(stem)][suffix] = chebi_id\n",
"            break\n",
"\n",
"# Pass 2: a term whose whole label is an existing stem joins that stem under the empty suffix.\n",
"# NOTE(review): the membership test uses the raw label but the entry is written under\n",
"# _de_acid(label); confirm which of the two keys is intended.\n",
"for chebi_id, label in labels.items():\n",
"    if not label:\n",
"        continue\n",
"    label = _norm(label)\n",
"    if label in stem_to_chem:\n",
"        stem_to_chem[_de_acid(label)][\"\"] = chebi_id\n",
"\n",
"for stem in EXCLUDE_STEMS:\n",
"    if stem in stem_to_chem:\n",
"        del stem_to_chem[stem]\n",
"\n",
"assert len(stem_to_chem) > 30000"
],
"id": "191d0cca5fcbbcaf",
"outputs": [],
"execution_count": 29
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:25.734334Z",
"start_time": "2024-11-16T01:31:25.731139Z"
}
},
"cell_type": "code",
"source": "stem_to_chem[\"amino\"]",
"id": "8655db99a49e3963",
"outputs": [
{
"data": {
"text/plain": [
"{'cation': 'CHEBI:33703',\n",
" 'acid': 'CHEBI:33709',\n",
" 'zwitterion': 'CHEBI:35238',\n",
" 'anion': 'CHEBI:37022'}"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 30
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:25.943943Z",
"start_time": "2024-11-16T01:31:25.939811Z"
}
},
"cell_type": "code",
"source": [
"# .get() instead of indexing: indexing the defaultdict would re-insert the excluded\n",
"# stem \"oxo\" as an empty entry.\n",
"stem_to_chem.get(\"oxo\", {})"
],
"id": "18ad460573de1981",
"outputs": [
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 31
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:26.174167Z",
"start_time": "2024-11-16T01:31:26.169880Z"
}
},
"cell_type": "code",
"source": "stem_to_chem[\"citr\"]",
"id": "e9866728e7812eb3",
"outputs": [
{
"data": {
"text/plain": [
"{'(4-)': 'CHEBI:132362',\n",
" 'anion': 'CHEBI:133748',\n",
" '(3-)': 'CHEBI:16947',\n",
" 'acid': 'CHEBI:30769',\n",
" '(1-)': 'CHEBI:35804',\n",
" '(2-)': 'CHEBI:35808'}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 32
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:26.432704Z",
"start_time": "2024-11-16T01:31:26.429980Z"
}
},
"cell_type": "code",
"source": [
"# .get() so the check does not add an empty \"citrate\" entry to the defaultdict\n",
"assert not stem_to_chem.get(\"citrate\")"
],
"id": "218adb893354594a",
"outputs": [],
"execution_count": 33
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:26.649678Z",
"start_time": "2024-11-16T01:31:26.646255Z"
}
},
"cell_type": "code",
"source": "stem_to_chem[\"(2R)-glufosin\"]",
"id": "7f2dc8de7ba44920",
"outputs": [
{
"data": {
"text/plain": [
"{'ate': 'CHEBI:142853',\n",
" 'zwitterion': 'CHEBI:142854',\n",
" 'zwitterion(1-)': 'CHEBI:142858'}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 34
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:26.863908Z",
"start_time": "2024-11-16T01:31:26.861202Z"
}
},
"cell_type": "code",
"source": [
"# ensure edge case of (2R)-glufosinate zwitterion(1-) is taken care of\n",
"# (.get() so the check does not add an empty entry to the defaultdict)\n",
"assert not stem_to_chem.get(\"(2R)-glufosinate zwitterion\")"
],
"id": "4c64ccc7e2241c4c",
"outputs": [],
"execution_count": 35
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:27.071697Z",
"start_time": "2024-11-16T01:31:27.069657Z"
}
},
"cell_type": "code",
"source": "",
"id": "91405e702f57aac8",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:27.282471Z",
"start_time": "2024-11-16T01:31:27.280694Z"
}
},
"cell_type": "code",
"source": "",
"id": "8c0413a76dcfd32b",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Analysis: Consistency check between lexically inferred cliques and asserted relationships\n",
"\n",
"Some of this is reported here:\n",
"\n",
"- https://github.com/ebi-chebi/ChEBI/issues/4524\n"
],
"id": "d18ca068f686eb8e"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:27.717425Z",
"start_time": "2024-11-16T01:31:27.698402Z"
}
},
"cell_type": "code",
"source": [
"import numpy as np\n",
"\n",
"NO_REL = \"NO_REL\"\n",
"INVERSES = {\n",
" CBO: CAO,\n",
" CAO: CBO,\n",
" TAUTOMER_OF: TAUTOMER_OF,\n",
" NO_REL: NO_REL\n",
"}\n",
"\n",
"charge_problems = []\n",
"def _check_expected_charge(chem, expected) -> None:\n",
"    \"\"\"Append to the global charge_problems if the asserted charge of `chem` contradicts `expected`.\"\"\"\n",
"    asserted = charges.get(chem, None)\n",
"    if isinstance(expected, int):\n",
"        # exact charge implied by the suffix, e.g. \"(2-)\"\n",
"        if asserted is None:\n",
"            problem = \"MISSING_CHARGE\"\n",
"        elif expected != asserted:\n",
"            problem = \"CONFLICTING_CHARGE\"\n",
"        else:\n",
"            return\n",
"    elif isinstance(expected, tuple):\n",
"        # (min, max) range implied by the suffix, e.g. \"anion\"; a missing charge is not reported\n",
"        if asserted is None or expected[0] <= asserted <= expected[1]:\n",
"            return\n",
"        problem = \"OUTSIDE_RANGE\"\n",
"    else:\n",
"        return\n",
"    charge_problems.append({\"id\": chem, \"label\": labels.get(chem, chem), \"expected\": expected, \"asserted\": asserted, \"problem\": problem})\n",
"\n",
"\n",
"def _asserted_predicate(chem1, chem2, clique_suffix_dict) -> str:\n",
"    \"\"\"\n",
"    Return the asserted conjugate/tautomer predicate from chem1 to chem2, or NO_REL.\n",
"\n",
"    :raises ValueError: if more than one predicate links the pair in either direction,\n",
"        or the chem2 -> chem1 assertions are not the inverses of the chem1 -> chem2 ones\n",
"    \"\"\"\n",
"    messages = []\n",
"    matched_preds = set()\n",
"    actual_p = NO_REL\n",
"    for p, o in conjrels_by_subject.get(chem1, []):\n",
"        if o == chem2:\n",
"            actual_p = p\n",
"            matched_preds.add(p)\n",
"    if len(matched_preds) > 1:\n",
"        messages.append(f\"Multiple matched preds: {matched_preds}\")\n",
"\n",
"    rev_matched_preds = set()\n",
"    for p, o in conjrels_by_subject.get(chem2, []):\n",
"        if o == chem1:\n",
"            inv_p = INVERSES[p]\n",
"            rev_matched_preds.add(inv_p)\n",
"            # also fires when only the reverse direction is asserted (actual_p is NO_REL)\n",
"            if inv_p != actual_p:\n",
"                messages.append(f\"Preds mismatch: {actual_p} vs {inv_p}\")\n",
"    if len(rev_matched_preds) > 1:\n",
"        messages.append(f\"Multiple matched inv preds: {rev_matched_preds}\")\n",
"    if matched_preds != rev_matched_preds:\n",
"        messages.append(f\"Preds mismatch: {matched_preds} vs {rev_matched_preds}\")\n",
"    if messages:\n",
"        raise ValueError(f\"Error in clique {clique_suffix_dict}: {messages}\")\n",
"    return actual_p\n",
"\n",
"\n",
"def _charge_relation(ch1, ch2):\n",
"    \"\"\"\n",
"    Compare the charges implied by two suffixes.\n",
"\n",
"    :return: (charge_diff, charge_diff_sign); charge_diff is only set for two exact charges,\n",
"        and both are None when either side has no expectation or both are ranges\n",
"    \"\"\"\n",
"    if isinstance(ch1, int) and isinstance(ch2, int):\n",
"        diff = ch1 - ch2\n",
"        return diff, np.sign(diff)\n",
"    if isinstance(ch1, int) and isinstance(ch2, tuple):\n",
"        if ch1 < ch2[0]:\n",
"            return None, -1\n",
"        if ch1 > ch2[1]:\n",
"            return None, 1\n",
"        return None, 0\n",
"    if isinstance(ch1, tuple) and isinstance(ch2, int):\n",
"        # mirror image of the branch above: the whole range has to lie below (or above)\n",
"        # ch2, so the result is the negation of comparing ch2 against the range\n",
"        if ch1[1] < ch2:\n",
"            return None, -1\n",
"        if ch1[0] > ch2:\n",
"            return None, 1\n",
"        return None, 0\n",
"    return None, None\n",
"\n",
"\n",
"def make_conjrefs(clique_suffix_dict: dict, stem=None) -> List:\n",
"    \"\"\"\n",
"    Describe every ordered pair of members of one lexical clique.\n",
"\n",
"    Reads the globals suffixes, charges, labels and conjrels_by_subject, and appends to the\n",
"    global charge_problems whenever a member's asserted charge contradicts its suffix.\n",
"\n",
"    :param clique_suffix_dict: {suffix: CHEBI ID} for one stem\n",
"    :param stem: copied unchanged into every result row\n",
"    :return: one dict per ordered pair of distinct suffixes, with keys suffix1, suffix2,\n",
"        predicate, chem1, chem2, charge_diff, charge_diff_sign and stem\n",
"    :raises ValueError: if the asserted relationships between a pair are ambiguous or not mirrored\n",
"    \"\"\"\n",
"    results = []\n",
"    for suffix1, chem1 in clique_suffix_dict.items():\n",
"        ch1 = suffixes.get(suffix1, None)\n",
"        _check_expected_charge(chem1, ch1)\n",
"        for suffix2, chem2 in clique_suffix_dict.items():\n",
"            if suffix1 == suffix2:\n",
"                continue\n",
"            actual_p = _asserted_predicate(chem1, chem2, clique_suffix_dict)\n",
"            charge_diff, charge_diff_sign = _charge_relation(ch1, suffixes.get(suffix2, None))\n",
"            results.append({\"suffix1\": suffix1 or \"NO_SUFFIX\",\n",
"                            \"suffix2\": suffix2 or \"NO_SUFFIX\",\n",
"                            \"predicate\": actual_p,\n",
"                            \"chem1\": chem1,\n",
"                            \"chem2\": chem2,\n",
"                            \"charge_diff\": charge_diff,\n",
"                            \"charge_diff_sign\": charge_diff_sign,\n",
"                            \"stem\": stem,\n",
"                            })\n",
"    return results\n",
" \n",
"make_conjrefs(stem_to_chem[\"amino\"])"
],
"id": "f82ad334be85fe9f",
"outputs": [
{
"data": {
"text/plain": [
"[{'suffix1': 'cation',\n",
" 'suffix2': 'acid',\n",
" 'predicate': 'obo:chebi#is_conjugate_acid_of',\n",
" 'chem1': 'CHEBI:33703',\n",
" 'chem2': 'CHEBI:33709',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'cation',\n",
" 'suffix2': 'zwitterion',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:33703',\n",
" 'chem2': 'CHEBI:35238',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'cation',\n",
" 'suffix2': 'anion',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:33703',\n",
" 'chem2': 'CHEBI:37022',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'acid',\n",
" 'suffix2': 'cation',\n",
" 'predicate': 'obo:chebi#is_conjugate_base_of',\n",
" 'chem1': 'CHEBI:33709',\n",
" 'chem2': 'CHEBI:33703',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'acid',\n",
" 'suffix2': 'zwitterion',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:33709',\n",
" 'chem2': 'CHEBI:35238',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'acid',\n",
" 'suffix2': 'anion',\n",
" 'predicate': 'obo:chebi#is_conjugate_acid_of',\n",
" 'chem1': 'CHEBI:33709',\n",
" 'chem2': 'CHEBI:37022',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion',\n",
" 'suffix2': 'cation',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:35238',\n",
" 'chem2': 'CHEBI:33703',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion',\n",
" 'suffix2': 'acid',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:35238',\n",
" 'chem2': 'CHEBI:33709',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion',\n",
" 'suffix2': 'anion',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:35238',\n",
" 'chem2': 'CHEBI:37022',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'anion',\n",
" 'suffix2': 'cation',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:37022',\n",
" 'chem2': 'CHEBI:33703',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'anion',\n",
" 'suffix2': 'acid',\n",
" 'predicate': 'obo:chebi#is_conjugate_base_of',\n",
" 'chem1': 'CHEBI:37022',\n",
" 'chem2': 'CHEBI:33709',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'anion',\n",
" 'suffix2': 'zwitterion',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:37022',\n",
" 'chem2': 'CHEBI:35238',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None}]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 36
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:27.757292Z",
"start_time": "2024-11-16T01:31:27.752910Z"
}
},
"cell_type": "code",
"source": "make_conjrefs(stem_to_chem[\"(2R)-glufosin\"])",
"id": "be092eb65d52d565",
"outputs": [
{
"data": {
"text/plain": [
"[{'suffix1': 'ate',\n",
" 'suffix2': 'zwitterion',\n",
" 'predicate': 'obo:chebi#is_tautomer_of',\n",
" 'chem1': 'CHEBI:142853',\n",
" 'chem2': 'CHEBI:142854',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'ate',\n",
" 'suffix2': 'zwitterion(1-)',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:142853',\n",
" 'chem2': 'CHEBI:142858',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion',\n",
" 'suffix2': 'ate',\n",
" 'predicate': 'obo:chebi#is_tautomer_of',\n",
" 'chem1': 'CHEBI:142854',\n",
" 'chem2': 'CHEBI:142853',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion',\n",
" 'suffix2': 'zwitterion(1-)',\n",
" 'predicate': 'obo:chebi#is_conjugate_acid_of',\n",
" 'chem1': 'CHEBI:142854',\n",
" 'chem2': 'CHEBI:142858',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion(1-)',\n",
" 'suffix2': 'ate',\n",
" 'predicate': 'NO_REL',\n",
" 'chem1': 'CHEBI:142858',\n",
" 'chem2': 'CHEBI:142853',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None},\n",
" {'suffix1': 'zwitterion(1-)',\n",
" 'suffix2': 'zwitterion',\n",
" 'predicate': 'obo:chebi#is_conjugate_base_of',\n",
" 'chem1': 'CHEBI:142858',\n",
" 'chem2': 'CHEBI:142854',\n",
" 'charge_diff': None,\n",
" 'charge_diff_sign': None,\n",
" 'stem': None}]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 37
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:26.798272Z",
"start_time": "2024-11-16T01:57:26.634185Z"
}
},
"cell_type": "code",
"source": "!mkdir -p tmp",
"id": "66516f31cdfd6ef5",
"outputs": [],
"execution_count": 39
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:27.594776Z",
"start_time": "2024-11-16T01:57:27.489243Z"
}
},
"cell_type": "code",
"source": [
"# make_conjrefs appends to the global charge_problems list, so start from an empty one\n",
"charge_problems = []\n",
"lexical_conj_pairs = []\n",
"for stem, clique in stem_to_chem.items():\n",
"    lexical_conj_pairs.extend(make_conjrefs(clique, stem))\n",
"\n",
"# Reported here: https://github.com/ebi-chebi/ChEBI/issues/4525\n",
"charge_problems_table = pd.DataFrame(charge_problems)\n",
"charge_problems_table.to_csv(\"tmp/charge_problems.csv\", index=False)\n",
"\n",
"len(lexical_conj_pairs)"
],
"id": "c5ef02e4ed64a4d3",
"outputs": [
{
"data": {
"text/plain": [
"17170"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 40
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:28.245587Z",
"start_time": "2024-11-16T01:57:28.229896Z"
}
},
"cell_type": "code",
"source": [
"# CHEBI ID -> the single stem it was filed under\n",
"chem_to_stem: Dict[str, str] = {}\n",
"\n",
"\n",
"def _assign(chem: str, stem: str):\n",
"    \"\"\"\n",
"    Record `stem` for `chem` in chem_to_stem.\n",
"\n",
"    Defined once, ahead of the loop, instead of being re-created for every row.\n",
"\n",
"    :raises ValueError: if `chem` is already recorded under a different stem\n",
"    \"\"\"\n",
"    if chem_to_stem.setdefault(chem, stem) != stem:\n",
"        raise ValueError(f\"Conflicting stems for {chem}: {chem_to_stem[chem]} vs {stem}\")\n",
"\n",
"\n",
"for row in lexical_conj_pairs:\n",
"    stem = row[\"stem\"]\n",
"    _assign(row[\"chem1\"], stem)\n",
"    _assign(row[\"chem2\"], stem)"
],
"id": "3088a79bf38863f5",
"outputs": [],
"execution_count": 41
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:28.772650Z",
"start_time": "2024-11-16T01:57:28.478678Z"
}
},
"cell_type": "code",
"source": [
"g = calculate_conj_graph([(row[\"chem1\"], \"?\", row[\"chem2\"]) for row in lexical_conj_pairs])\n",
"lexical_sccs = list(nx.strongly_connected_components(g))\n",
"len(lexical_sccs)"
],
"id": "b0612535c952ce23",
"outputs": [
{
"data": {
"text/plain": [
"7519"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 42
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:28.871885Z",
"start_time": "2024-11-16T01:57:28.869895Z"
}
},
"cell_type": "code",
"source": "",
"id": "fd3af467efa5e894",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:29.331493Z",
"start_time": "2024-11-16T01:57:29.302776Z"
}
},
"cell_type": "code",
"source": [
"# venn diagram of overlaps between\n",
"# - lexical_sccs\n",
"# - asserted_sccs\n",
"# - rhea_sccs\n",
"# - full_sccs\n",
"\n",
"from matplotlib_venn import venn3\n",
"from matplotlib_venn import venn3_unweighted\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def hashable_scc(scc):\n",
"    \"\"\"Turn a collection of cliques into a set of sorted member tuples.\n",
"\n",
"    Sorting makes each tuple independent of member order, so equal cliques compare equal.\n",
"\n",
"    :param scc: iterable of cliques, each an iterable of sortable members\n",
"    :return: set holding one sorted tuple per clique\n",
"    \"\"\"\n",
"    return {tuple(sorted(members)) for members in scc}\n",
"\n",
"def my_venn3(sccs, *args, **kwargs):\n",
"    \"\"\"Draw an unweighted Venn diagram comparing collections of cliques.\n",
"\n",
"    :param sccs: the clique collections to compare; each is converted with hashable_scc\n",
"    :param args: positional arguments passed through to venn3_unweighted\n",
"    :param kwargs: keyword arguments passed through to venn3_unweighted\n",
"    \"\"\"\n",
"    comparable_sets = list(map(hashable_scc, sccs))\n",
"    venn3_unweighted(comparable_sets, *args, **kwargs)\n",
"\n",
"#venn3([set(lexical_sccs), set(asserted_sccs), set(rhea_sccs)], (\"Lexical\", \"Asserted\", \"Rhea\"))\n",
"#venn3([{1}, {1,2}, {1,2,tuple(\"a\" \"b\")}])\n",
"my_venn3([lexical_sccs, asserted_sccs, rhea_sccs], (\"Lexical\", \"Asserted\", \"Rhea\"))\n",
"plt.show()"
],
"id": "2305e056b7c20879",
"outputs": [
{
"ename": "NameError",
"evalue": "name 'rhea_sccs' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[43], line 21\u001B[0m\n\u001B[1;32m 17\u001B[0m venn3_unweighted(scc_sets, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 19\u001B[0m \u001B[38;5;66;03m#venn3([set(lexical_sccs), set(asserted_sccs), set(rhea_sccs)], (\"Lexical\", \"Asserted\", \"Rhea\"))\u001B[39;00m\n\u001B[1;32m 20\u001B[0m \u001B[38;5;66;03m#venn3([{1}, {1,2}, {1,2,tuple(\"a\" \"b\")}])\u001B[39;00m\n\u001B[0;32m---> 21\u001B[0m my_venn3([lexical_sccs, asserted_sccs, \u001B[43mrhea_sccs\u001B[49m], (\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mLexical\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAsserted\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mRhea\u001B[39m\u001B[38;5;124m\"\u001B[39m))\n\u001B[1;32m 22\u001B[0m plt\u001B[38;5;241m.\u001B[39mshow()\n",
"\u001B[0;31mNameError\u001B[0m: name 'rhea_sccs' is not defined"
]
}
],
"execution_count": 43
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# same as Euler diagram\n",
"from matplotlib_venn import venn3_unweighted\n"
],
"id": "96f038a31ac729b0"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:29.414228Z",
"start_time": "2024-08-24T00:15:17.567225Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"df = pd.DataFrame(lexical_conj_pairs)"
],
"id": "ce0321a0c59abbe8",
"outputs": [],
"execution_count": 42
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:29.653055Z",
"start_time": "2024-11-16T01:57:29.637167Z"
}
},
"cell_type": "code",
"source": "df",
"id": "b3d677f09cff4d7e",
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[44], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mdf\u001B[49m\n",
"\u001B[0;31mNameError\u001B[0m: name 'df' is not defined"
]
}
],
"execution_count": 44
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:29.856272Z",
"start_time": "2024-11-16T01:57:29.838003Z"
}
},
"cell_type": "code",
"source": "df[(df[\"charge_diff_sign\"] < 0) & (df[\"predicate\"] == CAO)]",
"id": "4f5986951d764fa1",
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[45], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mdf\u001B[49m[(df[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcharge_diff_sign\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m<\u001B[39m \u001B[38;5;241m0\u001B[39m) \u001B[38;5;241m&\u001B[39m (df[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpredicate\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m==\u001B[39m CAO)]\n",
"\u001B[0;31mNameError\u001B[0m: name 'df' is not defined"
]
}
],
"execution_count": 45
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:30.236461Z",
"start_time": "2024-11-16T01:57:30.218120Z"
}
},
"cell_type": "code",
"source": "df[(df[\"charge_diff_sign\"] > 0) & (df[\"predicate\"] == CBO)]",
"id": "577c5ec2b5fe7e60",
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[46], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mdf\u001B[49m[(df[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcharge_diff_sign\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m) \u001B[38;5;241m&\u001B[39m (df[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpredicate\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m==\u001B[39m CBO)]\n",
"\u001B[0;31mNameError\u001B[0m: name 'df' is not defined"
]
}
],
"execution_count": 46
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:30.547390Z",
"start_time": "2024-11-16T01:57:30.531512Z"
}
},
"cell_type": "code",
"source": " df.groupby([\"predicate\"]).size()",
"id": "b381a47bce47eda3",
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[47], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mdf\u001B[49m\u001B[38;5;241m.\u001B[39mgroupby([\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpredicate\u001B[39m\u001B[38;5;124m\"\u001B[39m])\u001B[38;5;241m.\u001B[39msize()\n",
"\u001B[0;31mNameError\u001B[0m: name 'df' is not defined"
]
}
],
"execution_count": 47
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:57:30.855558Z",
"start_time": "2024-11-16T01:57:30.839137Z"
}
},
"cell_type": "code",
"source": "df.to_csv(\"tmp/conjrels.csv\", index=False)\n",
"id": "5ee58eef65ddbc9d",
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[48], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mdf\u001B[49m\u001B[38;5;241m.\u001B[39mto_csv(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtmp/conjrels.csv\u001B[39m\u001B[38;5;124m\"\u001B[39m, index\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m)\n",
"\u001B[0;31mNameError\u001B[0m: name 'df' is not defined"
]
}
],
"execution_count": 48
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.003412Z",
"start_time": "2024-08-24T00:15:18.496412Z"
}
},
"cell_type": "code",
"source": [
"# group by suffix1, suffix2, predicate, and count the number of rows\n",
"summary = df.groupby([\"suffix1\", \"suffix2\", \"predicate\"]).size()\n",
"summary.sort_values(ascending=False)"
],
"id": "f1be2d07a0d5ce95",
"outputs": [
{
"data": {
"text/plain": [
"suffix1 suffix2 predicate \n",
"acid ate obo:chebi#is_conjugate_acid_of 1522\n",
"ate acid obo:chebi#is_conjugate_base_of 1522\n",
"NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of 1332\n",
"(1-) NO_SUFFIX obo:chebi#is_conjugate_base_of 1332\n",
"NO_SUFFIX (4-) obo:chebi#is_conjugate_acid_of 673\n",
" ... \n",
"anion(1-) acid NO_REL 1\n",
"(6-) (2-) NO_REL 1\n",
" (3-) NO_REL 1\n",
" (4-) NO_REL 1\n",
"zwitterion(2-) zwitterion obo:chebi#is_conjugate_base_of 1\n",
"Length: 424, dtype: int64"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 48
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.003568Z",
"start_time": "2024-08-24T00:15:18.593731Z"
}
},
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.stats import entropy\n",
"\n",
"data = df\n",
"\n",
"\n",
"def calculate_entropy(group):\n",
"    \"\"\"Return the entropy of the distribution of values in ``group``.\n",
"\n",
"    :param group: the predicate values of one (suffix1, suffix2) group\n",
"    :return: entropy of the normalized value counts, as computed by scipy.stats.entropy\n",
"    \"\"\"\n",
"    counts = group.value_counts(normalize=True)\n",
"    return entropy(counts)\n",
"\n",
"\n",
"# Grouping the data by suffix1 and suffix2, then calculating entropy for the predicates\n",
"entropy_data = data.groupby(['suffix1', 'suffix2'])['predicate'].apply(calculate_entropy).unstack(fill_value=0)\n",
"\n",
"# Plotting the entropy heatmap with reversed x-axis\n",
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"sns.heatmap(entropy_data.loc[:, ::-1], annot=True, cmap=\"coolwarm\", ax=ax)\n",
"ax.set_title('Entropy of Predicate Distribution by Suffix1 and Suffix2 (Reversed X-Axis)')\n",
"ax.set_xlabel('Suffix2')\n",
"ax.set_ylabel('Suffix1')\n",
"plt.show()\n"
],
"id": "b15618246ffb5c58",
"outputs": [
{
"data": {
"text/plain": [
""
],
"image/png": ""
},
"metadata": {},
"output_type": "display_data"
}
],
"execution_count": 49
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.003682Z",
"start_time": "2024-08-24T00:15:19.254225Z"
}
},
"cell_type": "code",
"source": [
"pivot_table = df.pivot_table(index=['suffix1', 'suffix2'], columns='predicate', aggfunc='size', fill_value=0).reset_index()\n",
"pivot_table.to_csv(\"tmp/pivot_table.csv\", index=False)\n",
"pivot_table"
],
"id": "575eb6dbb6024008",
"outputs": [
{
"data": {
"text/plain": [
"predicate suffix1 suffix2 NO_REL obo:chebi#is_conjugate_acid_of \\\n",
"0 (1+) (1-) 2 2 \n",
"1 (1+) (2+) 19 0 \n",
"2 (1+) (2-) 3 2 \n",
"3 (1+) (3+) 11 0 \n",
"4 (1+) (3-) 2 0 \n",
".. ... ... ... ... \n",
"271 zwitterion(1-) ate 1 0 \n",
"272 zwitterion(1-) zwitterion 0 0 \n",
"273 zwitterion(2-) (3-) 0 1 \n",
"274 zwitterion(2-) acid 1 0 \n",
"275 zwitterion(2-) zwitterion 0 0 \n",
"\n",
"predicate obo:chebi#is_conjugate_base_of obo:chebi#is_tautomer_of \n",
"0 0 0 \n",
"1 19 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
".. ... ... \n",
"271 0 0 \n",
"272 3 0 \n",
"273 0 0 \n",
"274 0 0 \n",
"275 1 0 \n",
"\n",
"[276 rows x 6 columns]"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" predicate | \n",
" suffix1 | \n",
" suffix2 | \n",
" NO_REL | \n",
" obo:chebi#is_conjugate_acid_of | \n",
" obo:chebi#is_conjugate_base_of | \n",
" obo:chebi#is_tautomer_of | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" (1+) | \n",
" (1-) | \n",
" 2 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" (1+) | \n",
" (2+) | \n",
" 19 | \n",
" 0 | \n",
" 19 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" (1+) | \n",
" (2-) | \n",
" 3 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" (1+) | \n",
" (3+) | \n",
" 11 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" (1+) | \n",
" (3-) | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 271 | \n",
" zwitterion(1-) | \n",
" ate | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 272 | \n",
" zwitterion(1-) | \n",
" zwitterion | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 0 | \n",
"
\n",
" \n",
" 273 | \n",
" zwitterion(2-) | \n",
" (3-) | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 274 | \n",
" zwitterion(2-) | \n",
" acid | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 275 | \n",
" zwitterion(2-) | \n",
" zwitterion | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
276 rows × 6 columns
\n",
"
"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 50
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.005059Z",
"start_time": "2024-08-24T00:15:19.334380Z"
}
},
"cell_type": "code",
"source": "df[(df[\"suffix1\"] == \"(1+)\") & (df[\"suffix2\"] == \"NO_SUFFIX\") & (df[\"predicate\"] == CBO)]",
"id": "ecff73edc212adaa",
"outputs": [
{
"data": {
"text/plain": [
" suffix1 suffix2 predicate chem1 \\\n",
"3026 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:141055 \n",
"10936 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:58644 \n",
"12230 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:64003 \n",
"12330 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:64364 \n",
"13406 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:72567 \n",
"14208 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:75297 \n",
"14382 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76278 \n",
"14672 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76819 \n",
"14698 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:76922 \n",
"16426 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:86083 \n",
"16452 (1+) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:86380 \n",
"\n",
" chem2 charge_diff charge_diff_sign \\\n",
"3026 CHEBI:141057 NaN NaN \n",
"10936 CHEBI:32818 NaN NaN \n",
"12230 CHEBI:64004 NaN NaN \n",
"12330 CHEBI:10650 NaN NaN \n",
"13406 CHEBI:6438 NaN NaN \n",
"14208 CHEBI:31057 NaN NaN \n",
"14382 CHEBI:16299 NaN NaN \n",
"14672 CHEBI:15906 NaN NaN \n",
"14698 CHEBI:77055 NaN NaN \n",
"16426 CHEBI:86085 NaN NaN \n",
"16452 CHEBI:599440 NaN NaN \n",
"\n",
" stem \n",
"3026 validoxylamine B \n",
"10936 p-coumaroylagmatine \n",
"12230 N-allyl-6-chloro-1-(3-methylphenyl)-2,3,4,5-te... \n",
"12330 sumatriptan \n",
"13406 levobunolol \n",
"14208 13-deoxydaunorubicin \n",
"14382 dehydrocoformycin \n",
"14672 demethylmacrocin \n",
"14698 argemonine \n",
"16426 (Z)-p-coumaroylagmatine \n",
"16452 amorolfine "
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" suffix1 | \n",
" suffix2 | \n",
" predicate | \n",
" chem1 | \n",
" chem2 | \n",
" charge_diff | \n",
" charge_diff_sign | \n",
" stem | \n",
"
\n",
" \n",
" \n",
" \n",
" 3026 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:141055 | \n",
" CHEBI:141057 | \n",
" NaN | \n",
" NaN | \n",
" validoxylamine B | \n",
"
\n",
" \n",
" 10936 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:58644 | \n",
" CHEBI:32818 | \n",
" NaN | \n",
" NaN | \n",
" p-coumaroylagmatine | \n",
"
\n",
" \n",
" 12230 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:64003 | \n",
" CHEBI:64004 | \n",
" NaN | \n",
" NaN | \n",
" N-allyl-6-chloro-1-(3-methylphenyl)-2,3,4,5-te... | \n",
"
\n",
" \n",
" 12330 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:64364 | \n",
" CHEBI:10650 | \n",
" NaN | \n",
" NaN | \n",
" sumatriptan | \n",
"
\n",
" \n",
" 13406 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:72567 | \n",
" CHEBI:6438 | \n",
" NaN | \n",
" NaN | \n",
" levobunolol | \n",
"
\n",
" \n",
" 14208 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:75297 | \n",
" CHEBI:31057 | \n",
" NaN | \n",
" NaN | \n",
" 13-deoxydaunorubicin | \n",
"
\n",
" \n",
" 14382 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:76278 | \n",
" CHEBI:16299 | \n",
" NaN | \n",
" NaN | \n",
" dehydrocoformycin | \n",
"
\n",
" \n",
" 14672 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:76819 | \n",
" CHEBI:15906 | \n",
" NaN | \n",
" NaN | \n",
" demethylmacrocin | \n",
"
\n",
" \n",
" 14698 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:76922 | \n",
" CHEBI:77055 | \n",
" NaN | \n",
" NaN | \n",
" argemonine | \n",
"
\n",
" \n",
" 16426 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:86083 | \n",
" CHEBI:86085 | \n",
" NaN | \n",
" NaN | \n",
" (Z)-p-coumaroylagmatine | \n",
"
\n",
" \n",
" 16452 | \n",
" (1+) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:86380 | \n",
" CHEBI:599440 | \n",
" NaN | \n",
" NaN | \n",
" amorolfine | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 51
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.006179Z",
"start_time": "2024-08-24T00:15:19.586411Z"
}
},
"cell_type": "code",
"source": "df",
"id": "d20d7eb6fce43eb1",
"outputs": [
{
"data": {
"text/plain": [
" suffix1 suffix2 predicate chem1 \\\n",
"0 acid anion obo:chebi#is_conjugate_acid_of CHEBI:100147 \n",
"1 anion acid obo:chebi#is_conjugate_base_of CHEBI:62070 \n",
"2 acid NO_SUFFIX NO_REL CHEBI:10046 \n",
"3 NO_SUFFIX acid NO_REL CHEBI:10045 \n",
"4 acid ate obo:chebi#is_conjugate_acid_of CHEBI:10072 \n",
"... ... ... ... ... \n",
"17161 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:130073 \n",
"17162 (1-) NO_SUFFIX obo:chebi#is_conjugate_base_of CHEBI:9162 \n",
"17163 NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of CHEBI:79317 \n",
"17164 ate acid obo:chebi#is_conjugate_base_of CHEBI:994 \n",
"17165 acid ate obo:chebi#is_conjugate_acid_of CHEBI:995 \n",
"\n",
" chem2 charge_diff charge_diff_sign \\\n",
"0 CHEBI:62070 NaN NaN \n",
"1 CHEBI:100147 NaN NaN \n",
"2 CHEBI:10045 NaN NaN \n",
"3 CHEBI:10046 NaN NaN \n",
"4 CHEBI:71201 NaN NaN \n",
"... ... ... ... \n",
"17161 CHEBI:91301 NaN NaN \n",
"17162 CHEBI:79317 NaN NaN \n",
"17163 CHEBI:9162 NaN NaN \n",
"17164 CHEBI:995 NaN NaN \n",
"17165 CHEBI:994 NaN NaN \n",
"\n",
" stem \n",
"0 nalidix \n",
"1 nalidix \n",
"2 Wyerone \n",
"3 Wyerone \n",
"4 xanthuren \n",
"... ... \n",
"17161 5,20-diHEPE \n",
"17162 sinigrin \n",
"17163 sinigrin \n",
"17164 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n",
"17165 cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio \n",
"\n",
"[17166 rows x 8 columns]"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" suffix1 | \n",
" suffix2 | \n",
" predicate | \n",
" chem1 | \n",
" chem2 | \n",
" charge_diff | \n",
" charge_diff_sign | \n",
" stem | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" acid | \n",
" anion | \n",
" obo:chebi#is_conjugate_acid_of | \n",
" CHEBI:100147 | \n",
" CHEBI:62070 | \n",
" NaN | \n",
" NaN | \n",
" nalidix | \n",
"
\n",
" \n",
" 1 | \n",
" anion | \n",
" acid | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:62070 | \n",
" CHEBI:100147 | \n",
" NaN | \n",
" NaN | \n",
" nalidix | \n",
"
\n",
" \n",
" 2 | \n",
" acid | \n",
" NO_SUFFIX | \n",
" NO_REL | \n",
" CHEBI:10046 | \n",
" CHEBI:10045 | \n",
" NaN | \n",
" NaN | \n",
" Wyerone | \n",
"
\n",
" \n",
" 3 | \n",
" NO_SUFFIX | \n",
" acid | \n",
" NO_REL | \n",
" CHEBI:10045 | \n",
" CHEBI:10046 | \n",
" NaN | \n",
" NaN | \n",
" Wyerone | \n",
"
\n",
" \n",
" 4 | \n",
" acid | \n",
" ate | \n",
" obo:chebi#is_conjugate_acid_of | \n",
" CHEBI:10072 | \n",
" CHEBI:71201 | \n",
" NaN | \n",
" NaN | \n",
" xanthuren | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 17161 | \n",
" NO_SUFFIX | \n",
" (1-) | \n",
" obo:chebi#is_conjugate_acid_of | \n",
" CHEBI:130073 | \n",
" CHEBI:91301 | \n",
" NaN | \n",
" NaN | \n",
" 5,20-diHEPE | \n",
"
\n",
" \n",
" 17162 | \n",
" (1-) | \n",
" NO_SUFFIX | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:9162 | \n",
" CHEBI:79317 | \n",
" NaN | \n",
" NaN | \n",
" sinigrin | \n",
"
\n",
" \n",
" 17163 | \n",
" NO_SUFFIX | \n",
" (1-) | \n",
" obo:chebi#is_conjugate_acid_of | \n",
" CHEBI:79317 | \n",
" CHEBI:9162 | \n",
" NaN | \n",
" NaN | \n",
" sinigrin | \n",
"
\n",
" \n",
" 17164 | \n",
" ate | \n",
" acid | \n",
" obo:chebi#is_conjugate_base_of | \n",
" CHEBI:994 | \n",
" CHEBI:995 | \n",
" NaN | \n",
" NaN | \n",
" cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio | \n",
"
\n",
" \n",
" 17165 | \n",
" acid | \n",
" ate | \n",
" obo:chebi#is_conjugate_acid_of | \n",
" CHEBI:995 | \n",
" CHEBI:994 | \n",
" NaN | \n",
" NaN | \n",
" cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio | \n",
"
\n",
" \n",
"
\n",
"
17166 rows × 8 columns
\n",
"
"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 52
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Create is-a map",
"id": "d17e17831c235596"
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.006730Z",
"start_time": "2024-08-24T00:15:19.687885Z"
}
},
"cell_type": "code",
"source": "is_as = list(chebi.relationships(predicates=[IS_A]))",
"id": "5a77f734fba5ae24",
"outputs": [],
"execution_count": 53
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.006862Z",
"start_time": "2024-08-24T00:15:25.255263Z"
}
},
"cell_type": "code",
"source": [
"is_a_map = defaultdict(list)\n",
"for s, _, o in is_as:\n",
" is_a_map[s].append(o)"
],
"id": "7a7a9eb7c9110c39",
"outputs": [],
"execution_count": 54
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Fetch Uniprot Synonyms\n",
"\n",
"These are bio-friendly synonyms."
],
"id": "2e35a6a093aae65c"
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.006961Z",
"start_time": "2024-08-24T00:15:25.397779Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": 55,
"source": [
"from semsql.sqla.semsql import OwlAxiomAnnotation\n",
"\n",
"q = session.query(OwlAxiomAnnotation)\n",
"axiom_anns = list(q)"
],
"id": "6ac6247a4be899da"
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.014295Z",
"start_time": "2024-08-24T00:15:38.870996Z"
}
},
"cell_type": "code",
"source": "len(axiom_anns)",
"id": "c2c109cbfa2e46a5",
"outputs": [
{
"data": {
"text/plain": [
"716075"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 56
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.014707Z",
"start_time": "2024-08-24T00:15:39.176170Z"
}
},
"cell_type": "code",
"source": "up_axiom_anns = [row for row in axiom_anns if row.annotation_predicate == \"oio:hasDbXref\" and row.annotation_value == \"UniProt\"]",
"id": "6132886244ebdbe",
"outputs": [],
"execution_count": 57
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.014812Z",
"start_time": "2024-08-24T00:15:39.867494Z"
}
},
"cell_type": "code",
"source": [
"# the UniProt-tagged axiom annotations were already selected into up_axiom_anns above,\n",
"# so reuse them instead of filtering all axiom annotations a second time\n",
"bio_syn_map = {row.subject: row.value for row in up_axiom_anns}\n",
"len(bio_syn_map)"
],
"id": "a021b7d4e91c99e0",
"outputs": [
{
"data": {
"text/plain": [
"16393"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 58
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.014924Z",
"start_time": "2024-08-24T00:15:40.580153Z"
}
},
"cell_type": "code",
"source": "len(bio_syn_map)",
"id": "8ad8345e0db49f7a",
"outputs": [
{
"data": {
"text/plain": [
"16393"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 59
},
{
"metadata": {},
"cell_type": "markdown",
"source": "",
"id": "fc455907eee67b8a"
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## RHEA pH Mapping\n",
"\n",
"RHEA provides a table that maps CHEBI IDs to their pH 7.3 stable forms. The mapping may be reflexive\n",
"(e.g. a stable form will map to itself).\n",
"\n",
"The mappings may not be complete - in particular only leaf nodes are mapped."
],
"id": "d67387abc3bd9dc7"
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.015055Z",
"start_time": "2024-08-24T00:15:40.903477Z"
}
},
"cell_type": "code",
"source": "\n",
"id": "9ab37d9db461f843",
"outputs": [],
"execution_count": 59
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.015145Z",
"start_time": "2024-08-24T00:15:41.217559Z"
}
},
"cell_type": "code",
"source": "",
"id": "6c6c70e637f11eb9",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.015227Z",
"start_time": "2024-08-24T00:15:41.855434Z"
}
},
"cell_type": "code",
"source": [
"import pystow\n",
"ph_mapping_df = pystow.ensure_csv(\"rhea\", url=\"https://ftp.expasy.org/databases/rhea/tsv/chebi_pH7_3_mapping.tsv\")\n",
"for col in [\"CHEBI\", \"CHEBI_PH7_3\"]:\n",
" ph_mapping_df[col] = \"CHEBI:\" + ph_mapping_df[col].astype(str)\n",
"\n"
],
"id": "e5792b2c75c4a675",
"outputs": [],
"execution_count": 60
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.015311Z",
"start_time": "2024-08-24T00:15:41.963911Z"
}
},
"cell_type": "code",
"source": "ph_mapping_df",
"id": "7cc020b7576cc449",
"outputs": [
{
"data": {
"text/plain": [
" CHEBI CHEBI_PH7_3 ORIGIN\n",
"0 CHEBI:3 CHEBI:3 computation\n",
"1 CHEBI:7 CHEBI:7 computation\n",
"2 CHEBI:8 CHEBI:8 computation\n",
"3 CHEBI:19 CHEBI:19 computation\n",
"4 CHEBI:20 CHEBI:20 computation\n",
"... ... ... ...\n",
"119802 CHEBI:691037 CHEBI:691037 computation\n",
"119803 CHEBI:691622 CHEBI:691622 computation\n",
"119804 CHEBI:741548 CHEBI:132939 computation\n",
"119805 CHEBI:744019 CHEBI:744019 computation\n",
"119806 CHEBI:746859 CHEBI:746859 computation\n",
"\n",
"[119807 rows x 3 columns]"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CHEBI | \n",
" CHEBI_PH7_3 | \n",
" ORIGIN | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" CHEBI:3 | \n",
" CHEBI:3 | \n",
" computation | \n",
"
\n",
" \n",
" 1 | \n",
" CHEBI:7 | \n",
" CHEBI:7 | \n",
" computation | \n",
"
\n",
" \n",
" 2 | \n",
" CHEBI:8 | \n",
" CHEBI:8 | \n",
" computation | \n",
"
\n",
" \n",
" 3 | \n",
" CHEBI:19 | \n",
" CHEBI:19 | \n",
" computation | \n",
"
\n",
" \n",
" 4 | \n",
" CHEBI:20 | \n",
" CHEBI:20 | \n",
" computation | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 119802 | \n",
" CHEBI:691037 | \n",
" CHEBI:691037 | \n",
" computation | \n",
"
\n",
" \n",
" 119803 | \n",
" CHEBI:691622 | \n",
" CHEBI:691622 | \n",
" computation | \n",
"
\n",
" \n",
" 119804 | \n",
" CHEBI:741548 | \n",
" CHEBI:132939 | \n",
" computation | \n",
"
\n",
" \n",
" 119805 | \n",
" CHEBI:744019 | \n",
" CHEBI:744019 | \n",
" computation | \n",
"
\n",
" \n",
" 119806 | \n",
" CHEBI:746859 | \n",
" CHEBI:746859 | \n",
" computation | \n",
"
\n",
" \n",
"
\n",
"
119807 rows × 3 columns
\n",
"
"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 61
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.016299Z",
"start_time": "2024-08-24T00:15:42.311545Z"
}
},
"cell_type": "code",
"source": "ph_mapping = dict(zip(ph_mapping_df['CHEBI'], ph_mapping_df['CHEBI_PH7_3']))",
"id": "afd7468af0f9a848",
"outputs": [],
"execution_count": 62
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.016568Z",
"start_time": "2024-08-24T00:15:42.383741Z"
}
},
"cell_type": "code",
"source": "len(ph_mapping)",
"id": "feb64fe9e36bbdec",
"outputs": [
{
"data": {
"text/plain": [
"119807"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 63
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.016898Z",
"start_time": "2024-08-24T00:15:42.436633Z"
}
},
"cell_type": "code",
"source": [
"# we expected leaf and leaf-y nodes to be mapped\n",
"assert ph_mapping[CYSTEINATE_1_MINUS] == CYSTEINE_ZWITTERION"
],
"id": "f960fab3cbd8c968",
"outputs": [],
"execution_count": 64
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017042Z",
"start_time": "2024-08-24T00:15:42.754150Z"
}
},
"cell_type": "code",
"source": "assert ph_mapping[CITRIC_ACID] != CITRIC_ACID",
"id": "5d984c637fc464a6",
"outputs": [],
"execution_count": 65
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017145Z",
"start_time": "2024-08-24T00:15:43.078525Z"
}
},
"cell_type": "code",
"source": [
"# reflexivity\n",
"assert ph_mapping[L_CYSTEINE_ZWITTERION] == L_CYSTEINE_ZWITTERION\n",
"assert ph_mapping[CYSTEINE_ZWITTERION] == CYSTEINE_ZWITTERION"
],
"id": "6215936dbaf1e9f7",
"outputs": [],
"execution_count": 66
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017350Z",
"start_time": "2024-08-24T00:15:43.399428Z"
}
},
"cell_type": "code",
"source": "assert ph_mapping[WATER] == WATER",
"id": "3f295b2fce03ea7f",
"outputs": [],
"execution_count": 67
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017526Z",
"start_time": "2024-08-24T00:15:43.717362Z"
}
},
"cell_type": "code",
"source": [
"# groupings are not mapped\n",
"assert AMINO_ACID not in ph_mapping"
],
"id": "41959f4eb1f532e7",
"outputs": [],
"execution_count": 68
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017637Z",
"start_time": "2024-08-24T00:15:44.072155Z"
}
},
"cell_type": "code",
"source": [
"# Invert the pH mapping: each pH 7.3 form points to the list of CHEBI ids mapped onto it\n",
"\n",
"rev_ph_mapping = defaultdict(list)\n",
"for chebi_id, ph_form in ph_mapping.items():\n",
"    rev_ph_mapping[ph_form].append(chebi_id)\n",
"\n",
"len(rev_ph_mapping)"
],
"id": "5a139dee44a44030",
"outputs": [
{
"data": {
"text/plain": [
"111077"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 69
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017768Z",
"start_time": "2024-08-24T00:15:45.014725Z"
}
},
"cell_type": "code",
"source": [
"assert L_CYSTEINE_ZWITTERION in rev_ph_mapping[L_CYSTEINE_ZWITTERION]\n",
"assert CYSTEINATE_1_MINUS in rev_ph_mapping[CYSTEINE_ZWITTERION]\n",
"\n"
],
"id": "9120365808e9bc04",
"outputs": [],
"execution_count": 70
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.017934Z",
"start_time": "2024-08-24T00:15:45.024002Z"
}
},
"cell_type": "code",
"source": [
"# Partition the reverse pH mapping: a pH 7.3 form shared by several CHEBI ids\n",
"# yields a clique; any other entry contributes its first id as a singleton.\n",
"rhea_sccs = []\n",
"rhea_singletons = []\n",
"for members in rev_ph_mapping.values():\n",
"    if len(members) > 1:\n",
"        rhea_sccs.append(set(members))\n",
"        continue\n",
"    rhea_singletons.append(members[0])\n",
"\n",
"len(rhea_sccs), len(rhea_singletons)"
],
"id": "7c88440d52718d0a",
"outputs": [
{
"data": {
"text/plain": [
"(8140, 102937)"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 71
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## Pick Canonical from Conjugate Cliques\n",
"\n",
"For each clique, pick the canonical term."
],
"id": "9ab6512f389c6a93"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.018170Z",
"start_time": "2024-08-24T00:15:45.411105Z"
}
},
"cell_type": "code",
"source": [
"\n",
"# cliques whose canonical term came from the last-resort fallback, keyed by the tuple of ids\n",
"arbitrary_canonical_map = {}\n",
"def pick_canonical(ids: List[str]) -> str:\n",
"    \"\"\"\n",
"    Pick the canonical term from a list of ids.\n",
"\n",
"    Priority order:\n",
"\n",
"    1. pH mapping: the mapped form of the first id (in sorted order) present in ``ph_mapping``;\n",
"       note that the returned id need not be one of ``ids``\n",
"    2. Uniprot synonyms: the first id present in ``bio_syn_map``\n",
"    3. Charge 0: the first id whose entry in ``charges`` is 0\n",
"    4. The id with the shortest label among those absent from ``charges``\n",
"    5. Otherwise the id with the shortest label; when there were several ids the\n",
"       choice is recorded in ``arbitrary_canonical_map``\n",
"\n",
"    :param ids: ids of the members of one clique; must not be empty\n",
"    :return: id of the canonical term\n",
"    :raises ValueError: if ``ids`` is empty\n",
"    \"\"\"\n",
"    if not ids:\n",
"        raise ValueError(\"Cannot pick a canonical term from an empty list of ids\")\n",
"    # ensure deterministic order\n",
"    ids = sorted(ids)\n",
"    for chem_id in ids:\n",
"        if chem_id in ph_mapping:\n",
"            return ph_mapping[chem_id]\n",
"    for chem_id in ids:\n",
"        if chem_id in bio_syn_map:\n",
"            return chem_id\n",
"    for chem_id in ids:\n",
"        if chem_id in charges and charges[chem_id] == 0:\n",
"            return chem_id\n",
"\n",
"    def _label_rank(chem_id):\n",
"        # ids without a label sort after all labelled ids instead of failing on len(None)\n",
"        label = labels.get(chem_id)\n",
"        return (label is None, len(label) if label is not None else 0)\n",
"\n",
"    # prioritize shorter labels; the sort is stable, so ties keep the id order from above\n",
"    ids = sorted(ids, key=_label_rank)\n",
"    for chem_id in ids:\n",
"        if chem_id not in charges:\n",
"            # last resort\n",
"            return chem_id\n",
"    if len(ids) > 1:\n",
"        # nothing distinguishes the candidates: keep a record of the arbitrary choice\n",
"        arbitrary_canonical_map[tuple(ids)] = ids[0]\n",
"    return ids[0]\n",
"\n",
"assert pick_canonical([CITRIC_ACID]) == ph_mapping[CITRIC_ACID]\n",
"assert pick_canonical([L_CYSTEINE_ZWITTERION]) == L_CYSTEINE_ZWITTERION\n",
"assert pick_canonical([CYSTEINE_ZWITTERION, CYSTEINATE_1_MINUS, CYSTEINIUM]) == CYSTEINE_ZWITTERION\n",
"assert pick_canonical([AMINO_ACID, AMINO_ACID_ANION]) == AMINO_ACID\n",
"assert pick_canonical([ALPHA_AMINO_ACID, ALPHA_AMINO_ACID_ANION, ALPHA_AMINO_ACID_ZWITTERION]) == ALPHA_AMINO_ACID_ZWITTERION\n",
" "
],
"id": "e878bc530b346f26",
"outputs": [],
"execution_count": 72
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.018305Z",
"start_time": "2024-08-24T00:15:45.775801Z"
}
},
"cell_type": "code",
"source": "pick_canonical([ALPHA_AMINO_ACID, ALPHA_AMINO_ACID_ANION, ALPHA_AMINO_ACID_ZWITTERION])",
"id": "437cc21e6c84393c",
"outputs": [
{
"data": {
"text/plain": [
"'CHEBI:78608'"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 73
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.018621Z",
"start_time": "2024-08-24T00:15:46.520120Z"
}
},
"cell_type": "code",
"source": "",
"id": "25942fc6d94bd016",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.018775Z",
"start_time": "2024-08-24T00:15:46.528398Z"
}
},
"cell_type": "code",
"source": [
"from typing import Set, Tuple\n",
"\n",
"\n",
"def create_canonical_map(scc_sets: List[Set[str]]) -> Tuple[Dict[str, Set[str]], Dict[str, str]]:\n",
"    \"\"\"\n",
"    Create a mapping between canonical and members of the strongly connected components\n",
"\n",
"    The canonical id of each component is chosen with `pick_canonical`. Components\n",
"    that resolve to the same canonical id are merged, so that no component's members\n",
"    are dropped from either mapping.\n",
"\n",
"    :param scc_sets: the components, each a set of member ids\n",
"    :return: tuple of (canonical id -> member ids, member id -> canonical id)\n",
"    \"\"\"\n",
"    canonical_to_members = {}\n",
"    for scc in scc_sets:\n",
"        canonical = pick_canonical(scc)\n",
"        # Union into a set owned by this mapping: a plain assignment would let a later\n",
"        # component with the same canonical id silently replace an earlier one.\n",
"        canonical_to_members.setdefault(canonical, set()).update(scc)\n",
"    members_to_canonical = {m: c for c, ms in canonical_to_members.items() for m in ms}\n",
"    return (canonical_to_members, members_to_canonical)\n",
"\n",
"canonical_to_members, members_to_canonical = create_canonical_map(sccs)\n",
"assert len(canonical_to_members) > 8000\n",
"assert members_to_canonical[CYSTEINE_ZWITTERION] == CYSTEINE_ZWITTERION\n",
"assert members_to_canonical[CYSTEINATE_1_MINUS] == CYSTEINE_ZWITTERION"
],
"id": "5ac60d2c2ca3e36d",
"outputs": [],
"execution_count": 74
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.018887Z",
"start_time": "2024-08-24T00:15:46.951638Z"
}
},
"cell_type": "code",
"source": "",
"id": "820efde068eb28fa",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.019140Z",
"start_time": "2024-08-24T00:15:47.312693Z"
}
},
"cell_type": "code",
"source": "# assert CITRIC_ACID in members_to_canonical\n",
"id": "b922566d4a6977f1",
"outputs": [],
"execution_count": 75
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.019229Z",
"start_time": "2024-08-24T00:15:47.673888Z"
}
},
"cell_type": "code",
"source": "##",
"id": "986db0b5676b022",
"outputs": [],
"execution_count": 76
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.019333Z",
"start_time": "2024-08-24T00:15:48.034616Z"
}
},
"cell_type": "code",
"source": "",
"id": "144827585302bdb6",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.019416Z",
"start_time": "2024-08-24T00:15:48.390743Z"
}
},
"cell_type": "code",
"source": [
"# assess completeness of the sccs: collect the keys of stem_to_chem that are\n",
"# not a canonical id (i.e. not a key of canonical_to_members)\n",
"# NOTE(review): this iterates the keys of stem_to_chem, while the clique-building\n",
"# cell further down treats the values of each stem_to_chem entry as the chemical\n",
"# ids -- confirm the keys are comparable with the ids in canonical_to_members\n",
"missing_in_conjugate_sccs = []\n",
"for chem in stem_to_chem.keys():\n",
"    if chem not in canonical_to_members:\n",
"        missing_in_conjugate_sccs.append(chem)\n",
"    \n",
"len(missing_in_conjugate_sccs)\n"
],
"id": "a021fe7fed571235",
"outputs": [
{
"data": {
"text/plain": [
"31415"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 77
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.023327Z",
"start_time": "2024-08-24T00:15:48.747771Z"
}
},
"cell_type": "code",
"source": [
"missing_in_lexical_analysis = []\n",
"for chem in canonical_to_members.keys():\n",
" if chem not in chem_to_stem:\n",
" missing_in_lexical_analysis.append(chem)\n",
" \n",
"len(missing_in_lexical_analysis)"
],
"id": "803cbc15883d0c19",
"outputs": [
{
"data": {
"text/plain": [
"1927"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 78
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.023768Z",
"start_time": "2024-08-24T00:15:49.110226Z"
}
},
"cell_type": "code",
"source": [
"\n",
"# Augment the conjugate relations with lexically derived pairs: any two values that\n",
"# share an entry of stem_to_chem are linked by a \"?\" relation. Only the pair with\n",
"# v1 > v2 is added, so each pair is recorded once and no value is linked to itself.\n",
"# NOTE: this appends to the existing conjrels list in place.\n",
"for _, vmap in stem_to_chem.items():\n",
"    for v1 in vmap.values():\n",
"        for v2 in vmap.values():\n",
"            if v1 > v2:\n",
"                rel = (v1, \"?\", v2)\n",
"                if rel not in conjrels:\n",
"                    conjrels.append(rel)\n",
"                # conj_graph.add_edge(v1, v2))\n",
"# Rebuild the graph from the augmented relations and recompute its components.\n",
"conj_graph = calculate_conj_graph(conjrels)\n",
"full_sccs = list(nx.strongly_connected_components(conj_graph))\n",
"len(full_sccs)"
],
"id": "c49abac57eb8498",
"outputs": [
{
"data": {
"text/plain": [
"9321"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 79
},
{
"metadata": {},
"cell_type": "markdown",
"source": "The number above (the count of strongly connected components in the augmented conjugate graph) is the total number of cliques we will use.",
"id": "ddcaccb58acaefc1"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.023919Z",
"start_time": "2024-08-24T00:15:55.326335Z"
}
},
"cell_type": "code",
"source": "",
"id": "a426f8f57483a9e9",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024Z",
"start_time": "2024-08-24T00:15:55.677465Z"
}
},
"cell_type": "code",
"source": [
"canonical_to_members, members_to_canonical = create_canonical_map(full_sccs)\n",
"assert len(canonical_to_members) > 9000"
],
"id": "7e12327ea167d1f7",
"outputs": [],
"execution_count": 80
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024089Z",
"start_time": "2024-08-24T00:15:56.034231Z"
}
},
"cell_type": "code",
"source": "labels[members_to_canonical[AMINO_ACID]]",
"id": "47d4065d938af84f",
"outputs": [
{
"data": {
"text/plain": [
"'amino acid'"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 81
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024185Z",
"start_time": "2024-08-24T00:15:56.382058Z"
}
},
"cell_type": "code",
"source": "assert members_to_canonical[AMINO_ACID_ANION] == AMINO_ACID",
"id": "8ae582d277664a82",
"outputs": [],
"execution_count": 82
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024259Z",
"start_time": "2024-08-24T00:15:56.732138Z"
}
},
"cell_type": "code",
"source": "len(members_to_canonical)",
"id": "69daa1cf3e2af9a4",
"outputs": [
{
"data": {
"text/plain": [
"19624"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 83
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Exclusion List",
"id": "ff7443169be8ebfe"
},
{
"cell_type": "code",
"source": [
"ions = list(chebi.descendants(ION, [IS_A]))\n",
"# IS_A descendants of ION that are not the canonical id of a clique are excluded.\n",
"# Built as a set (the name is kept for the cells below): rewire() and make_term()\n",
"# test membership for every id they process, which is a linear scan on a list.\n",
"exclusion_list = {ion for ion in ions if ion not in canonical_to_members}"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024351Z",
"start_time": "2024-08-24T00:15:57.086873Z"
}
},
"id": "6f9c14d520e38499",
"outputs": [],
"execution_count": 84
},
{
"cell_type": "code",
"source": [
"len(exclusion_list)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024432Z",
"start_time": "2024-08-24T00:16:00.492363Z"
}
},
"id": "707d750e6f27b0d",
"outputs": [
{
"data": {
"text/plain": [
"6762"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 85
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024524Z",
"start_time": "2024-08-24T00:16:00.868151Z"
}
},
"cell_type": "code",
"source": [
"# assert AAAC in exclusion_list, f\"expected {AAAC} in exclusion_list\"\n",
"assert CITRIC_ACID not in exclusion_list\n",
"assert AMINO_ACID not in exclusion_list"
],
"id": "d4012fde318a42cb",
"outputs": [],
"execution_count": 86
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024605Z",
"start_time": "2024-08-24T00:16:01.233771Z"
}
},
"cell_type": "code",
"source": [
"def rewire(id: str) -> Optional[str]:\n",
"    \"\"\"\n",
"    Resolve an ID to the canonical ID recorded for it in `members_to_canonical`.\n",
"\n",
"    An ID with no entry in `members_to_canonical` resolves to itself.\n",
"\n",
"    :param id: the ID to resolve\n",
"    :return: the resolved ID, or None if the resolved ID is in `exclusion_list`\n",
"    \"\"\"\n",
"    canonical = members_to_canonical.get(id, id)\n",
"    return None if canonical in exclusion_list else canonical"
],
"id": "eb60026fd7282828",
"outputs": [],
"execution_count": 87
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024679Z",
"start_time": "2024-08-24T00:16:01.588085Z"
}
},
"cell_type": "code",
"source": "rewire(AAAE)",
"id": "d2f7eb4cab17323e",
"outputs": [
{
"data": {
"text/plain": [
"'CHEBI:83410'"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 88
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024768Z",
"start_time": "2024-08-24T00:16:01.940983Z"
}
},
"cell_type": "code",
"source": [
"assert rewire(AAAE) != AAAE\n",
"assert rewire(AAAE) not in exclusion_list"
],
"id": "458724079e756895",
"outputs": [],
"execution_count": 89
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Generate Ontology",
"id": "3c586153bb91baeb"
},
{
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024851Z",
"start_time": "2024-08-24T00:16:02.295171Z"
}
},
"cell_type": "code",
"source": "",
"id": "8ef381b8af27a6d7",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": [
"from typing import Tuple\n",
"from pydantic import BaseModel\n",
"\n",
"\n",
"\n",
"class Term(BaseModel):\n",
"    \"\"\"\n",
"    One stanza of the generated ontology; `as_obo` renders it as OBO text.\n",
"    \"\"\"\n",
"    # stanza header, written as [stanza_type]\n",
"    stanza_type: str = \"Term\"\n",
"    id: str\n",
"    # written as the `name:` tag\n",
"    label: str\n",
"    synonyms: Optional[List[str]] = None\n",
"    xrefs: Optional[List[str]] = []\n",
"    alt_ids: Optional[List[str]] = None\n",
"    # written as `is_a:` tags\n",
"    parents: List[str] = []\n",
"    # (predicate, object) pairs, written as `relationship:` tags\n",
"    relationships: List[Tuple[str, str]] = []\n",
"    inchi: Optional[str] = None\n",
"    physiologically_stable_form: Optional[str] = None\n",
"    # joined with '; ' into a single `comment:` tag\n",
"    comments: List[str] = []\n",
"\n",
"    def as_obo(self) -> str:\n",
"        \"\"\"\n",
"        Render this term as an OBO stanza.\n",
"\n",
"        Tags are written in a fixed order: header, id, name, synonym, alt_id, is_a,\n",
"        xref, relationship, comment, then the chemrof property values. Empty or\n",
"        unset fields produce no line.\n",
"\n",
"        :return: the stanza text, ending with a newline\n",
"        \"\"\"\n",
"        # '{' is backslash-escaped in the name\n",
"        escaped_label = self.label.replace('{', r'\\{')\n",
"        stanza = [f\"[{self.stanza_type}]\", f\"id: {self.id}\", f\"name: {escaped_label}\"]\n",
"        stanza.extend(f\"synonym: {synonym}\" for synonym in self.synonyms or [])\n",
"        stanza.extend(f\"alt_id: {alt_id}\" for alt_id in self.alt_ids or [])\n",
"        stanza.extend(f\"is_a: {parent}\" for parent in self.parents or [])\n",
"        stanza.extend(f\"xref: {xref}\" for xref in self.xrefs or [])\n",
"        stanza.extend(f\"relationship: {pred} {obj}\" for pred, obj in self.relationships or [])\n",
"        if self.comments:\n",
"            stanza.append(f\"comment: {'; '.join(self.comments)}\")\n",
"        if self.inchi:\n",
"            stanza.append(f\"property_value: chemrof:inchi_string \\\"{self.inchi}\\\" xsd:string\")\n",
"        if self.physiologically_stable_form:\n",
"            stanza.append(f\"property_value: chemrof:has_physiologically_stable_form {self.physiologically_stable_form}\")\n",
"        # the trailing empty entry makes the joined text end with a newline\n",
"        stanza.append(\"\")\n",
"        return \"\\n\".join(stanza)\n",
" \n",
" \n",
"class Ontology(BaseModel):\n",
"    \"\"\"\n",
"    A list of terms that can be rendered as a single OBO document.\n",
"    \"\"\"\n",
"    terms: List[Term] = []\n",
"\n",
"    def as_obo(self) -> str:\n",
"        \"\"\"\n",
"        Render the header followed by the stanza of every term.\n",
"\n",
"        :return: the OBO document text\n",
"        \"\"\"\n",
"        header = [\n",
"            \"ontology: chebi-slim\",\n",
"            \"idspace: chemrof https://w3id.org/chemrof/\",\n",
"            \"\",\n",
"        ]\n",
"        stanzas = [term.as_obo() for term in self.terms]\n",
"        return \"\\n\".join(header + stanzas)\n",
" \n",
" \n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.024923Z",
"start_time": "2024-08-24T00:16:02.649433Z"
}
},
"id": "e1d93ec81dd98d00",
"outputs": [],
"execution_count": 90
},
{
"cell_type": "code",
"source": [
"BAD_SUFFIXES = [\"zwitterion\", \"ion\", \"(1+)\", \"(2+)\"]\n",
"\n",
"def make_term(id: str) -> Optional[Term]:\n",
"    \"\"\"\n",
"    Make a term from an ID\n",
"\n",
"    No term is made for IDs in `exclusion_list` or for non-canonical clique members.\n",
"    Otherwise the term aggregates, over `id` and the other members of its clique\n",
"    (emitted as alt_ids): the is_a parents and the preserved relationships, both\n",
"    rewired via `rewire`; the xrefs other than PMIDs; and the first InChI found.\n",
"\n",
"    :param id: ID of the term to make\n",
"    :return: the term, or None if no term is made for this ID\n",
"    \"\"\"\n",
"    if id in exclusion_list:\n",
"        return None\n",
"    if members_to_canonical.get(id, id) != id:\n",
"        # non-canonical members are not included\n",
"        return None\n",
"    #if id not in initial_terms:\n",
"    #    # filter for testing\n",
"    #    return None\n",
"    label = labels.get(id, id)\n",
"    if id in bio_syn_map:\n",
"        label = bio_syn_map[id]\n",
"    else:\n",
"        for suffix in BAD_SUFFIXES:\n",
"            suffix = \" \" + suffix\n",
"            if label.endswith(suffix):\n",
"                # drop only the trailing occurrence; str.replace would also\n",
"                # remove the same text anywhere else in the label\n",
"                label = label[:-len(suffix)]\n",
"    term = Term(id=id, label=label)\n",
"    if id in ph_mapping:\n",
"        term.physiologically_stable_form = ph_mapping[id]\n",
"    # sorted: clique members are held in a set, so without sorting the order of the\n",
"    # alt_ids (and which member's parents, xrefs and InChI are seen first) can vary\n",
"    alt_ids = sorted(x for x in canonical_to_members.get(id, []) if x != id)\n",
"    if alt_ids:\n",
"        term.alt_ids = alt_ids\n",
"    equiv_set = [id] + alt_ids\n",
"    comments = []\n",
"    for alt_id in equiv_set:\n",
"        for parent in is_a_map.get(alt_id, []):\n",
"            rewired_parent = rewire(parent)\n",
"            # skip excluded parents, parents already recorded, and parents that\n",
"            # collapse onto this very term (which would be an is_a to itself)\n",
"            if rewired_parent and rewired_parent != id and rewired_parent not in term.parents:\n",
"                term.parents.append(rewired_parent)\n",
"                if rewired_parent != parent or alt_id != id:\n",
"                    comments.append(f\"Parent {rewired_parent} was rewired from {alt_id} to {parent}\")\n",
"        for (p, o) in preserved_rels_by_subject.get(alt_id, []):\n",
"            rewired_o = rewire(o)\n",
"            if rewired_o and (p, rewired_o) not in term.relationships:\n",
"                term.relationships.append((p, rewired_o))\n",
"        # TODO: xrefs\n",
"        for xref in xrefs.get(alt_id, []):\n",
"            if xref.startswith(\"PMID:\"):\n",
"                continue\n",
"            # clique members can share an xref; record each one once, as is\n",
"            # already done for parents and relationships\n",
"            if xref not in term.xrefs:\n",
"                term.xrefs.append(xref)\n",
"        # the first member (in equiv_set order) that has an InChI provides it\n",
"        if alt_id in inchis and not term.inchi:\n",
"            term.inchi = inchis[alt_id]\n",
"    term.comments = comments\n",
"    return term\n",
"\n",
"\n",
"#assert L_CYSTEINE_ZWITTERION in initial_terms\n",
"t = make_term(L_CYSTEINE_ZWITTERION)\n",
"assert t.label == \"L-cysteine\"\n",
"print(t.as_obo())"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025001Z",
"start_time": "2024-08-24T00:16:03.023772Z"
}
},
"id": "b196288c3d4bc41c",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:35235\n",
"name: L-cysteine\n",
"alt_id: CHEBI:32442\n",
"alt_id: CHEBI:17561\n",
"alt_id: CHEBI:32445\n",
"alt_id: CHEBI:32443\n",
"is_a: CHEBI:35237\n",
"is_a: CHEBI:59869\n",
"is_a: CHEBI:26650\n",
"is_a: CHEBI:83813\n",
"xref: Gmelin:49993\n",
"xref: Reaxys:4128886\n",
"xref: Gmelin:325857\n",
"xref: Beilstein:4128886\n",
"xref: YMDB:YMDB00046\n",
"xref: Wikipedia:Cysteine\n",
"xref: Reaxys:1721408\n",
"xref: PDBeChem:CYS\n",
"xref: MetaCyc:CYS\n",
"xref: KNApSAcK:C00001351\n",
"xref: KEGG:D00026\n",
"xref: KEGG:C00097\n",
"xref: HMDB:HMDB0000574\n",
"xref: Gmelin:49991\n",
"xref: ECMDB:ECMDB00574\n",
"xref: Drug_Central:769\n",
"xref: DrugBank:DB00151\n",
"xref: CAS:52-90-4\n",
"xref: Beilstein:1721408\n",
"xref: Gmelin:325860\n",
"xref: Reaxys:5921923\n",
"xref: Gmelin:325856\n",
"xref: Beilstein:5921923\n",
"relationship: RO:0018039 CHEBI:35236\n",
"relationship: RO:0000087 CHEBI:78675\n",
"relationship: RO:0000087 CHEBI:64577\n",
"relationship: RO:0000087 CHEBI:77703\n",
"relationship: RO:0000087 CHEBI:77746\n",
"comment: Parent CHEBI:59869 was rewired from CHEBI:32442 to CHEBI:59814; Parent CHEBI:26650 was rewired from CHEBI:17561 to CHEBI:26650; Parent CHEBI:83813 was rewired from CHEBI:17561 to CHEBI:83813\n",
"property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)/t2-/m0/s1\" xsd:string\n",
"property_value: chemrof:has_physiologically_stable_form CHEBI:35235\n",
"\n"
]
}
],
"execution_count": 91
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025086Z",
"start_time": "2024-08-24T00:16:03.394309Z"
}
},
"cell_type": "code",
"source": "assert rewire(is_a_map[CORD_E][0]) not in exclusion_list",
"id": "2a3ed58115eabd33",
"outputs": [],
"execution_count": 92
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025170Z",
"start_time": "2024-08-24T00:16:03.744976Z"
}
},
"cell_type": "code",
"source": [
"t = make_term(CORD_E)\n",
"print(t.as_obo())\n"
],
"id": "238e38614ea62be7",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:213754\n",
"name: Cordycepamide E\n",
"is_a: CHEBI:83410\n",
"comment: Parent CHEBI:83410 was rewired from CHEBI:213754 to CHEBI:46874\n",
"property_value: chemrof:inchi_string \"InChI=1S/C15H19NO4/c1-9(2)13-14(18)16(3)12(15(19)20-13)8-10-4-6-11(17)7-5-10/h4-7,9,12-13,17H,8H2,1-3H3/t12-,13+/m0/s1\" xsd:string\n",
"property_value: chemrof:has_physiologically_stable_form CHEBI:213754\n",
"\n"
]
}
],
"execution_count": 93
},
{
"cell_type": "code",
"source": [
"\n",
"# assert CYSTEINE_ZWITTERION in initial_terms\n",
"t = make_term(CYSTEINE_ZWITTERION)\n",
"assert t.label == \"cysteine\"\n",
"assert \"CHEBI:78608\" in t.parents\n",
"print(t.as_obo())"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025253Z",
"start_time": "2024-08-24T00:16:04.096810Z"
}
},
"id": "bc403819eabb21f5",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:35237\n",
"name: cysteine\n",
"alt_id: CHEBI:32458\n",
"alt_id: CHEBI:32456\n",
"alt_id: CHEBI:32457\n",
"alt_id: CHEBI:15356\n",
"is_a: CHEBI:33709\n",
"is_a: CHEBI:78608\n",
"is_a: CHEBI:26834\n",
"is_a: CHEBI:62031\n",
"xref: Gmelin:49992\n",
"xref: Gmelin:325859\n",
"xref: Reaxys:4128885\n",
"xref: Gmelin:363235\n",
"xref: Beilstein:4128885\n",
"xref: Gmelin:49990\n",
"xref: Wikipedia:Cysteine\n",
"xref: Reaxys:1721406\n",
"xref: KNApSAcK:C00007323\n",
"xref: KNApSAcK:C00001351\n",
"xref: KEGG:C00736\n",
"xref: Gmelin:2933\n",
"xref: CAS:3374-22-9\n",
"xref: Beilstein:1721406\n",
"relationship: BFO:0000051 CHEBI:50326\n",
"relationship: RO:0000087 CHEBI:78675\n",
"comment: Parent CHEBI:33709 was rewired from CHEBI:35237 to CHEBI:35238; Parent CHEBI:78608 was rewired from CHEBI:32458 to CHEBI:33719; Parent CHEBI:26834 was rewired from CHEBI:32456 to CHEBI:63470; Parent CHEBI:62031 was rewired from CHEBI:15356 to CHEBI:26167\n",
"property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)\" xsd:string\n",
"property_value: chemrof:has_physiologically_stable_form CHEBI:35237\n",
"\n"
]
}
],
"execution_count": 94
},
{
"cell_type": "code",
"source": [
"for is_a in t.parents:\n",
" print(is_a, labels[is_a])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025331Z",
"start_time": "2024-08-24T00:16:04.458964Z"
}
},
"id": "79d094c451d6a3d9",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CHEBI:33709 amino acid\n",
"CHEBI:78608 alpha-amino acid zwitterion\n",
"CHEBI:26834 sulfur-containing amino acid\n",
"CHEBI:62031 polar amino acid zwitterion\n"
]
}
],
"execution_count": 95
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025411Z",
"start_time": "2024-08-24T00:16:04.815776Z"
}
},
"cell_type": "code",
"source": "print(make_term(ALPHA_AMINO_ACID_ZWITTERION).as_obo())",
"id": "2ae23324e2b21a47",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:78608\n",
"name: an alpha-amino acid\n",
"alt_id: CHEBI:33558\n",
"alt_id: CHEBI:33704\n",
"alt_id: CHEBI:33719\n",
"is_a: CHEBI:33709\n",
"xref: MetaCyc:Alpha-Amino-Acids\n",
"xref: KEGG:C05167\n",
"xref: KEGG:C00045\n",
"comment: Parent CHEBI:33709 was rewired from CHEBI:78608 to CHEBI:35238\n",
"property_value: chemrof:has_physiologically_stable_form CHEBI:78608\n",
"\n"
]
}
],
"execution_count": 96
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025496Z",
"start_time": "2024-08-24T00:16:05.173476Z"
}
},
"cell_type": "code",
"source": [
"t = make_term(\"CHEBI:25944\")\n",
"t.comments\n",
"print(t.as_obo())\n"
],
"id": "7cf212252b470eac",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:25944\n",
"name: pesticide\n",
"is_a: CHEBI:33232\n",
"xref: Wikipedia:Pesticide\n",
"\n"
]
}
],
"execution_count": 97
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025572Z",
"start_time": "2024-08-24T00:16:05.531782Z"
}
},
"cell_type": "code",
"source": [
"GLU_1M = \"CHEBI:14321\"\n",
"assert preserved_rels_by_subject[GLU_1M]\n",
"print(make_term(GLU_1M).as_obo())"
],
"id": "4fde5945f3dbd863",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:14321\n",
"name: glutamate\n",
"alt_id: CHEBI:18237\n",
"alt_id: CHEBI:29987\n",
"is_a: CHEBI:78608\n",
"is_a: CHEBI:62031\n",
"xref: Gmelin:327908\n",
"xref: Wikipedia:Glutamic_acid\n",
"xref: Reaxys:1723799\n",
"xref: KNApSAcK:C00019577\n",
"xref: KNApSAcK:C00001358\n",
"xref: KEGG:D04341\n",
"xref: KEGG:C00302\n",
"xref: Gmelin:101971\n",
"xref: CAS:617-65-2\n",
"xref: Beilstein:1723799\n",
"xref: Reaxys:4134100\n",
"xref: Gmelin:327903\n",
"xref: Beilstein:4134100\n",
"relationship: RO:0000087 CHEBI:78675\n",
"relationship: BFO:0000051 CHEBI:50329\n",
"comment: Parent CHEBI:78608 was rewired from CHEBI:14321 to CHEBI:33558; Parent CHEBI:62031 was rewired from CHEBI:18237 to CHEBI:26167\n",
"property_value: chemrof:inchi_string \"InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)/p-1\" xsd:string\n",
"property_value: chemrof:has_physiologically_stable_form CHEBI:14321\n",
"\n"
]
}
],
"execution_count": 98
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025649Z",
"start_time": "2024-08-24T00:16:05.892222Z"
}
},
"cell_type": "code",
"source": "bio_syn_map[ALPHA_AMINO_ACID_ZWITTERION]",
"id": "bad0eb941e50a582",
"outputs": [
{
"data": {
"text/plain": [
"'an alpha-amino acid'"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 99
},
{
"cell_type": "code",
"source": [
"ont = Ontology(terms=[t])\n",
"print(ont.as_obo())"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025732Z",
"start_time": "2024-08-24T00:16:06.251105Z"
}
},
"id": "f8fda8f3ae80f063",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ontology: chebi-slim\n",
"idspace: chemrof https://w3id.org/chemrof/\n",
"\n",
"[Term]\n",
"id: CHEBI:25944\n",
"name: pesticide\n",
"is_a: CHEBI:33232\n",
"xref: Wikipedia:Pesticide\n",
"\n"
]
}
],
"execution_count": 100
},
{
"cell_type": "code",
"source": [
"# explicit encoding so the written file does not depend on the platform default\n",
"with open(\"tmp/t.obo\", \"w\", encoding=\"utf-8\") as file:\n",
"    file.write(ont.as_obo())"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025810Z",
"start_time": "2024-08-24T00:16:06.606612Z"
}
},
"id": "b0e8b052fec4776f",
"outputs": [],
"execution_count": 101
},
{
"cell_type": "code",
"source": [
"def roots(terms: List[Term]):\n",
"    \"\"\"\n",
"    List the ids of the terms that have no parents.\n",
"\n",
"    :param terms: the terms to inspect\n",
"    :return: ids of the parentless terms, in input order\n",
"    \"\"\"\n",
"    return [term.id for term in terms if not term.parents]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025886Z",
"start_time": "2024-08-24T00:16:06.963704Z"
}
},
"id": "2aecf0e555b32a2a",
"outputs": [],
"execution_count": 102
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.025956Z",
"start_time": "2024-08-24T00:16:07.313073Z"
}
},
"cell_type": "code",
"source": "",
"id": "61d45587ba2b1b57",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": [
"\n",
"def make_terms_for_ids(ids: List[str]) -> List[Term]:\n",
"    \"\"\"\n",
"    Make terms for a list of IDs\n",
"\n",
"    IDs for which `make_term` returns nothing are skipped. A progress line is\n",
"    printed after every 10000 IDs processed.\n",
"\n",
"    :param ids: the IDs to process\n",
"    :return: the terms that were made, in input order\n",
"    \"\"\"\n",
"    made = []\n",
"    for count, curie in enumerate(ids, start=1):\n",
"        term = make_term(curie)\n",
"        if term:\n",
"            made.append(term)\n",
"        if count % 10000 == 0:\n",
"            print(f\"Processed {count} IDs, made {len(made)} terms\")\n",
"    return made\n",
" "
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026032Z",
"start_time": "2024-08-24T00:16:07.668198Z"
}
},
"id": "6f93d120489cc7ac",
"outputs": [],
"execution_count": 103
},
{
"cell_type": "code",
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026104Z",
"start_time": "2024-08-24T00:16:08.024489Z"
}
},
"id": "fb1e972e983ab1ba",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": [
"def write_terms(terms: List[Term], path: str):\n",
"    \"\"\"\n",
"    Write terms to a file\n",
"\n",
"    The terms are wrapped in an `Ontology` and written as its OBO rendering,\n",
"    UTF-8 encoded. Missing parent directories of `path` are created.\n",
"\n",
"    :param terms: the terms to write\n",
"    :param path: path of the file to write (overwritten if it exists)\n",
"    :return: None\n",
"    \"\"\"\n",
"    from pathlib import Path\n",
"\n",
"    ont = Ontology(terms=terms)\n",
"    out_path = Path(path)\n",
"    # e.g. tmp/ may not exist yet on a fresh checkout\n",
"    out_path.parent.mkdir(parents=True, exist_ok=True)\n",
"    # explicit encoding: otherwise any non-ASCII character in the output is\n",
"    # written (or rejected) according to the platform default encoding\n",
"    with open(out_path, \"w\", encoding=\"utf-8\") as file:\n",
"        file.write(ont.as_obo())\n",
" \n",
" "
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026170Z",
"start_time": "2024-08-24T00:16:08.381337Z"
}
},
"id": "373457deffad71d3",
"outputs": [],
"execution_count": 104
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026240Z",
"start_time": "2024-08-24T00:16:08.735221Z"
}
},
"cell_type": "code",
"source": [
"def generate_write_all(ids: List[str], path: str) -> List[Term]:\n",
"    \"\"\"\n",
"    Run whole pipeline: make the terms for the IDs, then write them to a file\n",
"\n",
"    :param ids: the IDs to make terms for\n",
"    :param path: path of the file to write\n",
"    :return: the terms that were made and written\n",
"    \"\"\"\n",
"    generated = make_terms_for_ids(ids)\n",
"    write_terms(generated, path)\n",
"    return generated"
],
"id": "4c511c4c5ff0977f",
"outputs": [],
"execution_count": 105
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026311Z",
"start_time": "2024-08-24T00:16:09.088855Z"
}
},
"cell_type": "code",
"source": [
"amino_acid_ids = list(chebi.descendants(AMINO_ACID))\n",
"assert L_CYSTEINE_ZWITTERION in amino_acid_ids\n",
"assert len(amino_acid_ids) > 100"
],
"id": "9a96b67935019540",
"outputs": [],
"execution_count": 106
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026381Z",
"start_time": "2024-08-24T00:16:12.508101Z"
}
},
"cell_type": "code",
"source": "terms = generate_write_all(amino_acid_ids, \"tmp/amino_acids.obo\")",
"id": "816113d92ec207d1",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed 10000 IDs, made 7879 terms\n"
]
}
],
"execution_count": 107
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026452Z",
"start_time": "2024-08-24T00:16:17.924748Z"
}
},
"cell_type": "code",
"source": [
"[t] = [t for t in terms if t.id == L_CYSTEINE_ZWITTERION]\n",
"print(t.as_obo())"
],
"id": "c0b4338f1146a984",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:35235\n",
"name: L-cysteine\n",
"alt_id: CHEBI:32442\n",
"alt_id: CHEBI:17561\n",
"alt_id: CHEBI:32445\n",
"alt_id: CHEBI:32443\n",
"is_a: CHEBI:35237\n",
"is_a: CHEBI:59869\n",
"is_a: CHEBI:26650\n",
"is_a: CHEBI:83813\n",
"xref: Gmelin:49993\n",
"xref: Reaxys:4128886\n",
"xref: Gmelin:325857\n",
"xref: Beilstein:4128886\n",
"xref: YMDB:YMDB00046\n",
"xref: Wikipedia:Cysteine\n",
"xref: Reaxys:1721408\n",
"xref: PDBeChem:CYS\n",
"xref: MetaCyc:CYS\n",
"xref: KNApSAcK:C00001351\n",
"xref: KEGG:D00026\n",
"xref: KEGG:C00097\n",
"xref: HMDB:HMDB0000574\n",
"xref: Gmelin:49991\n",
"xref: ECMDB:ECMDB00574\n",
"xref: Drug_Central:769\n",
"xref: DrugBank:DB00151\n",
"xref: CAS:52-90-4\n",
"xref: Beilstein:1721408\n",
"xref: Gmelin:325860\n",
"xref: Reaxys:5921923\n",
"xref: Gmelin:325856\n",
"xref: Beilstein:5921923\n",
"relationship: RO:0018039 CHEBI:35236\n",
"relationship: RO:0000087 CHEBI:78675\n",
"relationship: RO:0000087 CHEBI:64577\n",
"relationship: RO:0000087 CHEBI:77703\n",
"relationship: RO:0000087 CHEBI:77746\n",
"comment: Parent CHEBI:59869 was rewired from CHEBI:32442 to CHEBI:59814; Parent CHEBI:26650 was rewired from CHEBI:17561 to CHEBI:26650; Parent CHEBI:83813 was rewired from CHEBI:17561 to CHEBI:83813\n",
"property_value: chemrof:inchi_string \"InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)/t2-/m0/s1\" xsd:string\n",
"property_value: chemrof:has_physiologically_stable_form CHEBI:35235\n",
"\n"
]
}
],
"execution_count": 108
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026529Z",
"start_time": "2024-08-24T00:16:18.302490Z"
}
},
"cell_type": "code",
"source": [
"from oaklib.datamodels.vocabulary import OWL_CLASS\n",
"\n",
"# all_ids = list(chebi.descendants(ROOT))\n",
"all_ids = list(chebi.entities(filter_obsoletes=True, owl_type=OWL_CLASS))\n",
"terms = generate_write_all(all_ids, \"tmp/all.obo\")\n"
],
"id": "7fec1427e96baea2",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed 10000 IDs, made 9955 terms\n",
"Processed 20000 IDs, made 19917 terms\n",
"Processed 30000 IDs, made 29878 terms\n",
"Processed 40000 IDs, made 38207 terms\n",
"Processed 50000 IDs, made 47129 terms\n",
"Processed 60000 IDs, made 56582 terms\n",
"Processed 70000 IDs, made 65929 terms\n",
"Processed 80000 IDs, made 75147 terms\n",
"Processed 90000 IDs, made 84597 terms\n",
"Processed 100000 IDs, made 93930 terms\n",
"Processed 110000 IDs, made 103890 terms\n",
"Processed 120000 IDs, made 113825 terms\n",
"Processed 130000 IDs, made 123784 terms\n",
"Processed 140000 IDs, made 132470 terms\n",
"Processed 150000 IDs, made 141078 terms\n",
"Processed 160000 IDs, made 149833 terms\n",
"Processed 170000 IDs, made 158566 terms\n",
"Processed 180000 IDs, made 166455 terms\n",
"Processed 190000 IDs, made 174804 terms\n",
"Processed 200000 IDs, made 184378 terms\n"
]
}
],
"execution_count": 109
},
{
"cell_type": "code",
"source": [
"len(terms)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026608Z",
"start_time": "2024-08-24T00:17:38.385274Z"
}
},
"id": "153be3a8b30c2713",
"outputs": [
{
"data": {
"text/plain": [
"185206"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 110
},
{
"cell_type": "code",
"source": [
"# many roots expected when we make a subset\n",
"len(roots(terms))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026690Z",
"start_time": "2024-08-24T00:17:38.762055Z"
}
},
"id": "ac31bc98e2ef7ef7",
"outputs": [
{
"data": {
"text/plain": [
"16"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 111
},
{
"cell_type": "code",
"source": "#write_terms(terms, f\"tmp/{ROOT.replace(':', '_')}.obo\")",
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026776Z",
"start_time": "2024-08-24T00:17:39.162207Z"
}
},
"id": "858eda4cbfd8fb5d",
"outputs": [],
"execution_count": 112
},
{
"cell_type": "code",
"source": [
"fertirelin = \"CHEBI:177856\""
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026847Z",
"start_time": "2024-08-24T00:17:39.529104Z"
}
},
"id": "dbc4c586e92c3328",
"outputs": [],
"execution_count": 113
},
{
"cell_type": "code",
"source": [
"t = make_term(fertirelin)\n",
"print(t.as_obo())"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.026935Z",
"start_time": "2024-08-24T00:17:39.893737Z"
}
},
"id": "90336d5cc96554c7",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Term]\n",
"id: CHEBI:177856\n",
"name: fertirelin\n",
"is_a: CHEBI:25676\n",
"xref: KEGG:D07957\n",
"xref: Chemspider:163670\n",
"xref: CAS:38234-21-8\n",
"property_value: chemrof:inchi_string \"InChI=1S/C55H76N16O12/c1-4-59-53(82)44-12-8-20-71(44)54(83)38(11-7-19-60-55(56)57)66-49(78)39(21-30(2)3)65-46(75)27-62-47(76)40(22-31-13-15-34(73)16-14-31)67-52(81)43(28-72)70-50(79)41(23-32-25-61-36-10-6-5-9-35(32)36)68-51(80)42(24-33-26-58-29-63-33)69-48(77)37-17-18-45(74)64-37/h5-6,9-10,13-16,25-26,29-30,37-44,61,72-73H,4,7-8,11-12,17-24,27-28H2,1-3H3,(H,58,63)(H,59,82)(H,62,76)(H,64,74)(H,65,75)(H,66,78)(H,67,81)(H,68,80)(H,69,77)(H,70,79)(H4,56,57,60)/t37-,38-,39-,40-,41-,42-,43-,44-/m0/s1\" xsd:string\n",
"\n"
]
}
],
"execution_count": 114
},
{
"cell_type": "code",
"source": [
"chebi.label(is_a_map[fertirelin][0])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.027020Z",
"start_time": "2024-08-24T00:17:40.255847Z"
}
},
"id": "e1185be596f3b559",
"outputs": [
{
"data": {
"text/plain": [
"'oligopeptide'"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 115
},
{
"cell_type": "code",
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-11-16T01:31:29.027117Z",
"start_time": "2024-08-24T00:17:40.640098Z"
}
},
"id": "3e17588e0084f22d",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}