CHEBI Slimmer
Creates a simplified version of CHEBI by conflating all members of a conjugate clique.
Initial setup
Imports and use OAK to get an adapter to CHEBI sqlite database.
[1]:
from typing import Optional, List
from collections import defaultdict
import pandas as pd
from oaklib import get_adapter
from oaklib.utilities.obograph_utils import reflexive
from tests.test_converters.test_obo_format import canonical_path
chebi = get_adapter("sqlite:obo:chebi")
# session = get_adapter("sqlite:obo:chebi").session
session = chebi.session
[2]:
from oaklib.datamodels.vocabulary import IS_A, HAS_PART
from oaklib.interfaces import OboGraphInterface
assert isinstance(chebi, OboGraphInterface)
Set up vocabulary constants
[3]:
# Relations
CBO = "obo:chebi#is_conjugate_base_of"
CAO = "obo:chebi#is_conjugate_acid_of"
TAUTOMER_OF = "obo:chebi#is_tautomer_of"
ENANTIOMER_OF = "obo:chebi#is_enantiomer_of"
HAS_ROLE = "RO:0000087"
[4]:
CHEMICAL_ENTITY = "CHEBI:24431"
[5]:
# modify this for testing
# ROOT = AMINO_ACID
ROOT = CHEMICAL_ENTITY
[6]:
AMINO_ACID = "CHEBI:33709"
AMINO_ACID_ANION = "CHEBI:37022"
ION = "CHEBI:24870"
ALPHA_AMINO_ACID = "CHEBI:33704"
ALPHA_AMINO_ACID_ANION = "CHEBI:33558"
ALPHA_AMINO_ACID_ZWITTERION = "CHEBI:78608"
CYSTEINE_ZWITTERION = "CHEBI:35237"
L_CYSTEINE_ZWITTERION = "CHEBI:35235"
CYSTEINATE_1_MINUS = "CHEBI:32456"
CYSTEINIUM = "CHEBI:32458"
CORD_E = "CHEBI:213754"
AAAE = "CHEBI:46874"
CITRIC_ACID = "CHEBI:30769"
AMMONIA="CHEBI:16134"
AMMONIUM="CHEBI:28938"
AZANIDE="CHEBI:29337"
HYRDRIDONITRATE_2M = "CHEBI:29340"
PECTIN = "CHEBI:17309"
WATER = "CHEBI:15377"
CONJ_EXCLUDES = {
AMMONIA, AMMONIUM, AZANIDE, HYRDRIDONITRATE_2M, PECTIN
}
All labels
[7]:
labels = {k: v for k, v in chebi.labels(chebi.entities())}
len(labels)
[7]:
200879
Mappings
[8]:
from semsql.sqla.semsql import Statements, HasDbxrefStatement
q = session.query(HasDbxrefStatement)
xrefs = defaultdict(list)
for row in q:
if row.subject.startswith("CHEBI:"):
xrefs[row.subject].append(row.value)
len(xrefs)
[8]:
161106
[9]:
q = session.query(Statements).filter(Statements.predicate == "obo:chebi/inchi")
inchis = {row.subject: row.value for row in q}
len(inchis)
[9]:
177462
[10]:
S3H = "CHEBI:113373"
inchis[S3H]
[10]:
'InChI=1S/C4H8O3.Na/c1-3(5)2-4(6)7;/h3,5H,2H2,1H3,(H,6,7);/q;+1/p-1'
Various Relationships
[11]:
PMAP = {ENANTIOMER_OF: "RO:0018039"}
preserved_rels = list(chebi.relationships(predicates=[HAS_PART, HAS_ROLE, ENANTIOMER_OF]))
preserved_rels_by_subject = defaultdict(list)
for s, p, o in preserved_rels:
p_mapped = PMAP.get(p, p)
preserved_rels_by_subject[s].append((p_mapped, o))
assert len(preserved_rels_by_subject) > 1000
Retrieve all Charge States
[ ]:
[12]:
from semsql.sqla.semsql import Statements, HasDbxrefStatement
session = get_adapter("sqlite:obo:chebi").session
q = session.query(Statements).filter(Statements.predicate == "obo:chebi/charge")
charges = {row.subject: int(row.value) for row in q if row.value is not None}
[13]:
len(charges)
[13]:
189057
[14]:
assert charges[L_CYSTEINE_ZWITTERION] == 0
[15]:
assert charges[CYSTEINATE_1_MINUS] == -1
[16]:
assert charges[CITRIC_ACID] == 0
[17]:
assert AMINO_ACID_ANION not in charges, "X anion terms are agnostic to a SPECIFIC charge"
[18]:
# check against inchis
inchi_skip = {}
charges_by_inchi = {}
for id, inchi in inchis.items():
toks = inchi.split("/")
# q is charge
qtoks = [tok for tok in toks if tok.startswith("q")]
if qtoks:
qtok = qtoks[0]
if ";" in qtok:
qtok = qtok.replace(";", "")
inchi_skip[id] = qtok
continue
if "*" in qtok:
# print(qtok)
mparts= qtok[1:].split("*")
charge = 1
try:
for mpart in mparts:
charge *= int(mpart)
except:
odd.append(id)
continue
else:
try:
charge = int(qtok[1:])
charges_by_inchi[id] = charge
except:
odd.append(id)
continue
# p is protonation
ptoks = [tok for tok in toks if tok.startswith("p")]
if ptoks and True:
ptok = ptoks[0]
try:
charge = int(ptok[1:])
if id in charges_by_inchi:
charges_by_inchi[id] = charge + charges_by_inchi[id]
else:
charges_by_inchi[id] = charge
except:
pass
len(charges_by_inchi)
[18]:
10687
[19]:
errs = []
for id, charge in charges_by_inchi.items():
if id not in charges:
errs.append({"id": id, "inchi_charge": charge, "asserted_charge": None, "type": "MISSING"})
elif charges[id] != charge:
errs.append({"id": id, "inchi_charge": charge, "asserted_charge": charges[id], "type": "MISMATCH"})
cedf = pd.DataFrame(errs)
cedf
[19]:
[20]:
# charges_by_inchi[L_CYSTEINE_ZWITTERION]
Find Conjugate Cliques
[21]:
conjrels = list(chebi.relationships(predicates=[CBO, CAO, TAUTOMER_OF]))
conjrels = [(s, p, o) for s, p, o in conjrels if not s in CONJ_EXCLUDES and not o in CONJ_EXCLUDES]
assert len(conjrels) > 15000
assert len([r for r in conjrels if r[1] == CBO]) > 8000
assert len([r for r in conjrels if r[1] == TAUTOMER_OF]) > 1500
[22]:
conjrels_by_subject = defaultdict(list)
for s, p, o in conjrels:
conjrels_by_subject[s].append((p, o))
[ ]:
[23]:
conjrels_by_subject["CHEBI:142854"]
[23]:
[('obo:chebi#is_conjugate_acid_of', 'CHEBI:142858'),
('obo:chebi#is_tautomer_of', 'CHEBI:142853')]
Calculate conjugate graph and strongly connected components
[24]:
from typing import Tuple
import networkx as nx
# find strongly connected components using cbos
def calculate_conj_graph(conjrels: List[Tuple[str, str, str]]) -> nx.DiGraph:
conj_graph = nx.DiGraph()
for s, _, o in conjrels:
conj_graph.add_edge(s, o)
conj_graph.add_edge(o, s)
return conj_graph
conj_graph = calculate_conj_graph(conjrels)
sccs = list(nx.strongly_connected_components(conj_graph))
asserted_sccs = sccs
assert len(sccs) > 8000
[25]:
len(asserted_sccs)
[25]:
8553
Lexical analysis
The CHEBI conjugate relationships are incomplete - here we aim to complete them doing a lexical analysis of the labels.
For example,
foo acid anion
foo acid(1-)
foo acid zwitterion
should be in a clique
[26]:
# ensure ordering is such that greedy matching works
suffixes = {
# odd edge cases - e.g. (2R)-glufosinate zwitterion(1-)
"zwitterion(1-)": -1,
"zwitterion(2-)": -2,
"anion(1-)": -1,
# standard
"zwitterion": None, "anion": (-99, -1), "cation": (1, 99), "ion": None,
"ate": None,
"acid": None,
}
for i in range(1, 10):
for sign in ["+", "-"]:
suffixes[f"({i}{sign})"] = int(f"{sign}{i}")
suffixes
[26]:
{'zwitterion(1-)': -1,
'zwitterion(2-)': -2,
'anion(1-)': -1,
'zwitterion': None,
'anion': (-99, -1),
'cation': (1, 99),
'ion': None,
'ate': None,
'acid': None,
'(1+)': 1,
'(1-)': -1,
'(2+)': 2,
'(2-)': -2,
'(3+)': 3,
'(3-)': -3,
'(4+)': 4,
'(4-)': -4,
'(5+)': 5,
'(5-)': -5,
'(6+)': 6,
'(6-)': -6,
'(7+)': 7,
'(7-)': -7,
'(8+)': 8,
'(8-)': -8,
'(9+)': 9,
'(9-)': -9}
[27]:
roles = list(chebi.descendants("CHEBI:50906", [IS_A]))
[28]:
# https://github.com/ebi-chebi/ChEBI/issues/4528
EXCLUDE_STEMS = ["disulfide", "tartr", "tartar", "oxide", "oxo"]
[29]:
from typing import Dict
# stem to chem is a mapping between a stem (e.g. "L-lysine")
# and a dictionary of suffixes to CHEBI IDs
stem_to_chem: Dict[str, Dict[str, str]] = {}
stem_to_chem = defaultdict(dict)
def _norm(label: str) -> str:
# CHEBI is inconsistent, e.g. "amino-acid" vs "amino acid"
label = label.replace("-acid", " acid")
# label = label.replace(" acid", "")
return label
def _de_acid(label: str) -> str:
if label.endswith(" acid"):
label = label.replace(" acid", "")
if label.endswith("ic"):
label = label.replace("ic", "")
if label.endswith("ate"):
label = label.replace("ate", "")
return label
for id, label in labels.items():
if not label:
# TODO: eliminate non-classes
continue
if id in roles:
continue
label = _norm(label)
for suffix in suffixes.keys():
if label.endswith(suffix):
stem = label.replace(suffix, "")
stem = stem.strip()
stem_to_chem[_de_acid(stem)][suffix] = id
break
for id, label in labels.items():
if not label:
continue
label = _norm(label)
if label in stem_to_chem:
stem_to_chem[_de_acid(label)][""] = id
for stem in EXCLUDE_STEMS:
if stem in stem_to_chem:
del stem_to_chem[stem]
assert len(stem_to_chem) > 30000
[30]:
stem_to_chem["amino"]
[30]:
{'cation': 'CHEBI:33703',
'acid': 'CHEBI:33709',
'zwitterion': 'CHEBI:35238',
'anion': 'CHEBI:37022'}
[31]:
stem_to_chem["oxo"]
[31]:
{}
[32]:
stem_to_chem["citr"]
[32]:
{'(4-)': 'CHEBI:132362',
'anion': 'CHEBI:133748',
'(3-)': 'CHEBI:16947',
'acid': 'CHEBI:30769',
'(1-)': 'CHEBI:35804',
'(2-)': 'CHEBI:35808'}
[33]:
assert not stem_to_chem["citrate"]
[34]:
stem_to_chem["(2R)-glufosin"]
[34]:
{'ate': 'CHEBI:142853',
'zwitterion': 'CHEBI:142854',
'zwitterion(1-)': 'CHEBI:142858'}
[35]:
# ensure edge case of (2R)-glufosinate zwitterion(1-) is taken care of
assert not stem_to_chem["(2R)-glufosinate zwitterion"]
[36]:
[37]:
[37]:
31415
Analysis: Consistency check between lexically inferred cliques and asserted relationships
Some of this is reported here:
[38]:
import numpy as np
NO_REL = "NO_REL"
INVERSES = {
CBO: CAO,
CAO: CBO,
TAUTOMER_OF: TAUTOMER_OF,
NO_REL: NO_REL
}
charge_problems = []
def make_conjrefs(clique_suffix_dict: dict, stem=None) -> List:
results = []
for suffix1, chem1 in clique_suffix_dict.items():
ch1 = suffixes.get(suffix1, None)
actual_ch1 = charges.get(chem1, None)
chem1_label = labels.get(chem1, chem1)
if isinstance(ch1, int):
if actual_ch1 is None:
charge_problems.append({"id": chem1, "label": chem1_label, "expected": ch1, "asserted": None, "problem": "MISSING_CHARGE"})
# raise ValueError(f"Missing charge for {chem1}")
elif ch1 != actual_ch1:
charge_problems.append({"id": chem1, "label": chem1_label, "expected": ch1, "asserted": actual_ch1, "problem": "CONFLICTING_CHARGE"})
# raise ValueError(f"Charge mismatch for {chem1}: {ch1} vs {actual_ch1}")
elif isinstance(ch1, tuple):
if actual_ch1 is not None:
if actual_ch1 < ch1[0] or actual_ch1 > ch1[1]:
charge_problems.append({"id": chem1, "label": chem1_label, "expected": ch1, "asserted": actual_ch1, "problem": "OUTSIDE_RANGE"})
# raise ValueError(f"Charge mismatch for {chem1}: {ch1} vs {actual_ch1}")
rels = conjrels_by_subject.get(chem1, [])
for suffix2, chem2 in clique_suffix_dict.items():
if suffix1 == suffix2:
continue
messages = []
matched_preds = set()
actual_p = "NO_REL"
for p, o in rels:
if o == chem2:
actual_p = p
matched_preds.add(p)
if len(matched_preds) > 1:
messages.append(f"Multiple matched preds: {matched_preds}")
rev_matched_preds = set()
rels2 = conjrels_by_subject.get(chem2, [])
for p, o in rels2:
if o == chem1:
inv_p = INVERSES[p]
rev_matched_preds.add(inv_p)
if actual_p:
if inv_p != actual_p:
messages.append(f"Preds mismatch: {actual_p} vs {inv_p}")
else:
actual_p = inv_p
if len(rev_matched_preds) > 1:
messages.append(f"Multiple matched inv preds: {rev_matched_preds}")
if matched_preds != rev_matched_preds:
messages.append(f"Preds mismatch: {matched_preds} vs {rev_matched_preds}")
if messages:
raise ValueError(f"Error in clique {clique_suffix_dict}: {messages}")
ch2 = suffixes.get(suffix2, None)
if ch1 is None or ch2 is None:
charge_diff = None
charge_diff_sign = None
else:
if isinstance(ch1, int) and isinstance(ch2, int):
charge_diff = ch1 - ch2
charge_diff_sign = np.sign(charge_diff)
elif isinstance(ch1, int) and isinstance(ch2, tuple):
charge_diff = None
if ch1 < ch2[0]:
charge_diff_sign = -1
elif ch1 > ch2[1]:
charge_diff_sign = 1
else:
charge_diff_sign = 0
elif isinstance(ch1, tuple) and isinstance(ch2, int):
charge_diff = None
if ch1[0] < ch2:
charge_diff_sign = -1
elif ch1[1] > ch2:
charge_diff_sign = 1
else:
charge_diff_sign = 0
else:
charge_diff = None
charge_diff_sign = None
results.append({"suffix1": suffix1 or "NO_SUFFIX",
"suffix2": suffix2 or "NO_SUFFIX",
"predicate": actual_p,
"chem1": chem1,
"chem2": chem2,
"charge_diff": charge_diff,
"charge_diff_sign": charge_diff_sign,
"stem": stem,
})
return results
make_conjrefs(stem_to_chem["amino"])
[38]:
[{'suffix1': 'cation',
'suffix2': 'acid',
'predicate': 'obo:chebi#is_conjugate_acid_of',
'chem1': 'CHEBI:33703',
'chem2': 'CHEBI:33709',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'cation',
'suffix2': 'zwitterion',
'predicate': 'NO_REL',
'chem1': 'CHEBI:33703',
'chem2': 'CHEBI:35238',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'cation',
'suffix2': 'anion',
'predicate': 'NO_REL',
'chem1': 'CHEBI:33703',
'chem2': 'CHEBI:37022',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'acid',
'suffix2': 'cation',
'predicate': 'obo:chebi#is_conjugate_base_of',
'chem1': 'CHEBI:33709',
'chem2': 'CHEBI:33703',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'acid',
'suffix2': 'zwitterion',
'predicate': 'NO_REL',
'chem1': 'CHEBI:33709',
'chem2': 'CHEBI:35238',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'acid',
'suffix2': 'anion',
'predicate': 'obo:chebi#is_conjugate_acid_of',
'chem1': 'CHEBI:33709',
'chem2': 'CHEBI:37022',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion',
'suffix2': 'cation',
'predicate': 'NO_REL',
'chem1': 'CHEBI:35238',
'chem2': 'CHEBI:33703',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion',
'suffix2': 'acid',
'predicate': 'NO_REL',
'chem1': 'CHEBI:35238',
'chem2': 'CHEBI:33709',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion',
'suffix2': 'anion',
'predicate': 'NO_REL',
'chem1': 'CHEBI:35238',
'chem2': 'CHEBI:37022',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'anion',
'suffix2': 'cation',
'predicate': 'NO_REL',
'chem1': 'CHEBI:37022',
'chem2': 'CHEBI:33703',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'anion',
'suffix2': 'acid',
'predicate': 'obo:chebi#is_conjugate_base_of',
'chem1': 'CHEBI:37022',
'chem2': 'CHEBI:33709',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'anion',
'suffix2': 'zwitterion',
'predicate': 'NO_REL',
'chem1': 'CHEBI:37022',
'chem2': 'CHEBI:35238',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None}]
[39]:
make_conjrefs(stem_to_chem["(2R)-glufosin"])
[39]:
[{'suffix1': 'ate',
'suffix2': 'zwitterion',
'predicate': 'obo:chebi#is_tautomer_of',
'chem1': 'CHEBI:142853',
'chem2': 'CHEBI:142854',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'ate',
'suffix2': 'zwitterion(1-)',
'predicate': 'NO_REL',
'chem1': 'CHEBI:142853',
'chem2': 'CHEBI:142858',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion',
'suffix2': 'ate',
'predicate': 'obo:chebi#is_tautomer_of',
'chem1': 'CHEBI:142854',
'chem2': 'CHEBI:142853',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion',
'suffix2': 'zwitterion(1-)',
'predicate': 'obo:chebi#is_conjugate_acid_of',
'chem1': 'CHEBI:142854',
'chem2': 'CHEBI:142858',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion(1-)',
'suffix2': 'ate',
'predicate': 'NO_REL',
'chem1': 'CHEBI:142858',
'chem2': 'CHEBI:142853',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None},
{'suffix1': 'zwitterion(1-)',
'suffix2': 'zwitterion',
'predicate': 'obo:chebi#is_conjugate_base_of',
'chem1': 'CHEBI:142858',
'chem2': 'CHEBI:142854',
'charge_diff': None,
'charge_diff_sign': None,
'stem': None}]
[40]:
charge_problems = [] ## warning - global
lexical_conj_pairs = []
for stem, clique in stem_to_chem.items():
lexical_conj_pairs += make_conjrefs(clique, stem)
# Reported here: https://github.com/ebi-chebi/ChEBI/issues/4525
pd.DataFrame(charge_problems).to_csv("tmp/charge_problems.csv", index=False)
len(lexical_conj_pairs)
[40]:
17166
[41]:
chem_to_stem: Dict[str, str] = {}
for row in lexical_conj_pairs:
def _assign(chem: str, stem: str):
if chem in chem_to_stem:
if chem_to_stem[chem] != stem:
raise ValueError(f"Conflicting stems for {chem}: {chem_to_stem[chem]} vs {stem}")
else:
chem_to_stem[chem] = stem
stem = row["stem"]
_assign(row["chem1"], stem)
_assign(row["chem2"], stem)
[117]:
g = calculate_conj_graph([(row["chem1"], "?", row["chem2"]) for row in lexical_conj_pairs])
lexical_sccs = list(nx.strongly_connected_components(g))
len(lexical_sccs)
[117]:
7517
[ ]:
[123]:
# venn diagram of overlaps between
# - lexical_sccs
# - asserted_sccs
# - rhea_sccs
# - full_sccs
from matplotlib_venn import venn3
from matplotlib_venn import venn3_unweighted
import matplotlib.pyplot as plt
def hashable_scc(scc):
return set([tuple(sorted(list(x))) for x in scc])
def my_venn3(sccs, *args, **kwargs):
scc_sets = [hashable_scc(scc) for scc in sccs]
venn3_unweighted(scc_sets, *args, **kwargs)
#venn3([set(lexical_sccs), set(asserted_sccs), set(rhea_sccs)], ("Lexical", "Asserted", "Rhea"))
#venn3([{1}, {1,2}, {1,2,tuple("a" "b")}])
my_venn3([lexical_sccs, asserted_sccs, rhea_sccs], ("Lexical", "Asserted", "Rhea"))
plt.show()
[ ]:
# same as Euler diagram
from matplotlib_venn import venn3_unweighted
[42]:
import pandas as pd
df = pd.DataFrame(lexical_conj_pairs)
[43]:
df
[43]:
suffix1 | suffix2 | predicate | chem1 | chem2 | charge_diff | charge_diff_sign | stem | |
---|---|---|---|---|---|---|---|---|
0 | acid | anion | obo:chebi#is_conjugate_acid_of | CHEBI:100147 | CHEBI:62070 | NaN | NaN | nalidix |
1 | anion | acid | obo:chebi#is_conjugate_base_of | CHEBI:62070 | CHEBI:100147 | NaN | NaN | nalidix |
2 | acid | NO_SUFFIX | NO_REL | CHEBI:10046 | CHEBI:10045 | NaN | NaN | Wyerone |
3 | NO_SUFFIX | acid | NO_REL | CHEBI:10045 | CHEBI:10046 | NaN | NaN | Wyerone |
4 | acid | ate | obo:chebi#is_conjugate_acid_of | CHEBI:10072 | CHEBI:71201 | NaN | NaN | xanthuren |
... | ... | ... | ... | ... | ... | ... | ... | ... |
17161 | NO_SUFFIX | (1-) | obo:chebi#is_conjugate_acid_of | CHEBI:130073 | CHEBI:91301 | NaN | NaN | 5,20-diHEPE |
17162 | (1-) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:9162 | CHEBI:79317 | NaN | NaN | sinigrin |
17163 | NO_SUFFIX | (1-) | obo:chebi#is_conjugate_acid_of | CHEBI:79317 | CHEBI:9162 | NaN | NaN | sinigrin |
17164 | ate | acid | obo:chebi#is_conjugate_base_of | CHEBI:994 | CHEBI:995 | NaN | NaN | cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio |
17165 | acid | ate | obo:chebi#is_conjugate_acid_of | CHEBI:995 | CHEBI:994 | NaN | NaN | cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio |
17166 rows × 8 columns
[44]:
df[(df["charge_diff_sign"] < 0) & (df["predicate"] == CAO)]
[44]:
suffix1 | suffix2 | predicate | chem1 | chem2 | charge_diff | charge_diff_sign | stem | |
---|---|---|---|---|---|---|---|---|
15585 | (1+) | (5+) | obo:chebi#is_conjugate_acid_of | CHEBI:83553 | CHEBI:82771 | -4.0 | -1.0 | N(4)-bis(aminopropyl)spermidine |
[45]:
df[(df["charge_diff_sign"] > 0) & (df["predicate"] == CBO)]
[45]:
suffix1 | suffix2 | predicate | chem1 | chem2 | charge_diff | charge_diff_sign | stem | |
---|---|---|---|---|---|---|---|---|
15584 | (5+) | (1+) | obo:chebi#is_conjugate_base_of | CHEBI:82771 | CHEBI:83553 | 4.0 | 1.0 | N(4)-bis(aminopropyl)spermidine |
[46]:
df.groupby(["predicate"]).size()
[46]:
predicate
NO_REL 3256
obo:chebi#is_conjugate_acid_of 6378
obo:chebi#is_conjugate_base_of 6378
obo:chebi#is_tautomer_of 1154
dtype: int64
[47]:
df.to_csv("tmp/conjrels.csv", index=False)
[48]:
# group by suffix1, suffix2, predicate, and count the number of rows
summary = df.groupby(["suffix1", "suffix2", "predicate"]).size()
summary.sort_values(ascending=False)
[48]:
suffix1 suffix2 predicate
acid ate obo:chebi#is_conjugate_acid_of 1522
ate acid obo:chebi#is_conjugate_base_of 1522
NO_SUFFIX (1-) obo:chebi#is_conjugate_acid_of 1332
(1-) NO_SUFFIX obo:chebi#is_conjugate_base_of 1332
NO_SUFFIX (4-) obo:chebi#is_conjugate_acid_of 673
...
anion(1-) acid NO_REL 1
(6-) (2-) NO_REL 1
(3-) NO_REL 1
(4-) NO_REL 1
zwitterion(2-) zwitterion obo:chebi#is_conjugate_base_of 1
Length: 424, dtype: int64
[49]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
data = df
# Display the first few rows of the data to understand its structure
data.head()
# Create a function to calculate entropy for each group
def calculate_entropy(group):
counts = group.value_counts(normalize=True)
return entropy(counts)
# Grouping the data by suffix1 and suffix2, then calculating entropy for the predicates
entropy_data = data.groupby(['suffix1', 'suffix2'])['predicate'].apply(calculate_entropy).unstack(fill_value=0)
# Plotting the entropy heatmap with reversed x-axis
plt.figure(figsize=(10, 8))
sns.heatmap(entropy_data.loc[:, ::-1], annot=True, cmap="coolwarm")
plt.title('Entropy of Predicate Distribution by Suffix1 and Suffix2 (Reversed X-Axis)')
plt.xlabel('Suffix2')
plt.ylabel('Suffix1')
plt.show()
[50]:
pivot_table = df.pivot_table(index=['suffix1', 'suffix2'], columns='predicate', aggfunc='size', fill_value=0).reset_index()
pivot_table.to_csv("tmp/pivot_table.csv", index=False)
pivot_table
[50]:
predicate | suffix1 | suffix2 | NO_REL | obo:chebi#is_conjugate_acid_of | obo:chebi#is_conjugate_base_of | obo:chebi#is_tautomer_of |
---|---|---|---|---|---|---|
0 | (1+) | (1-) | 2 | 2 | 0 | 0 |
1 | (1+) | (2+) | 19 | 0 | 19 | 0 |
2 | (1+) | (2-) | 3 | 2 | 0 | 0 |
3 | (1+) | (3+) | 11 | 0 | 0 | 0 |
4 | (1+) | (3-) | 2 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... |
271 | zwitterion(1-) | ate | 1 | 0 | 0 | 0 |
272 | zwitterion(1-) | zwitterion | 0 | 0 | 3 | 0 |
273 | zwitterion(2-) | (3-) | 0 | 1 | 0 | 0 |
274 | zwitterion(2-) | acid | 1 | 0 | 0 | 0 |
275 | zwitterion(2-) | zwitterion | 0 | 0 | 1 | 0 |
276 rows × 6 columns
[51]:
df[(df["suffix1"] == "(1+)") & (df["suffix2"] == "NO_SUFFIX") & (df["predicate"] == CBO)]
[51]:
suffix1 | suffix2 | predicate | chem1 | chem2 | charge_diff | charge_diff_sign | stem | |
---|---|---|---|---|---|---|---|---|
3026 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:141055 | CHEBI:141057 | NaN | NaN | validoxylamine B |
10936 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:58644 | CHEBI:32818 | NaN | NaN | p-coumaroylagmatine |
12230 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:64003 | CHEBI:64004 | NaN | NaN | N-allyl-6-chloro-1-(3-methylphenyl)-2,3,4,5-te... |
12330 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:64364 | CHEBI:10650 | NaN | NaN | sumatriptan |
13406 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:72567 | CHEBI:6438 | NaN | NaN | levobunolol |
14208 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:75297 | CHEBI:31057 | NaN | NaN | 13-deoxydaunorubicin |
14382 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:76278 | CHEBI:16299 | NaN | NaN | dehydrocoformycin |
14672 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:76819 | CHEBI:15906 | NaN | NaN | demethylmacrocin |
14698 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:76922 | CHEBI:77055 | NaN | NaN | argemonine |
16426 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:86083 | CHEBI:86085 | NaN | NaN | (Z)-p-coumaroylagmatine |
16452 | (1+) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:86380 | CHEBI:599440 | NaN | NaN | amorolfine |
[52]:
df
[52]:
suffix1 | suffix2 | predicate | chem1 | chem2 | charge_diff | charge_diff_sign | stem | |
---|---|---|---|---|---|---|---|---|
0 | acid | anion | obo:chebi#is_conjugate_acid_of | CHEBI:100147 | CHEBI:62070 | NaN | NaN | nalidix |
1 | anion | acid | obo:chebi#is_conjugate_base_of | CHEBI:62070 | CHEBI:100147 | NaN | NaN | nalidix |
2 | acid | NO_SUFFIX | NO_REL | CHEBI:10046 | CHEBI:10045 | NaN | NaN | Wyerone |
3 | NO_SUFFIX | acid | NO_REL | CHEBI:10045 | CHEBI:10046 | NaN | NaN | Wyerone |
4 | acid | ate | obo:chebi#is_conjugate_acid_of | CHEBI:10072 | CHEBI:71201 | NaN | NaN | xanthuren |
... | ... | ... | ... | ... | ... | ... | ... | ... |
17161 | NO_SUFFIX | (1-) | obo:chebi#is_conjugate_acid_of | CHEBI:130073 | CHEBI:91301 | NaN | NaN | 5,20-diHEPE |
17162 | (1-) | NO_SUFFIX | obo:chebi#is_conjugate_base_of | CHEBI:9162 | CHEBI:79317 | NaN | NaN | sinigrin |
17163 | NO_SUFFIX | (1-) | obo:chebi#is_conjugate_acid_of | CHEBI:79317 | CHEBI:9162 | NaN | NaN | sinigrin |
17164 | ate | acid | obo:chebi#is_conjugate_base_of | CHEBI:994 | CHEBI:995 | NaN | NaN | cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio |
17165 | acid | ate | obo:chebi#is_conjugate_acid_of | CHEBI:995 | CHEBI:994 | NaN | NaN | cis,cis-2-amino-3-(3-oxoprop-1-enyl)but-2-enedio |
17166 rows × 8 columns
Create is-a map
[53]:
is_as = list(chebi.relationships(predicates=[IS_A]))
[54]:
is_a_map = defaultdict(list)
for s, _, o in is_as:
is_a_map[s].append(o)
Fetch Uniprot Synonyms
These are bio-friendly synonyms.
[55]:
from semsql.sqla.semsql import OwlAxiomAnnotation
q = session.query(OwlAxiomAnnotation)
axiom_anns = list(q)
[56]:
len(axiom_anns)
[56]:
716075
[57]:
up_axiom_anns = [row for row in axiom_anns if row.annotation_predicate == "oio:hasDbXref" and row.annotation_value == "UniProt"]
[58]:
bio_syn_map = {row.subject: row.value for row in axiom_anns if row.annotation_predicate == "oio:hasDbXref" and row.annotation_value == "UniProt"}
len(bio_syn_map)
[58]:
16393
[59]:
len(bio_syn_map)
[59]:
16393
RHEA pH Mapping
RHEA provides a table that maps CHEBI IDs to their pH 7.3 stable forms. The mapping may be reflexive print((e.g.a stable form will map to itself))
The mappings may not be complete - in particular only leaf nodes are mapped.
[59]:
[ ]:
[60]:
import pystow
ph_mapping_df = pystow.ensure_csv("rhea", url="https://ftp.expasy.org/databases/rhea/tsv/chebi_pH7_3_mapping.tsv")
for col in ["CHEBI", "CHEBI_PH7_3"]:
ph_mapping_df[col] = "CHEBI:" + ph_mapping_df[col].astype(str)
[61]:
ph_mapping_df
[61]:
CHEBI | CHEBI_PH7_3 | ORIGIN | |
---|---|---|---|
0 | CHEBI:3 | CHEBI:3 | computation |
1 | CHEBI:7 | CHEBI:7 | computation |
2 | CHEBI:8 | CHEBI:8 | computation |
3 | CHEBI:19 | CHEBI:19 | computation |
4 | CHEBI:20 | CHEBI:20 | computation |
... | ... | ... | ... |
119802 | CHEBI:691037 | CHEBI:691037 | computation |
119803 | CHEBI:691622 | CHEBI:691622 | computation |
119804 | CHEBI:741548 | CHEBI:132939 | computation |
119805 | CHEBI:744019 | CHEBI:744019 | computation |
119806 | CHEBI:746859 | CHEBI:746859 | computation |
119807 rows × 3 columns
[62]:
ph_mapping = dict(zip(ph_mapping_df['CHEBI'], ph_mapping_df['CHEBI_PH7_3']))
[63]:
len(ph_mapping)
[63]:
119807
[64]:
# we expected leaf and leaf-y nodes to be mapped
assert ph_mapping[CYSTEINATE_1_MINUS] == CYSTEINE_ZWITTERION
[65]:
assert ph_mapping[CITRIC_ACID] != CITRIC_ACID
[66]:
# reflexivity
assert ph_mapping[L_CYSTEINE_ZWITTERION] == L_CYSTEINE_ZWITTERION
assert ph_mapping[CYSTEINE_ZWITTERION] == CYSTEINE_ZWITTERION
[67]:
assert ph_mapping[WATER] == WATER
[68]:
# groupings are not mapped
assert AMINO_ACID not in ph_mapping
[69]:
# create a reverse mapping
rev_ph_mapping = defaultdict(list)
for k, v in ph_mapping.items():
rev_ph_mapping[v].append(k)
len(rev_ph_mapping)
[69]:
111077
[70]:
assert L_CYSTEINE_ZWITTERION in rev_ph_mapping[L_CYSTEINE_ZWITTERION]
assert CYSTEINATE_1_MINUS in rev_ph_mapping[CYSTEINE_ZWITTERION]
[71]:
rhea_sccs = []
rhea_singletons = []
for _, clique in rev_ph_mapping.items():
if len(clique) > 1:
rhea_sccs.append(set(clique))
else:
rhea_singletons.append(clique[0])
len(rhea_sccs), len(rhea_singletons)
[71]:
(8140, 102937)
Pick Canonical from Conjugate Cliques
For each clique, pick the canonical term.
[72]:
arbitrary_canonical_map = {}
def pick_canonical(ids: List[str]) -> str:
"""
Pick the canonical term from a list of ids.
Priority order:
1. pH mapping
2. Uniprot synonyms
3. Charge 0
:param ids:
:return:
"""
# ensure deterministic order
ids = sorted(ids)
for id in ids:
if id in ph_mapping:
return ph_mapping[id]
for id in ids:
if id in bio_syn_map:
return id
for id in ids:
if id in charges and charges[id] == 0:
return id
# prioritize shorter
ids = sorted(ids, key=lambda x: len(labels.get(x)))
for id in ids:
if id not in charges:
# last resort
return id
if len(ids) == 1:
return ids[0]
arbitrary_canonical_map[tuple(ids)] = ids[0]
return ids[0]
# raise ValueError(f"Could not find canonical for {ids}")
assert pick_canonical([CITRIC_ACID]) == ph_mapping[CITRIC_ACID]
assert pick_canonical([L_CYSTEINE_ZWITTERION]) == L_CYSTEINE_ZWITTERION
assert pick_canonical([CYSTEINE_ZWITTERION, CYSTEINATE_1_MINUS, CYSTEINIUM]) == CYSTEINE_ZWITTERION
assert pick_canonical([AMINO_ACID, AMINO_ACID_ANION]) == AMINO_ACID
assert pick_canonical([ALPHA_AMINO_ACID, ALPHA_AMINO_ACID_ANION, ALPHA_AMINO_ACID_ZWITTERION]) == ALPHA_AMINO_ACID_ZWITTERION
[73]:
pick_canonical([ALPHA_AMINO_ACID, ALPHA_AMINO_ACID_ANION, ALPHA_AMINO_ACID_ZWITTERION])
[73]:
'CHEBI:78608'
[ ]:
[74]:
from typing import Set, Tuple
def create_canonical_map(scc_sets: List[Set[str]]) -> Tuple[Dict[str, Set[str]], Dict[str, str]]:
"""
Create a mapping between canonical and members of the strongly connected components
:param scc_sets:
:return:
"""
canonical_to_members = {}
for scc in scc_sets:
canonical = pick_canonical(scc)
canonical_to_members[canonical] = scc
members_to_canonical = {m: c for c, ms in canonical_to_members.items() for m in ms}
return (canonical_to_members, members_to_canonical)
canonical_to_members, members_to_canonical = create_canonical_map(sccs)
assert len(canonical_to_members) > 8000
assert members_to_canonical[CYSTEINE_ZWITTERION] == CYSTEINE_ZWITTERION
assert members_to_canonical[CYSTEINATE_1_MINUS] == CYSTEINE_ZWITTERION
[ ]:
[75]:
# assert CITRIC_ACID in members_to_canonical
[76]:
##
[ ]:
[77]:
# assess completeness of the sccs
missing_in_conjugate_sccs = []
for chem in stem_to_chem.keys():
if chem not in canonical_to_members:
missing_in_conjugate_sccs.append(chem)
len(missing_in_conjugate_sccs)
[77]:
31415
[78]:
missing_in_lexical_analysis = []
for chem in canonical_to_members.keys():
if chem not in chem_to_stem:
missing_in_lexical_analysis.append(chem)
len(missing_in_lexical_analysis)
[78]:
1927
[79]:
for _, vmap in stem_to_chem.items():
for v1 in vmap.values():
for v2 in vmap.values():
if v1 > v2:
rel = (v1, "?", v2)
if rel not in conjrels:
conjrels.append(rel)
# conj_graph.add_edge(v1, v2))
conj_graph = calculate_conj_graph(conjrels)
full_sccs = list(nx.strongly_connected_components(conj_graph))
len(full_sccs)
[79]:
9321
This number is the total number of cliques we will use
[ ]:
[80]:
canonical_to_members, members_to_canonical = create_canonical_map(full_sccs)
assert len(canonical_to_members) > 9000
[81]:
labels[members_to_canonical[AMINO_ACID]]
[81]:
'amino acid'
[82]:
assert members_to_canonical[AMINO_ACID_ANION] == AMINO_ACID
[83]:
len(members_to_canonical)
[83]:
19624
Exclusion List
[84]:
ions = list(chebi.descendants(ION, [IS_A]))
exclusion_list = [ion for ion in ions if ion not in canonical_to_members]
[85]:
len(exclusion_list)
[85]:
6762
[86]:
# assert AAAC in exclusion_list, f"expected {AAAC} in exclusion_list"
assert CITRIC_ACID not in exclusion_list
assert AMINO_ACID not in exclusion_list
[87]:
def rewire(id: str) -> Optional[str]:
"""
Rewire an ID to its canonical form, if it is not in the exclusion list
:param id:
:return:
"""
rewired = members_to_canonical.get(id, id)
if rewired in exclusion_list:
return None
return rewired
[88]:
rewire(AAAE)
[88]:
'CHEBI:83410'
[89]:
assert rewire(AAAE) != AAAE
assert rewire(AAAE) not in exclusion_list
Generate Ontology
[ ]:
[90]:
from typing import Tuple
from pydantic import BaseModel
class Term(BaseModel):
stanza_type: str = "Term"
id: str
label: str
synonyms: Optional[List[str]] = None
xrefs: Optional[List[str]] = []
alt_ids: Optional[List[str]] = None
parents: List[str] = []
relationships: List[Tuple[str, str]] = []
inchi: Optional[str] = None
physiologically_stable_form: Optional[str] = None
comments: List[str] = []
def as_obo(self) -> str:
name = self.label.replace('{', r'\{')
lines = [
f"[{self.stanza_type}]",
f"id: {self.id}",
f"name: {name}",
]
lines += [f"synonym: {s}" for s in self.synonyms or []]
lines += [f"alt_id: {alt_id}" for alt_id in self.alt_ids or []]
lines += [f"is_a: {is_a}" for is_a in self.parents or []]
lines += [f"xref: {xref}" for xref in self.xrefs or []]
lines += [f"relationship: {p} {v}" for p, v in self.relationships or []]
lines += [f"comment: {'; '.join(self.comments)}"] if self.comments else []
lines += [f"property_value: chemrof:inchi_string \"{self.inchi}\" xsd:string"] if self.inchi else []
lines += [f"property_value: chemrof:has_physiologically_stable_form {self.physiologically_stable_form}"] if self.physiologically_stable_form else []
lines += [""]
return "\n".join(lines)
class Ontology(BaseModel):
terms: List[Term] = []
def as_obo(self) -> str:
lines = [
f"ontology: chebi-slim",
"idspace: chemrof https://w3id.org/chemrof/",
"",
]
return "\n".join(lines + [t.as_obo() for t in self.terms])
[91]:
BAD_SUFFIXES = ["zwitterion", "ion", "(1+)", "(2+)"]
def make_term(id: str) -> Optional[Term]:
"""
Make a term from an ID
:param id:
:return:
"""
if id in exclusion_list:
return None
if members_to_canonical.get(id, id) != id:
# non-canonical members are not included
return None
#if id not in initial_terms:
# # filter for testing
# return None
label = labels.get(id, id)
if id in bio_syn_map:
label = bio_syn_map[id]
else:
for suffix in BAD_SUFFIXES:
suffix = " " + suffix
if label.endswith(suffix):
label = label.replace(suffix, "")
term = Term(id=id, label=label)
alt_ids = [x for x in canonical_to_members.get(id, []) if x != id]
if id in ph_mapping:
term.physiologically_stable_form = ph_mapping[id]
if alt_ids:
term.alt_ids = alt_ids
else:
alt_ids = []
equiv_set = [id] + alt_ids
comments = []
for alt_id in equiv_set:
for parent in is_a_map.get(alt_id, []):
rewired_parent = rewire(parent)
if rewired_parent and rewired_parent not in term.parents:
term.parents.append(rewired_parent)
if rewired_parent != parent or alt_id != id:
comments.append(f"Parent {rewired_parent} was rewired from {alt_id} to {parent}")
for (p, o) in preserved_rels_by_subject.get(alt_id, []):
rewired_o = rewire(o)
if rewired_o and (p, rewired_o) not in term.relationships:
term.relationships.append((p, rewired_o))
# TODO: xrefs
for xref in xrefs.get(alt_id, []):
if xref.startswith("PMID:"):
continue
term.xrefs.append(xref)
if alt_id in inchis:
if not term.inchi:
term.inchi = inchis[alt_id]
term.comments = comments
return term
#assert L_CYSTEINE_ZWITTERION in initial_terms
t = make_term(L_CYSTEINE_ZWITTERION)
assert t.label == "L-cysteine"
print(t.as_obo())
[Term]
id: CHEBI:35235
name: L-cysteine
alt_id: CHEBI:32442
alt_id: CHEBI:17561
alt_id: CHEBI:32445
alt_id: CHEBI:32443
is_a: CHEBI:35237
is_a: CHEBI:59869
is_a: CHEBI:26650
is_a: CHEBI:83813
xref: Gmelin:49993
xref: Reaxys:4128886
xref: Gmelin:325857
xref: Beilstein:4128886
xref: YMDB:YMDB00046
xref: Wikipedia:Cysteine
xref: Reaxys:1721408
xref: PDBeChem:CYS
xref: MetaCyc:CYS
xref: KNApSAcK:C00001351
xref: KEGG:D00026
xref: KEGG:C00097
xref: HMDB:HMDB0000574
xref: Gmelin:49991
xref: ECMDB:ECMDB00574
xref: Drug_Central:769
xref: DrugBank:DB00151
xref: CAS:52-90-4
xref: Beilstein:1721408
xref: Gmelin:325860
xref: Reaxys:5921923
xref: Gmelin:325856
xref: Beilstein:5921923
relationship: RO:0018039 CHEBI:35236
relationship: RO:0000087 CHEBI:78675
relationship: RO:0000087 CHEBI:64577
relationship: RO:0000087 CHEBI:77703
relationship: RO:0000087 CHEBI:77746
comment: Parent CHEBI:59869 was rewired from CHEBI:32442 to CHEBI:59814; Parent CHEBI:26650 was rewired from CHEBI:17561 to CHEBI:26650; Parent CHEBI:83813 was rewired from CHEBI:17561 to CHEBI:83813
property_value: chemrof:inchi_string "InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)/t2-/m0/s1" xsd:string
property_value: chemrof:has_physiologically_stable_form CHEBI:35235
[92]:
assert rewire(is_a_map[CORD_E][0]) not in exclusion_list
[93]:
t = make_term(CORD_E)
print(t.as_obo())
[Term]
id: CHEBI:213754
name: Cordycepamide E
is_a: CHEBI:83410
comment: Parent CHEBI:83410 was rewired from CHEBI:213754 to CHEBI:46874
property_value: chemrof:inchi_string "InChI=1S/C15H19NO4/c1-9(2)13-14(18)16(3)12(15(19)20-13)8-10-4-6-11(17)7-5-10/h4-7,9,12-13,17H,8H2,1-3H3/t12-,13+/m0/s1" xsd:string
property_value: chemrof:has_physiologically_stable_form CHEBI:213754
[94]:
# assert CYSTEINE_ZWITTERION in initial_terms
t = make_term(CYSTEINE_ZWITTERION)
assert t.label == "cysteine"
assert "CHEBI:78608" in t.parents
print(t.as_obo())
[Term]
id: CHEBI:35237
name: cysteine
alt_id: CHEBI:32458
alt_id: CHEBI:32456
alt_id: CHEBI:32457
alt_id: CHEBI:15356
is_a: CHEBI:33709
is_a: CHEBI:78608
is_a: CHEBI:26834
is_a: CHEBI:62031
xref: Gmelin:49992
xref: Gmelin:325859
xref: Reaxys:4128885
xref: Gmelin:363235
xref: Beilstein:4128885
xref: Gmelin:49990
xref: Wikipedia:Cysteine
xref: Reaxys:1721406
xref: KNApSAcK:C00007323
xref: KNApSAcK:C00001351
xref: KEGG:C00736
xref: Gmelin:2933
xref: CAS:3374-22-9
xref: Beilstein:1721406
relationship: BFO:0000051 CHEBI:50326
relationship: RO:0000087 CHEBI:78675
comment: Parent CHEBI:33709 was rewired from CHEBI:35237 to CHEBI:35238; Parent CHEBI:78608 was rewired from CHEBI:32458 to CHEBI:33719; Parent CHEBI:26834 was rewired from CHEBI:32456 to CHEBI:63470; Parent CHEBI:62031 was rewired from CHEBI:15356 to CHEBI:26167
property_value: chemrof:inchi_string "InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)" xsd:string
property_value: chemrof:has_physiologically_stable_form CHEBI:35237
[95]:
for is_a in t.parents:
print(is_a, labels[is_a])
CHEBI:33709 amino acid
CHEBI:78608 alpha-amino acid zwitterion
CHEBI:26834 sulfur-containing amino acid
CHEBI:62031 polar amino acid zwitterion
[96]:
print(make_term(ALPHA_AMINO_ACID_ZWITTERION).as_obo())
[Term]
id: CHEBI:78608
name: an alpha-amino acid
alt_id: CHEBI:33558
alt_id: CHEBI:33704
alt_id: CHEBI:33719
is_a: CHEBI:33709
xref: MetaCyc:Alpha-Amino-Acids
xref: KEGG:C05167
xref: KEGG:C00045
comment: Parent CHEBI:33709 was rewired from CHEBI:78608 to CHEBI:35238
property_value: chemrof:has_physiologically_stable_form CHEBI:78608
[97]:
t = make_term("CHEBI:25944")
t.comments
print(t.as_obo())
[Term]
id: CHEBI:25944
name: pesticide
is_a: CHEBI:33232
xref: Wikipedia:Pesticide
[98]:
GLU_1M = "CHEBI:14321"
assert preserved_rels_by_subject[GLU_1M]
print(make_term(GLU_1M).as_obo())
[Term]
id: CHEBI:14321
name: glutamate
alt_id: CHEBI:18237
alt_id: CHEBI:29987
is_a: CHEBI:78608
is_a: CHEBI:62031
xref: Gmelin:327908
xref: Wikipedia:Glutamic_acid
xref: Reaxys:1723799
xref: KNApSAcK:C00019577
xref: KNApSAcK:C00001358
xref: KEGG:D04341
xref: KEGG:C00302
xref: Gmelin:101971
xref: CAS:617-65-2
xref: Beilstein:1723799
xref: Reaxys:4134100
xref: Gmelin:327903
xref: Beilstein:4134100
relationship: RO:0000087 CHEBI:78675
relationship: BFO:0000051 CHEBI:50329
comment: Parent CHEBI:78608 was rewired from CHEBI:14321 to CHEBI:33558; Parent CHEBI:62031 was rewired from CHEBI:18237 to CHEBI:26167
property_value: chemrof:inchi_string "InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)/p-1" xsd:string
property_value: chemrof:has_physiologically_stable_form CHEBI:14321
[99]:
bio_syn_map[ALPHA_AMINO_ACID_ZWITTERION]
[99]:
'an alpha-amino acid'
[100]:
ont = Ontology(terms=[t])
print(ont.as_obo())
ontology: chebi-slim
idspace: chemrof https://w3id.org/chemrof/
[Term]
id: CHEBI:25944
name: pesticide
is_a: CHEBI:33232
xref: Wikipedia:Pesticide
[101]:
with open("tmp/t.obo", "w") as file:
file.write(ont.as_obo())
[102]:
def roots(terms: List[Term]):
return [t.id for t in terms if not t.parents]
[ ]:
[103]:
def make_terms_for_ids(ids: List[str]) -> List[Term]:
"""
Make terms for a list of IDs
:param ids:
:return:
"""
terms = []
n = 0
for id in ids:
n += 1
t = make_term(id)
if t:
terms.append(t)
if n % 10000 == 0:
print(f"Processed {n} IDs, made {len(terms)} terms")
return terms
[ ]:
[104]:
def write_terms(terms: List[Term], path: str):
"""
Write terms to a file
:param terms:
:param path:
:return:
"""
ont = Ontology(terms=terms)
with open(path, "w") as file:
file.write(ont.as_obo())
[105]:
def generate_write_all(ids: List[str], path: str) -> List[Term]:
"""
Run whole pipeline
:param ids:
:param path:
:return:
"""
terms = make_terms_for_ids(ids)
write_terms(terms, path)
return terms
[106]:
amino_acid_ids = list(chebi.descendants(AMINO_ACID))
assert L_CYSTEINE_ZWITTERION in amino_acid_ids
assert len(amino_acid_ids) > 100
[107]:
terms = generate_write_all(amino_acid_ids, "tmp/amino_acids.obo")
Processed 10000 IDs, made 7879 terms
[108]:
[t] = [t for t in terms if t.id == L_CYSTEINE_ZWITTERION]
print(t.as_obo())
[Term]
id: CHEBI:35235
name: L-cysteine
alt_id: CHEBI:32442
alt_id: CHEBI:17561
alt_id: CHEBI:32445
alt_id: CHEBI:32443
is_a: CHEBI:35237
is_a: CHEBI:59869
is_a: CHEBI:26650
is_a: CHEBI:83813
xref: Gmelin:49993
xref: Reaxys:4128886
xref: Gmelin:325857
xref: Beilstein:4128886
xref: YMDB:YMDB00046
xref: Wikipedia:Cysteine
xref: Reaxys:1721408
xref: PDBeChem:CYS
xref: MetaCyc:CYS
xref: KNApSAcK:C00001351
xref: KEGG:D00026
xref: KEGG:C00097
xref: HMDB:HMDB0000574
xref: Gmelin:49991
xref: ECMDB:ECMDB00574
xref: Drug_Central:769
xref: DrugBank:DB00151
xref: CAS:52-90-4
xref: Beilstein:1721408
xref: Gmelin:325860
xref: Reaxys:5921923
xref: Gmelin:325856
xref: Beilstein:5921923
relationship: RO:0018039 CHEBI:35236
relationship: RO:0000087 CHEBI:78675
relationship: RO:0000087 CHEBI:64577
relationship: RO:0000087 CHEBI:77703
relationship: RO:0000087 CHEBI:77746
comment: Parent CHEBI:59869 was rewired from CHEBI:32442 to CHEBI:59814; Parent CHEBI:26650 was rewired from CHEBI:17561 to CHEBI:26650; Parent CHEBI:83813 was rewired from CHEBI:17561 to CHEBI:83813
property_value: chemrof:inchi_string "InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,(H,5,6)/t2-/m0/s1" xsd:string
property_value: chemrof:has_physiologically_stable_form CHEBI:35235
[109]:
from oaklib.datamodels.vocabulary import OWL_CLASS
# all_ids = list(chebi.descendants(ROOT))
all_ids = list(chebi.entities(filter_obsoletes=True, owl_type=OWL_CLASS))
terms = generate_write_all(all_ids, "tmp/all.obo")
Processed 10000 IDs, made 9955 terms
Processed 20000 IDs, made 19917 terms
Processed 30000 IDs, made 29878 terms
Processed 40000 IDs, made 38207 terms
Processed 50000 IDs, made 47129 terms
Processed 60000 IDs, made 56582 terms
Processed 70000 IDs, made 65929 terms
Processed 80000 IDs, made 75147 terms
Processed 90000 IDs, made 84597 terms
Processed 100000 IDs, made 93930 terms
Processed 110000 IDs, made 103890 terms
Processed 120000 IDs, made 113825 terms
Processed 130000 IDs, made 123784 terms
Processed 140000 IDs, made 132470 terms
Processed 150000 IDs, made 141078 terms
Processed 160000 IDs, made 149833 terms
Processed 170000 IDs, made 158566 terms
Processed 180000 IDs, made 166455 terms
Processed 190000 IDs, made 174804 terms
Processed 200000 IDs, made 184378 terms
[110]:
len(terms)
[110]:
185206
[111]:
# many roots expected when we make a subset
len(roots(terms))
[111]:
16
[112]:
#write_terms(terms, f"tmp/{ROOT.replace(':', '_')}.obo")
[113]:
fertirelin = "CHEBI:177856"
[114]:
t = make_term(fertirelin)
print(t.as_obo())
[Term]
id: CHEBI:177856
name: fertirelin
is_a: CHEBI:25676
xref: KEGG:D07957
xref: Chemspider:163670
xref: CAS:38234-21-8
property_value: chemrof:inchi_string "InChI=1S/C55H76N16O12/c1-4-59-53(82)44-12-8-20-71(44)54(83)38(11-7-19-60-55(56)57)66-49(78)39(21-30(2)3)65-46(75)27-62-47(76)40(22-31-13-15-34(73)16-14-31)67-52(81)43(28-72)70-50(79)41(23-32-25-61-36-10-6-5-9-35(32)36)68-51(80)42(24-33-26-58-29-63-33)69-48(77)37-17-18-45(74)64-37/h5-6,9-10,13-16,25-26,29-30,37-44,61,72-73H,4,7-8,11-12,17-24,27-28H2,1-3H3,(H,58,63)(H,59,82)(H,62,76)(H,64,74)(H,65,75)(H,66,78)(H,67,81)(H,68,80)(H,69,77)(H,70,79)(H4,56,57,60)/t37-,38-,39-,40-,41-,42-,43-,44-/m0/s1" xsd:string
[115]:
chebi.label(is_a_map[fertirelin][0])
[115]:
'oligopeptide'
[ ]: