Graph Info

The GripQL API allows a user to download the schema of a graph. This outlines the different types of nodes, the edges the connect them and the structure of the documents stored in graph elements. A graph document has a graph field that has the name, a vertices field and an edges field.

{"graph': "bmeg_rc1_2",
 "vertices': [{"gid": "Compound",
   "label": "Compound",
   "data": {"name": "STRING", "term": "STRING", "term_id": "STRING"}},
  ...],
  "edges": [{"gid": "(Project)--InProgram->(Program)",
   "label": "InProgram",
   "from": "Project",
   "to": "Program",
   "data": {}},

Connect to BMEG server

import networkx as nx
import matplotlib.pyplot as plt
import gripql
from networkx.drawing.nx_agraph import graphviz_layout
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")

Print avalible graphs

print(conn.listGraphs())
['bmeg_rc1_2', 'bmeg_rc1_2__schema__']
O = conn.graph("bmeg_rc1_2")

Get the schema graph

schema = conn.getSchema("bmeg_rc1_2")

Start build graph using NetworkX

g = nx.MultiDiGraph()
for v in schema['vertices']:
    g.add_node(v['gid'])
for e in schema['edges']:
    g.add_edge(e['from'], e['to'])

Draw Schema Graph

pos = graphviz_layout(g, prog='twopi', args='')
fig, ax = plt.subplots(1, 1, figsize=(8, 6));
nx.draw(g, pos, ax=ax, with_labels=True)

png


Allele Lookup

import hashlib
import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

BMEG stores alleles using a hashed version of the alteration description

genome:chromosome:start:end:reference_bases:alternate_bases

So an example allele would be:

GRCh37:1:27100988:27100988:C:T

Which is then run though a sha1 hash to get the string

0b0a7a23d57414e768677a6cbd764563922209df
def allele_gid(genome, chromosome, start, end, reference_bases,
                 alternate_bases):
        vid = "%s:%s:%d:%d:%s:%s" % (genome, chromosome,
                                     start, end, reference_bases,
                                     alternate_bases)
        vid = vid.encode('utf-8')
        vidhash = hashlib.sha1()
        vidhash.update(vid)
        vidhash = vidhash.hexdigest()
        return "Allele:%s" % (vidhash)
chrom = 1
loc = 27100988
ids = []
for r in ['A', 'C', 'G', 'T']:
    for a in ['A', 'C', 'G', 'T']:
        ids.append( allele_gid("GRCh37", chrom, loc, loc, r, a) )
for row in O.query().V(ids):
    print( row )
[INFO]  2019-03-11 15:46:39,647 1 results received in 0 seconds


<AttrDict({'gid': 'Allele:0b0a7a23d57414e768677a6cbd764563922209df', 'label': 'Allele', 'data': {'alternate_bases': 'T', 'chromosome': '1', 'effect': 'Nonsense_Mutation', 'end': 27100988, 'ensembl_transcript': 'ENST00000324856', 'genome': 'GRCh37', 'hugo_symbol': 'ARID1A', 'reference_bases': 'C', 'start': 27100988, 'strand': '+', 'type': 'SNP'}})>

Gene Info

import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Look up a gene by its hugo symbol

gids = list(O.query().V().hasLabel("Gene").has(gripql.eq("$.symbol", "BRCA1")).render("_gid"))
[INFO]  2019-03-11 15:49:09,362 1 results received in 0 seconds

Find some of the Gene Ontology terms the gene is linked to

for ent in O.query().V(gids).in_("GeneOntologyAnnotation").limit(10):
    print(ent.gid, ent.data)
[INFO]  2019-03-11 16:32:55,385 10 results received in 0 seconds


GO:0005515 <AttrDict({'definition': 'Interacting selectively and non-covalently with any protein or protein complex (a complex of two or more proteins that may include other nonprotein molecules).', 'go_id': 'GO:0005515', 'name': 'protein binding', 'namespace': 'molecular_function', 'synonym': ['glycoprotein binding', 'protein amino acid binding'], 'xref': ['reactome:R-HSA-170835', 'reactome:R-HSA-170846', 'reactome:R-HSA-3645786', 'reactome:R-HSA-3656484', 'reactome:R-HSA-3702153', 'reactome:R-HSA-3713560']})>
GO:0005654 <AttrDict({'definition': 'That part of the nuclear content other than the chromosomes or the nucleolus.', 'go_id': 'GO:0005654', 'name': 'nucleoplasm', 'namespace': 'cellular_component', 'synonym': [], 'xref': ['NIF_Subcellular:sao661522542', 'Wikipedia:Nucleoplasm']})>
GO:0005737 <AttrDict({'definition': 'All of the contents of a cell excluding the plasma membrane and nucleus, but including other subcellular structures.', 'go_id': 'GO:0005737', 'name': 'cytoplasm', 'namespace': 'cellular_component', 'synonym': [], 'xref': ['Wikipedia:Cytoplasm']})>
GO:0006359 <AttrDict({'definition': 'Any process that modulates the frequency, rate or extent of transcription mediated by RNA ploymerase III.', 'go_id': 'GO:0006359', 'name': 'regulation of transcription by RNA polymerase III', 'namespace': 'biological_process', 'synonym': ['regulation of transcription from Pol III promoter', 'regulation of transcription from RNA polymerase III promoter'], 'xref': []})>
GO:0008630 <AttrDict({'definition': 'A series of molecular signals in which an intracellular signal is conveyed to trigger the apoptotic death of a cell. The pathway is induced by the detection of DNA damage, and ends when the execution phase of apoptosis is triggered.', 'go_id': 'GO:0008630', 'name': 'intrinsic apoptotic signaling pathway in response to DNA damage', 'namespace': 'biological_process', 'synonym': ['DNA damage response, signal transduction resulting in induction of apoptosis'], 'xref': []})>
GO:0031436 <AttrDict({'definition': 'A heterodimeric complex comprising BRCA1 and BARD1, which possesses ubiquitin ligase activity and is involved in genome maintenance, possibly by functioning in surveillance for DNA damage.', 'go_id': 'GO:0031436', 'name': 'BRCA1-BARD1 complex', 'namespace': 'cellular_component', 'synonym': [], 'xref': []})>
GO:0019899 <AttrDict({'definition': 'Interacting selectively and non-covalently with any enzyme.', 'go_id': 'GO:0019899', 'name': 'enzyme binding', 'namespace': 'molecular_function', 'synonym': [], 'xref': []})>
GO:0044212 <AttrDict({'definition': 'Interacting selectively and non-covalently with a DNA region that regulates the transcription of a region of DNA, which may be a gene, cistron, or operon. Binding may occur as a sequence specific interaction or as an interaction observed only once a factor has been recruited to the DNA by other factors.', 'go_id': 'GO:0044212', 'name': 'transcription regulatory region DNA binding', 'namespace': 'molecular_function', 'synonym': ['regulatory region DNA binding'], 'xref': []})>
GO:0050681 <AttrDict({'definition': 'Interacting selectively and non-covalently with an androgen receptor.', 'go_id': 'GO:0050681', 'name': 'androgen receptor binding', 'namespace': 'molecular_function', 'synonym': ['AR binding'], 'xref': []})>
GO:0051573 <AttrDict({'definition': 'Any process that stops, prevents, or reduces the frequency, rate or extent of the covalent addition of a methyl group to the lysine at position 9 of histone H3.', 'go_id': 'GO:0051573', 'name': 'negative regulation of histone H3-K9 methylation', 'namespace': 'biological_process', 'synonym': ['down regulation of histone H3-K9 methylation', 'down-regulation of histone H3-K9 methylation', 'downregulation of histone H3-K9 methylation', 'inhibition of histone H3-K9 methylation'], 'xref': []})>

Sample Counts

import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Count number of Projects per Program

q = O.query().V().hasLabel("Program").as_("p").in_("InProgram").select("p")
q = q.aggregate(gripql.term("project_count", "$._gid"))
for row in q.execute()[0]["project_count"]["buckets"]:
    print("%s\t%s" % (row["key"], row["value"]))
[INFO]  2019-03-11 16:13:02,635 1 results received in 0 seconds


Program:DepMap  38
Program:TCGA    33
Program:GTEx    31
Program:CTRP    30
Program:GDSC    27
Program:CCLE    26
Program:TARGET  6
Program:VAREPOP 1
Program:NCICCR  1
Program:FM  1
Program:CTSP    1

Count number of Samples per Program

q = O.query().V().hasLabel("Program").as_("p").in_("InProgram").in_("InProject").select("p")
q = q.aggregate(gripql.term("sample_count", "$._gid"))
for row in q.execute()[0]["sample_count"]["buckets"]:
    print("%s\t%s" % (row["key"], row["value"]))
[INFO]  2019-03-11 16:13:09,049 1 results received in 3 seconds


Program:FM  18004
Program:TCGA    11315
Program:GTEx    8859
Program:TARGET  3236
Program:DepMap  1700
Program:CTRP    841
Program:GDSC    746
Program:CCLE    504
Program:NCICCR  489
Program:CTSP    45
Program:VAREPOP 7

CNA Histogram

import matplotlib.pyplot as plt
import numpy as np
import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Get Ensembl Gene ids for genes of interest

GENES = ["PTEN", "TP53", "RB1"]
gene_ids = {}
for g in GENES:
    for i in O.query().V().hasLabel("Gene").has(gripql.eq("symbol", g)):
        gene_ids[g] = i.gid
[INFO]  2019-03-11 15:50:32,116 1 results received in 0 seconds
[INFO]  2019-03-11 15:50:32,355 1 results received in 0 seconds
[INFO]  2019-03-11 15:50:32,599 1 results received in 0 seconds
gene_ids
{'PTEN': 'ENSG00000171862',
 'TP53': 'ENSG00000141510',
 'RB1': 'ENSG00000139687'}

For each gene of interest, obtain the copy number alteration values and aggregate them by gene.

q = O.query().V("Project:TCGA-PRAD").in_("InProject").in_("SampleFor").in_("AliquotFor")
q = q.has(gripql.eq("$.gdc_attributes.sample_type", 'Primary Tumor')).in_("CopyNumberAlterationOf")
q = q.aggregate(
    list( gripql.term( g, "values.%s" % (g), 5) for g in gene_ids.values() )
)

res = list(q)
for r in res[0]:
    for b in res[0][r]['buckets']:
        print("%s\t%s:%s" % (r, b['key'], b['value']))
[INFO]  2019-03-11 16:22:24,556 1 results received in 5 seconds


ENSG00000139687 0:269
ENSG00000139687 -1:139
ENSG00000139687 -2:81
ENSG00000139687 1:3
ENSG00000141510 0:329
ENSG00000141510 -1:126
ENSG00000141510 -2:37
ENSG00000171862 0:327
ENSG00000171862 -2:95
ENSG00000171862 -1:64
ENSG00000171862 1:5
ENSG00000171862 2:1

Create a barchart showing the counts of copy number altered samples in the cohort.

val = []
count = []
for i in sorted(res[0]['ENSG00000139687']['buckets'], key=lambda x:int(x["key"])):
    val.append(int(i["key"]))
    count.append(i["value"])
plt.bar(val, count, width=0.35)
<BarContainer object of 4 artists>

png


Cohort Mutation Counts

import pandas
import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Select all the tumor samples in the TCGA KIRC cohort, and aggregate across the ensembl_gene field.

q = O.query().V("Project:TCGA-KIRC")
q = q.in_("InProject").in_("SampleFor").has(gripql.eq("gdc_attributes.sample_type", "Primary Tumor"))
q = q.in_("AliquotFor").in_("CallsetFor").outE("AlleleCall")
q = q.has(gripql.contains("methods", "MUTECT"))
q = q.aggregate(gripql.term("geneCount", "ensembl_gene"))

res = list(q)
counts = {}
for i in res[0]['geneCount']['buckets']:
    counts[i['key']] = i['value']

[INFO]  2019-03-11 16:14:14,703 1 results received in 2 seconds

Create a Pandas.Series with the output and find all the genes with 20 or more mutations

countDF = pandas.Series(counts)
goi = list(countDF.index[countDF >= 20])
for e,g in O.query().V(goi).render(["$._gid" ,"$.symbol"]):
    print(e,g)
[INFO]  2019-03-11 16:18:28,689 10 results received in 0 seconds


ENSG00000007174 DNAH9
ENSG00000081479 LRP2
ENSG00000134086 VHL
ENSG00000151914 DST
ENSG00000155657 TTN
ENSG00000163930 BAP1
ENSG00000163939 PBRM1
ENSG00000181143 MUC16
ENSG00000181555 SETD2
ENSG00000198793 MTOR

Drug Response

import pandas
import gripql
import itertools
import scipy.stats as stats

conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Find all of the aliquots in the CTRP experiment

q = O.query().V("Program:CTRP").in_("InProgram").in_("InProject").in_("SampleFor").in_("AliquotFor").distinct("_gid")
all_aliquots = []
for row in q:
    all_aliquots.append(row.gid)
[INFO]  2019-03-11 16:27:26,999 841 results received in 0 seconds

For the genes of interest, get Ensembl gene ids, from the HUGO symbols

GENES = ["CDKN2A", "PTEN", "TP53", "SMAD4"]
gene_ids = {}
for g in GENES:
    for i in O.query().V().hasLabel("Gene").has(gripql.eq("symbol", g)):
        gene_ids[g] = i.gid
[INFO]  2019-03-11 16:27:27,225 1 results received in 0 seconds
[INFO]  2019-03-11 16:27:27,399 1 results received in 0 seconds
[INFO]  2019-03-11 16:27:27,576 1 results received in 0 seconds
[INFO]  2019-03-11 16:27:27,752 1 results received in 0 seconds
gene_ids
{'CDKN2A': 'ENSG00000147889',
 'PTEN': 'ENSG00000171862',
 'TP53': 'ENSG00000141510',
 'SMAD4': 'ENSG00000141646'}

For each of the genes, find the set of samples that have a mutation in that gene

mut_samples = {}
norm_samples = {}

q = O.query().V(all_aliquots).as_("sample").in_("CallsetFor").outE("AlleleCall")
q = q.has(gripql.within("ensembl_gene", list(gene_ids.values()))).as_("variant")
q = q.render({"sample" : "$sample._gid", "gene" : "$variant._data.ensembl_gene"})

for res in q:
    mut_samples[res.gene] = mut_samples.get(res.gene, set()) | set([res.sample])

#get CCLE samples without mutation    
for i in gene_ids.values():
    norm_samples[i] = list(set(all_aliquots).difference(mut_samples[i]))

    print( "%s Positive Set: %d" % (i, len(mut_samples[i])) )
    print( "%s Negative Set: %d" % (i, len(norm_samples[i])) )

[INFO]  2019-03-11 16:27:48,969 1,198 results received in 21 seconds


ENSG00000147889 Positive Set: 99
ENSG00000147889 Negative Set: 742
ENSG00000171862 Positive Set: 120
ENSG00000171862 Negative Set: 721
ENSG00000141510 Positive Set: 597
ENSG00000141510 Negative Set: 244
ENSG00000141646 Positive Set: 69
ENSG00000141646 Negative Set: 772
pos_response = {}
for g in gene_ids.values():
    pos_response[g] = {}
    q = O.query().V(list(mut_samples[g])).in_("ResponseIn").has(gripql.eq("source", "CTRP")).as_("a").out("ResponseTo").as_("b").select(["a", "b"])
    for row in q:
        v = row['a']['data']['act_area']
        compound = row['b']['gid']
        if compound not in pos_response[g]:
            pos_response[g][compound] = [ v ]
        else:
            pos_response[g][compound].append(v)
   
[INFO]  2019-03-11 16:28:07,266 45,065 results received in 18 seconds
[INFO]  2019-03-11 16:28:28,027 52,559 results received in 20 seconds
[INFO]  2019-03-11 16:30:08,716 261,093 results received in 100 seconds
[INFO]  2019-03-11 16:30:19,911 30,334 results received in 11 seconds
neg_response = {}
for g in gene_ids.values():
    neg_response[g] = {}
    q = O.query().V(list(norm_samples[g])).in_("ResponseIn").has(gripql.eq("source", "CTRP")).as_("a").out("ResponseTo").as_("b").select(["a", "b"])
    for row in q:
        v = row['a']['data']['act_area']
        compound = row['b']['gid']
        if compound not in neg_response[g]:
            neg_response[g][compound] = [ v ]
        else:
            neg_response[g][compound].append(v)
   
[INFO]  2019-03-11 16:32:24,361 321,190 results received in 124 seconds
[INFO]  2019-03-11 16:34:41,644 313,696 results received in 137 seconds
[INFO]  2019-03-11 16:35:22,911 105,162 results received in 41 seconds
[INFO]  2019-03-11 16:37:35,496 335,921 results received in 132 seconds
drugs = set(itertools.chain.from_iterable( i.keys() for i in pos_response.values() ))
out = []
for drug in drugs:
    for g in gene_ids.values():
        if drug in pos_response[g] and drug in neg_response[g]:
            row = {"drug" : drug, "mutation" : g}
            mut_values = pos_response[g][drug]
            norm_values = neg_response[g][drug]
            if len(mut_values) > 5 and len(norm_values) > 5:
                s = stats.ttest_ind(mut_values, norm_values, equal_var=False)
                row["t-statistic"] = s.statistic
                row["t-pvalue"] = s.pvalue
                s = stats.f_oneway(mut_values, norm_values)
                row["a-statistic"] = s.statistic
                row["a-pvalue"] = s.pvalue
                out.append(row)
pandas.DataFrame(out, columns=["drug", "mutation", "t-statistic", "t-pvalue", "a-statistic", "a-pvalue"]).sort_values("a-pvalue").head(30)
drug mutation t-statistic t-pvalue a-statistic a-pvalue
1038 Compound:CID11433190 ENSG00000141510 13.047139 5.317522e-31 259.778790 1.184247e-50
1845 Compound:CID10127622 ENSG00000171862 14.762042 7.981893e-47 179.304787 1.641445e-40
1846 Compound:CID10127622 ENSG00000141510 10.688355 2.465168e-26 127.405960 2.358116e-29
1338 Compound:CID24978538 ENSG00000141510 7.740424 1.240948e-14 63.907146 1.483366e-15
1510 Compound:CID11609586 ENSG00000141510 6.241976 8.088568e-10 52.314159 7.472634e-13
693 Compound:CHEMBL401930 ENSG00000171862 8.302645 2.440095e-14 49.757986 3.862614e-12
1336 Compound:CID24978538 ENSG00000147889 6.527219 9.533273e-11 40.010125 2.659870e-10
558 Compound:CID31703 ENSG00000141510 5.785938 1.071401e-08 39.482605 4.256624e-10
1339 Compound:CID24978538 ENSG00000141646 6.429776 2.146093e-10 34.934173 3.547833e-09
549 Compound:CID9825149 ENSG00000171862 5.932992 1.955910e-08 33.090768 1.266485e-08
1901 Compound:CID12003241 ENSG00000171862 6.056617 9.415681e-09 31.770410 2.396710e-08
917 Compound:CID16038120 ENSG00000171862 6.327688 2.345613e-09 28.930932 9.928930e-08
709 Compound:CHEMBL1091644 ENSG00000171862 7.158885 1.212246e-11 27.669131 1.856315e-07
1450 Compound:CID11626560 ENSG00000141510 4.969099 8.227653e-07 25.591613 4.718561e-07
494 Compound:CID44462760 ENSG00000141510 3.845680 1.844618e-04 25.876791 5.647977e-07
374 Compound:CID11717001 ENSG00000141510 4.286482 2.411596e-05 24.598161 8.679683e-07
1656 Compound:CID16722836 ENSG00000147889 4.586444 1.142000e-05 24.162761 1.078975e-06
1101 Compound:CID10231331 ENSG00000171862 5.831482 2.502220e-08 23.939656 1.200220e-06
1073 Compound:CID24785538 ENSG00000171862 6.561520 3.916417e-10 23.661479 1.385508e-06
293 Compound:CID5328940 ENSG00000171862 4.952539 1.930155e-06 23.299296 1.668798e-06
1965 Compound:CID11707110 ENSG00000171862 6.957522 1.812686e-10 23.331620 1.950337e-06
1473 Compound:CID176870 ENSG00000171862 5.251573 2.702363e-07 19.892882 8.791931e-06
1781 Compound:CID10184653 ENSG00000171862 4.889542 2.639552e-06 18.957445 1.534211e-05
949 Compound:CID9915743 ENSG00000171862 4.919408 1.938827e-06 17.633489 2.977588e-05
1244 Compound:CID451668 ENSG00000147889 5.080006 5.546808e-07 16.400735 5.290745e-05
1844 Compound:CID10127622 ENSG00000147889 -4.226107 2.518708e-05 16.273715 5.526157e-05
1134 Compound:CID6253 ENSG00000141510 3.863943 1.304145e-04 16.169516 6.347135e-05
1657 Compound:CID16722836 ENSG00000171862 4.043392 8.329122e-05 16.168543 6.356201e-05
1350 Compound:CID24180719 ENSG00000141510 3.209481 1.473440e-03 14.775437 1.308989e-04
1062 Compound:CID6505803 ENSG00000141510 3.573446 3.677986e-04 14.626433 1.345047e-04

Expression Data

import seaborn as sns
import pandas
import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Download gene expression values from TCGA-READ cohort and build matrix with submitter id as label

c = O.query().V("Project:TCGA-READ").in_("InProject").in_("SampleFor").as_("sample")
c = c.in_("AliquotFor").in_("GeneExpressionOf").as_("exp")
c = c.render( ["$sample._data.gdc_attributes.submitter_id", "$exp._data.values"])
data = {}
for row in c.execute(stream=True):
    data[row[0]] = row[1]
samples = pandas.DataFrame(data).transpose().fillna(0.0)
[INFO]  2019-03-11 16:34:51,885 177 results received in 79 seconds
samples["ENSG00000000003"].sort_values(ascending=False).head()
TCGA-DC-5869-01A    273.06691
TCGA-AF-3913-01A    247.02666
TCGA-EF-5831-01A    245.57436
TCGA-DC-6683-01A    243.55564
TCGA-DC-5337-01A    240.26590
Name: ENSG00000000003, dtype: float64
sns.kdeplot(samples['ENSG00000000003'], color="g")
<matplotlib.axes._subplots.AxesSubplot at 0x134257e48>

png


Gene Mutation Hotstops

import matplotlib.pyplot as plt
import pandas
import gripql
conn = gripql.Connection("https://bmeg.io/api", credential_file="/tmp/bmeg_credentials.json")
O = conn.graph("bmeg_rc1_2")

Get BRCA1 start and stop locations

loc = list( O.query().V().hasLabel("Gene").has(gripql.eq("symbol", "BRCA1")).render(["$.start", "$.end"]) )[0]
[INFO]  2019-03-11 15:57:14,808 1 results received in 1 seconds
counts = [0] * (loc[1]-loc[0])
q = O.query().V().hasLabel("Gene").has(gripql.eq("symbol", "BRCA1")).in_("AlleleIn").has(gripql.eq("type", "SNP"))
q = q.aggregate(gripql.term("brac1_pos", "start"))
res = list(q)[0]
for v in res.brac1_pos.buckets:
    counts[ v['key'] - loc[0] ] = v['value']
[INFO]  2019-03-11 15:57:16,372 1 results received in 1 seconds
s = pandas.DataFrame(counts)
s.rolling(500).sum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x115d41390>

png