comparison impc_tool.py @ 0:0a9cf7f52b9c draft default tip

planemo upload commit 213f6eeb03f96bb13d0ace6e0c87e2562d37f728-dirty
author infr
date Wed, 22 Jun 2022 13:36:44 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0a9cf7f52b9c
1 import sys
2 import requests
3 import pandas as pd
4 import urllib.request as url
5
6 impc_api_url = "https://www.gentar.org/impc-dev-api/"
7 impc_api_search_url = f"{impc_api_url}/genes"
8 impc_api_gene_bundle_url = f"{impc_api_url}/geneBundles"
9
10
11 def stop_err(msg):
12 sys.exit(msg)
13
14
15 def main():
16 inp = str(sys.argv[1])
17 query = str(sys.argv[3])
18
19 try:
20 if query == '7':
21 full_gene_table()
22 sys.exit(0)
23
24 if str(sys.argv[5])=="txt":
25 s = str(sys.argv[6])
26 if s == "t":
27 sep = "\t"
28 elif s == "s":
29 sep = " "
30 elif s in ",;.":
31 sep = s
32 else:
33 sys.exit("Separator not valid, please change it.")
34 inp = pd.read_csv(inp, header=None, delimiter=sep)
35 if len(inp.columns)==1:
36 inp = str(inp[0].values[0]).replace("'","")
37 else:
38 inp = inp.to_string(header=False, index=False).replace(" ",",")
39
40 if query == '8':
41 genes_in_pipeline(inp)
42 sys.exit(0)
43 elif query == '10': # it's here but not totally implemented
44 par_pip_ma(inp)
45 sys.exit(0)
46 elif query == '11': # it's here but not totally implemented
47 par_gen(inp)
48 sys.exit(0)
49 elif query == '2' or query == "4":
50 final_list=pheno_mapping(inp)
51 else:
52 final_list=gene_mapping(inp)
53 inp= ",".join(final_list)
54
55
56 if query == '1':
57 get_pheno(inp)
58 sys.exit(0)
59 elif query == '2':
60 get_genes(inp)
61 sys.exit(0)
62 elif query == '3':
63 gene_set(inp)
64 sys.exit(0)
65 elif query == '4':
66 extr_img(inp)
67 sys.exit(0)
68 elif query == '5':
69 parameters(inp)
70 sys.exit(0)
71 elif query == '6':
72 sign_par(inp)
73 sys.exit(0)
74 elif query == '9':
75 sign_mp(inp)
76 sys.exit(0)
77 else:
78 stop_err("Error, non-implemented query selected: " + query)
79 except Exception as ex:
80 stop_err('Error running impc_tool.py:\n' + str(ex))
81
82
83 # 1-Given a gene id, retrieve all the phenotypes related to it (id and name)
84 def get_pheno(inp):
85 head = sys.argv[4]
86 mgi_accession_id = inp
87
88 gene_url = f"{impc_api_search_url}/{mgi_accession_id}"
89 gene_data = requests.get(gene_url).json()
90
91 p_list = []
92 id_list = []
93
94 if gene_data['significantMpTerms'] == None:
95 stop_err("No significant MP terms found for this gene")
96 else:
97 for x in gene_data['significantMpTerms']:
98 p_list.append(x['mpTermId'])
99 id_list.append(x['mpTermName'])
100
101 df = pd.DataFrame()
102 df['MP term name'] = p_list
103 df['MP term id'] = id_list
104
105 if head == 'True':
106 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
107 else:
108 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
109
110
111 # 3-Extract all genes having a particular phenotype or a set of phenotypes (e.g. relevant to a disease)
112 def get_genes(inp):
113 head = sys.argv[4]
114 target_mp_terms = inp
115
116 ## All the data is paginated using the page and size parameters, by default the endpoint returns the first 20 hits
117 gene_by_phenotypes_query = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={target_mp_terms}&page=0&size=20"
118 genes_with_clinical_chemistry_phenotypes = requests.get(gene_by_phenotypes_query).json()
119 print(f"Genes with {target_mp_terms}: {genes_with_clinical_chemistry_phenotypes['page']['totalElements']}")
120 list_of_genes = pd.DataFrame(columns=['Gene accession id', 'Gene name', 'Gene bundle url'])
121 acc = []
122 name = []
123 url = []
124
125 for gene in genes_with_clinical_chemistry_phenotypes['_embedded']['genes']:
126 acc.append(gene['mgiAccessionId'])
127 name.append(gene['markerName'])
128 url.append(gene['_links']['geneBundle']['href'])
129
130 list_of_genes['Gene accession id'] = acc
131 list_of_genes['Gene name'] = name
132 list_of_genes['Gene bundle url'] = url
133
134 if head == 'True':
135 list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
136 else:
137 list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
138
139 # 4. Extract all phenotypes which are present in a particular gene set (e.g. genes together in a pathway)
140
141 def gene_set(inp):
142 head = sys.argv[4]
143 target_genes = inp
144
145 genes_in_gene_list_query = f"{impc_api_search_url}/search/findAllByMgiAccessionIdIn?mgiAccessionIds={target_genes}"
146
147 genes_in_gene_list = requests.get(genes_in_gene_list_query).json()
148 list_of_mp_terms_vs_gene_index = {}
149
150 for gene in genes_in_gene_list['_embedded']['genes']:
151 mp_terms = gene['significantMpTerms']
152 gene_acc_id = gene["mgiAccessionId"]
153 if mp_terms is None:
154 continue
155 for mp_term_name in mp_terms:
156 if mp_term_name['mpTermId'] not in list_of_mp_terms_vs_gene_index:
157 list_of_mp_terms_vs_gene_index[mp_term_name['mpTermId']] = {"mp_term": mp_term_name['mpTermId'], "mp_name": mp_term_name['mpTermName'], "genes": []}
158 list_of_mp_terms_vs_gene_index[mp_term_name['mpTermId']]["genes"].append(gene_acc_id)
159 genes_by_mp_term = list(list_of_mp_terms_vs_gene_index.values())
160
161 df = pd.DataFrame()
162 terms = []
163 names = []
164 genes = []
165 for i in genes_by_mp_term:
166 terms.append(i['mp_term'])
167 names.append(i['mp_name'])
168 genes.append(",".join(i['genes']))
169
170 df['mp_term'] = terms
171 df['mp_name'] = names
172 df['genes'] = genes
173
174 if head == 'True':
175 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
176 else:
177 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
178
179 # 7. Extract images with a particular phenotype or a set of phenotypes
180
181
182 def extr_img(inp):
183 head = sys.argv[4]
184 target_mp_terms = inp # ['MP:0002110', 'MP:0000559']
185
186 ## All the data is paginated using the page and size parameters, by default the endpoint returns the first 20 hits
187 gene_by_phenotypes_query = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={target_mp_terms}&page=0&size=20"
188 genes_with_morphology_mps = requests.get(gene_by_phenotypes_query).json()
189 list_of_gene_bundle_urls = [gene["_links"]["geneBundle"]['href'] for gene in
190 genes_with_morphology_mps['_embedded']['genes']]
191
192 gene_bundles = []
193 for gene_bundle_url in list_of_gene_bundle_urls:
194 gene_bundle = requests.get(gene_bundle_url).json()
195 gene_bundles.append(gene_bundle)
196
197 images_with_morphology_mps = []
198
199 ## Doing just the first 20 and filtering out fields on the images
200 display_fields = ['geneSymbol', 'parameterName', 'biologicalSampleGroup', 'colonyId', 'zygosity', 'sex',
201 'downloadUrl', 'externalSampleId', 'thumbnailUrl']
202
203
204 for gene_bundle in gene_bundles[:20]:
205 if len(gene_bundle) == 4:
206 continue
207 if gene_bundle["geneImages"] is not None:
208 images = gene_bundle["geneImages"]
209 for image in images:
210 display_image = {k: v for k, v in image.items() if k in display_fields}
211 images_with_morphology_mps.append(display_image)
212
213 images_table = []
214 print(f"Images related to phenotype {target_mp_terms}: {len(images_with_morphology_mps)}")
215 ## Displaying just the first 20 images
216 for i in images_with_morphology_mps[:20]:
217 row = [f"<img src='{i['thumbnailUrl']}' />"] + list(i.values())
218 images_table.append(row)
219
220 df = pd.DataFrame()
221 externalSampleId = []
222 geneSymbol = []
223 biologicalSampleGroup = []
224 sex = []
225 colonyId = []
226 zygosity = []
227 parameterName = []
228 downloadUrl = []
229 thumbnailUrl = []
230
231 for i in images_table:
232 externalSampleId.append(i[1])
233 geneSymbol.append(i[2])
234 biologicalSampleGroup.append(i[3])
235 sex.append(i[4])
236 colonyId.append(i[5])
237 zygosity.append(i[6])
238 parameterName.append(i[7])
239 downloadUrl.append(i[8])
240 thumbnailUrl.append(i[9])
241
242 df['externalSampleId'] = externalSampleId
243 df['geneSymbol'] = geneSymbol
244 df['biologicalSampleGroup'] = biologicalSampleGroup
245 df['sex'] = sex
246 df['colonyId'] = colonyId
247 df['zygosity'] = zygosity
248 df['parameterName'] = parameterName
249 df['downloadUrl'] = downloadUrl
250 df['thumbnailUrl'] = thumbnailUrl
251
252 if head == 'True':
253 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
254 else:
255 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
256
257 # 11- Which parameters have been measured for a particular knockout EASY
258
259
260 def parameters(inp):
261 head = sys.argv[4]
262 knockout = inp # "MGI:104636"
263 gene_info = requests.get(impc_api_search_url + "/" + knockout).json()
264
265 if gene_info['phenotypingDataAvailable']:
266 geneBundle = requests.get(gene_info['_links']['geneBundle']['href']).json()
267 gen_imgs = geneBundle['geneImages']
268 par_list = []
269 l = {}
270 for i in gen_imgs:
271 l = {"Parameter Name": i['parameterName']}
272 if l not in par_list:
273 par_list.append(l)
274 df = pd.DataFrame()
275 l = []
276
277 for i in par_list:
278 l.append(i['Parameter Name'])
279
280 df['Parameter'] = l
281 if head == 'True':
282 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
283 else:
284 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
285
286 else:
287 stop_err("No parameters available for this knockout gene")
288
289
290 # 12- Which parameters identified a significant finding for a particular knockout line (colony) EASY
291 def sign_par(inp):
292 head = sys.argv[4]
293 knockout = inp # "MGI:104636"
294
295 gene_info = requests.get(f"{impc_api_url}statisticalResults/search/findAllByMarkerAccessionIdIsAndSignificantTrue?mgiAccessionId=" + knockout).json()
296 gene_stats = gene_info['_embedded']['statisticalResults']
297
298 if len(gene_stats) == 0:
299 stop_err("No statistically relevant parameters found for this knockout gene")
300 else:
301 df = pd.DataFrame()
302 n = []
303 p = []
304 for g in gene_stats:
305 n.append(g['parameterName'])
306 p.append(g['pvalue'])
307
308 df['Parameter name'] = n
309 df['p-value'] = p
310 if head == 'True':
311 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
312 else:
313 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
314
315
316 # 13- List of genes names and ID measured in a pipeline
317 def genes_in_pipeline(inp):
318 head = sys.argv[4]
319 pip = inp
320
321 g_in_p_query = f"{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={pip}&page=0&size=1000"
322 genes_in_pip = requests.get(g_in_p_query).json()
323 pages = genes_in_pip['page']['totalPages']
324 max_elem = genes_in_pip['page']['totalElements']
325
326 print(f"Genes with {pip}: {genes_in_pip['page']['totalElements']}")
327 d ={ }
328 list_d = []
329 list_of_genes = pd.DataFrame(columns=['Gene accession id', 'Gene name'])
330 acc = []
331 name = []
332
333 if max_elem > 1000:
334 g_in_p_query = genes_in_pip['_embedded']['genes']
335 for i in range(1,pages):
336 gl = requests.get(f'{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={pip}&page={i}&size=1000').json()['_embedded']['genes']
337 g_in_p_query += gl
338 else:
339 g_in_p_query = genes_in_pip['_embedded']['genes']
340
341 for g in g_in_p_query:
342 d = {"Gene Accession ID": g['mgiAccessionId'], "Gene Name": g['markerName']}
343 list_d.append(d)
344
345 for i in list_d:
346 acc.append(i['Gene Accession ID'])
347 name.append(i['Gene Name'])
348
349 list_of_genes['Gene accession id'] = acc
350 list_of_genes['Gene name'] = name
351
352 if head == 'True':
353 list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
354 else:
355 list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
356
357
358 # 14- Extract all genes and corresponding phenotypes related to a particular organ system(eg: significatMPTerm)
359 def sign_mp(inp):
360 head = sys.argv[4]
361 mp_term = inp # ['MP:0005391']
362
363 gene_by_mpterm_query = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={mp_term}&size=1000"
364 genes_with_mpterm = requests.get(gene_by_mpterm_query).json()
365
366 pages = genes_with_mpterm['page']['totalPages']
367 genes_info = genes_with_mpterm['_embedded']['genes']
368
369 for pn in range(1,pages):
370 pq = f"{impc_api_search_url}/search/findAllBySignificantMpTermIdsContains?mpTermIds={mp_term}&page={pn}&size=1000"
371 g = requests.get(pq).json()['_embedded']['genes']
372 genes_info += g
373
374 list_d=[]
375 d={}
376 for g in genes_info:
377 names=[]
378 ids=[]
379 for s in g['significantMpTerms']:
380 names.append(s['mpTermName'])
381 ids.append(s['mpTermId'])
382 d={'Gene':g['mgiAccessionId'], 'mpTermId': ids, 'mpTermName':names}
383 list_d.append(d)
384
385
386 g = []
387 ids = []
388 names = []
389 for i in list_d:
390 g.append(i['Gene'])
391 ids.append(i['mpTermId'])
392 names.append(i['mpTermName'])
393
394 df = pd.DataFrame()
395 df['Gene Id']=g
396 df['Significant MP terms Ids']=ids
397 df['Significant MP terms Names']=names
398
399 if head == 'True':
400 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
401 else:
402 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
403
404
405 # 16- Full table of genes and all identified phenotypes
406
407 def full_gene_table():
408 head = sys.argv[4]
409 gene_list = requests.get(impc_api_search_url + '?page=0&size=1000').json()
410 pages = gene_list['page']['totalPages']
411 genes_info = gene_list['_embedded']['genes']
412
413 for pn in range(1,pages):
414 gp = requests.get(impc_api_search_url + f'?page={pn}&size=1000').json()['_embedded']['genes']
415 genes_info += gp
416
417 d = {}
418 list_d=[]
419
420 for i in genes_info:
421 l = []
422 if i['significantMpTerms'] is None:
423 d={"Gene": i['mgiAccessionId'], "Identified phenotypes": "None"}
424 else:
425 d = {"Gene": i['mgiAccessionId'], "Identified phenotypes": [sub['mpTermId'] for sub in i['significantMpTerms']]}
426 list_d.append(d)
427
428 df = pd.DataFrame()
429 g = []
430 p = []
431 for i in list_d:
432 g.append(i['Gene'])
433 p.append(i['Identified phenotypes'])
434
435 df['MGI id'] = g
436 df['MP term list'] = p
437
438 for i in range(0, len(df)):
439 if df['MP term list'][i] != "None":
440 df['MP term list'][i] = str(df['MP term list'][i])[1:-1].replace("'", "")
441
442 if str(sys.argv[1]) == 'True':
443 if head == 'True':
444 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
445 else:
446 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
447 else:
448 df = df[df['MP term list'] != "None"]
449 df.reset_index(drop=True, inplace=True)
450 if head == 'True':
451 df.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
452 else:
453 df.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
454
455 # Old method, chech which is faster
456 # max_elem = gene_list['page']['totalElements']
457 # d = {}
458 # list_d = []
459 # for i in range(0, pages):
460 # gl = requests.get(impc_api_search_url + '?page=' + str(i) + '&size=' + str(max_elem)).json()
461 # for g in gl['_embedded']['genes']:
462 # if g['significantMpTerms'] is None:
463 # d = {"Gene": g['mgiAccessionId'], "Identified phenotypes": "None"}
464 # else:
465 # d = {"Gene": g['mgiAccessionId'], "Identified phenotypes": [ sub['mpTermId'] for sub in g['significantMpTerms'] ]}
466 # list_d.append(d)
467
468
469
470
471 # 18- Extract measurements and analysis for a parameter or pipeline
472
473 def par_pip_ma(inp):
474 head = sys.argv[4]
475 id = inp
476
477 if id[0:4] == "IMPC":
478 par = True
479 ma_query = f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page=0&size=1000"
480 else:
481 ma_query = f"{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={id}&page=0&size=1000"
482 par = False
483
484 ma_in_pip = requests.get(ma_query).json()
485 pages = ma_in_pip['page']['totalPages']
486 max_elem = ma_in_pip['page']['totalElements']
487
488 print(f"Genes with {id}: {ma_in_pip['page']['totalElements']}")
489 d = {}
490 list_d = []
491 list_of_genes = pd.DataFrame(columns=['Measurements', 'Analysis'])
492 mes = []
493 an = []
494
495 if max_elem > 1000:
496
497 ma_in_pip = ma_in_pip['_embedded']['genes']
498 for pn in range(1, pages):
499 if par:
500 pip = requests.get(f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page={pn}&size=1000").json()['_embedded']['genes']
501 else:
502 pip = requests.get(f"{impc_api_search_url}/search/findAllByTestedPipelineId?pipelineId={id}&page={pn}&size=1000").json()['_embedded']['genes']
503 ma_in_pip += pip
504
505 else:
506 ma_in_pip = ma_in_pip['_embedded']['genes']
507
508 for g in ma_in_pip:
509 d = {"Measurements": g[''], "Analysis": g['']}
510 list_d.append(d)
511
512 for i in list_d:
513 mes.append(i[''])
514 an.append(i[''])
515
516 list_of_genes['Analysis'] = an
517 list_of_genes['Measurements'] = mes
518
519 if head == 'True':
520 list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
521 else:
522 list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
523
524
525 # 19- Get all genes and measured values for a particular parameter
526 def par_gen(inp):
527 head = sys.argv[4]
528 id = inp
529
530 pa_query = f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page=0&size=1000"
531
532 gm_par = requests.get(pa_query).json()
533 pages = gm_par['page']['totalPages']
534 max_elem = gm_par['page']['totalElements']
535
536 print(f"Genes with {id}: {gm_par['page']['totalElements']}")
537 d = {}
538 list_d = []
539 list_of_genes = pd.DataFrame(columns=['Genes', 'Measured Values'])
540 gen = []
541 mes = []
542
543 if max_elem > 1000:
544
545 gm_par = gm_par['_embedded']['genes']
546
547 for pn in range(1, pages):
548 pip = requests.get(f"{impc_api_search_url}/search/findAllByTestedParameterId?parameterId={id}&page={pn}&size=1000").json()['_embedded']['genes']
549 gm_par += pip
550
551 else:
552 gm_par = gm_par['_embedded']['genes']
553
554
555 for g in gm_par:
556 d = {"Genes": g['mgiAccessionId'], "Measured Values": g['']}
557 list_d.append(d)
558
559 for i in list_d:
560 gen.append(i['Genes'])
561 mes.append(i['Measured Values'])
562
563 list_of_genes['Genes'] = gen
564 list_of_genes['Measured Values'] = mes
565
566 if head == 'True':
567 list_of_genes.to_csv(sys.argv[2], header=True, index=False, sep="\t", index_label=False)
568 else:
569 list_of_genes.to_csv(sys.argv[2], header=False, index=False, sep="\t", index_label=False)
570
571
572 def gene_mapping(inp):
573 tmp = inp.split(",")
574 final_list = []
575 sym_list = []
576 for i in tmp:
577 if 'MGI:' in i:
578 final_list.append(i)
579 else:
580 sym_list.append(i)
581 del (i)
582 if len(sym_list) != 0:
583 sym_list = ",".join(sym_list)
584 biodbnet = f'https://biodbnet.abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.xml?method=db2db&format=row&input=genesymbol&inputValues={sym_list}&outputs=mgiid&taxonId=10090'
585 u = url.urlopen(biodbnet)
586 db = pd.read_xml(u, elems_only=True)
587 empty = True
588 discarded = []
589 for i in db.index:
590 if db['MGIID'][i] != '-':
591 empty = False
592 final_list.append(db['MGIID'][i][4:])
593 break
594 else:
595 discarded.append(db['MGIID'][i][4:])
596
597 if (len(db) == 0 and len(final_list) == 0) or (empty and len(final_list) == 0):
598 stop_err("Error: it was not possible to map the input.")
599 elif empty:
600 print("Warning: it was not possible to map any of the gene symbols entry. Only MGI entries will be used.")
601 elif len(discarded) != 0:
602 print("Warning: it was not possible to map these elements: " + ",".join(discarded) + "\n")
603 return(final_list)
604
605 def pheno_mapping(inp):
606 tmp = inp.split(",")
607 final_list = []
608 sym_list = []
609 for i in tmp:
610 if 'MP:' in i:
611 final_list.append(i)
612 else:
613 sym_list.append(i)
614 del (i)
615 if len(sym_list) != 0:
616 url="https://raw.githubusercontent.com/AndreaFurlani/hp_mp_mapping_test/main/hp_mp_mapping.csv"
617 mapper = pd.read_csv(url,header=0,index_col=2)
618 empty = True
619 discarded = []
620 for i in sym_list:
621 try:
622 final_list.append(mapper.loc[i]['mpId'])
623 empty=False
624 except KeyError:
625 discarded.append(i)
626 continue
627 if empty and len(final_list)==0:
628 stop_err("Error: it was not possible to map the input.")
629 elif empty:
630 print("Warning: it was not possible to map any of the HP term entries. Only MP entries will be used.")
631 elif len(discarded) != 0:
632 print("Warning: it was not possible to map these elements: " + ",".join(discarded) + "\n")
633 return (final_list)
634
635 if __name__ == "__main__":
636 main()