comparison sirius_csifingerid.py @ 3:4cbfd3d0a4c4 draft

"planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit e4bc02f97a21da7556d1b76e5338ede3a9031fac"
author computational-metabolomics
date Wed, 02 Feb 2022 17:29:46 +0000
parents 856b3761277d
children 8fb51147d15e
comparison
equal deleted inserted replaced
2:856b3761277d 3:4cbfd3d0a4c4
1 from __future__ import absolute_import, print_function
2
3 import argparse 1 import argparse
4 import csv 2 import csv
5 import glob 3 import glob
6 import multiprocessing 4 import multiprocessing
7 import os 5 import os
9 import sys 7 import sys
10 import tempfile 8 import tempfile
11 import uuid 9 import uuid
12 from collections import defaultdict 10 from collections import defaultdict
13 11
14 import six
15 12
16 parser = argparse.ArgumentParser() 13 parser = argparse.ArgumentParser()
17 parser.add_argument('--input_pth') 14 parser.add_argument('--input_pth')
18 parser.add_argument('--result_pth') 15 parser.add_argument('--canopus_result_pth')
16 parser.add_argument('--annotations_result_pth')
19 parser.add_argument('--database') 17 parser.add_argument('--database')
20 parser.add_argument('--profile') 18 parser.add_argument('--profile')
21 parser.add_argument('--candidates') 19 parser.add_argument('--candidates')
22 parser.add_argument('--ppm_max') 20 parser.add_argument('--ppm_max')
23 parser.add_argument('--polarity') 21 parser.add_argument('--polarity')
26 parser.add_argument('--tool_directory') 24 parser.add_argument('--tool_directory')
27 parser.add_argument('--temp_dir') 25 parser.add_argument('--temp_dir')
28 parser.add_argument('--meta_select_col', default='all') 26 parser.add_argument('--meta_select_col', default='all')
29 parser.add_argument('--cores_top_level', default=1) 27 parser.add_argument('--cores_top_level', default=1)
30 parser.add_argument('--chunks', default=1) 28 parser.add_argument('--chunks', default=1)
31 parser.add_argument('--minMSMSpeaks', default=1) 29 parser.add_argument('--min_MSMS_peaks', default=1)
32 parser.add_argument('--rank_filter', default=0) 30 parser.add_argument('--rank_filter', default=0)
31 parser.add_argument('--confidence_filter', default=0)
32 parser.add_argument('--backwards_compatible',
33 default=False, action='store_true')
33 parser.add_argument('--schema', default='msp') 34 parser.add_argument('--schema', default='msp')
34 parser.add_argument('-a', '--adducts', action='append', nargs=1, 35 parser.add_argument('-a', '--adducts', action='append', nargs=1,
35 required=False, default=[], help='Adducts used') 36 required=False, default=[], help='Adducts used')
36 37
37 args = parser.parse_args() 38 args = parser.parse_args()
122 123
123 # function to extract the meta data using the regular expressions 124 # function to extract the meta data using the regular expressions
124 def parse_meta(meta_regex, meta_info=None): 125 def parse_meta(meta_regex, meta_info=None):
125 if meta_info is None: 126 if meta_info is None:
126 meta_info = {} 127 meta_info = {}
127 for k, regexes in six.iteritems(meta_regex): 128 for k, regexes in meta_regex.items():
128 for reg in regexes: 129 for reg in regexes:
129 m = re.search(reg, line, re.IGNORECASE) 130 m = re.search(reg, line, re.IGNORECASE)
130 if m: 131 if m:
131 meta_info[k] = '-'.join(m.groups()).strip() 132 meta_info[k] = '-'.join(m.groups()).strip()
132 return meta_info 133 return meta_info
201 if 'precursor_type' in meta_info and meta_info['precursor_type']: 202 if 'precursor_type' in meta_info and meta_info['precursor_type']:
202 paramd["cli"]["--ion"] = meta_info['precursor_type'] 203 paramd["cli"]["--ion"] = meta_info['precursor_type']
203 adduct = meta_info['precursor_type'] 204 adduct = meta_info['precursor_type']
204 else: 205 else:
205 if paramd["default_ion"]: 206 if paramd["default_ion"]:
206 paramd["cli"]["--ion"] = paramd["default_ion"] 207 paramd["cli"]["--adduct"] = paramd["default_ion"]
207 adduct = paramd["default_ion"] 208 adduct = paramd["default_ion"]
208 else: 209 else:
209 paramd["cli"]["--auto-charge"] = '' 210 paramd["cli"]["--auto-charge"] = ''
210 211
211 if 'precursor_mz' in meta_info and meta_info['precursor_mz']: 212 if 'precursor_mz' in meta_info and meta_info['precursor_mz']:
215 in paramd['additional_details']): 216 in paramd['additional_details']):
216 # If possible always good to have the adduct in output as a column 217 # If possible always good to have the adduct in output as a column
217 paramd['additional_details']['adduct'] = adduct 218 paramd['additional_details']['adduct'] = adduct
218 219
219 # ============== Create CLI cmd for metfrag =============================== 220 # ============== Create CLI cmd for metfrag ===============================
220 cmd = "sirius --fingerid" 221 cmd = "sirius --no-citations --ms2 {} --adduct {} --precursor {} -o {} " \
221 for k, v in six.iteritems(paramd["cli"]): 222 "formula -c {} --ppm-max {} --profile {} " \
222 cmd += " {} {}".format(str(k), str(v)) 223 "structure --database {} canopus".format(
224 paramd["cli"]["--ms2"],
225 adduct,
226 paramd["cli"]["--precursor"],
227 paramd["cli"]["--output"],
228 paramd["cli"]["--candidates"],
229 paramd["cli"]["--ppm-max"],
230 paramd["cli"]["--profile"],
231 paramd["cli"]["--database"]
232 )
233 print(cmd)
223 paramds[paramd["SampleName"]] = paramd 234 paramds[paramd["SampleName"]] = paramd
224 235
225 # =============== Run srius ============================================== 236 # =============== Run srius ==============================================
226 # Filter before process with a minimum number of MS/MS peaks 237 # Filter before process with a minimum number of MS/MS peaks
227 if plinesread >= float(args.minMSMSpeaks): 238 if plinesread >= float(args.min_MSMS_peaks):
228 239
229 if int(args.cores_top_level) == 1: 240 if int(args.cores_top_level) == 1:
230 os.system(cmd) 241 os.system(cmd)
231 242
232 return paramd, cmd 243 return paramd, cmd
329 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) 340 pool = multiprocessing.Pool(processes=int(args.cores_top_level))
330 pool.map(work, cmds_chunks) 341 pool.map(work, cmds_chunks)
331 pool.close() 342 pool.close()
332 pool.join() 343 pool.join()
333 344
345
334 ###################################################################### 346 ######################################################################
335 # Concatenate and filter the output 347 # Concatenate and filter the output
336 ###################################################################### 348 ######################################################################
337 # outputs might have different headers. Need to get a list of all the headers 349 # outputs might have different headers. Need to get a list of all the headers
338 # before we start merging the files outfiles = [os.path.join(wd, f) for f in 350 # before we start merging the files outfiles = [os.path.join(wd, f) for f in
339 # glob.glob(os.path.join(wd, "*_metfrag_result.csv"))] 351 # glob.glob(os.path.join(wd, "*_metfrag_result.csv"))]
340 outfiles = glob.glob(os.path.join(wd, '*', '*', 'summary_csi_fingerid.csv')) 352 def concat_output(filename, result_pth,
341 353 rank_filter, confidence_filter, backwards_compatible):
342 # sort files nicely 354 outfiles = glob.glob(os.path.join(wd, '*', '*{}'.format(filename)))
343 outfiles.sort(key=lambda s: int(re.match(r'^.*/(' 355
344 r'\d+).*/.*/summary_csi_fingerid.csv', 356 # sort files nicely
345 s).group(1))) 357 outfiles.sort(key=lambda s: int(re.match(r'^.*/('
346 print(outfiles) 358 r'\d+).*{}'.format(filename),
347 359 s).group(1)))
348 if len(outfiles) == 0: 360 print(outfiles)
349 print('No results') 361
350 sys.exit() 362 if len(outfiles) == 0:
351 363 print('No results')
352 headers = [] 364 sys.exit()
353 c = 0 365
354 for fn in outfiles: 366 headers = []
355 with open(fn, 'r') as infile: 367
356 reader = csv.reader(infile, delimiter='\t') 368 for fn in outfiles:
357 if sys.version_info >= (3, 0): 369 with open(fn, 'r') as infile:
358 headers.extend(next(reader)) 370 reader = csv.reader(infile, delimiter='\t')
359 else: 371 if sys.version_info >= (3, 0):
360 headers.extend(reader.next()) 372 headers.extend(next(reader))
361 break 373 else:
362 374 headers.extend(reader.next())
363 headers = list(paramd['additional_details'].keys()) + headers 375 break
364 376
365 with open(args.result_pth, 'a') as merged_outfile: 377 headers = list(paramd['additional_details'].keys()) + headers
366 dwriter = csv.DictWriter(merged_outfile, 378
367 fieldnames=headers, delimiter='\t') 379 with open(result_pth, 'a') as merged_outfile:
368 dwriter.writeheader() 380 dwriter = csv.DictWriter(merged_outfile,
369 381 fieldnames=headers, delimiter='\t')
370 for fn in sorted(outfiles): 382 dwriter.writeheader()
371 print(fn) 383
372 384 for fn in sorted(outfiles):
373 with open(fn) as infile: 385 print(fn)
374 reader = csv.DictReader(infile, delimiter='\t') 386
375 387 with open(fn) as infile:
376 ad = paramds[fn.split(os.sep)[-3]]['additional_details'] 388 reader = csv.DictReader(infile, delimiter='\t')
377 389
378 for line in reader: 390 ad = paramds[fn.split(os.sep)[-2]]['additional_details']
379 if 0 < int(args.rank_filter) < int(line['rank']): 391
380 # filter out those annotations greater than rank filter 392 for line in reader:
381 # If rank_filter is zero then skip 393 if 'rank' in line and \
382 continue 394 0 < int(rank_filter) < int(line['rank']):
383 line.update(ad) 395 # filter out those annotations greater than rank filter
384 # round score to 5 d.p. 396 # If rank_filter is zero then skip
385 line['score'] = round(float(line['score']), 5) 397 continue
386 398
387 dwriter.writerow(line) 399 if 'ConfidenceScore' in line \
400 and 0 < int(confidence_filter) < int(line['rank']):
401 # filter out those annotations greater than rank filter
402 # If rank_filter is zero then skip
403 continue
404 line.update(ad)
405
406 dwriter.writerow(line)
407
408 if backwards_compatible:
409 # Headers required in this format for tools that used
410 # v4.9.3 of SIRIUS-CSI:FingerID
411 s1 = "sed 's/InChIkey2D/inchikey2d/g' {r} > {r}".format(r=result_pth)
412 os.system(s1)
413 s2 = "sed 's/CSI:FingerIDScore/Score/' {r} > {r}".format(r=result_pth)
414 os.system(s2)
415
416
417 concat_output('canopus_summary.tsv',
418 args.canopus_result_pth,
419 args.rank_filter,
420 args.confidence_filter,
421 args.backwards_compatible)
422 concat_output('compound_identifications.tsv',
423 args.annotations_result_pth,
424 0,
425 0,
426 False)