Mercurial > repos > computational-metabolomics > sirius_csifingerid
comparison sirius_csifingerid.py @ 3:4cbfd3d0a4c4 draft
"planemo upload for repository https://github.com/computational-metabolomics/sirius_csifingerid_galaxy commit e4bc02f97a21da7556d1b76e5338ede3a9031fac"
| author | computational-metabolomics |
|---|---|
| date | Wed, 02 Feb 2022 17:29:46 +0000 |
| parents | 856b3761277d |
| children | 8fb51147d15e |
comparison
equal
deleted
inserted
replaced
| 2:856b3761277d | 3:4cbfd3d0a4c4 |
|---|---|
| 1 from __future__ import absolute_import, print_function | |
| 2 | |
| 3 import argparse | 1 import argparse |
| 4 import csv | 2 import csv |
| 5 import glob | 3 import glob |
| 6 import multiprocessing | 4 import multiprocessing |
| 7 import os | 5 import os |
| 9 import sys | 7 import sys |
| 10 import tempfile | 8 import tempfile |
| 11 import uuid | 9 import uuid |
| 12 from collections import defaultdict | 10 from collections import defaultdict |
| 13 | 11 |
| 14 import six | |
| 15 | 12 |
| 16 parser = argparse.ArgumentParser() | 13 parser = argparse.ArgumentParser() |
| 17 parser.add_argument('--input_pth') | 14 parser.add_argument('--input_pth') |
| 18 parser.add_argument('--result_pth') | 15 parser.add_argument('--canopus_result_pth') |
| 16 parser.add_argument('--annotations_result_pth') | |
| 19 parser.add_argument('--database') | 17 parser.add_argument('--database') |
| 20 parser.add_argument('--profile') | 18 parser.add_argument('--profile') |
| 21 parser.add_argument('--candidates') | 19 parser.add_argument('--candidates') |
| 22 parser.add_argument('--ppm_max') | 20 parser.add_argument('--ppm_max') |
| 23 parser.add_argument('--polarity') | 21 parser.add_argument('--polarity') |
| 26 parser.add_argument('--tool_directory') | 24 parser.add_argument('--tool_directory') |
| 27 parser.add_argument('--temp_dir') | 25 parser.add_argument('--temp_dir') |
| 28 parser.add_argument('--meta_select_col', default='all') | 26 parser.add_argument('--meta_select_col', default='all') |
| 29 parser.add_argument('--cores_top_level', default=1) | 27 parser.add_argument('--cores_top_level', default=1) |
| 30 parser.add_argument('--chunks', default=1) | 28 parser.add_argument('--chunks', default=1) |
| 31 parser.add_argument('--minMSMSpeaks', default=1) | 29 parser.add_argument('--min_MSMS_peaks', default=1) |
| 32 parser.add_argument('--rank_filter', default=0) | 30 parser.add_argument('--rank_filter', default=0) |
| 31 parser.add_argument('--confidence_filter', default=0) | |
| 32 parser.add_argument('--backwards_compatible', | |
| 33 default=False, action='store_true') | |
| 33 parser.add_argument('--schema', default='msp') | 34 parser.add_argument('--schema', default='msp') |
| 34 parser.add_argument('-a', '--adducts', action='append', nargs=1, | 35 parser.add_argument('-a', '--adducts', action='append', nargs=1, |
| 35 required=False, default=[], help='Adducts used') | 36 required=False, default=[], help='Adducts used') |
| 36 | 37 |
| 37 args = parser.parse_args() | 38 args = parser.parse_args() |
| 122 | 123 |
| 123 # function to extract the meta data using the regular expressions | 124 # function to extract the meta data using the regular expressions |
| 124 def parse_meta(meta_regex, meta_info=None): | 125 def parse_meta(meta_regex, meta_info=None): |
| 125 if meta_info is None: | 126 if meta_info is None: |
| 126 meta_info = {} | 127 meta_info = {} |
| 127 for k, regexes in six.iteritems(meta_regex): | 128 for k, regexes in meta_regex.items(): |
| 128 for reg in regexes: | 129 for reg in regexes: |
| 129 m = re.search(reg, line, re.IGNORECASE) | 130 m = re.search(reg, line, re.IGNORECASE) |
| 130 if m: | 131 if m: |
| 131 meta_info[k] = '-'.join(m.groups()).strip() | 132 meta_info[k] = '-'.join(m.groups()).strip() |
| 132 return meta_info | 133 return meta_info |
| 201 if 'precursor_type' in meta_info and meta_info['precursor_type']: | 202 if 'precursor_type' in meta_info and meta_info['precursor_type']: |
| 202 paramd["cli"]["--ion"] = meta_info['precursor_type'] | 203 paramd["cli"]["--ion"] = meta_info['precursor_type'] |
| 203 adduct = meta_info['precursor_type'] | 204 adduct = meta_info['precursor_type'] |
| 204 else: | 205 else: |
| 205 if paramd["default_ion"]: | 206 if paramd["default_ion"]: |
| 206 paramd["cli"]["--ion"] = paramd["default_ion"] | 207 paramd["cli"]["--adduct"] = paramd["default_ion"] |
| 207 adduct = paramd["default_ion"] | 208 adduct = paramd["default_ion"] |
| 208 else: | 209 else: |
| 209 paramd["cli"]["--auto-charge"] = '' | 210 paramd["cli"]["--auto-charge"] = '' |
| 210 | 211 |
| 211 if 'precursor_mz' in meta_info and meta_info['precursor_mz']: | 212 if 'precursor_mz' in meta_info and meta_info['precursor_mz']: |
| 215 in paramd['additional_details']): | 216 in paramd['additional_details']): |
| 216 # If possible always good to have the adduct in output as a column | 217 # If possible always good to have the adduct in output as a column |
| 217 paramd['additional_details']['adduct'] = adduct | 218 paramd['additional_details']['adduct'] = adduct |
| 218 | 219 |
| 219 # ============== Create CLI cmd for sirius ================================ | 220 # ============== Create CLI cmd for sirius ================================ |
| 220 cmd = "sirius --fingerid" | 221 cmd = "sirius --no-citations --ms2 {} --adduct {} --precursor {} -o {} " \ |
| 221 for k, v in six.iteritems(paramd["cli"]): | 222 "formula -c {} --ppm-max {} --profile {} " \ |
| 222 cmd += " {} {}".format(str(k), str(v)) | 223 "structure --database {} canopus".format( |
| 224 paramd["cli"]["--ms2"], | |
| 225 adduct, | |
| 226 paramd["cli"]["--precursor"], | |
| 227 paramd["cli"]["--output"], | |
| 228 paramd["cli"]["--candidates"], | |
| 229 paramd["cli"]["--ppm-max"], | |
| 230 paramd["cli"]["--profile"], | |
| 231 paramd["cli"]["--database"] | |
| 232 ) | |
| 233 print(cmd) | |
| 223 paramds[paramd["SampleName"]] = paramd | 234 paramds[paramd["SampleName"]] = paramd |
| 224 | 235 |
| 225 # =============== Run sirius ============================================= | 236 # =============== Run sirius ============================================= |
| 226 # Filter before process with a minimum number of MS/MS peaks | 237 # Filter before process with a minimum number of MS/MS peaks |
| 227 if plinesread >= float(args.minMSMSpeaks): | 238 if plinesread >= float(args.min_MSMS_peaks): |
| 228 | 239 |
| 229 if int(args.cores_top_level) == 1: | 240 if int(args.cores_top_level) == 1: |
| 230 os.system(cmd) | 241 os.system(cmd) |
| 231 | 242 |
| 232 return paramd, cmd | 243 return paramd, cmd |
| 329 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) | 340 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) |
| 330 pool.map(work, cmds_chunks) | 341 pool.map(work, cmds_chunks) |
| 331 pool.close() | 342 pool.close() |
| 332 pool.join() | 343 pool.join() |
| 333 | 344 |
| 345 | |
| 334 ###################################################################### | 346 ###################################################################### |
| 335 # Concatenate and filter the output | 347 # Concatenate and filter the output |
| 336 ###################################################################### | 348 ###################################################################### |
| 337 # outputs might have different headers. Need to get a list of all the headers | 349 # outputs might have different headers. Need to get a list of all the headers |
| 338 # before we start merging the files outfiles = [os.path.join(wd, f) for f in | 350 # before we start merging the files outfiles = [os.path.join(wd, f) for f in |
| 339 # glob.glob(os.path.join(wd, "*_metfrag_result.csv"))] | 351 # glob.glob(os.path.join(wd, "*_metfrag_result.csv"))] |
| 340 outfiles = glob.glob(os.path.join(wd, '*', '*', 'summary_csi_fingerid.csv')) | 352 def concat_output(filename, result_pth, |
| 341 | 353 rank_filter, confidence_filter, backwards_compatible): |
| 342 # sort files nicely | 354 outfiles = glob.glob(os.path.join(wd, '*', '*{}'.format(filename))) |
| 343 outfiles.sort(key=lambda s: int(re.match(r'^.*/(' | 355 |
| 344 r'\d+).*/.*/summary_csi_fingerid.csv', | 356 # sort files nicely |
| 345 s).group(1))) | 357 outfiles.sort(key=lambda s: int(re.match(r'^.*/(' |
| 346 print(outfiles) | 358 r'\d+).*{}'.format(filename), |
| 347 | 359 s).group(1))) |
| 348 if len(outfiles) == 0: | 360 print(outfiles) |
| 349 print('No results') | 361 |
| 350 sys.exit() | 362 if len(outfiles) == 0: |
| 351 | 363 print('No results') |
| 352 headers = [] | 364 sys.exit() |
| 353 c = 0 | 365 |
| 354 for fn in outfiles: | 366 headers = [] |
| 355 with open(fn, 'r') as infile: | 367 |
| 356 reader = csv.reader(infile, delimiter='\t') | 368 for fn in outfiles: |
| 357 if sys.version_info >= (3, 0): | 369 with open(fn, 'r') as infile: |
| 358 headers.extend(next(reader)) | 370 reader = csv.reader(infile, delimiter='\t') |
| 359 else: | 371 if sys.version_info >= (3, 0): |
| 360 headers.extend(reader.next()) | 372 headers.extend(next(reader)) |
| 361 break | 373 else: |
| 362 | 374 headers.extend(reader.next()) |
| 363 headers = list(paramd['additional_details'].keys()) + headers | 375 break |
| 364 | 376 |
| 365 with open(args.result_pth, 'a') as merged_outfile: | 377 headers = list(paramd['additional_details'].keys()) + headers |
| 366 dwriter = csv.DictWriter(merged_outfile, | 378 |
| 367 fieldnames=headers, delimiter='\t') | 379 with open(result_pth, 'a') as merged_outfile: |
| 368 dwriter.writeheader() | 380 dwriter = csv.DictWriter(merged_outfile, |
| 369 | 381 fieldnames=headers, delimiter='\t') |
| 370 for fn in sorted(outfiles): | 382 dwriter.writeheader() |
| 371 print(fn) | 383 |
| 372 | 384 for fn in sorted(outfiles): |
| 373 with open(fn) as infile: | 385 print(fn) |
| 374 reader = csv.DictReader(infile, delimiter='\t') | 386 |
| 375 | 387 with open(fn) as infile: |
| 376 ad = paramds[fn.split(os.sep)[-3]]['additional_details'] | 388 reader = csv.DictReader(infile, delimiter='\t') |
| 377 | 389 |
| 378 for line in reader: | 390 ad = paramds[fn.split(os.sep)[-2]]['additional_details'] |
| 379 if 0 < int(args.rank_filter) < int(line['rank']): | 391 |
| 380 # filter out those annotations greater than rank filter | 392 for line in reader: |
| 381 # If rank_filter is zero then skip | 393 if 'rank' in line and \ |
| 382 continue | 394 0 < int(rank_filter) < int(line['rank']): |
| 383 line.update(ad) | 395 # filter out those annotations greater than rank filter |
| 384 # round score to 5 d.p. | 396 # If rank_filter is zero then skip |
| 385 line['score'] = round(float(line['score']), 5) | 397 continue |
| 386 | 398 |
| 387 dwriter.writerow(line) | 399 if 'ConfidenceScore' in line \ |
| 400 and 0 < int(confidence_filter) < int(line['rank']): | |
| 401 # filter out those annotations greater than confidence filter | |
| 402 # If confidence_filter is zero then skip | |
| 403 continue | |
| 404 line.update(ad) | |
| 405 | |
| 406 dwriter.writerow(line) | |
| 407 | |
| 408 if backwards_compatible: | |
| 409 # Headers required in this format for tools that used | |
| 410 # v4.9.3 of SIRIUS-CSI:FingerID | |
| 411 s1 = "sed 's/InChIkey2D/inchikey2d/g' {r} > {r}".format(r=result_pth) | |
| 412 os.system(s1) | |
| 413 s2 = "sed 's/CSI:FingerIDScore/Score/' {r} > {r}".format(r=result_pth) | |
| 414 os.system(s2) | |
| 415 | |
| 416 | |
| 417 concat_output('canopus_summary.tsv', | |
| 418 args.canopus_result_pth, | |
| 419 args.rank_filter, | |
| 420 args.confidence_filter, | |
| 421 args.backwards_compatible) | |
| 422 concat_output('compound_identifications.tsv', | |
| 423 args.annotations_result_pth, | |
| 424 0, | |
| 425 0, | |
| 426 False) |
