Mercurial > repos > computational-metabolomics > metfrag
comparison metfrag.py @ 0:fd5c0b39569a draft
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit e20ce56f23d9fe30df64542ece2295d654ca142d"
author | computational-metabolomics |
---|---|
date | Wed, 05 Feb 2020 12:30:06 -0500 |
parents | |
children | 9ee2e2ceb2c9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fd5c0b39569a |
---|---|
from __future__ import absolute_import, print_function

import argparse
import csv
import glob
import multiprocessing
import os
import re
import shutil
import sys
import tempfile
from collections import defaultdict

# ConfigParser was renamed to configparser in Python 3; alias it so the
# later `ConfigParser.ConfigParser()` call works on both interpreters,
# matching the Python 3 support implied by the sys.version_info checks
# further down this script.
try:
    import ConfigParser
except ImportError:
    import configparser as ConfigParser

import six

print(sys.version)
# ----------------------------------------------------------------------
# Command line interface.
# NOTE: numeric options are declared without type=, so values supplied on
# the command line arrive as strings; downstream code converts them with
# int()/float() where needed (only the literal defaults keep their type).
# ----------------------------------------------------------------------
parser = argparse.ArgumentParser()
# Input MSP/MassBank file and path of the merged output table
parser.add_argument('--input_pth')
parser.add_argument('--result_pth', default='metfrag_result.csv')

# Working directory (a fresh temp dir is created when not given),
# acquisition polarity fallback, and minimum MS/MS peaks per spectrum
parser.add_argument('--temp_dir')
parser.add_argument('--polarity', default='pos')
parser.add_argument('--minMSMSpeaks', default=1)

# MetFrag candidate database selection
parser.add_argument('--MetFragDatabaseType', default='PubChem')
parser.add_argument('--LocalDatabasePath', default='')
parser.add_argument('--LocalMetChemDatabaseServerIp', default='')

# Mass tolerances and threading, passed through to MetFrag
parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001)
parser.add_argument('--NumberThreads', default=1)
parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
parser.add_argument('--IsotopeFilter', action='store_true')

# Candidate pre-processing filters (empty string = filter disabled)
parser.add_argument('--FilterMinimumElements', default='')
parser.add_argument('--FilterMaximumElements', default='')
parser.add_argument('--FilterSmartsInclusionList', default='')
parser.add_argument('--FilterSmartsExclusionList', default='')
parser.add_argument('--FilterIncludedElements', default='')
parser.add_argument('--FilterExcludedElements', default='')
parser.add_argument('--FilterIncludedExclusiveElements', default='')

# Post-run filtering of the merged output, input schema, and parallelism
parser.add_argument('--score_thrshld', default=0)
parser.add_argument('--pctexplpeak_thrshld', default=0)
parser.add_argument('--schema')
parser.add_argument('--cores_top_level', default=1)
parser.add_argument('--chunks', default=1)
parser.add_argument('--meta_select_col', default='name')
parser.add_argument('--skip_invalid_adducts', action='store_true')

# Scoring configuration
parser.add_argument('--ScoreSuspectLists', default='')
parser.add_argument('--MetFragScoreTypes',
                    default="FragmenterScore,OfflineMetFusionScore")
parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")

args = parser.parse_args()
print(args)
61 | |
# Load MetChem connection details from the config file that ships
# alongside this script.
config = ConfigParser.ConfigParser()
config.read(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))

# Nothing to do for an empty input file.
if os.stat(args.input_pth).st_size == 0:
    print('Input file empty')
    # sys.exit() instead of the bare exit() builtin: exit() is injected by
    # the site module and is not guaranteed in all deployments.
    sys.exit()

# Create temporary working directory (user supplied or auto-created).
if args.temp_dir:
    wd = args.temp_dir
else:
    wd = tempfile.mkdtemp()

# Always start from an empty directory so stale result files from a
# previous run cannot leak into the merged output.
if os.path.exists(wd):
    shutil.rmtree(wd)
os.makedirs(wd)
81 | |
82 ###################################################################### | |
83 # Setup regular expressions for MSP parsing dictionary | |
84 ###################################################################### | |
######################################################################
# Regular expressions used to pull metadata out of spectra records
######################################################################
# Standard MSP format: each key maps to a list of candidate patterns,
# tried in order (values are matched case-insensitively by parse_meta).
regex_msp = {
    'name': [r'^Name(?:=|:)(.*)$'],
    'polarity': [r'^ion.*mode(?:=|:)(.*)$',
                 r'^ionization.*mode(?:=|:)(.*)$',
                 r'^polarity(?:=|:)(.*)$'],
    'precursor_mz': [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
                     r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'],
    'precursor_type': [r'^precursor.*type(?:=|:)(.*)$',
                       r'^adduct(?:=|:)(.*)$',
                       r'^ADDUCTIONNAME(?:=|:)(.*)$'],
    'num_peaks': [r'^Num.*Peaks(?:=|:)\s*(\d*)$'],
    # Flag for standard MSP format (same pattern as 'name')
    'msp': [r'^Name(?:=|:)(.*)$'],
}

# Equivalent patterns for the MassBank record format.
regex_massbank = {
    'name': [r'^RECORD_TITLE:(.*)$'],
    'polarity': [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'],
    'precursor_mz': [
        r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'],
    'precursor_type': [
        r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'],
    'num_peaks': [r'^PK\$NUM_PEAK:\s+(\d*)'],
    'cols': [r'^PK\$PEAK:\s+(.*)'],
    # Flag for massbank format (same pattern as 'name')
    'massbank': [r'^RECORD_TITLE:(.*)$'],
}
109 | |
# Choose the metadata regexes for the declared schema. With 'auto' both
# pattern sets are combined and the record format is detected later from
# which flag key ('msp' or 'massbank') actually matched.
if args.schema == 'msp':
    meta_regex = regex_msp
elif args.schema == 'massbank':
    meta_regex = regex_massbank
elif args.schema == 'auto':
    # Build from *copied* lists: the previous implementation extended the
    # list objects shared with regex_massbank in place, silently mutating
    # that dict as well. Order is preserved: massbank patterns first,
    # then the MSP patterns appended, as before.
    meta_regex = {k: list(v) for k, v in regex_massbank.items()}
    for key in ('name', 'polarity', 'precursor_mz',
                'precursor_type', 'num_peaks'):
        meta_regex[key].extend(regex_msp[key])
    meta_regex['msp'] = list(regex_msp['msp'])
else:
    sys.exit("No schema selected")
127 | |
# Exact mass difference between the measured precursor m/z and the
# neutral molecule for each supported adduct.
adduct_types = {
    '[M+H]+': 1.007276,
    '[M+NH4]+': 18.034374,
    '[M+Na]+': 22.989218,
    '[M+K]+': 38.963158,
    '[M+CH3OH+H]+': 33.033489,
    '[M+ACN+H]+': 42.033823,
    '[M+ACN+Na]+': 64.015765,
    '[M+2ACN+H]+': 83.06037,
    '[M-H]-': -1.007276,
    '[M+Cl]-': 34.969402,
    '[M+HCOO]-': 44.99819,
    '[M-H+HCOOH]-': 44.99819,
    # same as above but different style of writing adduct
    '[M+CH3COO]-': 59.01385,
    '[M-H+CH3COOH]-': 59.01385
    # same as above but different style of writing adduct
}
# Reverse lookup: nominal (rounded integer) adduct mass -> adduct name.
# .items() replaces the Python-2-only dict.iteritems(), which raises
# AttributeError under the Python 3 path this script otherwise supports.
# NOTE(review): the formate and acetate adducts written two ways share a
# nominal mass, so one spelling overwrites the other in this mapping.
inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.items()}
147 | |
148 | |
149 # function to extract the meta data using the regular expressions | |
def parse_meta(meta_regex, meta_info=None, text=None):
    """Extract metadata key/values from one line of an MSP/MassBank file.

    Args:
        meta_regex: dict mapping a metadata key to a list of candidate
            regexes; the captured groups of any match are joined with '-'
            and stripped before being stored under that key.
        meta_info: dict updated in place (a new one is created if None).
        text: the line to parse. Defaults to the module-level ``line``
            bound by the main parsing loop, preserving the original
            implicit-global behaviour for existing callers.

    Returns:
        The (possibly newly created) meta_info dict.
    """
    if meta_info is None:
        meta_info = {}
    if text is None:
        # Backwards compatibility: fall back to the global ``line`` that
        # the original implementation read implicitly.
        text = line
    for k, regexes in meta_regex.items():
        for reg in regexes:
            m = re.search(reg, text, re.IGNORECASE)
            if m:
                meta_info[k] = '-'.join(m.groups()).strip()
    return meta_info
159 | |
160 | |
161 ###################################################################### | |
162 # Setup parameter dictionary | |
163 ###################################################################### | |
def init_paramd(args):
    """Build the base MetFrag parameter dict for one run.

    Maps CLI arguments (and, for MetChem, credentials from the module-level
    ``config`` parser) onto MetFrag CLI parameter names; also writes into
    the module-level working directory ``wd``. The keys
    'PrecursorIonModeDefault' and 'nm_mass_diff_default' are internal
    fallbacks consumed by run_metfrag and are excluded when the command
    line is assembled there.
    """
    paramd = defaultdict()

    paramd["MetFragDatabaseType"] = args.MetFragDatabaseType

    # Database-specific connection details
    if args.MetFragDatabaseType == "LocalCSV":
        paramd["LocalDatabasePath"] = args.LocalDatabasePath
    elif args.MetFragDatabaseType == "MetChem":
        # All MetChem credentials except the server IP come from config.ini
        paramd["LocalMetChemDatabase"] = \
            config.get('MetChem', 'LocalMetChemDatabase')
        paramd["LocalMetChemDatabasePortNumber"] = \
            config.get('MetChem', 'LocalMetChemDatabasePortNumber')
        paramd["LocalMetChemDatabaseServerIp"] = \
            args.LocalMetChemDatabaseServerIp
        paramd["LocalMetChemDatabaseUser"] = \
            config.get('MetChem', 'LocalMetChemDatabaseUser')
        paramd["LocalMetChemDatabasePassword"] = \
            config.get('MetChem', 'LocalMetChemDatabasePassword')

    # Mass tolerances passed straight through to MetFrag
    paramd["FragmentPeakMatchAbsoluteMassDeviation"] = \
        args.FragmentPeakMatchAbsoluteMassDeviation
    paramd["FragmentPeakMatchRelativeMassDeviation"] = \
        args.FragmentPeakMatchRelativeMassDeviation
    paramd["DatabaseSearchRelativeMassDeviation"] = \
        args.DatabaseSearchRelativeMassDeviation
    # SampleName is filled in per spectrum by run_metfrag
    paramd["SampleName"] = ''
    paramd["ResultsPath"] = os.path.join(wd)

    # Polarity defaults; a per-record adduct may override PrecursorIonMode
    # later in run_metfrag
    if args.polarity == "pos":
        paramd["IsPositiveIonMode"] = True
        paramd["PrecursorIonModeDefault"] = "1"
        paramd["PrecursorIonMode"] = "1"
        paramd["nm_mass_diff_default"] = 1.007276  # [M+H]+ mass shift
    else:
        paramd["IsPositiveIonMode"] = False
        paramd["PrecursorIonModeDefault"] = "-1"
        paramd["PrecursorIonMode"] = "-1"
        paramd["nm_mass_diff_default"] = -1.007276  # [M-H]- mass shift

    paramd["MetFragCandidateWriter"] = "CSV"
    paramd["NumberThreads"] = args.NumberThreads

    if args.ScoreSuspectLists:
        paramd["ScoreSuspectLists"] = args.ScoreSuspectLists

    paramd["MetFragScoreTypes"] = args.MetFragScoreTypes
    paramd["MetFragScoreWeights"] = args.MetFragScoreWeights

    # Candidate pre-processing filters: filter names are collected in
    # filterh, their parameters in dct_filter
    dct_filter = defaultdict()
    filterh = []

    if args.UnconnectedCompoundFilter:
        filterh.append('UnconnectedCompoundFilter')

    if args.IsotopeFilter:
        filterh.append('IsotopeFilter')

    if args.FilterMinimumElements:
        filterh.append('MinimumElementsFilter')
        dct_filter['FilterMinimumElements'] = args.FilterMinimumElements

    if args.FilterMaximumElements:
        filterh.append('MaximumElementsFilter')
        dct_filter['FilterMaximumElements'] = args.FilterMaximumElements

    if args.FilterSmartsInclusionList:
        filterh.append('SmartsSubstructureInclusionFilter')
        dct_filter[
            'FilterSmartsInclusionList'] = args.FilterSmartsInclusionList

    if args.FilterSmartsExclusionList:
        filterh.append('SmartsSubstructureExclusionFilter')
        dct_filter[
            'FilterSmartsExclusionList'] = args.FilterSmartsExclusionList

    # My understanding is that both 'ElementInclusionExclusiveFilter'
    # and 'ElementExclusionFilter' use 'FilterIncludedElements'
    if args.FilterIncludedExclusiveElements:
        filterh.append('ElementInclusionExclusiveFilter')
        dct_filter[
            'FilterIncludedElements'] = args.FilterIncludedExclusiveElements

    if args.FilterIncludedElements:
        filterh.append('ElementInclusionFilter')
        # NOTE(review): overwrites the 'FilterIncludedElements' entry set
        # by the exclusive variant above when both options are supplied
        dct_filter['FilterIncludedElements'] = args.FilterIncludedElements

    if args.FilterExcludedElements:
        filterh.append('ElementExclusionFilter')
        dct_filter['FilterExcludedElements'] = args.FilterExcludedElements

    if filterh:
        # Result looks like "Name1,Name2 key=value key=value " — the
        # trailing space is part of the original format
        fcmds = ','.join(filterh) + ' '
        for k, v in six.iteritems(dct_filter):
            fcmds += "{0}={1} ".format(str(k), str(v))

        paramd["MetFragPreProcessingCandidateFilter"] = fcmds

    return paramd
262 | |
263 | |
264 ###################################################################### | |
265 # Function to run metfrag when all metainfo and peaks have been parsed | |
266 ###################################################################### | |
def run_metfrag(meta_info, peaklist, args, wd, spectrac, adduct_types):
    # Get sample details (if possible to extract) e.g. if created as part of
    # the msPurity pipeline) choose between getting additional details to add
    # as columns as either all meta data from msp, just details from the
    # record name (i.e. when using msPurity and we have the columns coded into
    # the name) or just the spectra index (spectrac).
    # Returns (paramd, cmd) — the parameters used and the command line
    # call — or ('', '') when the record is skipped.
    # NOTE(review): also reads the module-level ``plinesread`` counter set
    # by the parsing loop (used for the minMSMSpeaks filter below).

    paramd = init_paramd(args)
    if args.meta_select_col == 'name':
        # have additional column of just the name
        paramd['additional_details'] = {'name': meta_info['name']}
    elif args.meta_select_col == 'name_split':
        # have additional columns split by "|" and
        # then on ":" e.g. MZ:100.2 | RT:20 | xcms_grp_id:1
        paramd['additional_details'] = {
            sm.split(":")[0].strip(): sm.split(":")[1].strip() for sm in
            meta_info['name'].split("|")}
    elif args.meta_select_col == 'all':
        # have additional columns based on all the meta information
        # extracted from the MSP
        paramd['additional_details'] = meta_info
    else:
        # Just have an index of the spectra in the MSP file
        paramd['additional_details'] = {'spectra_idx': spectrac}

    paramd["SampleName"] = "{}_metfrag_result".format(spectrac)

    # =============== Output peaks to txt file ==============================
    paramd["PeakListPath"] = os.path.join(wd,
                                          "{}_tmpspec.txt".format(spectrac))

    # write spec file: one "m/z<TAB>intensity" line per peak
    with open(paramd["PeakListPath"], 'w') as outfile:
        for p in peaklist:
            outfile.write(p[0] + "\t" + p[1] + "\n")

    # =============== Update param based on MSP metadata ======================
    # Replace param details with details from MSP if required
    if 'precursor_type' in meta_info and \
            meta_info['precursor_type'] in adduct_types:
        # Known adduct: derive the neutral mass from the precursor m/z
        adduct = meta_info['precursor_type']
        nm = float(meta_info['precursor_mz']) - adduct_types[
            meta_info['precursor_type']]
        paramd["PrecursorIonMode"] = \
            int(round(adduct_types[meta_info['precursor_type']], 0))
    elif not args.skip_invalid_adducts:
        # Unknown/missing adduct: fall back to the polarity default
        adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])]
        paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
        nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default']
    else:
        # --skip_invalid_adducts: silently drop this spectrum
        print('Skipping {}'.format(paramd["SampleName"]))
        return '', ''

    paramd['additional_details']['adduct'] = adduct
    paramd["NeutralPrecursorMass"] = nm

    # ============== Create CLI cmd for metfrag ===============================
    # Internal bookkeeping keys are excluded from the command line
    cmd = "metfrag"
    for k, v in six.iteritems(paramd):
        if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default',
                     'additional_details']:
            cmd += " {}={}".format(str(k), str(v))

    # ============== Run metfrag ==============================================
    # print(cmd)
    # Filter before process with a minimum number of MS/MS peaks.
    # NOTE(review): when this filter fails, cmd is still returned and the
    # caller appends it to ``cmds``, so in multiprocessing mode
    # (cores_top_level > 1) the minMSMSpeaks filter is effectively
    # bypassed — confirm whether that is intended.
    if plinesread >= float(args.minMSMSpeaks):

        if int(args.cores_top_level) == 1:
            os.system(cmd)

    return paramd, cmd
340 | |
341 | |
def work(cmds):
    """Run each shell command in *cmds* sequentially.

    Used as the multiprocessing pool worker: each process receives one
    chunk of metfrag command lines and returns their exit statuses.
    """
    statuses = []
    for command in cmds:
        statuses.append(os.system(command))
    return statuses
344 | |
345 | |
346 ###################################################################### | |
347 # Parse MSP file and run metfrag CLI | |
348 ###################################################################### | |
# keep list of commands if performing in CLI in parallel
cmds = []
# keep a dictionary of all params
paramds = {}
# keep count of spectra (for uid)
spectrac = 0
# this dictionary will store the meta data results from the MSP file
meta_info = {}

# State machine over the input file: metadata lines are parsed until the
# peak count is known, then that many peak lines are read, and the
# completed spectrum is dispatched to metfrag when the next non-peak
# line (e.g. the blank record separator) is seen.
with open(args.input_pth, "r") as infile:
    # number of lines for the peaks
    pnumlines = 0
    # number of lines read for the peaks
    plinesread = 0
    for line in infile:
        line = line.strip()

        if pnumlines == 0:
            # ============== Extract metadata from MSP ========================
            # parse_meta reads the module-level ``line`` variable
            meta_info = parse_meta(meta_regex, meta_info)

            # Start reading peaks only once the format flag ('massbank'
            # or 'msp') and the peak count have both been captured
            if ('massbank' in meta_info and 'cols' in meta_info) or (
                    'msp' in meta_info and 'num_peaks' in meta_info):
                pnumlines = int(meta_info['num_peaks'])
                plinesread = 0
                peaklist = []

        elif plinesread < pnumlines:
            # ============== Extract peaks from MSP ==========================
            # .split() will split on any empty space (i.e. tab and space)
            line = tuple(line.split())
            # Keep only m/z and intensity, not relative intensity
            save_line = tuple(line[0].split() + line[1].split())
            plinesread += 1
            peaklist.append(save_line)

        elif plinesread and plinesread == pnumlines:
            # ======= Get sample name and additional details for output =======
            # NOTE(review): the line that triggers this branch is consumed
            # without being parsed as metadata — relies on records being
            # separated by a non-metadata line (e.g. a blank line)
            spectrac += 1
            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac,
                                      adduct_types)

            if paramd:
                paramds[paramd["SampleName"]] = paramd
                cmds.append(cmd)

            # Reset state for the next record
            meta_info = {}
            pnumlines = 0
            plinesread = 0

# end of file. Check if there is a MSP spectra to run metfrag on still
if plinesread and plinesread == pnumlines:

    paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1,
                              adduct_types)

    if paramd:
        paramds[paramd["SampleName"]] = paramd
        cmds.append(cmd)
408 | |
# Perform multiprocessing on command line call level: fan the collected
# metfrag commands out over worker processes in chunks.
if int(args.cores_top_level) > 1:
    chunk_size = int(args.chunks)
    cmds_chunks = [cmds[i:i + chunk_size]
                   for i in range(0, len(cmds), chunk_size)]
    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
    pool.map(work, cmds_chunks)
    pool.close()
    pool.join()
417 | |
418 ###################################################################### | |
419 # Concatenate and filter the output | |
420 ###################################################################### | |
421 # outputs might have different headers. Need to get a list of all the | |
422 # headers before we start merging the files | |
423 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, | |
424 # "*_metfrag_result.csv"))] | |
# Collect the per-spectrum metfrag result files from the working dir
outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))

if len(outfiles) == 0:
    print('No results')
    sys.exit()

# Individual outputs may have different headers, so union the header row
# of every file before merging; also count data rows so we can bail out
# when no file contains any candidate matches.
headers = []
c = 0
for fn in outfiles:
    with open(fn, 'r') as infile:
        reader = csv.reader(infile)
        # builtin next() works on Python 2.6+ and 3.x alike, so the old
        # sys.version_info branch around reader.next() was unnecessary
        headers.extend(next(reader))
        # check if file has any data rows (at most two are counted per
        # file; c is only ever compared against zero)
        for i, row in enumerate(reader):
            c += 1
            if i == 1:
                break

# if no data rows (e.g. matches) then do not save an
# output and leave the program
if c == 0:
    print('No results')
    sys.exit()
451 | |
# Union of the 'additional detail' column names across all spectra (the
# columns can differ between spectra when meta_select_col is
# 'name_split' or 'all').
additional_detail_headers = ['sample_name']
for k, paramd in six.iteritems(paramds):
    additional_detail_headers = list(set(
        additional_detail_headers + list(paramd['additional_details'].keys())))

# add inchikey if not already present (missing in metchem output)
if 'InChIKey' not in headers:
    headers.append('InChIKey')

# NOTE(review): the set() round-trip leaves the additional-detail columns
# in arbitrary order; only the metfrag columns are sorted.
headers = additional_detail_headers + sorted(list(set(headers)))
462 | |
# Sort result files numerically by spectra index so the merged output
# follows input order (lexicographic sorting would give 1, 10, 2, ...).
# Matching on the basename also works for bare filenames and Windows
# path separators, unlike the previous pattern which required a literal
# '/' before the index; the '.' is now escaped too.
outfiles.sort(
    key=lambda s: int(re.match(r'^(\d+)_metfrag_result\.csv',
                               os.path.basename(s)).group(1)))

print(outfiles)
468 | |
# merge outputs
# NOTE(review): the result file is opened in append mode, so re-running
# with an existing result_pth appends a second header and duplicate rows
# — presumably each Galaxy job writes to a fresh path; confirm before
# reusing this script standalone.
with open(args.result_pth, 'a') as merged_outfile:
    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
                             delimiter='\t', quotechar='"')
    dwriter.writeheader()

    for fn in outfiles:

        with open(fn) as infile:
            reader = csv.DictReader(infile, delimiter=',', quotechar='"')
            for line in reader:
                # bewrite: row survives all filters below
                bewrite = True
                for key, value in line.items():
                    # Filter when no MS/MS peak matched
                    if key == "ExplPeaks":
                        if float(args.pctexplpeak_thrshld) > 0 and \
                                "NA" in value:
                            bewrite = False
                    # Filter with a score threshold
                    elif key == "Score":
                        if float(value) <= float(args.score_thrshld):
                            bewrite = False
                    elif key == "NoExplPeaks":
                        nbfindpeak = float(value)
                    elif key == "NumberPeaksUsed":
                        totpeaks = float(value)
                # Filter with a relative number of peak matched.
                # NOTE(review): nbfindpeak/totpeaks raise NameError on the
                # first row if the NoExplPeaks/NumberPeaksUsed columns are
                # missing — confirm MetFrag always emits them.
                try:
                    pctexplpeak = nbfindpeak / totpeaks * 100
                except ZeroDivisionError:
                    bewrite = False
                else:
                    if pctexplpeak < float(args.pctexplpeak_thrshld):
                        bewrite = False

                # Write the line if it passes all filters
                if bewrite:
                    # Look up the per-spectrum params by the file basename
                    # (e.g. "3_metfrag_result"), which is the SampleName
                    bfn = os.path.basename(fn)
                    bfn = bfn.replace(".csv", "")
                    line['sample_name'] = paramds[bfn]['SampleName']
                    ad = paramds[bfn]['additional_details']

                    if args.MetFragDatabaseType == "MetChem":
                        # for some reason the metchem database option does
                        # not report the full inchikey (at least in the Bham
                        # setup. This ensures we always get the fully inchikey
                        line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'],
                                                             line['InChIKey2'],
                                                             line['InChIKey3'])

                    line.update(ad)
                    dwriter.writerow(line)