Mercurial > repos > computational-metabolomics > metfrag
comparison metfrag.py @ 2:f151ee133612 draft default tip
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit b337c6296968848e3214f4b51df3d86776f84b6a"
author | computational-metabolomics |
---|---|
date | Tue, 14 Jul 2020 07:41:53 -0400 |
parents | 9ee2e2ceb2c9 |
children |
comparison
equal
deleted
inserted
replaced
1:9ee2e2ceb2c9 | 2:f151ee133612 |
---|---|
1 from __future__ import absolute_import, print_function | 1 from __future__ import absolute_import, print_function |
2 | 2 |
3 import ConfigParser | 3 try: |
4 from configparser import ConfigParser | |
5 except ImportError as e: | |
6 print(e) | |
7 from ConfigParser import ConfigParser | |
8 | |
4 import argparse | 9 import argparse |
5 import csv | 10 import csv |
6 import glob | 11 import glob |
7 import multiprocessing | 12 import multiprocessing |
8 import os | 13 import os |
12 import tempfile | 17 import tempfile |
13 from collections import defaultdict | 18 from collections import defaultdict |
14 | 19 |
15 import six | 20 import six |
16 | 21 |
17 print(sys.version) | |
18 | |
19 parser = argparse.ArgumentParser() | |
20 parser.add_argument('--input_pth') | |
21 parser.add_argument('--result_pth', default='metfrag_result.csv') | |
22 | |
23 parser.add_argument('--temp_dir') | |
24 parser.add_argument('--polarity', default='pos') | |
25 parser.add_argument('--minMSMSpeaks', default=1) | |
26 | |
27 parser.add_argument('--MetFragDatabaseType', default='PubChem') | |
28 parser.add_argument('--LocalDatabasePath', default='') | |
29 parser.add_argument('--LocalMetChemDatabaseServerIp', default='') | |
30 | |
31 parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) | |
32 parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) | |
33 parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001) | |
34 parser.add_argument('--NumberThreads', default=1) | |
35 parser.add_argument('--UnconnectedCompoundFilter', action='store_true') | |
36 parser.add_argument('--IsotopeFilter', action='store_true') | |
37 | |
38 parser.add_argument('--FilterMinimumElements', default='') | |
39 parser.add_argument('--FilterMaximumElements', default='') | |
40 parser.add_argument('--FilterSmartsInclusionList', default='') | |
41 parser.add_argument('--FilterSmartsExclusionList', default='') | |
42 parser.add_argument('--FilterIncludedElements', default='') | |
43 parser.add_argument('--FilterExcludedElements', default='') | |
44 parser.add_argument('--FilterIncludedExclusiveElements', default='') | |
45 | |
46 parser.add_argument('--score_thrshld', default=0) | |
47 parser.add_argument('--pctexplpeak_thrshld', default=0) | |
48 parser.add_argument('--schema') | |
49 parser.add_argument('--cores_top_level', default=1) | |
50 parser.add_argument('--chunks', default=1) | |
51 parser.add_argument('--meta_select_col', default='name') | |
52 parser.add_argument('--skip_invalid_adducts', action='store_true') | |
53 | |
54 parser.add_argument('--ScoreSuspectLists', default='') | |
55 parser.add_argument('--MetFragScoreTypes', | |
56 default="FragmenterScore,OfflineMetFusionScore") | |
57 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | |
58 | |
59 args = parser.parse_args() | |
60 print(args) | |
61 | |
62 config = ConfigParser.ConfigParser() | |
63 config.read( | |
64 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | |
65 | |
66 if os.stat(args.input_pth).st_size == 0: | |
67 print('Input file empty') | |
68 exit() | |
69 | |
70 # Create temporary working directory | |
71 if args.temp_dir: | |
72 wd = args.temp_dir | |
73 else: | |
74 wd = tempfile.mkdtemp() | |
75 | |
76 if os.path.exists(wd): | |
77 shutil.rmtree(wd) | |
78 os.makedirs(wd) | |
79 else: | |
80 os.makedirs(wd) | |
81 | |
82 ###################################################################### | |
83 # Setup regular expressions for MSP parsing dictionary | |
84 ###################################################################### | |
85 regex_msp = {} | |
86 regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] | |
87 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', | |
88 r'^ionization.*mode(?:=|:)(.*)$', | |
89 r'^polarity(?:=|:)(.*)$'] | |
90 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', | |
91 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] | |
92 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', | |
93 r'^adduct(?:=|:)(.*)$', | |
94 r'^ADDUCTIONNAME(?:=|:)(.*)$'] | |
95 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] | |
96 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format | |
97 | |
98 regex_massbank = {} | |
99 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] | |
100 regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] | |
101 regex_massbank['precursor_mz'] = [ | |
102 r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] | |
103 regex_massbank['precursor_type'] = [ | |
104 r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] | |
105 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] | |
106 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] | |
107 regex_massbank['massbank'] = [ | |
108 r'^RECORD_TITLE:(.*)$'] # Flag for massbank format | |
109 | |
110 if args.schema == 'msp': | |
111 meta_regex = regex_msp | |
112 elif args.schema == 'massbank': | |
113 meta_regex = regex_massbank | |
114 elif args.schema == 'auto': | |
115 # If auto we just check for all the available parameter names and then | |
116 # determine if Massbank or MSP based on the name parameter | |
117 meta_regex = {} | |
118 meta_regex.update(regex_massbank) | |
119 meta_regex['name'].extend(regex_msp['name']) | |
120 meta_regex['polarity'].extend(regex_msp['polarity']) | |
121 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) | |
122 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) | |
123 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) | |
124 meta_regex['msp'] = regex_msp['msp'] | |
125 else: | |
126 sys.exit("No schema selected") | |
127 | |
128 adduct_types = { | |
129 '[M+H]+': 1.007276, | |
130 '[M+NH4]+': 18.034374, | |
131 '[M+Na]+': 22.989218, | |
132 '[M+K]+': 38.963158, | |
133 '[M+CH3OH+H]+': 33.033489, | |
134 '[M+ACN+H]+': 42.033823, | |
135 '[M+ACN+Na]+': 64.015765, | |
136 '[M+2ACN+H]+': 83.06037, | |
137 '[M-H]-': -1.007276, | |
138 '[M+Cl]-': 34.969402, | |
139 '[M+HCOO]-': 44.99819, | |
140 '[M-H+HCOOH]-': 44.99819, | |
141 # same as above but different style of writing adduct | |
142 '[M+CH3COO]-': 59.01385, | |
143 '[M-H+CH3COOH]-': 59.01385 | |
144 # same as above but different style of writing adduct | |
145 } | |
146 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()} | |
147 | |
148 | 22 |
149 # function to extract the meta data using the regular expressions | 23 # function to extract the meta data using the regular expressions |
150 def parse_meta(meta_regex, meta_info=None): | 24 def parse_meta(meta_regex, meta_info=None): |
151 if meta_info is None: | 25 if meta_info is None: |
152 meta_info = {} | 26 meta_info = {} |
154 for reg in regexes: | 28 for reg in regexes: |
155 m = re.search(reg, line, re.IGNORECASE) | 29 m = re.search(reg, line, re.IGNORECASE) |
156 if m: | 30 if m: |
157 meta_info[k] = '-'.join(m.groups()).strip() | 31 meta_info[k] = '-'.join(m.groups()).strip() |
158 return meta_info | 32 return meta_info |
33 | |
34 | |
def get_meta_regex(schema):
    """Build the metadata-extraction regex dictionary for a spectra file schema.

    Each key (e.g. 'name', 'polarity', 'precursor_mz') maps to a list of
    regex patterns, each with one capture group for the value.  The 'msp'
    and 'massbank' keys double as format-detection flags for the parser.

    Args:
        schema: 'msp', 'massbank', or 'auto' (combine both pattern sets).

    Returns:
        dict mapping metadata field name -> list of regex pattern strings.

    Exits the process via sys.exit() when the schema is not recognised.
    """
    ######################################################################
    # Patterns for the standard MSP text format
    ######################################################################
    msp = {
        'name': [r'^Name(?:=|:)(.*)$'],
        'polarity': [r'^ion.*mode(?:=|:)(.*)$',
                     r'^ionization.*mode(?:=|:)(.*)$',
                     r'^polarity(?:=|:)(.*)$'],
        'precursor_mz': [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
                         r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'],
        'precursor_type': [r'^precursor.*type(?:=|:)(.*)$',
                           r'^adduct(?:=|:)(.*)$',
                           r'^ADDUCTIONNAME(?:=|:)(.*)$'],
        'retention_time': [r'^RETENTION.*TIME(?:=|:)\s*(.*)$',
                           r'^rt(?:=|:)\s*(.*)$',
                           r'^time(?:=|:)\s*(.*)$'],
        # From example winter_pos.mspy from kristian
        'AlignmentID': [r'^AlignmentID(?:=|:)\s*(.*)$'],
        'num_peaks': [r'^Num.*Peaks(?:=|:)\s*(\d*)$'],
        # Flag for standard MSP format
        'msp': [r'^Name(?:=|:)(.*)$'],
    }

    ######################################################################
    # Patterns for the MassBank record format
    ######################################################################
    massbank = {
        'name': [r'^RECORD_TITLE:(.*)$'],
        'polarity': [
            r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'],
        'precursor_mz': [
            r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'],
        'precursor_type': [
            r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'],
        'retention_time': [
            r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*'],
        'num_peaks': [r'^PK\$NUM_PEAK:\s+(\d*)'],
        'cols': [r'^PK\$PEAK:\s+(.*)'],
        # Flag for massbank format
        'massbank': [r'^RECORD_TITLE:(.*)$'],
    }

    if schema == 'msp':
        return msp
    if schema == 'massbank':
        return massbank
    if schema == 'auto':
        # If auto we just check for all the available parameter names and
        # then determine if MassBank or MSP based on the name parameter.
        # MassBank patterns are tried first, MSP patterns second.
        combined = dict(massbank)
        for field in ('name', 'polarity', 'precursor_mz', 'precursor_type',
                      'num_peaks', 'retention_time'):
            combined[field].extend(msp[field])
        combined['AlignmentID'] = msp['AlignmentID']
        combined['msp'] = msp['msp']
        return combined

    sys.exit("No schema selected")
159 | 97 |
160 | 98 |
161 ###################################################################### | 99 ###################################################################### |
162 # Setup parameter dictionary | 100 # Setup parameter dictionary |
163 ###################################################################### | 101 ###################################################################### |
269 # the msPurity pipeline) choose between getting additional details to add | 207 # the msPurity pipeline) choose between getting additional details to add |
270 # as columns as either all meta data from msp, just details from the | 208 # as columns as either all meta data from msp, just details from the |
271 # record name (i.e. when using msPurity and we have the columns coded into | 209 # record name (i.e. when using msPurity and we have the columns coded into |
272 # the name) or just the spectra index (spectrac)]. | 210 # the name) or just the spectra index (spectrac)]. |
273 # Returns the parameters used and the command line call | 211 # Returns the parameters used and the command line call |
212 meta_info = {k: v for k, v in meta_info.items() if k | |
213 not in ['msp', 'massbank', 'cols']} | |
274 | 214 |
275 paramd = init_paramd(args) | 215 paramd = init_paramd(args) |
276 if args.meta_select_col == 'name': | 216 if args.meta_select_col == 'name': |
277 # have additional column of just the name | 217 # have additional column of just the name |
278 paramd['additional_details'] = {'name': meta_info['name']} | 218 paramd['additional_details'] = {'name': meta_info['name']} |
285 elif args.meta_select_col == 'all': | 225 elif args.meta_select_col == 'all': |
286 # have additional columns based on all the meta information | 226 # have additional columns based on all the meta information |
287 # extracted from the MSP | 227 # extracted from the MSP |
288 paramd['additional_details'] = meta_info | 228 paramd['additional_details'] = meta_info |
289 else: | 229 else: |
290 # Just have and index of the spectra in the MSP file | 230 # Just have an index of the spectra in the MSP file |
291 paramd['additional_details'] = {'spectra_idx': spectrac} | 231 paramd['additional_details'] = {'spectra_idx': spectrac} |
292 | 232 |
293 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) | 233 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) |
294 | 234 |
295 # =============== Output peaks to txt file ============================== | 235 # =============== Output peaks to txt file ============================== |
296 paramd["PeakListPath"] = os.path.join(wd, | 236 paramd["PeakListPath"] = os.path.join(wd, |
297 "{}_tmpspec.txt".format(spectrac)) | 237 "{}_tmpspec.txt".format(spectrac)) |
298 | 238 |
299 # write spec file | 239 # write spec file |
240 | |
300 with open(paramd["PeakListPath"], 'w') as outfile: | 241 with open(paramd["PeakListPath"], 'w') as outfile: |
242 pls = '' | |
301 for p in peaklist: | 243 for p in peaklist: |
302 outfile.write(p[0] + "\t" + p[1] + "\n") | 244 outfile.write(p[0] + "\t" + p[1] + "\n") |
245 pls = pls + '{}_{};'.format(p[0], p[1]) | |
246 | |
247 if args.output_cl: | |
248 peaklist_str = pls[:-1] | |
303 | 249 |
304 # =============== Update param based on MSP metadata ====================== | 250 # =============== Update param based on MSP metadata ====================== |
305 # Replace param details with details from MSP if required | 251 # Replace param details with details from MSP if required |
306 if 'precursor_type' in meta_info and \ | 252 if 'precursor_type' in meta_info and \ |
307 meta_info['precursor_type'] in adduct_types: | 253 meta_info['precursor_type'] in adduct_types: |
309 nm = float(meta_info['precursor_mz']) - adduct_types[ | 255 nm = float(meta_info['precursor_mz']) - adduct_types[ |
310 meta_info['precursor_type']] | 256 meta_info['precursor_type']] |
311 paramd["PrecursorIonMode"] = \ | 257 paramd["PrecursorIonMode"] = \ |
312 int(round(adduct_types[meta_info['precursor_type']], 0)) | 258 int(round(adduct_types[meta_info['precursor_type']], 0)) |
313 elif not args.skip_invalid_adducts: | 259 elif not args.skip_invalid_adducts: |
260 inv_adduct_types = {int(round(v, 0)): k for k, v in | |
261 six.iteritems(adduct_types)} | |
314 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] | 262 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] |
315 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] | 263 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] |
316 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] | 264 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] |
317 else: | 265 else: |
318 print('Skipping {}'.format(paramd["SampleName"])) | 266 print('Skipping {}'.format(paramd["SampleName"])) |
319 return '', '' | 267 return '', '' |
320 | 268 |
321 paramd['additional_details']['adduct'] = adduct | 269 if not ('precursor_type' in paramd['additional_details'] or 'adduct' |
270 in paramd['additional_details']): | |
271 paramd['additional_details']['adduct'] = adduct | |
272 | |
322 paramd["NeutralPrecursorMass"] = nm | 273 paramd["NeutralPrecursorMass"] = nm |
323 | 274 |
324 # ============== Create CLI cmd for metfrag =============================== | 275 # ============== Create CLI cmd for metfrag =============================== |
325 cmd = "metfrag" | 276 cmd = "metfrag" |
326 for k, v in six.iteritems(paramd): | 277 for k, v in six.iteritems(paramd): |
327 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', | 278 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', |
328 'additional_details']: | 279 'additional_details']: |
329 cmd += " {}={}".format(str(k), str(v)) | 280 cmd += " {}={}".format(str(k), str(v)) |
330 | 281 |
282 if args.output_cl: | |
283 cli_str = '{} PeakListString={}'.format(cmd, peaklist_str) | |
284 paramd['additional_details']['MetFragCLIString'] = cli_str | |
285 | |
331 # ============== Run metfrag ============================================== | 286 # ============== Run metfrag ============================================== |
332 # print(cmd) | 287 # print(cmd) |
333 # Filter before process with a minimum number of MS/MS peaks | 288 # Filter before process with a minimum number of MS/MS peaks |
334 if plinesread >= float(args.minMSMSpeaks): | 289 if plinesread >= float(args.minMSMSpeaks): |
335 | 290 |
341 | 296 |
def work(cmds):
    """Run each shell command in *cmds* sequentially via os.system.

    Used as the multiprocessing pool worker: each pool task receives a
    chunk of metfrag CLI command strings.

    Returns:
        list of exit statuses, one per command (os.system wait statuses).
    """
    return list(map(os.system, cmds))
344 | 299 |
345 | 300 |
346 ###################################################################### | 301 if __name__ == "__main__": |
347 # Parse MSP file and run metfrag CLI | 302 print(sys.version) |
348 ###################################################################### | 303 |
349 # keep list of commands if performing in CLI in parallel | 304 parser = argparse.ArgumentParser() |
350 cmds = [] | 305 parser.add_argument('--input_pth') |
351 # keep a dictionary of all params | 306 parser.add_argument('--result_pth', default='metfrag_result.csv') |
352 paramds = {} | 307 |
353 # keep count of spectra (for uid) | 308 parser.add_argument('--temp_dir') |
354 spectrac = 0 | 309 parser.add_argument('--polarity', default='pos') |
356 # this dictionary will store the meta data results from the MSP file | 311 parser.add_argument('--minMSMSpeaks', default=1) |
356 meta_info = {} | 311 |
357 | 312 parser.add_argument('--MetFragDatabaseType', default='PubChem') |
358 with open(args.input_pth, "r") as infile: | 313 parser.add_argument('--LocalDatabasePath', default='') |
359 # number of lines for the peaks | 314 parser.add_argument('--LocalMetChemDatabaseServerIp', default='') |
360 pnumlines = 0 | 315 |
361 # number of lines read for the peaks | 316 parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) |
362 plinesread = 0 | 317 parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) |
363 for line in infile: | 318 parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', |
364 line = line.strip() | 319 default=0.001) |
365 | 320 parser.add_argument('--NumberThreads', default=1) |
366 if pnumlines == 0: | 321 parser.add_argument('--UnconnectedCompoundFilter', action='store_true') |
367 # ============== Extract metadata from MSP ======================== | 322 parser.add_argument('--IsotopeFilter', action='store_true') |
368 meta_info = parse_meta(meta_regex, meta_info) | 323 |
369 | 324 parser.add_argument('--FilterMinimumElements', default='') |
370 if ('massbank' in meta_info and 'cols' in meta_info) or ( | 325 parser.add_argument('--FilterMaximumElements', default='') |
371 'msp' in meta_info and 'num_peaks' in meta_info): | 326 parser.add_argument('--FilterSmartsInclusionList', default='') |
372 pnumlines = int(meta_info['num_peaks']) | 327 parser.add_argument('--FilterSmartsExclusionList', default='') |
328 parser.add_argument('--FilterIncludedElements', default='') | |
329 parser.add_argument('--FilterExcludedElements', default='') | |
330 parser.add_argument('--FilterIncludedExclusiveElements', default='') | |
331 | |
332 parser.add_argument('--score_thrshld', default=0) | |
333 parser.add_argument('--pctexplpeak_thrshld', default=0) | |
334 parser.add_argument('--schema') | |
335 parser.add_argument('--cores_top_level', default=1) | |
336 parser.add_argument('--chunks', default=1) | |
337 parser.add_argument('--meta_select_col', default='name') | |
338 parser.add_argument('--skip_invalid_adducts', action='store_true') | |
339 parser.add_argument('--output_cl', action='store_true') | |
340 | |
341 parser.add_argument('--ScoreSuspectLists', default='') | |
342 parser.add_argument('--MetFragScoreTypes', | |
343 default="FragmenterScore,OfflineMetFusionScore") | |
344 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | |
345 parser.add_argument('-a', '--adducts', action='append', nargs=1, | |
346 required=False, default=[], help='Adducts used') | |
347 | |
348 args = parser.parse_args() | |
349 print(args) | |
350 config = ConfigParser() | |
351 config.read( | |
352 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | |
353 | |
354 if os.stat(args.input_pth).st_size == 0: | |
355 print('Input file empty') | |
356 exit() | |
357 | |
358 # Create temporary working directory | |
359 if args.temp_dir: | |
360 wd = args.temp_dir | |
361 else: | |
362 wd = tempfile.mkdtemp() | |
363 | |
364 if os.path.exists(wd): | |
365 shutil.rmtree(wd) | |
366 os.makedirs(wd) | |
367 else: | |
368 os.makedirs(wd) | |
369 | |
370 meta_regex = get_meta_regex(args.schema) | |
371 | |
372 adduct_types = { | |
373 '[M+H]+': 1.007276, | |
374 '[M+NH4]+': 18.034374, | |
375 '[M+Na]+': 22.989218, | |
376 '[M+K]+': 38.963158, | |
377 '[M+CH3OH+H]+': 33.033489, | |
378 '[M+ACN+H]+': 42.033823, | |
379 '[M+ACN+Na]+': 64.015765, | |
380 '[M+2ACN+H]+': 83.06037, | |
381 '[M-H]-': -1.007276, | |
382 '[M+Cl]-': 34.969402, | |
383 '[M+HCOO]-': 44.99819, | |
384 '[M-H+HCOOH]-': 44.99819, | |
385 # same as above but different style of writing adduct | |
386 '[M+CH3COO]-': 59.01385, | |
387 '[M-H+CH3COOH]-': 59.01385 | |
388 # same as above but different style of writing adduct | |
389 } | |
390 | |
391 ###################################################################### | |
392 # Parse MSP file and run metfrag CLI | |
393 ###################################################################### | |
394 # keep list of commands if performing in CLI in parallel | |
395 cmds = [] | |
396 # keep a dictionary of all params | |
397 paramds = {} | |
398 # keep count of spectra (for uid) | |
399 spectrac = 0 | |
401 # this dictionary will store the meta data results from the MSP file | |
401 meta_info = {} | |
402 | |
403 if args.adducts: | |
404 adducts_from_cli = [ | |
405 a[0].replace('__ob__', '[').replace('__cb__', ']') for a in | |
406 args.adducts | |
407 ] | |
408 else: | |
409 adducts_from_cli = [] | |
410 | |
411 with open(args.input_pth, "r") as infile: | |
412 # number of lines for the peaks | |
413 pnumlines = 0 | |
414 # number of lines read for the peaks | |
415 plinesread = 0 | |
416 for line in infile: | |
417 line = line.strip() | |
418 | |
419 if pnumlines == 0: | |
420 # ============== Extract metadata from MSP ==================== | |
421 meta_info = parse_meta(meta_regex, meta_info) | |
422 | |
423 if ('massbank' in meta_info and 'cols' in meta_info) or ( | |
424 'msp' in meta_info and 'num_peaks' in meta_info): | |
425 pnumlines = int(meta_info['num_peaks']) | |
426 plinesread = 0 | |
427 peaklist = [] | |
428 | |
429 elif plinesread < pnumlines: | |
430 # ============== Extract peaks from MSP ======================= | |
431 # .split() will split on any empty space (i.e. tab and space) | |
432 line = tuple(line.split()) | |
433 # Keep only m/z and intensity, not relative intensity | |
434 save_line = tuple(line[0].split() + line[1].split()) | |
435 plinesread += 1 | |
436 peaklist.append(save_line) | |
437 | |
438 elif plinesread and plinesread == pnumlines: | |
439 # =Get sample name and additional details for output and RUN == | |
440 if adducts_from_cli: | |
441 for adduct in adducts_from_cli: | |
442 spectrac += 1 | |
443 meta_info['precursor_type'] = adduct | |
444 paramd, cmd = run_metfrag(meta_info, peaklist, args, | |
445 wd, spectrac, adduct_types) | |
446 if paramd: | |
447 paramds[paramd["SampleName"]] = paramd | |
448 cmds.append(cmd) | |
449 else: | |
450 spectrac += 1 | |
451 paramd, cmd = run_metfrag(meta_info, peaklist, args, | |
452 wd, spectrac, adduct_types) | |
453 if paramd: | |
454 paramds[paramd["SampleName"]] = paramd | |
455 cmds.append(cmd) | |
456 | |
457 meta_info = {} | |
458 pnumlines = 0 | |
373 plinesread = 0 | 459 plinesread = 0 |
374 peaklist = [] | 460 |
375 | 461 # end of file. Check if there is a MSP spectra to run metfrag on |
376 elif plinesread < pnumlines: | 462 if plinesread and plinesread == pnumlines: |
377 # ============== Extract peaks from MSP ========================== | 463 |
378 # .split() will split on any empty space (i.e. tab and space) | 464 if adducts_from_cli: |
379 line = tuple(line.split()) | 465 for adduct in adducts_from_cli: |
380 # Keep only m/z and intensity, not relative intensity | 466 spectrac += 1 |
381 save_line = tuple(line[0].split() + line[1].split()) | 467 meta_info['precursor_type'] = adduct |
382 plinesread += 1 | 468 paramd, cmd = run_metfrag(meta_info, peaklist, args, |
383 peaklist.append(save_line) | 469 wd, spectrac, adduct_types) |
384 | 470 if paramd: |
385 elif plinesread and plinesread == pnumlines: | 471 paramds[paramd["SampleName"]] = paramd |
386 # ======= Get sample name and additional details for output ======= | 472 cmds.append(cmd) |
387 spectrac += 1 | 473 else: |
388 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, | 474 spectrac += 1 |
389 adduct_types) | 475 paramd, cmd = run_metfrag(meta_info, peaklist, args, |
390 | 476 wd, spectrac, adduct_types) |
391 if paramd: | 477 if paramd: |
392 paramds[paramd["SampleName"]] = paramd | 478 paramds[paramd["SampleName"]] = paramd |
393 cmds.append(cmd) | 479 cmds.append(cmd) |
394 | 480 |
395 meta_info = {} | 481 # Perform multiprocessing on command line call level |
396 pnumlines = 0 | 482 if int(args.cores_top_level) > 1: |
397 plinesread = 0 | 483 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in |
398 | 484 list(range(0, len(cmds), int(args.chunks)))] |
399 # end of file. Check if there is a MSP spectra to run metfrag on still | 485 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) |
400 if plinesread and plinesread == pnumlines: | 486 pool.map(work, cmds_chunks) |
401 | 487 pool.close() |
402 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1, | 488 pool.join() |
403 adduct_types) | 489 |
404 | 490 ###################################################################### |
405 if paramd: | 491 # Concatenate and filter the output |
406 paramds[paramd["SampleName"]] = paramd | 492 ###################################################################### |
407 cmds.append(cmd) | 493 # outputs might have different headers. Need to get a list of all the |
408 | 494 # headers before we start merging the files |
409 # Perform multiprocessing on command line call level | 495 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, |
410 if int(args.cores_top_level) > 1: | 496 # "*_metfrag_result.csv"))] |
411 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in | 497 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) |
412 list(range(0, len(cmds), int(args.chunks)))] | 498 |
413 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) | 499 if len(outfiles) == 0: |
414 pool.map(work, cmds_chunks) | 500 print('No results') |
415 pool.close() | 501 sys.exit() |
416 pool.join() | 502 |
417 | 503 headers = [] |
418 ###################################################################### | 504 c = 0 |
419 # Concatenate and filter the output | |
420 ###################################################################### | |
421 # outputs might have different headers. Need to get a list of all the | |
422 # headers before we start merging the files | |
423 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, | |
424 # "*_metfrag_result.csv"))] | |
425 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) | |
426 | |
427 if len(outfiles) == 0: | |
428 print('No results') | |
429 sys.exit() | |
430 | |
431 headers = [] | |
432 c = 0 | |
433 for fn in outfiles: | |
434 with open(fn, 'r') as infile: | |
435 reader = csv.reader(infile) | |
436 if sys.version_info >= (3, 0): | |
437 headers.extend(next(reader)) | |
438 else: | |
439 headers.extend(reader.next()) | |
440 # check if file has any data rows | |
441 for i, row in enumerate(reader): | |
442 c += 1 | |
443 if i == 1: | |
444 break | |
445 | |
446 # if no data rows (e.g. matches) then do not save an | |
447 # output and leave the program | |
448 if c == 0: | |
449 print('No results') | |
450 sys.exit() | |
451 | |
452 additional_detail_headers = ['sample_name'] | |
453 for k, paramd in six.iteritems(paramds): | |
454 additional_detail_headers = list(set( | |
455 additional_detail_headers + list(paramd['additional_details'].keys()))) | |
456 | |
457 # add inchikey if not already present (missing in metchem output) | |
458 if 'InChIKey' not in headers: | |
459 headers.append('InChIKey') | |
460 | |
461 headers = additional_detail_headers + sorted(list(set(headers))) | |
462 | |
463 # Sort files nicely | |
464 outfiles.sort( | |
465 key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
466 | |
467 print(outfiles) | |
468 | |
469 # merge outputs | |
470 with open(args.result_pth, 'a') as merged_outfile: | |
471 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, | |
472 delimiter='\t', quotechar='"') | |
473 dwriter.writeheader() | |
474 | |
475 for fn in outfiles: | 505 for fn in outfiles: |
476 | 506 with open(fn, 'r') as infile: |
477 with open(fn) as infile: | 507 reader = csv.reader(infile) |
478 reader = csv.DictReader(infile, delimiter=',', quotechar='"') | 508 if sys.version_info >= (3, 0): |
479 for line in reader: | 509 headers.extend(next(reader)) |
480 bewrite = True | 510 else: |
481 for key, value in line.items(): | 511 headers.extend(reader.next()) |
482 # Filter when no MS/MS peak matched | 512 # check if file has any data rows |
483 if key == "ExplPeaks": | 513 for i, row in enumerate(reader): |
484 if float(args.pctexplpeak_thrshld) > 0 \ | 514 c += 1 |
485 and value and "NA" in value: | 515 if i == 1: |
516 break | |
517 | |
518 # if no data rows (e.g. matches) then do not save an | |
519 # output and leave the program | |
520 if c == 0: | |
521 print('No results') | |
522 sys.exit() | |
523 | |
524 additional_detail_headers = ['sample_name'] | |
525 for k, paramd in six.iteritems(paramds): | |
526 additional_detail_headers = list(set( | |
527 additional_detail_headers + list( | |
528 paramd['additional_details'].keys()))) | |
529 | |
530 # add inchikey if not already present (missing in metchem output) | |
531 if 'InChIKey' not in headers: | |
532 headers.append('InChIKey') | |
533 | |
534 additional_detail_headers = sorted(additional_detail_headers) | |
535 headers = additional_detail_headers + sorted(list(set(headers))) | |
536 | |
537 # Sort files nicely | |
538 outfiles.sort( | |
539 key=lambda s: int( | |
540 re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
541 | |
542 print(outfiles) | |
543 | |
544 # merge outputs | |
545 with open(args.result_pth, 'a') as merged_outfile: | |
546 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, | |
547 delimiter='\t', quotechar='"') | |
548 dwriter.writeheader() | |
549 | |
550 for fn in outfiles: | |
551 with open(fn) as infile: | |
552 reader = csv.DictReader(infile, delimiter=',', quotechar='"') | |
553 for line in reader: | |
554 bewrite = True | |
555 for key, value in line.items(): | |
556 # Filter when no MS/MS peak matched | |
557 if key == "ExplPeaks": | |
558 if float(args.pctexplpeak_thrshld) > 0 \ | |
559 and value and "NA" in value: | |
560 bewrite = False | |
561 # Filter with a score threshold | |
562 elif key == "Score": | |
563 if value and float(value) <= float( | |
564 args.score_thrshld): | |
565 bewrite = False | |
566 elif key == "NoExplPeaks": | |
567 nbfindpeak = float(value) | |
568 elif key == "NumberPeaksUsed": | |
569 totpeaks = float(value) | |
570 # Filter with a relative number of peak matched | |
571 try: | |
572 pctexplpeak = nbfindpeak / totpeaks * 100 | |
573 except ZeroDivisionError: | |
574 bewrite = False | |
575 else: | |
576 if pctexplpeak < float(args.pctexplpeak_thrshld): | |
486 bewrite = False | 577 bewrite = False |
487 # Filter with a score threshold | 578 |
488 elif key == "Score": | 579 # Write the line if it pass all filters |
489 if value and float(value) <= float(args.score_thrshld): | 580 if bewrite: |
490 bewrite = False | 581 bfn = os.path.basename(fn) |
491 elif key == "NoExplPeaks": | 582 bfn = bfn.replace(".csv", "") |
492 nbfindpeak = float(value) | 583 line['sample_name'] = paramds[bfn]['SampleName'] |
493 elif key == "NumberPeaksUsed": | 584 ad = paramds[bfn]['additional_details'] |
494 totpeaks = float(value) | 585 |
495 # Filter with a relative number of peak matched | 586 if args.MetFragDatabaseType == "MetChem": |
496 try: | 587 # for some reason the metchem database option does |
497 pctexplpeak = nbfindpeak / totpeaks * 100 | 588 # not report the full inchikey (at least in the |
498 except ZeroDivisionError: | 589 # Bham setup. This ensures we always get |
499 bewrite = False | 590 # the fully inchikey |
500 else: | 591 line['InChIKey'] = '{}-{}-{}'.format( |
501 if pctexplpeak < float(args.pctexplpeak_thrshld): | 592 line['InChIKey1'], |
502 bewrite = False | 593 line['InChIKey2'], |
503 | 594 line['InChIKey3']) |
504 # Write the line if it pass all filters | 595 |
505 if bewrite: | 596 line.update(ad) |
506 bfn = os.path.basename(fn) | 597 dwriter.writerow(line) |
507 bfn = bfn.replace(".csv", "") | |
508 line['sample_name'] = paramds[bfn]['SampleName'] | |
509 ad = paramds[bfn]['additional_details'] | |
510 | |
511 if args.MetFragDatabaseType == "MetChem": | |
512 # for some reason the metchem database option does | |
513 # not report the full inchikey (at least in the Bham | |
514 # setup. This ensures we always get the fully inchikey | |
515 line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], | |
516 line['InChIKey2'], | |
517 line['InChIKey3']) | |
518 | |
519 line.update(ad) | |
520 dwriter.writerow(line) |