comparison metfrag.py @ 2:f151ee133612 draft default tip

"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit b337c6296968848e3214f4b51df3d86776f84b6a"
author computational-metabolomics
date Tue, 14 Jul 2020 07:41:53 -0400
parents 9ee2e2ceb2c9
children
--- metfrag.py (parent 1:9ee2e2ceb2c9)
+++ metfrag.py (2:f151ee133612)
@@ -1 +1 @@
 from __future__ import absolute_import, print_function

-import ConfigParser
+try:
+    from configparser import ConfigParser
+except ImportError as e:
+    print(e)
+    from ConfigParser import ConfigParser
+
 import argparse
 import csv
 import glob
 import multiprocessing
 import os
@@ -12 +17 @@
 import tempfile
 from collections import defaultdict

 import six

-print(sys.version)
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--input_pth')
-parser.add_argument('--result_pth', default='metfrag_result.csv')
-
-parser.add_argument('--temp_dir')
-parser.add_argument('--polarity', default='pos')
-parser.add_argument('--minMSMSpeaks', default=1)
-
-parser.add_argument('--MetFragDatabaseType', default='PubChem')
-parser.add_argument('--LocalDatabasePath', default='')
-parser.add_argument('--LocalMetChemDatabaseServerIp', default='')
-
-parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
-parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
-parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001)
-parser.add_argument('--NumberThreads', default=1)
-parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
-parser.add_argument('--IsotopeFilter', action='store_true')
-
-parser.add_argument('--FilterMinimumElements', default='')
-parser.add_argument('--FilterMaximumElements', default='')
-parser.add_argument('--FilterSmartsInclusionList', default='')
-parser.add_argument('--FilterSmartsExclusionList', default='')
-parser.add_argument('--FilterIncludedElements', default='')
-parser.add_argument('--FilterExcludedElements', default='')
-parser.add_argument('--FilterIncludedExclusiveElements', default='')
-
-parser.add_argument('--score_thrshld', default=0)
-parser.add_argument('--pctexplpeak_thrshld', default=0)
-parser.add_argument('--schema')
-parser.add_argument('--cores_top_level', default=1)
-parser.add_argument('--chunks', default=1)
-parser.add_argument('--meta_select_col', default='name')
-parser.add_argument('--skip_invalid_adducts', action='store_true')
-
-parser.add_argument('--ScoreSuspectLists', default='')
-parser.add_argument('--MetFragScoreTypes',
-                    default="FragmenterScore,OfflineMetFusionScore")
-parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
-
-args = parser.parse_args()
-print(args)
-
-config = ConfigParser.ConfigParser()
-config.read(
-    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
-
-if os.stat(args.input_pth).st_size == 0:
-    print('Input file empty')
-    exit()
-
-# Create temporary working directory
-if args.temp_dir:
-    wd = args.temp_dir
-else:
-    wd = tempfile.mkdtemp()
-
-if os.path.exists(wd):
-    shutil.rmtree(wd)
-    os.makedirs(wd)
-else:
-    os.makedirs(wd)
-
-######################################################################
-# Setup regular expressions for MSP parsing dictionary
-######################################################################
-regex_msp = {}
-regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
-regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
-                         r'^ionization.*mode(?:=|:)(.*)$',
-                         r'^polarity(?:=|:)(.*)$']
-regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
-                             r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
-regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
-                               r'^adduct(?:=|:)(.*)$',
-                               r'^ADDUCTIONNAME(?:=|:)(.*)$']
-regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
-regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format
-
-regex_massbank = {}
-regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
-regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
-regex_massbank['precursor_mz'] = [
-    r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
-regex_massbank['precursor_type'] = [
-    r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
-regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
-regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
-regex_massbank['massbank'] = [
-    r'^RECORD_TITLE:(.*)$']  # Flag for massbank format
-
-if args.schema == 'msp':
-    meta_regex = regex_msp
-elif args.schema == 'massbank':
-    meta_regex = regex_massbank
-elif args.schema == 'auto':
-    # If auto we just check for all the available paramter names and then
-    # determine if Massbank or MSP based on the name parameter
-    meta_regex = {}
-    meta_regex.update(regex_massbank)
-    meta_regex['name'].extend(regex_msp['name'])
-    meta_regex['polarity'].extend(regex_msp['polarity'])
-    meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
-    meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
-    meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
-    meta_regex['msp'] = regex_msp['msp']
-else:
-    sys.exit("No schema selected")
-
-adduct_types = {
-    '[M+H]+': 1.007276,
-    '[M+NH4]+': 18.034374,
-    '[M+Na]+': 22.989218,
-    '[M+K]+': 38.963158,
-    '[M+CH3OH+H]+': 33.033489,
-    '[M+ACN+H]+': 42.033823,
-    '[M+ACN+Na]+': 64.015765,
-    '[M+2ACN+H]+': 83.06037,
-    '[M-H]-': -1.007276,
-    '[M+Cl]-': 34.969402,
-    '[M+HCOO]-': 44.99819,
-    '[M-H+HCOOH]-': 44.99819,
-    # same as above but different style of writing adduct
-    '[M+CH3COO]-': 59.01385,
-    '[M-H+CH3COOH]-': 59.01385
-    # same as above but different style of writing adduct
-}
-inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()}
-

 # function to extract the meta data using the regular expressions
 def parse_meta(meta_regex, meta_info=None):
     if meta_info is None:
         meta_info = {}
@@ -154 +28 @@
         for reg in regexes:
             m = re.search(reg, line, re.IGNORECASE)
             if m:
                 meta_info[k] = '-'.join(m.groups()).strip()
     return meta_info
+
+
+def get_meta_regex(schema):
+    ######################################################################
+    # Setup regular expressions for MSP parsing dictionary
+    ######################################################################
+    regex_msp = {}
+    regex_msp['name'] = [r'^Name(?:=|:)(.*)$']
+    regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$',
+                             r'^ionization.*mode(?:=|:)(.*)$',
+                             r'^polarity(?:=|:)(.*)$']
+    regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$',
+                                 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$']
+    regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$',
+                                   r'^adduct(?:=|:)(.*)$',
+                                   r'^ADDUCTIONNAME(?:=|:)(.*)$']
+
+    regex_msp['retention_time'] = [r'^RETENTION.*TIME(?:=|:)\s*(.*)$',
+                                   r'^rt(?:=|:)\s*(.*)$',
+                                   r'^time(?:=|:)\s*(.*)$']
+    # From example winter_pos.mspy from kristian
+    regex_msp['AlignmentID'] = [r'^AlignmentID(?:=|:)\s*(.*)$']
+
+    regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$']
+    regex_msp['msp'] = [r'^Name(?:=|:)(.*)$']  # Flag for standard MSP format
+
+    regex_massbank = {}
+    regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$']
+    regex_massbank['polarity'] = [
+        r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
+    regex_massbank['precursor_mz'] = [
+        r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
+    regex_massbank['precursor_type'] = [
+        r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
+    regex_massbank['retention_time'] = [
+        r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*']
+
+    regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)']
+    regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)']
+    regex_massbank['massbank'] = [
+        r'^RECORD_TITLE:(.*)$']  # Flag for massbank format
+
+    if schema == 'msp':
+        meta_regex = regex_msp
+    elif schema == 'massbank':
+        meta_regex = regex_massbank
+    elif schema == 'auto':
+        # If auto we just check for all the available paramter names and then
+        # determine if Massbank or MSP based on the name parameter
+        meta_regex = {}
+        meta_regex.update(regex_massbank)
+        meta_regex['name'].extend(regex_msp['name'])
+        meta_regex['polarity'].extend(regex_msp['polarity'])
+        meta_regex['precursor_mz'].extend(regex_msp['precursor_mz'])
+        meta_regex['precursor_type'].extend(regex_msp['precursor_type'])
+        meta_regex['num_peaks'].extend(regex_msp['num_peaks'])
+        meta_regex['retention_time'].extend(regex_msp['retention_time'])
+        meta_regex['AlignmentID'] = regex_msp['AlignmentID']
+        meta_regex['msp'] = regex_msp['msp']
+
+    else:
+        sys.exit("No schema selected")
+
+    return meta_regex


 ######################################################################
 # Setup parameter dictionary
 ######################################################################
@@ -269 +207 @@
     # the msPurity pipeline) choose between getting additional details to add
     # as columns as either all meta data from msp, just details from the
     # record name (i.e. when using msPurity and we have the columns coded into
     # the name) or just the spectra index (spectrac)].
     # Returns the parameters used and the command line call
+    meta_info = {k: v for k, v in meta_info.items() if k
+                 not in ['msp', 'massbank', 'cols']}

     paramd = init_paramd(args)
     if args.meta_select_col == 'name':
         # have additional column of just the name
         paramd['additional_details'] = {'name': meta_info['name']}
@@ -285 +225 @@
     elif args.meta_select_col == 'all':
         # have additional columns based on all the meta information
         # extracted from the MSP
         paramd['additional_details'] = meta_info
     else:
-        # Just have and index of the spectra in the MSP file
+        # Just have an index of the spectra in the MSP file
         paramd['additional_details'] = {'spectra_idx': spectrac}

     paramd["SampleName"] = "{}_metfrag_result".format(spectrac)

     # =============== Output peaks to txt file ==============================
     paramd["PeakListPath"] = os.path.join(wd,
                                           "{}_tmpspec.txt".format(spectrac))

     # write spec file
+
     with open(paramd["PeakListPath"], 'w') as outfile:
+        pls = ''
         for p in peaklist:
             outfile.write(p[0] + "\t" + p[1] + "\n")
+            pls = pls + '{}_{};'.format(p[0], p[1])
+
+    if args.output_cl:
+        peaklist_str = pls[:-1]

     # =============== Update param based on MSP metadata ======================
     # Replace param details with details from MSP if required
     if 'precursor_type' in meta_info and \
             meta_info['precursor_type'] in adduct_types:
@@ -309 +255 @@
         nm = float(meta_info['precursor_mz']) - adduct_types[
             meta_info['precursor_type']]
         paramd["PrecursorIonMode"] = \
             int(round(adduct_types[meta_info['precursor_type']], 0))
     elif not args.skip_invalid_adducts:
+        inv_adduct_types = {int(round(v, 0)): k for k, v in
+                            six.iteritems(adduct_types)}
         adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])]
         paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault']
         nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default']
     else:
         print('Skipping {}'.format(paramd["SampleName"]))
         return '', ''

-    paramd['additional_details']['adduct'] = adduct
+    if not ('precursor_type' in paramd['additional_details'] or 'adduct'
+            in paramd['additional_details']):
+        paramd['additional_details']['adduct'] = adduct
+
     paramd["NeutralPrecursorMass"] = nm

     # ============== Create CLI cmd for metfrag ===============================
     cmd = "metfrag"
     for k, v in six.iteritems(paramd):
         if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default',
                      'additional_details']:
             cmd += " {}={}".format(str(k), str(v))

+    if args.output_cl:
+        cli_str = '{} PeakListString={}'.format(cmd, peaklist_str)
+        paramd['additional_details']['MetFragCLIString'] = cli_str
+
     # ============== Run metfrag ==============================================
     # print(cmd)
     # Filter before process with a minimum number of MS/MS peaks
     if plinesread >= float(args.minMSMSpeaks):

@@ -341 +296 @@

 def work(cmds):
     return [os.system(cmd) for cmd in cmds]


-######################################################################
-# Parse MSP file and run metfrag CLI
-######################################################################
-# keep list of commands if performing in CLI in parallel
-cmds = []
-# keep a dictionary of all params
-paramds = {}
-# keep count of spectra (for uid)
-spectrac = 0
-# this dictionary will store the meta data results form the MSp file
-meta_info = {}
-
-with open(args.input_pth, "r") as infile:
-    # number of lines for the peaks
-    pnumlines = 0
-    # number of lines read for the peaks
-    plinesread = 0
-    for line in infile:
-        line = line.strip()
-
-        if pnumlines == 0:
-            # ============== Extract metadata from MSP ========================
-            meta_info = parse_meta(meta_regex, meta_info)
-
-            if ('massbank' in meta_info and 'cols' in meta_info) or (
-                    'msp' in meta_info and 'num_peaks' in meta_info):
-                pnumlines = int(meta_info['num_peaks'])
+if __name__ == "__main__":
+    print(sys.version)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_pth')
+    parser.add_argument('--result_pth', default='metfrag_result.csv')
+
+    parser.add_argument('--temp_dir')
+    parser.add_argument('--polarity', default='pos')
+    parser.add_argument('--minMSMSpeaks', default=1)
+
+    parser.add_argument('--MetFragDatabaseType', default='PubChem')
+    parser.add_argument('--LocalDatabasePath', default='')
+    parser.add_argument('--LocalMetChemDatabaseServerIp', default='')
+
+    parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5)
+    parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10)
+    parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation',
+                        default=0.001)
+    parser.add_argument('--NumberThreads', default=1)
+    parser.add_argument('--UnconnectedCompoundFilter', action='store_true')
+    parser.add_argument('--IsotopeFilter', action='store_true')
+
+    parser.add_argument('--FilterMinimumElements', default='')
+    parser.add_argument('--FilterMaximumElements', default='')
+    parser.add_argument('--FilterSmartsInclusionList', default='')
+    parser.add_argument('--FilterSmartsExclusionList', default='')
+    parser.add_argument('--FilterIncludedElements', default='')
+    parser.add_argument('--FilterExcludedElements', default='')
+    parser.add_argument('--FilterIncludedExclusiveElements', default='')
+
+    parser.add_argument('--score_thrshld', default=0)
+    parser.add_argument('--pctexplpeak_thrshld', default=0)
+    parser.add_argument('--schema')
+    parser.add_argument('--cores_top_level', default=1)
+    parser.add_argument('--chunks', default=1)
+    parser.add_argument('--meta_select_col', default='name')
+    parser.add_argument('--skip_invalid_adducts', action='store_true')
+    parser.add_argument('--output_cl', action='store_true')
+
+    parser.add_argument('--ScoreSuspectLists', default='')
+    parser.add_argument('--MetFragScoreTypes',
+                        default="FragmenterScore,OfflineMetFusionScore")
+    parser.add_argument('--MetFragScoreWeights', default="1.0,1.0")
+    parser.add_argument('-a', '--adducts', action='append', nargs=1,
+                        required=False, default=[], help='Adducts used')
+
+    args = parser.parse_args()
+    print(args)
+    config = ConfigParser()
+    config.read(
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini'))
+
+    if os.stat(args.input_pth).st_size == 0:
+        print('Input file empty')
+        exit()
+
+    # Create temporary working directory
+    if args.temp_dir:
+        wd = args.temp_dir
+    else:
+        wd = tempfile.mkdtemp()
+
+    if os.path.exists(wd):
+        shutil.rmtree(wd)
+        os.makedirs(wd)
+    else:
+        os.makedirs(wd)
+
+    meta_regex = get_meta_regex(args.schema)
+
+    adduct_types = {
+        '[M+H]+': 1.007276,
+        '[M+NH4]+': 18.034374,
+        '[M+Na]+': 22.989218,
+        '[M+K]+': 38.963158,
+        '[M+CH3OH+H]+': 33.033489,
+        '[M+ACN+H]+': 42.033823,
+        '[M+ACN+Na]+': 64.015765,
+        '[M+2ACN+H]+': 83.06037,
+        '[M-H]-': -1.007276,
+        '[M+Cl]-': 34.969402,
+        '[M+HCOO]-': 44.99819,
+        '[M-H+HCOOH]-': 44.99819,
+        # same as above but different style of writing adduct
+        '[M+CH3COO]-': 59.01385,
+        '[M-H+CH3COOH]-': 59.01385
+        # same as above but different style of writing adduct
+    }
+
+    ######################################################################
+    # Parse MSP file and run metfrag CLI
+    ######################################################################
+    # keep list of commands if performing in CLI in parallel
+    cmds = []
+    # keep a dictionary of all params
+    paramds = {}
+    # keep count of spectra (for uid)
+    spectrac = 0
+    # this dictionary will store the meta data results form the MSp file
+    meta_info = {}
+
+    if args.adducts:
+        adducts_from_cli = [
+            a[0].replace('__ob__', '[').replace('__cb__', ']') for a in
+            args.adducts
+        ]
+    else:
+        adducts_from_cli = []
+
+    with open(args.input_pth, "r") as infile:
+        # number of lines for the peaks
+        pnumlines = 0
+        # number of lines read for the peaks
+        plinesread = 0
+        for line in infile:
+            line = line.strip()
+
+            if pnumlines == 0:
+                # ============== Extract metadata from MSP ====================
+                meta_info = parse_meta(meta_regex, meta_info)
+
+                if ('massbank' in meta_info and 'cols' in meta_info) or (
+                        'msp' in meta_info and 'num_peaks' in meta_info):
+                    pnumlines = int(meta_info['num_peaks'])
+                    plinesread = 0
+                    peaklist = []
+
+            elif plinesread < pnumlines:
+                # ============== Extract peaks from MSP =======================
+                # .split() will split on any empty space (i.e. tab and space)
+                line = tuple(line.split())
+                # Keep only m/z and intensity, not relative intensity
+                save_line = tuple(line[0].split() + line[1].split())
+                plinesread += 1
+                peaklist.append(save_line)
+
+            elif plinesread and plinesread == pnumlines:
+                # =Get sample name and additional details for output and RUN ==
+                if adducts_from_cli:
+                    for adduct in adducts_from_cli:
+                        spectrac += 1
+                        meta_info['precursor_type'] = adduct
+                        paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                                  wd, spectrac, adduct_types)
+                        if paramd:
+                            paramds[paramd["SampleName"]] = paramd
+                            cmds.append(cmd)
+                else:
+                    spectrac += 1
+                    paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                              wd, spectrac, adduct_types)
+                    if paramd:
+                        paramds[paramd["SampleName"]] = paramd
+                        cmds.append(cmd)
+
+                meta_info = {}
+                pnumlines = 0
-                plinesread = 0
-                peaklist = []
-
-        elif plinesread < pnumlines:
-            # ============== Extract peaks from MSP ==========================
-            # .split() will split on any empty space (i.e. tab and space)
-            line = tuple(line.split())
-            # Keep only m/z and intensity, not relative intensity
-            save_line = tuple(line[0].split() + line[1].split())
-            plinesread += 1
-            peaklist.append(save_line)
-
-        elif plinesread and plinesread == pnumlines:
-            # ======= Get sample name and additional details for output =======
-            spectrac += 1
-            paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac,
-                                      adduct_types)
-
-            if paramd:
-                paramds[paramd["SampleName"]] = paramd
-                cmds.append(cmd)
-
-            meta_info = {}
-            pnumlines = 0
-            plinesread = 0
-
-# end of file. Check if there is a MSP spectra to run metfrag on still
-if plinesread and plinesread == pnumlines:
-
-    paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1,
-                              adduct_types)
-
-    if paramd:
-        paramds[paramd["SampleName"]] = paramd
-        cmds.append(cmd)
-
-# Perform multiprocessing on command line call level
-if int(args.cores_top_level) > 1:
-    cmds_chunks = [cmds[x:x + int(args.chunks)] for x in
-                   list(range(0, len(cmds), int(args.chunks)))]
-    pool = multiprocessing.Pool(processes=int(args.cores_top_level))
-    pool.map(work, cmds_chunks)
-    pool.close()
-    pool.join()
-
-######################################################################
+                plinesread = 0
+
+        # end of file. Check if there is a MSP spectra to run metfrag on
+        if plinesread and plinesread == pnumlines:
+
+            if adducts_from_cli:
+                for adduct in adducts_from_cli:
+                    spectrac += 1
+                    meta_info['precursor_type'] = adduct
+                    paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                              wd, spectrac, adduct_types)
+                    if paramd:
+                        paramds[paramd["SampleName"]] = paramd
+                        cmds.append(cmd)
+            else:
+                spectrac += 1
+                paramd, cmd = run_metfrag(meta_info, peaklist, args,
+                                          wd, spectrac, adduct_types)
+                if paramd:
+                    paramds[paramd["SampleName"]] = paramd
+                    cmds.append(cmd)
+
+    # Perform multiprocessing on command line call level
+    if int(args.cores_top_level) > 1:
+        cmds_chunks = [cmds[x:x + int(args.chunks)] for x in
+                       list(range(0, len(cmds), int(args.chunks)))]
+        pool = multiprocessing.Pool(processes=int(args.cores_top_level))
+        pool.map(work, cmds_chunks)
+        pool.close()
+        pool.join()
+
+    ######################################################################
+    # Concatenate and filter the output
+    ######################################################################
+    # outputs might have different headers. Need to get a list of all the
+    # headers before we start merging the files
+    # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd,
+    #             "*_metfrag_result.csv"))]
+    outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
+
+    if len(outfiles) == 0:
+        print('No results')
+        sys.exit()
+
+    headers = []
+    c = 0
-# Concatenate and filter the output
-######################################################################
-# outputs might have different headers. Need to get a list of all the
-# headers before we start merging the files
-# outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd,
-#             "*_metfrag_result.csv"))]
-outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv"))
-
-if len(outfiles) == 0:
-    print('No results')
-    sys.exit()
-
-headers = []
-c = 0
-for fn in outfiles:
-    with open(fn, 'r') as infile:
-        reader = csv.reader(infile)
-        if sys.version_info >= (3, 0):
-            headers.extend(next(reader))
-        else:
-            headers.extend(reader.next())
-        # check if file has any data rows
-        for i, row in enumerate(reader):
-            c += 1
-            if i == 1:
-                break
-
-# if no data rows (e.g. matches) then do not save an
-# output and leave the program
-if c == 0:
-    print('No results')
-    sys.exit()
-
-additional_detail_headers = ['sample_name']
-for k, paramd in six.iteritems(paramds):
-    additional_detail_headers = list(set(
-        additional_detail_headers + list(paramd['additional_details'].keys())))
-
-# add inchikey if not already present (missing in metchem output)
-if 'InChIKey' not in headers:
-    headers.append('InChIKey')
-
-headers = additional_detail_headers + sorted(list(set(headers)))
-
-# Sort files nicely
-outfiles.sort(
-    key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1)))
-
-print(outfiles)
-
-# merge outputs
-with open(args.result_pth, 'a') as merged_outfile:
-    dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
-                             delimiter='\t', quotechar='"')
-    dwriter.writeheader()
-
-    for fn in outfiles:
-
-        with open(fn) as infile:
-            reader = csv.DictReader(infile, delimiter=',', quotechar='"')
-            for line in reader:
-                bewrite = True
-                for key, value in line.items():
-                    # Filter when no MS/MS peak matched
-                    if key == "ExplPeaks":
-                        if float(args.pctexplpeak_thrshld) > 0 \
-                                and value and "NA" in value:
-                            bewrite = False
-                    # Filter with a score threshold
-                    elif key == "Score":
-                        if value and float(value) <= float(args.score_thrshld):
-                            bewrite = False
-                    elif key == "NoExplPeaks":
-                        nbfindpeak = float(value)
-                    elif key == "NumberPeaksUsed":
-                        totpeaks = float(value)
-                # Filter with a relative number of peak matched
-                try:
-                    pctexplpeak = nbfindpeak / totpeaks * 100
-                except ZeroDivisionError:
-                    bewrite = False
-                else:
-                    if pctexplpeak < float(args.pctexplpeak_thrshld):
-                        bewrite = False
-
-                # Write the line if it pass all filters
-                if bewrite:
-                    bfn = os.path.basename(fn)
-                    bfn = bfn.replace(".csv", "")
-                    line['sample_name'] = paramds[bfn]['SampleName']
-                    ad = paramds[bfn]['additional_details']
-
-                    if args.MetFragDatabaseType == "MetChem":
-                        # for some reason the metchem database option does
-                        # not report the full inchikey (at least in the Bham
-                        # setup. This ensures we always get the fully inchikey
-                        line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'],
-                                                             line['InChIKey2'],
-                                                             line['InChIKey3'])
-
-                    line.update(ad)
-                    dwriter.writerow(line)
+    for fn in outfiles:
+        with open(fn, 'r') as infile:
+            reader = csv.reader(infile)
+            if sys.version_info >= (3, 0):
+                headers.extend(next(reader))
+            else:
+                headers.extend(reader.next())
+            # check if file has any data rows
+            for i, row in enumerate(reader):
+                c += 1
+                if i == 1:
+                    break
+
+    # if no data rows (e.g. matches) then do not save an
+    # output and leave the program
+    if c == 0:
+        print('No results')
+        sys.exit()
+
+    additional_detail_headers = ['sample_name']
+    for k, paramd in six.iteritems(paramds):
+        additional_detail_headers = list(set(
+            additional_detail_headers + list(
+                paramd['additional_details'].keys())))
+
+    # add inchikey if not already present (missing in metchem output)
+    if 'InChIKey' not in headers:
+        headers.append('InChIKey')
+
+    additional_detail_headers = sorted(additional_detail_headers)
+    headers = additional_detail_headers + sorted(list(set(headers)))
+
+    # Sort files nicely
+    outfiles.sort(
+        key=lambda s: int(
+            re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1)))
+
+    print(outfiles)
+
+    # merge outputs
+    with open(args.result_pth, 'a') as merged_outfile:
+        dwriter = csv.DictWriter(merged_outfile, fieldnames=headers,
+                                 delimiter='\t', quotechar='"')
+        dwriter.writeheader()
+
+        for fn in outfiles:
+            with open(fn) as infile:
+                reader = csv.DictReader(infile, delimiter=',', quotechar='"')
+                for line in reader:
+                    bewrite = True
+                    for key, value in line.items():
+                        # Filter when no MS/MS peak matched
+                        if key == "ExplPeaks":
+                            if float(args.pctexplpeak_thrshld) > 0 \
+                                    and value and "NA" in value:
+                                bewrite = False
+                        # Filter with a score threshold
+                        elif key == "Score":
+                            if value and float(value) <= float(
+                                    args.score_thrshld):
+                                bewrite = False
+                        elif key == "NoExplPeaks":
+                            nbfindpeak = float(value)
+                        elif key == "NumberPeaksUsed":
+                            totpeaks = float(value)
+                    # Filter with a relative number of peak matched
+                    try:
+                        pctexplpeak = nbfindpeak / totpeaks * 100
+                    except ZeroDivisionError:
+                        bewrite = False
+                    else:
+                        if pctexplpeak < float(args.pctexplpeak_thrshld):
+                            bewrite = False
+
+                    # Write the line if it pass all filters
+                    if bewrite:
+                        bfn = os.path.basename(fn)
+                        bfn = bfn.replace(".csv", "")
+                        line['sample_name'] = paramds[bfn]['SampleName']
+                        ad = paramds[bfn]['additional_details']
+
+                        if args.MetFragDatabaseType == "MetChem":
+                            # for some reason the metchem database option does
+                            # not report the full inchikey (at least in the
+                            # Bham setup. This ensures we always get
+                            # the fully inchikey
+                            line['InChIKey'] = '{}-{}-{}'.format(
+                                line['InChIKey1'],
+                                line['InChIKey2'],
+                                line['InChIKey3'])
+
+                        line.update(ad)
+                        dwriter.writerow(line)
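
Usage note: the new `if __name__ == "__main__":` guard makes the script importable, so it can also be driven from another Python process or a test harness. Below is a minimal invocation sketch, not taken from the changeset: the input file, result path and adduct choice are placeholders, the MetFrag CLI and the script's config.ini are assumed to be available, and only flags defined in the argparse block above are used, including the __ob__/__cb__ bracket encoding that --adducts expects.

import subprocess

# Hypothetical smoke test -- file names and the adduct are placeholders;
# every flag below exists in the script's argparse definitions.
cmd = [
    "python", "metfrag.py",
    "--input_pth", "example.msp",        # placeholder MSP/MassBank file
    "--schema", "msp",
    "--result_pth", "metfrag_result.csv",
    "--adducts", "__ob__M+H__cb__+",     # decoded by the script to [M+H]+
    "--cores_top_level", "1",
]
subprocess.run(cmd, check=True)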