comparison of metfrag.py @ 2:f151ee133612 (draft, default, tip)
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit b337c6296968848e3214f4b51df3d86776f84b6a"
| | |
|---|---|
| author | computational-metabolomics |
| date | Tue, 14 Jul 2020 07:41:53 -0400 |
| parents | 9ee2e2ceb2c9 |
| children | |
| 1:9ee2e2ceb2c9 (before) | 2:f151ee133612 (after) |
|---|---|
| 1 from __future__ import absolute_import, print_function | 1 from __future__ import absolute_import, print_function |
| 2 | 2 |
| 3 import ConfigParser | 3 try: |
| 4 from configparser import ConfigParser | |
| 5 except ImportError as e: | |
| 6 print(e) | |
| 7 from ConfigParser import ConfigParser | |
| 8 | |
| 4 import argparse | 9 import argparse |
| 5 import csv | 10 import csv |
| 6 import glob | 11 import glob |
| 7 import multiprocessing | 12 import multiprocessing |
| 8 import os | 13 import os |
| 12 import tempfile | 17 import tempfile |
| 13 from collections import defaultdict | 18 from collections import defaultdict |
| 14 | 19 |
| 15 import six | 20 import six |
| 16 | 21 |
| 17 print(sys.version) | |
| 18 | |
| 19 parser = argparse.ArgumentParser() | |
| 20 parser.add_argument('--input_pth') | |
| 21 parser.add_argument('--result_pth', default='metfrag_result.csv') | |
| 22 | |
| 23 parser.add_argument('--temp_dir') | |
| 24 parser.add_argument('--polarity', default='pos') | |
| 25 parser.add_argument('--minMSMSpeaks', default=1) | |
| 26 | |
| 27 parser.add_argument('--MetFragDatabaseType', default='PubChem') | |
| 28 parser.add_argument('--LocalDatabasePath', default='') | |
| 29 parser.add_argument('--LocalMetChemDatabaseServerIp', default='') | |
| 30 | |
| 31 parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) | |
| 32 parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) | |
| 33 parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001) | |
| 34 parser.add_argument('--NumberThreads', default=1) | |
| 35 parser.add_argument('--UnconnectedCompoundFilter', action='store_true') | |
| 36 parser.add_argument('--IsotopeFilter', action='store_true') | |
| 37 | |
| 38 parser.add_argument('--FilterMinimumElements', default='') | |
| 39 parser.add_argument('--FilterMaximumElements', default='') | |
| 40 parser.add_argument('--FilterSmartsInclusionList', default='') | |
| 41 parser.add_argument('--FilterSmartsExclusionList', default='') | |
| 42 parser.add_argument('--FilterIncludedElements', default='') | |
| 43 parser.add_argument('--FilterExcludedElements', default='') | |
| 44 parser.add_argument('--FilterIncludedExclusiveElements', default='') | |
| 45 | |
| 46 parser.add_argument('--score_thrshld', default=0) | |
| 47 parser.add_argument('--pctexplpeak_thrshld', default=0) | |
| 48 parser.add_argument('--schema') | |
| 49 parser.add_argument('--cores_top_level', default=1) | |
| 50 parser.add_argument('--chunks', default=1) | |
| 51 parser.add_argument('--meta_select_col', default='name') | |
| 52 parser.add_argument('--skip_invalid_adducts', action='store_true') | |
| 53 | |
| 54 parser.add_argument('--ScoreSuspectLists', default='') | |
| 55 parser.add_argument('--MetFragScoreTypes', | |
| 56 default="FragmenterScore,OfflineMetFusionScore") | |
| 57 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | |
| 58 | |
| 59 args = parser.parse_args() | |
| 60 print(args) | |
| 61 | |
| 62 config = ConfigParser.ConfigParser() | |
| 63 config.read( | |
| 64 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | |
| 65 | |
| 66 if os.stat(args.input_pth).st_size == 0: | |
| 67 print('Input file empty') | |
| 68 exit() | |
| 69 | |
| 70 # Create temporary working directory | |
| 71 if args.temp_dir: | |
| 72 wd = args.temp_dir | |
| 73 else: | |
| 74 wd = tempfile.mkdtemp() | |
| 75 | |
| 76 if os.path.exists(wd): | |
| 77 shutil.rmtree(wd) | |
| 78 os.makedirs(wd) | |
| 79 else: | |
| 80 os.makedirs(wd) | |
| 81 | |
| 82 ###################################################################### | |
| 83 # Setup regular expressions for MSP parsing dictionary | |
| 84 ###################################################################### | |
| 85 regex_msp = {} | |
| 86 regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] | |
| 87 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', | |
| 88 r'^ionization.*mode(?:=|:)(.*)$', | |
| 89 r'^polarity(?:=|:)(.*)$'] | |
| 90 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', | |
| 91 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] | |
| 92 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', | |
| 93 r'^adduct(?:=|:)(.*)$', | |
| 94 r'^ADDUCTIONNAME(?:=|:)(.*)$'] | |
| 95 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] | |
| 96 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format | |
| 97 | |
| 98 regex_massbank = {} | |
| 99 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] | |
| 100 regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] | |
| 101 regex_massbank['precursor_mz'] = [ | |
| 102 r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] | |
| 103 regex_massbank['precursor_type'] = [ | |
| 104 r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] | |
| 105 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] | |
| 106 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] | |
| 107 regex_massbank['massbank'] = [ | |
| 108 r'^RECORD_TITLE:(.*)$'] # Flag for massbank format | |
| 109 | |
| 110 if args.schema == 'msp': | |
| 111 meta_regex = regex_msp | |
| 112 elif args.schema == 'massbank': | |
| 113 meta_regex = regex_massbank | |
| 114 elif args.schema == 'auto': | |
| 115 # If auto we just check for all the available paramter names and then | |
| 116 # determine if Massbank or MSP based on the name parameter | |
| 117 meta_regex = {} | |
| 118 meta_regex.update(regex_massbank) | |
| 119 meta_regex['name'].extend(regex_msp['name']) | |
| 120 meta_regex['polarity'].extend(regex_msp['polarity']) | |
| 121 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) | |
| 122 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) | |
| 123 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) | |
| 124 meta_regex['msp'] = regex_msp['msp'] | |
| 125 else: | |
| 126 sys.exit("No schema selected") | |
| 127 | |
| 128 adduct_types = { | |
| 129 '[M+H]+': 1.007276, | |
| 130 '[M+NH4]+': 18.034374, | |
| 131 '[M+Na]+': 22.989218, | |
| 132 '[M+K]+': 38.963158, | |
| 133 '[M+CH3OH+H]+': 33.033489, | |
| 134 '[M+ACN+H]+': 42.033823, | |
| 135 '[M+ACN+Na]+': 64.015765, | |
| 136 '[M+2ACN+H]+': 83.06037, | |
| 137 '[M-H]-': -1.007276, | |
| 138 '[M+Cl]-': 34.969402, | |
| 139 '[M+HCOO]-': 44.99819, | |
| 140 '[M-H+HCOOH]-': 44.99819, | |
| 141 # same as above but different style of writing adduct | |
| 142 '[M+CH3COO]-': 59.01385, | |
| 143 '[M-H+CH3COOH]-': 59.01385 | |
| 144 # same as above but different style of writing adduct | |
| 145 } | |
| 146 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()} | |
| 147 | |
| 148 | 22 |
| 149 # function to extract the meta data using the regular expressions | 23 # function to extract the meta data using the regular expressions |
| 150 def parse_meta(meta_regex, meta_info=None): | 24 def parse_meta(meta_regex, meta_info=None): |
| 151 if meta_info is None: | 25 if meta_info is None: |
| 152 meta_info = {} | 26 meta_info = {} |
| 154 for reg in regexes: | 28 for reg in regexes: |
| 155 m = re.search(reg, line, re.IGNORECASE) | 29 m = re.search(reg, line, re.IGNORECASE) |
| 156 if m: | 30 if m: |
| 157 meta_info[k] = '-'.join(m.groups()).strip() | 31 meta_info[k] = '-'.join(m.groups()).strip() |
| 158 return meta_info | 32 return meta_info |
| 33 | |
| 34 | |
| 35 def get_meta_regex(schema): | |
| 36 ###################################################################### | |
| 37 # Setup regular expressions for MSP parsing dictionary | |
| 38 ###################################################################### | |
| 39 regex_msp = {} | |
| 40 regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] | |
| 41 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', | |
| 42 r'^ionization.*mode(?:=|:)(.*)$', | |
| 43 r'^polarity(?:=|:)(.*)$'] | |
| 44 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', | |
| 45 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] | |
| 46 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', | |
| 47 r'^adduct(?:=|:)(.*)$', | |
| 48 r'^ADDUCTIONNAME(?:=|:)(.*)$'] | |
| 49 | |
| 50 regex_msp['retention_time'] = [r'^RETENTION.*TIME(?:=|:)\s*(.*)$', | |
| 51 r'^rt(?:=|:)\s*(.*)$', | |
| 52 r'^time(?:=|:)\s*(.*)$'] | |
| 53 # From example winter_pos.mspy from kristian | |
| 54 regex_msp['AlignmentID'] = [r'^AlignmentID(?:=|:)\s*(.*)$'] | |
| 55 | |
| 56 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] | |
| 57 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format | |
| 58 | |
| 59 regex_massbank = {} | |
| 60 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] | |
| 61 regex_massbank['polarity'] = [ | |
| 62 r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] | |
| 63 regex_massbank['precursor_mz'] = [ | |
| 64 r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] | |
| 65 regex_massbank['precursor_type'] = [ | |
| 66 r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] | |
| 67 regex_massbank['retention_time'] = [ | |
| 68 r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*'] | |
| 69 | |
| 70 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] | |
| 71 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] | |
| 72 regex_massbank['massbank'] = [ | |
| 73 r'^RECORD_TITLE:(.*)$'] # Flag for massbank format | |
| 74 | |
| 75 if schema == 'msp': | |
| 76 meta_regex = regex_msp | |
| 77 elif schema == 'massbank': | |
| 78 meta_regex = regex_massbank | |
| 79 elif schema == 'auto': | |
| 80 # If auto we just check for all the available paramter names and then | |
| 81 # determine if Massbank or MSP based on the name parameter | |
| 82 meta_regex = {} | |
| 83 meta_regex.update(regex_massbank) | |
| 84 meta_regex['name'].extend(regex_msp['name']) | |
| 85 meta_regex['polarity'].extend(regex_msp['polarity']) | |
| 86 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) | |
| 87 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) | |
| 88 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) | |
| 89 meta_regex['retention_time'].extend(regex_msp['retention_time']) | |
| 90 meta_regex['AlignmentID'] = regex_msp['AlignmentID'] | |
| 91 meta_regex['msp'] = regex_msp['msp'] | |
| 92 | |
| 93 else: | |
| 94 sys.exit("No schema selected") | |
| 95 | |
| 96 return meta_regex | |
| 159 | 97 |
| 160 | 98 |
| 161 ###################################################################### | 99 ###################################################################### |
| 162 # Setup parameter dictionary | 100 # Setup parameter dictionary |
| 163 ###################################################################### | 101 ###################################################################### |
| 269 # the msPurity pipeline) choose between getting additional details to add | 207 # the msPurity pipeline) choose between getting additional details to add |
| 270 # as columns as either all meta data from msp, just details from the | 208 # as columns as either all meta data from msp, just details from the |
| 271 # record name (i.e. when using msPurity and we have the columns coded into | 209 # record name (i.e. when using msPurity and we have the columns coded into |
| 272 # the name) or just the spectra index (spectrac)]. | 210 # the name) or just the spectra index (spectrac)]. |
| 273 # Returns the parameters used and the command line call | 211 # Returns the parameters used and the command line call |
| 212 meta_info = {k: v for k, v in meta_info.items() if k | |
| 213 not in ['msp', 'massbank', 'cols']} | |
| 274 | 214 |
| 275 paramd = init_paramd(args) | 215 paramd = init_paramd(args) |
| 276 if args.meta_select_col == 'name': | 216 if args.meta_select_col == 'name': |
| 277 # have additional column of just the name | 217 # have additional column of just the name |
| 278 paramd['additional_details'] = {'name': meta_info['name']} | 218 paramd['additional_details'] = {'name': meta_info['name']} |
| 285 elif args.meta_select_col == 'all': | 225 elif args.meta_select_col == 'all': |
| 286 # have additional columns based on all the meta information | 226 # have additional columns based on all the meta information |
| 287 # extracted from the MSP | 227 # extracted from the MSP |
| 288 paramd['additional_details'] = meta_info | 228 paramd['additional_details'] = meta_info |
| 289 else: | 229 else: |
| 290 # Just have and index of the spectra in the MSP file | 230 # Just have an index of the spectra in the MSP file |
| 291 paramd['additional_details'] = {'spectra_idx': spectrac} | 231 paramd['additional_details'] = {'spectra_idx': spectrac} |
| 292 | 232 |
| 293 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) | 233 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) |
| 294 | 234 |
| 295 # =============== Output peaks to txt file ============================== | 235 # =============== Output peaks to txt file ============================== |
| 296 paramd["PeakListPath"] = os.path.join(wd, | 236 paramd["PeakListPath"] = os.path.join(wd, |
| 297 "{}_tmpspec.txt".format(spectrac)) | 237 "{}_tmpspec.txt".format(spectrac)) |
| 298 | 238 |
| 299 # write spec file | 239 # write spec file |
| 240 | |
| 300 with open(paramd["PeakListPath"], 'w') as outfile: | 241 with open(paramd["PeakListPath"], 'w') as outfile: |
| 242 pls = '' | |
| 301 for p in peaklist: | 243 for p in peaklist: |
| 302 outfile.write(p[0] + "\t" + p[1] + "\n") | 244 outfile.write(p[0] + "\t" + p[1] + "\n") |
| 245 pls = pls + '{}_{};'.format(p[0], p[1]) | |
| 246 | |
| 247 if args.output_cl: | |
| 248 peaklist_str = pls[:-1] | |
| 303 | 249 |
| 304 # =============== Update param based on MSP metadata ====================== | 250 # =============== Update param based on MSP metadata ====================== |
| 305 # Replace param details with details from MSP if required | 251 # Replace param details with details from MSP if required |
| 306 if 'precursor_type' in meta_info and \ | 252 if 'precursor_type' in meta_info and \ |
| 307 meta_info['precursor_type'] in adduct_types: | 253 meta_info['precursor_type'] in adduct_types: |
| 309 nm = float(meta_info['precursor_mz']) - adduct_types[ | 255 nm = float(meta_info['precursor_mz']) - adduct_types[ |
| 310 meta_info['precursor_type']] | 256 meta_info['precursor_type']] |
| 311 paramd["PrecursorIonMode"] = \ | 257 paramd["PrecursorIonMode"] = \ |
| 312 int(round(adduct_types[meta_info['precursor_type']], 0)) | 258 int(round(adduct_types[meta_info['precursor_type']], 0)) |
| 313 elif not args.skip_invalid_adducts: | 259 elif not args.skip_invalid_adducts: |
| 260 inv_adduct_types = {int(round(v, 0)): k for k, v in | |
| 261 six.iteritems(adduct_types)} | |
| 314 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] | 262 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] |
| 315 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] | 263 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] |
| 316 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] | 264 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] |
| 317 else: | 265 else: |
| 318 print('Skipping {}'.format(paramd["SampleName"])) | 266 print('Skipping {}'.format(paramd["SampleName"])) |
| 319 return '', '' | 267 return '', '' |
| 320 | 268 |
| 321 paramd['additional_details']['adduct'] = adduct | 269 if not ('precursor_type' in paramd['additional_details'] or 'adduct' |
| 270 in paramd['additional_details']): | |
| 271 paramd['additional_details']['adduct'] = adduct | |
| 272 | |
| 322 paramd["NeutralPrecursorMass"] = nm | 273 paramd["NeutralPrecursorMass"] = nm |
| 323 | 274 |
| 324 # ============== Create CLI cmd for metfrag =============================== | 275 # ============== Create CLI cmd for metfrag =============================== |
| 325 cmd = "metfrag" | 276 cmd = "metfrag" |
| 326 for k, v in six.iteritems(paramd): | 277 for k, v in six.iteritems(paramd): |
| 327 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', | 278 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', |
| 328 'additional_details']: | 279 'additional_details']: |
| 329 cmd += " {}={}".format(str(k), str(v)) | 280 cmd += " {}={}".format(str(k), str(v)) |
| 330 | 281 |
| 282 if args.output_cl: | |
| 283 cli_str = '{} PeakListString={}'.format(cmd, peaklist_str) | |
| 284 paramd['additional_details']['MetFragCLIString'] = cli_str | |
| 285 | |
| 331 # ============== Run metfrag ============================================== | 286 # ============== Run metfrag ============================================== |
| 332 # print(cmd) | 287 # print(cmd) |
| 333 # Filter before process with a minimum number of MS/MS peaks | 288 # Filter before process with a minimum number of MS/MS peaks |
| 334 if plinesread >= float(args.minMSMSpeaks): | 289 if plinesread >= float(args.minMSMSpeaks): |
| 335 | 290 |
| 341 | 296 |
| 342 def work(cmds): | 297 def work(cmds): |
| 343 return [os.system(cmd) for cmd in cmds] | 298 return [os.system(cmd) for cmd in cmds] |
| 344 | 299 |
| 345 | 300 |
| 346 ###################################################################### | 301 if __name__ == "__main__": |
| 347 # Parse MSP file and run metfrag CLI | 302 print(sys.version) |
| 348 ###################################################################### | 303 |
| 349 # keep list of commands if performing in CLI in parallel | 304 parser = argparse.ArgumentParser() |
| 350 cmds = [] | 305 parser.add_argument('--input_pth') |
| 351 # keep a dictionary of all params | 306 parser.add_argument('--result_pth', default='metfrag_result.csv') |
| 352 paramds = {} | 307 |
| 353 # keep count of spectra (for uid) | 308 parser.add_argument('--temp_dir') |
| 354 spectrac = 0 | 309 parser.add_argument('--polarity', default='pos') |
| 355 # this dictionary will store the meta data results form the MSp file | 310 parser.add_argument('--minMSMSpeaks', default=1) |
| 356 meta_info = {} | 311 |
| 357 | 312 parser.add_argument('--MetFragDatabaseType', default='PubChem') |
| 358 with open(args.input_pth, "r") as infile: | 313 parser.add_argument('--LocalDatabasePath', default='') |
| 359 # number of lines for the peaks | 314 parser.add_argument('--LocalMetChemDatabaseServerIp', default='') |
| 360 pnumlines = 0 | 315 |
| 361 # number of lines read for the peaks | 316 parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) |
| 362 plinesread = 0 | 317 parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) |
| 363 for line in infile: | 318 parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', |
| 364 line = line.strip() | 319 default=0.001) |
| 365 | 320 parser.add_argument('--NumberThreads', default=1) |
| 366 if pnumlines == 0: | 321 parser.add_argument('--UnconnectedCompoundFilter', action='store_true') |
| 367 # ============== Extract metadata from MSP ======================== | 322 parser.add_argument('--IsotopeFilter', action='store_true') |
| 368 meta_info = parse_meta(meta_regex, meta_info) | 323 |
| 369 | 324 parser.add_argument('--FilterMinimumElements', default='') |
| 370 if ('massbank' in meta_info and 'cols' in meta_info) or ( | 325 parser.add_argument('--FilterMaximumElements', default='') |
| 371 'msp' in meta_info and 'num_peaks' in meta_info): | 326 parser.add_argument('--FilterSmartsInclusionList', default='') |
| 372 pnumlines = int(meta_info['num_peaks']) | 327 parser.add_argument('--FilterSmartsExclusionList', default='') |
| 328 parser.add_argument('--FilterIncludedElements', default='') | |
| 329 parser.add_argument('--FilterExcludedElements', default='') | |
| 330 parser.add_argument('--FilterIncludedExclusiveElements', default='') | |
| 331 | |
| 332 parser.add_argument('--score_thrshld', default=0) | |
| 333 parser.add_argument('--pctexplpeak_thrshld', default=0) | |
| 334 parser.add_argument('--schema') | |
| 335 parser.add_argument('--cores_top_level', default=1) | |
| 336 parser.add_argument('--chunks', default=1) | |
| 337 parser.add_argument('--meta_select_col', default='name') | |
| 338 parser.add_argument('--skip_invalid_adducts', action='store_true') | |
| 339 parser.add_argument('--output_cl', action='store_true') | |
| 340 | |
| 341 parser.add_argument('--ScoreSuspectLists', default='') | |
| 342 parser.add_argument('--MetFragScoreTypes', | |
| 343 default="FragmenterScore,OfflineMetFusionScore") | |
| 344 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | |
| 345 parser.add_argument('-a', '--adducts', action='append', nargs=1, | |
| 346 required=False, default=[], help='Adducts used') | |
| 347 | |
| 348 args = parser.parse_args() | |
| 349 print(args) | |
| 350 config = ConfigParser() | |
| 351 config.read( | |
| 352 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | |
| 353 | |
| 354 if os.stat(args.input_pth).st_size == 0: | |
| 355 print('Input file empty') | |
| 356 exit() | |
| 357 | |
| 358 # Create temporary working directory | |
| 359 if args.temp_dir: | |
| 360 wd = args.temp_dir | |
| 361 else: | |
| 362 wd = tempfile.mkdtemp() | |
| 363 | |
| 364 if os.path.exists(wd): | |
| 365 shutil.rmtree(wd) | |
| 366 os.makedirs(wd) | |
| 367 else: | |
| 368 os.makedirs(wd) | |
| 369 | |
| 370 meta_regex = get_meta_regex(args.schema) | |
| 371 | |
| 372 adduct_types = { | |
| 373 '[M+H]+': 1.007276, | |
| 374 '[M+NH4]+': 18.034374, | |
| 375 '[M+Na]+': 22.989218, | |
| 376 '[M+K]+': 38.963158, | |
| 377 '[M+CH3OH+H]+': 33.033489, | |
| 378 '[M+ACN+H]+': 42.033823, | |
| 379 '[M+ACN+Na]+': 64.015765, | |
| 380 '[M+2ACN+H]+': 83.06037, | |
| 381 '[M-H]-': -1.007276, | |
| 382 '[M+Cl]-': 34.969402, | |
| 383 '[M+HCOO]-': 44.99819, | |
| 384 '[M-H+HCOOH]-': 44.99819, | |
| 385 # same as above but different style of writing adduct | |
| 386 '[M+CH3COO]-': 59.01385, | |
| 387 '[M-H+CH3COOH]-': 59.01385 | |
| 388 # same as above but different style of writing adduct | |
| 389 } | |
| 390 | |
| 391 ###################################################################### | |
| 392 # Parse MSP file and run metfrag CLI | |
| 393 ###################################################################### | |
| 394 # keep list of commands if performing in CLI in parallel | |
| 395 cmds = [] | |
| 396 # keep a dictionary of all params | |
| 397 paramds = {} | |
| 398 # keep count of spectra (for uid) | |
| 399 spectrac = 0 | |
| 400 # this dictionary will store the meta data results form the MSp file | |
| 401 meta_info = {} | |
| 402 | |
| 403 if args.adducts: | |
| 404 adducts_from_cli = [ | |
| 405 a[0].replace('__ob__', '[').replace('__cb__', ']') for a in | |
| 406 args.adducts | |
| 407 ] | |
| 408 else: | |
| 409 adducts_from_cli = [] | |
| 410 | |
| 411 with open(args.input_pth, "r") as infile: | |
| 412 # number of lines for the peaks | |
| 413 pnumlines = 0 | |
| 414 # number of lines read for the peaks | |
| 415 plinesread = 0 | |
| 416 for line in infile: | |
| 417 line = line.strip() | |
| 418 | |
| 419 if pnumlines == 0: | |
| 420 # ============== Extract metadata from MSP ==================== | |
| 421 meta_info = parse_meta(meta_regex, meta_info) | |
| 422 | |
| 423 if ('massbank' in meta_info and 'cols' in meta_info) or ( | |
| 424 'msp' in meta_info and 'num_peaks' in meta_info): | |
| 425 pnumlines = int(meta_info['num_peaks']) | |
| 426 plinesread = 0 | |
| 427 peaklist = [] | |
| 428 | |
| 429 elif plinesread < pnumlines: | |
| 430 # ============== Extract peaks from MSP ======================= | |
| 431 # .split() will split on any empty space (i.e. tab and space) | |
| 432 line = tuple(line.split()) | |
| 433 # Keep only m/z and intensity, not relative intensity | |
| 434 save_line = tuple(line[0].split() + line[1].split()) | |
| 435 plinesread += 1 | |
| 436 peaklist.append(save_line) | |
| 437 | |
| 438 elif plinesread and plinesread == pnumlines: | |
| 439 # =Get sample name and additional details for output and RUN == | |
| 440 if adducts_from_cli: | |
| 441 for adduct in adducts_from_cli: | |
| 442 spectrac += 1 | |
| 443 meta_info['precursor_type'] = adduct | |
| 444 paramd, cmd = run_metfrag(meta_info, peaklist, args, | |
| 445 wd, spectrac, adduct_types) | |
| 446 if paramd: | |
| 447 paramds[paramd["SampleName"]] = paramd | |
| 448 cmds.append(cmd) | |
| 449 else: | |
| 450 spectrac += 1 | |
| 451 paramd, cmd = run_metfrag(meta_info, peaklist, args, | |
| 452 wd, spectrac, adduct_types) | |
| 453 if paramd: | |
| 454 paramds[paramd["SampleName"]] = paramd | |
| 455 cmds.append(cmd) | |
| 456 | |
| 457 meta_info = {} | |
| 458 pnumlines = 0 | |
| 373 plinesread = 0 | 459 plinesread = 0 |
| 374 peaklist = [] | 460 |
| 375 | 461 # end of file. Check if there is a MSP spectra to run metfrag on |
| 376 elif plinesread < pnumlines: | 462 if plinesread and plinesread == pnumlines: |
| 377 # ============== Extract peaks from MSP ========================== | 463 |
| 378 # .split() will split on any empty space (i.e. tab and space) | 464 if adducts_from_cli: |
| 379 line = tuple(line.split()) | 465 for adduct in adducts_from_cli: |
| 380 # Keep only m/z and intensity, not relative intensity | 466 spectrac += 1 |
| 381 save_line = tuple(line[0].split() + line[1].split()) | 467 meta_info['precursor_type'] = adduct |
| 382 plinesread += 1 | 468 paramd, cmd = run_metfrag(meta_info, peaklist, args, |
| 383 peaklist.append(save_line) | 469 wd, spectrac, adduct_types) |
| 384 | 470 if paramd: |
| 385 elif plinesread and plinesread == pnumlines: | 471 paramds[paramd["SampleName"]] = paramd |
| 386 # ======= Get sample name and additional details for output ======= | 472 cmds.append(cmd) |
| 387 spectrac += 1 | 473 else: |
| 388 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, | 474 spectrac += 1 |
| 389 adduct_types) | 475 paramd, cmd = run_metfrag(meta_info, peaklist, args, |
| 390 | 476 wd, spectrac, adduct_types) |
| 391 if paramd: | 477 if paramd: |
| 392 paramds[paramd["SampleName"]] = paramd | 478 paramds[paramd["SampleName"]] = paramd |
| 393 cmds.append(cmd) | 479 cmds.append(cmd) |
| 394 | 480 |
| 395 meta_info = {} | 481 # Perform multiprocessing on command line call level |
| 396 pnumlines = 0 | 482 if int(args.cores_top_level) > 1: |
| 397 plinesread = 0 | 483 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in |
| 398 | 484 list(range(0, len(cmds), int(args.chunks)))] |
| 399 # end of file. Check if there is a MSP spectra to run metfrag on still | 485 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) |
| 400 if plinesread and plinesread == pnumlines: | 486 pool.map(work, cmds_chunks) |
| 401 | 487 pool.close() |
| 402 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1, | 488 pool.join() |
| 403 adduct_types) | 489 |
| 404 | 490 ###################################################################### |
| 405 if paramd: | 491 # Concatenate and filter the output |
| 406 paramds[paramd["SampleName"]] = paramd | 492 ###################################################################### |
| 407 cmds.append(cmd) | 493 # outputs might have different headers. Need to get a list of all the |
| 408 | 494 # headers before we start merging the files |
| 409 # Perform multiprocessing on command line call level | 495 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, |
| 410 if int(args.cores_top_level) > 1: | 496 # "*_metfrag_result.csv"))] |
| 411 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in | 497 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) |
| 412 list(range(0, len(cmds), int(args.chunks)))] | 498 |
| 413 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) | 499 if len(outfiles) == 0: |
| 414 pool.map(work, cmds_chunks) | 500 print('No results') |
| 415 pool.close() | 501 sys.exit() |
| 416 pool.join() | 502 |
| 417 | 503 headers = [] |
| 418 ###################################################################### | 504 c = 0 |
| 419 # Concatenate and filter the output | |
| 420 ###################################################################### | |
| 421 # outputs might have different headers. Need to get a list of all the | |
| 422 # headers before we start merging the files | |
| 423 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, | |
| 424 # "*_metfrag_result.csv"))] | |
| 425 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) | |
| 426 | |
| 427 if len(outfiles) == 0: | |
| 428 print('No results') | |
| 429 sys.exit() | |
| 430 | |
| 431 headers = [] | |
| 432 c = 0 | |
| 433 for fn in outfiles: | |
| 434 with open(fn, 'r') as infile: | |
| 435 reader = csv.reader(infile) | |
| 436 if sys.version_info >= (3, 0): | |
| 437 headers.extend(next(reader)) | |
| 438 else: | |
| 439 headers.extend(reader.next()) | |
| 440 # check if file has any data rows | |
| 441 for i, row in enumerate(reader): | |
| 442 c += 1 | |
| 443 if i == 1: | |
| 444 break | |
| 445 | |
| 446 # if no data rows (e.g. matches) then do not save an | |
| 447 # output and leave the program | |
| 448 if c == 0: | |
| 449 print('No results') | |
| 450 sys.exit() | |
| 451 | |
| 452 additional_detail_headers = ['sample_name'] | |
| 453 for k, paramd in six.iteritems(paramds): | |
| 454 additional_detail_headers = list(set( | |
| 455 additional_detail_headers + list(paramd['additional_details'].keys()))) | |
| 456 | |
| 457 # add inchikey if not already present (missing in metchem output) | |
| 458 if 'InChIKey' not in headers: | |
| 459 headers.append('InChIKey') | |
| 460 | |
| 461 headers = additional_detail_headers + sorted(list(set(headers))) | |
| 462 | |
| 463 # Sort files nicely | |
| 464 outfiles.sort( | |
| 465 key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
| 466 | |
| 467 print(outfiles) | |
| 468 | |
| 469 # merge outputs | |
| 470 with open(args.result_pth, 'a') as merged_outfile: | |
| 471 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, | |
| 472 delimiter='\t', quotechar='"') | |
| 473 dwriter.writeheader() | |
| 474 | |
| 475 for fn in outfiles: | 505 for fn in outfiles: |
| 476 | 506 with open(fn, 'r') as infile: |
| 477 with open(fn) as infile: | 507 reader = csv.reader(infile) |
| 478 reader = csv.DictReader(infile, delimiter=',', quotechar='"') | 508 if sys.version_info >= (3, 0): |
| 479 for line in reader: | 509 headers.extend(next(reader)) |
| 480 bewrite = True | 510 else: |
| 481 for key, value in line.items(): | 511 headers.extend(reader.next()) |
| 482 # Filter when no MS/MS peak matched | 512 # check if file has any data rows |
| 483 if key == "ExplPeaks": | 513 for i, row in enumerate(reader): |
| 484 if float(args.pctexplpeak_thrshld) > 0 \ | 514 c += 1 |
| 485 and value and "NA" in value: | 515 if i == 1: |
| 516 break | |
| 517 | |
| 518 # if no data rows (e.g. matches) then do not save an | |
| 519 # output and leave the program | |
| 520 if c == 0: | |
| 521 print('No results') | |
| 522 sys.exit() | |
| 523 | |
| 524 additional_detail_headers = ['sample_name'] | |
| 525 for k, paramd in six.iteritems(paramds): | |
| 526 additional_detail_headers = list(set( | |
| 527 additional_detail_headers + list( | |
| 528 paramd['additional_details'].keys()))) | |
| 529 | |
| 530 # add inchikey if not already present (missing in metchem output) | |
| 531 if 'InChIKey' not in headers: | |
| 532 headers.append('InChIKey') | |
| 533 | |
| 534 additional_detail_headers = sorted(additional_detail_headers) | |
| 535 headers = additional_detail_headers + sorted(list(set(headers))) | |
| 536 | |
| 537 # Sort files nicely | |
| 538 outfiles.sort( | |
| 539 key=lambda s: int( | |
| 540 re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
| 541 | |
| 542 print(outfiles) | |
| 543 | |
| 544 # merge outputs | |
| 545 with open(args.result_pth, 'a') as merged_outfile: | |
| 546 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, | |
| 547 delimiter='\t', quotechar='"') | |
| 548 dwriter.writeheader() | |
| 549 | |
| 550 for fn in outfiles: | |
| 551 with open(fn) as infile: | |
| 552 reader = csv.DictReader(infile, delimiter=',', quotechar='"') | |
| 553 for line in reader: | |
| 554 bewrite = True | |
| 555 for key, value in line.items(): | |
| 556 # Filter when no MS/MS peak matched | |
| 557 if key == "ExplPeaks": | |
| 558 if float(args.pctexplpeak_thrshld) > 0 \ | |
| 559 and value and "NA" in value: | |
| 560 bewrite = False | |
| 561 # Filter with a score threshold | |
| 562 elif key == "Score": | |
| 563 if value and float(value) <= float( | |
| 564 args.score_thrshld): | |
| 565 bewrite = False | |
| 566 elif key == "NoExplPeaks": | |
| 567 nbfindpeak = float(value) | |
| 568 elif key == "NumberPeaksUsed": | |
| 569 totpeaks = float(value) | |
| 570 # Filter with a relative number of peak matched | |
| 571 try: | |
| 572 pctexplpeak = nbfindpeak / totpeaks * 100 | |
| 573 except ZeroDivisionError: | |
| 574 bewrite = False | |
| 575 else: | |
| 576 if pctexplpeak < float(args.pctexplpeak_thrshld): | |
| 486 bewrite = False | 577 bewrite = False |
| 487 # Filter with a score threshold | 578 |
| 488 elif key == "Score": | 579 # Write the line if it pass all filters |
| 489 if value and float(value) <= float(args.score_thrshld): | 580 if bewrite: |
| 490 bewrite = False | 581 bfn = os.path.basename(fn) |
| 491 elif key == "NoExplPeaks": | 582 bfn = bfn.replace(".csv", "") |
| 492 nbfindpeak = float(value) | 583 line['sample_name'] = paramds[bfn]['SampleName'] |
| 493 elif key == "NumberPeaksUsed": | 584 ad = paramds[bfn]['additional_details'] |
| 494 totpeaks = float(value) | 585 |
| 495 # Filter with a relative number of peak matched | 586 if args.MetFragDatabaseType == "MetChem": |
| 496 try: | 587 # for some reason the metchem database option does |
| 497 pctexplpeak = nbfindpeak / totpeaks * 100 | 588 # not report the full inchikey (at least in the |
| 498 except ZeroDivisionError: | 589 # Bham setup. This ensures we always get |
| 499 bewrite = False | 590 # the fully inchikey |
| 500 else: | 591 line['InChIKey'] = '{}-{}-{}'.format( |
| 501 if pctexplpeak < float(args.pctexplpeak_thrshld): | 592 line['InChIKey1'], |
| 502 bewrite = False | 593 line['InChIKey2'], |
| 503 | 594 line['InChIKey3']) |
| 504 # Write the line if it pass all filters | 595 |
| 505 if bewrite: | 596 line.update(ad) |
| 506 bfn = os.path.basename(fn) | 597 dwriter.writerow(line) |
| 507 bfn = bfn.replace(".csv", "") | |
| 508 line['sample_name'] = paramds[bfn]['SampleName'] | |
| 509 ad = paramds[bfn]['additional_details'] | |
| 510 | |
| 511 if args.MetFragDatabaseType == "MetChem": | |
| 512 # for some reason the metchem database option does | |
| 513 # not report the full inchikey (at least in the Bham | |
| 514 # setup. This ensures we always get the fully inchikey | |
| 515 line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], | |
| 516 line['InChIKey2'], | |
| 517 line['InChIKey3']) | |
| 518 | |
| 519 line.update(ad) | |
| 520 dwriter.writerow(line) | |
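
The revision factors the MSP/MassBank pattern dictionaries into `get_meta_regex()` while keeping `parse_meta()` as the line-by-line matcher. The sketch below shows how those pattern lists behave on a small header block; the sample MSP lines and the trimmed pattern subset are illustrative, and the function is simplified here to take the current line as an argument rather than reading it from the enclosing scope.

```python
import re

# Illustrative subset of the pattern lists built by get_meta_regex();
# the real dictionaries also cover MassBank records and more fields.
meta_regex = {
    'name': [r'^Name(?:=|:)(.*)$'],
    'precursor_mz': [r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'],
    'precursor_type': [r'^precursor.*type(?:=|:)(.*)$'],
    'num_peaks': [r'^Num.*Peaks(?:=|:)\s*(\d*)$'],
}


def parse_meta(meta_regex, line, meta_info):
    # Try every pattern for every key against the current line
    # (case-insensitive) and keep the stripped capture group(s).
    for k, regexes in meta_regex.items():
        for reg in regexes:
            m = re.search(reg, line, re.IGNORECASE)
            if m:
                meta_info[k] = '-'.join(m.groups()).strip()
    return meta_info


# Hypothetical MSP header block.
header = ['Name: example spectrum', 'PRECURSORMZ: 181.0708',
          'PRECURSORTYPE: [M+H]+', 'Num Peaks: 3']
meta_info = {}
for line in header:
    parse_meta(meta_regex, line, meta_info)
print(meta_info)
# {'name': 'example spectrum', 'precursor_mz': '181.0708',
#  'precursor_type': '[M+H]+', 'num_peaks': '3'}
```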
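The `adduct_types` table feeds two derived MetFrag parameters: `NeutralPrecursorMass` is the precursor m/z minus the adduct mass, and `PrecursorIonMode` is that adduct mass rounded to the nearest integer. A worked example (the precursor m/z value is made up for illustration):

```python
# Worked example of the two derived parameters; the precursor m/z is made up.
adduct_types = {'[M+H]+': 1.007276, '[M+Na]+': 22.989218, '[M-H]-': -1.007276}

precursor_mz = 181.0708           # hypothetical [M+H]+ precursor
precursor_type = '[M+H]+'

neutral_mass = precursor_mz - adduct_types[precursor_type]
precursor_ion_mode = int(round(adduct_types[precursor_type], 0))

print(round(neutral_mass, 6))   # 180.063524 -> NeutralPrecursorMass
print(precursor_ion_mode)       # 1          -> PrecursorIonMode
```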
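The new `-a/--adducts` option arrives with Galaxy-style character sanitisation applied, where square brackets are typically encoded as `__ob__` and `__cb__`; the comprehension in the diff decodes them before the adduct lookup. A minimal illustration with hypothetical input values:

```python
# Hypothetical values as they might arrive from the Galaxy command line
# with -a '__ob__M+H__cb__+' -a '__ob__M+Na__cb__+' (nargs=1, append).
raw_adducts = [['__ob__M+H__cb__+'], ['__ob__M+Na__cb__+']]

adducts_from_cli = [a[0].replace('__ob__', '[').replace('__cb__', ']')
                    for a in raw_adducts]
print(adducts_from_cli)  # ['[M+H]+', '[M+Na]+']
```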
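When `--cores_top_level` is greater than one, the collected MetFrag command lines are split into chunks of `--chunks` commands and each chunk is handed to a worker process that runs its commands through `os.system`. A self-contained sketch of that pattern, using placeholder `echo` commands rather than real metfrag calls:

```python
import multiprocessing
import os


def work(cmds):
    # Each worker runs its chunk of shell commands sequentially.
    return [os.system(cmd) for cmd in cmds]


if __name__ == '__main__':
    # Placeholder commands; in the tool these are the assembled metfrag calls.
    cmds = ['echo spectrum_{}'.format(i) for i in range(8)]
    chunk_size = 2  # --chunks
    cmds_chunks = [cmds[x:x + chunk_size]
                   for x in range(0, len(cmds), chunk_size)]

    pool = multiprocessing.Pool(processes=4)  # --cores_top_level
    pool.map(work, cmds_chunks)
    pool.close()
    pool.join()
```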
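Because individual `*_metfrag_result.csv` files can carry different column sets, the merge step first collects the union of all headers and then writes every row through a single `csv.DictWriter`. A self-contained sketch of that merge pattern, with hypothetical file names and no result filtering (the real script also applies the score and explained-peak thresholds and exits early when no result files exist):

```python
import csv
import glob
import os

# Hypothetical results directory; in the tool this is the temp working dir.
outfiles = sorted(glob.glob(os.path.join('results', '*_metfrag_result.csv')))

# Union of headers across all per-spectrum outputs.
headers = []
for fn in outfiles:
    with open(fn, 'r') as infile:
        headers.extend(next(csv.reader(infile)))
headers = sorted(set(headers))

with open('metfrag_result.tsv', 'w') as merged:
    dwriter = csv.DictWriter(merged, fieldnames=headers,
                             delimiter='\t', quotechar='"')
    dwriter.writeheader()
    for fn in outfiles:
        with open(fn) as infile:
            for row in csv.DictReader(infile, delimiter=',', quotechar='"'):
                # Columns missing from this file are written as empty strings.
                dwriter.writerow(row)
```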
