comparison of metfrag.py @ 2:f151ee133612 (draft, default, tip)
"planemo upload for repository https://github.com/computational-metabolomics/metfrag-galaxy commit b337c6296968848e3214f4b51df3d86776f84b6a"
| | |
|---|---|
| author | computational-metabolomics |
| date | Tue, 14 Jul 2020 07:41:53 -0400 |
| parents | 9ee2e2ceb2c9 |
| children | |
| 1:9ee2e2ceb2c9 (before) | 2:f151ee133612 (after) |
|---|---|
| 1 from __future__ import absolute_import, print_function | 1 from __future__ import absolute_import, print_function |
| 2 | 2 |
| 3 import ConfigParser | 3 try: |
| 4 from configparser import ConfigParser | |
| 5 except ImportError as e: | |
| 6 print(e) | |
| 7 from ConfigParser import ConfigParser | |
| 8 | |
| 4 import argparse | 9 import argparse |
| 5 import csv | 10 import csv |
| 6 import glob | 11 import glob |
| 7 import multiprocessing | 12 import multiprocessing |
| 8 import os | 13 import os |
| 12 import tempfile | 17 import tempfile |
| 13 from collections import defaultdict | 18 from collections import defaultdict |
| 14 | 19 |
| 15 import six | 20 import six |
| 16 | 21 |
| 17 print(sys.version) | |
| 18 | |
| 19 parser = argparse.ArgumentParser() | |
| 20 parser.add_argument('--input_pth') | |
| 21 parser.add_argument('--result_pth', default='metfrag_result.csv') | |
| 22 | |
| 23 parser.add_argument('--temp_dir') | |
| 24 parser.add_argument('--polarity', default='pos') | |
| 25 parser.add_argument('--minMSMSpeaks', default=1) | |
| 26 | |
| 27 parser.add_argument('--MetFragDatabaseType', default='PubChem') | |
| 28 parser.add_argument('--LocalDatabasePath', default='') | |
| 29 parser.add_argument('--LocalMetChemDatabaseServerIp', default='') | |
| 30 | |
| 31 parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) | |
| 32 parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) | |
| 33 parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', default=0.001) | |
| 34 parser.add_argument('--NumberThreads', default=1) | |
| 35 parser.add_argument('--UnconnectedCompoundFilter', action='store_true') | |
| 36 parser.add_argument('--IsotopeFilter', action='store_true') | |
| 37 | |
| 38 parser.add_argument('--FilterMinimumElements', default='') | |
| 39 parser.add_argument('--FilterMaximumElements', default='') | |
| 40 parser.add_argument('--FilterSmartsInclusionList', default='') | |
| 41 parser.add_argument('--FilterSmartsExclusionList', default='') | |
| 42 parser.add_argument('--FilterIncludedElements', default='') | |
| 43 parser.add_argument('--FilterExcludedElements', default='') | |
| 44 parser.add_argument('--FilterIncludedExclusiveElements', default='') | |
| 45 | |
| 46 parser.add_argument('--score_thrshld', default=0) | |
| 47 parser.add_argument('--pctexplpeak_thrshld', default=0) | |
| 48 parser.add_argument('--schema') | |
| 49 parser.add_argument('--cores_top_level', default=1) | |
| 50 parser.add_argument('--chunks', default=1) | |
| 51 parser.add_argument('--meta_select_col', default='name') | |
| 52 parser.add_argument('--skip_invalid_adducts', action='store_true') | |
| 53 | |
| 54 parser.add_argument('--ScoreSuspectLists', default='') | |
| 55 parser.add_argument('--MetFragScoreTypes', | |
| 56 default="FragmenterScore,OfflineMetFusionScore") | |
| 57 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | |
| 58 | |
| 59 args = parser.parse_args() | |
| 60 print(args) | |
| 61 | |
| 62 config = ConfigParser.ConfigParser() | |
| 63 config.read( | |
| 64 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | |
| 65 | |
| 66 if os.stat(args.input_pth).st_size == 0: | |
| 67 print('Input file empty') | |
| 68 exit() | |
| 69 | |
| 70 # Create temporary working directory | |
| 71 if args.temp_dir: | |
| 72 wd = args.temp_dir | |
| 73 else: | |
| 74 wd = tempfile.mkdtemp() | |
| 75 | |
| 76 if os.path.exists(wd): | |
| 77 shutil.rmtree(wd) | |
| 78 os.makedirs(wd) | |
| 79 else: | |
| 80 os.makedirs(wd) | |
| 81 | |
| 82 ###################################################################### | |
| 83 # Setup regular expressions for MSP parsing dictionary | |
| 84 ###################################################################### | |
| 85 regex_msp = {} | |
| 86 regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] | |
| 87 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', | |
| 88 r'^ionization.*mode(?:=|:)(.*)$', | |
| 89 r'^polarity(?:=|:)(.*)$'] | |
| 90 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', | |
| 91 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] | |
| 92 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', | |
| 93 r'^adduct(?:=|:)(.*)$', | |
| 94 r'^ADDUCTIONNAME(?:=|:)(.*)$'] | |
| 95 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] | |
| 96 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format | |
| 97 | |
| 98 regex_massbank = {} | |
| 99 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] | |
| 100 regex_massbank['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] | |
| 101 regex_massbank['precursor_mz'] = [ | |
| 102 r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] | |
| 103 regex_massbank['precursor_type'] = [ | |
| 104 r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] | |
| 105 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] | |
| 106 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] | |
| 107 regex_massbank['massbank'] = [ | |
| 108 r'^RECORD_TITLE:(.*)$'] # Flag for massbank format | |
| 109 | |
| 110 if args.schema == 'msp': | |
| 111 meta_regex = regex_msp | |
| 112 elif args.schema == 'massbank': | |
| 113 meta_regex = regex_massbank | |
| 114 elif args.schema == 'auto': | |
| 115 # If auto we just check for all the available paramter names and then | |
| 116 # determine if Massbank or MSP based on the name parameter | |
| 117 meta_regex = {} | |
| 118 meta_regex.update(regex_massbank) | |
| 119 meta_regex['name'].extend(regex_msp['name']) | |
| 120 meta_regex['polarity'].extend(regex_msp['polarity']) | |
| 121 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) | |
| 122 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) | |
| 123 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) | |
| 124 meta_regex['msp'] = regex_msp['msp'] | |
| 125 else: | |
| 126 sys.exit("No schema selected") | |
| 127 | |
| 128 adduct_types = { | |
| 129 '[M+H]+': 1.007276, | |
| 130 '[M+NH4]+': 18.034374, | |
| 131 '[M+Na]+': 22.989218, | |
| 132 '[M+K]+': 38.963158, | |
| 133 '[M+CH3OH+H]+': 33.033489, | |
| 134 '[M+ACN+H]+': 42.033823, | |
| 135 '[M+ACN+Na]+': 64.015765, | |
| 136 '[M+2ACN+H]+': 83.06037, | |
| 137 '[M-H]-': -1.007276, | |
| 138 '[M+Cl]-': 34.969402, | |
| 139 '[M+HCOO]-': 44.99819, | |
| 140 '[M-H+HCOOH]-': 44.99819, | |
| 141 # same as above but different style of writing adduct | |
| 142 '[M+CH3COO]-': 59.01385, | |
| 143 '[M-H+CH3COOH]-': 59.01385 | |
| 144 # same as above but different style of writing adduct | |
| 145 } | |
| 146 inv_adduct_types = {int(round(v, 0)): k for k, v in adduct_types.iteritems()} | |
| 147 | |
| 148 | 22 |
| 149 # function to extract the meta data using the regular expressions | 23 # function to extract the meta data using the regular expressions |
| 150 def parse_meta(meta_regex, meta_info=None): | 24 def parse_meta(meta_regex, meta_info=None): |
| 151 if meta_info is None: | 25 if meta_info is None: |
| 152 meta_info = {} | 26 meta_info = {} |
| 154 for reg in regexes: | 28 for reg in regexes: |
| 155 m = re.search(reg, line, re.IGNORECASE) | 29 m = re.search(reg, line, re.IGNORECASE) |
| 156 if m: | 30 if m: |
| 157 meta_info[k] = '-'.join(m.groups()).strip() | 31 meta_info[k] = '-'.join(m.groups()).strip() |
| 158 return meta_info | 32 return meta_info |
| 33 | |
| 34 | |
| 35 def get_meta_regex(schema): | |
| 36 ###################################################################### | |
| 37 # Setup regular expressions for MSP parsing dictionary | |
| 38 ###################################################################### | |
| 39 regex_msp = {} | |
| 40 regex_msp['name'] = [r'^Name(?:=|:)(.*)$'] | |
| 41 regex_msp['polarity'] = [r'^ion.*mode(?:=|:)(.*)$', | |
| 42 r'^ionization.*mode(?:=|:)(.*)$', | |
| 43 r'^polarity(?:=|:)(.*)$'] | |
| 44 regex_msp['precursor_mz'] = [r'^precursor.*m/z(?:=|:)\s*(\d*[.,]?\d*)$', | |
| 45 r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'] | |
| 46 regex_msp['precursor_type'] = [r'^precursor.*type(?:=|:)(.*)$', | |
| 47 r'^adduct(?:=|:)(.*)$', | |
| 48 r'^ADDUCTIONNAME(?:=|:)(.*)$'] | |
| 49 | |
| 50 regex_msp['retention_time'] = [r'^RETENTION.*TIME(?:=|:)\s*(.*)$', | |
| 51 r'^rt(?:=|:)\s*(.*)$', | |
| 52 r'^time(?:=|:)\s*(.*)$'] | |
| 53 # From example winter_pos.mspy from kristian | |
| 54 regex_msp['AlignmentID'] = [r'^AlignmentID(?:=|:)\s*(.*)$'] | |
| 55 | |
| 56 regex_msp['num_peaks'] = [r'^Num.*Peaks(?:=|:)\s*(\d*)$'] | |
| 57 regex_msp['msp'] = [r'^Name(?:=|:)(.*)$'] # Flag for standard MSP format | |
| 58 | |
| 59 regex_massbank = {} | |
| 60 regex_massbank['name'] = [r'^RECORD_TITLE:(.*)$'] | |
| 61 regex_massbank['polarity'] = [ | |
| 62 r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$'] | |
| 63 regex_massbank['precursor_mz'] = [ | |
| 64 r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$'] | |
| 65 regex_massbank['precursor_type'] = [ | |
| 66 r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$'] | |
| 67 regex_massbank['retention_time'] = [ | |
| 68 r'^AC\$CHROMATOGRAPHY:\s+RETENTION_TIME\s*(\d*\.?\d+).*'] | |
| 69 | |
| 70 regex_massbank['num_peaks'] = [r'^PK\$NUM_PEAK:\s+(\d*)'] | |
| 71 regex_massbank['cols'] = [r'^PK\$PEAK:\s+(.*)'] | |
| 72 regex_massbank['massbank'] = [ | |
| 73 r'^RECORD_TITLE:(.*)$'] # Flag for massbank format | |
| 74 | |
| 75 if schema == 'msp': | |
| 76 meta_regex = regex_msp | |
| 77 elif schema == 'massbank': | |
| 78 meta_regex = regex_massbank | |
| 79 elif schema == 'auto': | |
| 80 # If auto we just check for all the available paramter names and then | |
| 81 # determine if Massbank or MSP based on the name parameter | |
| 82 meta_regex = {} | |
| 83 meta_regex.update(regex_massbank) | |
| 84 meta_regex['name'].extend(regex_msp['name']) | |
| 85 meta_regex['polarity'].extend(regex_msp['polarity']) | |
| 86 meta_regex['precursor_mz'].extend(regex_msp['precursor_mz']) | |
| 87 meta_regex['precursor_type'].extend(regex_msp['precursor_type']) | |
| 88 meta_regex['num_peaks'].extend(regex_msp['num_peaks']) | |
| 89 meta_regex['retention_time'].extend(regex_msp['retention_time']) | |
| 90 meta_regex['AlignmentID'] = regex_msp['AlignmentID'] | |
| 91 meta_regex['msp'] = regex_msp['msp'] | |
| 92 | |
| 93 else: | |
| 94 sys.exit("No schema selected") | |
| 95 | |
| 96 return meta_regex | |
| 159 | 97 |
| 160 | 98 |
| 161 ###################################################################### | 99 ###################################################################### |
| 162 # Setup parameter dictionary | 100 # Setup parameter dictionary |
| 163 ###################################################################### | 101 ###################################################################### |
| 269 # the msPurity pipeline) choose between getting additional details to add | 207 # the msPurity pipeline) choose between getting additional details to add |
| 270 # as columns as either all meta data from msp, just details from the | 208 # as columns as either all meta data from msp, just details from the |
| 271 # record name (i.e. when using msPurity and we have the columns coded into | 209 # record name (i.e. when using msPurity and we have the columns coded into |
| 272 # the name) or just the spectra index (spectrac)]. | 210 # the name) or just the spectra index (spectrac)]. |
| 273 # Returns the parameters used and the command line call | 211 # Returns the parameters used and the command line call |
| 212 meta_info = {k: v for k, v in meta_info.items() if k | |
| 213 not in ['msp', 'massbank', 'cols']} | |
| 274 | 214 |
| 275 paramd = init_paramd(args) | 215 paramd = init_paramd(args) |
| 276 if args.meta_select_col == 'name': | 216 if args.meta_select_col == 'name': |
| 277 # have additional column of just the name | 217 # have additional column of just the name |
| 278 paramd['additional_details'] = {'name': meta_info['name']} | 218 paramd['additional_details'] = {'name': meta_info['name']} |
| 285 elif args.meta_select_col == 'all': | 225 elif args.meta_select_col == 'all': |
| 286 # have additional columns based on all the meta information | 226 # have additional columns based on all the meta information |
| 287 # extracted from the MSP | 227 # extracted from the MSP |
| 288 paramd['additional_details'] = meta_info | 228 paramd['additional_details'] = meta_info |
| 289 else: | 229 else: |
| 290 # Just have and index of the spectra in the MSP file | 230 # Just have an index of the spectra in the MSP file |
| 291 paramd['additional_details'] = {'spectra_idx': spectrac} | 231 paramd['additional_details'] = {'spectra_idx': spectrac} |
| 292 | 232 |
| 293 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) | 233 paramd["SampleName"] = "{}_metfrag_result".format(spectrac) |
| 294 | 234 |
| 295 # =============== Output peaks to txt file ============================== | 235 # =============== Output peaks to txt file ============================== |
| 296 paramd["PeakListPath"] = os.path.join(wd, | 236 paramd["PeakListPath"] = os.path.join(wd, |
| 297 "{}_tmpspec.txt".format(spectrac)) | 237 "{}_tmpspec.txt".format(spectrac)) |
| 298 | 238 |
| 299 # write spec file | 239 # write spec file |
| 240 | |
| 300 with open(paramd["PeakListPath"], 'w') as outfile: | 241 with open(paramd["PeakListPath"], 'w') as outfile: |
| 242 pls = '' | |
| 301 for p in peaklist: | 243 for p in peaklist: |
| 302 outfile.write(p[0] + "\t" + p[1] + "\n") | 244 outfile.write(p[0] + "\t" + p[1] + "\n") |
| 245 pls = pls + '{}_{};'.format(p[0], p[1]) | |
| 246 | |
| 247 if args.output_cl: | |
| 248 peaklist_str = pls[:-1] | |
| 303 | 249 |
| 304 # =============== Update param based on MSP metadata ====================== | 250 # =============== Update param based on MSP metadata ====================== |
| 305 # Replace param details with details from MSP if required | 251 # Replace param details with details from MSP if required |
| 306 if 'precursor_type' in meta_info and \ | 252 if 'precursor_type' in meta_info and \ |
| 307 meta_info['precursor_type'] in adduct_types: | 253 meta_info['precursor_type'] in adduct_types: |
| 309 nm = float(meta_info['precursor_mz']) - adduct_types[ | 255 nm = float(meta_info['precursor_mz']) - adduct_types[ |
| 310 meta_info['precursor_type']] | 256 meta_info['precursor_type']] |
| 311 paramd["PrecursorIonMode"] = \ | 257 paramd["PrecursorIonMode"] = \ |
| 312 int(round(adduct_types[meta_info['precursor_type']], 0)) | 258 int(round(adduct_types[meta_info['precursor_type']], 0)) |
| 313 elif not args.skip_invalid_adducts: | 259 elif not args.skip_invalid_adducts: |
| 260 inv_adduct_types = {int(round(v, 0)): k for k, v in | |
| 261 six.iteritems(adduct_types)} | |
| 314 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] | 262 adduct = inv_adduct_types[int(paramd['PrecursorIonModeDefault'])] |
| 315 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] | 263 paramd["PrecursorIonMode"] = paramd['PrecursorIonModeDefault'] |
| 316 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] | 264 nm = float(meta_info['precursor_mz']) - paramd['nm_mass_diff_default'] |
| 317 else: | 265 else: |
| 318 print('Skipping {}'.format(paramd["SampleName"])) | 266 print('Skipping {}'.format(paramd["SampleName"])) |
| 319 return '', '' | 267 return '', '' |
| 320 | 268 |
| 321 paramd['additional_details']['adduct'] = adduct | 269 if not ('precursor_type' in paramd['additional_details'] or 'adduct' |
| 270 in paramd['additional_details']): | |
| 271 paramd['additional_details']['adduct'] = adduct | |
| 272 | |
| 322 paramd["NeutralPrecursorMass"] = nm | 273 paramd["NeutralPrecursorMass"] = nm |
| 323 | 274 |
| 324 # ============== Create CLI cmd for metfrag =============================== | 275 # ============== Create CLI cmd for metfrag =============================== |
| 325 cmd = "metfrag" | 276 cmd = "metfrag" |
| 326 for k, v in six.iteritems(paramd): | 277 for k, v in six.iteritems(paramd): |
| 327 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', | 278 if k not in ['PrecursorIonModeDefault', 'nm_mass_diff_default', |
| 328 'additional_details']: | 279 'additional_details']: |
| 329 cmd += " {}={}".format(str(k), str(v)) | 280 cmd += " {}={}".format(str(k), str(v)) |
| 330 | 281 |
| 282 if args.output_cl: | |
| 283 cli_str = '{} PeakListString={}'.format(cmd, peaklist_str) | |
| 284 paramd['additional_details']['MetFragCLIString'] = cli_str | |
| 285 | |
| 331 # ============== Run metfrag ============================================== | 286 # ============== Run metfrag ============================================== |
| 332 # print(cmd) | 287 # print(cmd) |
| 333 # Filter before process with a minimum number of MS/MS peaks | 288 # Filter before process with a minimum number of MS/MS peaks |
| 334 if plinesread >= float(args.minMSMSpeaks): | 289 if plinesread >= float(args.minMSMSpeaks): |
| 335 | 290 |
| 341 | 296 |
| 342 def work(cmds): | 297 def work(cmds): |
| 343 return [os.system(cmd) for cmd in cmds] | 298 return [os.system(cmd) for cmd in cmds] |
| 344 | 299 |
| 345 | 300 |
| 346 ###################################################################### | 301 if __name__ == "__main__": |
| 347 # Parse MSP file and run metfrag CLI | 302 print(sys.version) |
| 348 ###################################################################### | 303 |
| 349 # keep list of commands if performing in CLI in parallel | 304 parser = argparse.ArgumentParser() |
| 350 cmds = [] | 305 parser.add_argument('--input_pth') |
| 351 # keep a dictionary of all params | 306 parser.add_argument('--result_pth', default='metfrag_result.csv') |
| 352 paramds = {} | 307 |
| 353 # keep count of spectra (for uid) | 308 parser.add_argument('--temp_dir') |
| 354 spectrac = 0 | 309 parser.add_argument('--polarity', default='pos') |
| 355 # this dictionary will store the meta data results form the MSp file | 310 parser.add_argument('--minMSMSpeaks', default=1) |
| 356 meta_info = {} | 311 |
| 357 | 312 parser.add_argument('--MetFragDatabaseType', default='PubChem') |
| 358 with open(args.input_pth, "r") as infile: | 313 parser.add_argument('--LocalDatabasePath', default='') |
| 359 # number of lines for the peaks | 314 parser.add_argument('--LocalMetChemDatabaseServerIp', default='') |
| 360 pnumlines = 0 | 315 |
| 361 # number of lines read for the peaks | 316 parser.add_argument('--DatabaseSearchRelativeMassDeviation', default=5) |
| 362 plinesread = 0 | 317 parser.add_argument('--FragmentPeakMatchRelativeMassDeviation', default=10) |
| 363 for line in infile: | 318 parser.add_argument('--FragmentPeakMatchAbsoluteMassDeviation', |
| 364 line = line.strip() | 319 default=0.001) |
| 365 | 320 parser.add_argument('--NumberThreads', default=1) |
| 366 if pnumlines == 0: | 321 parser.add_argument('--UnconnectedCompoundFilter', action='store_true') |
| 367 # ============== Extract metadata from MSP ======================== | 322 parser.add_argument('--IsotopeFilter', action='store_true') |
| 368 meta_info = parse_meta(meta_regex, meta_info) | 323 |
| 369 | 324 parser.add_argument('--FilterMinimumElements', default='') |
| 370 if ('massbank' in meta_info and 'cols' in meta_info) or ( | 325 parser.add_argument('--FilterMaximumElements', default='') |
| 371 'msp' in meta_info and 'num_peaks' in meta_info): | 326 parser.add_argument('--FilterSmartsInclusionList', default='') |
| 372 pnumlines = int(meta_info['num_peaks']) | 327 parser.add_argument('--FilterSmartsExclusionList', default='') |
| 328 parser.add_argument('--FilterIncludedElements', default='') | |
| 329 parser.add_argument('--FilterExcludedElements', default='') | |
| 330 parser.add_argument('--FilterIncludedExclusiveElements', default='') | |
| 331 | |
| 332 parser.add_argument('--score_thrshld', default=0) | |
| 333 parser.add_argument('--pctexplpeak_thrshld', default=0) | |
| 334 parser.add_argument('--schema') | |
| 335 parser.add_argument('--cores_top_level', default=1) | |
| 336 parser.add_argument('--chunks', default=1) | |
| 337 parser.add_argument('--meta_select_col', default='name') | |
| 338 parser.add_argument('--skip_invalid_adducts', action='store_true') | |
| 339 parser.add_argument('--output_cl', action='store_true') | |
| 340 | |
| 341 parser.add_argument('--ScoreSuspectLists', default='') | |
| 342 parser.add_argument('--MetFragScoreTypes', | |
| 343 default="FragmenterScore,OfflineMetFusionScore") | |
| 344 parser.add_argument('--MetFragScoreWeights', default="1.0,1.0") | |
| 345 parser.add_argument('-a', '--adducts', action='append', nargs=1, | |
| 346 required=False, default=[], help='Adducts used') | |
| 347 | |
| 348 args = parser.parse_args() | |
| 349 print(args) | |
| 350 config = ConfigParser() | |
| 351 config.read( | |
| 352 os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')) | |
| 353 | |
| 354 if os.stat(args.input_pth).st_size == 0: | |
| 355 print('Input file empty') | |
| 356 exit() | |
| 357 | |
| 358 # Create temporary working directory | |
| 359 if args.temp_dir: | |
| 360 wd = args.temp_dir | |
| 361 else: | |
| 362 wd = tempfile.mkdtemp() | |
| 363 | |
| 364 if os.path.exists(wd): | |
| 365 shutil.rmtree(wd) | |
| 366 os.makedirs(wd) | |
| 367 else: | |
| 368 os.makedirs(wd) | |
| 369 | |
| 370 meta_regex = get_meta_regex(args.schema) | |
| 371 | |
| 372 adduct_types = { | |
| 373 '[M+H]+': 1.007276, | |
| 374 '[M+NH4]+': 18.034374, | |
| 375 '[M+Na]+': 22.989218, | |
| 376 '[M+K]+': 38.963158, | |
| 377 '[M+CH3OH+H]+': 33.033489, | |
| 378 '[M+ACN+H]+': 42.033823, | |
| 379 '[M+ACN+Na]+': 64.015765, | |
| 380 '[M+2ACN+H]+': 83.06037, | |
| 381 '[M-H]-': -1.007276, | |
| 382 '[M+Cl]-': 34.969402, | |
| 383 '[M+HCOO]-': 44.99819, | |
| 384 '[M-H+HCOOH]-': 44.99819, | |
| 385 # same as above but different style of writing adduct | |
| 386 '[M+CH3COO]-': 59.01385, | |
| 387 '[M-H+CH3COOH]-': 59.01385 | |
| 388 # same as above but different style of writing adduct | |
| 389 } | |
| 390 | |
| 391 ###################################################################### | |
| 392 # Parse MSP file and run metfrag CLI | |
| 393 ###################################################################### | |
| 394 # keep list of commands if performing in CLI in parallel | |
| 395 cmds = [] | |
| 396 # keep a dictionary of all params | |
| 397 paramds = {} | |
| 398 # keep count of spectra (for uid) | |
| 399 spectrac = 0 | |
| 400 # this dictionary will store the meta data results form the MSp file | |
| 401 meta_info = {} | |
| 402 | |
| 403 if args.adducts: | |
| 404 adducts_from_cli = [ | |
| 405 a[0].replace('__ob__', '[').replace('__cb__', ']') for a in | |
| 406 args.adducts | |
| 407 ] | |
| 408 else: | |
| 409 adducts_from_cli = [] | |
| 410 | |
| 411 with open(args.input_pth, "r") as infile: | |
| 412 # number of lines for the peaks | |
| 413 pnumlines = 0 | |
| 414 # number of lines read for the peaks | |
| 415 plinesread = 0 | |
| 416 for line in infile: | |
| 417 line = line.strip() | |
| 418 | |
| 419 if pnumlines == 0: | |
| 420 # ============== Extract metadata from MSP ==================== | |
| 421 meta_info = parse_meta(meta_regex, meta_info) | |
| 422 | |
| 423 if ('massbank' in meta_info and 'cols' in meta_info) or ( | |
| 424 'msp' in meta_info and 'num_peaks' in meta_info): | |
| 425 pnumlines = int(meta_info['num_peaks']) | |
| 426 plinesread = 0 | |
| 427 peaklist = [] | |
| 428 | |
| 429 elif plinesread < pnumlines: | |
| 430 # ============== Extract peaks from MSP ======================= | |
| 431 # .split() will split on any empty space (i.e. tab and space) | |
| 432 line = tuple(line.split()) | |
| 433 # Keep only m/z and intensity, not relative intensity | |
| 434 save_line = tuple(line[0].split() + line[1].split()) | |
| 435 plinesread += 1 | |
| 436 peaklist.append(save_line) | |
| 437 | |
| 438 elif plinesread and plinesread == pnumlines: | |
| 439 # =Get sample name and additional details for output and RUN == | |
| 440 if adducts_from_cli: | |
| 441 for adduct in adducts_from_cli: | |
| 442 spectrac += 1 | |
| 443 meta_info['precursor_type'] = adduct | |
| 444 paramd, cmd = run_metfrag(meta_info, peaklist, args, | |
| 445 wd, spectrac, adduct_types) | |
| 446 if paramd: | |
| 447 paramds[paramd["SampleName"]] = paramd | |
| 448 cmds.append(cmd) | |
| 449 else: | |
| 450 spectrac += 1 | |
| 451 paramd, cmd = run_metfrag(meta_info, peaklist, args, | |
| 452 wd, spectrac, adduct_types) | |
| 453 if paramd: | |
| 454 paramds[paramd["SampleName"]] = paramd | |
| 455 cmds.append(cmd) | |
| 456 | |
| 457 meta_info = {} | |
| 458 pnumlines = 0 | |
| 373 plinesread = 0 | 459 plinesread = 0 |
| 374 peaklist = [] | 460 |
| 375 | 461 # end of file. Check if there is a MSP spectra to run metfrag on |
| 376 elif plinesread < pnumlines: | 462 if plinesread and plinesread == pnumlines: |
| 377 # ============== Extract peaks from MSP ========================== | 463 |
| 378 # .split() will split on any empty space (i.e. tab and space) | 464 if adducts_from_cli: |
| 379 line = tuple(line.split()) | 465 for adduct in adducts_from_cli: |
| 380 # Keep only m/z and intensity, not relative intensity | 466 spectrac += 1 |
| 381 save_line = tuple(line[0].split() + line[1].split()) | 467 meta_info['precursor_type'] = adduct |
| 382 plinesread += 1 | 468 paramd, cmd = run_metfrag(meta_info, peaklist, args, |
| 383 peaklist.append(save_line) | 469 wd, spectrac, adduct_types) |
| 384 | 470 if paramd: |
| 385 elif plinesread and plinesread == pnumlines: | 471 paramds[paramd["SampleName"]] = paramd |
| 386 # ======= Get sample name and additional details for output ======= | 472 cmds.append(cmd) |
| 387 spectrac += 1 | 473 else: |
| 388 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac, | 474 spectrac += 1 |
| 389 adduct_types) | 475 paramd, cmd = run_metfrag(meta_info, peaklist, args, |
| 390 | 476 wd, spectrac, adduct_types) |
| 391 if paramd: | 477 if paramd: |
| 392 paramds[paramd["SampleName"]] = paramd | 478 paramds[paramd["SampleName"]] = paramd |
| 393 cmds.append(cmd) | 479 cmds.append(cmd) |
| 394 | 480 |
| 395 meta_info = {} | 481 # Perform multiprocessing on command line call level |
| 396 pnumlines = 0 | 482 if int(args.cores_top_level) > 1: |
| 397 plinesread = 0 | 483 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in |
| 398 | 484 list(range(0, len(cmds), int(args.chunks)))] |
| 399 # end of file. Check if there is a MSP spectra to run metfrag on still | 485 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) |
| 400 if plinesread and plinesread == pnumlines: | 486 pool.map(work, cmds_chunks) |
| 401 | 487 pool.close() |
| 402 paramd, cmd = run_metfrag(meta_info, peaklist, args, wd, spectrac + 1, | 488 pool.join() |
| 403 adduct_types) | 489 |
| 404 | 490 ###################################################################### |
| 405 if paramd: | 491 # Concatenate and filter the output |
| 406 paramds[paramd["SampleName"]] = paramd | 492 ###################################################################### |
| 407 cmds.append(cmd) | 493 # outputs might have different headers. Need to get a list of all the |
| 408 | 494 # headers before we start merging the files |
| 409 # Perform multiprocessing on command line call level | 495 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, |
| 410 if int(args.cores_top_level) > 1: | 496 # "*_metfrag_result.csv"))] |
| 411 cmds_chunks = [cmds[x:x + int(args.chunks)] for x in | 497 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) |
| 412 list(range(0, len(cmds), int(args.chunks)))] | 498 |
| 413 pool = multiprocessing.Pool(processes=int(args.cores_top_level)) | 499 if len(outfiles) == 0: |
| 414 pool.map(work, cmds_chunks) | 500 print('No results') |
| 415 pool.close() | 501 sys.exit() |
| 416 pool.join() | 502 |
| 417 | 503 headers = [] |
| 418 ###################################################################### | 504 c = 0 |
| 419 # Concatenate and filter the output | |
| 420 ###################################################################### | |
| 421 # outputs might have different headers. Need to get a list of all the | |
| 422 # headers before we start merging the files | |
| 423 # outfiles = [os.path.join(wd, f) for f in glob.glob(os.path.join(wd, | |
| 424 # "*_metfrag_result.csv"))] | |
| 425 outfiles = glob.glob(os.path.join(wd, "*_metfrag_result.csv")) | |
| 426 | |
| 427 if len(outfiles) == 0: | |
| 428 print('No results') | |
| 429 sys.exit() | |
| 430 | |
| 431 headers = [] | |
| 432 c = 0 | |
| 433 for fn in outfiles: | |
| 434 with open(fn, 'r') as infile: | |
| 435 reader = csv.reader(infile) | |
| 436 if sys.version_info >= (3, 0): | |
| 437 headers.extend(next(reader)) | |
| 438 else: | |
| 439 headers.extend(reader.next()) | |
| 440 # check if file has any data rows | |
| 441 for i, row in enumerate(reader): | |
| 442 c += 1 | |
| 443 if i == 1: | |
| 444 break | |
| 445 | |
| 446 # if no data rows (e.g. matches) then do not save an | |
| 447 # output and leave the program | |
| 448 if c == 0: | |
| 449 print('No results') | |
| 450 sys.exit() | |
| 451 | |
| 452 additional_detail_headers = ['sample_name'] | |
| 453 for k, paramd in six.iteritems(paramds): | |
| 454 additional_detail_headers = list(set( | |
| 455 additional_detail_headers + list(paramd['additional_details'].keys()))) | |
| 456 | |
| 457 # add inchikey if not already present (missing in metchem output) | |
| 458 if 'InChIKey' not in headers: | |
| 459 headers.append('InChIKey') | |
| 460 | |
| 461 headers = additional_detail_headers + sorted(list(set(headers))) | |
| 462 | |
| 463 # Sort files nicely | |
| 464 outfiles.sort( | |
| 465 key=lambda s: int(re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
| 466 | |
| 467 print(outfiles) | |
| 468 | |
| 469 # merge outputs | |
| 470 with open(args.result_pth, 'a') as merged_outfile: | |
| 471 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, | |
| 472 delimiter='\t', quotechar='"') | |
| 473 dwriter.writeheader() | |
| 474 | |
| 475 for fn in outfiles: | 505 for fn in outfiles: |
| 476 | 506 with open(fn, 'r') as infile: |
| 477 with open(fn) as infile: | 507 reader = csv.reader(infile) |
| 478 reader = csv.DictReader(infile, delimiter=',', quotechar='"') | 508 if sys.version_info >= (3, 0): |
| 479 for line in reader: | 509 headers.extend(next(reader)) |
| 480 bewrite = True | 510 else: |
| 481 for key, value in line.items(): | 511 headers.extend(reader.next()) |
| 482 # Filter when no MS/MS peak matched | 512 # check if file has any data rows |
| 483 if key == "ExplPeaks": | 513 for i, row in enumerate(reader): |
| 484 if float(args.pctexplpeak_thrshld) > 0 \ | 514 c += 1 |
| 485 and value and "NA" in value: | 515 if i == 1: |
| 516 break | |
| 517 | |
| 518 # if no data rows (e.g. matches) then do not save an | |
| 519 # output and leave the program | |
| 520 if c == 0: | |
| 521 print('No results') | |
| 522 sys.exit() | |
| 523 | |
| 524 additional_detail_headers = ['sample_name'] | |
| 525 for k, paramd in six.iteritems(paramds): | |
| 526 additional_detail_headers = list(set( | |
| 527 additional_detail_headers + list( | |
| 528 paramd['additional_details'].keys()))) | |
| 529 | |
| 530 # add inchikey if not already present (missing in metchem output) | |
| 531 if 'InChIKey' not in headers: | |
| 532 headers.append('InChIKey') | |
| 533 | |
| 534 additional_detail_headers = sorted(additional_detail_headers) | |
| 535 headers = additional_detail_headers + sorted(list(set(headers))) | |
| 536 | |
| 537 # Sort files nicely | |
| 538 outfiles.sort( | |
| 539 key=lambda s: int( | |
| 540 re.match(r'^.*/(\d+)_metfrag_result.csv', s).group(1))) | |
| 541 | |
| 542 print(outfiles) | |
| 543 | |
| 544 # merge outputs | |
| 545 with open(args.result_pth, 'a') as merged_outfile: | |
| 546 dwriter = csv.DictWriter(merged_outfile, fieldnames=headers, | |
| 547 delimiter='\t', quotechar='"') | |
| 548 dwriter.writeheader() | |
| 549 | |
| 550 for fn in outfiles: | |
| 551 with open(fn) as infile: | |
| 552 reader = csv.DictReader(infile, delimiter=',', quotechar='"') | |
| 553 for line in reader: | |
| 554 bewrite = True | |
| 555 for key, value in line.items(): | |
| 556 # Filter when no MS/MS peak matched | |
| 557 if key == "ExplPeaks": | |
| 558 if float(args.pctexplpeak_thrshld) > 0 \ | |
| 559 and value and "NA" in value: | |
| 560 bewrite = False | |
| 561 # Filter with a score threshold | |
| 562 elif key == "Score": | |
| 563 if value and float(value) <= float( | |
| 564 args.score_thrshld): | |
| 565 bewrite = False | |
| 566 elif key == "NoExplPeaks": | |
| 567 nbfindpeak = float(value) | |
| 568 elif key == "NumberPeaksUsed": | |
| 569 totpeaks = float(value) | |
| 570 # Filter with a relative number of peak matched | |
| 571 try: | |
| 572 pctexplpeak = nbfindpeak / totpeaks * 100 | |
| 573 except ZeroDivisionError: | |
| 574 bewrite = False | |
| 575 else: | |
| 576 if pctexplpeak < float(args.pctexplpeak_thrshld): | |
| 486 bewrite = False | 577 bewrite = False |
| 487 # Filter with a score threshold | 578 |
| 488 elif key == "Score": | 579 # Write the line if it pass all filters |
| 489 if value and float(value) <= float(args.score_thrshld): | 580 if bewrite: |
| 490 bewrite = False | 581 bfn = os.path.basename(fn) |
| 491 elif key == "NoExplPeaks": | 582 bfn = bfn.replace(".csv", "") |
| 492 nbfindpeak = float(value) | 583 line['sample_name'] = paramds[bfn]['SampleName'] |
| 493 elif key == "NumberPeaksUsed": | 584 ad = paramds[bfn]['additional_details'] |
| 494 totpeaks = float(value) | 585 |
| 495 # Filter with a relative number of peak matched | 586 if args.MetFragDatabaseType == "MetChem": |
| 496 try: | 587 # for some reason the metchem database option does |
| 497 pctexplpeak = nbfindpeak / totpeaks * 100 | 588 # not report the full inchikey (at least in the |
| 498 except ZeroDivisionError: | 589 # Bham setup. This ensures we always get |
| 499 bewrite = False | 590 # the fully inchikey |
| 500 else: | 591 line['InChIKey'] = '{}-{}-{}'.format( |
| 501 if pctexplpeak < float(args.pctexplpeak_thrshld): | 592 line['InChIKey1'], |
| 502 bewrite = False | 593 line['InChIKey2'], |
| 503 | 594 line['InChIKey3']) |
| 504 # Write the line if it pass all filters | 595 |
| 505 if bewrite: | 596 line.update(ad) |
| 506 bfn = os.path.basename(fn) | 597 dwriter.writerow(line) |
| 507 bfn = bfn.replace(".csv", "") | |
| 508 line['sample_name'] = paramds[bfn]['SampleName'] | |
| 509 ad = paramds[bfn]['additional_details'] | |
| 510 | |
| 511 if args.MetFragDatabaseType == "MetChem": | |
| 512 # for some reason the metchem database option does | |
| 513 # not report the full inchikey (at least in the Bham | |
| 514 # setup. This ensures we always get the fully inchikey | |
| 515 line['InChIKey'] = '{}-{}-{}'.format(line['InChIKey1'], | |
| 516 line['InChIKey2'], | |
| 517 line['InChIKey3']) | |
| 518 | |
| 519 line.update(ad) | |
| 520 dwriter.writerow(line) | |
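
The revision factors the MSP/MassBank pattern dictionaries into `get_meta_regex()` while keeping `parse_meta()` as the line-by-line matcher. The sketch below shows how those pattern lists behave on a small header block; the sample MSP lines and the trimmed pattern subset are illustrative, and the function is simplified here to take the current line as an argument rather than reading it from the enclosing scope.

```python
import re

# Illustrative subset of the pattern lists built by get_meta_regex();
# the real dictionaries also cover MassBank records and more fields.
meta_regex = {
    'name': [r'^Name(?:=|:)(.*)$'],
    'precursor_mz': [r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$'],
    'precursor_type': [r'^precursor.*type(?:=|:)(.*)$'],
    'num_peaks': [r'^Num.*Peaks(?:=|:)\s*(\d*)$'],
}


def parse_meta(meta_regex, line, meta_info):
    # Try every pattern for every key against the current line
    # (case-insensitive) and keep the stripped capture group(s).
    for k, regexes in meta_regex.items():
        for reg in regexes:
            m = re.search(reg, line, re.IGNORECASE)
            if m:
                meta_info[k] = '-'.join(m.groups()).strip()
    return meta_info


# Hypothetical MSP header block.
header = ['Name: example spectrum', 'PRECURSORMZ: 181.0708',
          'PRECURSORTYPE: [M+H]+', 'Num Peaks: 3']
meta_info = {}
for line in header:
    parse_meta(meta_regex, line, meta_info)
print(meta_info)
# {'name': 'example spectrum', 'precursor_mz': '181.0708',
#  'precursor_type': '[M+H]+', 'num_peaks': '3'}
```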
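The `adduct_types` table feeds two derived MetFrag parameters: `NeutralPrecursorMass` is the precursor m/z minus the adduct mass, and `PrecursorIonMode` is that adduct mass rounded to the nearest integer. A worked example (the precursor m/z value is made up for illustration):

```python
# Worked example of the two derived parameters; the precursor m/z is made up.
adduct_types = {'[M+H]+': 1.007276, '[M+Na]+': 22.989218, '[M-H]-': -1.007276}

precursor_mz = 181.0708           # hypothetical [M+H]+ precursor
precursor_type = '[M+H]+'

neutral_mass = precursor_mz - adduct_types[precursor_type]
precursor_ion_mode = int(round(adduct_types[precursor_type], 0))

print(round(neutral_mass, 6))   # 180.063524 -> NeutralPrecursorMass
print(precursor_ion_mode)       # 1          -> PrecursorIonMode
```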
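The new `-a/--adducts` option arrives with Galaxy-style character sanitisation applied, where square brackets are typically encoded as `__ob__` and `__cb__`; the comprehension in the diff decodes them before the adduct lookup. A minimal illustration with hypothetical input values:

```python
# Hypothetical values as they might arrive from the Galaxy command line
# with -a '__ob__M+H__cb__+' -a '__ob__M+Na__cb__+' (nargs=1, append).
raw_adducts = [['__ob__M+H__cb__+'], ['__ob__M+Na__cb__+']]

adducts_from_cli = [a[0].replace('__ob__', '[').replace('__cb__', ']')
                    for a in raw_adducts]
print(adducts_from_cli)  # ['[M+H]+', '[M+Na]+']
```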
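When `--cores_top_level` is greater than one, the collected MetFrag command lines are split into chunks of `--chunks` commands and each chunk is handed to a worker process that runs its commands through `os.system`. A self-contained sketch of that pattern, using placeholder `echo` commands rather than real metfrag calls:

```python
import multiprocessing
import os


def work(cmds):
    # Each worker runs its chunk of shell commands sequentially.
    return [os.system(cmd) for cmd in cmds]


if __name__ == '__main__':
    # Placeholder commands; in the tool these are the assembled metfrag calls.
    cmds = ['echo spectrum_{}'.format(i) for i in range(8)]
    chunk_size = 2  # --chunks
    cmds_chunks = [cmds[x:x + chunk_size]
                   for x in range(0, len(cmds), chunk_size)]

    pool = multiprocessing.Pool(processes=4)  # --cores_top_level
    pool.map(work, cmds_chunks)
    pool.close()
    pool.join()
```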
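Because individual `*_metfrag_result.csv` files can carry different column sets, the merge step first collects the union of all headers and then writes every row through a single `csv.DictWriter`. A self-contained sketch of that merge pattern, with hypothetical file names and no result filtering (the real script also applies the score and explained-peak thresholds and exits early when no result files exist):

```python
import csv
import glob
import os

# Hypothetical results directory; in the tool this is the temp working dir.
outfiles = sorted(glob.glob(os.path.join('results', '*_metfrag_result.csv')))

# Union of headers across all per-spectrum outputs.
headers = []
for fn in outfiles:
    with open(fn, 'r') as infile:
        headers.extend(next(csv.reader(infile)))
headers = sorted(set(headers))

with open('metfrag_result.tsv', 'w') as merged:
    dwriter = csv.DictWriter(merged, fieldnames=headers,
                             delimiter='\t', quotechar='"')
    dwriter.writeheader()
    for fn in outfiles:
        with open(fn) as infile:
            for row in csv.DictReader(infile, delimiter=',', quotechar='"'):
                # Columns missing from this file are written as empty strings.
                dwriter.writerow(row)
```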
