prims_metabolomics2: GCMS/combine

comparison GCMS/combine_output.py @ 6:4393f982d18f

reorganized sources

author	pieter.lukasse@wur.nl
date	Thu, 19 Mar 2015 12:22:23 +0100
parents
children	346ff9ad8c7a

comparison

equal deleted inserted replaced

-:41f122255d14
+:4393f982d18f
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+import csv
+import re
+import sys
+import math
+import pprint
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+def _process_data(in_csv):
+'''
+Generic method to parse a tab-separated file returning a dictionary with named columns
+@param in_csv: input filename to be parsed
+'''
+data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+header = data.pop(0)
+# Create dictionary with column name as key
+output = {}
+for index in xrange(len(header)):
+output[header[index]] = [row[index] for row in data]
+return output
+def _merge_data(rankfilter, caslookup):
+'''
+Merges data from both input dictionaries based on the Centrotype field. This method will
+build up a new list containing the merged hits as the items.
+@param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+@param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+'''
+# TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:
+if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
+raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
+merged = []
+processed = {}
+for compound_id_idx in xrange(len(rankfilter['ID'])):
+compound_id = rankfilter['ID'][compound_id_idx]
+if not compound_id in processed :
+# keep track of processed items to not repeat them
+processed[compound_id] = compound_id
+# get centrotype nr
+centrotype = compound_id.split('-')[0]
+# Get the indices for current compound ID in both data-structures for proper matching
+rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+merged_hits = []
+# Combine hits
+for hit in xrange(len(rindex)):
+# Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+# is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+# corresponding value in the rankfilter or caslookup tables; i.e.
+# rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+#                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+merged_hit = _add_hit(rf_record, cl_record)
+merged_hits.append(merged_hit)
+merged.append(merged_hits)
+return merged, len(rindex)
+def _add_hit(rankfilter, caslookup):
+'''
+Combines single records from both the RankFilter- and CasLookup-tools
+@param rankfilter: record (dictionary) of one compound in the RankFilter output
+@param caslookup: matching record (dictionary) of one compound in the CasLookup output
+'''
+# The ID in the RankFilter output contains the following 5 fields:
+rf_id = rankfilter['ID'].split('-')
+try:
+name, formula = _remove_formula(rankfilter['Name'])
+hit = [rf_id[0], # Centrotype
+rf_id[1], # cent.Factor
+rf_id[2], # scan nr
+rf_id[3], # R.T. (umin)
+rf_id[4], # nr. Peaks
+# Appending other fields
+rankfilter['R.T.'],
+name,
+caslookup['FORMULA'] if not formula else formula,
+rankfilter['Library'].strip(),
+rankfilter['CAS'].strip(),
+rankfilter['Forward'],
+rankfilter['Reverse'],
+((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
+rankfilter['RIexp'],
+caslookup['RI'],
+rankfilter['RIsvr'],
+# Calculate absolute differences
+math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
+math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
+caslookup['Regression.Column.Name'],
+caslookup['min'],
+caslookup['max'],
+caslookup['nr.duplicates'],
+caslookup['Column.phase.type'],
+caslookup['Column.name'],
+rankfilter['Rank'],
+rankfilter['%rel.err'],
+rankfilter['Synonyms']]
+except KeyError as error:
+print "Problem reading in data from input file(s):\n",
+print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
+print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
+raise error
+return hit
+def _remove_formula(name):
+'''
+The RankFilter Name field often contains the Formula as well, this function removes it from the Name
+@param name: complete name of the compound from the RankFilter output
+'''
+name = name.split()
+poss_formula = name[-1]
+match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
+if match:
+return ' '.join(name[:-1]), poss_formula
+else:
+return ' '.join(name), False
+def _get_default_caslookup():
+'''
+The Cas Lookup tool might not have found all compounds in the library searched,
+this default dict will be used to combine with the Rank Filter output
+'''
+return {'FORMULA': 'N/A',
+'RI': '0.0',
+'Regression.Column.Name': 'None',
+'min': '0.0',
+'max': '0.0',
+'nr.duplicates': '0',
+'Column.phase.type': 'N/A',
+'Column.name': 'N/A'}
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+'''
+Writes tab-separated data to file
+@param data: dictionary containing merged dataset
+@param out_csv: output csv file
+'''
+# Columns we don't repeat:
+header_part1 = ['Centrotype',
+'cent.Factor',
+'scan nr.',
+'R.T. (umin)',
+'nr. Peaks',
+'R.T.']
+# These are the headers/columns we repeat in case of
+# combining hits in one line (see alternative_headers method below):
+header_part2 = [
+'Name',
+'FORMULA',
+'Library',
+'CAS',
+'Forward',
+'Reverse',
+'Avg. (Forward, Reverse)',
+'RIexp',
+'RI',
+'RIsvr',
+'RIexp - RIsvr',
+'RI - RIexp',
+'Regression.Column.Name',
+'min',
+'max',
+'nr.duplicates',
+'Column.phase.type',
+'Column.name',
+'Rank',
+'%rel.err',
+'Synonyms']
+# Open output file for writing
+outfile_single_handle = open(out_csv_single, 'wb')
+outfile_multi_handle = open(out_csv_multi, 'wb')
+output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+# Write headers
+output_single_handle.writerow(header_part1 + header_part2)
+output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
+# Combine all hits for each centrotype into one line
+line = []
+for centrotype_idx in xrange(len(data)):
+i = 0
+for hit in data[centrotype_idx]:
+if i==0:
+line.extend(hit)
+else:
+line.extend(hit[6:])
+i = i+1
+# small validation (if error, it is a programming error):
+if i > nhits:
+raise Exception('Error: more hits that expected for  centrotype_idx ' + centrotype_idx)
+output_multi_handle.writerow(line)
+line = []
+# Write one line for each centrotype
+for centrotype_idx in xrange(len(data)):
+for hit in data[centrotype_idx]:
+output_single_handle.writerow(hit)
+def alternative_headers(header_part2, nr_alternative_hits):
+'''
+This method will iterate over the header names and add the string 'ALT#_' before each,
+where # is the number of the alternative, according to number of alternative hits we want to add
+to final csv/tsv
+'''
+result = []
+for i in xrange(nr_alternative_hits):
+for header_name in header_part2:
+result.append("ALT" + str(i+1) + "_" + header_name)
+return result
+def main():
+'''
+Combine Output main function
+It will merge the result files from "RankFilter"  and "Lookup RI for CAS numbers"
+NB: the caslookup_result_file will typically have fewer lines than
+rankfilter_result_file, so the merge has to consider this as well. The final file
+should have the same nr of lines as rankfilter_result_file.
+'''
+rankfilter_result_file = sys.argv[1]
+caslookup_result_file = sys.argv[2]
+output_single_csv = sys.argv[3]
+output_multi_csv = sys.argv[4]
+# Read RankFilter and CasLookup output files
+rankfilter = _process_data(rankfilter_result_file)
+caslookup = _process_data(caslookup_result_file)
+merged, nhits = _merge_data(rankfilter, caslookup)
+_save_data(merged, nhits, output_single_csv, output_multi_csv)
+if __name__ == '__main__':
+main()

Mercurial > repos > pieterlukasse > prims_metabolomics2

comparison GCMS/combine_output.py @ 6:4393f982d18f