Mercurial > repos > pieterlukasse > prims_metabolomics2
diff GCMS/combine_output.py @ 6:4393f982d18f
reorganized sources
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 19 Mar 2015 12:22:23 +0100 |
parents | |
children | 346ff9ad8c7a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GCMS/combine_output.py Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup) +''' + +import csv +import re +import sys +import math +import pprint + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +def _process_data(in_csv): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) + header = data.pop(0) + # Create dictionary with column name as key + output = {} + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _merge_data(rankfilter, caslookup): + ''' + Merges data from both input dictionaries based on the Centrotype field. This method will + build up a new list containing the merged hits as the items. + @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name) + @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name) + ''' + # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts: + if (len(rankfilter['ID']) != len(caslookup['Centrotype'])): + raise Exception('rankfilter and caslookup files should have the same nr of rows/records ') + + merged = [] + processed = {} + for compound_id_idx in xrange(len(rankfilter['ID'])): + compound_id = rankfilter['ID'][compound_id_idx] + if not compound_id in processed : + # keep track of processed items to not repeat them + processed[compound_id] = compound_id + # get centrotype nr + centrotype = compound_id.split('-')[0] + # Get the indices for current compound ID in both data-structures for proper matching + rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id] + cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype] + + merged_hits = [] + # Combine hits + for hit in xrange(len(rindex)): + # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do + # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its + # corresponding value in the rankfilter or caslookup tables; i.e. + # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute + # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) + rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()])) + cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()])) + + merged_hit = _add_hit(rf_record, cl_record) + merged_hits.append(merged_hit) + + merged.append(merged_hits) + + return merged, len(rindex) + + +def _add_hit(rankfilter, caslookup): + ''' + Combines single records from both the RankFilter- and CasLookup-tools + @param rankfilter: record (dictionary) of one compound in the RankFilter output + @param caslookup: matching record (dictionary) of one compound in the CasLookup output + ''' + # The ID in the RankFilter output contains the following 5 fields: + rf_id = rankfilter['ID'].split('-') + try: + name, formula = _remove_formula(rankfilter['Name']) + hit = [rf_id[0], # Centrotype + rf_id[1], # cent.Factor + rf_id[2], # scan nr + rf_id[3], # R.T. (umin) + rf_id[4], # nr. Peaks + # Appending other fields + rankfilter['R.T.'], + name, + caslookup['FORMULA'] if not formula else formula, + rankfilter['Library'].strip(), + rankfilter['CAS'].strip(), + rankfilter['Forward'], + rankfilter['Reverse'], + ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2), + rankfilter['RIexp'], + caslookup['RI'], + rankfilter['RIsvr'], + # Calculate absolute differences + math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])), + math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])), + caslookup['Regression.Column.Name'], + caslookup['min'], + caslookup['max'], + caslookup['nr.duplicates'], + caslookup['Column.phase.type'], + caslookup['Column.name'], + rankfilter['Rank'], + rankfilter['%rel.err'], + rankfilter['Synonyms']] + except KeyError as error: + print "Problem reading in data from input file(s):\n", + print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n" + print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n" + raise error + + return hit + + +def _remove_formula(name): + ''' + The RankFilter Name field often contains the Formula as well, this function removes it from the Name + @param name: complete name of the compound from the RankFilter output + ''' + name = name.split() + poss_formula = name[-1] + match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula) + if match: + return ' '.join(name[:-1]), poss_formula + else: + return ' '.join(name), False + + +def _get_default_caslookup(): + ''' + The Cas Lookup tool might not have found all compounds in the library searched, + this default dict will be used to combine with the Rank Filter output + ''' + return {'FORMULA': 'N/A', + 'RI': '0.0', + 'Regression.Column.Name': 'None', + 'min': '0.0', + 'max': '0.0', + 'nr.duplicates': '0', + 'Column.phase.type': 'N/A', + 'Column.name': 'N/A'} + + +def _save_data(data, nhits, out_csv_single, out_csv_multi): + ''' + Writes tab-separated data to file + @param data: dictionary containing merged dataset + @param out_csv: output csv file + ''' + # Columns we don't repeat: + header_part1 = ['Centrotype', + 'cent.Factor', + 'scan nr.', + 'R.T. (umin)', + 'nr. Peaks', + 'R.T.'] + # These are the headers/columns we repeat in case of + # combining hits in one line (see alternative_headers method below): + header_part2 = [ + 'Name', + 'FORMULA', + 'Library', + 'CAS', + 'Forward', + 'Reverse', + 'Avg. (Forward, Reverse)', + 'RIexp', + 'RI', + 'RIsvr', + 'RIexp - RIsvr', + 'RI - RIexp', + 'Regression.Column.Name', + 'min', + 'max', + 'nr.duplicates', + 'Column.phase.type', + 'Column.name', + 'Rank', + '%rel.err', + 'Synonyms'] + + # Open output file for writing + outfile_single_handle = open(out_csv_single, 'wb') + outfile_multi_handle = open(out_csv_multi, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(header_part1 + header_part2) + output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) + # Combine all hits for each centrotype into one line + line = [] + for centrotype_idx in xrange(len(data)): + i = 0 + for hit in data[centrotype_idx]: + if i==0: + line.extend(hit) + else: + line.extend(hit[6:]) + i = i+1 + # small validation (if error, it is a programming error): + if i > nhits: + raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) + output_multi_handle.writerow(line) + line = [] + + # Write one line for each centrotype + for centrotype_idx in xrange(len(data)): + for hit in data[centrotype_idx]: + output_single_handle.writerow(hit) + +def alternative_headers(header_part2, nr_alternative_hits): + ''' + This method will iterate over the header names and add the string 'ALT#_' before each, + where # is the number of the alternative, according to number of alternative hits we want to add + to final csv/tsv + ''' + result = [] + for i in xrange(nr_alternative_hits): + for header_name in header_part2: + result.append("ALT" + str(i+1) + "_" + header_name) + return result + +def main(): + ''' + Combine Output main function + It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" + NB: the caslookup_result_file will typically have fewer lines than + rankfilter_result_file, so the merge has to consider this as well. The final file + should have the same nr of lines as rankfilter_result_file. + ''' + rankfilter_result_file = sys.argv[1] + caslookup_result_file = sys.argv[2] + output_single_csv = sys.argv[3] + output_multi_csv = sys.argv[4] + + # Read RankFilter and CasLookup output files + rankfilter = _process_data(rankfilter_result_file) + caslookup = _process_data(caslookup_result_file) + merged, nhits = _merge_data(rankfilter, caslookup) + _save_data(merged, nhits, output_single_csv, output_multi_csv) + + +if __name__ == '__main__': + main()