Mercurial > repos > pieterlukasse > prims_metabolomics
view combine_output.py @ 43:eb0e25d06060
updated dependency
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 07 Nov 2014 15:55:06 +0100 |
parents | 19d8fd10248e |
children |
line wrap: on
line source
#!/usr/bin/env python # encoding: utf-8 ''' Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup) ''' import csv import re import sys import math import pprint __author__ = "Marcel Kempenaar" __contact__ = "brs@nbic.nl" __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" __license__ = "MIT" def _process_data(in_csv): ''' Generic method to parse a tab-separated file returning a dictionary with named columns @param in_csv: input filename to be parsed ''' data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) header = data.pop(0) # Create dictionary with column name as key output = {} for index in xrange(len(header)): output[header[index]] = [row[index] for row in data] return output def _merge_data(rankfilter, caslookup): ''' Merges data from both input dictionaries based on the Centrotype field. This method will build up a new list containing the merged hits as the items. @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name) @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name) ''' # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts: if (len(rankfilter['ID']) != len(caslookup['Centrotype'])): raise Exception('rankfilter and caslookup files should have the same nr of rows/records ') merged = [] processed = {} for compound_id_idx in xrange(len(rankfilter['ID'])): compound_id = rankfilter['ID'][compound_id_idx] if not compound_id in processed : # keep track of processed items to not repeat them processed[compound_id] = compound_id # get centrotype nr centrotype = compound_id.split('-')[0] # Get the indices for current compound ID in both data-structures for proper matching rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id] cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype] merged_hits = [] # Combine hits for hit in xrange(len(rindex)): # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its # corresponding value in the rankfilter or caslookup tables; i.e. # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()])) cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()])) merged_hit = _add_hit(rf_record, cl_record) merged_hits.append(merged_hit) merged.append(merged_hits) return merged, len(rindex) def _add_hit(rankfilter, caslookup): ''' Combines single records from both the RankFilter- and CasLookup-tools @param rankfilter: record (dictionary) of one compound in the RankFilter output @param caslookup: matching record (dictionary) of one compound in the CasLookup output ''' # The ID in the RankFilter output contains the following 5 fields: rf_id = rankfilter['ID'].split('-') try: name, formula = _remove_formula(rankfilter['Name']) hit = [rf_id[0], # Centrotype rf_id[1], # cent.Factor rf_id[2], # scan nr rf_id[3], # R.T. (umin) rf_id[4], # nr. Peaks # Appending other fields rankfilter['R.T.'], name, caslookup['FORMULA'] if not formula else formula, rankfilter['Library'].strip(), rankfilter['CAS'].strip(), rankfilter['Forward'], rankfilter['Reverse'], ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2), rankfilter['RIexp'], caslookup['RI'], rankfilter['RIsvr'], # Calculate absolute differences math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])), math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])), caslookup['Regression.Column.Name'], caslookup['min'], caslookup['max'], caslookup['nr.duplicates'], caslookup['Column.phase.type'], caslookup['Column.name'], rankfilter['Rank'], rankfilter['%rel.err'], rankfilter['Synonyms']] except KeyError as error: print "Problem reading in data from input file(s):\n", print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n" print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n" raise error return hit def _remove_formula(name): ''' The RankFilter Name field often contains the Formula as well, this function removes it from the Name @param name: complete name of the compound from the RankFilter output ''' name = name.split() poss_formula = name[-1] match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula) if match: return ' '.join(name[:-1]), poss_formula else: return ' '.join(name), False def _get_default_caslookup(): ''' The Cas Lookup tool might not have found all compounds in the library searched, this default dict will be used to combine with the Rank Filter output ''' return {'FORMULA': 'N/A', 'RI': '0.0', 'Regression.Column.Name': 'None', 'min': '0.0', 'max': '0.0', 'nr.duplicates': '0', 'Column.phase.type': 'N/A', 'Column.name': 'N/A'} def _save_data(data, nhits, out_csv_single, out_csv_multi): ''' Writes tab-separated data to file @param data: dictionary containing merged dataset @param out_csv: output csv file ''' # Columns we don't repeat: header_part1 = ['Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)', 'nr. Peaks', 'R.T.'] # These are the headers/columns we repeat in case of # combining hits in one line (see alternative_headers method below): header_part2 = [ 'Name', 'FORMULA', 'Library', 'CAS', 'Forward', 'Reverse', 'Avg. (Forward, Reverse)', 'RIexp', 'RI', 'RIsvr', 'RIexp - RIsvr', 'RI - RIexp', 'Regression.Column.Name', 'min', 'max', 'nr.duplicates', 'Column.phase.type', 'Column.name', 'Rank', '%rel.err', 'Synonyms'] # Open output file for writing outfile_single_handle = open(out_csv_single, 'wb') outfile_multi_handle = open(out_csv_multi, 'wb') output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") # Write headers output_single_handle.writerow(header_part1 + header_part2) output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) # Combine all hits for each centrotype into one line line = [] for centrotype_idx in xrange(len(data)): i = 0 for hit in data[centrotype_idx]: if i==0: line.extend(hit) else: line.extend(hit[6:]) i = i+1 # small validation (if error, it is a programming error): if i > nhits: raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) output_multi_handle.writerow(line) line = [] # Write one line for each centrotype for centrotype_idx in xrange(len(data)): for hit in data[centrotype_idx]: output_single_handle.writerow(hit) def alternative_headers(header_part2, nr_alternative_hits): ''' This method will iterate over the header names and add the string 'ALT#_' before each, where # is the number of the alternative, according to number of alternative hits we want to add to final csv/tsv ''' result = [] for i in xrange(nr_alternative_hits): for header_name in header_part2: result.append("ALT" + str(i+1) + "_" + header_name) return result def main(): ''' Combine Output main function It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" NB: the caslookup_result_file will typically have fewer lines than rankfilter_result_file, so the merge has to consider this as well. The final file should have the same nr of lines as rankfilter_result_file. ''' rankfilter_result_file = sys.argv[1] caslookup_result_file = sys.argv[2] output_single_csv = sys.argv[3] output_multi_csv = sys.argv[4] # Read RankFilter and CasLookup output files rankfilter = _process_data(rankfilter_result_file) caslookup = _process_data(caslookup_result_file) merged, nhits = _merge_data(rankfilter, caslookup) _save_data(merged, nhits, output_single_csv, output_multi_csv) if __name__ == '__main__': main()