Mercurial > repos > pieterlukasse > prims_metabolomics
view export_to_metexp_tabular.py @ 19:c068ed713eb9
fixes on <when> tags
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 31 Jan 2014 12:21:51 +0100 |
parents | 9d5f4f5f764b |
children | 19d8fd10248e |
line wrap: on
line source
#!/usr/bin/env python # encoding: utf-8 ''' Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust into a tabular file that can be uploaded to the MetExp database. RankFilter, CasLookup are already combined by combine_output.py so here we will use this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust quantification files are to be combined with combine_output.py result as well. Extra calculations performed: - The column MW is also added here and is derived from the column FORMULA found in combine_output.py result. So in total here we merge 3 files and calculate one new column. ''' import csv import sys from collections import OrderedDict __author__ = "Pieter Lukasse" __contact__ = "pieter.lukasse@wur.nl" __copyright__ = "Copyright, 2013, Plant Research International, WUR" __license__ = "Apache v2" def _process_data(in_csv, delim='\t'): ''' Generic method to parse a tab-separated file returning a dictionary with named columns @param in_csv: input filename to be parsed ''' data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim)) header = data.pop(0) # Create dictionary with column name as key output = OrderedDict() for index in xrange(len(header)): output[header[index]] = [row[index] for row in data] return output ONE_TO_ONE = 'one_to_one' N_TO_ONE = 'n_to_one' def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE): ''' Merges data from both input dictionaries based on the link fields. This method will build up a new list containing the merged hits as the items. @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) ''' # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values): # # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): # raise Exception('input files should have the same nr of key values ') merged = [] processed = {} for link_field_set1_idx in xrange(len(set1[link_field_set1])): link_field_set1_value = set1[link_field_set1][link_field_set1_idx] if not link_field_set1_value in processed : # keep track of processed items to not repeat them processed[link_field_set1_value] = link_field_set1_value # Get the indices for current link_field_set1_value in both data-structures for proper matching set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] merged_hits = [] # Combine hits for hit in xrange(len(set1index)): # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its # corresponding value in the rankfilter or caslookup tables; i.e. # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) # It just ensures the entry is made available as a plain named array for easy access. rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) if relation_type == ONE_TO_ONE : cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) else: # is N to 1: cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) merged_hit = merge_function(rf_record, cl_record) merged_hits.append(merged_hit) merged.append(merged_hits) return merged, len(set1index) def _compare_records(key1, key2): ''' in this case the compare method is really simple as both keys are expected to contain same value when records are the same ''' if key1 == key2: return True else: return False def _merge_records(rank_caslookup_combi, msclust_quant_record): ''' Combines single records from both the RankFilter+CasLookup combi file and from MsClust file @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) @param msclust_quant_record: msclust quantification + spectrum record ''' i = 0 record = [] for column in rank_caslookup_combi: record.append(rank_caslookup_combi[column]) i += 1 for column in msclust_quant_record: record.append(msclust_quant_record[column]) i += 1 return record def _save_data(data, headers, nhits, out_csv): ''' Writes tab-separated data to file @param data: dictionary containing merged dataset @param out_csv: output csv file ''' # Open output file for writing outfile_single_handle = open(out_csv, 'wb') output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") # Write headers output_single_handle.writerow(headers) # Write one line for each centrotype for centrotype_idx in xrange(len(data)): for hit in data[centrotype_idx]: output_single_handle.writerow(hit) def main(): ''' Combine Output main function RankFilter, CasLookup are already combined by combine_output.py so here we will use this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust quantification files are to be combined with combine_output.py result as well. ''' rankfilter_and_caslookup_combined_file = sys.argv[1] msclust_quantification_and_spectra_file = sys.argv[2] output_csv = sys.argv[3] # Read RankFilter and CasLookup output files rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE) headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() _save_data(merged, headers, nhits, output_csv) if __name__ == '__main__': main()