Mercurial > repos > pieterlukasse > prims_metabolomics2
diff export_to_metexp_tabular.py @ 0:dffc38727496
initial commit
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 07 Feb 2015 22:02:00 +0100 |
parents | |
children | 0d1557b3d540 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,247 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust +into a tabular file that can be uploaded to the MetExp database. + +RankFilter, CasLookup are already combined by combine_output.py so here we will use +this result. Furthermore here one of the MsClust +quantification files containing the respective spectra details are to be combined as well. + +Extra calculations performed: +- The column MW is also added here and is derived from the column FORMULA found + in RankFilter, CasLookup combined result. + +So in total here we merge 2 files and calculate one new column. +''' +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import csv +import re +import sys +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2013, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_data(in_csv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim)) + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + +ONE_TO_ONE = 'one_to_one' +N_TO_ONE = 'n_to_one' + +def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): + ''' + Merges data from both input dictionaries based on the link fields. This method will + build up a new list containing the merged hits as the items. + @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) + @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) + ''' + # TODO test for correct input files -> same link_field values should be there + # (test at least number of unique link_field values): + # + # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): + # raise Exception('input files should have the same nr of key values ') + + + merged = [] + processed = {} + for link_field_set1_idx in xrange(len(set1[link_field_set1])): + link_field_set1_value = set1[link_field_set1][link_field_set1_idx] + if not link_field_set1_value in processed : + # keep track of processed items to not repeat them + processed[link_field_set1_value] = link_field_set1_value + + # Get the indices for current link_field_set1_value in both data-structures for proper matching + set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] + set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] + # Validation : + if len(set2index) == 0: + # means that corresponding data could not be found in set2, then throw error + raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + + link_field_set1_value + " only found in first dataset. ") + + merged_hits = [] + # Combine hits + for hit in xrange(len(set1index)): + # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do + # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its + # corresponding value in the sets; i.e. + # set1[key] => returns the list/array with size = nrrows, with the values for the attribute + # represented by "key". + # set1index[hit] => points to the row nr=hit (hit is a rownr/index) + # So set1[x][set1index[n]] = set1.attributeX.instanceN + # + # It just ensures the entry is made available as a plain named array for easy access. + rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) + if relation_type == ONE_TO_ONE : + cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) + else: + # is N to 1: + cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) + + merged_hit = merge_function(rf_record, cl_record, metadata) + merged_hits.append(merged_hit) + + merged.append(merged_hits) + + return merged, len(set1index) + + +def _compare_records(key1, key2): + ''' + in this case the compare method is really simple as both keys are expected to contain + same value when records are the same + ''' + if key1 == key2: + return True + else: + return False + + + +def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): + ''' + Combines single records from both the RankFilter+CasLookup combi file and from MsClust file + + @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) + @param msclust_quant_record: msclust quantification + spectrum record + ''' + record = [] + for column in rank_caslookup_combi: + record.append(rank_caslookup_combi[column]) + + for column in msclust_quant_record: + record.append(msclust_quant_record[column]) + + for column in metadata: + record.append(metadata[column]) + + # add MOLECULAR MASS (MM) + molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) + # limit to two decimals: + record.append("{0:.2f}".format(molecular_mass)) + + # add MOLECULAR WEIGHT (MW) - TODO - calculate this + record.append('0.0') + + # level of identification and Location of reference standard + record.append('0') + record.append('') + + return record + + +def get_molecular_mass(formula): + ''' + Calculates the molecular mass (MM). + E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O + ''' + + # Each element is represented by a capital letter, followed optionally by + # lower case, with one or more digits as for how many elements: + element_pattern = re.compile("([A-Z][a-z]?)(\d*)") + + total_mass = 0 + for (element_name, count) in element_pattern.findall(formula): + if count == "": + count = 1 + else: + count = int(count) + element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?) + total_mass += element_mass * count + + return total_mass + + + +def _save_data(data, headers, out_csv): + ''' + Writes tab-separated data to file + @param data: dictionary containing merged dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write + for item_idx in xrange(len(data)): + for hit in data[item_idx]: + output_single_handle.writerow(hit) + + +def _get_map_for_elements_and_masses(elements_and_masses): + ''' + This method will read out the column 'Chemical symbol' and make a map + of this, storing the column 'Relative atomic mass' as its value + ''' + resultMap = {} + index = 0 + for entry in elements_and_masses['Chemical symbol']: + resultMap[entry] = elements_and_masses['Relative atomic mass'][index] + index += 1 + + return resultMap + + +def init_elements_and_masses_map(): + ''' + Initializes the lookup map containing the elements and their respective masses + ''' + elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) + global elements_and_masses_map + elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) + + +def main(): + ''' + Combine Output main function + + RankFilter, CasLookup are already combined by combine_output.py so here we will use + this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust + quantification files are to be combined with combine_output.py result as well. + ''' + rankfilter_and_caslookup_combined_file = sys.argv[1] + msclust_quantification_and_spectra_file = sys.argv[2] + output_csv = sys.argv[3] + # metadata + metadata = OrderedDict() + metadata['organism'] = sys.argv[4] + metadata['tissue'] = sys.argv[5] + metadata['experiment_name'] = sys.argv[6] + metadata['user_name'] = sys.argv[7] + metadata['column_type'] = sys.argv[8] + + # Read RankFilter and CasLookup output files + rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) + msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') + + # Read elements and masses to use for the MW/MM calculation : + init_elements_and_masses_map() + + merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', + msclust_quantification_and_spectra, 'centrotype', + _compare_records, _merge_records, metadata, + N_TO_ONE) + headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] + _save_data(merged, headers, output_csv) + + +if __name__ == '__main__': + main()