view export_to_metexp_tabular.py @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children 0d1557b3d540
line wrap: on
line source

#!/usr/bin/env python
# encoding: utf-8
'''
Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
into a tabular file that can be uploaded to the MetExp database.

RankFilter, CasLookup are already combined by combine_output.py so here we will use
this result. Furthermore here one of the MsClust
quantification files containing the respective spectra details are to be combined as well. 

Extra calculations performed:
- The column MW is also added here and is derived from the column FORMULA found 
  in RankFilter, CasLookup combined result. 
  
So in total here we merge 2 files and calculate one new column. 
'''
from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
import csv
import re
import sys
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2013, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_data(in_csv, delim='\t'):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output

ONE_TO_ONE = 'one_to_one'
N_TO_ONE = 'n_to_one'

def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
    '''
    Merges data from both input dictionaries based on the link fields. This method will
    build up a new list containing the merged hits as the items. 
    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
    '''
    # TODO test for correct input files -> same link_field values should be there 
    # (test at least number of unique link_field values):
    #
    # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
    #    raise Exception('input files should have the same nr of key values  ')
    
    
    merged = []
    processed = {}
    for link_field_set1_idx in xrange(len(set1[link_field_set1])):
        link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
        if not link_field_set1_value in processed :
            # keep track of processed items to not repeat them
            processed[link_field_set1_value] = link_field_set1_value

            # Get the indices for current link_field_set1_value in both data-structures for proper matching
            set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
            set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
            # Validation :
            if len(set2index) == 0:
                # means that corresponding data could not be found in set2, then throw error
                raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + 
                                link_field_set1_value + " only found in first dataset. ")
            
            merged_hits = []
            # Combine hits
            for hit in xrange(len(set1index)):
                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
                # corresponding value in the sets; i.e. 
                # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
                #                    represented by "key". 
                # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
                # So set1[x][set1index[n]] = set1.attributeX.instanceN
                #
                # It just ensures the entry is made available as a plain named array for easy access.
                rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
                if relation_type == ONE_TO_ONE :
                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
                else:
                    # is N to 1:
                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
                
                merged_hit = merge_function(rf_record, cl_record, metadata)
                merged_hits.append(merged_hit)
                
            merged.append(merged_hits)

    return merged, len(set1index)


def _compare_records(key1, key2):
    '''
    in this case the compare method is really simple as both keys are expected to contain 
    same value when records are the same
    '''
    if key1 == key2:
        return True
    else:
        return False
    
    
    
def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
    '''
    Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
    
    @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
    @param msclust_quant_record: msclust quantification + spectrum record
    '''
    record = []
    for column in rank_caslookup_combi:
        record.append(rank_caslookup_combi[column])
    
    for column in msclust_quant_record:
        record.append(msclust_quant_record[column])
        
    for column in metadata:
        record.append(metadata[column])
        
    # add MOLECULAR MASS (MM) 
    molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
    # limit to two decimals:    
    record.append("{0:.2f}".format(molecular_mass))    
        
    # add MOLECULAR WEIGHT (MW) - TODO - calculate this
    record.append('0.0')    
    
    # level of identification and Location of reference standard
    record.append('0')
    record.append('')    
        
    return record


def get_molecular_mass(formula):
    '''
    Calculates the molecular mass (MM). 
    E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
    '''
    
    # Each element is represented by a capital letter, followed optionally by 
    # lower case, with one or more digits as for how many elements:
    element_pattern = re.compile("([A-Z][a-z]?)(\d*)")

    total_mass = 0
    for (element_name, count) in element_pattern.findall(formula):
        if count == "":
            count = 1
        else:
            count = int(count)
        element_mass = float(elements_and_masses_map[element_name])  # "found: Python's built-in float type has double precision " (? check if really correct ?)
        total_mass += element_mass * count
        
    return total_mass
    
    

def _save_data(data, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data: dictionary containing merged dataset
    @param out_csv: output csv file
    '''

    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write 
    for item_idx in xrange(len(data)):
        for hit in data[item_idx]:
            output_single_handle.writerow(hit)


def _get_map_for_elements_and_masses(elements_and_masses):
    '''
    This method will read out the column 'Chemical symbol' and make a map 
    of this, storing the column 'Relative atomic mass' as its value
    '''
    resultMap = {}
    index = 0
    for entry in elements_and_masses['Chemical symbol']:
        resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
        index += 1
        
    return resultMap


def init_elements_and_masses_map():
    '''
    Initializes the lookup map containing the elements and their respective masses
    '''
    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
    global elements_and_masses_map
    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
    

def main():
    '''
    Combine Output main function
    
    RankFilter, CasLookup are already combined by combine_output.py so here we will use
    this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
    quantification files are to be combined with combine_output.py result as well. 
    '''
    rankfilter_and_caslookup_combined_file = sys.argv[1]
    msclust_quantification_and_spectra_file = sys.argv[2]
    output_csv = sys.argv[3]
    # metadata
    metadata = OrderedDict()
    metadata['organism'] = sys.argv[4]
    metadata['tissue'] = sys.argv[5]
    metadata['experiment_name'] = sys.argv[6]
    metadata['user_name'] = sys.argv[7]
    metadata['column_type'] = sys.argv[8]

    # Read RankFilter and CasLookup output files
    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
    
    # Read elements and masses to use for the MW/MM calculation :
    init_elements_and_masses_map()
    
    merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', 
                                msclust_quantification_and_spectra, 'centrotype', 
                                _compare_records, _merge_records, metadata,
                                N_TO_ONE)
    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
    _save_data(merged, headers, output_csv)


if __name__ == '__main__':
    main()