view combine_output.py @ 59:458e05d1d172

fix for xcms support in msclust
author pieter.lukasse@wur.nl
date Fri, 12 Dec 2014 12:28:08 +0100
parents 19d8fd10248e
children
line wrap: on
line source

#!/usr/bin/env python
# encoding: utf-8
'''
Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
'''

import csv
import re
import sys
import math
import pprint

__author__ = "Marcel Kempenaar"
__contact__ = "brs@nbic.nl"
__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
__license__ = "MIT"

def _process_data(in_csv):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output


def _merge_data(rankfilter, caslookup):
    '''
    Merges data from both input dictionaries based on the Centrotype field. This method will
    build up a new list containing the merged hits as the items. 
    @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
    @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
    '''
    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:
    if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
        raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
    
    merged = []
    processed = {}
    for compound_id_idx in xrange(len(rankfilter['ID'])):
        compound_id = rankfilter['ID'][compound_id_idx]
        if not compound_id in processed :
            # keep track of processed items to not repeat them
            processed[compound_id] = compound_id
            # get centrotype nr
            centrotype = compound_id.split('-')[0]
            # Get the indices for current compound ID in both data-structures for proper matching
            rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
            cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
            
            merged_hits = []
            # Combine hits
            for hit in xrange(len(rindex)):
                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
                # corresponding value in the rankfilter or caslookup tables; i.e. 
                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
                #                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
                rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
                cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
                
                merged_hit = _add_hit(rf_record, cl_record)
                merged_hits.append(merged_hit)
                
            merged.append(merged_hits)

    return merged, len(rindex)


def _add_hit(rankfilter, caslookup):
    '''
    Combines single records from both the RankFilter- and CasLookup-tools
    @param rankfilter: record (dictionary) of one compound in the RankFilter output
    @param caslookup: matching record (dictionary) of one compound in the CasLookup output
    '''
    # The ID in the RankFilter output contains the following 5 fields:
    rf_id = rankfilter['ID'].split('-')
    try:
        name, formula = _remove_formula(rankfilter['Name'])
        hit = [rf_id[0], # Centrotype
               rf_id[1], # cent.Factor
               rf_id[2], # scan nr
               rf_id[3], # R.T. (umin)
               rf_id[4], # nr. Peaks
               # Appending other fields
               rankfilter['R.T.'],
               name,
               caslookup['FORMULA'] if not formula else formula,
               rankfilter['Library'].strip(),
               rankfilter['CAS'].strip(),
               rankfilter['Forward'],
               rankfilter['Reverse'],
               ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
               rankfilter['RIexp'],
               caslookup['RI'],
               rankfilter['RIsvr'],
               # Calculate absolute differences
               math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
               math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
               caslookup['Regression.Column.Name'],
               caslookup['min'],
               caslookup['max'],
               caslookup['nr.duplicates'],
               caslookup['Column.phase.type'],
               caslookup['Column.name'],
               rankfilter['Rank'],
               rankfilter['%rel.err'],
               rankfilter['Synonyms']]
    except KeyError as error:
        print "Problem reading in data from input file(s):\n",
        print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
        print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
        raise error

    return hit


def _remove_formula(name):
    '''
    The RankFilter Name field often contains the Formula as well, this function removes it from the Name
    @param name: complete name of the compound from the RankFilter output
    '''
    name = name.split()
    poss_formula = name[-1]
    match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
    if match:
        return ' '.join(name[:-1]), poss_formula
    else:
        return ' '.join(name), False


def _get_default_caslookup():
    '''
    The Cas Lookup tool might not have found all compounds in the library searched,
    this default dict will be used to combine with the Rank Filter output
    '''
    return {'FORMULA': 'N/A',
            'RI': '0.0',
            'Regression.Column.Name': 'None',
            'min': '0.0',
            'max': '0.0',
            'nr.duplicates': '0',
            'Column.phase.type': 'N/A',
            'Column.name': 'N/A'}


def _save_data(data, nhits, out_csv_single, out_csv_multi):
    '''
    Writes tab-separated data to file
    @param data: dictionary containing merged dataset
    @param out_csv: output csv file
    '''
    # Columns we don't repeat:
    header_part1 = ['Centrotype',
              'cent.Factor',
              'scan nr.',
              'R.T. (umin)',
              'nr. Peaks',
              'R.T.']
    # These are the headers/columns we repeat in case of 
    # combining hits in one line (see alternative_headers method below):
    header_part2 = [
              'Name',
              'FORMULA',
              'Library',
              'CAS',
              'Forward',
              'Reverse',
              'Avg. (Forward, Reverse)',
              'RIexp',
              'RI',
              'RIsvr',
              'RIexp - RIsvr',
              'RI - RIexp',
              'Regression.Column.Name',
              'min',
              'max',
              'nr.duplicates',
              'Column.phase.type',
              'Column.name',
              'Rank',
              '%rel.err',
              'Synonyms']

    # Open output file for writing
    outfile_single_handle = open(out_csv_single, 'wb')
    outfile_multi_handle = open(out_csv_multi, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
    output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(header_part1 + header_part2)
    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
    # Combine all hits for each centrotype into one line
    line = []
    for centrotype_idx in xrange(len(data)):
        i = 0
        for hit in data[centrotype_idx]:
            if i==0:
                line.extend(hit)
            else:
                line.extend(hit[6:])
            i = i+1
        # small validation (if error, it is a programming error):
        if i > nhits:
            raise Exception('Error: more hits that expected for  centrotype_idx ' + centrotype_idx)
        output_multi_handle.writerow(line)
        line = []

    # Write one line for each centrotype
    for centrotype_idx in xrange(len(data)):
        for hit in data[centrotype_idx]:
            output_single_handle.writerow(hit)

def alternative_headers(header_part2, nr_alternative_hits):
    ''' 
    This method will iterate over the header names and add the string 'ALT#_' before each, 
    where # is the number of the alternative, according to number of alternative hits we want to add
    to final csv/tsv
    '''
    result = []
    for i in xrange(nr_alternative_hits): 
        for header_name in header_part2:
            result.append("ALT" + str(i+1) + "_" + header_name) 
    return result

def main():
    '''
    Combine Output main function
    It will merge the result files from "RankFilter"  and "Lookup RI for CAS numbers" 
    NB: the caslookup_result_file will typically have fewer lines than
    rankfilter_result_file, so the merge has to consider this as well. The final file
    should have the same nr of lines as rankfilter_result_file.
    '''
    rankfilter_result_file = sys.argv[1]
    caslookup_result_file = sys.argv[2]
    output_single_csv = sys.argv[3]
    output_multi_csv = sys.argv[4]

    # Read RankFilter and CasLookup output files
    rankfilter = _process_data(rankfilter_result_file)
    caslookup = _process_data(caslookup_result_file)
    merged, nhits = _merge_data(rankfilter, caslookup)
    _save_data(merged, nhits, output_single_csv, output_multi_csv)


if __name__ == '__main__':
    main()