| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 # encoding: utf-8
 | 
| 
 | 
     3 '''
 | 
| 
 | 
     4 Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
 | 
| 
 | 
     5 '''
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 import csv
 | 
| 
 | 
     8 import re
 | 
| 
 | 
     9 import sys
 | 
| 
 | 
    10 import math
 | 
| 
 | 
    11 import pprint
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 __author__ = "Marcel Kempenaar"
 | 
| 
 | 
    14 __contact__ = "brs@nbic.nl"
 | 
| 
 | 
    15 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
 | 
| 
 | 
    16 __license__ = "MIT"
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 def _process_data(in_csv):
 | 
| 
 | 
    19     '''
 | 
| 
 | 
    20     Generic method to parse a tab-separated file returning a dictionary with named columns
 | 
| 
 | 
    21     @param in_csv: input filename to be parsed
 | 
| 
 | 
    22     '''
 | 
| 
 | 
    23     data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
 | 
| 
 | 
    24     header = data.pop(0)
 | 
| 
 | 
    25     # Create dictionary with column name as key
 | 
| 
 | 
    26     output = {}
 | 
| 
 | 
    27     for index in xrange(len(header)):
 | 
| 
 | 
    28         output[header[index]] = [row[index] for row in data]
 | 
| 
 | 
    29     return output
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 
 | 
| 
 | 
    32 def _merge_data(rankfilter, caslookup):
 | 
| 
 | 
    33     '''
 | 
| 
 | 
    34     Merges data from both input dictionaries based on the Centrotype field. This method will
 | 
| 
 | 
    35     build up a new list containing the merged hits as the items. 
 | 
| 
 | 
    36     @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
 | 
| 
 | 
    37     @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
 | 
| 
 | 
    38     '''
 | 
| 
 | 
    39     # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:
 | 
| 
 | 
    40     if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
 | 
| 
 | 
    41         raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
 | 
| 
 | 
    42     
 | 
| 
 | 
    43     merged = []
 | 
| 
 | 
    44     processed = {}
 | 
| 
 | 
    45     for compound_id_idx in xrange(len(rankfilter['ID'])):
 | 
| 
 | 
    46         compound_id = rankfilter['ID'][compound_id_idx]
 | 
| 
 | 
    47         if not compound_id in processed :
 | 
| 
 | 
    48             # keep track of processed items to not repeat them
 | 
| 
 | 
    49             processed[compound_id] = compound_id
 | 
| 
 | 
    50             # get centrotype nr
 | 
| 
 | 
    51             centrotype = compound_id.split('-')[0]
 | 
| 
 | 
    52             # Get the indices for current compound ID in both data-structures for proper matching
 | 
| 
 | 
    53             rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
 | 
| 
 | 
    54             cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
 | 
| 
 | 
    55             
 | 
| 
 | 
    56             merged_hits = []
 | 
| 
 | 
    57             # Combine hits
 | 
| 
 | 
    58             for hit in xrange(len(rindex)):
 | 
| 
 | 
    59                 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
 | 
| 
 | 
    60                 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
 | 
| 
 | 
    61                 # corresponding value in the rankfilter or caslookup tables; i.e. 
 | 
| 
 | 
    62                 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
 | 
| 
 | 
    63                 #                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
 | 
| 
 | 
    64                 rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
 | 
| 
 | 
    65                 cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
 | 
| 
 | 
    66                 
 | 
| 
 | 
    67                 merged_hit = _add_hit(rf_record, cl_record)
 | 
| 
 | 
    68                 merged_hits.append(merged_hit)
 | 
| 
 | 
    69                 
 | 
| 
 | 
    70             merged.append(merged_hits)
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     return merged, len(rindex)
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 
 | 
| 
 | 
    75 def _add_hit(rankfilter, caslookup):
 | 
| 
 | 
    76     '''
 | 
| 
 | 
    77     Combines single records from both the RankFilter- and CasLookup-tools
 | 
| 
 | 
    78     @param rankfilter: record (dictionary) of one compound in the RankFilter output
 | 
| 
 | 
    79     @param caslookup: matching record (dictionary) of one compound in the CasLookup output
 | 
| 
 | 
    80     '''
 | 
| 
 | 
    81     # The ID in the RankFilter output contains the following 5 fields:
 | 
| 
 | 
    82     rf_id = rankfilter['ID'].split('-')
 | 
| 
 | 
    83     try:
 | 
| 
 | 
    84         name, formula = _remove_formula(rankfilter['Name'])
 | 
| 
 | 
    85         hit = [rf_id[0], # Centrotype
 | 
| 
 | 
    86                rf_id[1], # cent.Factor
 | 
| 
 | 
    87                rf_id[2], # scan nr
 | 
| 
 | 
    88                rf_id[3], # R.T. (umin)
 | 
| 
 | 
    89                rf_id[4], # nr. Peaks
 | 
| 
 | 
    90                # Appending other fields
 | 
| 
 | 
    91                rankfilter['R.T.'],
 | 
| 
 | 
    92                name,
 | 
| 
 | 
    93                caslookup['FORMULA'] if not formula else formula,
 | 
| 
 | 
    94                rankfilter['Library'].strip(),
 | 
| 
 | 
    95                rankfilter['CAS'].strip(),
 | 
| 
 | 
    96                rankfilter['Forward'],
 | 
| 
 | 
    97                rankfilter['Reverse'],
 | 
| 
 | 
    98                ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
 | 
| 
 | 
    99                rankfilter['RIexp'],
 | 
| 
 | 
   100                caslookup['RI'],
 | 
| 
 | 
   101                rankfilter['RIsvr'],
 | 
| 
 | 
   102                # Calculate absolute differences
 | 
| 
 | 
   103                math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
 | 
| 
 | 
   104                math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
 | 
| 
 | 
   105                caslookup['Regression.Column.Name'],
 | 
| 
 | 
   106                caslookup['min'],
 | 
| 
 | 
   107                caslookup['max'],
 | 
| 
 | 
   108                caslookup['nr.duplicates'],
 | 
| 
 | 
   109                caslookup['Column.phase.type'],
 | 
| 
 | 
   110                caslookup['Column.name'],
 | 
| 
 | 
   111                rankfilter['Rank'],
 | 
| 
 | 
   112                rankfilter['%rel.err'],
 | 
| 
 | 
   113                rankfilter['Synonyms']]
 | 
| 
 | 
   114     except KeyError as error:
 | 
| 
 | 
   115         print "Problem reading in data from input file(s):\n",
 | 
| 
 | 
   116         print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
 | 
| 
 | 
   117         print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
 | 
| 
 | 
   118         raise error
 | 
| 
 | 
   119 
 | 
| 
 | 
   120     return hit
 | 
| 
 | 
   121 
 | 
| 
 | 
   122 
 | 
| 
 | 
   123 def _remove_formula(name):
 | 
| 
 | 
   124     '''
 | 
| 
 | 
   125     The RankFilter Name field often contains the Formula as well, this function removes it from the Name
 | 
| 
 | 
   126     @param name: complete name of the compound from the RankFilter output
 | 
| 
 | 
   127     '''
 | 
| 
 | 
   128     name = name.split()
 | 
| 
 | 
   129     poss_formula = name[-1]
 | 
| 
 | 
   130     match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
 | 
| 
 | 
   131     if match:
 | 
| 
 | 
   132         return ' '.join(name[:-1]), poss_formula
 | 
| 
 | 
   133     else:
 | 
| 
 | 
   134         return ' '.join(name), False
 | 
| 
 | 
   135 
 | 
| 
 | 
   136 
 | 
| 
 | 
   137 def _get_default_caslookup():
 | 
| 
 | 
   138     '''
 | 
| 
 | 
   139     The Cas Lookup tool might not have found all compounds in the library searched,
 | 
| 
 | 
   140     this default dict will be used to combine with the Rank Filter output
 | 
| 
 | 
   141     '''
 | 
| 
 | 
   142     return {'FORMULA': 'N/A',
 | 
| 
 | 
   143             'RI': '0.0',
 | 
| 
 | 
   144             'Regression.Column.Name': 'None',
 | 
| 
 | 
   145             'min': '0.0',
 | 
| 
 | 
   146             'max': '0.0',
 | 
| 
 | 
   147             'nr.duplicates': '0',
 | 
| 
 | 
   148             'Column.phase.type': 'N/A',
 | 
| 
 | 
   149             'Column.name': 'N/A'}
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 def _save_data(data, nhits, out_csv_single, out_csv_multi):
 | 
| 
 | 
   153     '''
 | 
| 
 | 
   154     Writes tab-separated data to file
 | 
| 
 | 
   155     @param data: dictionary containing merged dataset
 | 
| 
 | 
   156     @param out_csv: output csv file
 | 
| 
 | 
   157     '''
 | 
| 
 | 
   158     header = ['Centrotype',
 | 
| 
 | 
   159               'cent.Factor',
 | 
| 
 | 
   160               'scan nr.',
 | 
| 
 | 
   161               'R.T. (umin)',
 | 
| 
 | 
   162               'nr. Peaks',
 | 
| 
 | 
   163               'R.T.',
 | 
| 
 | 
   164               'Name',
 | 
| 
 | 
   165               'FORMULA',
 | 
| 
 | 
   166               'Library',
 | 
| 
 | 
   167               'CAS',
 | 
| 
 | 
   168               'Forward',
 | 
| 
 | 
   169               'Reverse',
 | 
| 
 | 
   170               'Avg. (Forward, Reverse)',
 | 
| 
 | 
   171               'RIexp',
 | 
| 
 | 
   172               'RI',
 | 
| 
 | 
   173               'RIsvr',
 | 
| 
 | 
   174               'RIexp - RIsvr',
 | 
| 
 | 
   175               'RI - RIexp',
 | 
| 
 | 
   176               'Regression.Column.Name',
 | 
| 
 | 
   177               'min',
 | 
| 
 | 
   178               'max',
 | 
| 
 | 
   179               'nr.duplicates',
 | 
| 
 | 
   180               'Column.phase.type',
 | 
| 
 | 
   181               'Column.name',
 | 
| 
 | 
   182               'Rank',
 | 
| 
 | 
   183               '%rel.err',
 | 
| 
 | 
   184               'Synonyms']
 | 
| 
 | 
   185 
 | 
| 
 | 
   186     # Open output file for writing
 | 
| 
 | 
   187     outfile_single_handle = open(out_csv_single, 'wb')
 | 
| 
 | 
   188     outfile_multi_handle = open(out_csv_multi, 'wb')
 | 
| 
 | 
   189     output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
 | 
| 
 | 
   190     output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
 | 
| 
 | 
   191 
 | 
| 
 | 
   192     # Write headers
 | 
| 
 | 
   193     output_single_handle.writerow(header)
 | 
| 
 | 
   194     output_multi_handle.writerow(header * nhits)
 | 
| 
 | 
   195     # Combine all hits for each centrotype into one line
 | 
| 
 | 
   196     line = []
 | 
| 
 | 
   197     for centrotype_idx in xrange(len(data)):
 | 
| 
 | 
   198         for hit in data[centrotype_idx]:
 | 
| 
 | 
   199             line.extend(hit)
 | 
| 
 | 
   200         output_multi_handle.writerow(line)
 | 
| 
 | 
   201         line = []
 | 
| 
 | 
   202 
 | 
| 
 | 
   203     # Write one line for each centrotype
 | 
| 
 | 
   204     for centrotype_idx in xrange(len(data)):
 | 
| 
 | 
   205         for hit in data[centrotype_idx]:
 | 
| 
 | 
   206             output_single_handle.writerow(hit)
 | 
| 
 | 
   207 
 | 
| 
 | 
   208 
 | 
| 
 | 
   209 def main():
 | 
| 
 | 
   210     '''
 | 
| 
 | 
   211     Combine Output main function
 | 
| 
 | 
   212     It will merge the result files from "RankFilter"  and "Lookup RI for CAS numbers" 
 | 
| 
 | 
   213     NB: the caslookup_result_file will typically have fewer lines than
 | 
| 
 | 
   214     rankfilter_result_file, so the merge has to consider this as well. The final file
 | 
| 
 | 
   215     should have the same nr of lines as rankfilter_result_file.
 | 
| 
 | 
   216     '''
 | 
| 
 | 
   217     rankfilter_result_file = sys.argv[1]
 | 
| 
 | 
   218     caslookup_result_file = sys.argv[2]
 | 
| 
 | 
   219     output_single_csv = sys.argv[3]
 | 
| 
 | 
   220     output_multi_csv = sys.argv[4]
 | 
| 
 | 
   221 
 | 
| 
 | 
   222     # Read RankFilter and CasLookup output files
 | 
| 
 | 
   223     rankfilter = _process_data(rankfilter_result_file)
 | 
| 
 | 
   224     caslookup = _process_data(caslookup_result_file)
 | 
| 
 | 
   225     merged, nhits = _merge_data(rankfilter, caslookup)
 | 
| 
 | 
   226     _save_data(merged, nhits, output_single_csv, output_multi_csv)
 | 
| 
 | 
   227 
 | 
| 
 | 
   228 
 | 
| 
 | 
   229 if __name__ == '__main__':
 | 
| 
 | 
   230     main()
 |