Mercurial > repos > pieterlukasse > prims_metabolomics2
annotate GCMS/combine_output.py @ 21:43902da5d00e
changed match_library location again
| author | linda.bakker@wur.nl <linda.bakker@wur.nl> | 
|---|---|
| date | Wed, 06 May 2015 08:06:53 +0200 | 
| parents | fe4682eb938c | 
| children | 
| rev | line source | 
|---|---|
| 6 | 1 #!/usr/bin/env python | 
| 2 # encoding: utf-8 | |
| 3 ''' | |
| 4 Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup) | |
| 5 ''' | |
| 6 | |
| 7 import csv | |
| 8 import sys | |
| 9 import math | |
| 10 import pprint | |
| 11 | |
| 12 __author__ = "Marcel Kempenaar" | |
| 13 __contact__ = "brs@nbic.nl" | |
| 14 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" | |
| 15 __license__ = "MIT" | |
| 16 | |
| 17 def _process_data(in_csv): | |
| 18 ''' | |
| 19 Generic method to parse a tab-separated file returning a dictionary with named columns | |
| 20 @param in_csv: input filename to be parsed | |
| 21 ''' | |
| 22 data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) | |
| 23 header = data.pop(0) | |
| 24 # Create dictionary with column name as key | |
| 25 output = {} | |
| 26 for index in xrange(len(header)): | |
| 27 output[header[index]] = [row[index] for row in data] | |
| 28 return output | |
| 29 | |
| 30 | |
| 31 def _merge_data(rankfilter, caslookup): | |
| 32 ''' | |
| 33 Merges data from both input dictionaries based on the Centrotype field. This method will | |
| 34 build up a new list containing the merged hits as the items. | |
| 35 @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name) | |
| 36 @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name) | |
| 37 ''' | |
| 38 # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts: | |
| 39 if (len(rankfilter['ID']) != len(caslookup['Centrotype'])): | |
| 40 raise Exception('rankfilter and caslookup files should have the same nr of rows/records ') | |
| 41 | |
| 42 merged = [] | |
| 43 processed = {} | |
| 44 for compound_id_idx in xrange(len(rankfilter['ID'])): | |
| 45 compound_id = rankfilter['ID'][compound_id_idx] | |
| 46 if not compound_id in processed : | |
| 47 # keep track of processed items to not repeat them | |
| 48 processed[compound_id] = compound_id | |
| 49 # get centrotype nr | |
| 50 centrotype = compound_id.split('-')[0] | |
| 51 # Get the indices for current compound ID in both data-structures for proper matching | |
| 52 rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id] | |
| 53 cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype] | |
| 54 | |
| 55 merged_hits = [] | |
| 56 # Combine hits | |
| 57 for hit in xrange(len(rindex)): | |
| 58 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do | |
| 59 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its | |
| 60 # corresponding value in the rankfilter or caslookup tables; i.e. | |
| 61 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute | |
| 62 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) | |
| 63 rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()])) | |
| 64 cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()])) | |
| 65 | |
| 66 merged_hit = _add_hit(rf_record, cl_record) | |
| 67 merged_hits.append(merged_hit) | |
| 68 | |
| 69 merged.append(merged_hits) | |
| 70 | |
| 71 return merged, len(rindex) | |
| 72 | |
| 73 | |
| 74 def _add_hit(rankfilter, caslookup): | |
| 75 ''' | |
| 76 Combines single records from both the RankFilter- and CasLookup-tools | |
| 77 @param rankfilter: record (dictionary) of one compound in the RankFilter output | |
| 78 @param caslookup: matching record (dictionary) of one compound in the CasLookup output | |
| 79 ''' | |
| 80 # The ID in the RankFilter output contains the following 5 fields: | |
| 81 rf_id = rankfilter['ID'].split('-') | |
| 82 try: | |
| 16 | 83 if 'Formula' not in rankfilter: | 
| 84 raise Exception("Error: old Rankfilter format detected (the selected Rankfilter data does not contain the column 'Formula'). Solution: rerun Rankfilter again.") | |
| 6 | 85 hit = [rf_id[0], # Centrotype | 
| 86 rf_id[1], # cent.Factor | |
| 87 rf_id[2], # scan nr | |
| 88 rf_id[3], # R.T. (umin) | |
| 89 rf_id[4], # nr. Peaks | |
| 16 | 90 rankfilter['R.T.'], | 
| 6 | 91 # Appending other fields | 
| 14 
346ff9ad8c7a
fix for rankfilter, removed pfd read functional
 linda.bakker@wur.nl <linda.bakker@wur.nl> parents: 
6diff
changeset | 92 rankfilter['Name'], | 
| 
346ff9ad8c7a
fix for rankfilter, removed pfd read functional
 linda.bakker@wur.nl <linda.bakker@wur.nl> parents: 
6diff
changeset | 93 rankfilter['Formula'], | 
| 6 | 94 rankfilter['Library'].strip(), | 
| 95 rankfilter['CAS'].strip(), | |
| 96 rankfilter['Forward'], | |
| 97 rankfilter['Reverse'], | |
| 98 ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2), | |
| 99 rankfilter['RIexp'], | |
| 100 caslookup['RI'], | |
| 101 rankfilter['RIsvr'], | |
| 102 # Calculate absolute differences | |
| 103 math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])), | |
| 104 math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])), | |
| 105 caslookup['Regression.Column.Name'], | |
| 106 caslookup['min'], | |
| 107 caslookup['max'], | |
| 108 caslookup['nr.duplicates'], | |
| 109 caslookup['Column.phase.type'], | |
| 110 caslookup['Column.name'], | |
| 111 rankfilter['Rank'], | |
| 112 rankfilter['%rel.err'], | |
| 113 rankfilter['Synonyms']] | |
| 114 except KeyError as error: | |
| 115 print "Problem reading in data from input file(s):\n", | |
| 116 print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n" | |
| 117 print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n" | |
| 118 raise error | |
| 119 | |
| 120 return hit | |
| 121 | |
| 122 | |
| 14 
346ff9ad8c7a
fix for rankfilter, removed pfd read functional
 linda.bakker@wur.nl <linda.bakker@wur.nl> parents: 
6diff
changeset | 123 | 
| 6 | 124 | 
| 125 | |
| 126 def _get_default_caslookup(): | |
| 127 ''' | |
| 128 The Cas Lookup tool might not have found all compounds in the library searched, | |
| 129 this default dict will be used to combine with the Rank Filter output | |
| 130 ''' | |
| 131 return {'FORMULA': 'N/A', | |
| 132 'RI': '0.0', | |
| 133 'Regression.Column.Name': 'None', | |
| 134 'min': '0.0', | |
| 135 'max': '0.0', | |
| 136 'nr.duplicates': '0', | |
| 137 'Column.phase.type': 'N/A', | |
| 138 'Column.name': 'N/A'} | |
| 139 | |
| 140 | |
| 141 def _save_data(data, nhits, out_csv_single, out_csv_multi): | |
| 142 ''' | |
| 143 Writes tab-separated data to file | |
| 144 @param data: dictionary containing merged dataset | |
| 145 @param out_csv: output csv file | |
| 146 ''' | |
| 147 # Columns we don't repeat: | |
| 148 header_part1 = ['Centrotype', | |
| 149 'cent.Factor', | |
| 150 'scan nr.', | |
| 151 'R.T. (umin)', | |
| 152 'nr. Peaks', | |
| 153 'R.T.'] | |
| 154 # These are the headers/columns we repeat in case of | |
| 155 # combining hits in one line (see alternative_headers method below): | |
| 156 header_part2 = [ | |
| 157 'Name', | |
| 158 'FORMULA', | |
| 159 'Library', | |
| 160 'CAS', | |
| 161 'Forward', | |
| 162 'Reverse', | |
| 163 'Avg. (Forward, Reverse)', | |
| 164 'RIexp', | |
| 165 'RI', | |
| 166 'RIsvr', | |
| 167 'RIexp - RIsvr', | |
| 168 'RI - RIexp', | |
| 169 'Regression.Column.Name', | |
| 170 'min', | |
| 171 'max', | |
| 172 'nr.duplicates', | |
| 173 'Column.phase.type', | |
| 174 'Column.name', | |
| 175 'Rank', | |
| 176 '%rel.err', | |
| 177 'Synonyms'] | |
| 178 | |
| 179 # Open output file for writing | |
| 180 outfile_single_handle = open(out_csv_single, 'wb') | |
| 181 outfile_multi_handle = open(out_csv_multi, 'wb') | |
| 182 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | |
| 183 output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") | |
| 184 | |
| 185 # Write headers | |
| 186 output_single_handle.writerow(header_part1 + header_part2) | |
| 187 output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) | |
| 188 # Combine all hits for each centrotype into one line | |
| 189 line = [] | |
| 190 for centrotype_idx in xrange(len(data)): | |
| 191 i = 0 | |
| 192 for hit in data[centrotype_idx]: | |
| 193 if i==0: | |
| 194 line.extend(hit) | |
| 195 else: | |
| 196 line.extend(hit[6:]) | |
| 197 i = i+1 | |
| 198 # small validation (if error, it is a programming error): | |
| 199 if i > nhits: | |
| 200 raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) | |
| 201 output_multi_handle.writerow(line) | |
| 202 line = [] | |
| 203 | |
| 204 # Write one line for each centrotype | |
| 205 for centrotype_idx in xrange(len(data)): | |
| 206 for hit in data[centrotype_idx]: | |
| 207 output_single_handle.writerow(hit) | |
| 208 | |
| 209 def alternative_headers(header_part2, nr_alternative_hits): | |
| 210 ''' | |
| 211 This method will iterate over the header names and add the string 'ALT#_' before each, | |
| 212 where # is the number of the alternative, according to number of alternative hits we want to add | |
| 213 to final csv/tsv | |
| 214 ''' | |
| 215 result = [] | |
| 216 for i in xrange(nr_alternative_hits): | |
| 217 for header_name in header_part2: | |
| 218 result.append("ALT" + str(i+1) + "_" + header_name) | |
| 219 return result | |
| 220 | |
| 221 def main(): | |
| 222 ''' | |
| 223 Combine Output main function | |
| 224 It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" | |
| 225 NB: the caslookup_result_file will typically have fewer lines than | |
| 226 rankfilter_result_file, so the merge has to consider this as well. The final file | |
| 227 should have the same nr of lines as rankfilter_result_file. | |
| 228 ''' | |
| 229 rankfilter_result_file = sys.argv[1] | |
| 230 caslookup_result_file = sys.argv[2] | |
| 231 output_single_csv = sys.argv[3] | |
| 232 output_multi_csv = sys.argv[4] | |
| 233 | |
| 234 # Read RankFilter and CasLookup output files | |
| 235 rankfilter = _process_data(rankfilter_result_file) | |
| 236 caslookup = _process_data(caslookup_result_file) | |
| 237 merged, nhits = _merge_data(rankfilter, caslookup) | |
| 238 _save_data(merged, nhits, output_single_csv, output_multi_csv) | |
| 239 | |
| 240 | |
| 241 if __name__ == '__main__': | |
| 242 main() | 
