comparison combine_output.py @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dffc38727496
1 #!/usr/bin/env python
2 # encoding: utf-8
3 '''
4 Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
5 '''
6
7 import csv
8 import re
9 import sys
10 import math
11 import pprint
12
13 __author__ = "Marcel Kempenaar"
14 __contact__ = "brs@nbic.nl"
15 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
16 __license__ = "MIT"
17
18 def _process_data(in_csv):
19 '''
20 Generic method to parse a tab-separated file returning a dictionary with named columns
21 @param in_csv: input filename to be parsed
22 '''
23 data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
24 header = data.pop(0)
25 # Create dictionary with column name as key
26 output = {}
27 for index in xrange(len(header)):
28 output[header[index]] = [row[index] for row in data]
29 return output
30
31
32 def _merge_data(rankfilter, caslookup):
33 '''
34 Merges data from both input dictionaries based on the Centrotype field. This method will
35 build up a new list containing the merged hits as the items.
36 @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
37 @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
38 '''
39 # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:
40 if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
41 raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
42
43 merged = []
44 processed = {}
45 for compound_id_idx in xrange(len(rankfilter['ID'])):
46 compound_id = rankfilter['ID'][compound_id_idx]
47 if not compound_id in processed :
48 # keep track of processed items to not repeat them
49 processed[compound_id] = compound_id
50 # get centrotype nr
51 centrotype = compound_id.split('-')[0]
52 # Get the indices for current compound ID in both data-structures for proper matching
53 rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
54 cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
55
56 merged_hits = []
57 # Combine hits
58 for hit in xrange(len(rindex)):
59 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
60 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
61 # corresponding value in the rankfilter or caslookup tables; i.e.
62 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
63 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
64 rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
65 cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
66
67 merged_hit = _add_hit(rf_record, cl_record)
68 merged_hits.append(merged_hit)
69
70 merged.append(merged_hits)
71
72 return merged, len(rindex)
73
74
75 def _add_hit(rankfilter, caslookup):
76 '''
77 Combines single records from both the RankFilter- and CasLookup-tools
78 @param rankfilter: record (dictionary) of one compound in the RankFilter output
79 @param caslookup: matching record (dictionary) of one compound in the CasLookup output
80 '''
81 # The ID in the RankFilter output contains the following 5 fields:
82 rf_id = rankfilter['ID'].split('-')
83 try:
84 name, formula = _remove_formula(rankfilter['Name'])
85 hit = [rf_id[0], # Centrotype
86 rf_id[1], # cent.Factor
87 rf_id[2], # scan nr
88 rf_id[3], # R.T. (umin)
89 rf_id[4], # nr. Peaks
90 # Appending other fields
91 rankfilter['R.T.'],
92 name,
93 caslookup['FORMULA'] if not formula else formula,
94 rankfilter['Library'].strip(),
95 rankfilter['CAS'].strip(),
96 rankfilter['Forward'],
97 rankfilter['Reverse'],
98 ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
99 rankfilter['RIexp'],
100 caslookup['RI'],
101 rankfilter['RIsvr'],
102 # Calculate absolute differences
103 math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
104 math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
105 caslookup['Regression.Column.Name'],
106 caslookup['min'],
107 caslookup['max'],
108 caslookup['nr.duplicates'],
109 caslookup['Column.phase.type'],
110 caslookup['Column.name'],
111 rankfilter['Rank'],
112 rankfilter['%rel.err'],
113 rankfilter['Synonyms']]
114 except KeyError as error:
115 print "Problem reading in data from input file(s):\n",
116 print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
117 print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
118 raise error
119
120 return hit
121
122
123 def _remove_formula(name):
124 '''
125 The RankFilter Name field often contains the Formula as well, this function removes it from the Name
126 @param name: complete name of the compound from the RankFilter output
127 '''
128 name = name.split()
129 poss_formula = name[-1]
130 match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
131 if match:
132 return ' '.join(name[:-1]), poss_formula
133 else:
134 return ' '.join(name), False
135
136
137 def _get_default_caslookup():
138 '''
139 The Cas Lookup tool might not have found all compounds in the library searched,
140 this default dict will be used to combine with the Rank Filter output
141 '''
142 return {'FORMULA': 'N/A',
143 'RI': '0.0',
144 'Regression.Column.Name': 'None',
145 'min': '0.0',
146 'max': '0.0',
147 'nr.duplicates': '0',
148 'Column.phase.type': 'N/A',
149 'Column.name': 'N/A'}
150
151
152 def _save_data(data, nhits, out_csv_single, out_csv_multi):
153 '''
154 Writes tab-separated data to file
155 @param data: dictionary containing merged dataset
156 @param out_csv: output csv file
157 '''
158 # Columns we don't repeat:
159 header_part1 = ['Centrotype',
160 'cent.Factor',
161 'scan nr.',
162 'R.T. (umin)',
163 'nr. Peaks',
164 'R.T.']
165 # These are the headers/columns we repeat in case of
166 # combining hits in one line (see alternative_headers method below):
167 header_part2 = [
168 'Name',
169 'FORMULA',
170 'Library',
171 'CAS',
172 'Forward',
173 'Reverse',
174 'Avg. (Forward, Reverse)',
175 'RIexp',
176 'RI',
177 'RIsvr',
178 'RIexp - RIsvr',
179 'RI - RIexp',
180 'Regression.Column.Name',
181 'min',
182 'max',
183 'nr.duplicates',
184 'Column.phase.type',
185 'Column.name',
186 'Rank',
187 '%rel.err',
188 'Synonyms']
189
190 # Open output file for writing
191 outfile_single_handle = open(out_csv_single, 'wb')
192 outfile_multi_handle = open(out_csv_multi, 'wb')
193 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
194 output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
195
196 # Write headers
197 output_single_handle.writerow(header_part1 + header_part2)
198 output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
199 # Combine all hits for each centrotype into one line
200 line = []
201 for centrotype_idx in xrange(len(data)):
202 i = 0
203 for hit in data[centrotype_idx]:
204 if i==0:
205 line.extend(hit)
206 else:
207 line.extend(hit[6:])
208 i = i+1
209 # small validation (if error, it is a programming error):
210 if i > nhits:
211 raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx)
212 output_multi_handle.writerow(line)
213 line = []
214
215 # Write one line for each centrotype
216 for centrotype_idx in xrange(len(data)):
217 for hit in data[centrotype_idx]:
218 output_single_handle.writerow(hit)
219
220 def alternative_headers(header_part2, nr_alternative_hits):
221 '''
222 This method will iterate over the header names and add the string 'ALT#_' before each,
223 where # is the number of the alternative, according to number of alternative hits we want to add
224 to final csv/tsv
225 '''
226 result = []
227 for i in xrange(nr_alternative_hits):
228 for header_name in header_part2:
229 result.append("ALT" + str(i+1) + "_" + header_name)
230 return result
231
232 def main():
233 '''
234 Combine Output main function
235 It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
236 NB: the caslookup_result_file will typically have fewer lines than
237 rankfilter_result_file, so the merge has to consider this as well. The final file
238 should have the same nr of lines as rankfilter_result_file.
239 '''
240 rankfilter_result_file = sys.argv[1]
241 caslookup_result_file = sys.argv[2]
242 output_single_csv = sys.argv[3]
243 output_multi_csv = sys.argv[4]
244
245 # Read RankFilter and CasLookup output files
246 rankfilter = _process_data(rankfilter_result_file)
247 caslookup = _process_data(caslookup_result_file)
248 merged, nhits = _merge_data(rankfilter, caslookup)
249 _save_data(merged, nhits, output_single_csv, output_multi_csv)
250
251
252 if __name__ == '__main__':
253 main()