0
|
1 #!/usr/bin/env python
|
|
2 # encoding: utf-8
|
|
3 '''
|
|
4 Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
|
|
5 '''
|
|
6
|
|
7 import csv
|
|
8 import re
|
|
9 import sys
|
|
10 import math
|
|
11 import pprint
|
|
12
|
|
13 __author__ = "Marcel Kempenaar"
|
|
14 __contact__ = "brs@nbic.nl"
|
|
15 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
|
|
16 __license__ = "MIT"
|
|
17
|
|
18 def _process_data(in_csv):
|
|
19 '''
|
|
20 Generic method to parse a tab-separated file returning a dictionary with named columns
|
|
21 @param in_csv: input filename to be parsed
|
|
22 '''
|
|
23 data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
|
|
24 header = data.pop(0)
|
|
25 # Create dictionary with column name as key
|
|
26 output = {}
|
|
27 for index in xrange(len(header)):
|
|
28 output[header[index]] = [row[index] for row in data]
|
|
29 return output
|
|
30
|
|
31
|
|
32 def _merge_data(rankfilter, caslookup):
|
|
33 '''
|
|
34 Merges data from both input dictionaries based on the Centrotype field. This method will
|
|
35 build up a new list containing the merged hits as the items.
|
|
36 @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
|
|
37 @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
|
|
38 '''
|
|
39 # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:
|
|
40 if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
|
|
41 raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
|
|
42
|
|
43 merged = []
|
|
44 processed = {}
|
|
45 for compound_id_idx in xrange(len(rankfilter['ID'])):
|
|
46 compound_id = rankfilter['ID'][compound_id_idx]
|
|
47 if not compound_id in processed :
|
|
48 # keep track of processed items to not repeat them
|
|
49 processed[compound_id] = compound_id
|
|
50 # get centrotype nr
|
|
51 centrotype = compound_id.split('-')[0]
|
|
52 # Get the indices for current compound ID in both data-structures for proper matching
|
|
53 rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
|
|
54 cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
|
|
55
|
|
56 merged_hits = []
|
|
57 # Combine hits
|
|
58 for hit in xrange(len(rindex)):
|
|
59 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
|
|
60 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
|
|
61 # corresponding value in the rankfilter or caslookup tables; i.e.
|
|
62 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
|
|
63 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
|
|
64 rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
|
|
65 cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
|
|
66
|
|
67 merged_hit = _add_hit(rf_record, cl_record)
|
|
68 merged_hits.append(merged_hit)
|
|
69
|
|
70 merged.append(merged_hits)
|
|
71
|
|
72 return merged, len(rindex)
|
|
73
|
|
74
|
|
75 def _add_hit(rankfilter, caslookup):
|
|
76 '''
|
|
77 Combines single records from both the RankFilter- and CasLookup-tools
|
|
78 @param rankfilter: record (dictionary) of one compound in the RankFilter output
|
|
79 @param caslookup: matching record (dictionary) of one compound in the CasLookup output
|
|
80 '''
|
|
81 # The ID in the RankFilter output contains the following 5 fields:
|
|
82 rf_id = rankfilter['ID'].split('-')
|
|
83 try:
|
|
84 name, formula = _remove_formula(rankfilter['Name'])
|
|
85 hit = [rf_id[0], # Centrotype
|
|
86 rf_id[1], # cent.Factor
|
|
87 rf_id[2], # scan nr
|
|
88 rf_id[3], # R.T. (umin)
|
|
89 rf_id[4], # nr. Peaks
|
|
90 # Appending other fields
|
|
91 rankfilter['R.T.'],
|
|
92 name,
|
|
93 caslookup['FORMULA'] if not formula else formula,
|
|
94 rankfilter['Library'].strip(),
|
|
95 rankfilter['CAS'].strip(),
|
|
96 rankfilter['Forward'],
|
|
97 rankfilter['Reverse'],
|
|
98 ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
|
|
99 rankfilter['RIexp'],
|
|
100 caslookup['RI'],
|
|
101 rankfilter['RIsvr'],
|
|
102 # Calculate absolute differences
|
|
103 math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
|
|
104 math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
|
|
105 caslookup['Regression.Column.Name'],
|
|
106 caslookup['min'],
|
|
107 caslookup['max'],
|
|
108 caslookup['nr.duplicates'],
|
|
109 caslookup['Column.phase.type'],
|
|
110 caslookup['Column.name'],
|
|
111 rankfilter['Rank'],
|
|
112 rankfilter['%rel.err'],
|
|
113 rankfilter['Synonyms']]
|
|
114 except KeyError as error:
|
|
115 print "Problem reading in data from input file(s):\n",
|
|
116 print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
|
|
117 print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
|
|
118 raise error
|
|
119
|
|
120 return hit
|
|
121
|
|
122
|
|
123 def _remove_formula(name):
|
|
124 '''
|
|
125 The RankFilter Name field often contains the Formula as well, this function removes it from the Name
|
|
126 @param name: complete name of the compound from the RankFilter output
|
|
127 '''
|
|
128 name = name.split()
|
|
129 poss_formula = name[-1]
|
|
130 match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
|
|
131 if match:
|
|
132 return ' '.join(name[:-1]), poss_formula
|
|
133 else:
|
|
134 return ' '.join(name), False
|
|
135
|
|
136
|
|
137 def _get_default_caslookup():
|
|
138 '''
|
|
139 The Cas Lookup tool might not have found all compounds in the library searched,
|
|
140 this default dict will be used to combine with the Rank Filter output
|
|
141 '''
|
|
142 return {'FORMULA': 'N/A',
|
|
143 'RI': '0.0',
|
|
144 'Regression.Column.Name': 'None',
|
|
145 'min': '0.0',
|
|
146 'max': '0.0',
|
|
147 'nr.duplicates': '0',
|
|
148 'Column.phase.type': 'N/A',
|
|
149 'Column.name': 'N/A'}
|
|
150
|
|
151
|
|
152 def _save_data(data, nhits, out_csv_single, out_csv_multi):
|
|
153 '''
|
|
154 Writes tab-separated data to file
|
|
155 @param data: dictionary containing merged dataset
|
|
156 @param out_csv: output csv file
|
|
157 '''
|
|
158 # Columns we don't repeat:
|
|
159 header_part1 = ['Centrotype',
|
|
160 'cent.Factor',
|
|
161 'scan nr.',
|
|
162 'R.T. (umin)',
|
|
163 'nr. Peaks',
|
|
164 'R.T.']
|
|
165 # These are the headers/columns we repeat in case of
|
|
166 # combining hits in one line (see alternative_headers method below):
|
|
167 header_part2 = [
|
|
168 'Name',
|
|
169 'FORMULA',
|
|
170 'Library',
|
|
171 'CAS',
|
|
172 'Forward',
|
|
173 'Reverse',
|
|
174 'Avg. (Forward, Reverse)',
|
|
175 'RIexp',
|
|
176 'RI',
|
|
177 'RIsvr',
|
|
178 'RIexp - RIsvr',
|
|
179 'RI - RIexp',
|
|
180 'Regression.Column.Name',
|
|
181 'min',
|
|
182 'max',
|
|
183 'nr.duplicates',
|
|
184 'Column.phase.type',
|
|
185 'Column.name',
|
|
186 'Rank',
|
|
187 '%rel.err',
|
|
188 'Synonyms']
|
|
189
|
|
190 # Open output file for writing
|
|
191 outfile_single_handle = open(out_csv_single, 'wb')
|
|
192 outfile_multi_handle = open(out_csv_multi, 'wb')
|
|
193 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
|
|
194 output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
|
|
195
|
|
196 # Write headers
|
|
197 output_single_handle.writerow(header_part1 + header_part2)
|
|
198 output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
|
|
199 # Combine all hits for each centrotype into one line
|
|
200 line = []
|
|
201 for centrotype_idx in xrange(len(data)):
|
|
202 i = 0
|
|
203 for hit in data[centrotype_idx]:
|
|
204 if i==0:
|
|
205 line.extend(hit)
|
|
206 else:
|
|
207 line.extend(hit[6:])
|
|
208 i = i+1
|
|
209 # small validation (if error, it is a programming error):
|
|
210 if i > nhits:
|
|
211 raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx)
|
|
212 output_multi_handle.writerow(line)
|
|
213 line = []
|
|
214
|
|
215 # Write one line for each centrotype
|
|
216 for centrotype_idx in xrange(len(data)):
|
|
217 for hit in data[centrotype_idx]:
|
|
218 output_single_handle.writerow(hit)
|
|
219
|
|
220 def alternative_headers(header_part2, nr_alternative_hits):
|
|
221 '''
|
|
222 This method will iterate over the header names and add the string 'ALT#_' before each,
|
|
223 where # is the number of the alternative, according to number of alternative hits we want to add
|
|
224 to final csv/tsv
|
|
225 '''
|
|
226 result = []
|
|
227 for i in xrange(nr_alternative_hits):
|
|
228 for header_name in header_part2:
|
|
229 result.append("ALT" + str(i+1) + "_" + header_name)
|
|
230 return result
|
|
231
|
|
232 def main():
|
|
233 '''
|
|
234 Combine Output main function
|
|
235 It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
|
|
236 NB: the caslookup_result_file will typically have fewer lines than
|
|
237 rankfilter_result_file, so the merge has to consider this as well. The final file
|
|
238 should have the same nr of lines as rankfilter_result_file.
|
|
239 '''
|
|
240 rankfilter_result_file = sys.argv[1]
|
|
241 caslookup_result_file = sys.argv[2]
|
|
242 output_single_csv = sys.argv[3]
|
|
243 output_multi_csv = sys.argv[4]
|
|
244
|
|
245 # Read RankFilter and CasLookup output files
|
|
246 rankfilter = _process_data(rankfilter_result_file)
|
|
247 caslookup = _process_data(caslookup_result_file)
|
|
248 merged, nhits = _merge_data(rankfilter, caslookup)
|
|
249 _save_data(merged, nhits, output_single_csv, output_multi_csv)
|
|
250
|
|
251
|
|
252 if __name__ == '__main__':
|
|
253 main()
|