Mercurial > repos > pieterlukasse > prims_metabolomics
annotate combine_output.py @ 34:e79a7c32b011
version using less memory, but more CPU
author | pieter.lukasse@wur.nl |
---|---|
date | Wed, 17 Sep 2014 11:31:34 +0200 |
parents | 19d8fd10248e |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 # encoding: utf-8 | |
3 ''' | |
4 Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup) | |
5 ''' | |
6 | |
7 import csv | |
8 import re | |
9 import sys | |
10 import math | |
11 import pprint | |
12 | |
13 __author__ = "Marcel Kempenaar" | |
14 __contact__ = "brs@nbic.nl" | |
15 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" | |
16 __license__ = "MIT" | |
17 | |
18 def _process_data(in_csv): | |
19 ''' | |
20 Generic method to parse a tab-separated file returning a dictionary with named columns | |
21 @param in_csv: input filename to be parsed | |
22 ''' | |
23 data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) | |
24 header = data.pop(0) | |
25 # Create dictionary with column name as key | |
26 output = {} | |
27 for index in xrange(len(header)): | |
28 output[header[index]] = [row[index] for row in data] | |
29 return output | |
30 | |
31 | |
32 def _merge_data(rankfilter, caslookup): | |
33 ''' | |
34 Merges data from both input dictionaries based on the Centrotype field. This method will | |
35 build up a new list containing the merged hits as the items. | |
36 @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name) | |
37 @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name) | |
38 ''' | |
39 # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts: | |
40 if (len(rankfilter['ID']) != len(caslookup['Centrotype'])): | |
41 raise Exception('rankfilter and caslookup files should have the same nr of rows/records ') | |
42 | |
43 merged = [] | |
44 processed = {} | |
45 for compound_id_idx in xrange(len(rankfilter['ID'])): | |
46 compound_id = rankfilter['ID'][compound_id_idx] | |
47 if not compound_id in processed : | |
48 # keep track of processed items to not repeat them | |
49 processed[compound_id] = compound_id | |
50 # get centrotype nr | |
51 centrotype = compound_id.split('-')[0] | |
52 # Get the indices for current compound ID in both data-structures for proper matching | |
53 rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id] | |
54 cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype] | |
55 | |
56 merged_hits = [] | |
57 # Combine hits | |
58 for hit in xrange(len(rindex)): | |
59 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do | |
60 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its | |
61 # corresponding value in the rankfilter or caslookup tables; i.e. | |
62 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute | |
63 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) | |
64 rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()])) | |
65 cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()])) | |
66 | |
67 merged_hit = _add_hit(rf_record, cl_record) | |
68 merged_hits.append(merged_hit) | |
69 | |
70 merged.append(merged_hits) | |
71 | |
72 return merged, len(rindex) | |
73 | |
74 | |
75 def _add_hit(rankfilter, caslookup): | |
76 ''' | |
77 Combines single records from both the RankFilter- and CasLookup-tools | |
78 @param rankfilter: record (dictionary) of one compound in the RankFilter output | |
79 @param caslookup: matching record (dictionary) of one compound in the CasLookup output | |
80 ''' | |
81 # The ID in the RankFilter output contains the following 5 fields: | |
82 rf_id = rankfilter['ID'].split('-') | |
83 try: | |
84 name, formula = _remove_formula(rankfilter['Name']) | |
85 hit = [rf_id[0], # Centrotype | |
86 rf_id[1], # cent.Factor | |
87 rf_id[2], # scan nr | |
88 rf_id[3], # R.T. (umin) | |
89 rf_id[4], # nr. Peaks | |
90 # Appending other fields | |
91 rankfilter['R.T.'], | |
92 name, | |
93 caslookup['FORMULA'] if not formula else formula, | |
94 rankfilter['Library'].strip(), | |
95 rankfilter['CAS'].strip(), | |
96 rankfilter['Forward'], | |
97 rankfilter['Reverse'], | |
98 ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2), | |
99 rankfilter['RIexp'], | |
100 caslookup['RI'], | |
101 rankfilter['RIsvr'], | |
102 # Calculate absolute differences | |
103 math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])), | |
104 math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])), | |
105 caslookup['Regression.Column.Name'], | |
106 caslookup['min'], | |
107 caslookup['max'], | |
108 caslookup['nr.duplicates'], | |
109 caslookup['Column.phase.type'], | |
110 caslookup['Column.name'], | |
111 rankfilter['Rank'], | |
112 rankfilter['%rel.err'], | |
113 rankfilter['Synonyms']] | |
114 except KeyError as error: | |
115 print "Problem reading in data from input file(s):\n", | |
116 print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n" | |
117 print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n" | |
118 raise error | |
119 | |
120 return hit | |
121 | |
122 | |
123 def _remove_formula(name): | |
124 ''' | |
125 The RankFilter Name field often contains the Formula as well, this function removes it from the Name | |
126 @param name: complete name of the compound from the RankFilter output | |
127 ''' | |
128 name = name.split() | |
129 poss_formula = name[-1] | |
130 match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula) | |
131 if match: | |
132 return ' '.join(name[:-1]), poss_formula | |
133 else: | |
134 return ' '.join(name), False | |
135 | |
136 | |
137 def _get_default_caslookup(): | |
138 ''' | |
139 The Cas Lookup tool might not have found all compounds in the library searched, | |
140 this default dict will be used to combine with the Rank Filter output | |
141 ''' | |
142 return {'FORMULA': 'N/A', | |
143 'RI': '0.0', | |
144 'Regression.Column.Name': 'None', | |
145 'min': '0.0', | |
146 'max': '0.0', | |
147 'nr.duplicates': '0', | |
148 'Column.phase.type': 'N/A', | |
149 'Column.name': 'N/A'} | |
150 | |
151 | |
152 def _save_data(data, nhits, out_csv_single, out_csv_multi): | |
153 ''' | |
154 Writes tab-separated data to file | |
155 @param data: dictionary containing merged dataset | |
156 @param out_csv: output csv file | |
157 ''' | |
21
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
158 # Columns we don't repeat: |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
159 header_part1 = ['Centrotype', |
0 | 160 'cent.Factor', |
161 'scan nr.', | |
162 'R.T. (umin)', | |
163 'nr. Peaks', | |
21
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
164 'R.T.'] |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
165 # These are the headers/columns we repeat in case of |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
166 # combining hits in one line (see alternative_headers method below): |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
167 header_part2 = [ |
0 | 168 'Name', |
169 'FORMULA', | |
170 'Library', | |
171 'CAS', | |
172 'Forward', | |
173 'Reverse', | |
174 'Avg. (Forward, Reverse)', | |
175 'RIexp', | |
176 'RI', | |
177 'RIsvr', | |
178 'RIexp - RIsvr', | |
179 'RI - RIexp', | |
180 'Regression.Column.Name', | |
181 'min', | |
182 'max', | |
183 'nr.duplicates', | |
184 'Column.phase.type', | |
185 'Column.name', | |
186 'Rank', | |
187 '%rel.err', | |
188 'Synonyms'] | |
189 | |
190 # Open output file for writing | |
191 outfile_single_handle = open(out_csv_single, 'wb') | |
192 outfile_multi_handle = open(out_csv_multi, 'wb') | |
193 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | |
194 output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") | |
195 | |
196 # Write headers | |
21
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
197 output_single_handle.writerow(header_part1 + header_part2) |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
198 output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) |
0 | 199 # Combine all hits for each centrotype into one line |
200 line = [] | |
201 for centrotype_idx in xrange(len(data)): | |
21
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
202 i = 0 |
0 | 203 for hit in data[centrotype_idx]: |
21
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
204 if i==0: |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
205 line.extend(hit) |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
206 else: |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
207 line.extend(hit[6:]) |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
208 i = i+1 |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
209 # small validation (if error, it is a programming error): |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
210 if i > nhits: |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
211 raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) |
0 | 212 output_multi_handle.writerow(line) |
213 line = [] | |
214 | |
215 # Write one line for each centrotype | |
216 for centrotype_idx in xrange(len(data)): | |
217 for hit in data[centrotype_idx]: | |
218 output_single_handle.writerow(hit) | |
219 | |
21
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
220 def alternative_headers(header_part2, nr_alternative_hits): |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
221 ''' |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
222 This method will iterate over the header names and add the string 'ALT#_' before each, |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
223 where # is the number of the alternative, according to number of alternative hits we want to add |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
224 to final csv/tsv |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
225 ''' |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
226 result = [] |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
227 for i in xrange(nr_alternative_hits): |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
228 for header_name in header_part2: |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
229 result.append("ALT" + str(i+1) + "_" + header_name) |
19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
pieter.lukasse@wur.nl
parents:
0
diff
changeset
|
230 return result |
0 | 231 |
232 def main(): | |
233 ''' | |
234 Combine Output main function | |
235 It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" | |
236 NB: the caslookup_result_file will typically have fewer lines than | |
237 rankfilter_result_file, so the merge has to consider this as well. The final file | |
238 should have the same nr of lines as rankfilter_result_file. | |
239 ''' | |
240 rankfilter_result_file = sys.argv[1] | |
241 caslookup_result_file = sys.argv[2] | |
242 output_single_csv = sys.argv[3] | |
243 output_multi_csv = sys.argv[4] | |
244 | |
245 # Read RankFilter and CasLookup output files | |
246 rankfilter = _process_data(rankfilter_result_file) | |
247 caslookup = _process_data(caslookup_result_file) | |
248 merged, nhits = _merge_data(rankfilter, caslookup) | |
249 _save_data(merged, nhits, output_single_csv, output_multi_csv) | |
250 | |
251 | |
252 if __name__ == '__main__': | |
253 main() |