comparison export_to_metexp_tabular.py @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children 0d1557b3d540
comparison
equal deleted inserted replaced
-1:000000000000 0:dffc38727496
1 #!/usr/bin/env python
2 # encoding: utf-8
3 '''
4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
5 into a tabular file that can be uploaded to the MetExp database.
6
7 RankFilter, CasLookup are already combined by combine_output.py so here we will use
8 this result. Furthermore here one of the MsClust
9 quantification files containing the respective spectra details are to be combined as well.
10
11 Extra calculations performed:
12 - The column MW is also added here and is derived from the column FORMULA found
13 in RankFilter, CasLookup combined result.
14
15 So in total here we merge 2 files and calculate one new column.
16 '''
17 from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
18 import csv
19 import re
20 import sys
21 from collections import OrderedDict
22
23 __author__ = "Pieter Lukasse"
24 __contact__ = "pieter.lukasse@wur.nl"
25 __copyright__ = "Copyright, 2013, Plant Research International, WUR"
26 __license__ = "Apache v2"
27
28 def _process_data(in_csv, delim='\t'):
29 '''
30 Generic method to parse a tab-separated file returning a dictionary with named columns
31 @param in_csv: input filename to be parsed
32 '''
33 data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
34 header = data.pop(0)
35 # Create dictionary with column name as key
36 output = OrderedDict()
37 for index in xrange(len(header)):
38 output[header[index]] = [row[index] for row in data]
39 return output
40
41 ONE_TO_ONE = 'one_to_one'
42 N_TO_ONE = 'n_to_one'
43
44 def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
45 '''
46 Merges data from both input dictionaries based on the link fields. This method will
47 build up a new list containing the merged hits as the items.
48 @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
49 @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
50 '''
51 # TODO test for correct input files -> same link_field values should be there
52 # (test at least number of unique link_field values):
53 #
54 # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
55 # raise Exception('input files should have the same nr of key values ')
56
57
58 merged = []
59 processed = {}
60 for link_field_set1_idx in xrange(len(set1[link_field_set1])):
61 link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
62 if not link_field_set1_value in processed :
63 # keep track of processed items to not repeat them
64 processed[link_field_set1_value] = link_field_set1_value
65
66 # Get the indices for current link_field_set1_value in both data-structures for proper matching
67 set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
68 set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
69 # Validation :
70 if len(set2index) == 0:
71 # means that corresponding data could not be found in set2, then throw error
72 raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
73 link_field_set1_value + " only found in first dataset. ")
74
75 merged_hits = []
76 # Combine hits
77 for hit in xrange(len(set1index)):
78 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
79 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
80 # corresponding value in the sets; i.e.
81 # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
82 # represented by "key".
83 # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
84 # So set1[x][set1index[n]] = set1.attributeX.instanceN
85 #
86 # It just ensures the entry is made available as a plain named array for easy access.
87 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
88 if relation_type == ONE_TO_ONE :
89 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
90 else:
91 # is N to 1:
92 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
93
94 merged_hit = merge_function(rf_record, cl_record, metadata)
95 merged_hits.append(merged_hit)
96
97 merged.append(merged_hits)
98
99 return merged, len(set1index)
100
101
102 def _compare_records(key1, key2):
103 '''
104 in this case the compare method is really simple as both keys are expected to contain
105 same value when records are the same
106 '''
107 if key1 == key2:
108 return True
109 else:
110 return False
111
112
113
114 def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
115 '''
116 Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
117
118 @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
119 @param msclust_quant_record: msclust quantification + spectrum record
120 '''
121 record = []
122 for column in rank_caslookup_combi:
123 record.append(rank_caslookup_combi[column])
124
125 for column in msclust_quant_record:
126 record.append(msclust_quant_record[column])
127
128 for column in metadata:
129 record.append(metadata[column])
130
131 # add MOLECULAR MASS (MM)
132 molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
133 # limit to two decimals:
134 record.append("{0:.2f}".format(molecular_mass))
135
136 # add MOLECULAR WEIGHT (MW) - TODO - calculate this
137 record.append('0.0')
138
139 # level of identification and Location of reference standard
140 record.append('0')
141 record.append('')
142
143 return record
144
145
146 def get_molecular_mass(formula):
147 '''
148 Calculates the molecular mass (MM).
149 E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
150 '''
151
152 # Each element is represented by a capital letter, followed optionally by
153 # lower case, with one or more digits as for how many elements:
154 element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
155
156 total_mass = 0
157 for (element_name, count) in element_pattern.findall(formula):
158 if count == "":
159 count = 1
160 else:
161 count = int(count)
162 element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?)
163 total_mass += element_mass * count
164
165 return total_mass
166
167
168
169 def _save_data(data, headers, out_csv):
170 '''
171 Writes tab-separated data to file
172 @param data: dictionary containing merged dataset
173 @param out_csv: output csv file
174 '''
175
176 # Open output file for writing
177 outfile_single_handle = open(out_csv, 'wb')
178 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
179
180 # Write headers
181 output_single_handle.writerow(headers)
182
183 # Write
184 for item_idx in xrange(len(data)):
185 for hit in data[item_idx]:
186 output_single_handle.writerow(hit)
187
188
189 def _get_map_for_elements_and_masses(elements_and_masses):
190 '''
191 This method will read out the column 'Chemical symbol' and make a map
192 of this, storing the column 'Relative atomic mass' as its value
193 '''
194 resultMap = {}
195 index = 0
196 for entry in elements_and_masses['Chemical symbol']:
197 resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
198 index += 1
199
200 return resultMap
201
202
203 def init_elements_and_masses_map():
204 '''
205 Initializes the lookup map containing the elements and their respective masses
206 '''
207 elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
208 global elements_and_masses_map
209 elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
210
211
212 def main():
213 '''
214 Combine Output main function
215
216 RankFilter, CasLookup are already combined by combine_output.py so here we will use
217 this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
218 quantification files are to be combined with combine_output.py result as well.
219 '''
220 rankfilter_and_caslookup_combined_file = sys.argv[1]
221 msclust_quantification_and_spectra_file = sys.argv[2]
222 output_csv = sys.argv[3]
223 # metadata
224 metadata = OrderedDict()
225 metadata['organism'] = sys.argv[4]
226 metadata['tissue'] = sys.argv[5]
227 metadata['experiment_name'] = sys.argv[6]
228 metadata['user_name'] = sys.argv[7]
229 metadata['column_type'] = sys.argv[8]
230
231 # Read RankFilter and CasLookup output files
232 rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
233 msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
234
235 # Read elements and masses to use for the MW/MM calculation :
236 init_elements_and_masses_map()
237
238 merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
239 msclust_quantification_and_spectra, 'centrotype',
240 _compare_records, _merge_records, metadata,
241 N_TO_ONE)
242 headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
243 _save_data(merged, headers, output_csv)
244
245
246 if __name__ == '__main__':
247 main()