Mercurial > repos > galaxyp > maxquant_mqpar
comparison mqparam.py @ 0:256cc0e17454 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant commit ab4e4f1817080cbe8a031a82cb180610ff140847
author | galaxyp |
---|---|
date | Sat, 20 Jul 2019 04:53:23 -0400 |
parents | |
children | 2d67fb758956 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:256cc0e17454 |
---|---|
1 """ | |
2 Create a project-specific MaxQuant parameter file. | |
3 | |
4 TODO: check validity of parsed experimental design template | |
5 add support for parameter groups | |
6 add reporter ion MS2 | |
7 | |
8 Author: Damian Glaetzer <d.glaetzer@mailbox.org> | |
9 """ | |
10 | |
11 import ntpath | |
12 import os | |
13 import re | |
14 import xml.etree.ElementTree as ET | |
15 from itertools import zip_longest | |
16 from xml.dom import minidom | |
17 | |
18 | |
19 class MQParam: | |
20 """Represents a mqpar.xml and provides methods to modify | |
21 some of its parameters. | |
22 """ | |
23 | |
24 fasta_template = """<FastaFileInfo> | |
25 <fastaFilePath></fastaFilePath> | |
26 <identifierParseRule></identifierParseRule> | |
27 <descriptionParseRule></descriptionParseRule> | |
28 <taxonomyParseRule></taxonomyParseRule> | |
29 <variationParseRule></variationParseRule> | |
30 <modificationParseRule></modificationParseRule> | |
31 <taxonomyId></taxonomyId> | |
32 </FastaFileInfo>""" | |
33 | |
34 def __init__(self, mqpar_out, mqpar_in, exp_design, | |
35 substitution_rx=r'[^\s\S]'): # no sub by default | |
36 """Initialize MQParam class. mqpar_in can either be a template | |
37 or a already suitable mqpar file. | |
38 >>> t = MQParam("test", './test-data/template.xml', None) | |
39 >>> t.root.tag | |
40 'MaxQuantParams' | |
41 >>> (t.root.find('maxQuantVersion')).text | |
42 '1.6.3.4' | |
43 """ | |
44 | |
45 self.orig_mqpar = mqpar_in | |
46 self.exp_design = exp_design | |
47 self.mqpar_out = mqpar_out | |
48 self.root = ET.parse(mqpar_in).getroot() | |
49 self.version = self.root.find('maxQuantVersion').text | |
50 # regex for substitution of certain file name characters | |
51 self.substitution_rx = substitution_rx | |
52 | |
53 @staticmethod | |
54 def _add_child(el, name, text, attrib=None): | |
55 """Add a child element to an element. | |
56 | |
57 >>> t = MQParam("test", './test-data/template.xml', None) | |
58 >>> MQParam._add_child(t.root, "test", "test") | |
59 >>> t.root.find('test').text == "test" | |
60 True | |
61 """ | |
62 | |
63 child = ET.SubElement(el, name, attrib=attrib if attrib else {}) | |
64 child.text = str(text) | |
65 | |
66 def _make_exp_design(self, infiles): | |
67 """Create a dict representing an experimental design from | |
68 an experimental design template and a list of input files. | |
69 If the experimental design template is None, create a default | |
70 design with one experiment for each input file, no fractions and | |
71 parameter group 0 for all files. | |
72 >>> t2 = MQParam("test", './test-data/template.xml', \ | |
73 './test-data/two/exp_design_template.txt') | |
74 >>> design = t2._make_exp_design(['./test-data/BSA_min_21.mzXML', \ | |
75 './test-data/BSA_min_22.mzXML']) | |
76 >>> design['Name'] | |
77 ['./test-data/BSA_min_21.mzXML', './test-data/BSA_min_22.mzXML'] | |
78 >>> design['Fraction'] | |
79 ['1', '2'] | |
80 """ | |
81 design = {s: [] for s in ("Name", "PTM", "Fraction", "Experiment")} | |
82 if not self.exp_design: | |
83 design["Name"] = infiles | |
84 design["Fraction"] = ('32767',) * len(infiles) | |
85 design["Experiment"] = [os.path.split(f)[1] for f in infiles] | |
86 design["PTM"] = ('False',) * len(infiles) | |
87 else: | |
88 with open(self.exp_design) as design_file: | |
89 index_line = design_file.readline().strip() | |
90 index = [] | |
91 for i in index_line.split('\t'): | |
92 if i in design: | |
93 index.append(i) | |
94 else: | |
95 raise Exception("Invalid comlumn index in experimental" | |
96 + " design template: {}".format(i)) | |
97 for line in design_file: | |
98 row = line.strip().split('\t') | |
99 for e, i in zip_longest(row, index): | |
100 design[i].append(e) | |
101 | |
102 # map infiles to names in exp. design template | |
103 names = [] | |
104 names_to_paths = {} | |
105 # strip path and extension | |
106 for f in infiles: | |
107 b = os.path.basename(f) | |
108 basename = b[:-6] if b.endswith('.mzXML') else b[:-11] | |
109 names_to_paths[basename] = f | |
110 for name in design['Name']: | |
111 # same substitution as in maxquant.xml, | |
112 # when passing the element identifiers | |
113 fname = re.sub(self.substitution_rx, '_', name) | |
114 names.append(names_to_paths[fname] if fname in names_to_paths | |
115 else None) | |
116 # replace orig. file names with matching links to galaxy datasets | |
117 design['Name'] = names | |
118 | |
119 return design | |
120 | |
121 def add_infiles(self, infiles, interactive): | |
122 """Add a list of raw/mzxml files to the mqpar.xml. | |
123 If experimental design template was specified, | |
124 modify other parameters accordingly. | |
125 The files must be specified as absolute paths | |
126 for maxquant to find them. | |
127 >>> t1 = MQParam("test", './test-data/template.xml', None) | |
128 >>> t1.add_infiles(('test1', ), True) | |
129 >>> t1.root.find("filePaths")[0].text | |
130 'test1' | |
131 >>> t1.root.find("fractions")[0].text | |
132 '32767' | |
133 >>> len(t1.root.find("fractions")) | |
134 1 | |
135 >>> t2 = MQParam("test", './test-data/template.xml', \ | |
136 './test-data/exp_design_test.txt') | |
137 >>> t2.add_infiles(('test-data/QEplus021874.thermo.raw', \ | |
138 'test-data/QEplus021876.thermo.raw'), True) | |
139 >>> len(t2.root.find("filePaths")) | |
140 2 | |
141 >>> t2.root.find("filePaths")[1].text | |
142 'test-data/QEplus021876.thermo.raw' | |
143 >>> t2.root.find("experiments")[1].text | |
144 '2' | |
145 >>> t2.root.find("fractions")[0].text | |
146 '3' | |
147 """ | |
148 | |
149 # Create experimental design for interactive mode. | |
150 # In non-interactive mode only filepaths are modified, but | |
151 # their order from the original mqpar must be kept. | |
152 if interactive: | |
153 index = range(len(infiles)) | |
154 nodenames = ('filePaths', 'experiments', 'fractions', | |
155 'ptms', 'paramGroupIndices', 'referenceChannel') | |
156 design = self._make_exp_design(infiles) | |
157 else: | |
158 index = [-1] * len(infiles) | |
159 # kind of a BUG: fails if filename starts with '.' | |
160 infilenames = [os.path.basename(f).split('.')[0] for f in infiles] | |
161 i = 0 | |
162 for child in self.root.find('filePaths'): | |
163 # either windows or posix path | |
164 win = ntpath.basename(child.text) | |
165 posix = os.path.basename(child.text) | |
166 basename = win if len(win) < len(posix) else posix | |
167 basename_with_sub = re.sub(self.substitution_rx, '_', | |
168 basename.split('.')[0]) | |
169 # match infiles to their names in mqpar.xml, | |
170 # ignore files missing in mqpar.xml | |
171 if basename_with_sub in infilenames: | |
172 index[i] = infilenames.index(basename_with_sub) | |
173 i += 1 | |
174 else: | |
175 raise ValueError("no matching infile found for " | |
176 + child.text) | |
177 | |
178 nodenames = ('filePaths', ) | |
179 design = {'Name': infiles} | |
180 | |
181 # Get parent nodes from document | |
182 nodes = dict() | |
183 for nodename in nodenames: | |
184 node = self.root.find(nodename) | |
185 if node is None: | |
186 raise ValueError('Element {} not found in parameter file' | |
187 .format(nodename)) | |
188 nodes[nodename] = node | |
189 node.clear() | |
190 node.tag = nodename | |
191 | |
192 # Append sub-elements to nodes (one per file) | |
193 for i in index: | |
194 if i > -1 and design['Name'][i]: | |
195 MQParam._add_child(nodes['filePaths'], 'string', | |
196 design['Name'][i]) | |
197 if interactive: | |
198 MQParam._add_child(nodes['experiments'], 'string', | |
199 design['Experiment'][i]) | |
200 MQParam._add_child(nodes['fractions'], 'short', | |
201 design['Fraction'][i]) | |
202 MQParam._add_child(nodes['ptms'], 'boolean', | |
203 design['PTM'][i]) | |
204 MQParam._add_child(nodes['paramGroupIndices'], 'int', 0) | |
205 MQParam._add_child(nodes['referenceChannel'], 'string', '') | |
206 | |
207 def add_fasta_files(self, files, | |
208 identifier=r'>([^\s]*)', | |
209 description=r'>(.*)'): | |
210 """Add fasta file groups. | |
211 >>> t = MQParam('test', './test-data/template.xml', None) | |
212 >>> t.add_fasta_files(('test1', 'test2')) | |
213 >>> len(t.root.find('fastaFiles')) | |
214 2 | |
215 >>> t.root.find('fastaFiles')[0].find("fastaFilePath").text | |
216 'test1' | |
217 """ | |
218 fasta_node = self.root.find("fastaFiles") | |
219 fasta_node.clear() | |
220 fasta_node.tag = "fastaFiles" | |
221 | |
222 for index in range(len(files)): | |
223 filepath = '<fastaFilePath>' + files[index] | |
224 fasta = self.fasta_template.replace('<fastaFilePath>', filepath) | |
225 fasta = fasta.replace('<identifierParseRule>', | |
226 '<identifierParseRule>' + identifier) | |
227 fasta = fasta.replace('<descriptionParseRule>', | |
228 '<descriptionParseRule>' + description) | |
229 ff_node = self.root.find('.fastaFiles') | |
230 fastaentry = ET.fromstring(fasta) | |
231 ff_node.append(fastaentry) | |
232 | |
233 def set_simple_param(self, key, value): | |
234 """Set a simple parameter. | |
235 >>> t = MQParam(None, './test-data/template.xml', None) | |
236 >>> t.set_simple_param('min_unique_pep', 4) | |
237 >>> t.root.find('.minUniquePeptides').text | |
238 '4' | |
239 """ | |
240 # map simple params to their node in the xml tree | |
241 simple_params = {'missed_cleavages': | |
242 '.parameterGroups/parameterGroup/maxMissedCleavages', | |
243 'min_unique_pep': '.minUniquePeptides', | |
244 'num_threads': 'numThreads', | |
245 'calc_peak_properties': '.calcPeakProperties', | |
246 'write_mztab': 'writeMzTab', | |
247 'min_peptide_len': 'minPepLen', | |
248 'max_peptide_mass': 'maxPeptideMass', | |
249 'ibaq': 'ibaq', # lfq global options | |
250 'ibaq_log_fit': 'ibaqLogFit', | |
251 'separate_lfq': 'separateLfq', | |
252 'lfq_stabilize_large_ratios': | |
253 'lfqStabilizeLargeRatios', | |
254 'lfq_require_msms': 'lfqRequireMsms', | |
255 'advanced_site_intensities': | |
256 'advancedSiteIntensities', | |
257 'lfq_mode': # lfq param group options | |
258 '.parameterGroups/parameterGroup/lfqMode', | |
259 'lfq_skip_norm': | |
260 '.parameterGroups/parameterGroup/lfqSkipNorm', | |
261 'lfq_min_edges_per_node': | |
262 '.parameterGroups/parameterGroup/lfqMinEdgesPerNode', | |
263 'lfq_avg_edges_per_node': | |
264 '.parameterGroups/parameterGroup/lfqAvEdgesPerNode', | |
265 'lfq_min_ratio_count': | |
266 '.parameterGroups/parameterGroup/lfqMinRatioCount'} | |
267 | |
268 if key in simple_params: | |
269 node = self.root.find(simple_params[key]) | |
270 if node is None: | |
271 raise ValueError('Element {} not found in parameter file' | |
272 .format(simple_params[key])) | |
273 node.text = str(value) | |
274 else: | |
275 raise ValueError("Parameter not found.") | |
276 | |
277 def set_silac(self, light_mods, medium_mods, heavy_mods): | |
278 """Set label modifications. | |
279 >>> t1 = MQParam('test', './test-data/template.xml', None) | |
280 >>> t1.set_silac(None, ('test1', 'test2'), None) | |
281 >>> t1.root.find('.parameterGroups/parameterGroup/maxLabeledAa').text | |
282 '2' | |
283 >>> t1.root.find('.parameterGroups/parameterGroup/multiplicity').text | |
284 '3' | |
285 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[1].text | |
286 'test1;test2' | |
287 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[2].text | |
288 '' | |
289 """ | |
290 multiplicity = 3 if medium_mods else 2 if heavy_mods else 1 | |
291 max_label = str(max(len(light_mods) if light_mods else 0, | |
292 len(medium_mods) if medium_mods else 0, | |
293 len(heavy_mods) if heavy_mods else 0)) | |
294 multiplicity_node = self.root.find('.parameterGroups/parameterGroup/' | |
295 + 'multiplicity') | |
296 multiplicity_node.text = str(multiplicity) | |
297 max_label_node = self.root.find('.parameterGroups/parameterGroup/' | |
298 + 'maxLabeledAa') | |
299 max_label_node.text = max_label | |
300 | |
301 node = self.root.find('.parameterGroups/parameterGroup/labelMods') | |
302 node[0].text = ';'.join(light_mods) if light_mods else '' | |
303 if multiplicity == 3: | |
304 MQParam._add_child(node, name='string', text=';'.join(medium_mods)) | |
305 if multiplicity > 1: | |
306 MQParam._add_child(node, name='string', | |
307 text=';'.join(heavy_mods) if heavy_mods else '') | |
308 | |
309 def set_list_params(self, key, vals): | |
310 """Set a list parameter. | |
311 >>> t = MQParam(None, './test-data/template.xml', None) | |
312 >>> t.set_list_params('proteases', ('test 1', 'test 2')) | |
313 >>> len(t.root.find('.parameterGroups/parameterGroup/enzymes')) | |
314 2 | |
315 >>> t.set_list_params('var_mods', ('Oxidation (M)', )) | |
316 >>> var_mods = '.parameterGroups/parameterGroup/variableModifications' | |
317 >>> t.root.find(var_mods)[0].text | |
318 'Oxidation (M)' | |
319 """ | |
320 | |
321 params = {'var_mods': | |
322 '.parameterGroups/parameterGroup/variableModifications', | |
323 'fixed_mods': | |
324 '.parameterGroups/parameterGroup/fixedModifications', | |
325 'proteases': | |
326 '.parameterGroups/parameterGroup/enzymes'} | |
327 | |
328 if key in params: | |
329 node = self.root.find(params[key]) | |
330 if node is None: | |
331 raise ValueError('Element {} not found in parameter file' | |
332 .format(params[key])) | |
333 node.clear() | |
334 node.tag = params[key].split('/')[-1] | |
335 for e in vals: | |
336 MQParam._add_child(node, name='string', text=e) | |
337 else: | |
338 raise ValueError("Parameter {} not found.".format(key)) | |
339 | |
340 def write(self): | |
341 rough_string = ET.tostring(self.root, 'utf-8', short_empty_elements=False) | |
342 reparsed = minidom.parseString(rough_string) | |
343 pretty = reparsed.toprettyxml(indent="\t") | |
344 even_prettier = re.sub(r"\n\s+\n", r"\n", pretty) | |
345 with open(self.mqpar_out, 'w') as f: | |
346 print(even_prettier, file=f) |