Mercurial > repos > galaxyp > iedb_netmhcpan
comparison nextgen_iedb_api.py @ 0:88e44dab2988 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/iedb_netmhcpan commit 0ac7534c8d9f5bfea21b998286f822784e62da08
author | galaxyp |
---|---|
date | Wed, 09 Jul 2025 12:56:30 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:88e44dab2988 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 This file was adapted from the iedb_apy.py file of the iedb_api tool. | |
5 It uses the newer "Next-Generation" IEDB API, and is constrained to | |
6 the mhci and mhcii tool groups. | |
7 """ | |
8 | |
9 import argparse | |
10 import json | |
11 import os.path | |
12 import re | |
13 import sys | |
14 import time | |
15 import urllib.request | |
16 from urllib.error import HTTPError | |
17 | |
# IEDB tool groups and the predictor methods each group supports.
mhci_methods = ['netmhcpan_el', 'netmhcpan_ba']
mhcii_methods = ['netmhciipan_el', 'netmhciipan_ba']
tool_group_methods = {'mhci': mhci_methods,
                      'mhcii': mhcii_methods}
# Flat set of every supported method; used to validate the -m CLI option.
all_methods = set(mhci_methods + mhcii_methods)

# Values for polling backoff
max_backoff_count = 25  # cap on the retry count that drives the backoff exponent
init_poll_sleep = 10  # seconds slept between the earliest polls
poll_retries = 50  # maximum number of result-poll attempts
requests_before_backoff = 5  # polls between each doubling of the sleep
30 | |
31 | |
def parse_alleles(allelefile):
    """Return a list of allele names read from the input file, one per line.

    Each line is stripped of surrounding whitespace; no further validation
    happens here (IEDB reports unpredictable alleles at submission time).
    (The previous docstring incorrectly said a dictionary was returned.)
    """
    with open(allelefile, 'r') as fh:
        return [line.strip() for line in fh]
40 | |
41 | |
def parse_sequence_column(sequence_file_lines, col):
    """Extract peptide sequences from column *col* of tabular input lines.

    '_' and '*' (gap/stop) characters are removed from each candidate before
    it is checked against the amino-acid alphabet; invalid peptides are
    reported to stderr and skipped.  Returns the list of valid sequences.
    If a line has too few columns, a warning is emitted and parsing stops,
    returning whatever was collected so far.
    """
    # Valid peptides contain only standard amino-acid letters (plus B/Asx).
    aa_re = re.compile(r'^[ABCDEFGHIKLMNPQRSTVWY]+$')
    sequences = []
    for lineno, line in enumerate(sequence_file_lines, start=1):
        fields = line.split('\t')
        if len(fields) <= col:
            warn_err('Invalid value for -c/--column')
            break
        seq = re.sub('[_*]', '', fields[col].strip())
        if aa_re.match(seq):
            sequences.append(seq)
        else:
            # Report 1-based line numbers; the previous 0-based index
            # printed "Line 0" for the first line, which misled users.
            warn_err(f'Line {lineno}, Not a peptide: {seq}')
    return sequences
61 | |
62 | |
def iedb_request(req, timeout, retries, error_retry_sleep, response_fn=None, req_data=None):
    """Perform an HTTP request against IEDB with retries.

    *req* may be a ``urllib.request.Request`` or a plain URL string.  When
    *response_fn* is given it is called as ``response_fn(response, retry)``
    and may return None to force another attempt (used to poll unfinished
    jobs); otherwise the response body is parsed as JSON and returned.
    Returns the parsed JSON on success, or None on failure.
    """
    for retry in range(1, retries + 1):
        response = None
        try:
            response = urllib.request.urlopen(req, req_data, timeout=timeout)
        except HTTPError as e:
            # Possibly transient server-side error: report, wait, retry.
            warn_err(f'{retry} of {retries} Error connecting to IEDB server. '
                     f'HTTP status code: {e.code}')
            time.sleep(error_retry_sleep)
        except Exception as e:
            # Anything else (bad URL, timeout, ...) is treated as fatal.
            warn_err(f'Error getting results from IEDB server: {e}')
            return None

        try:
            if response and response.getcode() == 200:
                # If no callback, parse and return the results directly.
                if not response_fn:
                    return json.loads(response.read().decode('utf-8'))

                # Retry if the callback deems results not ready yet.
                response_json = response_fn(response, retry)
                if response_json:
                    return response_json
            else:
                code = response.getcode() if response else 1
                warn_err(f'Error connecting to IEDB server. HTTP status code: {code}')
        finally:
            # Previously responses were never closed, leaking sockets
            # across up to poll_retries attempts.
            if response is not None:
                response.close()

    warn_err(f'No successful response from IEDB in {retries} retries')
    return None
94 | |
95 | |
def pipeline_request(url, tool_group, sequence_text, alleles, length_range,
                     methods, peptide_shift, timeout=300, retries=3, error_retry_sleep=300):
    """Submit a job to the IEDB pipeline and poll the API until results are ready.

    Returns the results JSON from IEDB, or None on any failure (bad alleles,
    server errors, or the job not finishing within the polling budget).
    """

    # Input parameters for the IEDB NetMHCPan / NetMHCIIPan predictors.
    input_parameters = {
        'alleles': alleles,
        'peptide_length_range': length_range,
        'predictors': [{'type': 'binding', 'method': m} for m in methods],
    }

    # Only send peptide_shift when the caller supplied one.  (Previously the
    # key was also placed in the dict unconditionally, so a null value was
    # sent even when no shift was requested and this conditional was dead.)
    if peptide_shift:
        input_parameters['peptide_shift'] = peptide_shift

    stage = {
        'stage_number': 1,
        'tool_group': tool_group,
        'input_sequence_text': sequence_text,
        'input_parameters': input_parameters
    }

    params = {
        'pipeline_id': "",
        'run_stage_range': [1, 1],
        'stages': [stage]
    }

    req = urllib.request.Request(url, method='POST')
    req_data = json.dumps(params).encode('utf-8')
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    req.add_header('Content-Length', len(req_data))

    # Make an initial request to submit the job.
    response_json = iedb_request(req, timeout, retries, error_retry_sleep, req_data=req_data)
    if not response_json:
        warn_err('Initial request failed.')
        return None

    # Check warnings returned by the job submission; unknown alleles are fatal.
    warnings = response_json.get('warnings')
    if warnings:
        invalid_alleles = False
        for warning in warnings:
            if 'cannot predict binding for allele' in warning:
                # (Fixed "allelle" typo in the message.)
                warn_err(f"Error: Bad allele input. {warning}")
                invalid_alleles = True
        if invalid_alleles:
            return None

        warn_err(f'Warnings from IEDB: {warnings}')

    errors = response_json.get('errors')
    if errors:
        warn_err(f'Errors from IEDB: {errors}')
        return None

    results_uri = response_json.get('results_uri')
    if not results_uri:
        warn_err('No results URI provided from IEDB.')
        return None

    # Callback used by iedb_request to rate-limit poll requests.
    def poll_response_fn(response, retry):
        response_json = json.loads(response.read().decode('utf-8'))
        if response_json['status'] != 'done':
            if retry == poll_retries:
                warn_err('Job not finished in maximum allowed time.')
            backoff_count = min(retry, max_backoff_count)

            # Double the sleep every requests_before_backoff requests.
            sleep_duration = init_poll_sleep * 2 ** (backoff_count // requests_before_backoff)
            time.sleep(sleep_duration)
            return None
        return response_json

    # Poll for results until the job reports 'done' or retries run out.
    response_json = iedb_request(results_uri, timeout, poll_retries,
                                 error_retry_sleep, response_fn=poll_response_fn)
    if not response_json:
        warn_err('Retrieving results failed.')
    return response_json
182 | |
183 | |
def warn_err(msg, exit_code=None):
    """Write *msg* to stderr; if *exit_code* is truthy, terminate with it."""
    print(msg, file=sys.stderr, flush=True)
    if exit_code:
        sys.exit(exit_code)
189 | |
190 | |
def add_reversed_sequences(file_lines, file_format):
    """Add a reversed copy after each input sequence.

    Takes a FASTA file (as a list of lines) when *file_format* is 'fasta',
    otherwise a plain list of sequences.  Each reversed FASTA sequence keeps
    the same header prefixed with 'reversed_'.  Returns the interleaved
    list of original and reversed entries.
    """
    sequences_with_reversed = []
    if file_format == 'fasta':
        i = 0
        n = len(file_lines)
        while i < n:

            # Skip blank lines between records (previously these crashed
            # the header check with an IndexError).
            if not file_lines[i].strip():
                i += 1
                continue

            # Validate header of the next sequence.
            seq_header = file_lines[i]
            if not seq_header.startswith('>'):
                print('Invalid FASTA. Exiting.', file=sys.stderr)
                sys.exit(1)

            # Aggregate the (possibly multi-line) sequence into one string.
            j = i + 1
            seq_parts = []
            while j < n and not file_lines[j].startswith('>'):
                seq_parts.append(file_lines[j])
                j += 1
            seq = ''.join(seq_parts)

            # Add the original header and sequence.
            sequences_with_reversed.append(seq_header)
            sequences_with_reversed.append(seq)

            # Add the reversed header and sequence.  Only the leading '>' is
            # rewritten; the previous str.replace rewrote every '>' in the
            # header, corrupting headers that contained additional '>'s.
            sequences_with_reversed.append('>reversed_' + seq_header[1:])
            sequences_with_reversed.append(seq[::-1])

            # Advance to what should be the next sequence header.
            i = j

    # If not FASTA, this is a simple list of peptides to reverse sequentially.
    else:
        for seq in file_lines:
            sequences_with_reversed.append(seq)
            sequences_with_reversed.append(seq[::-1])

    return sequences_with_reversed
244 | |
245 | |
def __main__():
    """Command-line entry point.

    Parses arguments, reads sequences (FASTA or tabular), submits an IEDB
    Next-Generation pipeline job, and writes the returned peptide table as
    TSV (with an extra 'reversed' flag column) to the output file.
    """
    # Parse Command Line
    parser = argparse.ArgumentParser()
    parser.add_argument('-T', '--tool-group',
                        dest='tool_group',
                        default='mhci',
                        choices=tool_group_methods.keys(),
                        help='IEDB API Tool Group')
    parser.add_argument('-m', '--method',
                        action="append",
                        required=True,
                        choices=all_methods,
                        help='prediction method')
    parser.add_argument('-A', '--allelefile',
                        required=True,
                        help='File of HLA alleles')
    parser.add_argument('-l', '--lengthrange',
                        help='length range for which to make predictions for alleles')
    parser.add_argument('-P', '--peptide_shift',
                        type=int,
                        default=None,
                        help='Peptide Shift')
    parser.add_argument('-i', '--input',
                        required=True,
                        help='Input file for peptide sequences '
                        + '(fasta or tabular)')
    parser.add_argument('-c', '--column',
                        default=None,
                        help='Zero-indexed peptide column in a tabular input file')
    parser.add_argument('-o', '--output',
                        required=True,
                        help='Output file for query results')
    parser.add_argument('-t', '--timeout',
                        type=int,
                        default=600,
                        help='Seconds to wait for server response')
    parser.add_argument('-r', '--retries',
                        type=int,
                        default=5,
                        help='Number of times to retry failed server query')
    parser.add_argument('-S', '--sleep',
                        type=int,
                        default=300,
                        help='Seconds to wait between failed server query retries')
    parser.add_argument('-R', '--add-reversed',
                        dest='add_reversed',
                        action='store_true',
                        help='Input has every other sequence reversed. Identify in output.')
    args = parser.parse_args()

    allele_string = ','.join(parse_alleles(args.allelefile))

    # -l is effectively required: fail with a clear message instead of the
    # previous AttributeError on None.split when it is omitted.
    if not args.lengthrange:
        warn_err('Missing -l/--lengthrange value', exit_code=1)
    length_range = [int(i) for i in args.lengthrange.split(',')]

    pipeline_url = 'https://api-nextgen-tools.iedb.org/api/v1/pipeline'

    # Read the sequence input file.
    try:
        with open(args.input) as inf:
            sequence_file_contents = inf.read()
    except Exception as e:
        warn_err(f'Unable to open input file: {e}', exit_code=1)

    sequence_file_lines = sequence_file_contents.splitlines()

    # Pick out sequences if the input file has multiple columns,
    # otherwise submit the list of sequences as-is.
    if not args.column:
        # IEDB may take FASTA files directly, so pass the contents as-is.
        if args.add_reversed:
            sequence_text = '\n'.join(add_reversed_sequences(sequence_file_lines, 'fasta'))
        else:
            sequence_text = sequence_file_contents
    else:
        sequences = parse_sequence_column(sequence_file_lines, int(args.column))
        if args.add_reversed:
            sequence_text = '\n'.join(add_reversed_sequences(sequences, 'tsv'))
        else:
            sequence_text = '\n'.join(sequences)

    if not sequence_text:
        warn_err('Error parsing sequences', exit_code=1)

    # Submit the job and wait for results.
    results = pipeline_request(pipeline_url, args.tool_group, sequence_text,
                               allele_string, length_range, args.method,
                               peptide_shift=args.peptide_shift, timeout=args.timeout,
                               retries=args.retries, error_retry_sleep=args.sleep)
    if not results:
        warn_err('Job failed. Exiting.', exit_code=1)

    seq_num_index = None
    try:
        peptide_table = [t for t in results['data']['results'] if t['type'] == 'peptide_table'][0]
        peptide_table_data = peptide_table['table_data']
        peptide_table_columns = peptide_table['table_columns']

        # If we reversed peptides prior to IEDB input, find the column index
        # of the sequence number so reversed entries can be identified.
        if args.add_reversed:
            for i, column in enumerate(peptide_table_columns):
                if column['display_name'] == 'seq #':
                    seq_num_index = i
                    break
    except (KeyError, IndexError) as e:
        warn_err(f'Error parsing IEDB results: {e}', exit_code=1)

    # Previously a missing 'seq #' column led to a NameError in the row loop.
    if args.add_reversed and seq_num_index is None:
        warn_err("Could not find 'seq #' column in IEDB results.", exit_code=1)

    output_path = os.path.abspath(args.output)
    with open(output_path, 'w') as output_file:
        # Write column names plus the extra 'reversed' column.
        display_names = '\t'.join([c['display_name'] for c in peptide_table_columns] + ['reversed'])
        print(display_names, file=output_file)

        # Write data rows.
        for values in peptide_table_data:
            if args.add_reversed:
                seq_number = values[seq_num_index]
                # Every original input sequence is followed by its reversed
                # sequence, so even sequence numbers are reversed.
                reversed_val = str(seq_number % 2 == 0).lower()
            else:
                reversed_val = 'false'
            row = '\t'.join([str(v) for v in values] + [reversed_val])
            print(row, file=output_file)
369 | |
370 | |
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    __main__()