comparison nextgen_iedb_api.py @ 0:88e44dab2988 draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/iedb_netmhcpan commit 0ac7534c8d9f5bfea21b998286f822784e62da08
author galaxyp
date Wed, 09 Jul 2025 12:56:30 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:88e44dab2988
1 #!/usr/bin/env python
2
3 """
4 This file was adapted from the iedb_apy.py file of the iedb_api tool.
5 It uses the newer "Next-Generation" IEDB API, and is constrained to
6 the mhci and mhcii tool groups.
7 """
8
9 import argparse
10 import json
11 import os.path
12 import re
13 import sys
14 import time
15 import urllib.request
16 from urllib.error import HTTPError
17
18 # IEDB tool groups and predictor methods
19 mhci_methods = ['netmhcpan_el', 'netmhcpan_ba']
20 mhcii_methods = ['netmhciipan_el', 'netmhciipan_ba']
21 tool_group_methods = {'mhci': mhci_methods,
22 'mhcii': mhcii_methods}
23 all_methods = set(mhci_methods + mhcii_methods)
24
25 # Values for polling backoff
26 max_backoff_count = 25
27 init_poll_sleep = 10
28 poll_retries = 50
29 requests_before_backoff = 5
30
31
32 def parse_alleles(allelefile):
33 """Returns a dictionary with alleles from input file."""
34 alleles = []
35 with open(allelefile, 'r') as fh:
36 for line in fh:
37 allele = line.strip()
38 alleles.append(allele)
39 return alleles
40
41
42 def parse_sequence_column(sequence_file_lines, col):
43 """Sequences may come from a specific column in a TSV file.
44
45 Parse these sequences out while checking each against a regex for validity.
46 """
47 aapat = '^[ABCDEFGHIKLMNPQRSTVWY]+$'
48 sequences = []
49 for i, line in enumerate(sequence_file_lines):
50 fields = line.split('\t')
51 if len(fields) > col:
52 seq = re.sub('[_*]', '', fields[col].strip())
53 if not re.match(aapat, seq):
54 warn_err(f'Line {i}, Not a peptide: {seq}')
55 else:
56 sequences.append(seq)
57 else:
58 warn_err('Invalid value for -c/--column')
59 break
60 return sequences
61
62
63 def iedb_request(req, timeout, retries, error_retry_sleep, response_fn=None, req_data=None):
64 """Handles HTTP request and exceptions. Allows for a callback to parse IEDB response."""
65 for retry in range(1, retries + 1):
66 response = None
67 try:
68 response = urllib.request.urlopen(req, req_data, timeout=timeout)
69 except HTTPError as e:
70 warn_err(f'{retry} of {retries} Error connecting to IEDB server. \
71 HTTP status code: {e.code}')
72 time.sleep(error_retry_sleep)
73 except Exception as e:
74 warn_err(f'Error getting results from IEDB server: {e}')
75 return None
76
77 if response and response.getcode() == 200:
78 # If no callback, return results
79 if not response_fn:
80 response_string = response.read().decode('utf-8')
81 response_json = json.loads(response_string)
82 return response_json
83
84 # Retry if response_fn callback deems necessary, i.e. results from job are not ready.
85 response_json = response_fn(response, retry)
86 if response_json:
87 return response_json
88 else:
89 code = response.getcode() if response else 1
90 warn_err(f'Error connecting to IEDB server. HTTP status code: {code}')
91
92 warn_err(f'No successful response from IEDB in {retries} retries')
93 return None
94
95
96 def pipeline_request(url, tool_group, sequence_text, alleles, length_range,
97 methods, peptide_shift, timeout=300, retries=3, error_retry_sleep=300):
98 """Submits job to IEDB pipeline and polls API until results are ready.
99
100 Returns response JSON from IEDB.
101 """
102
103 # Set up input parameters for IEDB NetMHCPan or NetMHCIIPan job
104 input_parameters = {
105 'alleles': alleles,
106 'peptide_length_range': length_range,
107 'predictors': [{'type': 'binding', 'method': m} for m in methods],
108 'peptide_shift': peptide_shift
109 }
110
111 if peptide_shift:
112 input_parameters['peptide_shift'] = peptide_shift
113
114 stage = {
115 'stage_number': 1,
116 'tool_group': tool_group,
117 'input_sequence_text': sequence_text,
118 'input_parameters': input_parameters
119 }
120
121 params = {
122 'pipeline_id': "",
123 'run_stage_range': [1, 1],
124 'stages': [stage]
125 }
126
127 req = urllib.request.Request(url, method='POST')
128 req_data = json.dumps(params).encode('utf-8')
129 req.add_header('Content-Type', 'application/json; charset=utf-8')
130 req.add_header('Content-Length', len(req_data))
131
132 # Make an initial request to submit job
133 response_json = iedb_request(req, timeout, retries, error_retry_sleep, req_data=req_data)
134 if not response_json:
135 warn_err('Initial request failed.')
136 return None
137
138 # Check response from job submission
139 warnings = response_json.get('warnings')
140 if warnings and len(warnings) > 0:
141 invalid_alleles = False
142 for warning in warnings:
143 if 'cannot predict binding for allele' in warning:
144 warn_err(f"Error: Bad allelle input. {warning}")
145 invalid_alleles = True
146 if invalid_alleles:
147 return None
148
149 warn_err(f'Warnings from IEDB: {warnings}')
150
151 errors = response_json.get('errors')
152 if errors:
153 warn_err(f'Errors from IEDB: {errors}')
154 return None
155
156 results_uri = response_json.get('results_uri')
157 if not results_uri:
158 warn_err('No results URI provided from IEDB.')
159 return None
160
161 # Callback function to rate-limit poll requests
162 def poll_response_fn(response, retry):
163 response_string = response.read().decode('utf-8')
164 response_json = json.loads(response_string)
165 if response_json['status'] != 'done':
166 if retry == poll_retries:
167 warn_err('Job not finished in maximum allowed time.')
168 backoff_count = min(retry, max_backoff_count)
169
170 # Double sleep every requests_before_backoff requests
171 sleep_duration = init_poll_sleep * 2 ** (backoff_count // requests_before_backoff)
172 time.sleep(sleep_duration)
173 return None
174 return response_json
175
176 # Submit polling for results
177 response_json = iedb_request(results_uri, timeout, poll_retries,
178 error_retry_sleep, response_fn=poll_response_fn)
179 if not response_json:
180 warn_err('Retrieving results failed.')
181 return response_json
182
183
184 def warn_err(msg, exit_code=None):
185 sys.stderr.write(f"{msg}\n")
186 sys.stderr.flush()
187 if exit_code:
188 sys.exit(exit_code)
189
190
191 def add_reversed_sequences(file_lines, file_format):
192 """Adds a reversed sequence after each input sequence.
193
194 Takes a plain list of sequences, or FASTA file. Each reversed FASTA sequence has
195 the same header prefixed with 'reversed_'.
196 """
197 sequences_with_reversed = []
198 if file_format == 'fasta':
199 i = 0
200 while i < len(file_lines):
201
202 # Validate header from next sequence
203 seq_header = file_lines[i]
204 if seq_header[0] != '>':
205 print('Invalid FASTA. Exiting.', file=sys.stderr)
206 sys.exit(1)
207
208 # Aggregate sequence into a single line
209 j = i + 1
210 seq = ''
211 while j < len(file_lines):
212 next_line = file_lines[j]
213 if next_line[0] == '>':
214 break
215 seq = seq + file_lines[j]
216 j += 1
217
218 # Add non-reversed sequence
219 sequences_with_reversed.append(seq_header)
220 sequences_with_reversed.append(seq)
221
222 # Add reversed header and sequence
223 rev_header = seq_header.replace('>', '>reversed_')
224 sequences_with_reversed.append(rev_header)
225
226 rev_seq = seq[::-1]
227 sequences_with_reversed.append(rev_seq)
228
229 # Advance index to what should be the next sequence header
230 i = j
231
232 # If not FASTA, should be a simple list of peptides to reverse sequentially
233 else:
234 for seq in file_lines:
235
236 # Reverse seq
237 rev_seq = f'{seq[::-1]}'
238
239 # Add original and reversed sequences
240 sequences_with_reversed.append(seq)
241 sequences_with_reversed.append(rev_seq)
242
243 return sequences_with_reversed
244
245
246 def __main__():
247 # Parse Command Line
248 parser = argparse.ArgumentParser()
249 parser.add_argument('-T', '--tool-group',
250 dest='tool_group',
251 default='mhci',
252 choices=tool_group_methods.keys(),
253 help='IEDB API Tool Group')
254 parser.add_argument('-m', '--method',
255 action="append",
256 required=True,
257 choices=all_methods,
258 help='prediction method')
259 parser.add_argument('-A', '--allelefile',
260 required=True,
261 help='File of HLA alleles')
262 parser.add_argument('-l', '--lengthrange',
263 help='length range for which to make predictions for alleles')
264 parser.add_argument('-P', '--peptide_shift',
265 type=int,
266 default=None,
267 help='Peptide Shift')
268 parser.add_argument('-i', '--input',
269 required=True,
270 help='Input file for peptide sequences '
271 + '(fasta or tabular)')
272 parser.add_argument('-c', '--column',
273 default=None,
274 help='Zero-indexed peptide column in a tabular input file')
275 parser.add_argument('-o', '--output',
276 required=True,
277 help='Output file for query results')
278 parser.add_argument('-t', '--timeout',
279 type=int,
280 default=600,
281 help='Seconds to wait for server response')
282 parser.add_argument('-r', '--retries',
283 type=int,
284 default=5,
285 help='Number of times to retry failed server query')
286 parser.add_argument('-S', '--sleep',
287 type=int,
288 default=300,
289 help='Seconds to wait between failed server query retries')
290 parser.add_argument('-R', '--add-reversed',
291 dest='add_reversed',
292 action='store_true',
293 help='Input has every other sequence reversed. Identify in output.')
294 args = parser.parse_args()
295
296 allele_string = ','.join(parse_alleles(args.allelefile))
297
298 length_range = [int(i) for i in args.lengthrange.split(',')]
299
300 pipeline_url = 'https://api-nextgen-tools.iedb.org/api/v1/pipeline'
301
302 # If sequences submitted as a file, parse out sequences.
303 try:
304 with open(args.input) as inf:
305 sequence_file_contents = inf.read()
306 except Exception as e:
307 warn_err(f'Unable to open input file: {e}', exit_code=1)
308
309 sequence_file_lines = sequence_file_contents.splitlines()
310
311 # Pick out sequences if input file has multiple columns,
312 # otherwise submit list of sequences as-is.
313 if not args.column:
314 # IEDB may take FASTA files directly, so input contents as-is
315 if args.add_reversed:
316 sequence_text = '\n'.join(add_reversed_sequences(sequence_file_lines, 'fasta'))
317 else:
318 sequence_text = sequence_file_contents
319 else:
320 sequences = parse_sequence_column(sequence_file_lines, int(args.column))
321 if args.add_reversed:
322 sequence_text = '\n'.join(add_reversed_sequences(sequences, 'tsv'))
323 else:
324 sequence_text = '\n'.join(sequences)
325
326 if len(sequence_text) == 0:
327 warn_err('Error parsing sequences', exit_code=1)
328
329 # Submit job and return results
330 results = pipeline_request(pipeline_url, args.tool_group, sequence_text,
331 allele_string, length_range, args.method,
332 peptide_shift=args.peptide_shift, timeout=args.timeout,
333 retries=args.retries, error_retry_sleep=args.sleep)
334 if not results:
335 warn_err('Job failed. Exiting.', exit_code=1)
336
337 try:
338 peptide_table = [t for t in results['data']['results'] if t['type'] == 'peptide_table'][0]
339 peptide_table_data = peptide_table['table_data']
340 peptide_table_columns = peptide_table['table_columns']
341
342 # If we reversed peptides prior to IEDB input,
343 # find column index of sequence number so we can identify which come from reversed input.
344 if args.add_reversed:
345 for i, column in enumerate(peptide_table_columns):
346 if column['display_name'] == 'seq #':
347 seq_num_index = i
348 break
349 except (KeyError, IndexError) as e:
350 warn_err(f'Error parsing IEDB results: {e}', exit_code=1)
351
352 output_path = os.path.abspath(args.output)
353 with open(output_path, 'w') as output_file:
354 # Write column names
355 display_names = '\t'.join([c['display_name'] for c in peptide_table_columns] + ['reversed'])
356 print(display_names, file=output_file)
357
358 # Write data
359 for values in peptide_table_data:
360 if args.add_reversed:
361 seq_number = values[seq_num_index]
362 # Every original input sequence is followed by its reversed sequence,
363 # so we know even sequence numbers are reversed.
364 reversed_val = str(seq_number % 2 == 0).lower()
365 else:
366 reversed_val = 'false'
367 values = '\t'.join([str(v) for v in values] + [reversed_val])
368 print(values, file=output_file)
369
370
371 if __name__ == "__main__":
372 __main__()