# HG changeset patch # User jjohnson # Date 1582670254 18000 # Node ID 4a89ba6cfc63e0ad454ecf8094a51a0f7e768457 # Parent 9914246054924dd8d8550c2a95eaf2351a91814f "planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/iedb_api commit 18698e056ccc2d6d37836bd22728e2d8765e92ec" diff -r 991424605492 -r 4a89ba6cfc63 iedb_api.py --- a/iedb_api.py Mon Feb 17 16:04:07 2020 -0500 +++ b/iedb_api.py Tue Feb 25 17:37:34 2020 -0500 @@ -1,142 +1,281 @@ #!/usr/bin/env python -""" -""" -import sys + +import argparse import os.path import re -import optparse -import urllib -import urllib2 -from optparse import OptionParser +import sys +import time + +from urllib.error import HTTPError +from urllib.parse import urlencode, unquote +from urllib.request import urlopen -mhci_methods = ['recommended','consensus','netmhcpan','ann','smmpmbec','smm','comblib_sidney2008','netmhccons','pickpocket'] -mhcii_methods = ['recommended','consensus3','NetMHCIIpan','nn_align','smm_align','comblib','tepitope'] -processing_methods = ['recommended','consensus','netmhcpan','ann','smmpmbec','smm','comblib_sidney2008'] -mhcnp_methods = ['mhcnp'] -bcell_methods = ['Bepipred','Chou-FasmanEmini','Karplus-Schulz','Kolaskar-Tongaonkar','Parker'] -prediction_methods = {'mhci':mhci_methods,'mhcii':mhcii_methods,'processing':processing_methods,'mhcnp':mhcnp_methods,'bcell':bcell_methods} +mhci_methods = ['recommended', 'consensus', + 'netmhcpan_ba', 'netmhcpan_el', + 'ann', 'smmpmbec', 'smm', + 'comblib_sidney2008', 'netmhccons', + 'pickpocket', 'netmhcstabpan'] +mhcii_methods = ['recommended', 'consensus', 'NetMHCIIpan', + 'nn_align', 'smm_align', 'comblib', 'tepitope'] +processing_methods = ['recommended', 'netmhcpan', 'ann', + 'smmpmbec', 'smm', 'comblib_sidney2008', + 'netmhccons', 'pickpocket'] +mhcnp_methods = ['mhcnp', 'netmhcpan'] +bcell_methods = ['Bepipred', 'Chou-Fasman', 'Emini', 'Karplus-Schulz', + 'Kolaskar-Tongaonkar', 'Parker', 'Bepipred-2.0'] +prediction_methods = {'mhci': 
mhci_methods, + 'mhcii': mhcii_methods, + 'processing': processing_methods, + 'mhcnp': mhcnp_methods, + 'bcell': bcell_methods} +all_methods = set(mhci_methods + mhcii_methods + + mhcnp_methods + bcell_methods) +prediction_lengths = {'mhci': range(8, 16), + 'mhcii': range(11, 31), + 'processing': range(8, 15), + 'mhcnp': range(8, 12), + 'bcell': range(8, 16)} -def warn_err(msg,exit_code=1): - sys.stderr.write(msg) - if exit_code: - sys.exit(exit_code) + +def warn_err(msg, exit_code=1): + sys.stderr.write(msg) + if exit_code: + sys.exit(exit_code) def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-p', '--prediction', dest='prediction', default='mhci', choices=['mhci','mhcii','processing','mhcnp','bcell'], help='IEDB API prediction service' ) - parser.add_option( '-s', '--sequence', dest='sequence', action="append", default=None, help='Peptide Sequence' ) - parser.add_option( '-m', '--method', dest='method', default='recommended', choices=['recommended','consensus','netmhcpan','ann','smmpmbec','smm','comblib_sidney2008','netmhccons','pickpocket' ], help='prediction method' ) - parser.add_option( '-a', '--allele', dest='allele', action="append", default=[], help='Alleles for which to make predictions' ) - parser.add_option( '-l', '--length', dest='length', action="append", default=[], choices=['8', '9', '10', '11', '12', '13', '14', '15'], help='lengths for which to make predictions, 1 per allele' ) - parser.add_option( '-i', '--input', dest='input', default=None, help='Input file for peptide sequences (fasta or tabular)' ) - parser.add_option( '-c', '--column', dest='column', default=None, help='Peptide Column in a tabular input file' ) - parser.add_option( '-C', '--id_column', dest='id_column', default=None, help='ID Column in a tabular input file' ) - parser.add_option( '-o', '--output', dest='output', default=None, help='Output file for query results' ) - parser.add_option( '-d', '--debug', dest='debug', 
action='store_true', default=False, help='Turn on wrapper debugging to stderr' ) - (options, args) = parser.parse_args() - - aapat = '^[ABCDEFGHIKLMNPQRSTVWY]+$' - - if not options.allele and options.prediction != 'bcell': - warn_err('-a allele required\n', exit_code=1) - - if not (options.sequence or options.input): - warn_err('NO Sequences given: either -s sequence or -i input_file is required\n', exit_code=1) + # Parse Command Line + parser = argparse.ArgumentParser(description='', epilog='') + parser.add_argument('-p', '--prediction', + default='mhci', + choices=prediction_methods.keys(), + help='IEDB API prediction service') + parser.add_argument('-s', '--sequence', + action="append", + default=None, + help='Peptide Sequence') + parser.add_argument('-m', '--method', + default='recommended', + choices=all_methods, + help='prediction method') + parser.add_argument('-P', '--proteasome', + default=None, + choices=['immuno', 'constitutive'], + help='IEDB processing proteasome type') + parser.add_argument('-a', '--allele', + action="append", + default=[], + help='Alleles for which to make predictions') + parser.add_argument('-l', '--length', + action="append", + default=[], + help='lengths for which to make predictions, ' + + '1 per allele') + parser.add_argument('-w', '--window_size', + type=int, + default=None, + help='window_size for bcell prediction') + parser.add_argument('-i', '--input', + default=None, + help='Input file for peptide sequences ' + + '(fasta or tabular)') + parser.add_argument('-c', '--column', + default=None, + help='Peptide Column in a tabular input file') + parser.add_argument('-C', '--id_column', + default=None, + help='ID Column in a tabular input file') + parser.add_argument('-o', '--output', + default=None, + help='Output file for query results') + parser.add_argument('-O', '--output2', + default='iedb_results2', + help='Output file for secondary query results') + parser.add_argument('-t', '--timeout', + type=int, + default=600, + 
help='Seconds to wait for server response') + parser.add_argument('-r', '--retries', + type=int, + default=5, + help='Number of times to retry server query') + parser.add_argument('-S', '--sleep', + type=int, + default=300, + help='Seconds to wait between retries') + parser.add_argument('-d', '--debug', + action='store_true', + default=False, + help='Turn on wrapper debugging to stderr') + args = parser.parse_args() - if options.output != None: - try: - outputPath = os.path.abspath(options.output) - outputFile = open(outputPath, 'w') - except Exception, e: - warn_err("Unable to open output file: %s\n" % e, exit_code=1) - else: - outputFile = sys.stdout + aapat = '^[ABCDEFGHIKLMNPQRSTVWY]+$' + + if not args.allele and args.prediction != 'bcell': + warn_err('-a allele required\n', exit_code=1) - url = 'http://tools-api.iedb.org/tools_api/%s/' % options.prediction + if not (args.sequence or args.input): + warn_err('NO Sequences given: ' + + 'either -s sequence or -i input_file is required\n', + exit_code=1) - #TODO parse alleles from the options.alleles file - alleles = ','.join(options.allele) - lengths = ','.join(options.length) - method = options.method + if args.output is not None: + try: + outputPath = os.path.abspath(args.output) + outputFile = open(outputPath, 'w') + except Exception as e: + warn_err("Unable to open output file: %s\n" % e, exit_code=1) + else: + outputFile = sys.stdout - results = [] - global header - header = None + url = 'http://tools-cluster-interface.iedb.org/tools_api/%s/' %\ + args.prediction + len_param = 'length' if args.prediction != 'bcell' else 'window_size' - sequence_text = [] - def add_seq(seqid,seq): - sequence_text.append(">%s\n%s" % (seqid if seqid else "peptide%d" % len(sequence_text),seq)) - - def query(url,seq,allele,length,seqid=None,method='recommended'): + # TODO parse alleles from the args.alleles file + alleles = ','.join(args.allele) if args.prediction != 'bcell' else None + lengths = ','.join(args.length) + if 
args.prediction == 'bcell': + lengths = args.window_size + method = args.method + proteasome = args.proteasome if args.prediction == 'processing' else None global header - params = dict() - if method: - params['method'] = method - params['sequence_text'] = seq - params['allele'] = allele - params['length'] = length - data = urllib.urlencode(params) - request = urllib2.Request(url, data) - if options.debug: - print >> sys.stderr, "url %s %s %s" % (request.get_full_url(), seqid if seqid else "None", seq) - response = None - response = urllib2.urlopen(request) - if response and response.getcode() == 200: - resp_data = response.readlines() - for line in resp_data: - if line.find('eptide') > 0: - header = "#%s%s" % ("ID\t" if seqid else "", line) - continue - if seqid: - results.append("%s\t%s" % (seqid,line)) - else: - results.append(line) - elif not response: - warn_err("NO response from IEDB server\n", exit_code=3) - else: - warn_err("Error connecting to IEDB server\n", exit_code=response.getcode()) + header = None + results = [] + global header2 + header2 = None + results2 = [] + + sequence_text = [] + + def add_seq(seqid, seq): + sid = seqid if seqid else "peptide%d" % len(sequence_text) + sequence_text.append(">%s\n%s" % (sid, seq)) - if options.sequence: - for i,seq in enumerate(options.sequence): - query(url,seq,alleles,lengths,seqid=None,method=method) - if options.input: - try: - fh = open(options.input,'r') - if options.column: ## tabular - col = int(options.column) - idcol = int(options.id_column) if options.id_column else None - for i,line in enumerate(fh): - fields = line.split('\t') - if len(fields) > col: - seq = re.sub('[_*]','',fields[col]) - if re.match(aapat,seq): - seqid = fields[idcol] if idcol != None and idcol < len(fields) else None - query(url,seq,alleles,lengths,seqid=seqid,method=method) - else: - warn_err('Line %d, Not a peptide: %s\n' % (i,seq),exit_code=None) - else: ## fasta - seqid = None - seq = '' - for i,line in enumerate(fh): - if 
line.startswith('>'): - if seqid and len(seq) > 0: - query(url,seq,alleles,lengths,seqid=seqid,method=method) - seqid = line[1:].strip() - seq = '' - else: - seq += line.strip() - if seqid and len(seq) > 0: - query(url,seq,alleles,lengths,seqid=seqid,method=method) - fh.close() - except Exception, e: - warn_err("Unable to open input file: %s\n" % e, exit_code=1) + def query(url, seq, allele, length, seqid=None, method='recommended'): + global header + global header2 + params = dict() + if method: + params['method'] = method.encode() + if proteasome: + params['proteasome'] = proteasome.encode() + params['sequence_text'] = seq.encode() + if allele is not None: + params['allele'] = allele.encode() + if length is not None: + params[len_param] = str(length).encode() + req_data = urlencode(params) + if args.debug: + print('url %s %s' % (url, unquote(req_data)), file=sys.stderr) + retries = max(0, args.retries) + 1 + for retry in range(1, retries): + response = None + try: + response = urlopen(url, data=req_data.encode('utf-8'), + timeout=args.timeout) + if response and response.getcode() == 200: + data = [line.decode() for line in response.readlines()] + if args.debug: + print(data, file=sys.stderr) + rslts = results + for ln, line in enumerate(data): + if line.lower().find('invalid') >= 0: + msg = '%s %s\n%s' % (url, unquote(req_data), + ''.join(data)) + warn_err(msg, exit_code=1) + if line.find('eptide') > 0: + header = "#%s%s" %\ + ("ID\t" if seqid else "", line) + if args.debug: + print(header, file=sys.stderr) + continue + elif method == 'Bepipred' and line.find('Residue') > 0: + header2 = "#%s%s" %\ + ("ID\t" if seqid else "", line) + if args.debug: + print(header2, file=sys.stderr) + rslts = results2 + continue + if seqid: + rslts.append("%s\t%s" % (seqid, line)) + else: + rslts.append(line) + break + else: + code = response.getcode() if response else 1 + warn_err("Error connecting to IEDB server\n", + exit_code=code) + except HTTPError as e: + code = None if 
retry < args.retries else e.code + warn_err("%d of %d Error connecting to IEDB server %s\n" % + (retry, retries, e), + exit_code=code) + time.sleep(args.sleep) + except Exception as e: + warn_err("Error connecting to IEDB server %s\n" % e, + exit_code=3) - if header: - outputFile.write(header) - for line in results: - outputFile.write(line) + if args.sequence: + for i, seq in enumerate(args.sequence): + query(url, seq, alleles, lengths, seqid=None, method=method) + if args.input: + try: + fh = open(args.input, 'r') + if args.column: # tabular + col = int(args.column) + idcol = int(args.id_column) if args.id_column else None + for i, line in enumerate(fh): + fields = line.split('\t') + if len(fields) > col: + seq = re.sub('[_*]', '', fields[col]) + if re.match(aapat, seq): + if idcol is not None and idcol < len(fields): + seqid = fields[idcol] + else: + seqid = None + query(url, seq, alleles, lengths, + seqid=seqid, method=method) + else: + warn_err('Line %d, Not a peptide: %s\n' % (i, seq), + exit_code=None) + else: # fasta + seqid = None + seq = '' + for i, line in enumerate(fh): + if line.startswith('>'): + if seqid and len(seq) > 0: + query(url, seq, alleles, lengths, + seqid=seqid, method=method) + seqid = line[1:].strip() + seq = '' + else: + seq += line.strip() + if seqid and len(seq) > 0: + query(url, seq, alleles, lengths, + seqid=seqid, method=method) + fh.close() + except Exception as e: + warn_err("Unable to open input file: %s\n" % e, exit_code=1) -if __name__ == "__main__": __main__() + if header: + outputFile.write(header) + for line in results: + outputFile.write(line) + if results2: + if args.output2: + try: + outPath = os.path.abspath(args.output2) + outFile = open(outPath, 'w') + except Exception as e: + warn_err("Unable to open output file: %s\n" % e, exit_code=1) + else: + outFile = sys.stdout + if header2: + outFile.write(header2) + for line in results2: + outFile.write(line) + +if __name__ == "__main__": + __main__() diff -r 991424605492 -r 
4a89ba6cfc63 iedb_api.xml --- a/iedb_api.xml Mon Feb 17 16:04:07 2020 -0500 +++ b/iedb_api.xml Tue Feb 25 17:37:34 2020 -0500 @@ -1,17 +1,81 @@ - + MHC Binding prediction + + + + + + + + + + The dataset should have one allele per line. The allele may be followed by an optional comma-separated list of peptide lengths, e.g.: @HLALEN_EXAMPLES@ + + + + + Enter alleles separated by white space: @HLA_EXAMPLES@ (The peptide lengths may follow each allele: @HLALEN_EXAMPLES@) + ^@HLA_REGEX@(\s+@HLA_REGEX@)*$ + + + + + + python - - - - 0: + #if len($fields) > 1: + #for $alen in $fields[1:]: + -a '$allele' -l $alen + #end for + #else: + #for $alen in str($prediction.lengths).split(','): + -a '$allele' -l $alen + #end for + #end if + #end if + #end for + #else: + #for $word in str($prediction.alleles.allele_text).strip().split(): + #set $fields = $word.strip().split(',') + #set $allele = $fields[0].strip() + #if len($allele) > 0: + #if len($fields) > 1: + #for $alen in $fields[1:]: + -a '$allele' -l $alen + #end for + #else: + #for $alen in str($prediction.lengths).split(','): + -a '$allele' -l $alen + #end for + #end if + #end if + #end for + #end if + #end if + #if $sequence.seqsrc == 'fasta': - -i $sequence.seq_fasta + -i '$sequence.seq_fasta' #else if $sequence.seqsrc == 'tabular': - -i $sequence.seq_tsv + -i '$sequence.seq_tsv' -c #echo int(str($sequence.pep_col)) - 1 #if $sequence.id_col: -C #echo int(str($sequence.id_col)) - 1 @@ -21,42 +85,131 @@ -s $seq.strip() #end for #end if - #if $alleles.allelesrc == 'history': - #for $line in open(str($alleles.allele_file)): - #set $fields = $line.strip().split(',') - #set $allele = $fields[0].strip() - #if len($allele) > 0: - #if len($fields) > 1: - #for $alen in $fields[1:]: - -a $allele -l $alen - #end for - #else: - #for $alen in str($lengths).split(','): - -a $allele -l $alen - #end for - #end if - #end if - #end for - #else: - #for $word in str($alleles.allele_text).strip().split(): - #set $fields = $word.strip().split(',') 
- #set $allele = $fields[0].strip() - #if len($allele) > 0: - #if len($fields) > 1: - #for $alen in $fields[1:]: - -a $allele -l $alen - #end for - #else: - #for $alen in str($lengths).split(','): - -a $allele -l $alen - #end for - #end if - #end if - #end for - #end if -o $output ]]> + + + + + + + + + + + + + + + + + + + + + + + Used for any alleles which don't include specified lengths + + + + + + + + + + + + + + + + + + + + + + + + Used for any alleles which don't include specified lengths + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Used for any alleles which don't include specified lengths + + + + + + + + + + + + + + + + + Used for any alleles which don't include specified lengths + + + + + + + + + + + + + + + + + + @@ -75,139 +228,157 @@ - - - - - - - - The dataset should have on allele per line. The allele may be followed by an optional comma-separated list of pepttide lengths, e.g.: HLA-A*02:01,8,9 - - - - - Enter alleles separated by white space: HLA-A*03:01 HLA-B*07:02 (The peptide lengths may follow each allele: HLA-A*03:01,8,9,10 HLA-B*07:02,9 - ^(HLA-([A-C]|D[PQR][AB]1)\*[0-9][[0-9]:[0-9][0-9](,(8|9|10|11|12|13|14|15))*)(\s+HLA-([A-C]|D[PQR][AB]1)\*[0-9][[0-9]:[0-9][0-9](,(8|9|10|11|12|13|14|15))*)*$ - - - - - Used for any alleles which don't include specified lengths - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + prediction['method'].startswith('Bepipred') + - - - - - - - + + + + + + + + + + + + + + + - - - - - - + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -215,14 +386,15 @@ pep1 +VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTE +>pep2 +AGHAHKVPRRLLKAAR +>pep3 +ALKAADASADADGSGSGSGSGAGHAHKVPRRLLKAAR diff -r 991424605492 
-r 4a89ba6cfc63 test-data/seqs.fa --- a/test-data/seqs.fa Mon Feb 17 16:04:07 2020 -0500 +++ b/test-data/seqs.fa Tue Feb 25 17:37:34 2020 -0500 @@ -1,4 +1,4 @@ >peptide1 -AGHAHKVPRRLLKAAR +GHAHKVPRRLLKAAR >peptide2 -ALKAADASADADGSGSGSGSG +LKAADASADADGSGSGSGSG diff -r 991424605492 -r 4a89ba6cfc63 test-data/seqs.tsv --- a/test-data/seqs.tsv Mon Feb 17 16:04:07 2020 -0500 +++ b/test-data/seqs.tsv Tue Feb 25 17:37:34 2020 -0500 @@ -1,2 +1,2 @@ -peptide1 16 AGHAHKVPRRLLKAAR -peptide2 21 ALKAADASADADGSGSGSGSG +peptide1 16 GHAHKVPRRLLKAAR +peptide2 21 LKAADASADADGSGSGSGSG