0
|
1 #!/usr/bin/env python3
|
|
2 """
|
|
3 Create tab-delimited database file to store sequence alignment information
|
|
4 """
|
|
5 # Info
|
|
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
|
|
7 from changeo import __version__, __date__
|
|
8
|
|
9 # Imports
|
|
10 import os
|
|
11 import sys
|
|
12 from argparse import ArgumentParser
|
|
13 from collections import OrderedDict
|
|
14 from textwrap import dedent
|
|
15 from time import time
|
|
16 from Bio import SeqIO
|
|
17
|
|
18 # Presto and changeo imports
|
|
19 from presto.Defaults import default_out_args
|
|
20 from presto.Annotation import parseAnnotation
|
|
21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
|
|
22 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
|
|
23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
|
|
24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
|
|
25
|
|
26
|
|
27 def getFilePrefix(aligner_output, out_args):
|
|
28 """
|
|
29 Get file name prefix and create output directory
|
|
30
|
|
31 Arguments:
|
|
32 aligner_output : aligner output file or directory.
|
|
33 out_args : dictionary of output arguments.
|
|
34
|
|
35 Returns:
|
|
36 str : file name prefix.
|
|
37 """
|
|
38 # Determine output directory
|
|
39 if not out_args['out_dir']:
|
|
40 out_dir = os.path.dirname(os.path.abspath(aligner_output))
|
|
41 else:
|
|
42 out_dir = os.path.abspath(out_args['out_dir'])
|
|
43 if not os.path.exists(out_dir):
|
|
44 os.mkdir(out_dir)
|
|
45
|
|
46 # Determine file prefix
|
|
47 if out_args['out_name']:
|
|
48 file_prefix = out_args['out_name']
|
|
49 else:
|
|
50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
|
|
51
|
|
52 return os.path.join(out_dir, file_prefix)
|
|
53
|
|
54
|
|
55 def getSeqDict(seq_file):
|
|
56 """
|
|
57 Create a dictionary from a sequence file.
|
|
58
|
|
59 Arguments:
|
|
60 seq_file : sequence file.
|
|
61
|
|
62 Returns:
|
|
63 dict : sequence description as keys with Bio.SeqRecords as values.
|
|
64 """
|
|
65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
|
|
66 key_function=lambda x: x.description)
|
|
67
|
|
68 return seq_dict
|
|
69
|
|
70
|
|
71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
|
|
72 out_args=default_out_args):
|
|
73 """
|
|
74 Writes tab-delimited database file in output directory.
|
|
75
|
|
76 Arguments:
|
|
77 db : a iterator of IgRecord objects containing alignment data.
|
|
78 fields : a list of ordered field names to write.
|
|
79 file_prefix : directory and prefix for CLIP tab-delim file.
|
|
80 total_count : number of records (for progress bar).
|
|
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
|
|
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
|
|
83 partial : if True put incomplete alignments in the pass file.
|
|
84 out_args : common output argument dictionary from parseCommonArgs.
|
|
85
|
|
86 Returns:
|
|
87 None
|
|
88 """
|
|
89 # Function to check for valid records strictly
|
|
90 def _pass_strict(rec):
|
|
91 valid = [rec.v_call and rec.v_call != 'None',
|
|
92 rec.j_call and rec.j_call != 'None',
|
|
93 rec.functional is not None,
|
|
94 rec.seq_vdj,
|
|
95 rec.junction]
|
|
96 return all(valid)
|
|
97
|
|
98 # Function to check for valid records loosely
|
|
99 def _pass_gentle(rec):
|
|
100 valid = [rec.v_call and rec.v_call != 'None',
|
|
101 rec.d_call and rec.d_call != 'None',
|
|
102 rec.j_call and rec.j_call != 'None']
|
|
103 return any(valid)
|
|
104
|
|
105 # Set pass criteria
|
|
106 _pass = _pass_gentle if partial else _pass_strict
|
|
107
|
|
108 # Define output file names
|
|
109 pass_file = '%s_db-pass.tab' % file_prefix
|
|
110 fail_file = '%s_db-fail.tab' % file_prefix
|
|
111
|
|
112 # Initiate handles, writers and counters
|
|
113 pass_handle = None
|
|
114 fail_handle = None
|
|
115 pass_writer = None
|
|
116 fail_writer = None
|
|
117 start_time = time()
|
|
118 rec_count = pass_count = fail_count = 0
|
|
119
|
|
120 # Validate and write output
|
|
121 printProgress(0, total_count, 0.05, start_time)
|
|
122 for i, record in enumerate(db, start=1):
|
|
123
|
|
124 # Replace sequence description with full string, if required
|
|
125 if id_dict is not None and record.id in id_dict:
|
|
126 record.id = id_dict[record.id]
|
|
127
|
|
128 # Parse sequence description into new columns
|
|
129 if not no_parse:
|
|
130 try:
|
|
131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
|
|
132 record.id = record.annotations['ID']
|
|
133 del record.annotations['ID']
|
|
134
|
|
135 # TODO: This is not the best approach. should pass in output fields.
|
|
136 # If first record, use parsed description to define extra columns
|
|
137 if i == 1: fields.extend(list(record.annotations.keys()))
|
|
138 except IndexError:
|
|
139 # Could not parse pRESTO-style annotations so fall back to no parse
|
|
140 no_parse = True
|
|
141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
|
|
142
|
|
143 # Count pass or fail and write to appropriate file
|
|
144 if _pass(record):
|
|
145 # Open pass file
|
|
146 if pass_writer is None:
|
|
147 pass_handle = open(pass_file, 'wt')
|
|
148 pass_writer = getDbWriter(pass_handle, add_fields=fields)
|
|
149
|
|
150 # Write row to pass file
|
|
151 pass_count += 1
|
|
152 pass_writer.writerow(record.toDict())
|
|
153 else:
|
|
154 # Open failed file
|
|
155 if out_args['failed'] and fail_writer is None:
|
|
156 fail_handle = open(fail_file, 'wt')
|
|
157 fail_writer = getDbWriter(fail_handle, add_fields=fields)
|
|
158
|
|
159 # Write row to fail file if specified
|
|
160 fail_count += 1
|
|
161 if fail_writer is not None:
|
|
162 fail_writer.writerow(record.toDict())
|
|
163
|
|
164 # Print progress
|
|
165 printProgress(i, total_count, 0.05, start_time)
|
|
166
|
|
167 # Print consol log
|
|
168 log = OrderedDict()
|
|
169 log['OUTPUT'] = pass_file
|
|
170 log['PASS'] = pass_count
|
|
171 log['FAIL'] = fail_count
|
|
172 log['END'] = 'MakeDb'
|
|
173 printLog(log)
|
|
174
|
|
175 if pass_handle is not None: pass_handle.close()
|
|
176 if fail_handle is not None: fail_handle.close()
|
|
177
|
|
178
|
|
179 # TODO: may be able to merge with other mains
|
|
180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
|
|
181 parse_scores=False, parse_regions=False, parse_junction=False,
|
|
182 out_args=default_out_args):
|
|
183 """
|
|
184 Main for IMGT aligned sample sequences.
|
|
185
|
|
186 Arguments:
|
|
187 aligner_output : zipped file or unzipped folder output by IMGT.
|
|
188 seq_file : FASTA file input to IMGT (from which to get seqID).
|
|
189 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
|
|
190 partial : If True put incomplete alignments in the pass file.
|
|
191 parse_scores : if True add alignment score fields to output file.
|
|
192 parse_regions : if True add FWR and CDR region fields to output file.
|
|
193 out_args : common output argument dictionary from parseCommonArgs.
|
|
194
|
|
195 Returns:
|
|
196 None
|
|
197 """
|
|
198 # Print parameter info
|
|
199 log = OrderedDict()
|
|
200 log['START'] = 'MakeDb'
|
|
201 log['ALIGNER'] = 'IMGT'
|
|
202 log['ALIGNER_OUTPUT'] = aligner_output
|
|
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
|
|
204 log['NO_PARSE'] = no_parse
|
|
205 log['PARTIAL'] = partial
|
|
206 log['SCORES'] = parse_scores
|
|
207 log['REGIONS'] = parse_regions
|
|
208 log['JUNCTION'] = parse_junction
|
|
209 printLog(log)
|
|
210
|
|
211 start_time = time()
|
|
212 printMessage('Loading sequence files', start_time=start_time, width=25)
|
|
213 # Extract IMGT files
|
|
214 temp_dir, imgt_files = extractIMGT(aligner_output)
|
|
215 # Count records in IMGT files
|
|
216 total_count = countDbFile(imgt_files['summary'])
|
|
217 # Get (parsed) IDs from fasta file submitted to IMGT
|
|
218 id_dict = getIDforIMGT(seq_file) if seq_file else {}
|
|
219 printMessage('Done', start_time=start_time, end=True, width=25)
|
|
220
|
|
221 # Parse IMGT output and write db
|
|
222 with open(imgt_files['summary'], 'r') as summary_handle, \
|
|
223 open(imgt_files['gapped'], 'r') as gapped_handle, \
|
|
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
|
|
225 open(imgt_files['junction'], 'r') as junction_handle:
|
|
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
|
|
227 parse_scores=parse_scores, parse_regions=parse_regions,
|
|
228 parse_junction=parse_junction)
|
|
229 file_prefix = getFilePrefix(aligner_output, out_args)
|
|
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
|
|
231 no_parse=no_parse, partial=partial, out_args=out_args)
|
|
232
|
|
233 # Cleanup temp directory
|
|
234 temp_dir.cleanup()
|
|
235
|
|
236 return None
|
|
237
|
|
238
|
|
239 # TODO: may be able to merge with other mains
|
|
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
|
|
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
|
|
242 out_args=default_out_args):
|
|
243 """
|
|
244 Main for IgBLAST aligned sample sequences.
|
|
245
|
|
246 Arguments:
|
|
247 aligner_output : IgBLAST output file to process.
|
|
248 seq_file : fasta file input to IgBlast (from which to get sequence).
|
|
249 repo : folder with germline repertoire files.
|
|
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
|
|
251 partial : If True put incomplete alignments in the pass file.
|
|
252 parse_regions : if True add FWR and CDR fields to output file.
|
|
253 parse_scores : if True add alignment score fields to output file.
|
|
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
|
|
255 out_args : common output argument dictionary from parseCommonArgs.
|
|
256
|
|
257 Returns:
|
|
258 None
|
|
259 """
|
|
260 # Print parameter info
|
|
261 log = OrderedDict()
|
|
262 log['START'] = 'MakeDB'
|
|
263 log['ALIGNER'] = 'IgBlast'
|
|
264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
|
|
265 log['SEQ_FILE'] = os.path.basename(seq_file)
|
|
266 log['NO_PARSE'] = no_parse
|
|
267 log['PARTIAL'] = partial
|
|
268 log['SCORES'] = parse_scores
|
|
269 log['REGIONS'] = parse_regions
|
|
270 printLog(log)
|
|
271
|
|
272 start_time = time()
|
|
273 printMessage('Loading sequence files', start_time=start_time, width=25)
|
|
274 # Count records in sequence file
|
|
275 total_count = countSeqFile(seq_file)
|
|
276 # Get input sequence dictionary
|
|
277 seq_dict = getSeqDict(seq_file)
|
|
278 # Create germline repo dictionary
|
|
279 repo_dict = readRepo(repo)
|
|
280 printMessage('Done', start_time=start_time, end=True, width=25)
|
|
281
|
|
282 # Parse and write output
|
|
283 with open(aligner_output, 'r') as f:
|
|
284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
|
|
285 parse_scores=parse_scores, parse_regions=parse_regions,
|
|
286 parse_igblast_cdr3=parse_igblast_cdr3)
|
|
287 file_prefix = getFilePrefix(aligner_output, out_args)
|
|
288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
|
|
289 no_parse=no_parse, partial=partial, out_args=out_args)
|
|
290
|
|
291 return None
|
|
292
|
|
293
|
|
294 # TODO: may be able to merge with other mains
|
|
295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
|
|
296 parse_scores=False, parse_regions=False, out_args=default_out_args):
|
|
297 """
|
|
298 Main for iHMMuneAlign aligned sample sequences.
|
|
299
|
|
300 Arguments:
|
|
301 aligner_output : iHMMune-Align output file to process.
|
|
302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
|
|
303 repo : folder with germline repertoire files.
|
|
304 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
|
|
305 partial : If True put incomplete alignments in the pass file.
|
|
306 parse_scores : if True parse alignment scores.
|
|
307 parse_regions : if True add FWR and CDR region fields.
|
|
308 out_args : common output argument dictionary from parseCommonArgs.
|
|
309
|
|
310 Returns:
|
|
311 None
|
|
312 """
|
|
313 # Print parameter info
|
|
314 log = OrderedDict()
|
|
315 log['START'] = 'MakeDB'
|
|
316 log['ALIGNER'] = 'iHMMune-Align'
|
|
317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
|
|
318 log['SEQ_FILE'] = os.path.basename(seq_file)
|
|
319 log['NO_PARSE'] = no_parse
|
|
320 log['PARTIAL'] = partial
|
|
321 log['SCORES'] = parse_scores
|
|
322 log['REGIONS'] = parse_regions
|
|
323 printLog(log)
|
|
324
|
|
325 start_time = time()
|
|
326 printMessage('Loading sequence files', start_time=start_time, width=25)
|
|
327 # Count records in sequence file
|
|
328 total_count = countSeqFile(seq_file)
|
|
329 # Get input sequence dictionary
|
|
330 seq_dict = getSeqDict(seq_file)
|
|
331 # Create germline repo dictionary
|
|
332 repo_dict = readRepo(repo)
|
|
333 printMessage('Done', start_time=start_time, end=True, width=25)
|
|
334
|
|
335 # Parse and write output
|
|
336 with open(aligner_output, 'r') as f:
|
|
337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
|
|
338 parse_scores=parse_scores, parse_regions=parse_regions)
|
|
339 file_prefix = getFilePrefix(aligner_output, out_args)
|
|
340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
|
|
341 no_parse=no_parse, partial=partial, out_args=out_args)
|
|
342
|
|
343 return None
|
|
344
|
|
345
|
|
346 def getArgParser():
|
|
347 """
|
|
348 Defines the ArgumentParser.
|
|
349
|
|
350 Returns:
|
|
351 argparse.ArgumentParser
|
|
352 """
|
|
353 fields = dedent(
|
|
354 '''
|
|
355 output files:
|
|
356 db-pass
|
|
357 database of alignment records with functionality information,
|
|
358 V and J calls, and a junction region.
|
|
359 db-fail
|
|
360 database with records that fail due to no functionality information
|
|
361 (did not pass IMGT), no V call, no J call, or no junction region.
|
|
362
|
|
363 universal output fields:
|
|
364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
|
|
365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
|
|
366 V_CALL, D_CALL, J_CALL,
|
|
367 V_SEQ_START, V_SEQ_LENGTH,
|
|
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
|
|
369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
|
|
370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
|
|
371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
|
|
372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
|
|
373
|
|
374 imgt specific output fields:
|
|
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
|
|
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
|
|
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
|
|
378
|
|
379 igblast specific output fields:
|
|
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
|
|
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
|
|
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
|
|
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
|
|
384
|
|
385 ihmm specific output fields:
|
|
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
|
|
387 HMM_SCORE
|
|
388 ''')
|
|
389
|
|
390 # Define ArgumentParser
|
|
391 parser = ArgumentParser(description=__doc__, epilog=fields,
|
|
392 formatter_class=CommonHelpFormatter)
|
|
393 parser.add_argument('--version', action='version',
|
|
394 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
|
|
395 subparsers = parser.add_subparsers(title='subcommands', dest='command',
|
|
396 help='Aligner used', metavar='')
|
|
397 # TODO: This is a temporary fix for Python issue 9253
|
|
398 subparsers.required = True
|
|
399
|
|
400 # Parent parser
|
|
401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
|
|
402
|
|
403 # IgBlast Aligner
|
|
404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
|
|
405 formatter_class=CommonHelpFormatter,
|
|
406 help='Process IgBLAST output.',
|
|
407 description='Process IgBLAST output.')
|
|
408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
|
|
409 required=True,
|
|
410 help='''IgBLAST output files in format 7 with query sequence
|
|
411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
|
|
412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
413 help='''List of folders and/or fasta files containing
|
|
414 IMGT-gapped germline sequences corresponding to the
|
|
415 set of germlines used in the IgBLAST alignment.''')
|
|
416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
|
|
417 required=True,
|
|
418 help='''List of input FASTA files (with .fasta, .fna or .fa
|
|
419 extension), containing sequences.''')
|
|
420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
|
|
421 help='''Specify to prevent input sequence headers from being parsed
|
|
422 to add new columns to database. Parsing of sequence headers requires
|
|
423 headers to be in the pRESTO annotation format, so this should be specified
|
|
424 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
425 Note, unrecognized header formats will default to this behavior.''')
|
|
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial',
|
|
427 help='''If specified, include incomplete V(D)J alignments in
|
|
428 the pass file instead of the fail file.''')
|
|
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
|
|
430 help='''Specify if alignment score metrics should be
|
|
431 included in the output. Adds the V_SCORE, V_IDENTITY,
|
|
432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
|
|
433 J_BTOP, and J_EVALUE columns.''')
|
|
434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
|
|
435 help='''Specify if IMGT FWR and CDRs should be
|
|
436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
|
|
437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
|
|
438 CDR3_IMGT columns.''')
|
|
439 parser_igblast.add_argument('--cdr3', action='store_true',
|
|
440 dest='parse_igblast_cdr3',
|
|
441 help='''Specify if the CDR3 sequences generated by IgBLAST
|
|
442 should be included in the output. Adds the columns
|
|
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
|
|
444 version 1.5 or greater.''')
|
|
445 parser_igblast.set_defaults(func=parseIgBLAST)
|
|
446
|
|
447 # IMGT aligner
|
|
448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
|
|
449 formatter_class=CommonHelpFormatter,
|
|
450 help='''Process IMGT/HighV-Quest output
|
|
451 (does not work with V-QUEST).''',
|
|
452 description='''Process IMGT/HighV-Quest output
|
|
453 (does not work with V-QUEST).''')
|
|
454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
|
|
455 help='''Either zipped IMGT output files (.zip or .txz) or a
|
|
456 folder containing unzipped IMGT output files (which must
|
|
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
|
|
458 and 6_Junction).''')
|
|
459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
|
|
460 required=False,
|
|
461 help='''List of input FASTA files (with .fasta, .fna or .fa
|
|
462 extension) containing sequences.''')
|
|
463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
|
|
464 help='''Specify to prevent input sequence headers from being parsed
|
|
465 to add new columns to database. Parsing of sequence headers requires
|
|
466 headers to be in the pRESTO annotation format, so this should be specified
|
|
467 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
468 Note, unrecognized header formats will default to this behavior.''')
|
|
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial',
|
|
470 help='''If specified, include incomplete V(D)J alignments in
|
|
471 the pass file instead of the fail file.''')
|
|
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
|
|
473 help='''Specify if alignment score metrics should be
|
|
474 included in the output. Adds the V_SCORE, V_IDENTITY,
|
|
475 J_SCORE and J_IDENTITY.''')
|
|
476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
|
|
477 help='''Specify if IMGT FWRs and CDRs should be
|
|
478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
|
|
479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
|
|
480 CDR3_IMGT columns.''')
|
|
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
|
|
482 help='''Specify if detailed junction fields should be
|
|
483 included in the output. Adds the columns
|
|
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
|
|
485 P5J_LENGTH, D_FRAME.''')
|
|
486 parser_imgt.set_defaults(func=parseIMGT)
|
|
487
|
|
488 # iHMMuneAlign Aligner
|
|
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
|
|
490 formatter_class=CommonHelpFormatter,
|
|
491 help='Process iHMMune-Align output.',
|
|
492 description='Process iHMMune-Align output.')
|
|
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
|
|
494 required=True,
|
|
495 help='''iHMMune-Align output file.''')
|
|
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
497 help='''List of folders and/or FASTA files containing
|
|
498 IMGT-gapped germline sequences corresponding to the
|
|
499 set of germlines used in the IgBLAST alignment.''')
|
|
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
|
|
501 required=True,
|
|
502 help='''List of input FASTA files (with .fasta, .fna or .fa
|
|
503 extension) containing sequences.''')
|
|
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
|
|
505 help='''Specify to prevent input sequence headers from being parsed
|
|
506 to add new columns to database. Parsing of sequence headers requires
|
|
507 headers to be in the pRESTO annotation format, so this should be specified
|
|
508 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
509 Note, unrecognized header formats will default to this behavior.''')
|
|
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
|
|
511 help='''If specified, include incomplete V(D)J alignments in
|
|
512 the pass file instead of the fail file.''')
|
|
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
|
|
514 help='''Specify if alignment score metrics should be
|
|
515 included in the output. Adds the path score of the
|
|
516 iHMMune-Align hidden Markov model to HMM_SCORE.''')
|
|
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
|
|
518 help='''Specify if IMGT FWRs and CDRs should be
|
|
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
|
|
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
|
|
521 CDR3_IMGT columns.''')
|
|
522 parser_ihmm.set_defaults(func=parseIHMM)
|
|
523
|
|
524 return parser
|
|
525
|
|
526
|
|
527 if __name__ == "__main__":
|
|
528 """
|
|
529 Parses command line arguments and calls main
|
|
530 """
|
|
531 parser = getArgParser()
|
|
532 checkArgs(parser)
|
|
533 args = parser.parse_args()
|
|
534 args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
|
|
535
|
|
536 # Set no ID parsing if sequence files are not provided
|
|
537 if 'seq_files' in args_dict and not args_dict['seq_files']:
|
|
538 args_dict['no_parse'] = True
|
|
539
|
|
540 # Delete
|
|
541 if 'seq_files' in args_dict: del args_dict['seq_files']
|
|
542 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
|
|
543 if 'command' in args_dict: del args_dict['command']
|
|
544 if 'func' in args_dict: del args_dict['func']
|
|
545
|
|
546 if args.command == 'imgt':
|
|
547 for i in range(len(args.__dict__['aligner_outputs'])):
|
|
548 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
|
|
549 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
|
|
550 if args.__dict__['seq_files'] else None
|
|
551 args.func(**args_dict)
|
|
552 elif args.command == 'igblast' or args.command == 'ihmm':
|
|
553 for i in range(len(args.__dict__['aligner_outputs'])):
|
|
554 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
|
|
555 args_dict['seq_file'] = args.__dict__['seq_files'][i]
|
|
556 args.func(**args_dict)
|