Mercurial > repos > davidvanzessen > change_o
comparison MakeDb.py @ 0:183edf446dcf draft default tip
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 17 Jul 2017 07:44:27 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:183edf446dcf |
---|---|
1 #!/usr/bin/env python3 | |
2 """ | |
3 Create tab-delimited database file to store sequence alignment information | |
4 """ | |
5 # Info | |
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | |
7 from changeo import __version__, __date__ | |
8 | |
9 # Imports | |
10 import os | |
11 import sys | |
12 from argparse import ArgumentParser | |
13 from collections import OrderedDict | |
14 from textwrap import dedent | |
15 from time import time | |
16 from Bio import SeqIO | |
17 | |
18 # Presto and changeo imports | |
19 from presto.Defaults import default_out_args | |
20 from presto.Annotation import parseAnnotation | |
21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile | |
22 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs | |
23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo | |
24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT | |
25 | |
26 | |
27 def getFilePrefix(aligner_output, out_args): | |
28 """ | |
29 Get file name prefix and create output directory | |
30 | |
31 Arguments: | |
32 aligner_output : aligner output file or directory. | |
33 out_args : dictionary of output arguments. | |
34 | |
35 Returns: | |
36 str : file name prefix. | |
37 """ | |
38 # Determine output directory | |
39 if not out_args['out_dir']: | |
40 out_dir = os.path.dirname(os.path.abspath(aligner_output)) | |
41 else: | |
42 out_dir = os.path.abspath(out_args['out_dir']) | |
43 if not os.path.exists(out_dir): | |
44 os.mkdir(out_dir) | |
45 | |
46 # Determine file prefix | |
47 if out_args['out_name']: | |
48 file_prefix = out_args['out_name'] | |
49 else: | |
50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] | |
51 | |
52 return os.path.join(out_dir, file_prefix) | |
53 | |
54 | |
55 def getSeqDict(seq_file): | |
56 """ | |
57 Create a dictionary from a sequence file. | |
58 | |
59 Arguments: | |
60 seq_file : sequence file. | |
61 | |
62 Returns: | |
63 dict : sequence description as keys with Bio.SeqRecords as values. | |
64 """ | |
65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), | |
66 key_function=lambda x: x.description) | |
67 | |
68 return seq_dict | |
69 | |
70 | |
71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, | |
72 out_args=default_out_args): | |
73 """ | |
74 Writes tab-delimited database file in output directory. | |
75 | |
76 Arguments: | |
77 db : a iterator of IgRecord objects containing alignment data. | |
78 fields : a list of ordered field names to write. | |
79 file_prefix : directory and prefix for CLIP tab-delim file. | |
80 total_count : number of records (for progress bar). | |
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. | |
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
83 partial : if True put incomplete alignments in the pass file. | |
84 out_args : common output argument dictionary from parseCommonArgs. | |
85 | |
86 Returns: | |
87 None | |
88 """ | |
89 # Function to check for valid records strictly | |
90 def _pass_strict(rec): | |
91 valid = [rec.v_call and rec.v_call != 'None', | |
92 rec.j_call and rec.j_call != 'None', | |
93 rec.functional is not None, | |
94 rec.seq_vdj, | |
95 rec.junction] | |
96 return all(valid) | |
97 | |
98 # Function to check for valid records loosely | |
99 def _pass_gentle(rec): | |
100 valid = [rec.v_call and rec.v_call != 'None', | |
101 rec.d_call and rec.d_call != 'None', | |
102 rec.j_call and rec.j_call != 'None'] | |
103 return any(valid) | |
104 | |
105 # Set pass criteria | |
106 _pass = _pass_gentle if partial else _pass_strict | |
107 | |
108 # Define output file names | |
109 pass_file = '%s_db-pass.tab' % file_prefix | |
110 fail_file = '%s_db-fail.tab' % file_prefix | |
111 | |
112 # Initiate handles, writers and counters | |
113 pass_handle = None | |
114 fail_handle = None | |
115 pass_writer = None | |
116 fail_writer = None | |
117 start_time = time() | |
118 rec_count = pass_count = fail_count = 0 | |
119 | |
120 # Validate and write output | |
121 printProgress(0, total_count, 0.05, start_time) | |
122 for i, record in enumerate(db, start=1): | |
123 | |
124 # Replace sequence description with full string, if required | |
125 if id_dict is not None and record.id in id_dict: | |
126 record.id = id_dict[record.id] | |
127 | |
128 # Parse sequence description into new columns | |
129 if not no_parse: | |
130 try: | |
131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | |
132 record.id = record.annotations['ID'] | |
133 del record.annotations['ID'] | |
134 | |
135 # TODO: This is not the best approach. should pass in output fields. | |
136 # If first record, use parsed description to define extra columns | |
137 if i == 1: fields.extend(list(record.annotations.keys())) | |
138 except IndexError: | |
139 # Could not parse pRESTO-style annotations so fall back to no parse | |
140 no_parse = True | |
141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') | |
142 | |
143 # Count pass or fail and write to appropriate file | |
144 if _pass(record): | |
145 # Open pass file | |
146 if pass_writer is None: | |
147 pass_handle = open(pass_file, 'wt') | |
148 pass_writer = getDbWriter(pass_handle, add_fields=fields) | |
149 | |
150 # Write row to pass file | |
151 pass_count += 1 | |
152 pass_writer.writerow(record.toDict()) | |
153 else: | |
154 # Open failed file | |
155 if out_args['failed'] and fail_writer is None: | |
156 fail_handle = open(fail_file, 'wt') | |
157 fail_writer = getDbWriter(fail_handle, add_fields=fields) | |
158 | |
159 # Write row to fail file if specified | |
160 fail_count += 1 | |
161 if fail_writer is not None: | |
162 fail_writer.writerow(record.toDict()) | |
163 | |
164 # Print progress | |
165 printProgress(i, total_count, 0.05, start_time) | |
166 | |
167 # Print consol log | |
168 log = OrderedDict() | |
169 log['OUTPUT'] = pass_file | |
170 log['PASS'] = pass_count | |
171 log['FAIL'] = fail_count | |
172 log['END'] = 'MakeDb' | |
173 printLog(log) | |
174 | |
175 if pass_handle is not None: pass_handle.close() | |
176 if fail_handle is not None: fail_handle.close() | |
177 | |
178 | |
179 # TODO: may be able to merge with other mains | |
180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, | |
181 parse_scores=False, parse_regions=False, parse_junction=False, | |
182 out_args=default_out_args): | |
183 """ | |
184 Main for IMGT aligned sample sequences. | |
185 | |
186 Arguments: | |
187 aligner_output : zipped file or unzipped folder output by IMGT. | |
188 seq_file : FASTA file input to IMGT (from which to get seqID). | |
189 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
190 partial : If True put incomplete alignments in the pass file. | |
191 parse_scores : if True add alignment score fields to output file. | |
192 parse_regions : if True add FWR and CDR region fields to output file. | |
193 out_args : common output argument dictionary from parseCommonArgs. | |
194 | |
195 Returns: | |
196 None | |
197 """ | |
198 # Print parameter info | |
199 log = OrderedDict() | |
200 log['START'] = 'MakeDb' | |
201 log['ALIGNER'] = 'IMGT' | |
202 log['ALIGNER_OUTPUT'] = aligner_output | |
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | |
204 log['NO_PARSE'] = no_parse | |
205 log['PARTIAL'] = partial | |
206 log['SCORES'] = parse_scores | |
207 log['REGIONS'] = parse_regions | |
208 log['JUNCTION'] = parse_junction | |
209 printLog(log) | |
210 | |
211 start_time = time() | |
212 printMessage('Loading sequence files', start_time=start_time, width=25) | |
213 # Extract IMGT files | |
214 temp_dir, imgt_files = extractIMGT(aligner_output) | |
215 # Count records in IMGT files | |
216 total_count = countDbFile(imgt_files['summary']) | |
217 # Get (parsed) IDs from fasta file submitted to IMGT | |
218 id_dict = getIDforIMGT(seq_file) if seq_file else {} | |
219 printMessage('Done', start_time=start_time, end=True, width=25) | |
220 | |
221 # Parse IMGT output and write db | |
222 with open(imgt_files['summary'], 'r') as summary_handle, \ | |
223 open(imgt_files['gapped'], 'r') as gapped_handle, \ | |
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \ | |
225 open(imgt_files['junction'], 'r') as junction_handle: | |
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, | |
227 parse_scores=parse_scores, parse_regions=parse_regions, | |
228 parse_junction=parse_junction) | |
229 file_prefix = getFilePrefix(aligner_output, out_args) | |
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, | |
231 no_parse=no_parse, partial=partial, out_args=out_args) | |
232 | |
233 # Cleanup temp directory | |
234 temp_dir.cleanup() | |
235 | |
236 return None | |
237 | |
238 | |
239 # TODO: may be able to merge with other mains | |
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, | |
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, | |
242 out_args=default_out_args): | |
243 """ | |
244 Main for IgBLAST aligned sample sequences. | |
245 | |
246 Arguments: | |
247 aligner_output : IgBLAST output file to process. | |
248 seq_file : fasta file input to IgBlast (from which to get sequence). | |
249 repo : folder with germline repertoire files. | |
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
251 partial : If True put incomplete alignments in the pass file. | |
252 parse_regions : if True add FWR and CDR fields to output file. | |
253 parse_scores : if True add alignment score fields to output file. | |
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST | |
255 out_args : common output argument dictionary from parseCommonArgs. | |
256 | |
257 Returns: | |
258 None | |
259 """ | |
260 # Print parameter info | |
261 log = OrderedDict() | |
262 log['START'] = 'MakeDB' | |
263 log['ALIGNER'] = 'IgBlast' | |
264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) | |
265 log['SEQ_FILE'] = os.path.basename(seq_file) | |
266 log['NO_PARSE'] = no_parse | |
267 log['PARTIAL'] = partial | |
268 log['SCORES'] = parse_scores | |
269 log['REGIONS'] = parse_regions | |
270 printLog(log) | |
271 | |
272 start_time = time() | |
273 printMessage('Loading sequence files', start_time=start_time, width=25) | |
274 # Count records in sequence file | |
275 total_count = countSeqFile(seq_file) | |
276 # Get input sequence dictionary | |
277 seq_dict = getSeqDict(seq_file) | |
278 # Create germline repo dictionary | |
279 repo_dict = readRepo(repo) | |
280 printMessage('Done', start_time=start_time, end=True, width=25) | |
281 | |
282 # Parse and write output | |
283 with open(aligner_output, 'r') as f: | |
284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict, | |
285 parse_scores=parse_scores, parse_regions=parse_regions, | |
286 parse_igblast_cdr3=parse_igblast_cdr3) | |
287 file_prefix = getFilePrefix(aligner_output, out_args) | |
288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, | |
289 no_parse=no_parse, partial=partial, out_args=out_args) | |
290 | |
291 return None | |
292 | |
293 | |
294 # TODO: may be able to merge with other mains | |
295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, | |
296 parse_scores=False, parse_regions=False, out_args=default_out_args): | |
297 """ | |
298 Main for iHMMuneAlign aligned sample sequences. | |
299 | |
300 Arguments: | |
301 aligner_output : iHMMune-Align output file to process. | |
302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence). | |
303 repo : folder with germline repertoire files. | |
304 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
305 partial : If True put incomplete alignments in the pass file. | |
306 parse_scores : if True parse alignment scores. | |
307 parse_regions : if True add FWR and CDR region fields. | |
308 out_args : common output argument dictionary from parseCommonArgs. | |
309 | |
310 Returns: | |
311 None | |
312 """ | |
313 # Print parameter info | |
314 log = OrderedDict() | |
315 log['START'] = 'MakeDB' | |
316 log['ALIGNER'] = 'iHMMune-Align' | |
317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) | |
318 log['SEQ_FILE'] = os.path.basename(seq_file) | |
319 log['NO_PARSE'] = no_parse | |
320 log['PARTIAL'] = partial | |
321 log['SCORES'] = parse_scores | |
322 log['REGIONS'] = parse_regions | |
323 printLog(log) | |
324 | |
325 start_time = time() | |
326 printMessage('Loading sequence files', start_time=start_time, width=25) | |
327 # Count records in sequence file | |
328 total_count = countSeqFile(seq_file) | |
329 # Get input sequence dictionary | |
330 seq_dict = getSeqDict(seq_file) | |
331 # Create germline repo dictionary | |
332 repo_dict = readRepo(repo) | |
333 printMessage('Done', start_time=start_time, end=True, width=25) | |
334 | |
335 # Parse and write output | |
336 with open(aligner_output, 'r') as f: | |
337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict, | |
338 parse_scores=parse_scores, parse_regions=parse_regions) | |
339 file_prefix = getFilePrefix(aligner_output, out_args) | |
340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, | |
341 no_parse=no_parse, partial=partial, out_args=out_args) | |
342 | |
343 return None | |
344 | |
345 | |
346 def getArgParser(): | |
347 """ | |
348 Defines the ArgumentParser. | |
349 | |
350 Returns: | |
351 argparse.ArgumentParser | |
352 """ | |
353 fields = dedent( | |
354 ''' | |
355 output files: | |
356 db-pass | |
357 database of alignment records with functionality information, | |
358 V and J calls, and a junction region. | |
359 db-fail | |
360 database with records that fail due to no functionality information | |
361 (did not pass IMGT), no V call, no J call, or no junction region. | |
362 | |
363 universal output fields: | |
364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, | |
365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, | |
366 V_CALL, D_CALL, J_CALL, | |
367 V_SEQ_START, V_SEQ_LENGTH, | |
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, | |
369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | |
370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, | |
371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | |
372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | |
373 | |
374 imgt specific output fields: | |
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, | |
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, | |
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, | |
378 | |
379 igblast specific output fields: | |
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, | |
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP. | |
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA | |
384 | |
385 ihmm specific output fields: | |
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
387 HMM_SCORE | |
388 ''') | |
389 | |
390 # Define ArgumentParser | |
391 parser = ArgumentParser(description=__doc__, epilog=fields, | |
392 formatter_class=CommonHelpFormatter) | |
393 parser.add_argument('--version', action='version', | |
394 version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) | |
395 subparsers = parser.add_subparsers(title='subcommands', dest='command', | |
396 help='Aligner used', metavar='') | |
397 # TODO: This is a temporary fix for Python issue 9253 | |
398 subparsers.required = True | |
399 | |
400 # Parent parser | |
401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | |
402 | |
403 # IgBlast Aligner | |
404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], | |
405 formatter_class=CommonHelpFormatter, | |
406 help='Process IgBLAST output.', | |
407 description='Process IgBLAST output.') | |
408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
409 required=True, | |
410 help='''IgBLAST output files in format 7 with query sequence | |
411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | |
412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
413 help='''List of folders and/or fasta files containing | |
414 IMGT-gapped germline sequences corresponding to the | |
415 set of germlines used in the IgBLAST alignment.''') | |
416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
417 required=True, | |
418 help='''List of input FASTA files (with .fasta, .fna or .fa | |
419 extension), containing sequences.''') | |
420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | |
421 help='''Specify to prevent input sequence headers from being parsed | |
422 to add new columns to database. Parsing of sequence headers requires | |
423 headers to be in the pRESTO annotation format, so this should be specified | |
424 when sequence headers are incompatible with the pRESTO annotation scheme. | |
425 Note, unrecognized header formats will default to this behavior.''') | |
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial', | |
427 help='''If specified, include incomplete V(D)J alignments in | |
428 the pass file instead of the fail file.''') | |
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', | |
430 help='''Specify if alignment score metrics should be | |
431 included in the output. Adds the V_SCORE, V_IDENTITY, | |
432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | |
433 J_BTOP, and J_EVALUE columns.''') | |
434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', | |
435 help='''Specify if IMGT FWR and CDRs should be | |
436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
438 CDR3_IMGT columns.''') | |
439 parser_igblast.add_argument('--cdr3', action='store_true', | |
440 dest='parse_igblast_cdr3', | |
441 help='''Specify if the CDR3 sequences generated by IgBLAST | |
442 should be included in the output. Adds the columns | |
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST | |
444 version 1.5 or greater.''') | |
445 parser_igblast.set_defaults(func=parseIgBLAST) | |
446 | |
447 # IMGT aligner | |
448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], | |
449 formatter_class=CommonHelpFormatter, | |
450 help='''Process IMGT/HighV-Quest output | |
451 (does not work with V-QUEST).''', | |
452 description='''Process IMGT/HighV-Quest output | |
453 (does not work with V-QUEST).''') | |
454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
455 help='''Either zipped IMGT output files (.zip or .txz) or a | |
456 folder containing unzipped IMGT output files (which must | |
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | |
458 and 6_Junction).''') | |
459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | |
460 required=False, | |
461 help='''List of input FASTA files (with .fasta, .fna or .fa | |
462 extension) containing sequences.''') | |
463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | |
464 help='''Specify to prevent input sequence headers from being parsed | |
465 to add new columns to database. Parsing of sequence headers requires | |
466 headers to be in the pRESTO annotation format, so this should be specified | |
467 when sequence headers are incompatible with the pRESTO annotation scheme. | |
468 Note, unrecognized header formats will default to this behavior.''') | |
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial', | |
470 help='''If specified, include incomplete V(D)J alignments in | |
471 the pass file instead of the fail file.''') | |
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', | |
473 help='''Specify if alignment score metrics should be | |
474 included in the output. Adds the V_SCORE, V_IDENTITY, | |
475 J_SCORE and J_IDENTITY.''') | |
476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', | |
477 help='''Specify if IMGT FWRs and CDRs should be | |
478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
480 CDR3_IMGT columns.''') | |
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', | |
482 help='''Specify if detailed junction fields should be | |
483 included in the output. Adds the columns | |
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, | |
485 P5J_LENGTH, D_FRAME.''') | |
486 parser_imgt.set_defaults(func=parseIMGT) | |
487 | |
488 # iHMMuneAlign Aligner | |
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], | |
490 formatter_class=CommonHelpFormatter, | |
491 help='Process iHMMune-Align output.', | |
492 description='Process iHMMune-Align output.') | |
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
494 required=True, | |
495 help='''iHMMune-Align output file.''') | |
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
497 help='''List of folders and/or FASTA files containing | |
498 IMGT-gapped germline sequences corresponding to the | |
499 set of germlines used in the IgBLAST alignment.''') | |
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
501 required=True, | |
502 help='''List of input FASTA files (with .fasta, .fna or .fa | |
503 extension) containing sequences.''') | |
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', | |
505 help='''Specify to prevent input sequence headers from being parsed | |
506 to add new columns to database. Parsing of sequence headers requires | |
507 headers to be in the pRESTO annotation format, so this should be specified | |
508 when sequence headers are incompatible with the pRESTO annotation scheme. | |
509 Note, unrecognized header formats will default to this behavior.''') | |
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial', | |
511 help='''If specified, include incomplete V(D)J alignments in | |
512 the pass file instead of the fail file.''') | |
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', | |
514 help='''Specify if alignment score metrics should be | |
515 included in the output. Adds the path score of the | |
516 iHMMune-Align hidden Markov model to HMM_SCORE.''') | |
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', | |
518 help='''Specify if IMGT FWRs and CDRs should be | |
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
521 CDR3_IMGT columns.''') | |
522 parser_ihmm.set_defaults(func=parseIHMM) | |
523 | |
524 return parser | |
525 | |
526 | |
527 if __name__ == "__main__": | |
528 """ | |
529 Parses command line arguments and calls main | |
530 """ | |
531 parser = getArgParser() | |
532 checkArgs(parser) | |
533 args = parser.parse_args() | |
534 args_dict = parseCommonArgs(args, in_arg='aligner_outputs') | |
535 | |
536 # Set no ID parsing if sequence files are not provided | |
537 if 'seq_files' in args_dict and not args_dict['seq_files']: | |
538 args_dict['no_parse'] = True | |
539 | |
540 # Delete | |
541 if 'seq_files' in args_dict: del args_dict['seq_files'] | |
542 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] | |
543 if 'command' in args_dict: del args_dict['command'] | |
544 if 'func' in args_dict: del args_dict['func'] | |
545 | |
546 if args.command == 'imgt': | |
547 for i in range(len(args.__dict__['aligner_outputs'])): | |
548 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] | |
549 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | |
550 if args.__dict__['seq_files'] else None | |
551 args.func(**args_dict) | |
552 elif args.command == 'igblast' or args.command == 'ihmm': | |
553 for i in range(len(args.__dict__['aligner_outputs'])): | |
554 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] | |
555 args_dict['seq_file'] = args.__dict__['seq_files'][i] | |
556 args.func(**args_dict) |