comparison MakeDb.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:183edf446dcf
1 #!/usr/bin/env python3
2 """
3 Create tab-delimited database file to store sequence alignment information
4 """
5 # Info
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__
8
9 # Imports
10 import os
11 import sys
12 from argparse import ArgumentParser
13 from collections import OrderedDict
14 from textwrap import dedent
15 from time import time
16 from Bio import SeqIO
17
18 # Presto and changeo imports
19 from presto.Defaults import default_out_args
20 from presto.Annotation import parseAnnotation
21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
22 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
25
26
27 def getFilePrefix(aligner_output, out_args):
28 """
29 Get file name prefix and create output directory
30
31 Arguments:
32 aligner_output : aligner output file or directory.
33 out_args : dictionary of output arguments.
34
35 Returns:
36 str : file name prefix.
37 """
38 # Determine output directory
39 if not out_args['out_dir']:
40 out_dir = os.path.dirname(os.path.abspath(aligner_output))
41 else:
42 out_dir = os.path.abspath(out_args['out_dir'])
43 if not os.path.exists(out_dir):
44 os.mkdir(out_dir)
45
46 # Determine file prefix
47 if out_args['out_name']:
48 file_prefix = out_args['out_name']
49 else:
50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
51
52 return os.path.join(out_dir, file_prefix)
53
54
55 def getSeqDict(seq_file):
56 """
57 Create a dictionary from a sequence file.
58
59 Arguments:
60 seq_file : sequence file.
61
62 Returns:
63 dict : sequence description as keys with Bio.SeqRecords as values.
64 """
65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
66 key_function=lambda x: x.description)
67
68 return seq_dict
69
70
71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
72 out_args=default_out_args):
73 """
74 Writes tab-delimited database file in output directory.
75
76 Arguments:
77 db : a iterator of IgRecord objects containing alignment data.
78 fields : a list of ordered field names to write.
79 file_prefix : directory and prefix for CLIP tab-delim file.
80 total_count : number of records (for progress bar).
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
83 partial : if True put incomplete alignments in the pass file.
84 out_args : common output argument dictionary from parseCommonArgs.
85
86 Returns:
87 None
88 """
89 # Function to check for valid records strictly
90 def _pass_strict(rec):
91 valid = [rec.v_call and rec.v_call != 'None',
92 rec.j_call and rec.j_call != 'None',
93 rec.functional is not None,
94 rec.seq_vdj,
95 rec.junction]
96 return all(valid)
97
98 # Function to check for valid records loosely
99 def _pass_gentle(rec):
100 valid = [rec.v_call and rec.v_call != 'None',
101 rec.d_call and rec.d_call != 'None',
102 rec.j_call and rec.j_call != 'None']
103 return any(valid)
104
105 # Set pass criteria
106 _pass = _pass_gentle if partial else _pass_strict
107
108 # Define output file names
109 pass_file = '%s_db-pass.tab' % file_prefix
110 fail_file = '%s_db-fail.tab' % file_prefix
111
112 # Initiate handles, writers and counters
113 pass_handle = None
114 fail_handle = None
115 pass_writer = None
116 fail_writer = None
117 start_time = time()
118 rec_count = pass_count = fail_count = 0
119
120 # Validate and write output
121 printProgress(0, total_count, 0.05, start_time)
122 for i, record in enumerate(db, start=1):
123
124 # Replace sequence description with full string, if required
125 if id_dict is not None and record.id in id_dict:
126 record.id = id_dict[record.id]
127
128 # Parse sequence description into new columns
129 if not no_parse:
130 try:
131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
132 record.id = record.annotations['ID']
133 del record.annotations['ID']
134
135 # TODO: This is not the best approach. should pass in output fields.
136 # If first record, use parsed description to define extra columns
137 if i == 1: fields.extend(list(record.annotations.keys()))
138 except IndexError:
139 # Could not parse pRESTO-style annotations so fall back to no parse
140 no_parse = True
141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
142
143 # Count pass or fail and write to appropriate file
144 if _pass(record):
145 # Open pass file
146 if pass_writer is None:
147 pass_handle = open(pass_file, 'wt')
148 pass_writer = getDbWriter(pass_handle, add_fields=fields)
149
150 # Write row to pass file
151 pass_count += 1
152 pass_writer.writerow(record.toDict())
153 else:
154 # Open failed file
155 if out_args['failed'] and fail_writer is None:
156 fail_handle = open(fail_file, 'wt')
157 fail_writer = getDbWriter(fail_handle, add_fields=fields)
158
159 # Write row to fail file if specified
160 fail_count += 1
161 if fail_writer is not None:
162 fail_writer.writerow(record.toDict())
163
164 # Print progress
165 printProgress(i, total_count, 0.05, start_time)
166
167 # Print consol log
168 log = OrderedDict()
169 log['OUTPUT'] = pass_file
170 log['PASS'] = pass_count
171 log['FAIL'] = fail_count
172 log['END'] = 'MakeDb'
173 printLog(log)
174
175 if pass_handle is not None: pass_handle.close()
176 if fail_handle is not None: fail_handle.close()
177
178
179 # TODO: may be able to merge with other mains
180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
181 parse_scores=False, parse_regions=False, parse_junction=False,
182 out_args=default_out_args):
183 """
184 Main for IMGT aligned sample sequences.
185
186 Arguments:
187 aligner_output : zipped file or unzipped folder output by IMGT.
188 seq_file : FASTA file input to IMGT (from which to get seqID).
189 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
190 partial : If True put incomplete alignments in the pass file.
191 parse_scores : if True add alignment score fields to output file.
192 parse_regions : if True add FWR and CDR region fields to output file.
193 out_args : common output argument dictionary from parseCommonArgs.
194
195 Returns:
196 None
197 """
198 # Print parameter info
199 log = OrderedDict()
200 log['START'] = 'MakeDb'
201 log['ALIGNER'] = 'IMGT'
202 log['ALIGNER_OUTPUT'] = aligner_output
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
204 log['NO_PARSE'] = no_parse
205 log['PARTIAL'] = partial
206 log['SCORES'] = parse_scores
207 log['REGIONS'] = parse_regions
208 log['JUNCTION'] = parse_junction
209 printLog(log)
210
211 start_time = time()
212 printMessage('Loading sequence files', start_time=start_time, width=25)
213 # Extract IMGT files
214 temp_dir, imgt_files = extractIMGT(aligner_output)
215 # Count records in IMGT files
216 total_count = countDbFile(imgt_files['summary'])
217 # Get (parsed) IDs from fasta file submitted to IMGT
218 id_dict = getIDforIMGT(seq_file) if seq_file else {}
219 printMessage('Done', start_time=start_time, end=True, width=25)
220
221 # Parse IMGT output and write db
222 with open(imgt_files['summary'], 'r') as summary_handle, \
223 open(imgt_files['gapped'], 'r') as gapped_handle, \
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
225 open(imgt_files['junction'], 'r') as junction_handle:
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
227 parse_scores=parse_scores, parse_regions=parse_regions,
228 parse_junction=parse_junction)
229 file_prefix = getFilePrefix(aligner_output, out_args)
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
231 no_parse=no_parse, partial=partial, out_args=out_args)
232
233 # Cleanup temp directory
234 temp_dir.cleanup()
235
236 return None
237
238
239 # TODO: may be able to merge with other mains
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
242 out_args=default_out_args):
243 """
244 Main for IgBLAST aligned sample sequences.
245
246 Arguments:
247 aligner_output : IgBLAST output file to process.
248 seq_file : fasta file input to IgBlast (from which to get sequence).
249 repo : folder with germline repertoire files.
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
251 partial : If True put incomplete alignments in the pass file.
252 parse_regions : if True add FWR and CDR fields to output file.
253 parse_scores : if True add alignment score fields to output file.
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
255 out_args : common output argument dictionary from parseCommonArgs.
256
257 Returns:
258 None
259 """
260 # Print parameter info
261 log = OrderedDict()
262 log['START'] = 'MakeDB'
263 log['ALIGNER'] = 'IgBlast'
264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
265 log['SEQ_FILE'] = os.path.basename(seq_file)
266 log['NO_PARSE'] = no_parse
267 log['PARTIAL'] = partial
268 log['SCORES'] = parse_scores
269 log['REGIONS'] = parse_regions
270 printLog(log)
271
272 start_time = time()
273 printMessage('Loading sequence files', start_time=start_time, width=25)
274 # Count records in sequence file
275 total_count = countSeqFile(seq_file)
276 # Get input sequence dictionary
277 seq_dict = getSeqDict(seq_file)
278 # Create germline repo dictionary
279 repo_dict = readRepo(repo)
280 printMessage('Done', start_time=start_time, end=True, width=25)
281
282 # Parse and write output
283 with open(aligner_output, 'r') as f:
284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
285 parse_scores=parse_scores, parse_regions=parse_regions,
286 parse_igblast_cdr3=parse_igblast_cdr3)
287 file_prefix = getFilePrefix(aligner_output, out_args)
288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
289 no_parse=no_parse, partial=partial, out_args=out_args)
290
291 return None
292
293
294 # TODO: may be able to merge with other mains
295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
296 parse_scores=False, parse_regions=False, out_args=default_out_args):
297 """
298 Main for iHMMuneAlign aligned sample sequences.
299
300 Arguments:
301 aligner_output : iHMMune-Align output file to process.
302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
303 repo : folder with germline repertoire files.
304 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
305 partial : If True put incomplete alignments in the pass file.
306 parse_scores : if True parse alignment scores.
307 parse_regions : if True add FWR and CDR region fields.
308 out_args : common output argument dictionary from parseCommonArgs.
309
310 Returns:
311 None
312 """
313 # Print parameter info
314 log = OrderedDict()
315 log['START'] = 'MakeDB'
316 log['ALIGNER'] = 'iHMMune-Align'
317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
318 log['SEQ_FILE'] = os.path.basename(seq_file)
319 log['NO_PARSE'] = no_parse
320 log['PARTIAL'] = partial
321 log['SCORES'] = parse_scores
322 log['REGIONS'] = parse_regions
323 printLog(log)
324
325 start_time = time()
326 printMessage('Loading sequence files', start_time=start_time, width=25)
327 # Count records in sequence file
328 total_count = countSeqFile(seq_file)
329 # Get input sequence dictionary
330 seq_dict = getSeqDict(seq_file)
331 # Create germline repo dictionary
332 repo_dict = readRepo(repo)
333 printMessage('Done', start_time=start_time, end=True, width=25)
334
335 # Parse and write output
336 with open(aligner_output, 'r') as f:
337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
338 parse_scores=parse_scores, parse_regions=parse_regions)
339 file_prefix = getFilePrefix(aligner_output, out_args)
340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
341 no_parse=no_parse, partial=partial, out_args=out_args)
342
343 return None
344
345
346 def getArgParser():
347 """
348 Defines the ArgumentParser.
349
350 Returns:
351 argparse.ArgumentParser
352 """
353 fields = dedent(
354 '''
355 output files:
356 db-pass
357 database of alignment records with functionality information,
358 V and J calls, and a junction region.
359 db-fail
360 database with records that fail due to no functionality information
361 (did not pass IMGT), no V call, no J call, or no junction region.
362
363 universal output fields:
364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
366 V_CALL, D_CALL, J_CALL,
367 V_SEQ_START, V_SEQ_LENGTH,
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
373
374 imgt specific output fields:
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
378
379 igblast specific output fields:
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
384
385 ihmm specific output fields:
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
387 HMM_SCORE
388 ''')
389
390 # Define ArgumentParser
391 parser = ArgumentParser(description=__doc__, epilog=fields,
392 formatter_class=CommonHelpFormatter)
393 parser.add_argument('--version', action='version',
394 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
395 subparsers = parser.add_subparsers(title='subcommands', dest='command',
396 help='Aligner used', metavar='')
397 # TODO: This is a temporary fix for Python issue 9253
398 subparsers.required = True
399
400 # Parent parser
401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
402
403 # IgBlast Aligner
404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
405 formatter_class=CommonHelpFormatter,
406 help='Process IgBLAST output.',
407 description='Process IgBLAST output.')
408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
409 required=True,
410 help='''IgBLAST output files in format 7 with query sequence
411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
413 help='''List of folders and/or fasta files containing
414 IMGT-gapped germline sequences corresponding to the
415 set of germlines used in the IgBLAST alignment.''')
416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
417 required=True,
418 help='''List of input FASTA files (with .fasta, .fna or .fa
419 extension), containing sequences.''')
420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
421 help='''Specify to prevent input sequence headers from being parsed
422 to add new columns to database. Parsing of sequence headers requires
423 headers to be in the pRESTO annotation format, so this should be specified
424 when sequence headers are incompatible with the pRESTO annotation scheme.
425 Note, unrecognized header formats will default to this behavior.''')
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial',
427 help='''If specified, include incomplete V(D)J alignments in
428 the pass file instead of the fail file.''')
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
430 help='''Specify if alignment score metrics should be
431 included in the output. Adds the V_SCORE, V_IDENTITY,
432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
433 J_BTOP, and J_EVALUE columns.''')
434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
435 help='''Specify if IMGT FWR and CDRs should be
436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
438 CDR3_IMGT columns.''')
439 parser_igblast.add_argument('--cdr3', action='store_true',
440 dest='parse_igblast_cdr3',
441 help='''Specify if the CDR3 sequences generated by IgBLAST
442 should be included in the output. Adds the columns
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
444 version 1.5 or greater.''')
445 parser_igblast.set_defaults(func=parseIgBLAST)
446
447 # IMGT aligner
448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
449 formatter_class=CommonHelpFormatter,
450 help='''Process IMGT/HighV-Quest output
451 (does not work with V-QUEST).''',
452 description='''Process IMGT/HighV-Quest output
453 (does not work with V-QUEST).''')
454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
455 help='''Either zipped IMGT output files (.zip or .txz) or a
456 folder containing unzipped IMGT output files (which must
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
458 and 6_Junction).''')
459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
460 required=False,
461 help='''List of input FASTA files (with .fasta, .fna or .fa
462 extension) containing sequences.''')
463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
464 help='''Specify to prevent input sequence headers from being parsed
465 to add new columns to database. Parsing of sequence headers requires
466 headers to be in the pRESTO annotation format, so this should be specified
467 when sequence headers are incompatible with the pRESTO annotation scheme.
468 Note, unrecognized header formats will default to this behavior.''')
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial',
470 help='''If specified, include incomplete V(D)J alignments in
471 the pass file instead of the fail file.''')
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
473 help='''Specify if alignment score metrics should be
474 included in the output. Adds the V_SCORE, V_IDENTITY,
475 J_SCORE and J_IDENTITY.''')
476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
477 help='''Specify if IMGT FWRs and CDRs should be
478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
480 CDR3_IMGT columns.''')
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
482 help='''Specify if detailed junction fields should be
483 included in the output. Adds the columns
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
485 P5J_LENGTH, D_FRAME.''')
486 parser_imgt.set_defaults(func=parseIMGT)
487
488 # iHMMuneAlign Aligner
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
490 formatter_class=CommonHelpFormatter,
491 help='Process iHMMune-Align output.',
492 description='Process iHMMune-Align output.')
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
494 required=True,
495 help='''iHMMune-Align output file.''')
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
497 help='''List of folders and/or FASTA files containing
498 IMGT-gapped germline sequences corresponding to the
499 set of germlines used in the IgBLAST alignment.''')
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
501 required=True,
502 help='''List of input FASTA files (with .fasta, .fna or .fa
503 extension) containing sequences.''')
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
505 help='''Specify to prevent input sequence headers from being parsed
506 to add new columns to database. Parsing of sequence headers requires
507 headers to be in the pRESTO annotation format, so this should be specified
508 when sequence headers are incompatible with the pRESTO annotation scheme.
509 Note, unrecognized header formats will default to this behavior.''')
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
511 help='''If specified, include incomplete V(D)J alignments in
512 the pass file instead of the fail file.''')
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
514 help='''Specify if alignment score metrics should be
515 included in the output. Adds the path score of the
516 iHMMune-Align hidden Markov model to HMM_SCORE.''')
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
518 help='''Specify if IMGT FWRs and CDRs should be
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
521 CDR3_IMGT columns.''')
522 parser_ihmm.set_defaults(func=parseIHMM)
523
524 return parser
525
526
527 if __name__ == "__main__":
528 """
529 Parses command line arguments and calls main
530 """
531 parser = getArgParser()
532 checkArgs(parser)
533 args = parser.parse_args()
534 args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
535
536 # Set no ID parsing if sequence files are not provided
537 if 'seq_files' in args_dict and not args_dict['seq_files']:
538 args_dict['no_parse'] = True
539
540 # Delete
541 if 'seq_files' in args_dict: del args_dict['seq_files']
542 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
543 if 'command' in args_dict: del args_dict['command']
544 if 'func' in args_dict: del args_dict['func']
545
546 if args.command == 'imgt':
547 for i in range(len(args.__dict__['aligner_outputs'])):
548 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
549 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
550 if args.__dict__['seq_files'] else None
551 args.func(**args_dict)
552 elif args.command == 'igblast' or args.command == 'ihmm':
553 for i in range(len(args.__dict__['aligner_outputs'])):
554 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
555 args_dict['seq_file'] = args.__dict__['seq_files'][i]
556 args.func(**args_dict)