comparison change_o/MakeDb.py @ 78:aff3ba86ef7a draft

Uploaded
author davidvanzessen
date Mon, 31 Aug 2020 11:20:08 -0400
parents
children
comparison
equal deleted inserted replaced
77:58d2377b507d 78:aff3ba86ef7a
1 #!/usr/bin/env python3
2 """
3 Create tab-delimited database file to store sequence alignment information
4 """
5
6 # Info
7 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
8 from changeo import __version__, __date__
9
10 # Imports
11 import os
12 import re
13 import csv
14 from argparse import ArgumentParser
15 from collections import OrderedDict
16 from textwrap import dedent
17 from time import time
18 from Bio import SeqIO
19
20 # Presto and changeo imports
21 from presto.Annotation import parseAnnotation
22 from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile
23 from changeo.Defaults import default_format, default_out_args
24 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
25 from changeo.Alignment import RegionDefinition
26 from changeo.Gene import buildGermline
27 from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \
28 AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader
29 from changeo.Receptor import ChangeoSchema, AIRRSchema
30
31 # 10X Receptor attributes
32 cellranger_base = ['cell', 'c_call', 'conscount', 'umicount']
33 cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount',
34 'v_call_10x', 'd_call_10x', 'j_call_10x',
35 'junction_10x', 'junction_10x_aa']
36
37 def readCellRanger(cellranger_file, fields=cellranger_base):
38 """
39 Load a Cell Ranger annotation table
40
41 Arguments:
42 cellranger_file (str): path to the annotation file.
43 fields (list): list of fields to keep.
44
45 Returns:
46 dict: dict of dicts with contig_id as the primary key.
47 """
48 # Mapping of 10X annotations to Receptor attributes
49 cellranger_map = {'cell': 'barcode',
50 'c_call': 'c_gene',
51 'locus': 'chain',
52 'conscount': 'reads',
53 'umicount': 'umis',
54 'v_call_10x': 'v_gene',
55 'd_call_10x': 'd_gene',
56 'j_call_10x': 'j_gene',
57 'junction_10x': 'cdr3_nt',
58 'junction_10x_aa': 'cdr3'}
59
60 # Function to parse individual fields
61 def _parse(x):
62 return '' if x == 'None' else x
63
64 # Generate annotation dictionary
65 ann_dict = {}
66 with open(cellranger_file) as csv_file:
67 # Detect delimiters
68 dialect = csv.Sniffer().sniff(csv_file.readline())
69 csv_file.seek(0)
70 # Read in annotation file
71 csv_reader = csv.DictReader(csv_file, dialect=dialect)
72
73 # Generate annotation dictionary
74 for row in csv_reader:
75 ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields}
76
77 return ann_dict
78
79
80 def addGermline(receptor, references, amino_acid=False):
81 """
82 Add full length germline to Receptor object
83
84 Arguments:
85 receptor (changeo.Receptor.Receptor): Receptor object to modify.
86 references (dict): dictionary of IMGT-gapped references sequences.
87 amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline
88
89 Returns:
90 changeo.Receptor.Receptor: modified Receptor with the germline sequence added.
91 """
92 if amino_acid:
93 __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt',
94 amino_acid=True)
95 germline_seq = None if germlines is None else germlines['full']
96 receptor.setField('germline_aa_imgt', germline_seq)
97 else:
98 __, germlines, __ = buildGermline(receptor, references, amino_acid=False)
99 germline_seq = None if germlines is None else germlines['full']
100 receptor.setField('germline_imgt', germline_seq)
101
102 return receptor
103
104
105 def getIDforIMGT(seq_file):
106 """
107 Create a sequence ID translation using IMGT truncation.
108
109 Arguments:
110 seq_file : a fasta file of sequences input to IMGT.
111
112 Returns:
113 dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value.
114 """
115
116 # Create a sequence ID translation using IDs truncate up to space or 50 chars
117 ids = {}
118 for rec in readSeqFile(seq_file):
119 if len(rec.description) <= 50:
120 id_key = rec.description
121 else:
122 id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
123 ids.update({id_key: rec.description})
124
125 return ids
126
127
128 def getSeqDict(seq_file):
129 """
130 Create a dictionary from a sequence file.
131
132 Arguments:
133 seq_file : sequence file.
134
135 Returns:
136 dict : sequence description as keys with Bio.SeqRecords as values.
137 """
138 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description)
139
140 return seq_dict
141
142
143 def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
144 amino_acid=False, partial=False, asis_id=True, regions='default',
145 writer=AIRRWriter, out_file=None, out_args=default_out_args):
146 """
147 Writes parsed records to an output file
148
149 Arguments:
150 records : a iterator of Receptor objects containing alignment data.
151 fields : a list of ordered field names to write.
152 aligner_file : input file name.
153 total_count : number of records (for progress bar).
154 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
155 annotations : additional annotation dictionary.
156 amino_acid : if True do verification on amino acid fields.
157 partial : if True put incomplete alignments in the pass file.
158 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
159 regions (str): name of the IMGT FWR/CDR region definitions to use.
160 writer : writer class.
161 out_file : output file name. Automatically generated from the input file if None.
162 out_args : common output argument dictionary from parseCommonArgs.
163
164 Returns:
165 None
166 """
167 # Wrapper for opening handles and writers
168 def _open(x, f, writer=writer, out_file=out_file):
169 if out_file is not None and x == 'pass':
170 handle = open(out_file, 'w')
171 else:
172 handle = getOutputHandle(aligner_file,
173 out_label='db-%s' % x,
174 out_dir=out_args['out_dir'],
175 out_name=out_args['out_name'],
176 out_type=out_args['out_type'])
177 return handle, writer(handle, fields=f)
178
179 # Function to convert fasta header annotations to changeo columns
180 def _changeo(f, header):
181 h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
182 f.extend(h)
183 return f
184
185 def _airr(f, header):
186 h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
187 f.extend(h)
188 return f
189
190 # Function to verify IMGT-gapped sequence and junction concur
191 def _imgt_check(rec):
192 try:
193 if amino_acid:
194 rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
195 x, y = rd.positions['junction']
196 check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
197 else:
198 rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
199 x, y = rd.positions['junction']
200 check = (rec.junction == rec.sequence_imgt[x:y])
201 except (TypeError, AttributeError):
202 check = False
203 return check
204
205 # Function to check for valid records strictly
206 def _strict(rec):
207 if amino_acid:
208 valid = [rec.v_call and rec.v_call != 'None',
209 rec.j_call and rec.j_call != 'None',
210 rec.functional is not None,
211 rec.sequence_aa_imgt,
212 rec.junction_aa,
213 _imgt_check(rec)]
214 else:
215 valid = [rec.v_call and rec.v_call != 'None',
216 rec.j_call and rec.j_call != 'None',
217 rec.functional is not None,
218 rec.sequence_imgt,
219 rec.junction,
220 _imgt_check(rec)]
221 return all(valid)
222
223 # Function to check for valid records loosely
224 def _gentle(rec):
225 valid = [rec.v_call and rec.v_call != 'None',
226 rec.d_call and rec.d_call != 'None',
227 rec.j_call and rec.j_call != 'None']
228 return any(valid)
229
230 # Set writer class and annotation conversion function
231 if writer == ChangeoWriter:
232 _annotate = _changeo
233 elif writer == AIRRWriter:
234 _annotate = _airr
235 else:
236 printError('Invalid output writer.')
237
238 # Additional annotation (e.g. 10X cell calls)
239 # _append_table = None
240 # if cellranger_file is not None:
241 # with open(cellranger_file) as csv_file:
242 # # Read in annotation file (use Sniffer to discover file delimiters)
243 # dialect = csv.Sniffer().sniff(csv_file.readline())
244 # csv_file.seek(0)
245 # csv_reader = csv.DictReader(csv_file, dialect = dialect)
246 #
247 # # Generate annotation dictionary
248 # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \
249 # for field in cellranger_map.keys()} for entry in csv_reader}
250 #
251 # fields = _annotate(fields, cellranger_map.values())
252 # _append_table = lambda sequence_id: anntab_dict[sequence_id]
253
254 # Set pass criteria
255 _pass = _gentle if partial else _strict
256
257 # Define log handle
258 if out_args['log_file'] is None:
259 log_handle = None
260 else:
261 log_handle = open(out_args['log_file'], 'w')
262
263 # Initialize handles, writers and counters
264 pass_handle, pass_writer = None, None
265 fail_handle, fail_writer = None, None
266 pass_count, fail_count = 0, 0
267 start_time = time()
268
269 # Validate and write output
270 printProgress(0, total_count, 0.05, start_time=start_time)
271 for i, record in enumerate(records, start=1):
272 # Replace sequence description with full string, if required
273 if id_dict is not None and record.sequence_id in id_dict:
274 record.sequence_id = id_dict[record.sequence_id]
275
276 # Parse sequence description into new columns
277 if not asis_id:
278 try:
279 ann_raw = parseAnnotation(record.sequence_id)
280 record.sequence_id = ann_raw.pop('ID')
281
282 # Convert to Receptor fields
283 ann_parsed = OrderedDict()
284 for k, v in ann_raw.items():
285 ann_parsed[ChangeoSchema.toReceptor(k)] = v
286
287 # Add annotations to Receptor and update field list
288 record.setDict(ann_parsed, parse=True)
289 if i == 1: fields = _annotate(fields, ann_parsed.keys())
290 except IndexError:
291 # Could not parse pRESTO-style annotations so fall back to no parse
292 asis_id = True
293 printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')
294
295 # Add supplemental annotation fields
296 # if _append_table is not None:
297 # record.setDict(_append_table(record.sequence_id), parse=True)
298 if annotations is not None:
299 record.setDict(annotations[record.sequence_id], parse=True)
300 if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys())
301
302 # Count pass or fail and write to appropriate file
303 if _pass(record):
304 pass_count += 1
305 # Write row to pass file
306 try:
307 pass_writer.writeReceptor(record)
308 except AttributeError:
309 # Open pass file and writer
310 pass_handle, pass_writer = _open('pass', fields)
311 pass_writer.writeReceptor(record)
312 else:
313 fail_count += 1
314 # Write row to fail file if specified
315 if out_args['failed']:
316 try:
317 fail_writer.writeReceptor(record)
318 except AttributeError:
319 # Open fail file and writer
320 fail_handle, fail_writer = _open('fail', fields)
321 fail_writer.writeReceptor(record)
322
323 # Write log
324 if log_handle is not None:
325 log = OrderedDict([('ID', record.sequence_id),
326 ('V_CALL', record.v_call),
327 ('D_CALL', record.d_call),
328 ('J_CALL', record.j_call),
329 ('PRODUCTIVE', record.functional)])
330 if not _imgt_check(record) and not amino_acid:
331 log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
332 printLog(log, log_handle)
333
334 # Print progress
335 printProgress(i, total_count, 0.05, start_time=start_time)
336
337 # Print console log
338 log = OrderedDict()
339 log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
340 log['PASS'] = pass_count
341 log['FAIL'] = fail_count
342 log['END'] = 'MakeDb'
343 printLog(log)
344
345 # Close file handles
346 output = {'pass': None, 'fail': None}
347 if pass_handle is not None:
348 output['pass'] = pass_handle.name
349 pass_handle.close()
350 if fail_handle is not None:
351 output['fail'] = fail_handle.name
352 fail_handle.close()
353
354 return output
355
356
357 def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
358 extended=False, format=default_format, out_file=None, out_args=default_out_args):
359 """
360 Main for IMGT aligned sample sequences.
361
362 Arguments:
363 aligner_file : zipped file or unzipped folder output by IMGT.
364 seq_file : FASTA file input to IMGT (from which to get seqID).
365 repo : folder with germline repertoire files.
366 partial : If True put incomplete alignments in the pass file.
367 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
368 extended : if True add alignment score, FWR, CDR and junction fields to output file.
369 format : output format. one of 'changeo' or 'airr'.
370 out_file : output file name. Automatically generated from the input file if None.
371 out_args : common output argument dictionary from parseCommonArgs.
372
373 Returns:
374 dict : names of the 'pass' and 'fail' output files.
375 """
376 # Print parameter info
377 log = OrderedDict()
378 log['START'] = 'MakeDb'
379 log['COMMAND'] = 'imgt'
380 log['ALIGNER_FILE'] = aligner_file
381 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
382 log['ASIS_ID'] = asis_id
383 log['PARTIAL'] = partial
384 log['EXTENDED'] = extended
385 printLog(log)
386
387 start_time = time()
388 printMessage('Loading files', start_time=start_time, width=20)
389
390 # Extract IMGT files
391 temp_dir, imgt_files = extractIMGT(aligner_file)
392
393 # Count records in IMGT files
394 total_count = countDbFile(imgt_files['summary'])
395
396 # Get (parsed) IDs from fasta file submitted to IMGT
397 id_dict = getIDforIMGT(seq_file) if seq_file else {}
398
399 # Load supplementary annotation table
400 if cellranger_file is not None:
401 f = cellranger_extended if extended else cellranger_base
402 annotations = readCellRanger(cellranger_file, fields=f)
403 else:
404 annotations = None
405
406 printMessage('Done', start_time=start_time, end=True, width=20)
407
408 # Define format operators
409 try:
410 __, writer, schema = getFormatOperators(format)
411 except ValueError:
412 printError('Invalid format %s.' % format)
413 out_args['out_type'] = schema.out_type
414
415 # Define output fields
416 fields = list(schema.required)
417 if extended:
418 custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
419 fields.extend(custom)
420
421 # Parse IMGT output and write db
422 with open(imgt_files['summary'], 'r') as summary_handle, \
423 open(imgt_files['gapped'], 'r') as gapped_handle, \
424 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
425 open(imgt_files['junction'], 'r') as junction_handle:
426
427 # Open parser
428 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)
429
430 # Add germline sequence
431 if repo is None:
432 germ_iter = parse_iter
433 else:
434 references = readGermlines(repo)
435 # Check for IMGT-gaps in germlines
436 if all('...' not in x for x in references.values()):
437 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
438 germ_iter = (addGermline(x, references) for x in parse_iter)
439
440 # Write db
441 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
442 annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
443 writer=writer, out_file=out_file, out_args=out_args)
444
445 # Cleanup temp directory
446 temp_dir.cleanup()
447
448 return output
449
450
451 def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
452 asis_id=True, asis_calls=False, extended=False, regions='default',
453 format='changeo', out_file=None, out_args=default_out_args):
454 """
455 Main for IgBLAST aligned sample sequences.
456
457 Arguments:
458 aligner_file (str): IgBLAST output file to process.
459 seq_file (str): fasta file input to IgBlast (from which to get sequence).
460 repo (str): folder with germline repertoire files.
461 amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False.
462 partial : If True put incomplete alignments in the pass file.
463 asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
464 asis_calls (bool): if True do not parse gene calls for allele names.
465 extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
466 regions (str): name of the IMGT FWR/CDR region definitions to use.
467 format (str): output format. one of 'changeo' or 'airr'.
468 out_file (str): output file name. Automatically generated from the input file if None.
469 out_args (dict): common output argument dictionary from parseCommonArgs.
470
471 Returns:
472 dict : names of the 'pass' and 'fail' output files.
473 """
474 # Print parameter info
475 log = OrderedDict()
476 log['START'] = 'MakeDB'
477 log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
478 log['ALIGNER_FILE'] = os.path.basename(aligner_file)
479 log['SEQ_FILE'] = os.path.basename(seq_file)
480 log['ASIS_ID'] = asis_id
481 log['ASIS_CALLS'] = asis_calls
482 log['PARTIAL'] = partial
483 log['EXTENDED'] = extended
484 printLog(log)
485
486 # Set amino acid conditions
487 if amino_acid:
488 format = '%s-aa' % format
489 parser = IgBLASTReaderAA
490 else:
491 parser = IgBLASTReader
492
493 # Start
494 start_time = time()
495 printMessage('Loading files', start_time=start_time, width=20)
496
497 # Count records in sequence file
498 total_count = countSeqFile(seq_file)
499
500 # Get input sequence dictionary
501 seq_dict = getSeqDict(seq_file)
502
503 # Create germline repo dictionary
504 references = readGermlines(repo, asis=asis_calls)
505
506 # Load supplementary annotation table
507 if cellranger_file is not None:
508 f = cellranger_extended if extended else cellranger_base
509 annotations = readCellRanger(cellranger_file, fields=f)
510 else:
511 annotations = None
512
513 printMessage('Done', start_time=start_time, end=True, width=20)
514
515 # Check for IMGT-gaps in germlines
516 if all('...' not in x for x in references.values()):
517 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
518
519 # Define format operators
520 try:
521 __, writer, schema = getFormatOperators(format)
522 except ValueError:
523 printError('Invalid format %s.' % format)
524 out_args['out_type'] = schema.out_type
525
526 # Define output fields
527 fields = list(schema.required)
528 if extended:
529 custom = parser.customFields(schema=schema)
530 fields.extend(custom)
531
532 # Parse and write output
533 with open(aligner_file, 'r') as f:
534 parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls)
535 germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
536 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
537 annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
538 regions=regions, writer=writer, out_file=out_file, out_args=out_args)
539
540 return output
541
542
543 def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
544 extended=False, format=default_format, out_file=None, out_args=default_out_args):
545 """
546 Main for iHMMuneAlign aligned sample sequences.
547
548 Arguments:
549 aligner_file : iHMMune-Align output file to process.
550 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
551 repo : folder with germline repertoire files.
552 partial : If True put incomplete alignments in the pass file.
553 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
554 extended : if True parse alignment scores, FWR and CDR region fields.
555 format : output format. One of 'changeo' or 'airr'.
556 out_file : output file name. Automatically generated from the input file if None.
557 out_args : common output argument dictionary from parseCommonArgs.
558
559 Returns:
560 dict : names of the 'pass' and 'fail' output files.
561 """
562 # Print parameter info
563 log = OrderedDict()
564 log['START'] = 'MakeDB'
565 log['COMMAND'] = 'ihmm'
566 log['ALIGNER_FILE'] = os.path.basename(aligner_file)
567 log['SEQ_FILE'] = os.path.basename(seq_file)
568 log['ASIS_ID'] = asis_id
569 log['PARTIAL'] = partial
570 log['EXTENDED'] = extended
571 printLog(log)
572
573 start_time = time()
574 printMessage('Loading files', start_time=start_time, width=20)
575
576 # Count records in sequence file
577 total_count = countSeqFile(seq_file)
578
579 # Get input sequence dictionary
580 seq_dict = getSeqDict(seq_file)
581
582 # Create germline repo dictionary
583 references = readGermlines(repo)
584
585 # Load supplementary annotation table
586 if cellranger_file is not None:
587 f = cellranger_extended if extended else cellranger_base
588 annotations = readCellRanger(cellranger_file, fields=f)
589 else:
590 annotations = None
591
592 printMessage('Done', start_time=start_time, end=True, width=20)
593
594 # Check for IMGT-gaps in germlines
595 if all('...' not in x for x in references.values()):
596 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
597
598 # Define format operators
599 try:
600 __, writer, schema = getFormatOperators(format)
601 except ValueError:
602 printError('Invalid format %s.' % format)
603 out_args['out_type'] = schema.out_type
604
605 # Define output fields
606 fields = list(schema.required)
607 if extended:
608 custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema)
609 fields.extend(custom)
610
611 # Parse and write output
612 with open(aligner_file, 'r') as f:
613 parse_iter = IHMMuneReader(f, seq_dict, references)
614 germ_iter = (addGermline(x, references) for x in parse_iter)
615 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
616 annotations=annotations, asis_id=asis_id, partial=partial,
617 writer=writer, out_file=out_file, out_args=out_args)
618
619 return output
620
621
622 def getArgParser():
623 """
624 Defines the ArgumentParser.
625
626 Returns:
627 argparse.ArgumentParser
628 """
629 fields = dedent(
630 '''
631 output files:
632 db-pass
633 database of alignment records with functionality information,
634 V and J calls, and a junction region.
635 db-fail
636 database with records that fail due to no productivity information,
637 no gene V assignment, no J assignment, or no junction region.
638
639 universal output fields:
640 sequence_id, sequence, sequence_alignment, germline_alignment,
641 rev_comp, productive, stop_codon, vj_in_frame, locus,
642 v_call, d_call, j_call, junction, junction_length, junction_aa,
643 v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
644 d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
645 j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
646 np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
647
648 imgt specific output fields:
649 n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
650 d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
651
652 igblast specific output fields:
653 v_score, v_identity, v_support, v_cigar,
654 d_score, d_identity, d_support, d_cigar,
655 j_score, j_identity, j_support, j_cigar
656
657 ihmm specific output fields:
658 vdj_score
659
660 10X specific output fields:
661 cell_id, c_call, consensus_count, umi_count,
662 v_call_10x, d_call_10x, j_call_10x,
663 junction_10x, junction_10x_aa
664 ''')
665
666 # Define ArgumentParser
667 parser = ArgumentParser(description=__doc__, epilog=fields,
668 formatter_class=CommonHelpFormatter, add_help=False)
669 group_help = parser.add_argument_group('help')
670 group_help.add_argument('--version', action='version',
671 version='%(prog)s:' + ' %s %s' %(__version__, __date__))
672 group_help.add_argument('-h', '--help', action='help', help='show this help message and exit')
673 subparsers = parser.add_subparsers(title='subcommands', dest='command',
674 help='Aligner used', metavar='')
675 # TODO: This is a temporary fix for Python issue 9253
676 subparsers.required = True
677
678 # Parent parser
679 parser_parent = getCommonArgParser(db_in=False)
680
681 # igblastn output parser
682 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
683 formatter_class=CommonHelpFormatter, add_help=False,
684 help='Process igblastn output.',
685 description='Process igblastn output.')
686 group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
687 group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
688 required=True,
689 help='''IgBLAST output files in format 7 with query sequence
690 (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
691 group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
692 help='''List of folders and/or fasta files containing
693 the same germline set used in the IgBLAST alignment. These
694 reference sequences must contain IMGT-numbering spacers (gaps)
695 in the V segment.''')
696 group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
697 required=True,
698 help='''List of input FASTA files (with .fasta, .fna or .fa
699 extension), containing sequences.''')
700 group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
701 help='''Table file containing 10X annotations (with .csv or .tsv
702 extension).''')
703 group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id',
704 help='''Specify to prevent input sequence headers from being parsed
705 to add new columns to database. Parsing of sequence headers requires
706 headers to be in the pRESTO annotation format, so this should be specified
707 when sequence headers are incompatible with the pRESTO annotation scheme.
708 Note, unrecognized header formats will default to this behavior.''')
709 group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls',
710 help='''Specify to prevent gene calls from being parsed into standard allele names
711 in both the IgBLAST output and reference database. Note, this requires
712 the sequence identifiers in the reference sequence set and the IgBLAST
713 database to be exact string matches.''')
714 group_igblast.add_argument('--partial', action='store_true', dest='partial',
715 help='''If specified, include incomplete V(D)J alignments in
716 the pass file instead of the fail file. An incomplete alignment
717 is defined as a record for which a valid IMGT-gapped sequence
718 cannot be built or that is missing a V gene assignment,
719 J gene assignment, junction region, or productivity call.''')
720 group_igblast.add_argument('--extended', action='store_true', dest='extended',
721 help='''Specify to include additional aligner specific fields in the output.
722 Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
723 fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
724 group_igblast.add_argument('--regions', action='store', dest='regions',
725 choices=('default', 'rhesus-igl'), default='default',
726 help='''IMGT CDR and FWR boundary definition to use.''')
727 parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
728
729 # igblastp output parser
730 parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent],
731 formatter_class=CommonHelpFormatter, add_help=False,
732 help='Process igblastp output.',
733 description='Process igblastp output.')
734 group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
735 group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
736 required=True,
737 help='''IgBLAST output files in format 7 with query sequence
738 (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
739 group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
740 help='''List of folders and/or fasta files containing
741 the same germline set used in the IgBLAST alignment. These
742 reference sequences must contain IMGT-numbering spacers (gaps)
743 in the V segment.''')
744 group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True,
745 help='''List of input FASTA files (with .fasta, .fna or .fa
746 extension), containing sequences.''')
747 group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
748 help='''Table file containing 10X annotations (with .csv or .tsv extension).''')
749 group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id',
750 help='''Specify to prevent input sequence headers from being parsed
751 to add new columns to database. Parsing of sequence headers requires
752 headers to be in the pRESTO annotation format, so this should be specified
753 when sequence headers are incompatible with the pRESTO annotation scheme.
754 Note, unrecognized header formats will default to this behavior.''')
755 group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls',
756 help='''Specify to prevent gene calls from being parsed into standard allele names
757 in both the IgBLAST output and reference database. Note, this requires
758 the sequence identifiers in the reference sequence set and the IgBLAST
759 database to be exact string matches.''')
760 group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
761 help='''Specify to include additional aligner specific fields in the output.
762 Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
763 group_igblast_aa.add_argument('--regions', action='store', dest='regions',
764 choices=('default', 'rhesus-igl'), default='default',
765 help='''IMGT CDR and FWR boundary definition to use.''')
766 parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True)
767
768
769 # IMGT aligner
770 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
771 formatter_class=CommonHelpFormatter, add_help=False,
772 help='''Process IMGT/HighV-Quest output
773 (does not work with V-QUEST).''',
774 description='''Process IMGT/HighV-Quest output
775 (does not work with V-QUEST).''')
776 group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
777 group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
778 help='''Either zipped IMGT output files (.zip or .txz) or a
779 folder containing unzipped IMGT output files (which must
780 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
781 and 6_Junction).''')
782 group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
783 help='''List of FASTA files (with .fasta, .fna or .fa
784 extension) that were submitted to IMGT/HighV-QUEST.
785 If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
786 will not be corrected.''')
787 group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
788 help='''List of folders and/or fasta files containing
789 the germline sequence set used by IMGT/HighV-QUEST.
790 These reference sequences must contain IMGT-numbering spacers (gaps)
791 in the V segment. If unspecified, the germline sequence reconstruction
792 will not be included in the output.''')
793 group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
794 help='''Table file containing 10X annotations (with .csv or .tsv
795 extension).''')
796 group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id',
797 help='''Specify to prevent input sequence headers from being parsed
798 to add new columns to database. Parsing of sequence headers requires
799 headers to be in the pRESTO annotation format, so this should be specified
800 when sequence headers are incompatible with the pRESTO annotation scheme.
801 Note, unrecognized header formats will default to this behavior.''')
802 group_imgt.add_argument('--partial', action='store_true', dest='partial',
803 help='''If specified, include incomplete V(D)J alignments in
804 the pass file instead of the fail file. An incomplete alignment
805 is defined as a record that is missing a V gene assignment,
806 J gene assignment, junction region, or productivity call.''')
807 group_imgt.add_argument('--extended', action='store_true', dest='extended',
808 help='''Specify to include additional aligner specific fields in the output.
809 Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
810 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
811 p3d_length, p5j_length and d_frame.''')
812 parser_imgt.set_defaults(func=parseIMGT)
813
814 # iHMMuneAlign Aligner
815 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
816 formatter_class=CommonHelpFormatter, add_help=False,
817 help='Process iHMMune-Align output.',
818 description='Process iHMMune-Align output.')
819 group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments')
820 group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files',
821 required=True,
822 help='''iHMMune-Align output file.''')
823 group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
824 help='''List of folders and/or FASTA files containing
825 the set of germline sequences used by iHMMune-Align. These
826 reference sequences must contain IMGT-numbering spacers (gaps)
827 in the V segment.''')
828 group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
829 required=True,
830 help='''List of input FASTA files (with .fasta, .fna or .fa
831 extension) containing sequences.''')
832 group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
833 help='''Table file containing 10X annotations (with .csv or .tsv
834 extension).''')
835 group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id',
836 help='''Specify to prevent input sequence headers from being parsed
837 to add new columns to database. Parsing of sequence headers requires
838 headers to be in the pRESTO annotation format, so this should be specified
839 when sequence headers are incompatible with the pRESTO annotation scheme.
840 Note, unrecognized header formats will default to this behavior.''')
841 group_ihmm.add_argument('--partial', action='store_true', dest='partial',
842 help='''If specified, include incomplete V(D)J alignments in
843 the pass file instead of the fail file. An incomplete alignment
844 is defined as a record for which a valid IMGT-gapped sequence
845 cannot be built or that is missing a V gene assignment,
846 J gene assignment, junction region, or productivity call.''')
847 group_ihmm.add_argument('--extended', action='store_true', dest='extended',
848 help='''Specify to include additional aligner specific fields in the output.
849 Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
850 adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
851 parser_ihmm.set_defaults(func=parseIHMM)
852
853 return parser
854
855
856 if __name__ == "__main__":
857 """
858 Parses command line arguments and calls main
859 """
860 parser = getArgParser()
861 checkArgs(parser)
862 args = parser.parse_args()
863 args_dict = parseCommonArgs(args, in_arg='aligner_files')
864
865 # Set no ID parsing if sequence files are not provided
866 if 'seq_files' in args_dict and not args_dict['seq_files']:
867 args_dict['asis_id'] = True
868
869 # Delete
870 if 'aligner_files' in args_dict: del args_dict['aligner_files']
871 if 'seq_files' in args_dict: del args_dict['seq_files']
872 if 'out_files' in args_dict: del args_dict['out_files']
873 if 'command' in args_dict: del args_dict['command']
874 if 'func' in args_dict: del args_dict['func']
875
876 # Call main
877 for i, f in enumerate(args.__dict__['aligner_files']):
878 args_dict['aligner_file'] = f
879 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
880 if args.__dict__['seq_files'] else None
881 args_dict['out_file'] = args.__dict__['out_files'][i] \
882 if args.__dict__['out_files'] else None
883 args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \
884 if args.__dict__['cellranger_file'] else None
885 args.func(**args_dict)