78
|
1 #!/usr/bin/env python3
|
|
2 """
|
|
3 Create tab-delimited database file to store sequence alignment information
|
|
4 """
|
|
5
|
|
6 # Info
|
|
7 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
|
|
8 from changeo import __version__, __date__
|
|
9
|
|
10 # Imports
|
|
11 import os
|
|
12 import re
|
|
13 import csv
|
|
14 from argparse import ArgumentParser
|
|
15 from collections import OrderedDict
|
|
16 from textwrap import dedent
|
|
17 from time import time
|
|
18 from Bio import SeqIO
|
|
19
|
|
20 # Presto and changeo imports
|
|
21 from presto.Annotation import parseAnnotation
|
|
22 from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile
|
|
23 from changeo.Defaults import default_format, default_out_args
|
|
24 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
|
|
25 from changeo.Alignment import RegionDefinition
|
|
26 from changeo.Gene import buildGermline
|
|
27 from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \
|
|
28 AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader
|
|
29 from changeo.Receptor import ChangeoSchema, AIRRSchema
|
|
30
|
|
31 # 10X Receptor attributes
|
|
32 cellranger_base = ['cell', 'c_call', 'conscount', 'umicount']
|
|
33 cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount',
|
|
34 'v_call_10x', 'd_call_10x', 'j_call_10x',
|
|
35 'junction_10x', 'junction_10x_aa']
|
|
36
|
|
37 def readCellRanger(cellranger_file, fields=cellranger_base):
|
|
38 """
|
|
39 Load a Cell Ranger annotation table
|
|
40
|
|
41 Arguments:
|
|
42 cellranger_file (str): path to the annotation file.
|
|
43 fields (list): list of fields to keep.
|
|
44
|
|
45 Returns:
|
|
46 dict: dict of dicts with contig_id as the primary key.
|
|
47 """
|
|
48 # Mapping of 10X annotations to Receptor attributes
|
|
49 cellranger_map = {'cell': 'barcode',
|
|
50 'c_call': 'c_gene',
|
|
51 'locus': 'chain',
|
|
52 'conscount': 'reads',
|
|
53 'umicount': 'umis',
|
|
54 'v_call_10x': 'v_gene',
|
|
55 'd_call_10x': 'd_gene',
|
|
56 'j_call_10x': 'j_gene',
|
|
57 'junction_10x': 'cdr3_nt',
|
|
58 'junction_10x_aa': 'cdr3'}
|
|
59
|
|
60 # Function to parse individual fields
|
|
61 def _parse(x):
|
|
62 return '' if x == 'None' else x
|
|
63
|
|
64 # Generate annotation dictionary
|
|
65 ann_dict = {}
|
|
66 with open(cellranger_file) as csv_file:
|
|
67 # Detect delimiters
|
|
68 dialect = csv.Sniffer().sniff(csv_file.readline())
|
|
69 csv_file.seek(0)
|
|
70 # Read in annotation file
|
|
71 csv_reader = csv.DictReader(csv_file, dialect=dialect)
|
|
72
|
|
73 # Generate annotation dictionary
|
|
74 for row in csv_reader:
|
|
75 ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields}
|
|
76
|
|
77 return ann_dict
|
|
78
|
|
79
|
|
80 def addGermline(receptor, references, amino_acid=False):
|
|
81 """
|
|
82 Add full length germline to Receptor object
|
|
83
|
|
84 Arguments:
|
|
85 receptor (changeo.Receptor.Receptor): Receptor object to modify.
|
|
86 references (dict): dictionary of IMGT-gapped references sequences.
|
|
87 amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline
|
|
88
|
|
89 Returns:
|
|
90 changeo.Receptor.Receptor: modified Receptor with the germline sequence added.
|
|
91 """
|
|
92 if amino_acid:
|
|
93 __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt',
|
|
94 amino_acid=True)
|
|
95 germline_seq = None if germlines is None else germlines['full']
|
|
96 receptor.setField('germline_aa_imgt', germline_seq)
|
|
97 else:
|
|
98 __, germlines, __ = buildGermline(receptor, references, amino_acid=False)
|
|
99 germline_seq = None if germlines is None else germlines['full']
|
|
100 receptor.setField('germline_imgt', germline_seq)
|
|
101
|
|
102 return receptor
|
|
103
|
|
104
|
|
105 def getIDforIMGT(seq_file):
|
|
106 """
|
|
107 Create a sequence ID translation using IMGT truncation.
|
|
108
|
|
109 Arguments:
|
|
110 seq_file : a fasta file of sequences input to IMGT.
|
|
111
|
|
112 Returns:
|
|
113 dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value.
|
|
114 """
|
|
115
|
|
116 # Create a sequence ID translation using IDs truncate up to space or 50 chars
|
|
117 ids = {}
|
|
118 for rec in readSeqFile(seq_file):
|
|
119 if len(rec.description) <= 50:
|
|
120 id_key = rec.description
|
|
121 else:
|
|
122 id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
|
|
123 ids.update({id_key: rec.description})
|
|
124
|
|
125 return ids
|
|
126
|
|
127
|
|
128 def getSeqDict(seq_file):
|
|
129 """
|
|
130 Create a dictionary from a sequence file.
|
|
131
|
|
132 Arguments:
|
|
133 seq_file : sequence file.
|
|
134
|
|
135 Returns:
|
|
136 dict : sequence description as keys with Bio.SeqRecords as values.
|
|
137 """
|
|
138 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description)
|
|
139
|
|
140 return seq_dict
|
|
141
|
|
142
|
|
143 def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
|
|
144 amino_acid=False, partial=False, asis_id=True, regions='default',
|
|
145 writer=AIRRWriter, out_file=None, out_args=default_out_args):
|
|
146 """
|
|
147 Writes parsed records to an output file
|
|
148
|
|
149 Arguments:
|
|
150 records : a iterator of Receptor objects containing alignment data.
|
|
151 fields : a list of ordered field names to write.
|
|
152 aligner_file : input file name.
|
|
153 total_count : number of records (for progress bar).
|
|
154 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
|
|
155 annotations : additional annotation dictionary.
|
|
156 amino_acid : if True do verification on amino acid fields.
|
|
157 partial : if True put incomplete alignments in the pass file.
|
|
158 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
|
|
159 regions (str): name of the IMGT FWR/CDR region definitions to use.
|
|
160 writer : writer class.
|
|
161 out_file : output file name. Automatically generated from the input file if None.
|
|
162 out_args : common output argument dictionary from parseCommonArgs.
|
|
163
|
|
164 Returns:
|
|
165 None
|
|
166 """
|
|
167 # Wrapper for opening handles and writers
|
|
168 def _open(x, f, writer=writer, out_file=out_file):
|
|
169 if out_file is not None and x == 'pass':
|
|
170 handle = open(out_file, 'w')
|
|
171 else:
|
|
172 handle = getOutputHandle(aligner_file,
|
|
173 out_label='db-%s' % x,
|
|
174 out_dir=out_args['out_dir'],
|
|
175 out_name=out_args['out_name'],
|
|
176 out_type=out_args['out_type'])
|
|
177 return handle, writer(handle, fields=f)
|
|
178
|
|
179 # Function to convert fasta header annotations to changeo columns
|
|
180 def _changeo(f, header):
|
|
181 h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
|
|
182 f.extend(h)
|
|
183 return f
|
|
184
|
|
185 def _airr(f, header):
|
|
186 h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
|
|
187 f.extend(h)
|
|
188 return f
|
|
189
|
|
190 # Function to verify IMGT-gapped sequence and junction concur
|
|
191 def _imgt_check(rec):
|
|
192 try:
|
|
193 if amino_acid:
|
|
194 rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
|
|
195 x, y = rd.positions['junction']
|
|
196 check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
|
|
197 else:
|
|
198 rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
|
|
199 x, y = rd.positions['junction']
|
|
200 check = (rec.junction == rec.sequence_imgt[x:y])
|
|
201 except (TypeError, AttributeError):
|
|
202 check = False
|
|
203 return check
|
|
204
|
|
205 # Function to check for valid records strictly
|
|
206 def _strict(rec):
|
|
207 if amino_acid:
|
|
208 valid = [rec.v_call and rec.v_call != 'None',
|
|
209 rec.j_call and rec.j_call != 'None',
|
|
210 rec.functional is not None,
|
|
211 rec.sequence_aa_imgt,
|
|
212 rec.junction_aa,
|
|
213 _imgt_check(rec)]
|
|
214 else:
|
|
215 valid = [rec.v_call and rec.v_call != 'None',
|
|
216 rec.j_call and rec.j_call != 'None',
|
|
217 rec.functional is not None,
|
|
218 rec.sequence_imgt,
|
|
219 rec.junction,
|
|
220 _imgt_check(rec)]
|
|
221 return all(valid)
|
|
222
|
|
223 # Function to check for valid records loosely
|
|
224 def _gentle(rec):
|
|
225 valid = [rec.v_call and rec.v_call != 'None',
|
|
226 rec.d_call and rec.d_call != 'None',
|
|
227 rec.j_call and rec.j_call != 'None']
|
|
228 return any(valid)
|
|
229
|
|
230 # Set writer class and annotation conversion function
|
|
231 if writer == ChangeoWriter:
|
|
232 _annotate = _changeo
|
|
233 elif writer == AIRRWriter:
|
|
234 _annotate = _airr
|
|
235 else:
|
|
236 printError('Invalid output writer.')
|
|
237
|
|
238 # Additional annotation (e.g. 10X cell calls)
|
|
239 # _append_table = None
|
|
240 # if cellranger_file is not None:
|
|
241 # with open(cellranger_file) as csv_file:
|
|
242 # # Read in annotation file (use Sniffer to discover file delimiters)
|
|
243 # dialect = csv.Sniffer().sniff(csv_file.readline())
|
|
244 # csv_file.seek(0)
|
|
245 # csv_reader = csv.DictReader(csv_file, dialect = dialect)
|
|
246 #
|
|
247 # # Generate annotation dictionary
|
|
248 # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \
|
|
249 # for field in cellranger_map.keys()} for entry in csv_reader}
|
|
250 #
|
|
251 # fields = _annotate(fields, cellranger_map.values())
|
|
252 # _append_table = lambda sequence_id: anntab_dict[sequence_id]
|
|
253
|
|
254 # Set pass criteria
|
|
255 _pass = _gentle if partial else _strict
|
|
256
|
|
257 # Define log handle
|
|
258 if out_args['log_file'] is None:
|
|
259 log_handle = None
|
|
260 else:
|
|
261 log_handle = open(out_args['log_file'], 'w')
|
|
262
|
|
263 # Initialize handles, writers and counters
|
|
264 pass_handle, pass_writer = None, None
|
|
265 fail_handle, fail_writer = None, None
|
|
266 pass_count, fail_count = 0, 0
|
|
267 start_time = time()
|
|
268
|
|
269 # Validate and write output
|
|
270 printProgress(0, total_count, 0.05, start_time=start_time)
|
|
271 for i, record in enumerate(records, start=1):
|
|
272 # Replace sequence description with full string, if required
|
|
273 if id_dict is not None and record.sequence_id in id_dict:
|
|
274 record.sequence_id = id_dict[record.sequence_id]
|
|
275
|
|
276 # Parse sequence description into new columns
|
|
277 if not asis_id:
|
|
278 try:
|
|
279 ann_raw = parseAnnotation(record.sequence_id)
|
|
280 record.sequence_id = ann_raw.pop('ID')
|
|
281
|
|
282 # Convert to Receptor fields
|
|
283 ann_parsed = OrderedDict()
|
|
284 for k, v in ann_raw.items():
|
|
285 ann_parsed[ChangeoSchema.toReceptor(k)] = v
|
|
286
|
|
287 # Add annotations to Receptor and update field list
|
|
288 record.setDict(ann_parsed, parse=True)
|
|
289 if i == 1: fields = _annotate(fields, ann_parsed.keys())
|
|
290 except IndexError:
|
|
291 # Could not parse pRESTO-style annotations so fall back to no parse
|
|
292 asis_id = True
|
|
293 printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')
|
|
294
|
|
295 # Add supplemental annotation fields
|
|
296 # if _append_table is not None:
|
|
297 # record.setDict(_append_table(record.sequence_id), parse=True)
|
|
298 if annotations is not None:
|
|
299 record.setDict(annotations[record.sequence_id], parse=True)
|
|
300 if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys())
|
|
301
|
|
302 # Count pass or fail and write to appropriate file
|
|
303 if _pass(record):
|
|
304 pass_count += 1
|
|
305 # Write row to pass file
|
|
306 try:
|
|
307 pass_writer.writeReceptor(record)
|
|
308 except AttributeError:
|
|
309 # Open pass file and writer
|
|
310 pass_handle, pass_writer = _open('pass', fields)
|
|
311 pass_writer.writeReceptor(record)
|
|
312 else:
|
|
313 fail_count += 1
|
|
314 # Write row to fail file if specified
|
|
315 if out_args['failed']:
|
|
316 try:
|
|
317 fail_writer.writeReceptor(record)
|
|
318 except AttributeError:
|
|
319 # Open fail file and writer
|
|
320 fail_handle, fail_writer = _open('fail', fields)
|
|
321 fail_writer.writeReceptor(record)
|
|
322
|
|
323 # Write log
|
|
324 if log_handle is not None:
|
|
325 log = OrderedDict([('ID', record.sequence_id),
|
|
326 ('V_CALL', record.v_call),
|
|
327 ('D_CALL', record.d_call),
|
|
328 ('J_CALL', record.j_call),
|
|
329 ('PRODUCTIVE', record.functional)])
|
|
330 if not _imgt_check(record) and not amino_acid:
|
|
331 log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
|
|
332 printLog(log, log_handle)
|
|
333
|
|
334 # Print progress
|
|
335 printProgress(i, total_count, 0.05, start_time=start_time)
|
|
336
|
|
337 # Print console log
|
|
338 log = OrderedDict()
|
|
339 log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
|
|
340 log['PASS'] = pass_count
|
|
341 log['FAIL'] = fail_count
|
|
342 log['END'] = 'MakeDb'
|
|
343 printLog(log)
|
|
344
|
|
345 # Close file handles
|
|
346 output = {'pass': None, 'fail': None}
|
|
347 if pass_handle is not None:
|
|
348 output['pass'] = pass_handle.name
|
|
349 pass_handle.close()
|
|
350 if fail_handle is not None:
|
|
351 output['fail'] = fail_handle.name
|
|
352 fail_handle.close()
|
|
353
|
|
354 return output
|
|
355
|
|
356
|
|
357 def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
|
|
358 extended=False, format=default_format, out_file=None, out_args=default_out_args):
|
|
359 """
|
|
360 Main for IMGT aligned sample sequences.
|
|
361
|
|
362 Arguments:
|
|
363 aligner_file : zipped file or unzipped folder output by IMGT.
|
|
364 seq_file : FASTA file input to IMGT (from which to get seqID).
|
|
365 repo : folder with germline repertoire files.
|
|
366 partial : If True put incomplete alignments in the pass file.
|
|
367 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
|
|
368 extended : if True add alignment score, FWR, CDR and junction fields to output file.
|
|
369 format : output format. one of 'changeo' or 'airr'.
|
|
370 out_file : output file name. Automatically generated from the input file if None.
|
|
371 out_args : common output argument dictionary from parseCommonArgs.
|
|
372
|
|
373 Returns:
|
|
374 dict : names of the 'pass' and 'fail' output files.
|
|
375 """
|
|
376 # Print parameter info
|
|
377 log = OrderedDict()
|
|
378 log['START'] = 'MakeDb'
|
|
379 log['COMMAND'] = 'imgt'
|
|
380 log['ALIGNER_FILE'] = aligner_file
|
|
381 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
|
|
382 log['ASIS_ID'] = asis_id
|
|
383 log['PARTIAL'] = partial
|
|
384 log['EXTENDED'] = extended
|
|
385 printLog(log)
|
|
386
|
|
387 start_time = time()
|
|
388 printMessage('Loading files', start_time=start_time, width=20)
|
|
389
|
|
390 # Extract IMGT files
|
|
391 temp_dir, imgt_files = extractIMGT(aligner_file)
|
|
392
|
|
393 # Count records in IMGT files
|
|
394 total_count = countDbFile(imgt_files['summary'])
|
|
395
|
|
396 # Get (parsed) IDs from fasta file submitted to IMGT
|
|
397 id_dict = getIDforIMGT(seq_file) if seq_file else {}
|
|
398
|
|
399 # Load supplementary annotation table
|
|
400 if cellranger_file is not None:
|
|
401 f = cellranger_extended if extended else cellranger_base
|
|
402 annotations = readCellRanger(cellranger_file, fields=f)
|
|
403 else:
|
|
404 annotations = None
|
|
405
|
|
406 printMessage('Done', start_time=start_time, end=True, width=20)
|
|
407
|
|
408 # Define format operators
|
|
409 try:
|
|
410 __, writer, schema = getFormatOperators(format)
|
|
411 except ValueError:
|
|
412 printError('Invalid format %s.' % format)
|
|
413 out_args['out_type'] = schema.out_type
|
|
414
|
|
415 # Define output fields
|
|
416 fields = list(schema.required)
|
|
417 if extended:
|
|
418 custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
|
|
419 fields.extend(custom)
|
|
420
|
|
421 # Parse IMGT output and write db
|
|
422 with open(imgt_files['summary'], 'r') as summary_handle, \
|
|
423 open(imgt_files['gapped'], 'r') as gapped_handle, \
|
|
424 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
|
|
425 open(imgt_files['junction'], 'r') as junction_handle:
|
|
426
|
|
427 # Open parser
|
|
428 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)
|
|
429
|
|
430 # Add germline sequence
|
|
431 if repo is None:
|
|
432 germ_iter = parse_iter
|
|
433 else:
|
|
434 references = readGermlines(repo)
|
|
435 # Check for IMGT-gaps in germlines
|
|
436 if all('...' not in x for x in references.values()):
|
|
437 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
|
|
438 germ_iter = (addGermline(x, references) for x in parse_iter)
|
|
439
|
|
440 # Write db
|
|
441 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
|
442 annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
|
|
443 writer=writer, out_file=out_file, out_args=out_args)
|
|
444
|
|
445 # Cleanup temp directory
|
|
446 temp_dir.cleanup()
|
|
447
|
|
448 return output
|
|
449
|
|
450
|
|
451 def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
|
|
452 asis_id=True, asis_calls=False, extended=False, regions='default',
|
|
453 format='changeo', out_file=None, out_args=default_out_args):
|
|
454 """
|
|
455 Main for IgBLAST aligned sample sequences.
|
|
456
|
|
457 Arguments:
|
|
458 aligner_file (str): IgBLAST output file to process.
|
|
459 seq_file (str): fasta file input to IgBlast (from which to get sequence).
|
|
460 repo (str): folder with germline repertoire files.
|
|
461 amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False.
|
|
462 partial : If True put incomplete alignments in the pass file.
|
|
463 asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
|
|
464 asis_calls (bool): if True do not parse gene calls for allele names.
|
|
465 extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
|
|
466 regions (str): name of the IMGT FWR/CDR region definitions to use.
|
|
467 format (str): output format. one of 'changeo' or 'airr'.
|
|
468 out_file (str): output file name. Automatically generated from the input file if None.
|
|
469 out_args (dict): common output argument dictionary from parseCommonArgs.
|
|
470
|
|
471 Returns:
|
|
472 dict : names of the 'pass' and 'fail' output files.
|
|
473 """
|
|
474 # Print parameter info
|
|
475 log = OrderedDict()
|
|
476 log['START'] = 'MakeDB'
|
|
477 log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
|
|
478 log['ALIGNER_FILE'] = os.path.basename(aligner_file)
|
|
479 log['SEQ_FILE'] = os.path.basename(seq_file)
|
|
480 log['ASIS_ID'] = asis_id
|
|
481 log['ASIS_CALLS'] = asis_calls
|
|
482 log['PARTIAL'] = partial
|
|
483 log['EXTENDED'] = extended
|
|
484 printLog(log)
|
|
485
|
|
486 # Set amino acid conditions
|
|
487 if amino_acid:
|
|
488 format = '%s-aa' % format
|
|
489 parser = IgBLASTReaderAA
|
|
490 else:
|
|
491 parser = IgBLASTReader
|
|
492
|
|
493 # Start
|
|
494 start_time = time()
|
|
495 printMessage('Loading files', start_time=start_time, width=20)
|
|
496
|
|
497 # Count records in sequence file
|
|
498 total_count = countSeqFile(seq_file)
|
|
499
|
|
500 # Get input sequence dictionary
|
|
501 seq_dict = getSeqDict(seq_file)
|
|
502
|
|
503 # Create germline repo dictionary
|
|
504 references = readGermlines(repo, asis=asis_calls)
|
|
505
|
|
506 # Load supplementary annotation table
|
|
507 if cellranger_file is not None:
|
|
508 f = cellranger_extended if extended else cellranger_base
|
|
509 annotations = readCellRanger(cellranger_file, fields=f)
|
|
510 else:
|
|
511 annotations = None
|
|
512
|
|
513 printMessage('Done', start_time=start_time, end=True, width=20)
|
|
514
|
|
515 # Check for IMGT-gaps in germlines
|
|
516 if all('...' not in x for x in references.values()):
|
|
517 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
|
|
518
|
|
519 # Define format operators
|
|
520 try:
|
|
521 __, writer, schema = getFormatOperators(format)
|
|
522 except ValueError:
|
|
523 printError('Invalid format %s.' % format)
|
|
524 out_args['out_type'] = schema.out_type
|
|
525
|
|
526 # Define output fields
|
|
527 fields = list(schema.required)
|
|
528 if extended:
|
|
529 custom = parser.customFields(schema=schema)
|
|
530 fields.extend(custom)
|
|
531
|
|
532 # Parse and write output
|
|
533 with open(aligner_file, 'r') as f:
|
|
534 parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls)
|
|
535 germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
|
|
536 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
|
537 annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
|
|
538 regions=regions, writer=writer, out_file=out_file, out_args=out_args)
|
|
539
|
|
540 return output
|
|
541
|
|
542
|
|
543 def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
|
|
544 extended=False, format=default_format, out_file=None, out_args=default_out_args):
|
|
545 """
|
|
546 Main for iHMMuneAlign aligned sample sequences.
|
|
547
|
|
548 Arguments:
|
|
549 aligner_file : iHMMune-Align output file to process.
|
|
550 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
|
|
551 repo : folder with germline repertoire files.
|
|
552 partial : If True put incomplete alignments in the pass file.
|
|
553 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
|
|
554 extended : if True parse alignment scores, FWR and CDR region fields.
|
|
555 format : output format. One of 'changeo' or 'airr'.
|
|
556 out_file : output file name. Automatically generated from the input file if None.
|
|
557 out_args : common output argument dictionary from parseCommonArgs.
|
|
558
|
|
559 Returns:
|
|
560 dict : names of the 'pass' and 'fail' output files.
|
|
561 """
|
|
562 # Print parameter info
|
|
563 log = OrderedDict()
|
|
564 log['START'] = 'MakeDB'
|
|
565 log['COMMAND'] = 'ihmm'
|
|
566 log['ALIGNER_FILE'] = os.path.basename(aligner_file)
|
|
567 log['SEQ_FILE'] = os.path.basename(seq_file)
|
|
568 log['ASIS_ID'] = asis_id
|
|
569 log['PARTIAL'] = partial
|
|
570 log['EXTENDED'] = extended
|
|
571 printLog(log)
|
|
572
|
|
573 start_time = time()
|
|
574 printMessage('Loading files', start_time=start_time, width=20)
|
|
575
|
|
576 # Count records in sequence file
|
|
577 total_count = countSeqFile(seq_file)
|
|
578
|
|
579 # Get input sequence dictionary
|
|
580 seq_dict = getSeqDict(seq_file)
|
|
581
|
|
582 # Create germline repo dictionary
|
|
583 references = readGermlines(repo)
|
|
584
|
|
585 # Load supplementary annotation table
|
|
586 if cellranger_file is not None:
|
|
587 f = cellranger_extended if extended else cellranger_base
|
|
588 annotations = readCellRanger(cellranger_file, fields=f)
|
|
589 else:
|
|
590 annotations = None
|
|
591
|
|
592 printMessage('Done', start_time=start_time, end=True, width=20)
|
|
593
|
|
594 # Check for IMGT-gaps in germlines
|
|
595 if all('...' not in x for x in references.values()):
|
|
596 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
|
|
597
|
|
598 # Define format operators
|
|
599 try:
|
|
600 __, writer, schema = getFormatOperators(format)
|
|
601 except ValueError:
|
|
602 printError('Invalid format %s.' % format)
|
|
603 out_args['out_type'] = schema.out_type
|
|
604
|
|
605 # Define output fields
|
|
606 fields = list(schema.required)
|
|
607 if extended:
|
|
608 custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema)
|
|
609 fields.extend(custom)
|
|
610
|
|
611 # Parse and write output
|
|
612 with open(aligner_file, 'r') as f:
|
|
613 parse_iter = IHMMuneReader(f, seq_dict, references)
|
|
614 germ_iter = (addGermline(x, references) for x in parse_iter)
|
|
615 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
|
616 annotations=annotations, asis_id=asis_id, partial=partial,
|
|
617 writer=writer, out_file=out_file, out_args=out_args)
|
|
618
|
|
619 return output
|
|
620
|
|
621
|
|
622 def getArgParser():
|
|
623 """
|
|
624 Defines the ArgumentParser.
|
|
625
|
|
626 Returns:
|
|
627 argparse.ArgumentParser
|
|
628 """
|
|
629 fields = dedent(
|
|
630 '''
|
|
631 output files:
|
|
632 db-pass
|
|
633 database of alignment records with functionality information,
|
|
634 V and J calls, and a junction region.
|
|
635 db-fail
|
|
636 database with records that fail due to no productivity information,
|
|
637 no gene V assignment, no J assignment, or no junction region.
|
|
638
|
|
639 universal output fields:
|
|
640 sequence_id, sequence, sequence_alignment, germline_alignment,
|
|
641 rev_comp, productive, stop_codon, vj_in_frame, locus,
|
|
642 v_call, d_call, j_call, junction, junction_length, junction_aa,
|
|
643 v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
|
|
644 d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
|
|
645 j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
|
|
646 np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
|
|
647
|
|
648 imgt specific output fields:
|
|
649 n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
|
|
650 d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
|
|
651
|
|
652 igblast specific output fields:
|
|
653 v_score, v_identity, v_support, v_cigar,
|
|
654 d_score, d_identity, d_support, d_cigar,
|
|
655 j_score, j_identity, j_support, j_cigar
|
|
656
|
|
657 ihmm specific output fields:
|
|
658 vdj_score
|
|
659
|
|
660 10X specific output fields:
|
|
661 cell_id, c_call, consensus_count, umi_count,
|
|
662 v_call_10x, d_call_10x, j_call_10x,
|
|
663 junction_10x, junction_10x_aa
|
|
664 ''')
|
|
665
|
|
666 # Define ArgumentParser
|
|
667 parser = ArgumentParser(description=__doc__, epilog=fields,
|
|
668 formatter_class=CommonHelpFormatter, add_help=False)
|
|
669 group_help = parser.add_argument_group('help')
|
|
670 group_help.add_argument('--version', action='version',
|
|
671 version='%(prog)s:' + ' %s %s' %(__version__, __date__))
|
|
672 group_help.add_argument('-h', '--help', action='help', help='show this help message and exit')
|
|
673 subparsers = parser.add_subparsers(title='subcommands', dest='command',
|
|
674 help='Aligner used', metavar='')
|
|
675 # TODO: This is a temporary fix for Python issue 9253
|
|
676 subparsers.required = True
|
|
677
|
|
678 # Parent parser
|
|
679 parser_parent = getCommonArgParser(db_in=False)
|
|
680
|
|
681 # igblastn output parser
|
|
682 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
|
|
683 formatter_class=CommonHelpFormatter, add_help=False,
|
|
684 help='Process igblastn output.',
|
|
685 description='Process igblastn output.')
|
|
686 group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
|
|
687 group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
|
688 required=True,
|
|
689 help='''IgBLAST output files in format 7 with query sequence
|
|
690 (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
|
|
691 group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
692 help='''List of folders and/or fasta files containing
|
|
693 the same germline set used in the IgBLAST alignment. These
|
|
694 reference sequences must contain IMGT-numbering spacers (gaps)
|
|
695 in the V segment.''')
|
|
696 group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
|
|
697 required=True,
|
|
698 help='''List of input FASTA files (with .fasta, .fna or .fa
|
|
699 extension), containing sequences.''')
|
|
700 group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
|
|
701 help='''Table file containing 10X annotations (with .csv or .tsv
|
|
702 extension).''')
|
|
703 group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id',
|
|
704 help='''Specify to prevent input sequence headers from being parsed
|
|
705 to add new columns to database. Parsing of sequence headers requires
|
|
706 headers to be in the pRESTO annotation format, so this should be specified
|
|
707 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
708 Note, unrecognized header formats will default to this behavior.''')
|
|
709 group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls',
|
|
710 help='''Specify to prevent gene calls from being parsed into standard allele names
|
|
711 in both the IgBLAST output and reference database. Note, this requires
|
|
712 the sequence identifiers in the reference sequence set and the IgBLAST
|
|
713 database to be exact string matches.''')
|
|
714 group_igblast.add_argument('--partial', action='store_true', dest='partial',
|
|
715 help='''If specified, include incomplete V(D)J alignments in
|
|
716 the pass file instead of the fail file. An incomplete alignment
|
|
717 is defined as a record for which a valid IMGT-gapped sequence
|
|
718 cannot be built or that is missing a V gene assignment,
|
|
719 J gene assignment, junction region, or productivity call.''')
|
|
720 group_igblast.add_argument('--extended', action='store_true', dest='extended',
|
|
721 help='''Specify to include additional aligner specific fields in the output.
|
|
722 Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
|
|
723 fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
|
|
724 group_igblast.add_argument('--regions', action='store', dest='regions',
|
|
725 choices=('default', 'rhesus-igl'), default='default',
|
|
726 help='''IMGT CDR and FWR boundary definition to use.''')
|
|
727 parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
|
|
728
|
|
729 # igblastp output parser
|
|
730 parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent],
|
|
731 formatter_class=CommonHelpFormatter, add_help=False,
|
|
732 help='Process igblastp output.',
|
|
733 description='Process igblastp output.')
|
|
734 group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
|
|
735 group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
|
736 required=True,
|
|
737 help='''IgBLAST output files in format 7 with query sequence
|
|
738 (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
|
|
739 group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
740 help='''List of folders and/or fasta files containing
|
|
741 the same germline set used in the IgBLAST alignment. These
|
|
742 reference sequences must contain IMGT-numbering spacers (gaps)
|
|
743 in the V segment.''')
|
|
744 group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True,
|
|
745 help='''List of input FASTA files (with .fasta, .fna or .fa
|
|
746 extension), containing sequences.''')
|
|
747 group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
|
|
748 help='''Table file containing 10X annotations (with .csv or .tsv extension).''')
|
|
749 group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id',
|
|
750 help='''Specify to prevent input sequence headers from being parsed
|
|
751 to add new columns to database. Parsing of sequence headers requires
|
|
752 headers to be in the pRESTO annotation format, so this should be specified
|
|
753 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
754 Note, unrecognized header formats will default to this behavior.''')
|
|
755 group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls',
|
|
756 help='''Specify to prevent gene calls from being parsed into standard allele names
|
|
757 in both the IgBLAST output and reference database. Note, this requires
|
|
758 the sequence identifiers in the reference sequence set and the IgBLAST
|
|
759 database to be exact string matches.''')
|
|
760 group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
|
|
761 help='''Specify to include additional aligner specific fields in the output.
|
|
762 Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
|
|
763 group_igblast_aa.add_argument('--regions', action='store', dest='regions',
|
|
764 choices=('default', 'rhesus-igl'), default='default',
|
|
765 help='''IMGT CDR and FWR boundary definition to use.''')
|
|
766 parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True)
|
|
767
|
|
768
|
|
769 # IMGT aligner
|
|
770 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
|
|
771 formatter_class=CommonHelpFormatter, add_help=False,
|
|
772 help='''Process IMGT/HighV-Quest output
|
|
773 (does not work with V-QUEST).''',
|
|
774 description='''Process IMGT/HighV-Quest output
|
|
775 (does not work with V-QUEST).''')
|
|
776 group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
|
|
777 group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
|
778 help='''Either zipped IMGT output files (.zip or .txz) or a
|
|
779 folder containing unzipped IMGT output files (which must
|
|
780 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
|
|
781 and 6_Junction).''')
|
|
782 group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
|
|
783 help='''List of FASTA files (with .fasta, .fna or .fa
|
|
784 extension) that were submitted to IMGT/HighV-QUEST.
|
|
785 If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
|
|
786 will not be corrected.''')
|
|
787 group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
|
|
788 help='''List of folders and/or fasta files containing
|
|
789 the germline sequence set used by IMGT/HighV-QUEST.
|
|
790 These reference sequences must contain IMGT-numbering spacers (gaps)
|
|
791 in the V segment. If unspecified, the germline sequence reconstruction
|
|
792 will not be included in the output.''')
|
|
793 group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
|
|
794 help='''Table file containing 10X annotations (with .csv or .tsv
|
|
795 extension).''')
|
|
796 group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id',
|
|
797 help='''Specify to prevent input sequence headers from being parsed
|
|
798 to add new columns to database. Parsing of sequence headers requires
|
|
799 headers to be in the pRESTO annotation format, so this should be specified
|
|
800 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
801 Note, unrecognized header formats will default to this behavior.''')
|
|
802 group_imgt.add_argument('--partial', action='store_true', dest='partial',
|
|
803 help='''If specified, include incomplete V(D)J alignments in
|
|
804 the pass file instead of the fail file. An incomplete alignment
|
|
805 is defined as a record that is missing a V gene assignment,
|
|
806 J gene assignment, junction region, or productivity call.''')
|
|
807 group_imgt.add_argument('--extended', action='store_true', dest='extended',
|
|
808 help='''Specify to include additional aligner specific fields in the output.
|
|
809 Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
|
|
810 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
|
|
811 p3d_length, p5j_length and d_frame.''')
|
|
812 parser_imgt.set_defaults(func=parseIMGT)
|
|
813
|
|
814 # iHMMuneAlign Aligner
|
|
815 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
|
|
816 formatter_class=CommonHelpFormatter, add_help=False,
|
|
817 help='Process iHMMune-Align output.',
|
|
818 description='Process iHMMune-Align output.')
|
|
819 group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments')
|
|
820 group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
|
821 required=True,
|
|
822 help='''iHMMune-Align output file.''')
|
|
823 group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
824 help='''List of folders and/or FASTA files containing
|
|
825 the set of germline sequences used by iHMMune-Align. These
|
|
826 reference sequences must contain IMGT-numbering spacers (gaps)
|
|
827 in the V segment.''')
|
|
828 group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
|
|
829 required=True,
|
|
830 help='''List of input FASTA files (with .fasta, .fna or .fa
|
|
831 extension) containing sequences.''')
|
|
832 group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
|
|
833 help='''Table file containing 10X annotations (with .csv or .tsv
|
|
834 extension).''')
|
|
835 group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id',
|
|
836 help='''Specify to prevent input sequence headers from being parsed
|
|
837 to add new columns to database. Parsing of sequence headers requires
|
|
838 headers to be in the pRESTO annotation format, so this should be specified
|
|
839 when sequence headers are incompatible with the pRESTO annotation scheme.
|
|
840 Note, unrecognized header formats will default to this behavior.''')
|
|
841 group_ihmm.add_argument('--partial', action='store_true', dest='partial',
|
|
842 help='''If specified, include incomplete V(D)J alignments in
|
|
843 the pass file instead of the fail file. An incomplete alignment
|
|
844 is defined as a record for which a valid IMGT-gapped sequence
|
|
845 cannot be built or that is missing a V gene assignment,
|
|
846 J gene assignment, junction region, or productivity call.''')
|
|
847 group_ihmm.add_argument('--extended', action='store_true', dest='extended',
|
|
848 help='''Specify to include additional aligner specific fields in the output.
|
|
849 Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
|
|
850 adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
|
|
851 parser_ihmm.set_defaults(func=parseIHMM)
|
|
852
|
|
853 return parser
|
|
854
|
|
855
|
|
856 if __name__ == "__main__":
|
|
857 """
|
|
858 Parses command line arguments and calls main
|
|
859 """
|
|
860 parser = getArgParser()
|
|
861 checkArgs(parser)
|
|
862 args = parser.parse_args()
|
|
863 args_dict = parseCommonArgs(args, in_arg='aligner_files')
|
|
864
|
|
865 # Set no ID parsing if sequence files are not provided
|
|
866 if 'seq_files' in args_dict and not args_dict['seq_files']:
|
|
867 args_dict['asis_id'] = True
|
|
868
|
|
869 # Delete
|
|
870 if 'aligner_files' in args_dict: del args_dict['aligner_files']
|
|
871 if 'seq_files' in args_dict: del args_dict['seq_files']
|
|
872 if 'out_files' in args_dict: del args_dict['out_files']
|
|
873 if 'command' in args_dict: del args_dict['command']
|
|
874 if 'func' in args_dict: del args_dict['func']
|
|
875
|
|
876 # Call main
|
|
877 for i, f in enumerate(args.__dict__['aligner_files']):
|
|
878 args_dict['aligner_file'] = f
|
|
879 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
|
|
880 if args.__dict__['seq_files'] else None
|
|
881 args_dict['out_file'] = args.__dict__['out_files'][i] \
|
|
882 if args.__dict__['out_files'] else None
|
|
883 args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \
|
|
884 if args.__dict__['cellranger_file'] else None
|
|
885 args.func(**args_dict)
|