Mercurial > repos > davidvanzessen > shm_csr
comparison change_o/MakeDb.py @ 78:aff3ba86ef7a draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 31 Aug 2020 11:20:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
77:58d2377b507d | 78:aff3ba86ef7a |
---|---|
1 #!/usr/bin/env python3 | |
2 """ | |
3 Create tab-delimited database file to store sequence alignment information | |
4 """ | |
5 | |
6 # Info | |
7 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | |
8 from changeo import __version__, __date__ | |
9 | |
10 # Imports | |
11 import os | |
12 import re | |
13 import csv | |
14 from argparse import ArgumentParser | |
15 from collections import OrderedDict | |
16 from textwrap import dedent | |
17 from time import time | |
18 from Bio import SeqIO | |
19 | |
20 # Presto and changeo imports | |
21 from presto.Annotation import parseAnnotation | |
22 from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile | |
23 from changeo.Defaults import default_format, default_out_args | |
24 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs | |
25 from changeo.Alignment import RegionDefinition | |
26 from changeo.Gene import buildGermline | |
27 from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \ | |
28 AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader | |
29 from changeo.Receptor import ChangeoSchema, AIRRSchema | |
30 | |
31 # 10X Receptor attributes | |
32 cellranger_base = ['cell', 'c_call', 'conscount', 'umicount'] | |
33 cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount', | |
34 'v_call_10x', 'd_call_10x', 'j_call_10x', | |
35 'junction_10x', 'junction_10x_aa'] | |
36 | |
37 def readCellRanger(cellranger_file, fields=cellranger_base): | |
38 """ | |
39 Load a Cell Ranger annotation table | |
40 | |
41 Arguments: | |
42 cellranger_file (str): path to the annotation file. | |
43 fields (list): list of fields to keep. | |
44 | |
45 Returns: | |
46 dict: dict of dicts with contig_id as the primary key. | |
47 """ | |
48 # Mapping of 10X annotations to Receptor attributes | |
49 cellranger_map = {'cell': 'barcode', | |
50 'c_call': 'c_gene', | |
51 'locus': 'chain', | |
52 'conscount': 'reads', | |
53 'umicount': 'umis', | |
54 'v_call_10x': 'v_gene', | |
55 'd_call_10x': 'd_gene', | |
56 'j_call_10x': 'j_gene', | |
57 'junction_10x': 'cdr3_nt', | |
58 'junction_10x_aa': 'cdr3'} | |
59 | |
60 # Function to parse individual fields | |
61 def _parse(x): | |
62 return '' if x == 'None' else x | |
63 | |
64 # Generate annotation dictionary | |
65 ann_dict = {} | |
66 with open(cellranger_file) as csv_file: | |
67 # Detect delimiters | |
68 dialect = csv.Sniffer().sniff(csv_file.readline()) | |
69 csv_file.seek(0) | |
70 # Read in annotation file | |
71 csv_reader = csv.DictReader(csv_file, dialect=dialect) | |
72 | |
73 # Generate annotation dictionary | |
74 for row in csv_reader: | |
75 ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields} | |
76 | |
77 return ann_dict | |
78 | |
79 | |
80 def addGermline(receptor, references, amino_acid=False): | |
81 """ | |
82 Add full length germline to Receptor object | |
83 | |
84 Arguments: | |
85 receptor (changeo.Receptor.Receptor): Receptor object to modify. | |
86 references (dict): dictionary of IMGT-gapped references sequences. | |
87 amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline | |
88 | |
89 Returns: | |
90 changeo.Receptor.Receptor: modified Receptor with the germline sequence added. | |
91 """ | |
92 if amino_acid: | |
93 __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt', | |
94 amino_acid=True) | |
95 germline_seq = None if germlines is None else germlines['full'] | |
96 receptor.setField('germline_aa_imgt', germline_seq) | |
97 else: | |
98 __, germlines, __ = buildGermline(receptor, references, amino_acid=False) | |
99 germline_seq = None if germlines is None else germlines['full'] | |
100 receptor.setField('germline_imgt', germline_seq) | |
101 | |
102 return receptor | |
103 | |
104 | |
105 def getIDforIMGT(seq_file): | |
106 """ | |
107 Create a sequence ID translation using IMGT truncation. | |
108 | |
109 Arguments: | |
110 seq_file : a fasta file of sequences input to IMGT. | |
111 | |
112 Returns: | |
113 dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value. | |
114 """ | |
115 | |
116 # Create a sequence ID translation using IDs truncate up to space or 50 chars | |
117 ids = {} | |
118 for rec in readSeqFile(seq_file): | |
119 if len(rec.description) <= 50: | |
120 id_key = rec.description | |
121 else: | |
122 id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50]) | |
123 ids.update({id_key: rec.description}) | |
124 | |
125 return ids | |
126 | |
127 | |
128 def getSeqDict(seq_file): | |
129 """ | |
130 Create a dictionary from a sequence file. | |
131 | |
132 Arguments: | |
133 seq_file : sequence file. | |
134 | |
135 Returns: | |
136 dict : sequence description as keys with Bio.SeqRecords as values. | |
137 """ | |
138 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description) | |
139 | |
140 return seq_dict | |
141 | |
142 | |
143 def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None, | |
144 amino_acid=False, partial=False, asis_id=True, regions='default', | |
145 writer=AIRRWriter, out_file=None, out_args=default_out_args): | |
146 """ | |
147 Writes parsed records to an output file | |
148 | |
149 Arguments: | |
150 records : a iterator of Receptor objects containing alignment data. | |
151 fields : a list of ordered field names to write. | |
152 aligner_file : input file name. | |
153 total_count : number of records (for progress bar). | |
154 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. | |
155 annotations : additional annotation dictionary. | |
156 amino_acid : if True do verification on amino acid fields. | |
157 partial : if True put incomplete alignments in the pass file. | |
158 asis_id : if ID is to be parsed for pRESTO output with default delimiters. | |
159 regions (str): name of the IMGT FWR/CDR region definitions to use. | |
160 writer : writer class. | |
161 out_file : output file name. Automatically generated from the input file if None. | |
162 out_args : common output argument dictionary from parseCommonArgs. | |
163 | |
164 Returns: | |
165 None | |
166 """ | |
167 # Wrapper for opening handles and writers | |
168 def _open(x, f, writer=writer, out_file=out_file): | |
169 if out_file is not None and x == 'pass': | |
170 handle = open(out_file, 'w') | |
171 else: | |
172 handle = getOutputHandle(aligner_file, | |
173 out_label='db-%s' % x, | |
174 out_dir=out_args['out_dir'], | |
175 out_name=out_args['out_name'], | |
176 out_type=out_args['out_type']) | |
177 return handle, writer(handle, fields=f) | |
178 | |
179 # Function to convert fasta header annotations to changeo columns | |
180 def _changeo(f, header): | |
181 h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f] | |
182 f.extend(h) | |
183 return f | |
184 | |
185 def _airr(f, header): | |
186 h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f] | |
187 f.extend(h) | |
188 return f | |
189 | |
190 # Function to verify IMGT-gapped sequence and junction concur | |
191 def _imgt_check(rec): | |
192 try: | |
193 if amino_acid: | |
194 rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions) | |
195 x, y = rd.positions['junction'] | |
196 check = (rec.junction_aa == rec.sequence_aa_imgt[x:y]) | |
197 else: | |
198 rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions) | |
199 x, y = rd.positions['junction'] | |
200 check = (rec.junction == rec.sequence_imgt[x:y]) | |
201 except (TypeError, AttributeError): | |
202 check = False | |
203 return check | |
204 | |
205 # Function to check for valid records strictly | |
206 def _strict(rec): | |
207 if amino_acid: | |
208 valid = [rec.v_call and rec.v_call != 'None', | |
209 rec.j_call and rec.j_call != 'None', | |
210 rec.functional is not None, | |
211 rec.sequence_aa_imgt, | |
212 rec.junction_aa, | |
213 _imgt_check(rec)] | |
214 else: | |
215 valid = [rec.v_call and rec.v_call != 'None', | |
216 rec.j_call and rec.j_call != 'None', | |
217 rec.functional is not None, | |
218 rec.sequence_imgt, | |
219 rec.junction, | |
220 _imgt_check(rec)] | |
221 return all(valid) | |
222 | |
223 # Function to check for valid records loosely | |
224 def _gentle(rec): | |
225 valid = [rec.v_call and rec.v_call != 'None', | |
226 rec.d_call and rec.d_call != 'None', | |
227 rec.j_call and rec.j_call != 'None'] | |
228 return any(valid) | |
229 | |
230 # Set writer class and annotation conversion function | |
231 if writer == ChangeoWriter: | |
232 _annotate = _changeo | |
233 elif writer == AIRRWriter: | |
234 _annotate = _airr | |
235 else: | |
236 printError('Invalid output writer.') | |
237 | |
238 # Additional annotation (e.g. 10X cell calls) | |
239 # _append_table = None | |
240 # if cellranger_file is not None: | |
241 # with open(cellranger_file) as csv_file: | |
242 # # Read in annotation file (use Sniffer to discover file delimiters) | |
243 # dialect = csv.Sniffer().sniff(csv_file.readline()) | |
244 # csv_file.seek(0) | |
245 # csv_reader = csv.DictReader(csv_file, dialect = dialect) | |
246 # | |
247 # # Generate annotation dictionary | |
248 # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \ | |
249 # for field in cellranger_map.keys()} for entry in csv_reader} | |
250 # | |
251 # fields = _annotate(fields, cellranger_map.values()) | |
252 # _append_table = lambda sequence_id: anntab_dict[sequence_id] | |
253 | |
254 # Set pass criteria | |
255 _pass = _gentle if partial else _strict | |
256 | |
257 # Define log handle | |
258 if out_args['log_file'] is None: | |
259 log_handle = None | |
260 else: | |
261 log_handle = open(out_args['log_file'], 'w') | |
262 | |
263 # Initialize handles, writers and counters | |
264 pass_handle, pass_writer = None, None | |
265 fail_handle, fail_writer = None, None | |
266 pass_count, fail_count = 0, 0 | |
267 start_time = time() | |
268 | |
269 # Validate and write output | |
270 printProgress(0, total_count, 0.05, start_time=start_time) | |
271 for i, record in enumerate(records, start=1): | |
272 # Replace sequence description with full string, if required | |
273 if id_dict is not None and record.sequence_id in id_dict: | |
274 record.sequence_id = id_dict[record.sequence_id] | |
275 | |
276 # Parse sequence description into new columns | |
277 if not asis_id: | |
278 try: | |
279 ann_raw = parseAnnotation(record.sequence_id) | |
280 record.sequence_id = ann_raw.pop('ID') | |
281 | |
282 # Convert to Receptor fields | |
283 ann_parsed = OrderedDict() | |
284 for k, v in ann_raw.items(): | |
285 ann_parsed[ChangeoSchema.toReceptor(k)] = v | |
286 | |
287 # Add annotations to Receptor and update field list | |
288 record.setDict(ann_parsed, parse=True) | |
289 if i == 1: fields = _annotate(fields, ann_parsed.keys()) | |
290 except IndexError: | |
291 # Could not parse pRESTO-style annotations so fall back to no parse | |
292 asis_id = True | |
293 printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.') | |
294 | |
295 # Add supplemental annotation fields | |
296 # if _append_table is not None: | |
297 # record.setDict(_append_table(record.sequence_id), parse=True) | |
298 if annotations is not None: | |
299 record.setDict(annotations[record.sequence_id], parse=True) | |
300 if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys()) | |
301 | |
302 # Count pass or fail and write to appropriate file | |
303 if _pass(record): | |
304 pass_count += 1 | |
305 # Write row to pass file | |
306 try: | |
307 pass_writer.writeReceptor(record) | |
308 except AttributeError: | |
309 # Open pass file and writer | |
310 pass_handle, pass_writer = _open('pass', fields) | |
311 pass_writer.writeReceptor(record) | |
312 else: | |
313 fail_count += 1 | |
314 # Write row to fail file if specified | |
315 if out_args['failed']: | |
316 try: | |
317 fail_writer.writeReceptor(record) | |
318 except AttributeError: | |
319 # Open fail file and writer | |
320 fail_handle, fail_writer = _open('fail', fields) | |
321 fail_writer.writeReceptor(record) | |
322 | |
323 # Write log | |
324 if log_handle is not None: | |
325 log = OrderedDict([('ID', record.sequence_id), | |
326 ('V_CALL', record.v_call), | |
327 ('D_CALL', record.d_call), | |
328 ('J_CALL', record.j_call), | |
329 ('PRODUCTIVE', record.functional)]) | |
330 if not _imgt_check(record) and not amino_acid: | |
331 log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.' | |
332 printLog(log, log_handle) | |
333 | |
334 # Print progress | |
335 printProgress(i, total_count, 0.05, start_time=start_time) | |
336 | |
337 # Print console log | |
338 log = OrderedDict() | |
339 log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None | |
340 log['PASS'] = pass_count | |
341 log['FAIL'] = fail_count | |
342 log['END'] = 'MakeDb' | |
343 printLog(log) | |
344 | |
345 # Close file handles | |
346 output = {'pass': None, 'fail': None} | |
347 if pass_handle is not None: | |
348 output['pass'] = pass_handle.name | |
349 pass_handle.close() | |
350 if fail_handle is not None: | |
351 output['fail'] = fail_handle.name | |
352 fail_handle.close() | |
353 | |
354 return output | |
355 | |
356 | |
357 def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True, | |
358 extended=False, format=default_format, out_file=None, out_args=default_out_args): | |
359 """ | |
360 Main for IMGT aligned sample sequences. | |
361 | |
362 Arguments: | |
363 aligner_file : zipped file or unzipped folder output by IMGT. | |
364 seq_file : FASTA file input to IMGT (from which to get seqID). | |
365 repo : folder with germline repertoire files. | |
366 partial : If True put incomplete alignments in the pass file. | |
367 asis_id : if ID is to be parsed for pRESTO output with default delimiters. | |
368 extended : if True add alignment score, FWR, CDR and junction fields to output file. | |
369 format : output format. one of 'changeo' or 'airr'. | |
370 out_file : output file name. Automatically generated from the input file if None. | |
371 out_args : common output argument dictionary from parseCommonArgs. | |
372 | |
373 Returns: | |
374 dict : names of the 'pass' and 'fail' output files. | |
375 """ | |
376 # Print parameter info | |
377 log = OrderedDict() | |
378 log['START'] = 'MakeDb' | |
379 log['COMMAND'] = 'imgt' | |
380 log['ALIGNER_FILE'] = aligner_file | |
381 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | |
382 log['ASIS_ID'] = asis_id | |
383 log['PARTIAL'] = partial | |
384 log['EXTENDED'] = extended | |
385 printLog(log) | |
386 | |
387 start_time = time() | |
388 printMessage('Loading files', start_time=start_time, width=20) | |
389 | |
390 # Extract IMGT files | |
391 temp_dir, imgt_files = extractIMGT(aligner_file) | |
392 | |
393 # Count records in IMGT files | |
394 total_count = countDbFile(imgt_files['summary']) | |
395 | |
396 # Get (parsed) IDs from fasta file submitted to IMGT | |
397 id_dict = getIDforIMGT(seq_file) if seq_file else {} | |
398 | |
399 # Load supplementary annotation table | |
400 if cellranger_file is not None: | |
401 f = cellranger_extended if extended else cellranger_base | |
402 annotations = readCellRanger(cellranger_file, fields=f) | |
403 else: | |
404 annotations = None | |
405 | |
406 printMessage('Done', start_time=start_time, end=True, width=20) | |
407 | |
408 # Define format operators | |
409 try: | |
410 __, writer, schema = getFormatOperators(format) | |
411 except ValueError: | |
412 printError('Invalid format %s.' % format) | |
413 out_args['out_type'] = schema.out_type | |
414 | |
415 # Define output fields | |
416 fields = list(schema.required) | |
417 if extended: | |
418 custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema) | |
419 fields.extend(custom) | |
420 | |
421 # Parse IMGT output and write db | |
422 with open(imgt_files['summary'], 'r') as summary_handle, \ | |
423 open(imgt_files['gapped'], 'r') as gapped_handle, \ | |
424 open(imgt_files['ntseq'], 'r') as ntseq_handle, \ | |
425 open(imgt_files['junction'], 'r') as junction_handle: | |
426 | |
427 # Open parser | |
428 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle) | |
429 | |
430 # Add germline sequence | |
431 if repo is None: | |
432 germ_iter = parse_iter | |
433 else: | |
434 references = readGermlines(repo) | |
435 # Check for IMGT-gaps in germlines | |
436 if all('...' not in x for x in references.values()): | |
437 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') | |
438 germ_iter = (addGermline(x, references) for x in parse_iter) | |
439 | |
440 # Write db | |
441 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, | |
442 annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial, | |
443 writer=writer, out_file=out_file, out_args=out_args) | |
444 | |
445 # Cleanup temp directory | |
446 temp_dir.cleanup() | |
447 | |
448 return output | |
449 | |
450 | |
451 def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False, | |
452 asis_id=True, asis_calls=False, extended=False, regions='default', | |
453 format='changeo', out_file=None, out_args=default_out_args): | |
454 """ | |
455 Main for IgBLAST aligned sample sequences. | |
456 | |
457 Arguments: | |
458 aligner_file (str): IgBLAST output file to process. | |
459 seq_file (str): fasta file input to IgBlast (from which to get sequence). | |
460 repo (str): folder with germline repertoire files. | |
461 amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False. | |
462 partial : If True put incomplete alignments in the pass file. | |
463 asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters. | |
464 asis_calls (bool): if True do not parse gene calls for allele names. | |
465 extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output. | |
466 regions (str): name of the IMGT FWR/CDR region definitions to use. | |
467 format (str): output format. one of 'changeo' or 'airr'. | |
468 out_file (str): output file name. Automatically generated from the input file if None. | |
469 out_args (dict): common output argument dictionary from parseCommonArgs. | |
470 | |
471 Returns: | |
472 dict : names of the 'pass' and 'fail' output files. | |
473 """ | |
474 # Print parameter info | |
475 log = OrderedDict() | |
476 log['START'] = 'MakeDB' | |
477 log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast' | |
478 log['ALIGNER_FILE'] = os.path.basename(aligner_file) | |
479 log['SEQ_FILE'] = os.path.basename(seq_file) | |
480 log['ASIS_ID'] = asis_id | |
481 log['ASIS_CALLS'] = asis_calls | |
482 log['PARTIAL'] = partial | |
483 log['EXTENDED'] = extended | |
484 printLog(log) | |
485 | |
486 # Set amino acid conditions | |
487 if amino_acid: | |
488 format = '%s-aa' % format | |
489 parser = IgBLASTReaderAA | |
490 else: | |
491 parser = IgBLASTReader | |
492 | |
493 # Start | |
494 start_time = time() | |
495 printMessage('Loading files', start_time=start_time, width=20) | |
496 | |
497 # Count records in sequence file | |
498 total_count = countSeqFile(seq_file) | |
499 | |
500 # Get input sequence dictionary | |
501 seq_dict = getSeqDict(seq_file) | |
502 | |
503 # Create germline repo dictionary | |
504 references = readGermlines(repo, asis=asis_calls) | |
505 | |
506 # Load supplementary annotation table | |
507 if cellranger_file is not None: | |
508 f = cellranger_extended if extended else cellranger_base | |
509 annotations = readCellRanger(cellranger_file, fields=f) | |
510 else: | |
511 annotations = None | |
512 | |
513 printMessage('Done', start_time=start_time, end=True, width=20) | |
514 | |
515 # Check for IMGT-gaps in germlines | |
516 if all('...' not in x for x in references.values()): | |
517 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') | |
518 | |
519 # Define format operators | |
520 try: | |
521 __, writer, schema = getFormatOperators(format) | |
522 except ValueError: | |
523 printError('Invalid format %s.' % format) | |
524 out_args['out_type'] = schema.out_type | |
525 | |
526 # Define output fields | |
527 fields = list(schema.required) | |
528 if extended: | |
529 custom = parser.customFields(schema=schema) | |
530 fields.extend(custom) | |
531 | |
532 # Parse and write output | |
533 with open(aligner_file, 'r') as f: | |
534 parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls) | |
535 germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter) | |
536 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, | |
537 annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id, | |
538 regions=regions, writer=writer, out_file=out_file, out_args=out_args) | |
539 | |
540 return output | |
541 | |
542 | |
543 def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True, | |
544 extended=False, format=default_format, out_file=None, out_args=default_out_args): | |
545 """ | |
546 Main for iHMMuneAlign aligned sample sequences. | |
547 | |
548 Arguments: | |
549 aligner_file : iHMMune-Align output file to process. | |
550 seq_file : fasta file input to iHMMuneAlign (from which to get sequence). | |
551 repo : folder with germline repertoire files. | |
552 partial : If True put incomplete alignments in the pass file. | |
553 asis_id : if ID is to be parsed for pRESTO output with default delimiters. | |
554 extended : if True parse alignment scores, FWR and CDR region fields. | |
555 format : output format. One of 'changeo' or 'airr'. | |
556 out_file : output file name. Automatically generated from the input file if None. | |
557 out_args : common output argument dictionary from parseCommonArgs. | |
558 | |
559 Returns: | |
560 dict : names of the 'pass' and 'fail' output files. | |
561 """ | |
562 # Print parameter info | |
563 log = OrderedDict() | |
564 log['START'] = 'MakeDB' | |
565 log['COMMAND'] = 'ihmm' | |
566 log['ALIGNER_FILE'] = os.path.basename(aligner_file) | |
567 log['SEQ_FILE'] = os.path.basename(seq_file) | |
568 log['ASIS_ID'] = asis_id | |
569 log['PARTIAL'] = partial | |
570 log['EXTENDED'] = extended | |
571 printLog(log) | |
572 | |
573 start_time = time() | |
574 printMessage('Loading files', start_time=start_time, width=20) | |
575 | |
576 # Count records in sequence file | |
577 total_count = countSeqFile(seq_file) | |
578 | |
579 # Get input sequence dictionary | |
580 seq_dict = getSeqDict(seq_file) | |
581 | |
582 # Create germline repo dictionary | |
583 references = readGermlines(repo) | |
584 | |
585 # Load supplementary annotation table | |
586 if cellranger_file is not None: | |
587 f = cellranger_extended if extended else cellranger_base | |
588 annotations = readCellRanger(cellranger_file, fields=f) | |
589 else: | |
590 annotations = None | |
591 | |
592 printMessage('Done', start_time=start_time, end=True, width=20) | |
593 | |
594 # Check for IMGT-gaps in germlines | |
595 if all('...' not in x for x in references.values()): | |
596 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') | |
597 | |
598 # Define format operators | |
599 try: | |
600 __, writer, schema = getFormatOperators(format) | |
601 except ValueError: | |
602 printError('Invalid format %s.' % format) | |
603 out_args['out_type'] = schema.out_type | |
604 | |
605 # Define output fields | |
606 fields = list(schema.required) | |
607 if extended: | |
608 custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema) | |
609 fields.extend(custom) | |
610 | |
611 # Parse and write output | |
612 with open(aligner_file, 'r') as f: | |
613 parse_iter = IHMMuneReader(f, seq_dict, references) | |
614 germ_iter = (addGermline(x, references) for x in parse_iter) | |
615 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, | |
616 annotations=annotations, asis_id=asis_id, partial=partial, | |
617 writer=writer, out_file=out_file, out_args=out_args) | |
618 | |
619 return output | |
620 | |
621 | |
622 def getArgParser(): | |
623 """ | |
624 Defines the ArgumentParser. | |
625 | |
626 Returns: | |
627 argparse.ArgumentParser | |
628 """ | |
629 fields = dedent( | |
630 ''' | |
631 output files: | |
632 db-pass | |
633 database of alignment records with functionality information, | |
634 V and J calls, and a junction region. | |
635 db-fail | |
636 database with records that fail due to no productivity information, | |
637 no gene V assignment, no J assignment, or no junction region. | |
638 | |
639 universal output fields: | |
640 sequence_id, sequence, sequence_alignment, germline_alignment, | |
641 rev_comp, productive, stop_codon, vj_in_frame, locus, | |
642 v_call, d_call, j_call, junction, junction_length, junction_aa, | |
643 v_sequence_start, v_sequence_end, v_germline_start, v_germline_end, | |
644 d_sequence_start, d_sequence_end, d_germline_start, d_germline_end, | |
645 j_sequence_start, j_sequence_end, j_germline_start, j_germline_end, | |
646 np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3 | |
647 | |
648 imgt specific output fields: | |
649 n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length, | |
650 d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity | |
651 | |
652 igblast specific output fields: | |
653 v_score, v_identity, v_support, v_cigar, | |
654 d_score, d_identity, d_support, d_cigar, | |
655 j_score, j_identity, j_support, j_cigar | |
656 | |
657 ihmm specific output fields: | |
658 vdj_score | |
659 | |
660 10X specific output fields: | |
661 cell_id, c_call, consensus_count, umi_count, | |
662 v_call_10x, d_call_10x, j_call_10x, | |
663 junction_10x, junction_10x_aa | |
664 ''') | |
665 | |
666 # Define ArgumentParser | |
667 parser = ArgumentParser(description=__doc__, epilog=fields, | |
668 formatter_class=CommonHelpFormatter, add_help=False) | |
669 group_help = parser.add_argument_group('help') | |
670 group_help.add_argument('--version', action='version', | |
671 version='%(prog)s:' + ' %s %s' %(__version__, __date__)) | |
672 group_help.add_argument('-h', '--help', action='help', help='show this help message and exit') | |
673 subparsers = parser.add_subparsers(title='subcommands', dest='command', | |
674 help='Aligner used', metavar='') | |
675 # TODO: This is a temporary fix for Python issue 9253 | |
676 subparsers.required = True | |
677 | |
678 # Parent parser | |
679 parser_parent = getCommonArgParser(db_in=False) | |
680 | |
681 # igblastn output parser | |
682 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], | |
683 formatter_class=CommonHelpFormatter, add_help=False, | |
684 help='Process igblastn output.', | |
685 description='Process igblastn output.') | |
686 group_igblast = parser_igblast.add_argument_group('aligner parsing arguments') | |
687 group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', | |
688 required=True, | |
689 help='''IgBLAST output files in format 7 with query sequence | |
690 (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''') | |
691 group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
692 help='''List of folders and/or fasta files containing | |
693 the same germline set used in the IgBLAST alignment. These | |
694 reference sequences must contain IMGT-numbering spacers (gaps) | |
695 in the V segment.''') | |
696 group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
697 required=True, | |
698 help='''List of input FASTA files (with .fasta, .fna or .fa | |
699 extension), containing sequences.''') | |
700 group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', | |
701 help='''Table file containing 10X annotations (with .csv or .tsv | |
702 extension).''') | |
703 group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id', | |
704 help='''Specify to prevent input sequence headers from being parsed | |
705 to add new columns to database. Parsing of sequence headers requires | |
706 headers to be in the pRESTO annotation format, so this should be specified | |
707 when sequence headers are incompatible with the pRESTO annotation scheme. | |
708 Note, unrecognized header formats will default to this behavior.''') | |
709 group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls', | |
710 help='''Specify to prevent gene calls from being parsed into standard allele names | |
711 in both the IgBLAST output and reference database. Note, this requires | |
712 the sequence identifiers in the reference sequence set and the IgBLAST | |
713 database to be exact string matches.''') | |
714 group_igblast.add_argument('--partial', action='store_true', dest='partial', | |
715 help='''If specified, include incomplete V(D)J alignments in | |
716 the pass file instead of the fail file. An incomplete alignment | |
717 is defined as a record for which a valid IMGT-gapped sequence | |
718 cannot be built or that is missing a V gene assignment, | |
719 J gene assignment, junction region, or productivity call.''') | |
720 group_igblast.add_argument('--extended', action='store_true', dest='extended', | |
721 help='''Specify to include additional aligner specific fields in the output. | |
722 Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar, | |
723 fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''') | |
724 group_igblast.add_argument('--regions', action='store', dest='regions', | |
725 choices=('default', 'rhesus-igl'), default='default', | |
726 help='''IMGT CDR and FWR boundary definition to use.''') | |
727 parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False) | |
728 | |
729 # igblastp output parser | |
730 parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent], | |
731 formatter_class=CommonHelpFormatter, add_help=False, | |
732 help='Process igblastp output.', | |
733 description='Process igblastp output.') | |
734 group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments') | |
735 group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files', | |
736 required=True, | |
737 help='''IgBLAST output files in format 7 with query sequence | |
738 (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''') | |
739 group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
740 help='''List of folders and/or fasta files containing | |
741 the same germline set used in the IgBLAST alignment. These | |
742 reference sequences must contain IMGT-numbering spacers (gaps) | |
743 in the V segment.''') | |
744 group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True, | |
745 help='''List of input FASTA files (with .fasta, .fna or .fa | |
746 extension), containing sequences.''') | |
747 group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', | |
748 help='''Table file containing 10X annotations (with .csv or .tsv extension).''') | |
749 group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id', | |
750 help='''Specify to prevent input sequence headers from being parsed | |
751 to add new columns to database. Parsing of sequence headers requires | |
752 headers to be in the pRESTO annotation format, so this should be specified | |
753 when sequence headers are incompatible with the pRESTO annotation scheme. | |
754 Note, unrecognized header formats will default to this behavior.''') | |
755 group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls', | |
756 help='''Specify to prevent gene calls from being parsed into standard allele names | |
757 in both the IgBLAST output and reference database. Note, this requires | |
758 the sequence identifiers in the reference sequence set and the IgBLAST | |
759 database to be exact string matches.''') | |
760 group_igblast_aa.add_argument('--extended', action='store_true', dest='extended', | |
761 help='''Specify to include additional aligner specific fields in the output. | |
762 Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''') | |
763 group_igblast_aa.add_argument('--regions', action='store', dest='regions', | |
764 choices=('default', 'rhesus-igl'), default='default', | |
765 help='''IMGT CDR and FWR boundary definition to use.''') | |
766 parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True) | |
767 | |
768 | |
769 # IMGT aligner | |
770 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], | |
771 formatter_class=CommonHelpFormatter, add_help=False, | |
772 help='''Process IMGT/HighV-Quest output | |
773 (does not work with V-QUEST).''', | |
774 description='''Process IMGT/HighV-Quest output | |
775 (does not work with V-QUEST).''') | |
776 group_imgt = parser_imgt.add_argument_group('aligner parsing arguments') | |
777 group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files', | |
778 help='''Either zipped IMGT output files (.zip or .txz) or a | |
779 folder containing unzipped IMGT output files (which must | |
780 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | |
781 and 6_Junction).''') | |
782 group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False, | |
783 help='''List of FASTA files (with .fasta, .fna or .fa | |
784 extension) that were submitted to IMGT/HighV-QUEST. | |
785 If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST | |
786 will not be corrected.''') | |
787 group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False, | |
788 help='''List of folders and/or fasta files containing | |
789 the germline sequence set used by IMGT/HighV-QUEST. | |
790 These reference sequences must contain IMGT-numbering spacers (gaps) | |
791 in the V segment. If unspecified, the germline sequence reconstruction | |
792 will not be included in the output.''') | |
793 group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', | |
794 help='''Table file containing 10X annotations (with .csv or .tsv | |
795 extension).''') | |
796 group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id', | |
797 help='''Specify to prevent input sequence headers from being parsed | |
798 to add new columns to database. Parsing of sequence headers requires | |
799 headers to be in the pRESTO annotation format, so this should be specified | |
800 when sequence headers are incompatible with the pRESTO annotation scheme. | |
801 Note, unrecognized header formats will default to this behavior.''') | |
802 group_imgt.add_argument('--partial', action='store_true', dest='partial', | |
803 help='''If specified, include incomplete V(D)J alignments in | |
804 the pass file instead of the fail file. An incomplete alignment | |
805 is defined as a record that is missing a V gene assignment, | |
806 J gene assignment, junction region, or productivity call.''') | |
807 group_imgt.add_argument('--extended', action='store_true', dest='extended', | |
808 help='''Specify to include additional aligner specific fields in the output. | |
809 Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4, | |
810 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length, | |
811 p3d_length, p5j_length and d_frame.''') | |
812 parser_imgt.set_defaults(func=parseIMGT) | |
813 | |
814 # iHMMuneAlign Aligner | |
815 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], | |
816 formatter_class=CommonHelpFormatter, add_help=False, | |
817 help='Process iHMMune-Align output.', | |
818 description='Process iHMMune-Align output.') | |
819 group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments') | |
820 group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files', | |
821 required=True, | |
822 help='''iHMMune-Align output file.''') | |
823 group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
824 help='''List of folders and/or FASTA files containing | |
825 the set of germline sequences used by iHMMune-Align. These | |
826 reference sequences must contain IMGT-numbering spacers (gaps) | |
827 in the V segment.''') | |
828 group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
829 required=True, | |
830 help='''List of input FASTA files (with .fasta, .fna or .fa | |
831 extension) containing sequences.''') | |
832 group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', | |
833 help='''Table file containing 10X annotations (with .csv or .tsv | |
834 extension).''') | |
835 group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id', | |
836 help='''Specify to prevent input sequence headers from being parsed | |
837 to add new columns to database. Parsing of sequence headers requires | |
838 headers to be in the pRESTO annotation format, so this should be specified | |
839 when sequence headers are incompatible with the pRESTO annotation scheme. | |
840 Note, unrecognized header formats will default to this behavior.''') | |
841 group_ihmm.add_argument('--partial', action='store_true', dest='partial', | |
842 help='''If specified, include incomplete V(D)J alignments in | |
843 the pass file instead of the fail file. An incomplete alignment | |
844 is defined as a record for which a valid IMGT-gapped sequence | |
845 cannot be built or that is missing a V gene assignment, | |
846 J gene assignment, junction region, or productivity call.''') | |
847 group_ihmm.add_argument('--extended', action='store_true', dest='extended', | |
848 help='''Specify to include additional aligner specific fields in the output. | |
849 Adds the path score of the iHMMune-Align hidden Markov model as vdj_score; | |
850 adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''') | |
851 parser_ihmm.set_defaults(func=parseIHMM) | |
852 | |
853 return parser | |
854 | |
855 | |
856 if __name__ == "__main__": | |
857 """ | |
858 Parses command line arguments and calls main | |
859 """ | |
860 parser = getArgParser() | |
861 checkArgs(parser) | |
862 args = parser.parse_args() | |
863 args_dict = parseCommonArgs(args, in_arg='aligner_files') | |
864 | |
865 # Set no ID parsing if sequence files are not provided | |
866 if 'seq_files' in args_dict and not args_dict['seq_files']: | |
867 args_dict['asis_id'] = True | |
868 | |
869 # Delete | |
870 if 'aligner_files' in args_dict: del args_dict['aligner_files'] | |
871 if 'seq_files' in args_dict: del args_dict['seq_files'] | |
872 if 'out_files' in args_dict: del args_dict['out_files'] | |
873 if 'command' in args_dict: del args_dict['command'] | |
874 if 'func' in args_dict: del args_dict['func'] | |
875 | |
876 # Call main | |
877 for i, f in enumerate(args.__dict__['aligner_files']): | |
878 args_dict['aligner_file'] = f | |
879 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | |
880 if args.__dict__['seq_files'] else None | |
881 args_dict['out_file'] = args.__dict__['out_files'][i] \ | |
882 if args.__dict__['out_files'] else None | |
883 args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \ | |
884 if args.__dict__['cellranger_file'] else None | |
885 args.func(**args_dict) |