Mercurial > repos > davidvanzessen > change_o
comparison MakeDb.py @ 0:183edf446dcf draft default tip
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 17 Jul 2017 07:44:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:183edf446dcf |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 """ | |
| 3 Create tab-delimited database file to store sequence alignment information | |
| 4 """ | |
| 5 # Info | |
| 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | |
| 7 from changeo import __version__, __date__ | |
| 8 | |
| 9 # Imports | |
| 10 import os | |
| 11 import sys | |
| 12 from argparse import ArgumentParser | |
| 13 from collections import OrderedDict | |
| 14 from textwrap import dedent | |
| 15 from time import time | |
| 16 from Bio import SeqIO | |
| 17 | |
| 18 # Presto and changeo imports | |
| 19 from presto.Defaults import default_out_args | |
| 20 from presto.Annotation import parseAnnotation | |
| 21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile | |
| 22 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs | |
| 23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo | |
| 24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT | |
| 25 | |
| 26 | |
| 27 def getFilePrefix(aligner_output, out_args): | |
| 28 """ | |
| 29 Get file name prefix and create output directory | |
| 30 | |
| 31 Arguments: | |
| 32 aligner_output : aligner output file or directory. | |
| 33 out_args : dictionary of output arguments. | |
| 34 | |
| 35 Returns: | |
| 36 str : file name prefix. | |
| 37 """ | |
| 38 # Determine output directory | |
| 39 if not out_args['out_dir']: | |
| 40 out_dir = os.path.dirname(os.path.abspath(aligner_output)) | |
| 41 else: | |
| 42 out_dir = os.path.abspath(out_args['out_dir']) | |
| 43 if not os.path.exists(out_dir): | |
| 44 os.mkdir(out_dir) | |
| 45 | |
| 46 # Determine file prefix | |
| 47 if out_args['out_name']: | |
| 48 file_prefix = out_args['out_name'] | |
| 49 else: | |
| 50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] | |
| 51 | |
| 52 return os.path.join(out_dir, file_prefix) | |
| 53 | |
| 54 | |
| 55 def getSeqDict(seq_file): | |
| 56 """ | |
| 57 Create a dictionary from a sequence file. | |
| 58 | |
| 59 Arguments: | |
| 60 seq_file : sequence file. | |
| 61 | |
| 62 Returns: | |
| 63 dict : sequence description as keys with Bio.SeqRecords as values. | |
| 64 """ | |
| 65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), | |
| 66 key_function=lambda x: x.description) | |
| 67 | |
| 68 return seq_dict | |
| 69 | |
| 70 | |
| 71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, | |
| 72 out_args=default_out_args): | |
| 73 """ | |
| 74 Writes tab-delimited database file in output directory. | |
| 75 | |
| 76 Arguments: | |
| 77 db : a iterator of IgRecord objects containing alignment data. | |
| 78 fields : a list of ordered field names to write. | |
| 79 file_prefix : directory and prefix for CLIP tab-delim file. | |
| 80 total_count : number of records (for progress bar). | |
| 81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. | |
| 82 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
| 83 partial : if True put incomplete alignments in the pass file. | |
| 84 out_args : common output argument dictionary from parseCommonArgs. | |
| 85 | |
| 86 Returns: | |
| 87 None | |
| 88 """ | |
| 89 # Function to check for valid records strictly | |
| 90 def _pass_strict(rec): | |
| 91 valid = [rec.v_call and rec.v_call != 'None', | |
| 92 rec.j_call and rec.j_call != 'None', | |
| 93 rec.functional is not None, | |
| 94 rec.seq_vdj, | |
| 95 rec.junction] | |
| 96 return all(valid) | |
| 97 | |
| 98 # Function to check for valid records loosely | |
| 99 def _pass_gentle(rec): | |
| 100 valid = [rec.v_call and rec.v_call != 'None', | |
| 101 rec.d_call and rec.d_call != 'None', | |
| 102 rec.j_call and rec.j_call != 'None'] | |
| 103 return any(valid) | |
| 104 | |
| 105 # Set pass criteria | |
| 106 _pass = _pass_gentle if partial else _pass_strict | |
| 107 | |
| 108 # Define output file names | |
| 109 pass_file = '%s_db-pass.tab' % file_prefix | |
| 110 fail_file = '%s_db-fail.tab' % file_prefix | |
| 111 | |
| 112 # Initiate handles, writers and counters | |
| 113 pass_handle = None | |
| 114 fail_handle = None | |
| 115 pass_writer = None | |
| 116 fail_writer = None | |
| 117 start_time = time() | |
| 118 rec_count = pass_count = fail_count = 0 | |
| 119 | |
| 120 # Validate and write output | |
| 121 printProgress(0, total_count, 0.05, start_time) | |
| 122 for i, record in enumerate(db, start=1): | |
| 123 | |
| 124 # Replace sequence description with full string, if required | |
| 125 if id_dict is not None and record.id in id_dict: | |
| 126 record.id = id_dict[record.id] | |
| 127 | |
| 128 # Parse sequence description into new columns | |
| 129 if not no_parse: | |
| 130 try: | |
| 131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | |
| 132 record.id = record.annotations['ID'] | |
| 133 del record.annotations['ID'] | |
| 134 | |
| 135 # TODO: This is not the best approach. should pass in output fields. | |
| 136 # If first record, use parsed description to define extra columns | |
| 137 if i == 1: fields.extend(list(record.annotations.keys())) | |
| 138 except IndexError: | |
| 139 # Could not parse pRESTO-style annotations so fall back to no parse | |
| 140 no_parse = True | |
| 141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') | |
| 142 | |
| 143 # Count pass or fail and write to appropriate file | |
| 144 if _pass(record): | |
| 145 # Open pass file | |
| 146 if pass_writer is None: | |
| 147 pass_handle = open(pass_file, 'wt') | |
| 148 pass_writer = getDbWriter(pass_handle, add_fields=fields) | |
| 149 | |
| 150 # Write row to pass file | |
| 151 pass_count += 1 | |
| 152 pass_writer.writerow(record.toDict()) | |
| 153 else: | |
| 154 # Open failed file | |
| 155 if out_args['failed'] and fail_writer is None: | |
| 156 fail_handle = open(fail_file, 'wt') | |
| 157 fail_writer = getDbWriter(fail_handle, add_fields=fields) | |
| 158 | |
| 159 # Write row to fail file if specified | |
| 160 fail_count += 1 | |
| 161 if fail_writer is not None: | |
| 162 fail_writer.writerow(record.toDict()) | |
| 163 | |
| 164 # Print progress | |
| 165 printProgress(i, total_count, 0.05, start_time) | |
| 166 | |
| 167 # Print consol log | |
| 168 log = OrderedDict() | |
| 169 log['OUTPUT'] = pass_file | |
| 170 log['PASS'] = pass_count | |
| 171 log['FAIL'] = fail_count | |
| 172 log['END'] = 'MakeDb' | |
| 173 printLog(log) | |
| 174 | |
| 175 if pass_handle is not None: pass_handle.close() | |
| 176 if fail_handle is not None: fail_handle.close() | |
| 177 | |
| 178 | |
| 179 # TODO: may be able to merge with other mains | |
| 180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, | |
| 181 parse_scores=False, parse_regions=False, parse_junction=False, | |
| 182 out_args=default_out_args): | |
| 183 """ | |
| 184 Main for IMGT aligned sample sequences. | |
| 185 | |
| 186 Arguments: | |
| 187 aligner_output : zipped file or unzipped folder output by IMGT. | |
| 188 seq_file : FASTA file input to IMGT (from which to get seqID). | |
| 189 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
| 190 partial : If True put incomplete alignments in the pass file. | |
| 191 parse_scores : if True add alignment score fields to output file. | |
| 192 parse_regions : if True add FWR and CDR region fields to output file. | |
| 193 out_args : common output argument dictionary from parseCommonArgs. | |
| 194 | |
| 195 Returns: | |
| 196 None | |
| 197 """ | |
| 198 # Print parameter info | |
| 199 log = OrderedDict() | |
| 200 log['START'] = 'MakeDb' | |
| 201 log['ALIGNER'] = 'IMGT' | |
| 202 log['ALIGNER_OUTPUT'] = aligner_output | |
| 203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | |
| 204 log['NO_PARSE'] = no_parse | |
| 205 log['PARTIAL'] = partial | |
| 206 log['SCORES'] = parse_scores | |
| 207 log['REGIONS'] = parse_regions | |
| 208 log['JUNCTION'] = parse_junction | |
| 209 printLog(log) | |
| 210 | |
| 211 start_time = time() | |
| 212 printMessage('Loading sequence files', start_time=start_time, width=25) | |
| 213 # Extract IMGT files | |
| 214 temp_dir, imgt_files = extractIMGT(aligner_output) | |
| 215 # Count records in IMGT files | |
| 216 total_count = countDbFile(imgt_files['summary']) | |
| 217 # Get (parsed) IDs from fasta file submitted to IMGT | |
| 218 id_dict = getIDforIMGT(seq_file) if seq_file else {} | |
| 219 printMessage('Done', start_time=start_time, end=True, width=25) | |
| 220 | |
| 221 # Parse IMGT output and write db | |
| 222 with open(imgt_files['summary'], 'r') as summary_handle, \ | |
| 223 open(imgt_files['gapped'], 'r') as gapped_handle, \ | |
| 224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \ | |
| 225 open(imgt_files['junction'], 'r') as junction_handle: | |
| 226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, | |
| 227 parse_scores=parse_scores, parse_regions=parse_regions, | |
| 228 parse_junction=parse_junction) | |
| 229 file_prefix = getFilePrefix(aligner_output, out_args) | |
| 230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, | |
| 231 no_parse=no_parse, partial=partial, out_args=out_args) | |
| 232 | |
| 233 # Cleanup temp directory | |
| 234 temp_dir.cleanup() | |
| 235 | |
| 236 return None | |
| 237 | |
| 238 | |
| 239 # TODO: may be able to merge with other mains | |
| 240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, | |
| 241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, | |
| 242 out_args=default_out_args): | |
| 243 """ | |
| 244 Main for IgBLAST aligned sample sequences. | |
| 245 | |
| 246 Arguments: | |
| 247 aligner_output : IgBLAST output file to process. | |
| 248 seq_file : fasta file input to IgBlast (from which to get sequence). | |
| 249 repo : folder with germline repertoire files. | |
| 250 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
| 251 partial : If True put incomplete alignments in the pass file. | |
| 252 parse_regions : if True add FWR and CDR fields to output file. | |
| 253 parse_scores : if True add alignment score fields to output file. | |
| 254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST | |
| 255 out_args : common output argument dictionary from parseCommonArgs. | |
| 256 | |
| 257 Returns: | |
| 258 None | |
| 259 """ | |
| 260 # Print parameter info | |
| 261 log = OrderedDict() | |
| 262 log['START'] = 'MakeDB' | |
| 263 log['ALIGNER'] = 'IgBlast' | |
| 264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) | |
| 265 log['SEQ_FILE'] = os.path.basename(seq_file) | |
| 266 log['NO_PARSE'] = no_parse | |
| 267 log['PARTIAL'] = partial | |
| 268 log['SCORES'] = parse_scores | |
| 269 log['REGIONS'] = parse_regions | |
| 270 printLog(log) | |
| 271 | |
| 272 start_time = time() | |
| 273 printMessage('Loading sequence files', start_time=start_time, width=25) | |
| 274 # Count records in sequence file | |
| 275 total_count = countSeqFile(seq_file) | |
| 276 # Get input sequence dictionary | |
| 277 seq_dict = getSeqDict(seq_file) | |
| 278 # Create germline repo dictionary | |
| 279 repo_dict = readRepo(repo) | |
| 280 printMessage('Done', start_time=start_time, end=True, width=25) | |
| 281 | |
| 282 # Parse and write output | |
| 283 with open(aligner_output, 'r') as f: | |
| 284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict, | |
| 285 parse_scores=parse_scores, parse_regions=parse_regions, | |
| 286 parse_igblast_cdr3=parse_igblast_cdr3) | |
| 287 file_prefix = getFilePrefix(aligner_output, out_args) | |
| 288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, | |
| 289 no_parse=no_parse, partial=partial, out_args=out_args) | |
| 290 | |
| 291 return None | |
| 292 | |
| 293 | |
| 294 # TODO: may be able to merge with other mains | |
| 295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, | |
| 296 parse_scores=False, parse_regions=False, out_args=default_out_args): | |
| 297 """ | |
| 298 Main for iHMMuneAlign aligned sample sequences. | |
| 299 | |
| 300 Arguments: | |
| 301 aligner_output : iHMMune-Align output file to process. | |
| 302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence). | |
| 303 repo : folder with germline repertoire files. | |
| 304 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
| 305 partial : If True put incomplete alignments in the pass file. | |
| 306 parse_scores : if True parse alignment scores. | |
| 307 parse_regions : if True add FWR and CDR region fields. | |
| 308 out_args : common output argument dictionary from parseCommonArgs. | |
| 309 | |
| 310 Returns: | |
| 311 None | |
| 312 """ | |
| 313 # Print parameter info | |
| 314 log = OrderedDict() | |
| 315 log['START'] = 'MakeDB' | |
| 316 log['ALIGNER'] = 'iHMMune-Align' | |
| 317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) | |
| 318 log['SEQ_FILE'] = os.path.basename(seq_file) | |
| 319 log['NO_PARSE'] = no_parse | |
| 320 log['PARTIAL'] = partial | |
| 321 log['SCORES'] = parse_scores | |
| 322 log['REGIONS'] = parse_regions | |
| 323 printLog(log) | |
| 324 | |
| 325 start_time = time() | |
| 326 printMessage('Loading sequence files', start_time=start_time, width=25) | |
| 327 # Count records in sequence file | |
| 328 total_count = countSeqFile(seq_file) | |
| 329 # Get input sequence dictionary | |
| 330 seq_dict = getSeqDict(seq_file) | |
| 331 # Create germline repo dictionary | |
| 332 repo_dict = readRepo(repo) | |
| 333 printMessage('Done', start_time=start_time, end=True, width=25) | |
| 334 | |
| 335 # Parse and write output | |
| 336 with open(aligner_output, 'r') as f: | |
| 337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict, | |
| 338 parse_scores=parse_scores, parse_regions=parse_regions) | |
| 339 file_prefix = getFilePrefix(aligner_output, out_args) | |
| 340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, | |
| 341 no_parse=no_parse, partial=partial, out_args=out_args) | |
| 342 | |
| 343 return None | |
| 344 | |
| 345 | |
| 346 def getArgParser(): | |
| 347 """ | |
| 348 Defines the ArgumentParser. | |
| 349 | |
| 350 Returns: | |
| 351 argparse.ArgumentParser | |
| 352 """ | |
| 353 fields = dedent( | |
| 354 ''' | |
| 355 output files: | |
| 356 db-pass | |
| 357 database of alignment records with functionality information, | |
| 358 V and J calls, and a junction region. | |
| 359 db-fail | |
| 360 database with records that fail due to no functionality information | |
| 361 (did not pass IMGT), no V call, no J call, or no junction region. | |
| 362 | |
| 363 universal output fields: | |
| 364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, | |
| 365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, | |
| 366 V_CALL, D_CALL, J_CALL, | |
| 367 V_SEQ_START, V_SEQ_LENGTH, | |
| 368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, | |
| 369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | |
| 370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, | |
| 371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | |
| 372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | |
| 373 | |
| 374 imgt specific output fields: | |
| 375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, | |
| 376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, | |
| 377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, | |
| 378 | |
| 379 igblast specific output fields: | |
| 380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
| 381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, | |
| 382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP. | |
| 383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA | |
| 384 | |
| 385 ihmm specific output fields: | |
| 386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
| 387 HMM_SCORE | |
| 388 ''') | |
| 389 | |
| 390 # Define ArgumentParser | |
| 391 parser = ArgumentParser(description=__doc__, epilog=fields, | |
| 392 formatter_class=CommonHelpFormatter) | |
| 393 parser.add_argument('--version', action='version', | |
| 394 version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) | |
| 395 subparsers = parser.add_subparsers(title='subcommands', dest='command', | |
| 396 help='Aligner used', metavar='') | |
| 397 # TODO: This is a temporary fix for Python issue 9253 | |
| 398 subparsers.required = True | |
| 399 | |
| 400 # Parent parser | |
| 401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | |
| 402 | |
| 403 # IgBlast Aligner | |
| 404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], | |
| 405 formatter_class=CommonHelpFormatter, | |
| 406 help='Process IgBLAST output.', | |
| 407 description='Process IgBLAST output.') | |
| 408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
| 409 required=True, | |
| 410 help='''IgBLAST output files in format 7 with query sequence | |
| 411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | |
| 412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
| 413 help='''List of folders and/or fasta files containing | |
| 414 IMGT-gapped germline sequences corresponding to the | |
| 415 set of germlines used in the IgBLAST alignment.''') | |
| 416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
| 417 required=True, | |
| 418 help='''List of input FASTA files (with .fasta, .fna or .fa | |
| 419 extension), containing sequences.''') | |
| 420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | |
| 421 help='''Specify to prevent input sequence headers from being parsed | |
| 422 to add new columns to database. Parsing of sequence headers requires | |
| 423 headers to be in the pRESTO annotation format, so this should be specified | |
| 424 when sequence headers are incompatible with the pRESTO annotation scheme. | |
| 425 Note, unrecognized header formats will default to this behavior.''') | |
| 426 parser_igblast.add_argument('--partial', action='store_true', dest='partial', | |
| 427 help='''If specified, include incomplete V(D)J alignments in | |
| 428 the pass file instead of the fail file.''') | |
| 429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', | |
| 430 help='''Specify if alignment score metrics should be | |
| 431 included in the output. Adds the V_SCORE, V_IDENTITY, | |
| 432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | |
| 433 J_BTOP, and J_EVALUE columns.''') | |
| 434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', | |
| 435 help='''Specify if IMGT FWR and CDRs should be | |
| 436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
| 437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
| 438 CDR3_IMGT columns.''') | |
| 439 parser_igblast.add_argument('--cdr3', action='store_true', | |
| 440 dest='parse_igblast_cdr3', | |
| 441 help='''Specify if the CDR3 sequences generated by IgBLAST | |
| 442 should be included in the output. Adds the columns | |
| 443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST | |
| 444 version 1.5 or greater.''') | |
| 445 parser_igblast.set_defaults(func=parseIgBLAST) | |
| 446 | |
| 447 # IMGT aligner | |
| 448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], | |
| 449 formatter_class=CommonHelpFormatter, | |
| 450 help='''Process IMGT/HighV-Quest output | |
| 451 (does not work with V-QUEST).''', | |
| 452 description='''Process IMGT/HighV-Quest output | |
| 453 (does not work with V-QUEST).''') | |
| 454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
| 455 help='''Either zipped IMGT output files (.zip or .txz) or a | |
| 456 folder containing unzipped IMGT output files (which must | |
| 457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | |
| 458 and 6_Junction).''') | |
| 459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | |
| 460 required=False, | |
| 461 help='''List of input FASTA files (with .fasta, .fna or .fa | |
| 462 extension) containing sequences.''') | |
| 463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | |
| 464 help='''Specify to prevent input sequence headers from being parsed | |
| 465 to add new columns to database. Parsing of sequence headers requires | |
| 466 headers to be in the pRESTO annotation format, so this should be specified | |
| 467 when sequence headers are incompatible with the pRESTO annotation scheme. | |
| 468 Note, unrecognized header formats will default to this behavior.''') | |
| 469 parser_imgt.add_argument('--partial', action='store_true', dest='partial', | |
| 470 help='''If specified, include incomplete V(D)J alignments in | |
| 471 the pass file instead of the fail file.''') | |
| 472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', | |
| 473 help='''Specify if alignment score metrics should be | |
| 474 included in the output. Adds the V_SCORE, V_IDENTITY, | |
| 475 J_SCORE and J_IDENTITY.''') | |
| 476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', | |
| 477 help='''Specify if IMGT FWRs and CDRs should be | |
| 478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
| 479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
| 480 CDR3_IMGT columns.''') | |
| 481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', | |
| 482 help='''Specify if detailed junction fields should be | |
| 483 included in the output. Adds the columns | |
| 484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, | |
| 485 P5J_LENGTH, D_FRAME.''') | |
| 486 parser_imgt.set_defaults(func=parseIMGT) | |
| 487 | |
| 488 # iHMMuneAlign Aligner | |
| 489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], | |
| 490 formatter_class=CommonHelpFormatter, | |
| 491 help='Process iHMMune-Align output.', | |
| 492 description='Process iHMMune-Align output.') | |
| 493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
| 494 required=True, | |
| 495 help='''iHMMune-Align output file.''') | |
| 496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
| 497 help='''List of folders and/or FASTA files containing | |
| 498 IMGT-gapped germline sequences corresponding to the | |
| 499 set of germlines used in the IgBLAST alignment.''') | |
| 500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
| 501 required=True, | |
| 502 help='''List of input FASTA files (with .fasta, .fna or .fa | |
| 503 extension) containing sequences.''') | |
| 504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', | |
| 505 help='''Specify to prevent input sequence headers from being parsed | |
| 506 to add new columns to database. Parsing of sequence headers requires | |
| 507 headers to be in the pRESTO annotation format, so this should be specified | |
| 508 when sequence headers are incompatible with the pRESTO annotation scheme. | |
| 509 Note, unrecognized header formats will default to this behavior.''') | |
| 510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial', | |
| 511 help='''If specified, include incomplete V(D)J alignments in | |
| 512 the pass file instead of the fail file.''') | |
| 513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', | |
| 514 help='''Specify if alignment score metrics should be | |
| 515 included in the output. Adds the path score of the | |
| 516 iHMMune-Align hidden Markov model to HMM_SCORE.''') | |
| 517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', | |
| 518 help='''Specify if IMGT FWRs and CDRs should be | |
| 519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
| 520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
| 521 CDR3_IMGT columns.''') | |
| 522 parser_ihmm.set_defaults(func=parseIHMM) | |
| 523 | |
| 524 return parser | |
| 525 | |
| 526 | |
| 527 if __name__ == "__main__": | |
| 528 """ | |
| 529 Parses command line arguments and calls main | |
| 530 """ | |
| 531 parser = getArgParser() | |
| 532 checkArgs(parser) | |
| 533 args = parser.parse_args() | |
| 534 args_dict = parseCommonArgs(args, in_arg='aligner_outputs') | |
| 535 | |
| 536 # Set no ID parsing if sequence files are not provided | |
| 537 if 'seq_files' in args_dict and not args_dict['seq_files']: | |
| 538 args_dict['no_parse'] = True | |
| 539 | |
| 540 # Delete | |
| 541 if 'seq_files' in args_dict: del args_dict['seq_files'] | |
| 542 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] | |
| 543 if 'command' in args_dict: del args_dict['command'] | |
| 544 if 'func' in args_dict: del args_dict['func'] | |
| 545 | |
| 546 if args.command == 'imgt': | |
| 547 for i in range(len(args.__dict__['aligner_outputs'])): | |
| 548 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] | |
| 549 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | |
| 550 if args.__dict__['seq_files'] else None | |
| 551 args.func(**args_dict) | |
| 552 elif args.command == 'igblast' or args.command == 'ihmm': | |
| 553 for i in range(len(args.__dict__['aligner_outputs'])): | |
| 554 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] | |
| 555 args_dict['seq_file'] = args.__dict__['seq_files'][i] | |
| 556 args.func(**args_dict) |
