| 0 | 1 #!/usr/bin/env python3 | 
|  | 2 """ | 
|  | 3 Create tab-delimited database file to store sequence alignment information | 
|  | 4 """ | 
|  | 5 # Info | 
|  | 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | 
|  | 7 from changeo import __version__, __date__ | 
|  | 8 | 
|  | 9 # Imports | 
|  | 10 import os | 
|  | 11 import sys | 
|  | 12 from argparse import ArgumentParser | 
|  | 13 from collections import OrderedDict | 
|  | 14 from textwrap import dedent | 
|  | 15 from time import time | 
|  | 16 from Bio import SeqIO | 
|  | 17 | 
|  | 18 # Presto and changeo imports | 
|  | 19 from presto.Defaults import default_out_args | 
|  | 20 from presto.Annotation import parseAnnotation | 
| 52 | 21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile | 
| 0 | 22 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs | 
| 52 | 23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo | 
|  | 24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT | 
| 0 | 25 | 
|  | 26 | 
| 52 | 27 def getFilePrefix(aligner_output, out_args): | 
| 0 | 28     """ | 
| 52 | 29     Get file name prefix and create output directory | 
| 0 | 30 | 
|  | 31     Arguments: | 
| 52 | 32       aligner_output : aligner output file or directory. | 
|  | 33       out_args : dictionary of output arguments. | 
| 0 | 34 | 
|  | 35     Returns: | 
| 52 | 36         str : file name prefix. | 
| 0 | 37     """ | 
| 52 | 38     # Determine output directory | 
|  | 39     if not out_args['out_dir']: | 
|  | 40         out_dir = os.path.dirname(os.path.abspath(aligner_output)) | 
|  | 41     else: | 
|  | 42         out_dir = os.path.abspath(out_args['out_dir']) | 
|  | 43         if not os.path.exists(out_dir): | 
|  | 44             os.mkdir(out_dir) | 
| 0 | 45 | 
| 52 | 46     # Determine file prefix | 
|  | 47     if out_args['out_name']: | 
|  | 48         file_prefix = out_args['out_name'] | 
|  | 49     else: | 
|  | 50         file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] | 
| 0 | 51 | 
| 52 | 52     return os.path.join(out_dir, file_prefix) | 
| 0 | 53 | 
|  | 54 | 
| 52 | 55 def getSeqDict(seq_file): | 
| 0 | 56     """ | 
| 52 | 57     Create a dictionary from a sequence file. | 
| 0 | 58 | 
|  | 59     Arguments: | 
| 52 | 60       seq_file : sequence file. | 
| 0 | 61 | 
|  | 62     Returns: | 
| 52 | 63         dict : sequence description as keys with Bio.SeqRecords as values. | 
| 0 | 64     """ | 
| 52 | 65     seq_dict = SeqIO.to_dict(readSeqFile(seq_file), | 
|  | 66                              key_function=lambda x: x.description) | 
| 0 | 67 | 
| 52 | 68     return seq_dict | 
| 0 | 69 | 
|  | 70 | 
| 52 | 71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, | 
|  | 72             out_args=default_out_args): | 
| 0 | 73     """ | 
| 52 | 74     Writes tab-delimited database file in output directory. | 
| 0 | 75 | 
|  | 76     Arguments: | 
| 52 | 77       db : a iterator of IgRecord objects containing alignment data. | 
|  | 78       fields : a list of ordered field names to write. | 
|  | 79       file_prefix : directory and prefix for CLIP tab-delim file. | 
|  | 80       total_count : number of records (for progress bar). | 
|  | 81       id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. | 
|  | 82       no_parse : if ID is to be parsed for pRESTO output with default delimiters. | 
|  | 83       partial : if True put incomplete alignments in the pass file. | 
|  | 84       out_args : common output argument dictionary from parseCommonArgs. | 
| 0 | 85 | 
|  | 86     Returns: | 
| 52 | 87       None | 
| 0 | 88     """ | 
| 52 | 89     # Function to check for valid records strictly | 
|  | 90     def _pass_strict(rec): | 
|  | 91         valid = [rec.v_call and rec.v_call != 'None', | 
|  | 92                  rec.j_call and rec.j_call != 'None', | 
|  | 93                  rec.functional is not None, | 
|  | 94                  rec.seq_vdj, | 
|  | 95                  rec.junction] | 
|  | 96         return all(valid) | 
| 0 | 97 | 
| 52 | 98     # Function to check for valid records loosely | 
|  | 99     def _pass_gentle(rec): | 
|  | 100         valid = [rec.v_call and rec.v_call != 'None', | 
|  | 101                  rec.d_call and rec.d_call != 'None', | 
|  | 102                  rec.j_call and rec.j_call != 'None'] | 
|  | 103         return any(valid) | 
| 0 | 104 | 
| 52 | 105     # Set pass criteria | 
|  | 106     _pass = _pass_gentle if partial else _pass_strict | 
| 0 | 107 | 
| 52 | 108     # Define output file names | 
|  | 109     pass_file = '%s_db-pass.tab' % file_prefix | 
|  | 110     fail_file = '%s_db-fail.tab' % file_prefix | 
| 0 | 111 | 
| 52 | 112     # Initiate handles, writers and counters | 
|  | 113     pass_handle = None | 
|  | 114     fail_handle = None | 
| 0 | 115     pass_writer = None | 
| 52 | 116     fail_writer = None | 
| 0 | 117     start_time = time() | 
|  | 118     rec_count = pass_count = fail_count = 0 | 
|  | 119 | 
| 52 | 120     # Validate and write output | 
|  | 121     printProgress(0, total_count, 0.05, start_time) | 
|  | 122     for i, record in enumerate(db, start=1): | 
|  | 123 | 
|  | 124         # Replace sequence description with full string, if required | 
|  | 125         if id_dict is not None and record.id in id_dict: | 
| 0 | 126             record.id = id_dict[record.id] | 
|  | 127 | 
|  | 128         # Parse sequence description into new columns | 
|  | 129         if not no_parse: | 
| 52 | 130             try: | 
|  | 131                 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | 
|  | 132                 record.id = record.annotations['ID'] | 
|  | 133                 del record.annotations['ID'] | 
|  | 134 | 
|  | 135                 # TODO:  This is not the best approach. should pass in output fields. | 
|  | 136                 # If first record, use parsed description to define extra columns | 
|  | 137                 if i == 1:  fields.extend(list(record.annotations.keys())) | 
|  | 138             except IndexError: | 
|  | 139                 # Could not parse pRESTO-style annotations so fall back to no parse | 
|  | 140                 no_parse = True | 
|  | 141                 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') | 
| 0 | 142 | 
| 52 | 143         # Count pass or fail and write to appropriate file | 
|  | 144         if _pass(record): | 
|  | 145             # Open pass file | 
|  | 146             if pass_writer is None: | 
|  | 147                 pass_handle = open(pass_file, 'wt') | 
|  | 148                 pass_writer = getDbWriter(pass_handle, add_fields=fields) | 
| 0 | 149 | 
| 52 | 150             # Write row to pass file | 
|  | 151             pass_count += 1 | 
|  | 152             pass_writer.writerow(record.toDict()) | 
|  | 153         else: | 
|  | 154             # Open failed file | 
|  | 155             if out_args['failed'] and fail_writer is None: | 
|  | 156                 fail_handle = open(fail_file, 'wt') | 
|  | 157                 fail_writer = getDbWriter(fail_handle, add_fields=fields) | 
| 0 | 158 | 
| 52 | 159             # Write row to fail file if specified | 
|  | 160             fail_count += 1 | 
|  | 161             if fail_writer is not None: | 
|  | 162                 fail_writer.writerow(record.toDict()) | 
|  | 163 | 
|  | 164         # Print progress | 
|  | 165         printProgress(i, total_count, 0.05, start_time) | 
|  | 166 | 
|  | 167     # Print consol log | 
| 0 | 168     log = OrderedDict() | 
|  | 169     log['OUTPUT'] = pass_file | 
|  | 170     log['PASS'] = pass_count | 
|  | 171     log['FAIL'] = fail_count | 
|  | 172     log['END'] = 'MakeDb' | 
|  | 173     printLog(log) | 
|  | 174 | 
|  | 175     if pass_handle is not None: pass_handle.close() | 
|  | 176     if fail_handle is not None: fail_handle.close() | 
|  | 177 | 
|  | 178 | 
| 52 | 179 # TODO:  may be able to merge with other mains | 
|  | 180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, | 
|  | 181               parse_scores=False, parse_regions=False, parse_junction=False, | 
|  | 182               out_args=default_out_args): | 
| 0 | 183     """ | 
| 52 | 184     Main for IMGT aligned sample sequences. | 
| 0 | 185 | 
|  | 186     Arguments: | 
| 52 | 187       aligner_output : zipped file or unzipped folder output by IMGT. | 
|  | 188       seq_file : FASTA file input to IMGT (from which to get seqID). | 
|  | 189       no_parse : if ID is to be parsed for pRESTO output with default delimiters. | 
|  | 190       partial : If True put incomplete alignments in the pass file. | 
|  | 191       parse_scores : if True add alignment score fields to output file. | 
|  | 192       parse_regions : if True add FWR and CDR region fields to output file. | 
|  | 193       out_args : common output argument dictionary from parseCommonArgs. | 
| 0 | 194 | 
|  | 195     Returns: | 
| 52 | 196       None | 
|  | 197     """ | 
|  | 198     # Print parameter info | 
|  | 199     log = OrderedDict() | 
|  | 200     log['START'] = 'MakeDb' | 
|  | 201     log['ALIGNER'] = 'IMGT' | 
|  | 202     log['ALIGNER_OUTPUT'] = aligner_output | 
|  | 203     log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | 
|  | 204     log['NO_PARSE'] = no_parse | 
|  | 205     log['PARTIAL'] = partial | 
|  | 206     log['SCORES'] = parse_scores | 
|  | 207     log['REGIONS'] = parse_regions | 
|  | 208     log['JUNCTION'] = parse_junction | 
|  | 209     printLog(log) | 
|  | 210 | 
|  | 211     start_time = time() | 
|  | 212     printMessage('Loading sequence files', start_time=start_time, width=25) | 
|  | 213     # Extract IMGT files | 
|  | 214     temp_dir, imgt_files = extractIMGT(aligner_output) | 
|  | 215     # Count records in IMGT files | 
|  | 216     total_count = countDbFile(imgt_files['summary']) | 
|  | 217     # Get (parsed) IDs from fasta file submitted to IMGT | 
|  | 218     id_dict = getIDforIMGT(seq_file) if seq_file else {} | 
|  | 219     printMessage('Done', start_time=start_time, end=True, width=25) | 
|  | 220 | 
|  | 221     # Parse IMGT output and write db | 
|  | 222     with open(imgt_files['summary'], 'r') as summary_handle, \ | 
|  | 223             open(imgt_files['gapped'], 'r') as gapped_handle, \ | 
|  | 224             open(imgt_files['ntseq'], 'r') as ntseq_handle, \ | 
|  | 225             open(imgt_files['junction'], 'r') as junction_handle: | 
|  | 226         parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, | 
|  | 227                                 parse_scores=parse_scores, parse_regions=parse_regions, | 
|  | 228                                 parse_junction=parse_junction) | 
|  | 229         file_prefix = getFilePrefix(aligner_output, out_args) | 
|  | 230         writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, | 
|  | 231                 no_parse=no_parse, partial=partial, out_args=out_args) | 
|  | 232 | 
|  | 233     # Cleanup temp directory | 
|  | 234     temp_dir.cleanup() | 
|  | 235 | 
|  | 236     return None | 
|  | 237 | 
|  | 238 | 
|  | 239 # TODO:  may be able to merge with other mains | 
|  | 240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, | 
|  | 241                  parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, | 
|  | 242                  out_args=default_out_args): | 
|  | 243     """ | 
|  | 244     Main for IgBLAST aligned sample sequences. | 
|  | 245 | 
|  | 246     Arguments: | 
|  | 247       aligner_output : IgBLAST output file to process. | 
|  | 248       seq_file : fasta file input to IgBlast (from which to get sequence). | 
|  | 249       repo : folder with germline repertoire files. | 
|  | 250       no_parse : if ID is to be parsed for pRESTO output with default delimiters. | 
|  | 251       partial : If True put incomplete alignments in the pass file. | 
|  | 252       parse_regions : if True add FWR and CDR fields to output file. | 
|  | 253       parse_scores : if True add alignment score fields to output file. | 
|  | 254       parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST | 
|  | 255       out_args : common output argument dictionary from parseCommonArgs. | 
|  | 256 | 
|  | 257     Returns: | 
|  | 258       None | 
| 0 | 259     """ | 
|  | 260     # Print parameter info | 
|  | 261     log = OrderedDict() | 
|  | 262     log['START'] = 'MakeDB' | 
|  | 263     log['ALIGNER'] = 'IgBlast' | 
| 52 | 264     log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) | 
| 0 | 265     log['SEQ_FILE'] = os.path.basename(seq_file) | 
|  | 266     log['NO_PARSE'] = no_parse | 
| 52 | 267     log['PARTIAL'] = partial | 
|  | 268     log['SCORES'] = parse_scores | 
|  | 269     log['REGIONS'] = parse_regions | 
| 0 | 270     printLog(log) | 
|  | 271 | 
| 52 | 272     start_time = time() | 
|  | 273     printMessage('Loading sequence files', start_time=start_time, width=25) | 
|  | 274     # Count records in sequence file | 
|  | 275     total_count = countSeqFile(seq_file) | 
| 0 | 276     # Get input sequence dictionary | 
| 52 | 277     seq_dict = getSeqDict(seq_file) | 
|  | 278     # Create germline repo dictionary | 
|  | 279     repo_dict = readRepo(repo) | 
|  | 280     printMessage('Done', start_time=start_time, end=True, width=25) | 
| 0 | 281 | 
| 52 | 282     # Parse and write output | 
|  | 283     with open(aligner_output, 'r') as f: | 
|  | 284         parse_iter = IgBLASTReader(f, seq_dict, repo_dict, | 
|  | 285                                    parse_scores=parse_scores, parse_regions=parse_regions, | 
|  | 286                                    parse_igblast_cdr3=parse_igblast_cdr3) | 
|  | 287         file_prefix = getFilePrefix(aligner_output, out_args) | 
|  | 288         writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, | 
|  | 289                 no_parse=no_parse, partial=partial, out_args=out_args) | 
| 0 | 290 | 
| 52 | 291     return None | 
| 0 | 292 | 
|  | 293 | 
| 52 | 294 # TODO:  may be able to merge with other mains | 
|  | 295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, | 
|  | 296               parse_scores=False, parse_regions=False, out_args=default_out_args): | 
| 0 | 297     """ | 
| 52 | 298     Main for iHMMuneAlign aligned sample sequences. | 
| 0 | 299 | 
|  | 300     Arguments: | 
| 52 | 301       aligner_output : iHMMune-Align output file to process. | 
|  | 302       seq_file : fasta file input to iHMMuneAlign (from which to get sequence). | 
|  | 303       repo : folder with germline repertoire files. | 
|  | 304       no_parse : if ID is to be parsed for pRESTO output with default delimiters. | 
|  | 305       partial : If True put incomplete alignments in the pass file. | 
|  | 306       parse_scores : if True parse alignment scores. | 
|  | 307       parse_regions : if True add FWR and CDR region fields. | 
|  | 308       out_args : common output argument dictionary from parseCommonArgs. | 
|  | 309 | 
|  | 310     Returns: | 
|  | 311       None | 
| 0 | 312     """ | 
|  | 313     # Print parameter info | 
|  | 314     log = OrderedDict() | 
| 52 | 315     log['START'] = 'MakeDB' | 
|  | 316     log['ALIGNER'] = 'iHMMune-Align' | 
|  | 317     log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) | 
|  | 318     log['SEQ_FILE'] = os.path.basename(seq_file) | 
| 0 | 319     log['NO_PARSE'] = no_parse | 
| 52 | 320     log['PARTIAL'] = partial | 
|  | 321     log['SCORES'] = parse_scores | 
|  | 322     log['REGIONS'] = parse_regions | 
| 0 | 323     printLog(log) | 
|  | 324 | 
| 52 | 325     start_time = time() | 
|  | 326     printMessage('Loading sequence files', start_time=start_time, width=25) | 
|  | 327     # Count records in sequence file | 
|  | 328     total_count = countSeqFile(seq_file) | 
|  | 329     # Get input sequence dictionary | 
|  | 330     seq_dict = getSeqDict(seq_file) | 
|  | 331     # Create germline repo dictionary | 
|  | 332     repo_dict = readRepo(repo) | 
|  | 333     printMessage('Done', start_time=start_time, end=True, width=25) | 
| 0 | 334 | 
| 52 | 335     # Parse and write output | 
|  | 336     with open(aligner_output, 'r') as f: | 
|  | 337         parse_iter = IHMMuneReader(f, seq_dict, repo_dict, | 
|  | 338                                    parse_scores=parse_scores, parse_regions=parse_regions) | 
|  | 339         file_prefix = getFilePrefix(aligner_output, out_args) | 
|  | 340         writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, | 
|  | 341                 no_parse=no_parse, partial=partial, out_args=out_args) | 
|  | 342 | 
|  | 343     return None | 
| 0 | 344 | 
|  | 345 | 
|  | 346 def getArgParser(): | 
|  | 347     """ | 
| 52 | 348     Defines the ArgumentParser. | 
| 0 | 349 | 
|  | 350     Returns: | 
| 52 | 351       argparse.ArgumentParser | 
| 0 | 352     """ | 
|  | 353     fields = dedent( | 
|  | 354              ''' | 
|  | 355               output files: | 
|  | 356                   db-pass | 
| 52 | 357                       database of alignment records with functionality information, | 
|  | 358                       V and J calls, and a junction region. | 
| 0 | 359                   db-fail | 
| 52 | 360                       database with records that fail due to no functionality information | 
|  | 361                       (did not pass IMGT), no V call, no J call, or no junction region. | 
| 0 | 362 | 
| 52 | 363               universal output fields: | 
|  | 364                   SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, | 
|  | 365                   FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, | 
|  | 366                   V_CALL, D_CALL, J_CALL, | 
|  | 367                   V_SEQ_START, V_SEQ_LENGTH, | 
|  | 368                   D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, | 
| 0 | 369                   J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | 
| 52 | 370                   JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, | 
|  | 371                   FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | 
|  | 372                   CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | 
|  | 373 | 
|  | 374               imgt specific output fields: | 
|  | 375                   V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, | 
|  | 376                   N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, | 
|  | 377                   D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, | 
|  | 378 | 
|  | 379               igblast specific output fields: | 
|  | 380                   V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | 
|  | 381                   V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, | 
|  | 382                   J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP. | 
|  | 383                   CDR3_IGBLAST_NT, CDR3_IGBLAST_AA | 
|  | 384 | 
|  | 385               ihmm specific output fields: | 
|  | 386                   V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | 
|  | 387                   HMM_SCORE | 
| 0 | 388               ''') | 
|  | 389 | 
|  | 390     # Define ArgumentParser | 
|  | 391     parser = ArgumentParser(description=__doc__, epilog=fields, | 
|  | 392                             formatter_class=CommonHelpFormatter) | 
|  | 393     parser.add_argument('--version', action='version', | 
|  | 394                         version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) | 
|  | 395     subparsers = parser.add_subparsers(title='subcommands', dest='command', | 
|  | 396                                        help='Aligner used', metavar='') | 
|  | 397     # TODO:  This is a temporary fix for Python issue 9253 | 
|  | 398     subparsers.required = True | 
|  | 399 | 
|  | 400     # Parent parser | 
|  | 401     parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | 
|  | 402 | 
|  | 403     # IgBlast Aligner | 
| 52 | 404     parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], | 
|  | 405                                            formatter_class=CommonHelpFormatter, | 
|  | 406                                            help='Process IgBLAST output.', | 
|  | 407                                            description='Process IgBLAST output.') | 
|  | 408     parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | 
| 0 | 409                                 required=True, | 
|  | 410                                 help='''IgBLAST output files in format 7 with query sequence | 
|  | 411                                      (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | 
|  | 412     parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | 
|  | 413                                 help='''List of folders and/or fasta files containing | 
|  | 414                                      IMGT-gapped germline sequences corresponding to the | 
|  | 415                                      set of germlines used in the IgBLAST alignment.''') | 
|  | 416     parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | 
|  | 417                                 required=True, | 
| 52 | 418                                 help='''List of input FASTA files (with .fasta, .fna or .fa | 
|  | 419                                      extension), containing sequences.''') | 
| 0 | 420     parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | 
| 52 | 421                                 help='''Specify to prevent input sequence headers from being parsed | 
|  | 422                                     to add new columns to database. Parsing of sequence headers requires | 
|  | 423                                     headers to be in the pRESTO annotation format, so this should be specified | 
|  | 424                                     when sequence headers are incompatible with the pRESTO annotation scheme. | 
|  | 425                                     Note, unrecognized header formats will default to this behavior.''') | 
|  | 426     parser_igblast.add_argument('--partial', action='store_true', dest='partial', | 
|  | 427                                 help='''If specified, include incomplete V(D)J alignments in | 
|  | 428                                      the pass file instead of the fail file.''') | 
|  | 429     parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', | 
| 0 | 430                                 help='''Specify if alignment score metrics should be | 
|  | 431                                      included in the output. Adds the V_SCORE, V_IDENTITY, | 
|  | 432                                      V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | 
|  | 433                                      J_BTOP, and J_EVALUE columns.''') | 
| 52 | 434     parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', | 
|  | 435                                 help='''Specify if IMGT FWR and CDRs should be | 
| 0 | 436                                      included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 
|  | 437                                      FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 
|  | 438                                      CDR3_IMGT columns.''') | 
| 52 | 439     parser_igblast.add_argument('--cdr3', action='store_true', | 
|  | 440                                 dest='parse_igblast_cdr3', | 
|  | 441                                 help='''Specify if the CDR3 sequences generated by IgBLAST | 
|  | 442                                      should be included in the output. Adds the columns | 
|  | 443                                      CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST | 
|  | 444                                      version 1.5 or greater.''') | 
|  | 445     parser_igblast.set_defaults(func=parseIgBLAST) | 
|  | 446 | 
| 0 | 447     # IMGT aligner | 
| 52 | 448     parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], | 
|  | 449                                         formatter_class=CommonHelpFormatter, | 
|  | 450                                         help='''Process IMGT/HighV-Quest output | 
|  | 451                                              (does not work with V-QUEST).''', | 
|  | 452                                         description='''Process IMGT/HighV-Quest output | 
|  | 453                                              (does not work with V-QUEST).''') | 
|  | 454     parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | 
|  | 455                              help='''Either zipped IMGT output files (.zip or .txz) or a | 
|  | 456                                   folder containing unzipped IMGT output files (which must | 
|  | 457                                   include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | 
|  | 458                                   and 6_Junction).''') | 
| 0 | 459     parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | 
|  | 460                              required=False, | 
| 52 | 461                              help='''List of input FASTA files (with .fasta, .fna or .fa | 
|  | 462                                   extension) containing sequences.''') | 
| 0 | 463     parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | 
| 52 | 464                              help='''Specify to prevent input sequence headers from being parsed | 
|  | 465                                   to add new columns to database. Parsing of sequence headers requires | 
|  | 466                                   headers to be in the pRESTO annotation format, so this should be specified | 
|  | 467                                   when sequence headers are incompatible with the pRESTO annotation scheme. | 
|  | 468                                   Note, unrecognized header formats will default to this behavior.''') | 
|  | 469     parser_imgt.add_argument('--partial', action='store_true', dest='partial', | 
|  | 470                              help='''If specified, include incomplete V(D)J alignments in | 
|  | 471                                   the pass file instead of the fail file.''') | 
|  | 472     parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', | 
| 0 | 473                              help='''Specify if alignment score metrics should be | 
|  | 474                                   included in the output. Adds the V_SCORE, V_IDENTITY, | 
| 52 | 475                                   J_SCORE and J_IDENTITY.''') | 
|  | 476     parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', | 
|  | 477                              help='''Specify if IMGT FWRs and CDRs should be | 
| 0 | 478                                   included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 
|  | 479                                   FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 
|  | 480                                   CDR3_IMGT columns.''') | 
| 52 | 481     parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', | 
|  | 482                              help='''Specify if detailed junction fields should be | 
|  | 483                                   included in the output. Adds the columns | 
|  | 484                                   N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, | 
|  | 485                                   P5J_LENGTH, D_FRAME.''') | 
| 0 | 486     parser_imgt.set_defaults(func=parseIMGT) | 
|  | 487 | 
| 52 | 488     # iHMMuneAlign Aligner | 
|  | 489     parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], | 
|  | 490                                         formatter_class=CommonHelpFormatter, | 
|  | 491                                         help='Process iHMMune-Align output.', | 
|  | 492                                         description='Process iHMMune-Align output.') | 
|  | 493     parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | 
|  | 494                              required=True, | 
|  | 495                              help='''iHMMune-Align output file.''') | 
|  | 496     parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | 
|  | 497                              help='''List of folders and/or FASTA files containing | 
|  | 498                                   IMGT-gapped germline sequences corresponding to the | 
|  | 499                                   set of germlines used in the IgBLAST alignment.''') | 
|  | 500     parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', | 
|  | 501                              required=True, | 
|  | 502                              help='''List of input FASTA files (with .fasta, .fna or .fa | 
|  | 503                                   extension) containing sequences.''') | 
|  | 504     parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', | 
|  | 505                              help='''Specify to prevent input sequence headers from being parsed | 
|  | 506                                   to add new columns to database. Parsing of sequence headers requires | 
|  | 507                                   headers to be in the pRESTO annotation format, so this should be specified | 
|  | 508                                   when sequence headers are incompatible with the pRESTO annotation scheme. | 
|  | 509                                   Note, unrecognized header formats will default to this behavior.''') | 
|  | 510     parser_ihmm.add_argument('--partial', action='store_true', dest='partial', | 
|  | 511                              help='''If specified, include incomplete V(D)J alignments in | 
|  | 512                                   the pass file instead of the fail file.''') | 
|  | 513     parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', | 
|  | 514                              help='''Specify if alignment score metrics should be | 
|  | 515                                   included in the output. Adds the path score of the | 
|  | 516                                   iHMMune-Align hidden Markov model to HMM_SCORE.''') | 
|  | 517     parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', | 
|  | 518                              help='''Specify if IMGT FWRs and CDRs should be | 
|  | 519                                   included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 
|  | 520                                   FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 
|  | 521                                   CDR3_IMGT columns.''') | 
|  | 522     parser_ihmm.set_defaults(func=parseIHMM) | 
|  | 523 | 
| 0 | 524     return parser | 
|  | 525 | 
|  | 526 | 
|  | 527 if __name__ == "__main__": | 
|  | 528     """ | 
|  | 529     Parses command line arguments and calls main | 
|  | 530     """ | 
|  | 531     parser = getArgParser() | 
|  | 532     args = parser.parse_args() | 
| 52 | 533     args_dict = parseCommonArgs(args, in_arg='aligner_outputs') | 
| 0 | 534 | 
|  | 535     # Set no ID parsing if sequence files are not provided | 
|  | 536     if 'seq_files' in args_dict and not args_dict['seq_files']: | 
|  | 537         args_dict['no_parse'] = True | 
|  | 538 | 
|  | 539     # Delete | 
|  | 540     if 'seq_files' in args_dict: del args_dict['seq_files'] | 
| 52 | 541     if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] | 
| 0 | 542     if 'command' in args_dict: del args_dict['command'] | 
|  | 543     if 'func' in args_dict: del args_dict['func'] | 
|  | 544 | 
|  | 545     if args.command == 'imgt': | 
| 52 | 546         for i in range(len(args.__dict__['aligner_outputs'])): | 
|  | 547             args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] | 
| 0 | 548             args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | 
|  | 549                                     if args.__dict__['seq_files'] else None | 
|  | 550             args.func(**args_dict) | 
| 52 | 551     elif args.command == 'igblast' or args.command == 'ihmm': | 
|  | 552         for i in range(len(args.__dict__['aligner_outputs'])): | 
|  | 553             args_dict['aligner_output'] =  args.__dict__['aligner_outputs'][i] | 
| 0 | 554             args_dict['seq_file'] = args.__dict__['seq_files'][i] | 
|  | 555             args.func(**args_dict) |