Mercurial > repos > davidvanzessen > shm_csr

diff change_o/MakeDb.py @ 52:22dddabe3637 draft
Uploaded
author: davidvanzessen
date: Tue, 23 May 2017 08:32:58 -0400
parents: c33d93683a09
--- a/change_o/MakeDb.py	Tue May 16 08:52:48 2017 -0400
+++ b/change_o/MakeDb.py	Tue May 23 08:32:58 2017 -0400
@@ -7,765 +7,164 @@
 from changeo import __version__, __date__
 
 # Imports
-import csv
 import os
-import re
 import sys
-import pandas as pd
-import tarfile
-import zipfile
 from argparse import ArgumentParser
 from collections import OrderedDict
-from itertools import groupby
-from shutil import rmtree
-from tempfile import mkdtemp
 from textwrap import dedent
 from time import time
 from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
 
 # Presto and changeo imports
 from presto.Defaults import default_out_args
 from presto.Annotation import parseAnnotation
-from presto.IO import countSeqFile, printLog, printProgress
+from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.IO import getDbWriter, countDbFile, getRepo
-from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
-                             j_allele_regex
-
-# Default parameters
-default_delimiter = ('\t', ',', '-')
-
-
-def gapV(ig_dict, repo_dict):
-    """
-    Insert gaps into V region and update alignment information
-
-    Arguments:
-      ig_dict : Dictionary of parsed IgBlast output
-      repo_dict : Dictionary of IMGT gapped germline sequences
-
-    Returns:
-      dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
-    """
-
-    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
-
-    # Find gapped germline V segment
-    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
-    vkey = (vgene, )
-    #TODO: Figure out else case
-    if vkey in repo_dict:
-        vgap = repo_dict[vkey]
-        # Iterate over gaps in the germline segment
-        gaps = re.finditer(r'\.', vgap)
-        gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
-        for gap in gaps:
-            i = gap.start()
-            # Break if gap begins after V region
-            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
-                break
-            # Insert gap into IMGT sequence
-            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
-            # Update gap counter
-            gapcount += 1
-        ig_dict['SEQUENCE_IMGT'] = seq_imgt
-        # Update IMGT positioning information for V
-        ig_dict['V_GERM_START_IMGT'] = 1
-        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
-
-    return ig_dict
+from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
+from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
 
 
-def getIMGTJunc(ig_dict, repo_dict):
-    """
-    Identify junction region by IMGT definition
-
-    Arguments:
-      ig_dict : Dictionary of parsed IgBlast output
-      repo_dict : Dictionary of IMGT gapped germline sequences
-
-    Returns:
-      dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
-    """
-    # Find germline J segment
-    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
-    jkey = (jgene, )
-    #TODO: Figure out else case
-    if jkey in repo_dict:
-        # Get germline J sequence
-        jgerm = repo_dict[jkey]
-        jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
-        # Look for (F|W)GXG aa motif in nt sequence
-        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
-        aa_end = len(ig_dict['SEQUENCE_IMGT'])
-        #TODO: Figure out else case
-        if motif:
-            # print('\n', motif.group())
-            aa_end = motif.start() - len(jgerm) + 3
-        # Add fields to dict
-        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
-        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
-
-    return ig_dict
-
-
-def getRegions(ig_dict):
+def getFilePrefix(aligner_output, out_args):
     """
-    Identify FWR and CDR regions by IMGT definition
-
-    Arguments:
-      ig_dict : Dictionary of parsed alignment output
-
-    Returns:
-      dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
-             CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
-    """
-    try:
-        seq_len = len(ig_dict['SEQUENCE_IMGT'])
-        ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
-    except (KeyError, IndexError):
-        return ig_dict
-
-    try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
-    except (IndexError): return ig_dict
-
-    try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
-    except (IndexError): return ig_dict
-
-    try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
-    except (IndexError): return ig_dict
-
-    try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
-    except (IndexError): return ig_dict
-
-    try:
-        cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
-        ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
-        ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
-    except (KeyError, IndexError):
-        return ig_dict
-
-    return ig_dict
-
-
-def getSeqforIgBlast(seq_file):
-    """
-    Fetch input sequences for IgBlast queries
+    Get file name prefix and create output directory
 
     Arguments:
-    seq_file = a fasta file of sequences input to IgBlast
+      aligner_output : aligner output file or directory.
+      out_args : dictionary of output arguments.
 
     Returns:
-    a dictionary of {ID:Seq}
+        str : file name prefix.
     """
-
-    seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
-
-    # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
-    seqs = {}
-    for seq in seq_dict.values():
-        seqs.update({seq.description:str(seq.seq)})
-
-    return seqs
-
+    # Determine output directory
+    if not out_args['out_dir']:
+        out_dir = os.path.dirname(os.path.abspath(aligner_output))
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):
+            os.mkdir(out_dir)
 
-def findLine(handle, query):
-    """
-    Finds line with query string in file
+    # Determine file prefix
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
 
-    Arguments:
-    handle = file handle in which to search for line
-    query = query string for which to search in file
-
-    Returns:
-    line from handle in which query string was found
-    """
-    for line in handle:
-        if(re.match(query, line)):
-            return line
+    return os.path.join(out_dir, file_prefix)
 
 
-def extractIMGT(imgt_output):
-    """
-    Extract necessary files from IMGT results, zipped or unzipped
-    
-    Arguments:
-    imgt_output = zipped file or unzipped folder output by IMGT
-    
-    Returns:
-    sorted list of filenames from which information will be read
+def getSeqDict(seq_file):
     """
-    #file_ext = os.path.splitext(imgt_output)[1].lower()
-    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
-    temp_dir = mkdtemp()
-    if zipfile.is_zipfile(imgt_output):
-        # Open zip file
-        imgt_zip = zipfile.ZipFile(imgt_output, 'r')
-        # Extract required files
-        imgt_files = sorted([n for n in imgt_zip.namelist() \
-                             if os.path.basename(n).startswith(imgt_flags)])
-        imgt_zip.extractall(temp_dir, imgt_files)
-        # Define file list
-        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
-    elif os.path.isdir(imgt_output):
-        # Find required files in folder
-        folder_files = []
-        for root, dirs, files in os.walk(imgt_output):
-            folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
-        # Define file list
-        imgt_files = sorted([n for n in folder_files \
-                             if os.path.basename(n).startswith(imgt_flags)])
-    elif tarfile.is_tarfile(imgt_output):
-        # Open zip file
-        imgt_tar = tarfile.open(imgt_output, 'r')
-        # Extract required files
-        imgt_files = sorted([n for n in imgt_tar.getnames() \
-                             if os.path.basename(n).startswith(imgt_flags)])
-        imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
-        # Define file list
-        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
-    else:
-        sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
-    
-    if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
-        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
-    elif len(imgt_files) < len(imgt_flags):
-        sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)
-        
-    return temp_dir, imgt_files
-
-
-# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
-def readOneIgBlastResult(block):
-    """
-    Parse a single IgBLAST query result
+    Create a dictionary from a sequence file.
 
     Arguments:
-    block =  itertools groupby object of single result
-
-    Returns:
-    None if no results, otherwise list of DataFrames for each result block
-    """
-    results = list()
-    i = 0
-    for match, subblock in groupby(block, lambda l: l=='\n'):
-        if not match:
-            # Strip whitespace and comments
-            sub = [s.strip() for s in subblock if not s.startswith('#')]
-
-            # Continue on empty block
-            if not sub:  continue
-            else:  i += 1
-
-            # Split by tabs
-            sub = [s.split('\t') for s in sub]
-
-            # Append list for "V-(D)-J rearrangement summary" (i == 1)
-            # And "V-(D)-J junction details" (i == 2)
-            # Otherwise append DataFrame of subblock
-            if i == 1 or i == 2:
-                results.append(sub[0])
-            else:
-                df = pd.DataFrame(sub)
-                if not df.empty: results.append(df)
-
-    return results if results else None
-
-
-# TODO:  needs more speeds. pandas is probably to blame.
-def readIgBlast(igblast_output, seq_dict, repo_dict,
-                score_fields=False, region_fields=False):
-    """
-    Reads IgBlast output
-
-    Arguments:
-    igblast_output = IgBlast output file (format 7)
-    seq_dict = a dictionary of {ID:Seq} from input fasta file
-    repo_dict = dictionary of IMGT gapped germline sequences
-    score_fields = if True parse alignment scores
-    region_fields = if True add FWR and CDR region fields
+      seq_file : sequence file.
 
     Returns:
-    a generator of dictionaries containing alignment data
+        dict : sequence description as keys with Bio.SeqRecords as values.
     """
-
-    # Open IgBlast output file
-    with open(igblast_output) as f:
-        # Iterate over individual results (separated by # IGBLASTN)
-        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
-            block = list(block)
-            if not k1:
-                # TODO: move query name extraction into block parser readOneIgBlastResult().
-                # Extract sequence ID
-                query_name = ' '.join(block[0].strip().split(' ')[2:])
-                # Initialize db_gen to have ID and input sequence
-                db_gen = {'SEQUENCE_ID':     query_name,
-                          'SEQUENCE_INPUT':  seq_dict[query_name]}
-
-                # Parse further sub-blocks
-                block_list = readOneIgBlastResult(block)
-
-                # TODO: this is indented pretty far.  should be a separate function. or several functions.
-                # If results exist, parse further to obtain full db_gen
-                if block_list is not None:
-                    # Parse quality information
-                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
-                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
-                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
-                    if block_list[0][-1] == '-':
-                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
-                                                           IUPAC.ambiguous_dna).reverse_complement())
-
-                    # Parse V, D, and J calls
-                    call_str = ' '.join(block_list[0])
-                    v_call = parseAllele(call_str, v_allele_regex, action='list')
-                    d_call = parseAllele(call_str, d_allele_regex, action='list')
-                    j_call = parseAllele(call_str, j_allele_regex, action='list')
-                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
-                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
-                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
-
-                    # Parse junction sequence
-                    # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
-                    # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
-
-                    # TODO:  IgBLAST does a stupid and doesn't output block #3 sometimes. why?
-                    # TODO:  maybe we should fail these. they look craptastic.
-                    #pd.set_option('display.width', 500)
-                    #print query_name, len(block_list), hit_idx
-                    #for i, x in enumerate(block_list):
-                    #    print '[%i]' % i
-                    #    print x
-
-                    # Parse segment start and stop positions
-                    hit_df = block_list[-1]
-
-                    # Alignment info block
-                    #  0:  segment
-                    #  1:  query id
-                    #  2:  subject id
-                    #  3:  % identity
-                    #  4:  alignment length
-                    #  5:  mismatches
-                    #  6:  gap opens
-                    #  7:  gaps
-                    #  8:  q. start
-                    #  9:  q. end
-                    # 10:  s. start
-                    # 11:  s. end
-                    # 12:  evalue
-                    # 13:  bit score
-                    # 14:  query seq
-                    # 15:  subject seq
-                    # 16:  btop
-
-                    # If V call exists, parse V alignment information
-                    seq_vdj = ''
-                    if v_call is not None:
-                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
-                        # Germline positions
-                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
-                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
-                        # Query sequence positions
-                        db_gen['V_SEQ_START'] = int(v_align[8])
-                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
-
-                        if int(v_align[6]) == 0:
-                            db_gen['INDELS'] = 'F'
-                        else:
-                            db_gen['INDELS'] = 'T'
-                            # Set functional to none so record gets tossed (junction will be wrong)
-                            # db_gen['FUNCTIONAL'] = None
-
-                        # V alignment scores
-                        if score_fields:
-                            try: db_gen['V_SCORE'] = float(v_align[13])
-                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
-
-                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
-                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
-
-                            try: db_gen['V_EVALUE'] = float(v_align[12])
-                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
-
-                            try: db_gen['V_BTOP'] = v_align[16]
-                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
+    seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
+                             key_function=lambda x: x.description)
 
-                        # Update VDJ sequence, removing insertions
-                        start = 0
-                        for m in re.finditer(r'-', v_align[15]):
-                            ins = m.start()
-                            seq_vdj += v_align[14][start:ins]
-                            start = ins + 1
-                        seq_vdj += v_align[14][start:]
-
-                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
-                    # If D call exists, parse D alignment information
-                    if d_call is not None:
-                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]
-
-                        # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though.
-                        # Determine N-region length and amount of J overlap with V or D alignment
-                        overlap = 0
-                        if v_call is not None:
-                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
-                            if n1_len < 0:
-                                db_gen['N1_LENGTH'] = 0
-                                overlap = abs(n1_len)
-                            else:
-                                db_gen['N1_LENGTH'] = n1_len
-                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
-                                n1_end = int(d_align[8])-1
-                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
-
-                        # Query sequence positions
-                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
-                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
-
-                        # Germline positions
-                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
-                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
-
-                        # Update VDJ sequence, removing insertions
-                        start = overlap
-                        for m in re.finditer(r'-', d_align[15]):
-                            ins = m.start()
-                            seq_vdj += d_align[14][start:ins]
-                            start = ins + 1
-                        seq_vdj += d_align[14][start:]
-
-                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
-                    # If J call exists, parse J alignment information
-                    if j_call is not None:
-                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]
-
-                        # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though.
-                        # Determine N-region length and amount of J overlap with V or D alignment
-                        overlap = 0
-                        if d_call is not None:
-                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
-                            if n2_len < 0:
-                                db_gen['N2_LENGTH'] = 0
-                                overlap = abs(n2_len)
-                            else:
-                                db_gen['N2_LENGTH'] = n2_len
-                                n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
-                                n2_end = int(j_align[8])-1
-                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
-                        elif v_call is not None:
-                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
-                            if n1_len < 0:
-                                db_gen['N1_LENGTH'] = 0
-                                overlap = abs(n1_len)
-                            else:
-                                db_gen['N1_LENGTH'] = n1_len
-                                n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
-                                n1_end = int(j_align[8])-1
-                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
-                        else:
-                            db_gen['N1_LENGTH'] = 0
-
-                        # Query positions
-                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
-                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
-
-                        # Germline positions
-                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
-                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
-
-                        # J alignment scores
-                        if score_fields:
-                            try: db_gen['J_SCORE'] = float(j_align[13])
-                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
-
-                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
-                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
-
-                            try: db_gen['J_EVALUE'] = float(j_align[12])
-                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
-
-                            try: db_gen['J_BTOP'] = j_align[16]
-                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
-
-                        # Update VDJ sequence, removing insertions
-                        start = overlap
-                        for m in re.finditer(r'-', j_align[15]):
-                            ins = m.start()
-                            seq_vdj += j_align[14][start:ins]
-                            start = ins + 1
-                        seq_vdj += j_align[14][start:]
-
-                    db_gen['SEQUENCE_VDJ'] = seq_vdj
-
-                    # Create IMGT-gapped sequence and infer IMGT junction
-                    if v_call is not None:
-                        db_gen = gapV(db_gen, repo_dict)
-                        if j_call is not None:
-                            db_gen = getIMGTJunc(db_gen, repo_dict)
-
-                    # FWR and CDR regions
-                    if region_fields: getRegions(db_gen)
-
-                yield IgRecord(db_gen)
+    return seq_dict
 
 
-# TODO:  should be more readable
-def readIMGT(imgt_files, score_fields=False, region_fields=False):
-    """
-    Reads IMGT/HighV-Quest output
-
-    Arguments: 
-    imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
-    score_fields = if True parse alignment scores
-    region_fields = if True add FWR and CDR region fields
-    
-    Returns: 
-    a generator of dictionaries containing alignment data
+def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
+            out_args=default_out_args):
     """
-    imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
-    # Create a dictionary for each sequence alignment and yield its generator
-    for sm, gp, nt, jn in zip(*imgt_iters):
-        if len(set([sm['Sequence ID'],
-                    gp['Sequence ID'],
-                    nt['Sequence ID'],
-                    jn['Sequence ID']])) != 1:
-            sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
-                     % sm['Sequence ID'])
-
-        db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
-                  'SEQUENCE_INPUT': sm['Sequence']}
-
-        if 'No results' not in sm['Functionality']:
-            db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
-                                                 ('unprod' in sm['Functionality'])]
-            db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
-                                               ('out-of-frame' in sm['JUNCTION frame'])],
-            db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
-                                           ('unprod' in sm['Functionality'])]
-            db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
-                                                         'missing' in sm['V-REGION potential ins/del']))) +
-                                                         ('unprod' in sm['Functionality'])]
-            db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
-                                              sm['V-REGION insertions'],
-                                              sm['V-REGION deletions']))]
-
-            db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
-            db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
-
-            db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
-            db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
-            db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
-
-            v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
-            db_gen['V_SEQ_START'] = nt['V-REGION start']
-            db_gen['V_SEQ_LENGTH'] = v_seq_length
-            db_gen['V_GERM_START_IMGT'] = 1
-            db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
-
-            db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
-                                                       jn['N-REGION-nt nb'],
-                                                       jn['N1-REGION-nt nb'],
-                                                       jn["P5'D-nt nb"]] if i)
-            db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
-                                                         jn["P3'V-nt nb"],
-                                                         jn['N-REGION-nt nb'],
-                                                         jn['N1-REGION-nt nb'],
-                                                         jn["P5'D-nt nb"]] if i)
-            db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
-            db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
-            db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
-            db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
-                                                       jn['N2-REGION-nt nb'],
-                                                       jn["P5'J-nt nb"]] if i)
-
-            db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
-                                                         jn["P3'V-nt nb"],
-                                                         jn['N-REGION-nt nb'],
-                                                         jn['N1-REGION-nt nb'],
-                                                         jn["P5'D-nt nb"],
-                                                         jn["D-REGION-nt nb"],
-                                                         jn["P3'D-nt nb"],
-                                                         jn['N2-REGION-nt nb'],
-                                                         jn["P5'J-nt nb"]] if i)
-            db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
-            db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
-            db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
-
-            db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
-            db_gen['JUNCTION'] = jn['JUNCTION']
-
-            # Alignment scores
-            if score_fields:
-                try:  db_gen['V_SCORE'] = float(sm['V-REGION score'])
-                except (TypeError, ValueError):  db_gen['V_SCORE'] = 'None'
-
-                try:  db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
-                except (TypeError, ValueError):  db_gen['V_IDENTITY'] = 'None'
-
-                try:  db_gen['J_SCORE'] = float(sm['J-REGION score'])
-                except (TypeError, ValueError):  db_gen['J_SCORE'] = 'None'
-
-                try:  db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
-                except (TypeError, ValueError):  db_gen['J_IDENTITY'] = 'None'
-
-            # FWR and CDR regions
-            if region_fields: getRegions(db_gen)
-        else:
-            db_gen['V_CALL'] = 'None'
-            db_gen['D_CALL'] = 'None'
-            db_gen['J_CALL'] = 'None'
-
-        yield IgRecord(db_gen)
-
-    
-def getIDforIMGT(seq_file):
-    """
-    Create a sequence ID translation using IMGT truncation
-    
-    Arguments: 
-    seq_file = a fasta file of sequences input to IMGT
-                    
-    Returns: 
-    a dictionary of {truncated ID: full seq description} 
-    """
-    
-    # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
-    ids = {}
-    for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
-        if len(rec.description) <= 50:
-            id_key = rec.description
-        else:
-            id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50])
-        ids.update({id_key:rec.description})
-
-    return ids
-
-
-def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
-            score_fields=False, region_fields=False, out_args=default_out_args):
-    """
-    Writes tab-delimited database file in output directory
+    Writes tab-delimited database file in output directory.
     
     Arguments:
-    db_gen = a generator of IgRecord objects containing alignment data
-    file_prefix = directory and prefix for CLIP tab-delim file
-    total_count = number of records (for progress bar)
-    id_dict = a dictionary of {IMGT ID: full seq description}
-    no_parse = if ID is to be parsed for pRESTO output with default delimiters
-    score_fields = if True add alignment score fields to output file
-    region_fields = if True add FWR and CDR region fields to output file
-    out_args = common output argument dictionary from parseCommonArgs
+      db : a iterator of IgRecord objects containing alignment data.
+      fields : a list of ordered field names to write.
+      file_prefix : directory and prefix for CLIP tab-delim file.
+      total_count : number of records (for progress bar).
+      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : if True put incomplete alignments in the pass file.
+      out_args : common output argument dictionary from parseCommonArgs.
 
     Returns:
-    None
+      None
     """
-    pass_file = "%s_db-pass.tab" % file_prefix
-    fail_file = "%s_db-fail.tab" % file_prefix
-    ordered_fields = ['SEQUENCE_ID',
-                      'SEQUENCE_INPUT',
-                      'FUNCTIONAL',
-                      'IN_FRAME',
-                      'STOP',
-                      'MUTATED_INVARIANT',
-                      'INDELS',
-                      'V_CALL',
-                      'D_CALL',
-                      'J_CALL',
-                      'SEQUENCE_VDJ',
-                      'SEQUENCE_IMGT',
-                      'V_SEQ_START',
-                      'V_SEQ_LENGTH',
-                      'V_GERM_START_VDJ',
-                      'V_GERM_LENGTH_VDJ',
-                      'V_GERM_START_IMGT',
-                      'V_GERM_LENGTH_IMGT',
-                      'N1_LENGTH',
-                      'D_SEQ_START',
-                      'D_SEQ_LENGTH',
-                      'D_GERM_START',
-                      'D_GERM_LENGTH',
-                      'N2_LENGTH',
-                      'J_SEQ_START',
-                      'J_SEQ_LENGTH',
-                      'J_GERM_START',
-                      'J_GERM_LENGTH',
-                      'JUNCTION_LENGTH',
-                      'JUNCTION']
+    # Function to check for valid records strictly
+    def _pass_strict(rec):
+        valid = [rec.v_call and rec.v_call != 'None',
+                 rec.j_call and rec.j_call != 'None',
+                 rec.functional is not None,
+                 rec.seq_vdj,
+                 rec.junction]
+        return all(valid)
 
-    if score_fields:
-        ordered_fields.extend(['V_SCORE',
-                               'V_IDENTITY',
-                               'V_EVALUE',
-                               'V_BTOP',
-                               'J_SCORE',
-                               'J_IDENTITY',
-                               'J_EVALUE',
-                               'J_BTOP'])
+    # Function to check for valid records loosely
+    def _pass_gentle(rec):
+        valid = [rec.v_call and rec.v_call != 'None',
+                 rec.d_call and rec.d_call != 'None',
+                 rec.j_call and rec.j_call != 'None']
+        return any(valid)
 
-    if region_fields:
-        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
-                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
-
+    # Set pass criteria
+    _pass = _pass_gentle if partial else _pass_strict
 
-    # TODO:  This is not the best approach. should pass in output fields.
-    # Initiate passed handle
-    pass_handle = None
+    # Define output file names
+    pass_file = '%s_db-pass.tab' % file_prefix
+    fail_file = '%s_db-fail.tab' % file_prefix
 
-    # Open failed file
-    if out_args['failed']:
-        fail_handle = open(fail_file, 'wt')
-        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
-    else:
-        fail_handle = None
-        fail_writer = None
-
-    # Initialize counters and file
+    # Initiate handles, writers and counters
+    pass_handle = None
+    fail_handle = None
     pass_writer = None
+    fail_writer = None
     start_time = time()
     rec_count = pass_count = fail_count = 0
-    for record in db_gen:
-        #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
-        printProgress(rec_count, total_count, 0.05, start_time)
-        rec_count += 1
 
-        # Count pass or fail
-        if (record.v_call == 'None' and record.j_call == 'None') or \
-                record.functional is None or \
-                not record.seq_vdj or \
-                not record.junction:
-            # print(record.v_call, record.j_call, record.functional, record.junction)
-            fail_count += 1
-            if fail_writer is not None: fail_writer.writerow(record.toDict())
-            continue
-        else: 
-            pass_count += 1
-            
-        # Build sample sequence description
-        if record.id in id_dict:
+    # Validate and write output
+    printProgress(0, total_count, 0.05, start_time)
+    for i, record in enumerate(db, start=1):
+
+        # Replace sequence description with full string, if required
+        if id_dict is not None and record.id in id_dict:
             record.id = id_dict[record.id]
 
         # Parse sequence description into new columns
         if not no_parse:
-            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
-            record.id = record.annotations['ID']
-            del record.annotations['ID']
+            try:
+                record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
+                record.id = record.annotations['ID']
+                del record.annotations['ID']
+
+                # TODO:  This is not the best approach. should pass in output fields.
+                # If first record, use parsed description to define extra columns
+                if i == 1:  fields.extend(list(record.annotations.keys()))
+            except IndexError:
+                # Could not parse pRESTO-style annotations so fall back to no parse
+                no_parse = True
+                sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
 
-        # TODO:  This is not the best approach. should pass in output fields.
-        # If first sequence, use parsed description to create new columns and initialize writer
-        if pass_writer is None:
-            if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
-            pass_handle = open(pass_file, 'wt')
-            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
+        # Count pass or fail and write to appropriate file
+        if _pass(record):
+            # Open pass file
+            if pass_writer is None:
+                pass_handle = open(pass_file, 'wt')
+                pass_writer = getDbWriter(pass_handle, add_fields=fields)
 
-        # Write row to tab-delim CLIP file
-        pass_writer.writerow(record.toDict())
-    
-    # Print log
-    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
-    printProgress(rec_count, total_count, 0.05, start_time)
+            # Write row to pass file
+            pass_count += 1
+            pass_writer.writerow(record.toDict())
+        else:
+            # Open failed file
+            if out_args['failed'] and fail_writer is None:
+                fail_handle = open(fail_file, 'wt')
+                fail_writer = getDbWriter(fail_handle, add_fields=fields)
 
+            # Write row to fail file if specified
+            fail_count += 1
+            if fail_writer is not None:
+                fail_writer.writerow(record.toDict())
+
+        # Print progress
+        printProgress(i, total_count, 0.05, start_time)
+
+    # Print consol log
     log = OrderedDict()
     log['OUTPUT'] = pass_file
     log['PASS'] = pass_count
@@ -777,146 +176,215 @@
     if fail_handle is not None: fail_handle.close()
 
 
-# TODO:  may be able to merge with parseIMGT
-def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
-                 region_fields=False, out_args=default_out_args):
+# TODO:  may be able to merge with other mains
+def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
+              parse_scores=False, parse_regions=False, parse_junction=False,
+              out_args=default_out_args):
     """
-    Main for IgBlast aligned sample sequences
+    Main for IMGT aligned sample sequences.
 
     Arguments:
-    igblast_output = IgBlast output file to process
-    seq_file = fasta file input to IgBlast (from which to get sequence)
-    repo = folder with germline repertoire files
-    no_parse = if ID is to be parsed for pRESTO output with default delimiters
-    score_fields = if True add alignment score fields to output file
-    region_fields = if True add FWR and CDR region fields to output file
-    out_args = common output argument dictionary from parseCommonArgs
+      aligner_output : zipped file or unzipped folder output by IMGT.
+      seq_file : FASTA file input to IMGT (from which to get seqID).
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : If True put incomplete alignments in the pass file.
+      parse_scores : if True add alignment score fields to output file.
+      parse_regions : if True add FWR and CDR region fields to output file.
+      out_args : common output argument dictionary from parseCommonArgs.
 
     Returns:
-    None
+      None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IMGT'
+    log['ALIGNER_OUTPUT'] = aligner_output
+    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['NO_PARSE'] = no_parse
+    log['PARTIAL'] = partial
+    log['SCORES'] = parse_scores
+    log['REGIONS'] = parse_regions
+    log['JUNCTION'] = parse_junction
+    printLog(log)
+
+    start_time = time()
+    printMessage('Loading sequence files', start_time=start_time, width=25)
+    # Extract IMGT files
+    temp_dir, imgt_files = extractIMGT(aligner_output)
+    # Count records in IMGT files
+    total_count = countDbFile(imgt_files['summary'])
+    # Get (parsed) IDs from fasta file submitted to IMGT
+    id_dict = getIDforIMGT(seq_file) if seq_file else {}
+    printMessage('Done', start_time=start_time, end=True, width=25)
+
+    # Parse IMGT output and write db
+    with open(imgt_files['summary'], 'r') as summary_handle, \
+            open(imgt_files['gapped'], 'r') as gapped_handle, \
+            open(imgt_files['ntseq'], 'r') as ntseq_handle, \
+            open(imgt_files['junction'], 'r') as junction_handle:
+        parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
+                                parse_scores=parse_scores, parse_regions=parse_regions,
+                                parse_junction=parse_junction)
+        file_prefix = getFilePrefix(aligner_output, out_args)
+        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
+                no_parse=no_parse, partial=partial, out_args=out_args)
+
+    # Cleanup temp directory
+    temp_dir.cleanup()
+
+    return None
+
+
+# TODO:  may be able to merge with other mains
+def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
+                 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
+                 out_args=default_out_args):
+    """
+    Main for IgBLAST aligned sample sequences.
+
+    Arguments:
+      aligner_output : IgBLAST output file to process.
+      seq_file : fasta file input to IgBlast (from which to get sequence).
+      repo : folder with germline repertoire files.
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : If True put incomplete alignments in the pass file.
+      parse_regions : if True add FWR and CDR fields to output file.
+      parse_scores : if True add alignment score fields to output file.
+      parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
+      out_args : common output argument dictionary from parseCommonArgs.
+
+    Returns:
+      None
     """
     # Print parameter info
     log = OrderedDict()
     log['START'] = 'MakeDB'
     log['ALIGNER'] = 'IgBlast'
-    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
+    log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
     log['SEQ_FILE'] = os.path.basename(seq_file)
     log['NO_PARSE'] = no_parse
-    log['SCORE_FIELDS'] = score_fields
-    log['REGION_FIELDS'] = region_fields
+    log['PARTIAL'] = partial
+    log['SCORES'] = parse_scores
+    log['REGIONS'] = parse_regions
     printLog(log)
 
+    start_time = time()
+    printMessage('Loading sequence files', start_time=start_time, width=25)
+    # Count records in sequence file
+    total_count = countSeqFile(seq_file)
     # Get input sequence dictionary
-    seq_dict = getSeqforIgBlast(seq_file)
+    seq_dict = getSeqDict(seq_file)
+    # Create germline repo dictionary
+    repo_dict = readRepo(repo)
+    printMessage('Done', start_time=start_time, end=True, width=25)
 
-    # Formalize out_dir and file-prefix
-    if not out_args['out_dir']:
-        out_dir = os.path.split(igblast_output)[0]
-    else:
-        out_dir = os.path.abspath(out_args['out_dir'])
-        if not os.path.exists(out_dir):  os.mkdir(out_dir)
-    if out_args['out_name']:
-        file_prefix = out_args['out_name']
-    else:
-        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
-    file_prefix = os.path.join(out_dir, file_prefix)
+    # Parse and write output
+    with open(aligner_output, 'r') as f:
+        parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
+                                   parse_scores=parse_scores, parse_regions=parse_regions,
+                                   parse_igblast_cdr3=parse_igblast_cdr3)
+        file_prefix = getFilePrefix(aligner_output, out_args)
+        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
+                no_parse=no_parse, partial=partial, out_args=out_args)
 
-    total_count = countSeqFile(seq_file)
-
-    # Create
-    repo_dict = getRepo(repo)
-    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
-                               score_fields=score_fields, region_fields=region_fields)
-    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
-            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+    return None
 
 
-# TODO:  may be able to merge with parseIgBlast
-def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
-              region_fields=False, out_args=default_out_args):
+# TODO:  may be able to merge with other mains
+def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
+              parse_scores=False, parse_regions=False, out_args=default_out_args):
     """
-    Main for IMGT aligned sample sequences
+    Main for iHMMuneAlign aligned sample sequences.
 
     Arguments:
-    imgt_output = zipped file or unzipped folder output by IMGT
-    seq_file = FASTA file input to IMGT (from which to get seqID)
-    no_parse = if ID is to be parsed for pRESTO output with default delimiters
-    score_fields = if True add alignment score fields to output file
-    region_fields = if True add FWR and CDR region fields to output file
-    out_args = common output argument dictionary from parseCommonArgs
-        
-    Returns: 
-    None
+      aligner_output : iHMMune-Align output file to process.
+      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
+      repo : folder with germline repertoire files.
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : If True put incomplete alignments in the pass file.
+      parse_scores : if True parse alignment scores.
+      parse_regions : if True add FWR and CDR region fields.
+      out_args : common output argument dictionary from parseCommonArgs.
+
+    Returns:
+      None
     """
     # Print parameter info
     log = OrderedDict()
-    log['START'] = 'MakeDb'
-    log['ALIGNER'] = 'IMGT'
-    log['ALIGN_RESULTS'] = imgt_output
-    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['START'] = 'MakeDB'
+    log['ALIGNER'] = 'iHMMune-Align'
+    log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
+    log['SEQ_FILE'] = os.path.basename(seq_file)
     log['NO_PARSE'] = no_parse
-    log['SCORE_FIELDS'] = score_fields
-    log['REGION_FIELDS'] = region_fields
+    log['PARTIAL'] = partial
+    log['SCORES'] = parse_scores
+    log['REGIONS'] = parse_regions
     printLog(log)
-    
-    # Get individual IMGT result files
-    temp_dir, imgt_files = extractIMGT(imgt_output)
-        
-    # Formalize out_dir and file-prefix
-    if not out_args['out_dir']:
-        out_dir = os.path.dirname(os.path.abspath(imgt_output))
-    else:
-        out_dir = os.path.abspath(out_args['out_dir'])
-        if not os.path.exists(out_dir):  os.mkdir(out_dir)
-    if out_args['out_name']:
-        file_prefix = out_args['out_name']
-    else:
-        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
-    file_prefix = os.path.join(out_dir, file_prefix)
 
-    total_count = countDbFile(imgt_files[0])
-    
-    # Get (parsed) IDs from fasta file submitted to IMGT
-    id_dict = getIDforIMGT(seq_file) if seq_file else {}
-    
-    # Create
-    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
-                         region_fields=region_fields)
-    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
-            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+    start_time = time()
+    printMessage('Loading sequence files', start_time=start_time, width=25)
+    # Count records in sequence file
+    total_count = countSeqFile(seq_file)
+    # Get input sequence dictionary
+    seq_dict = getSeqDict(seq_file)
+    # Create germline repo dictionary
+    repo_dict = readRepo(repo)
+    printMessage('Done', start_time=start_time, end=True, width=25)
 
-    # Delete temp directory
-    rmtree(temp_dir)
+    # Parse and write output
+    with open(aligner_output, 'r') as f:
+        parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
+                                   parse_scores=parse_scores, parse_regions=parse_regions)
+        file_prefix = getFilePrefix(aligner_output, out_args)
+        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
+                no_parse=no_parse, partial=partial, out_args=out_args)
+
+    return None
 
 
 def getArgParser():
     """
-    Defines the ArgumentParser
+    Defines the ArgumentParser.
 
-    Arguments: 
-    None
-                      
     Returns: 
-    an ArgumentParser object
+      argparse.ArgumentParser
     """
     fields = dedent(
              '''
               output files:
                   db-pass
-                      database of parsed alignment records.
+                      database of alignment records with functionality information,
+                      V and J calls, and a junction region.
                   db-fail
-                      database with records failing alignment.
+                      database with records that fail due to no functionality information
+                      (did not pass IMGT), no V call, no J call, or no junction region.
 
-              output fields:
-                  SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
-                  INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
-                  V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
-                  V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
-                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
+              universal output fields:
+                  SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
+                  FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
+                  V_CALL, D_CALL, J_CALL,
+                  V_SEQ_START, V_SEQ_LENGTH,
+                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
                   J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
-                  JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
-                  J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
-                  FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+                  JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
+                  FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
+                  CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+
+              imgt specific output fields:
+                  V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
+                  N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
+                  D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
+
+              igblast specific output fields:
+                  V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
+                  V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
+                  J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
+                  CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
+
+              ihmm specific output fields:
+                  V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
+                  HMM_SCORE
               ''')
                 
     # Define ArgumentParser
@@ -933,11 +401,11 @@
     parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
 
     # IgBlast Aligner
-    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
-                                           parents=[parser_parent],
-                                           formatter_class=CommonHelpFormatter)
-    parser_igblast.set_defaults(func=parseIgBlast)
-    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+    parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
+                                           formatter_class=CommonHelpFormatter,
+                                           help='Process IgBLAST output.',
+                                           description='Process IgBLAST output.')
+    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
                                 required=True,
                                 help='''IgBLAST output files in format 7 with query sequence
                                      (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
@@ -947,50 +415,112 @@
                                      set of germlines used in the IgBLAST alignment.''')
     parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
                                 required=True,
-                                help='List of input FASTA files containing sequences')
+                                help='''List of input FASTA files (with .fasta, .fna or .fa
+                                     extension), containing sequences.''')
     parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
-                                help='''Specify if input IDs should not be parsed to add
-                                     new columns to database.''')
-    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
+                                help='''Specify to prevent input sequence headers from being parsed
+                                    to add new columns to database. Parsing of sequence headers requires
+                                    headers to be in the pRESTO annotation format, so this should be specified
+                                    when sequence headers are incompatible with the pRESTO annotation scheme.
+                                    Note, unrecognized header formats will default to this behavior.''')
+    parser_igblast.add_argument('--partial', action='store_true', dest='partial',
+                                help='''If specified, include incomplete V(D)J alignments in
+                                     the pass file instead of the fail file.''')
+    parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
                                 help='''Specify if alignment score metrics should be
                                      included in the output. Adds the V_SCORE, V_IDENTITY,
                                      V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
                                      J_BTOP, and J_EVALUE columns.''')
-    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
-                                help='''Specify if IMGT framework and CDR regions should be
+    parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
+                                help='''Specify if IMGT FWR and CDRs should be
                                      included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                      FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                      CDR3_IMGT columns.''')
-    
+    parser_igblast.add_argument('--cdr3', action='store_true',
+                                dest='parse_igblast_cdr3', 
+                                help='''Specify if the CDR3 sequences generated by IgBLAST 
+                                     should be included in the output. Adds the columns
+                                     CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
+                                     version 1.5 or greater.''')
+    parser_igblast.set_defaults(func=parseIgBLAST)
+
     # IMGT aligner
-    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', 
-                                        parents=[parser_parent], 
-                                        formatter_class=CommonHelpFormatter)
-    imgt_arg_group =  parser_imgt.add_mutually_exclusive_group(required=True)
-    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                                help='''Either zipped IMGT output files (.zip) or a folder
-                                     containing unzipped IMGT output files (which must
-                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
-                                     and 6_Junction).''')
+    parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='''Process IMGT/HighV-Quest output
+                                             (does not work with V-QUEST).''',
+                                        description='''Process IMGT/HighV-Quest output
+                                             (does not work with V-QUEST).''')
+    parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
+                             help='''Either zipped IMGT output files (.zip or .txz) or a
+                                  folder containing unzipped IMGT output files (which must
+                                  include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                  and 6_Junction).''')
     parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
                              required=False,
-                             help='List of input FASTA files containing sequences')
+                             help='''List of input FASTA files (with .fasta, .fna or .fa
+                                  extension) containing sequences.''')
     parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', 
-                             help='''Specify if input IDs should not be parsed to add new
-                                  columns to database.''')
-    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify to prevent input sequence headers from being parsed
+                                  to add new columns to database. Parsing of sequence headers requires
+                                  headers to be in the pRESTO annotation format, so this should be specified
+                                  when sequence headers are incompatible with the pRESTO annotation scheme.
+                                  Note, unrecognized header formats will default to this behavior.''')
+    parser_imgt.add_argument('--partial', action='store_true', dest='partial',
+                             help='''If specified, include incomplete V(D)J alignments in
+                                  the pass file instead of the fail file.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
                              help='''Specify if alignment score metrics should be
                                   included in the output. Adds the V_SCORE, V_IDENTITY,
-                                  J_SCORE and J_IDENTITY. Note, this will also add
-                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
-                                  but they will be empty for IMGT output.''')
-    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
-                             help='''Specify if IMGT framework and CDR regions should be
+                                  J_SCORE and J_IDENTITY.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
+                             help='''Specify if IMGT FWRs and CDRs should be
                                   included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                   FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                   CDR3_IMGT columns.''')
+    parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
+                             help='''Specify if detailed junction fields should be
+                                  included in the output. Adds the columns 
+                                  N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
+                                  P5J_LENGTH, D_FRAME.''')
     parser_imgt.set_defaults(func=parseIMGT)
 
+    # iHMMuneAlign Aligner
+    parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='Process iHMMune-Align output.',
+                                        description='Process iHMMune-Align output.')
+    parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
+                             required=True,
+                             help='''iHMMune-Align output file.''')
+    parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+                             help='''List of folders and/or FASTA files containing
+                                  IMGT-gapped germline sequences corresponding to the
+                                  set of germlines used in the IgBLAST alignment.''')
+    parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
+                             required=True,
+                             help='''List of input FASTA files (with .fasta, .fna or .fa
+                                  extension) containing sequences.''')
+    parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
+                             help='''Specify to prevent input sequence headers from being parsed
+                                  to add new columns to database. Parsing of sequence headers requires
+                                  headers to be in the pRESTO annotation format, so this should be specified
+                                  when sequence headers are incompatible with the pRESTO annotation scheme.
+                                  Note, unrecognized header formats will default to this behavior.''')
+    parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
+                             help='''If specified, include incomplete V(D)J alignments in
+                                  the pass file instead of the fail file.''')
+    parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the path score of the
+                                  iHMMune-Align hidden Markov model to HMM_SCORE.''')
+    parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
+                             help='''Specify if IMGT FWRs and CDRs should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_ihmm.set_defaults(func=parseIHMM)
+
     return parser
     
     
@@ -1000,7 +530,7 @@
     """
     parser = getArgParser()    
     args = parser.parse_args()
-    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+    args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
 
     # Set no ID parsing if sequence files are not provided
     if 'seq_files' in args_dict and not args_dict['seq_files']:
@@ -1008,18 +538,18 @@
 
     # Delete
     if 'seq_files' in args_dict: del args_dict['seq_files']
-    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
     if 'command' in args_dict: del args_dict['command']
     if 'func' in args_dict: del args_dict['func']           
     
     if args.command == 'imgt':
-        for i in range(len(args.__dict__['aligner_files'])):
-            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+        for i in range(len(args.__dict__['aligner_outputs'])):
+            args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
             args_dict['seq_file'] = args.__dict__['seq_files'][i] \
                                     if args.__dict__['seq_files'] else None
             args.func(**args_dict)
-    elif args.command == 'igblast':
-        for i in range(len(args.__dict__['aligner_files'])):
-            args_dict['igblast_output'] =  args.__dict__['aligner_files'][i]
+    elif args.command == 'igblast' or args.command == 'ihmm':
+        for i in range(len(args.__dict__['aligner_outputs'])):
+            args_dict['aligner_output'] =  args.__dict__['aligner_outputs'][i]
             args_dict['seq_file'] = args.__dict__['seq_files'][i]
             args.func(**args_dict)
author	davidvanzessen
date	Tue, 23 May 2017 08:32:58 -0400
parents	c33d93683a09
children