view CreateGermlines.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python3
"""
Reconstructs germline sequences from alignment data
"""
# Info
__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
from changeo import __version__, __date__

# Imports
import os
import sys
from argparse import ArgumentParser
from collections import OrderedDict
from textwrap import dedent
from time import time

# Presto and change imports
from presto.Defaults import default_out_args
from presto.IO import getOutputHandle, printLog, printProgress
from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from changeo.IO import getDbWriter, readDbFile, countDbFile, readRepo
from changeo.Receptor import allele_regex, parseAllele

# Defaults
default_germ_types = 'dmask'
default_v_field = 'V_CALL'
default_seq_field = 'SEQUENCE_IMGT'

    
def joinGermline(align, repo_dict, germ_types, v_field, seq_field):
    """
    Join gapped germline sequences aligned with sample sequences
    
    Arguments:
    align = iterable yielding dictionaries of sample sequence data
    repo_dict = dictionary of IMGT gapped germline sequences
    germ_types = types of germline sequences to be output
                     (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    
    Returns:
    dictionary of germline_type: germline_sequence
    """
    j_field = 'J_CALL'
    germlines = {'full': '', 'dmask': '', 'vonly': '', 'regions': ''}
    result_log = OrderedDict()
    result_log['ID'] = align['SEQUENCE_ID']

    # Find germline V-region gene
    if v_field == 'V_CALL_GENOTYPED':
        vgene = parseAllele(align[v_field], allele_regex, 'list')
        vkey = vgene
    else:
        vgene = parseAllele(align[v_field], allele_regex, 'first')
        vkey = (vgene, )

    try:
        int(align['P3V_LENGTH'])
        int(align['N1_LENGTH'])
        int(align['P5D_LENGTH'])
        int(align['P3D_LENGTH'])
        int(align['N2_LENGTH'])
        int(align['P5J_LENGTH'])
    except:
        regions_style = 'IgBLAST'
    else:
        regions_style = 'IMGT'

    # Build V-region germline
    if vgene is not None:
        result_log['V_CALL'] = ','.join(vkey)
        if vkey in repo_dict:
            vseq = repo_dict[vkey]
            # Germline start
            try: vstart = int(align['V_GERM_START_IMGT']) - 1
            except (TypeError, ValueError): vstart = 0
            # Germline length
            try: vlen = int(align['V_GERM_LENGTH_IMGT'])
            except (TypeError, ValueError): vlen = 0
            # TODO:  not sure what this line is doing here. it no make no sense.
            vpad = vlen - len(vseq[vstart:])
            if vpad < 0: vpad = 0
            germ_vseq = vseq[vstart:(vstart + vlen)] + ('N' * vpad)
        else:
            result_log['ERROR'] = 'Germline %s not in repertoire' % ','.join(vkey)
            return result_log, germlines
    else:
        result_log['V_CALL'] = None
        try: vlen = int(align['V_GERM_LENGTH_IMGT'])
        except (TypeError, ValueError): vlen = 0
        germ_vseq = 'N' * vlen

    # Find germline D-region gene
    dgene = parseAllele(align['D_CALL'], allele_regex, 'first')

    # Build D-region germline
    if dgene is not None:
        result_log['D_CALL'] = dgene
        dkey = (dgene, )
        if dkey in repo_dict:
            dseq = repo_dict[dkey]
            # Germline start
            try: dstart = int(align['D_GERM_START']) - 1
            except (TypeError, ValueError): dstart = 0
            # Germline length
            try: dlen = int(align['D_GERM_LENGTH'])
            except (TypeError, ValueError): dlen = 0
            germ_dseq = repo_dict[dkey][dstart:(dstart + dlen)]
        else:
            result_log['ERROR'] = 'Germline %s not in repertoire' % dgene
            return result_log, germlines
    else:
        result_log['D_CALL'] = None
        germ_dseq = ''

    # Find germline J-region gene
    jgene = parseAllele(align[j_field], allele_regex, 'first')

    # Build D-region germline
    if jgene is not None:
        result_log['J_CALL'] = jgene
        jkey = (jgene, )
        if jkey in repo_dict:
            jseq = repo_dict[jkey]
            # Germline start
            try: jstart = int(align['J_GERM_START']) - 1
            except (TypeError, ValueError): jstart = 0
            # Germline length
            try: jlen = int(align['J_GERM_LENGTH'])
            except (TypeError, ValueError): jlen = 0
            # TODO:  not sure what this line is doing either
            jpad = jlen - len(jseq[jstart:])
            if jpad < 0: jpad = 0
            germ_jseq = jseq[jstart:(jstart + jlen)] + ('N' * jpad)
        else:
            result_log['ERROR'] = 'Germline %s not in repertoire' % jgene
            return result_log, germlines
    else:
        result_log['J_CALL'] = None
        try: jlen = int(align['J_GERM_LENGTH'])
        except (TypeError, ValueError): jlen = 0
        germ_jseq = 'N' * jlen

    # Assemble pieces starting with V-region
    germ_seq = germ_vseq
    regions = 'V' * len(germ_vseq)

    try:
        np1_len = int(align['NP1_LENGTH'])
    except (TypeError, ValueError):
        np1_len = 0

    # NP nucleotide additions after V
    if regions_style == 'IMGT':
        # P nucleotide additions
        try:
            p3v_len = int(align['P3V_LENGTH'])
        except (TypeError, ValueError):
            p3v_len = 0
        if p3v_len < 0:
            result_log['ERROR'] = 'P3V_LENGTH is negative'
            return result_log, germlines

        regions += 'P' * p3v_len

        # N1 nucleotide additions
        try:
            n1_len = int(align['N1_LENGTH'])
        except (TypeError, ValueError):
            n1_len = 0
        if n1_len < 0:
            result_log['ERROR'] = 'N1_LENGTH is negative'
            return result_log, germlines

        regions += 'N' * n1_len

        # P nucleotide additions before D
        try: p5d_len = int(align['P5D_LENGTH'])
        except (TypeError, ValueError): p5d_len = 0
        if p5d_len < 0:
            result_log['ERROR'] = 'P5D_LENGTH is negative'
            return result_log, germlines

        regions += 'P' * p5d_len
    else:
        # IgBLAST style
        # PNP nucleotide additions after V
        if np1_len < 0:
            result_log['ERROR'] = 'NP1_LENGTH is negative'
            return result_log, germlines

        regions += 'N' * np1_len

    germ_seq += 'N' * np1_len

    # Add D-region
    germ_seq += germ_dseq
    regions += 'D' * len(germ_dseq)

    #print 'VD>', germ_seq, '\nVD>', regions

    try:
        np2_len = int(align['NP2_LENGTH'])
    except (TypeError, ValueError):
        np2_len = 0

    # NP nucleotide additions before J
    if regions_style == 'IMGT':
        # P nucleotide additions
        try:
            p3d_len = int(align['P3D_LENGTH'])
        except (TypeError, ValueError):
            p3d_len = 0
        if p3d_len < 0:
            result_log['ERROR'] = 'P3D_LENGTH is negative'
            return result_log, germlines

        regions += 'P' * p3d_len

        # N2 nucleotide additions
        try:
            n2_len = int(align['N2_LENGTH'])
        except (TypeError, ValueError):
            n2_len = 0
        if n2_len < 0:
            result_log['ERROR'] = 'N2_LENGTH is negative'
            return result_log, germlines

        regions += 'N' * n2_len

        # P nucleotide additions
        try:
            p5j_len = int(align['P5J_LENGTH'])
        except (TypeError, ValueError):
            p5j_len = 0
        if p5j_len < 0:
            result_log['ERROR'] = 'P5J_LENGTH is negative'
            return result_log, germlines

        regions += 'P' * p5j_len
    else:
        # IgBLAST style
        # NP nucleotide additions
        if np2_len < 0:
            result_log['ERROR'] = 'NP2_LENGTH is negative'
            return result_log, germlines

        regions += 'N' * np2_len

    germ_seq += 'N' * np2_len

    # Add J-region
    germ_seq += germ_jseq
    regions += 'J' * len(germ_jseq)

    #print('\nREGIONS>',regions,'\n')

    # Define return germlines
    germlines['full'] = germ_seq
    germlines['regions'] = regions

    if 'dmask' in germ_types:
        germlines['dmask'] = germ_seq[:len(germ_vseq)] + \
                             'N' * (len(germ_seq) - len(germ_vseq) - len(germ_jseq)) + \
                             germ_seq[-len(germ_jseq):]
    if 'vonly' in germ_types:
        germlines['vonly'] = germ_vseq

    # Check that input and germline sequence match
    if len(align[seq_field]) == 0:
        result_log['ERROR'] = 'Sequence is missing from %s column' % seq_field
    elif len(germlines['full']) != len(align[seq_field]):
        result_log['ERROR'] = 'Germline sequence is %d nucleotides longer than input sequence' % \
                              (len(germlines['full']) - len(align[seq_field]))

    # Convert to uppercase
    for k, v in germlines.items():  germlines[k] = v.upper()

    return result_log, germlines


def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file

    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                     (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'False'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    repo_dict = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields +=  ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']

    # Create output file handle and Db writer
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        fail_writer = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    pass_count = fail_count = 0
    # Iterate over rows
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field)

        # Add germline field(s) to dictionary
        if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full']
        if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
        if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
        if 'regions' in germ_types: row['GERMLINE_REGIONS'] = germlines['regions']

        # Write row to pass or fail file
        if 'ERROR' in result_log:
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(row)
        else:
            result_log['SEQUENCE'] = row[seq_field]
            result_log['GERMLINE'] = germlines['full']
            result_log['REGIONS'] = germlines['regions']

            pass_count += 1
            pass_writer.writerow(row)
        printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None: fail_handle.close()
    if log_handle is not None:  log_handle.close()


def makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
                      seq_field, counts, writers, out_args):
    """
    Determine consensus clone sequence and create germline for clone

    Arguments:
    clone = clone ID
    clone_dict = iterable yielding dictionaries of sequence data from clone
    repo_dict = dictionary of IMGT gapped germline sequences
    germ_types = types of germline sequences to be output
                     (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    counts = dictionary of pass counter and fail counter
    writers = dictionary with pass and fail DB writers
    out_args = arguments for output preferences

    Returns:
    None
    """
    seq_type = seq_field.split('_')[-1]
    j_field = 'J_CALL'

    # Create dictionaries to count observed V/J calls
    v_dict = OrderedDict()
    j_dict = OrderedDict()

    # Find longest sequence in clone
    max_length = 0
    for val in clone_dict.values():
        v = val[v_field]
        v_dict[v] = v_dict.get(v, 0) + 1
        j = val[j_field]
        j_dict[j] = j_dict.get(j, 0) + 1
        if len(val[seq_field]) > max_length:
            max_length = len(val[seq_field])

    # Consensus V and J having most observations
    v_cons = [k for k in list(v_dict.keys()) if v_dict[k] == max(v_dict.values())]
    j_cons = [k for k in list(j_dict.keys()) if j_dict[k] == max(j_dict.values())]

    # Consensus sequence(s) with consensus V/J calls and longest sequence
    cons = [val for val in list(clone_dict.values()) \
            if val.get(v_field, '') in v_cons and \
            val.get(j_field, '') in j_cons and \
            len(val[seq_field]) == max_length]

    # Sequence(s) with consensus V/J are not longest
    if not cons:
        # Sequence(s) with consensus V/J (not longest)
        cons = [val for val in list(clone_dict.values()) \
                if val.get(v_field, '') in v_cons and val.get(j_field, '') in j_cons]

        # No sequence has both consensus V and J call
        if not cons:
            result_log = OrderedDict()
            result_log['ID'] = clone
            result_log['V_CALL'] = ','.join(v_cons)
            result_log['J_CALL'] = ','.join(j_cons)
            result_log['ERROR'] = 'No consensus sequence for clone found'
        else:
            # Pad end of consensus sequence with gaps to make it the max length
            cons = cons[0]
            cons['J_GERM_LENGTH'] = str(int(cons['J_GERM_LENGTH'] or 0) + max_length - len(cons[seq_field]))
            cons[seq_field] += '.' * (max_length - len(cons[seq_field]))
            result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
            result_log['ID'] = clone
            result_log['CONSENSUS'] = cons['SEQUENCE_ID']
    else:
        cons = cons[0]
        result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
        result_log['ID'] = clone
        result_log['CONSENSUS'] = cons['SEQUENCE_ID']

    # Write sequences of clone
    for val in clone_dict.values():
        if 'ERROR' not in result_log:
            # Update lengths padded to longest sequence in clone
            val['J_GERM_LENGTH'] = str(int(val['J_GERM_LENGTH'] or 0) + max_length - len(val[seq_field]))
            val[seq_field] += '.' * (max_length - len(val[seq_field]))

            # Add column(s) to tab-delimited database file
            if 'full' in germ_types: val['GERMLINE_' + seq_type] = germlines['full']
            if 'dmask' in germ_types: val['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
            if 'vonly' in germ_types: val['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
            if 'regions' in germ_types: val['GERMLINE_REGIONS'] = germlines['regions']

            # Add field
            val['GERMLINE_V_CALL'] = result_log['V_CALL']
            val['GERMLINE_D_CALL'] = result_log['D_CALL']
            val['GERMLINE_J_CALL'] = result_log['J_CALL']
            
            result_log['SEQUENCE'] = cons[seq_field]
            result_log['GERMLINE'] = germlines['full']
            result_log['REGIONS'] = germlines['regions']

            # Write to pass file
            counts['pass'] += 1
            writers['pass'].writerow(val)
        else:
            # Write to fail file
            counts['fail'] += 1
            if writers['fail'] is not None:
                writers['fail'].writerow(val)
    # Return log
    return result_log


def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Assemble one germline sequence for each clone in a tab-delimited database file

    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                 (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'True'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    repo_dict = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields +=  ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']

    add_fields += ['GERMLINE_V_CALL']
    add_fields += ['GERMLINE_D_CALL']
    add_fields += ['GERMLINE_J_CALL']
    
    # Create output file handle and Db writer
    writers = {}
    pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type=out_args['out_type'])
    writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type=out_args['out_type'])
        writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        writers['fail'] = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    counts = {}
    clone_count = counts['pass'] = counts['fail'] = 0
    # Iterate over rows
    clone = 'initial'
    clone_dict = OrderedDict()
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        # Clone isn't over yet
        if row.get('CLONE', '') == clone:
            clone_dict[i] = row
        # Clone just finished
        elif clone_dict:
            clone_count += 1
            result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
                                           v_field, seq_field, counts, writers, out_args)
            printLog(result_log, handle=log_handle)
            # Now deal with current row (first of next clone)
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])
        # Last case is only for first row of file
        else:
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])

    clone_count += 1
    result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
                                   seq_field, counts, writers, out_args)
    printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLONES'] = clone_count
    log['RECORDS'] = rec_count
    log['PASS'] = counts['pass']
    log['FAIL'] = counts['fail']
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None: fail_handle.close()
    if log_handle is not None:  log_handle.close()


def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
    None

    Returns:
    an ArgumentParser object
    """
    # Define input and output field help message
    fields = dedent(
             '''
             output files:
                 germ-pass
                    database with assigned germline sequences.
                 germ-fail
                    database with records failing germline assignment.

             required fields:
                 SEQUENCE_ID, SEQUENCE_VDJ or SEQUENCE_IMGT,
                 V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL,
                 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
                 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
                 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
                 NP1_LENGTH, NP2_LENGTH

             optional fields:
                 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
                 CLONE


             output fields:
                 GERMLINE_VDJ, GERMLINE_VDJ_D_MASK, GERMLINE_VDJ_V_REGION,
                 GERMLINE_IMGT, GERMLINE_IMGT_D_MASK, GERMLINE_IMGT_V_REGION,
                 GERMLINE_V_CALL, GERMLINE_D_CALL, GERMLINE_J_CALL,
                 GERMLINE_REGIONS
              ''')

    # Parent parser
    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
                                       annotation=False)
    # Define argument parser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            parents=[parser_parent],
                            formatter_class=CommonHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))

    parser.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
                        help='''List of folders and/or fasta files (with .fasta, .fna or .fa
                         extension) with germline sequences.''')
    parser.add_argument('-g', action='store', dest='germ_types', default=default_germ_types,
                        nargs='+', choices=('full', 'dmask', 'vonly', 'regions'),
                        help='''Specify type(s) of germlines to include full germline,
                             germline with D-region masked, or germline for V region only.''')
    parser.add_argument('--cloned', action='store_true', dest='cloned',
                        help='''Specify to create only one germline per clone. Assumes input file is
                             sorted by clone column, and will not yield correct results if the data
                             is unsorted. Note, if allele calls are ambiguous within a clonal group,
                             this will place the germline call used for the entire clone within the
                             GERMLINE_V_CALL, GERMLINE_D_CALL and GERMLINE_J_CALL fields.''')
    parser.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
                        help='Specify field to use for germline V call')
    parser.add_argument('--sf', action='store', dest='seq_field', default=default_seq_field,
                        help='Specify field to use for sequence')

    return parser


if __name__ == "__main__":
    """
    Parses command line arguments and calls main
    """

    # Parse command line arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)
    del args_dict['db_files']
    del args_dict['cloned']
    args_dict['v_field'] = args_dict['v_field'].upper()
    args_dict['seq_field'] = args_dict['seq_field'].upper()
    
    for f in args.__dict__['db_files']:
        args_dict['db_file'] = f
        if args.__dict__['cloned']:
            assembleCloneGermline(**args_dict)
        else:
            assembleEachGermline(**args_dict)