Mercurial > repos > davidvanzessen > shm_csr

diff change_o/MakeDb.py @ 0:c33d93683a09 draft
Uploaded
author: davidvanzessen
date: Thu, 13 Oct 2016 10:52:24 -0400
children: 22dddabe3637
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/MakeDb.py	Thu Oct 13 10:52:24 2016 -0400
@@ -0,0 +1,1025 @@
+#!/usr/bin/env python3
+"""
+Create tab-delimited database file to store sequence alignment information
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
+from changeo import __version__, __date__
+
+# Imports
+import csv
+import os
+import re
+import sys
+import pandas as pd
+import tarfile
+import zipfile
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import groupby
+from shutil import rmtree
+from tempfile import mkdtemp
+from textwrap import dedent
+from time import time
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
+# Presto and changeo imports
+from presto.Defaults import default_out_args
+from presto.Annotation import parseAnnotation
+from presto.IO import countSeqFile, printLog, printProgress
+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
+from changeo.IO import getDbWriter, countDbFile, getRepo
+from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
+                             j_allele_regex
+
+# Default parameters
+default_delimiter = ('\t', ',', '-')
+
+
+def gapV(ig_dict, repo_dict):
+    """
+    Insert gaps into V region and update alignment information
+
+    Arguments:
+      ig_dict : Dictionary of parsed IgBlast output
+      repo_dict : Dictionary of IMGT gapped germline sequences
+
+    Returns:
+      dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
+    """
+
+    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
+
+    # Find gapped germline V segment
+    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
+    vkey = (vgene, )
+    #TODO: Figure out else case
+    if vkey in repo_dict:
+        vgap = repo_dict[vkey]
+        # Iterate over gaps in the germline segment
+        gaps = re.finditer(r'\.', vgap)
+        gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
+        for gap in gaps:
+            i = gap.start()
+            # Break if gap begins after V region
+            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
+                break
+            # Insert gap into IMGT sequence
+            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
+            # Update gap counter
+            gapcount += 1
+        ig_dict['SEQUENCE_IMGT'] = seq_imgt
+        # Update IMGT positioning information for V
+        ig_dict['V_GERM_START_IMGT'] = 1
+        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
+
+    return ig_dict
+
+
+def getIMGTJunc(ig_dict, repo_dict):
+    """
+    Identify junction region by IMGT definition
+
+    Arguments:
+      ig_dict : Dictionary of parsed IgBlast output
+      repo_dict : Dictionary of IMGT gapped germline sequences
+
+    Returns:
+      dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
+    """
+    # Find germline J segment
+    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
+    jkey = (jgene, )
+    #TODO: Figure out else case
+    if jkey in repo_dict:
+        # Get germline J sequence
+        jgerm = repo_dict[jkey]
+        jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
+        # Look for (F|W)GXG aa motif in nt sequence
+        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
+        aa_end = len(ig_dict['SEQUENCE_IMGT'])
+        #TODO: Figure out else case
+        if motif:
+            # print('\n', motif.group())
+            aa_end = motif.start() - len(jgerm) + 3
+        # Add fields to dict
+        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
+        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
+
+    return ig_dict
+
+
+def getRegions(ig_dict):
+    """
+    Identify FWR and CDR regions by IMGT definition
+
+    Arguments:
+      ig_dict : Dictionary of parsed alignment output
+
+    Returns:
+      dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
+             CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
+    """
+    try:
+        seq_len = len(ig_dict['SEQUENCE_IMGT'])
+        ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
+    except (KeyError, IndexError):
+        return ig_dict
+
+    try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
+    except (IndexError): return ig_dict
+
+    try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
+    except (IndexError): return ig_dict
+
+    try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
+    except (IndexError): return ig_dict
+
+    try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
+    except (IndexError): return ig_dict
+
+    try:
+        cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
+        ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
+        ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
+    except (KeyError, IndexError):
+        return ig_dict
+
+    return ig_dict
+
+
+def getSeqforIgBlast(seq_file):
+    """
+    Fetch input sequences for IgBlast queries
+
+    Arguments:
+    seq_file = a fasta file of sequences input to IgBlast
+
+    Returns:
+    a dictionary of {ID:Seq}
+    """
+
+    seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
+
+    # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
+    seqs = {}
+    for seq in seq_dict.values():
+        seqs.update({seq.description:str(seq.seq)})
+
+    return seqs
+
+
+def findLine(handle, query):
+    """
+    Finds line with query string in file
+
+    Arguments:
+    handle = file handle in which to search for line
+    query = query string for which to search in file
+
+    Returns:
+    line from handle in which query string was found
+    """
+    for line in handle:
+        if(re.match(query, line)):
+            return line
+
+
+def extractIMGT(imgt_output):
+    """
+    Extract necessary files from IMGT results, zipped or unzipped
+    
+    Arguments:
+    imgt_output = zipped file or unzipped folder output by IMGT
+    
+    Returns:
+    sorted list of filenames from which information will be read
+    """
+    #file_ext = os.path.splitext(imgt_output)[1].lower()
+    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
+    temp_dir = mkdtemp()
+    if zipfile.is_zipfile(imgt_output):
+        # Open zip file
+        imgt_zip = zipfile.ZipFile(imgt_output, 'r')
+        # Extract required files
+        imgt_files = sorted([n for n in imgt_zip.namelist() \
+                             if os.path.basename(n).startswith(imgt_flags)])
+        imgt_zip.extractall(temp_dir, imgt_files)
+        # Define file list
+        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
+    elif os.path.isdir(imgt_output):
+        # Find required files in folder
+        folder_files = []
+        for root, dirs, files in os.walk(imgt_output):
+            folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
+        # Define file list
+        imgt_files = sorted([n for n in folder_files \
+                             if os.path.basename(n).startswith(imgt_flags)])
+    elif tarfile.is_tarfile(imgt_output):
+        # Open zip file
+        imgt_tar = tarfile.open(imgt_output, 'r')
+        # Extract required files
+        imgt_files = sorted([n for n in imgt_tar.getnames() \
+                             if os.path.basename(n).startswith(imgt_flags)])
+        imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
+        # Define file list
+        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
+    else:
+        sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
+    
+    if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
+        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
+    elif len(imgt_files) < len(imgt_flags):
+        sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)
+        
+    return temp_dir, imgt_files
+
+
+# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
+def readOneIgBlastResult(block):
+    """
+    Parse a single IgBLAST query result
+
+    Arguments:
+    block =  itertools groupby object of single result
+
+    Returns:
+    None if no results, otherwise list of DataFrames for each result block
+    """
+    results = list()
+    i = 0
+    for match, subblock in groupby(block, lambda l: l=='\n'):
+        if not match:
+            # Strip whitespace and comments
+            sub = [s.strip() for s in subblock if not s.startswith('#')]
+
+            # Continue on empty block
+            if not sub:  continue
+            else:  i += 1
+
+            # Split by tabs
+            sub = [s.split('\t') for s in sub]
+
+            # Append list for "V-(D)-J rearrangement summary" (i == 1)
+            # And "V-(D)-J junction details" (i == 2)
+            # Otherwise append DataFrame of subblock
+            if i == 1 or i == 2:
+                results.append(sub[0])
+            else:
+                df = pd.DataFrame(sub)
+                if not df.empty: results.append(df)
+
+    return results if results else None
+
+
+# TODO:  needs more speeds. pandas is probably to blame.
+def readIgBlast(igblast_output, seq_dict, repo_dict,
+                score_fields=False, region_fields=False):
+    """
+    Reads IgBlast output
+
+    Arguments:
+    igblast_output = IgBlast output file (format 7)
+    seq_dict = a dictionary of {ID:Seq} from input fasta file
+    repo_dict = dictionary of IMGT gapped germline sequences
+    score_fields = if True parse alignment scores
+    region_fields = if True add FWR and CDR region fields
+
+    Returns:
+    a generator of dictionaries containing alignment data
+    """
+
+    # Open IgBlast output file
+    with open(igblast_output) as f:
+        # Iterate over individual results (separated by # IGBLASTN)
+        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
+            block = list(block)
+            if not k1:
+                # TODO: move query name extraction into block parser readOneIgBlastResult().
+                # Extract sequence ID
+                query_name = ' '.join(block[0].strip().split(' ')[2:])
+                # Initialize db_gen to have ID and input sequence
+                db_gen = {'SEQUENCE_ID':     query_name,
+                          'SEQUENCE_INPUT':  seq_dict[query_name]}
+
+                # Parse further sub-blocks
+                block_list = readOneIgBlastResult(block)
+
+                # TODO: this is indented pretty far.  should be a separate function. or several functions.
+                # If results exist, parse further to obtain full db_gen
+                if block_list is not None:
+                    # Parse quality information
+                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
+                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
+                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
+                    if block_list[0][-1] == '-':
+                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
+                                                           IUPAC.ambiguous_dna).reverse_complement())
+
+                    # Parse V, D, and J calls
+                    call_str = ' '.join(block_list[0])
+                    v_call = parseAllele(call_str, v_allele_regex, action='list')
+                    d_call = parseAllele(call_str, d_allele_regex, action='list')
+                    j_call = parseAllele(call_str, j_allele_regex, action='list')
+                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
+                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
+                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
+
+                    # Parse junction sequence
+                    # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
+                    # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
+
+                    # TODO:  IgBLAST does a stupid and doesn't output block #3 sometimes. why?
+                    # TODO:  maybe we should fail these. they look craptastic.
+                    #pd.set_option('display.width', 500)
+                    #print query_name, len(block_list), hit_idx
+                    #for i, x in enumerate(block_list):
+                    #    print '[%i]' % i
+                    #    print x
+
+                    # Parse segment start and stop positions
+                    hit_df = block_list[-1]
+
+                    # Alignment info block
+                    #  0:  segment
+                    #  1:  query id
+                    #  2:  subject id
+                    #  3:  % identity
+                    #  4:  alignment length
+                    #  5:  mismatches
+                    #  6:  gap opens
+                    #  7:  gaps
+                    #  8:  q. start
+                    #  9:  q. end
+                    # 10:  s. start
+                    # 11:  s. end
+                    # 12:  evalue
+                    # 13:  bit score
+                    # 14:  query seq
+                    # 15:  subject seq
+                    # 16:  btop
+
+                    # If V call exists, parse V alignment information
+                    seq_vdj = ''
+                    if v_call is not None:
+                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
+                        # Germline positions
+                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
+                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
+                        # Query sequence positions
+                        db_gen['V_SEQ_START'] = int(v_align[8])
+                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
+
+                        if int(v_align[6]) == 0:
+                            db_gen['INDELS'] = 'F'
+                        else:
+                            db_gen['INDELS'] = 'T'
+                            # Set functional to none so record gets tossed (junction will be wrong)
+                            # db_gen['FUNCTIONAL'] = None
+
+                        # V alignment scores
+                        if score_fields:
+                            try: db_gen['V_SCORE'] = float(v_align[13])
+                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
+
+                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
+                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
+
+                            try: db_gen['V_EVALUE'] = float(v_align[12])
+                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
+
+                            try: db_gen['V_BTOP'] = v_align[16]
+                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
+
+                        # Update VDJ sequence, removing insertions
+                        start = 0
+                        for m in re.finditer(r'-', v_align[15]):
+                            ins = m.start()
+                            seq_vdj += v_align[14][start:ins]
+                            start = ins + 1
+                        seq_vdj += v_align[14][start:]
+
+                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
+                    # If D call exists, parse D alignment information
+                    if d_call is not None:
+                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]
+
+                        # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though.
+                        # Determine N-region length and amount of J overlap with V or D alignment
+                        overlap = 0
+                        if v_call is not None:
+                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
+                            if n1_len < 0:
+                                db_gen['N1_LENGTH'] = 0
+                                overlap = abs(n1_len)
+                            else:
+                                db_gen['N1_LENGTH'] = n1_len
+                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
+                                n1_end = int(d_align[8])-1
+                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
+
+                        # Query sequence positions
+                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
+                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
+
+                        # Germline positions
+                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
+                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
+
+                        # Update VDJ sequence, removing insertions
+                        start = overlap
+                        for m in re.finditer(r'-', d_align[15]):
+                            ins = m.start()
+                            seq_vdj += d_align[14][start:ins]
+                            start = ins + 1
+                        seq_vdj += d_align[14][start:]
+
+                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
+                    # If J call exists, parse J alignment information
+                    if j_call is not None:
+                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]
+
+                        # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though.
+                        # Determine N-region length and amount of J overlap with V or D alignment
+                        overlap = 0
+                        if d_call is not None:
+                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
+                            if n2_len < 0:
+                                db_gen['N2_LENGTH'] = 0
+                                overlap = abs(n2_len)
+                            else:
+                                db_gen['N2_LENGTH'] = n2_len
+                                n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
+                                n2_end = int(j_align[8])-1
+                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
+                        elif v_call is not None:
+                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
+                            if n1_len < 0:
+                                db_gen['N1_LENGTH'] = 0
+                                overlap = abs(n1_len)
+                            else:
+                                db_gen['N1_LENGTH'] = n1_len
+                                n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
+                                n1_end = int(j_align[8])-1
+                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
+                        else:
+                            db_gen['N1_LENGTH'] = 0
+
+                        # Query positions
+                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
+                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
+
+                        # Germline positions
+                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
+                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
+
+                        # J alignment scores
+                        if score_fields:
+                            try: db_gen['J_SCORE'] = float(j_align[13])
+                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
+
+                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
+                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
+
+                            try: db_gen['J_EVALUE'] = float(j_align[12])
+                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
+
+                            try: db_gen['J_BTOP'] = j_align[16]
+                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
+
+                        # Update VDJ sequence, removing insertions
+                        start = overlap
+                        for m in re.finditer(r'-', j_align[15]):
+                            ins = m.start()
+                            seq_vdj += j_align[14][start:ins]
+                            start = ins + 1
+                        seq_vdj += j_align[14][start:]
+
+                    db_gen['SEQUENCE_VDJ'] = seq_vdj
+
+                    # Create IMGT-gapped sequence and infer IMGT junction
+                    if v_call is not None:
+                        db_gen = gapV(db_gen, repo_dict)
+                        if j_call is not None:
+                            db_gen = getIMGTJunc(db_gen, repo_dict)
+
+                    # FWR and CDR regions
+                    if region_fields: getRegions(db_gen)
+
+                yield IgRecord(db_gen)
+
+
+# TODO:  should be more readable
+def readIMGT(imgt_files, score_fields=False, region_fields=False):
+    """
+    Reads IMGT/HighV-Quest output
+
+    Arguments: 
+    imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
+    score_fields = if True parse alignment scores
+    region_fields = if True add FWR and CDR region fields
+    
+    Returns: 
+    a generator of dictionaries containing alignment data
+    """
+    imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
+    # Create a dictionary for each sequence alignment and yield its generator
+    for sm, gp, nt, jn in zip(*imgt_iters):
+        if len(set([sm['Sequence ID'],
+                    gp['Sequence ID'],
+                    nt['Sequence ID'],
+                    jn['Sequence ID']])) != 1:
+            sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
+                     % sm['Sequence ID'])
+
+        db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
+                  'SEQUENCE_INPUT': sm['Sequence']}
+
+        if 'No results' not in sm['Functionality']:
+            db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
+                                                 ('unprod' in sm['Functionality'])]
+            db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
+                                               ('out-of-frame' in sm['JUNCTION frame'])],
+            db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
+                                           ('unprod' in sm['Functionality'])]
+            db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
+                                                         'missing' in sm['V-REGION potential ins/del']))) +
+                                                         ('unprod' in sm['Functionality'])]
+            db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
+                                              sm['V-REGION insertions'],
+                                              sm['V-REGION deletions']))]
+
+            db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
+            db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
+
+            db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
+            db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
+            db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
+
+            v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
+            db_gen['V_SEQ_START'] = nt['V-REGION start']
+            db_gen['V_SEQ_LENGTH'] = v_seq_length
+            db_gen['V_GERM_START_IMGT'] = 1
+            db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
+
+            db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
+                                                       jn['N-REGION-nt nb'],
+                                                       jn['N1-REGION-nt nb'],
+                                                       jn["P5'D-nt nb"]] if i)
+            db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
+                                                         jn["P3'V-nt nb"],
+                                                         jn['N-REGION-nt nb'],
+                                                         jn['N1-REGION-nt nb'],
+                                                         jn["P5'D-nt nb"]] if i)
+            db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
+            db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
+            db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
+            db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
+                                                       jn['N2-REGION-nt nb'],
+                                                       jn["P5'J-nt nb"]] if i)
+
+            db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
+                                                         jn["P3'V-nt nb"],
+                                                         jn['N-REGION-nt nb'],
+                                                         jn['N1-REGION-nt nb'],
+                                                         jn["P5'D-nt nb"],
+                                                         jn["D-REGION-nt nb"],
+                                                         jn["P3'D-nt nb"],
+                                                         jn['N2-REGION-nt nb'],
+                                                         jn["P5'J-nt nb"]] if i)
+            db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
+            db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
+            db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
+
+            db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
+            db_gen['JUNCTION'] = jn['JUNCTION']
+
+            # Alignment scores
+            if score_fields:
+                try:  db_gen['V_SCORE'] = float(sm['V-REGION score'])
+                except (TypeError, ValueError):  db_gen['V_SCORE'] = 'None'
+
+                try:  db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
+                except (TypeError, ValueError):  db_gen['V_IDENTITY'] = 'None'
+
+                try:  db_gen['J_SCORE'] = float(sm['J-REGION score'])
+                except (TypeError, ValueError):  db_gen['J_SCORE'] = 'None'
+
+                try:  db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
+                except (TypeError, ValueError):  db_gen['J_IDENTITY'] = 'None'
+
+            # FWR and CDR regions
+            if region_fields: getRegions(db_gen)
+        else:
+            db_gen['V_CALL'] = 'None'
+            db_gen['D_CALL'] = 'None'
+            db_gen['J_CALL'] = 'None'
+
+        yield IgRecord(db_gen)
+
+    
+def getIDforIMGT(seq_file):
+    """
+    Create a sequence ID translation using IMGT truncation
+    
+    Arguments: 
+    seq_file = a fasta file of sequences input to IMGT
+                    
+    Returns: 
+    a dictionary of {truncated ID: full seq description} 
+    """
+    
+    # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
+    ids = {}
+    for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
+        if len(rec.description) <= 50:
+            id_key = rec.description
+        else:
+            id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50])
+        ids.update({id_key:rec.description})
+
+    return ids
+
+
+def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
+            score_fields=False, region_fields=False, out_args=default_out_args):
+    """
+    Writes tab-delimited database file in output directory
+    
+    Arguments:
+    db_gen = a generator of IgRecord objects containing alignment data
+    file_prefix = directory and prefix for CLIP tab-delim file
+    total_count = number of records (for progress bar)
+    id_dict = a dictionary of {IMGT ID: full seq description}
+    no_parse = if ID is to be parsed for pRESTO output with default delimiters
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    pass_file = "%s_db-pass.tab" % file_prefix
+    fail_file = "%s_db-fail.tab" % file_prefix
+    ordered_fields = ['SEQUENCE_ID',
+                      'SEQUENCE_INPUT',
+                      'FUNCTIONAL',
+                      'IN_FRAME',
+                      'STOP',
+                      'MUTATED_INVARIANT',
+                      'INDELS',
+                      'V_CALL',
+                      'D_CALL',
+                      'J_CALL',
+                      'SEQUENCE_VDJ',
+                      'SEQUENCE_IMGT',
+                      'V_SEQ_START',
+                      'V_SEQ_LENGTH',
+                      'V_GERM_START_VDJ',
+                      'V_GERM_LENGTH_VDJ',
+                      'V_GERM_START_IMGT',
+                      'V_GERM_LENGTH_IMGT',
+                      'N1_LENGTH',
+                      'D_SEQ_START',
+                      'D_SEQ_LENGTH',
+                      'D_GERM_START',
+                      'D_GERM_LENGTH',
+                      'N2_LENGTH',
+                      'J_SEQ_START',
+                      'J_SEQ_LENGTH',
+                      'J_GERM_START',
+                      'J_GERM_LENGTH',
+                      'JUNCTION_LENGTH',
+                      'JUNCTION']
+
+    if score_fields:
+        ordered_fields.extend(['V_SCORE',
+                               'V_IDENTITY',
+                               'V_EVALUE',
+                               'V_BTOP',
+                               'J_SCORE',
+                               'J_IDENTITY',
+                               'J_EVALUE',
+                               'J_BTOP'])
+
+    if region_fields:
+        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
+                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
+
+
+    # TODO:  This is not the best approach. should pass in output fields.
+    # Initiate passed handle
+    pass_handle = None
+
+    # Open failed file
+    if out_args['failed']:
+        fail_handle = open(fail_file, 'wt')
+        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
+    else:
+        fail_handle = None
+        fail_writer = None
+
+    # Initialize counters and file
+    pass_writer = None
+    start_time = time()
+    rec_count = pass_count = fail_count = 0
+    for record in db_gen:
+        #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
+        printProgress(rec_count, total_count, 0.05, start_time)
+        rec_count += 1
+
+        # Count pass or fail
+        if (record.v_call == 'None' and record.j_call == 'None') or \
+                record.functional is None or \
+                not record.seq_vdj or \
+                not record.junction:
+            # print(record.v_call, record.j_call, record.functional, record.junction)
+            fail_count += 1
+            if fail_writer is not None: fail_writer.writerow(record.toDict())
+            continue
+        else: 
+            pass_count += 1
+            
+        # Build sample sequence description
+        if record.id in id_dict:
+            record.id = id_dict[record.id]
+
+        # Parse sequence description into new columns
+        if not no_parse:
+            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
+            record.id = record.annotations['ID']
+            del record.annotations['ID']
+
+        # TODO:  This is not the best approach. should pass in output fields.
+        # If first sequence, use parsed description to create new columns and initialize writer
+        if pass_writer is None:
+            if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
+            pass_handle = open(pass_file, 'wt')
+            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
+
+        # Write row to tab-delim CLIP file
+        pass_writer.writerow(record.toDict())
+    
+    # Print log
+    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
+    printProgress(rec_count, total_count, 0.05, start_time)
+
+    log = OrderedDict()
+    log['OUTPUT'] = pass_file
+    log['PASS'] = pass_count
+    log['FAIL'] = fail_count
+    log['END'] = 'MakeDb'
+    printLog(log)
+    
+    if pass_handle is not None: pass_handle.close()
+    if fail_handle is not None: fail_handle.close()
+
+
+# TODO:  may be able to merge with parseIMGT
+def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
+                 region_fields=False, out_args=default_out_args):
+    """
+    Main for IgBlast aligned sample sequences
+
+    Arguments:
+    igblast_output = IgBlast output file to process
+    seq_file = fasta file input to IgBlast (from which to get sequence)
+    repo = folder with germline repertoire files
+    no_parse = if ID is to be parsed for pRESTO output with default delimiters
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDB'
+    log['ALIGNER'] = 'IgBlast'
+    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
+    log['SEQ_FILE'] = os.path.basename(seq_file)
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+
+    # Get input sequence dictionary
+    seq_dict = getSeqforIgBlast(seq_file)
+
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.split(igblast_output)[0]
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):  os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countSeqFile(seq_file)
+
+    # Create
+    repo_dict = getRepo(repo)
+    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
+                               score_fields=score_fields, region_fields=region_fields)
+    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+
+# TODO:  may be able to merge with parseIgBlast
+def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
+              region_fields=False, out_args=default_out_args):
+    """
+    Main for IMGT aligned sample sequences
+
+    Arguments:
+    imgt_output = zipped file or unzipped folder output by IMGT
+    seq_file = FASTA file input to IMGT (from which to get seqID)
+    no_parse = if ID is to be parsed for pRESTO output with default delimiters
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+        
+    Returns: 
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IMGT'
+    log['ALIGN_RESULTS'] = imgt_output
+    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+    
+    # Get individual IMGT result files
+    temp_dir, imgt_files = extractIMGT(imgt_output)
+        
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.dirname(os.path.abspath(imgt_output))
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):  os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countDbFile(imgt_files[0])
+    
+    # Get (parsed) IDs from fasta file submitted to IMGT
+    id_dict = getIDforIMGT(seq_file) if seq_file else {}
+    
+    # Create
+    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
+                         region_fields=region_fields)
+    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+    # Delete temp directory
+    rmtree(temp_dir)
+
+
+def getArgParser():
+    """
+    Defines the ArgumentParser
+
+    Arguments: 
+    None
+                      
+    Returns: 
+    an ArgumentParser object
+    """
+    fields = dedent(
+             '''
+              output files:
+                  db-pass
+                      database of parsed alignment records.
+                  db-fail
+                      database with records failing alignment.
+
+              output fields:
+                  SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
+                  INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
+                  V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
+                  V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
+                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
+                  J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
+                  JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
+                  J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
+                  FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+              ''')
+                
+    # Define ArgumentParser
+    parser = ArgumentParser(description=__doc__, epilog=fields,
+                            formatter_class=CommonHelpFormatter)
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
+    subparsers = parser.add_subparsers(title='subcommands', dest='command',
+                                       help='Aligner used', metavar='')
+    # TODO:  This is a temporary fix for Python issue 9253
+    subparsers.required = True
+
+    # Parent parser    
+    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
+
+    # IgBlast Aligner
+    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
+                                           parents=[parser_parent],
+                                           formatter_class=CommonHelpFormatter)
+    parser_igblast.set_defaults(func=parseIgBlast)
+    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                required=True,
+                                help='''IgBLAST output files in format 7 with query sequence
+                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
+    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+                                help='''List of folders and/or fasta files containing
+                                     IMGT-gapped germline sequences corresponding to the
+                                     set of germlines used in the IgBLAST alignment.''')
+    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
+                                required=True,
+                                help='List of input FASTA files containing sequences')
+    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
+                                help='''Specify if input IDs should not be parsed to add
+                                     new columns to database.''')
+    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
+                                help='''Specify if alignment score metrics should be
+                                     included in the output. Adds the V_SCORE, V_IDENTITY,
+                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
+                                     J_BTOP, and J_EVALUE columns.''')
+    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
+                                help='''Specify if IMGT framework and CDR regions should be
+                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                     CDR3_IMGT columns.''')
+    
+    # IMGT aligner
+    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', 
+                                        parents=[parser_parent], 
+                                        formatter_class=CommonHelpFormatter)
+    imgt_arg_group =  parser_imgt.add_mutually_exclusive_group(required=True)
+    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                help='''Either zipped IMGT output files (.zip) or a folder
+                                     containing unzipped IMGT output files (which must
+                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                     and 6_Junction).''')
+    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
+                             required=False,
+                             help='List of input FASTA files containing sequences')
+    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', 
+                             help='''Specify if input IDs should not be parsed to add new
+                                  columns to database.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the V_SCORE, V_IDENTITY,
+                                  J_SCORE and J_IDENTITY. Note, this will also add
+                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
+                                  but they will be empty for IMGT output.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
+                             help='''Specify if IMGT framework and CDR regions should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_imgt.set_defaults(func=parseIMGT)
+
+    return parser
+    
+    
+if __name__ == "__main__":
+    """
+    Parses command line arguments and calls main
+    """
+    parser = getArgParser()    
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+
+    # Set no ID parsing if sequence files are not provided
+    if 'seq_files' in args_dict and not args_dict['seq_files']:
+        args_dict['no_parse'] = True
+
+    # Delete
+    if 'seq_files' in args_dict: del args_dict['seq_files']
+    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'command' in args_dict: del args_dict['command']
+    if 'func' in args_dict: del args_dict['func']           
+    
+    if args.command == 'imgt':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
+                                    if args.__dict__['seq_files'] else None
+            args.func(**args_dict)
+    elif args.command == 'igblast':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['igblast_output'] =  args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i]
+            args.func(**args_dict)
author	davidvanzessen
date	Thu, 13 Oct 2016 10:52:24 -0400
parents
children	22dddabe3637