shm_csr > comparison: change_o/MakeDb.py @ 52:22dddabe3637 (draft)

Uploaded

author | davidvanzessen |
---|---|
date | Tue, 23 May 2017 08:32:58 -0400 |
parents | c33d93683a09 |
children | |
comparison
51:8fa8836bd605 (previous revision) | 52:22dddabe3637 (this revision) |
---|---|
5 # Info | 5 # Info |
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' |
7 from changeo import __version__, __date__ | 7 from changeo import __version__, __date__ |
8 | 8 |
9 # Imports | 9 # Imports |
10 import csv | |
11 import os | 10 import os |
12 import re | |
13 import sys | 11 import sys |
14 import pandas as pd | |
15 import tarfile | |
16 import zipfile | |
17 from argparse import ArgumentParser | 12 from argparse import ArgumentParser |
18 from collections import OrderedDict | 13 from collections import OrderedDict |
19 from itertools import groupby | |
20 from shutil import rmtree | |
21 from tempfile import mkdtemp | |
22 from textwrap import dedent | 14 from textwrap import dedent |
23 from time import time | 15 from time import time |
24 from Bio import SeqIO | 16 from Bio import SeqIO |
25 from Bio.Seq import Seq | |
26 from Bio.Alphabet import IUPAC | |
27 | 17 |
28 # Presto and changeo imports | 18 # Presto and changeo imports |
29 from presto.Defaults import default_out_args | 19 from presto.Defaults import default_out_args |
30 from presto.Annotation import parseAnnotation | 20 from presto.Annotation import parseAnnotation |
31 from presto.IO import countSeqFile, printLog, printProgress | 21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile |
32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs | 22 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs |
33 from changeo.IO import getDbWriter, countDbFile, getRepo | 23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo |
34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ | 24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT |
35 j_allele_regex | 25 |
36 | 26 |
37 # Default parameters | 27 def getFilePrefix(aligner_output, out_args): |
38 default_delimiter = ('\t', ',', '-') | 28 """ |
39 | 29 Get file name prefix and create output directory |
40 | |
41 def gapV(ig_dict, repo_dict): | |
42 """ | |
43 Insert gaps into V region and update alignment information | |
44 | 30 |
45 Arguments: | 31 Arguments: |
46 ig_dict : Dictionary of parsed IgBlast output | 32 aligner_output : aligner output file or directory. |
47 repo_dict : Dictionary of IMGT gapped germline sequences | 33 out_args : dictionary of output arguments. |
48 | 34 |
49 Returns: | 35 Returns: |
50 dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields | 36 str : file name prefix. |
51 """ | 37 """ |
52 | 38 # Determine output directory |
53 seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] | 39 if not out_args['out_dir']: |
54 | 40 out_dir = os.path.dirname(os.path.abspath(aligner_output)) |
55 # Find gapped germline V segment | 41 else: |
56 vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') | 42 out_dir = os.path.abspath(out_args['out_dir']) |
57 vkey = (vgene, ) | 43 if not os.path.exists(out_dir): |
58 #TODO: Figure out else case | 44 os.mkdir(out_dir) |
59 if vkey in repo_dict: | 45 |
60 vgap = repo_dict[vkey] | 46 # Determine file prefix |
61 # Iterate over gaps in the germline segment | 47 if out_args['out_name']: |
62 gaps = re.finditer(r'\.', vgap) | 48 file_prefix = out_args['out_name'] |
63 gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 | 49 else: |
64 for gap in gaps: | 50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] |
65 i = gap.start() | 51 |
66 # Break if gap begins after V region | 52 return os.path.join(out_dir, file_prefix) |
67 if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: | 53 |
68 break | 54 |
69 # Insert gap into IMGT sequence | 55 def getSeqDict(seq_file): |
70 seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] | 56 """ |
71 # Update gap counter | 57 Create a dictionary from a sequence file. |
72 gapcount += 1 | |
73 ig_dict['SEQUENCE_IMGT'] = seq_imgt | |
74 # Update IMGT positioning information for V | |
75 ig_dict['V_GERM_START_IMGT'] = 1 | |
76 ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount | |
77 | |
78 return ig_dict | |
79 | |
80 | |
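For orientation, a minimal standalone sketch of the gap-propagation idea used by gapV() above: '.' gaps found in the IMGT-gapped germline are re-inserted into the ungapped query at the corresponding positions. The sequences are made up, and the sketch assumes no indels so the aligned germline length equals the query length.

```python
import re

# Simplified sketch of gapV(): copy '.' gaps from an IMGT-gapped germline
# into an ungapped query that starts at germline position germ_start (1-based).
# Assumes no indels, so the aligned germline length equals len(query).
def gap_query(query, gapped_germline, germ_start=1):
    seq_imgt = '.' * (germ_start - 1) + query
    gapcount = germ_start - 1
    for gap in re.finditer(r'\.', gapped_germline):
        i = gap.start()
        # Stop once the gap falls beyond the aligned V region
        if i >= len(query) + gapcount:
            break
        seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
        gapcount += 1
    return seq_imgt

print(gap_query('ACGTAC', 'AC.GT.AC'))  # -> AC.GT.AC
```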
81 def getIMGTJunc(ig_dict, repo_dict): | |
82 """ | |
83 Identify junction region by IMGT definition | |
84 | 58 |
85 Arguments: | 59 Arguments: |
86 ig_dict : Dictionary of parsed IgBlast output | 60 seq_file : sequence file. |
87 repo_dict : Dictionary of IMGT gapped germline sequences | |
88 | 61 |
89 Returns: | 62 Returns: |
90 dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields | 63 dict : sequence description as keys with Bio.SeqRecords as values. |
91 """ | 64 """ |
92 # Find germline J segment | 65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), |
93 jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') | 66 key_function=lambda x: x.description) |
94 jkey = (jgene, ) | 67 |
95 #TODO: Figure out else case | 68 return seq_dict |
96 if jkey in repo_dict: | 69 |
97 # Get germline J sequence | 70 |
98 jgerm = repo_dict[jkey] | 71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, |
99 jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] | 72 out_args=default_out_args): |
100 # Look for (F|W)GXG aa motif in nt sequence | 73 """ |
101 motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) | 74 Writes tab-delimited database file in output directory. |
102 aa_end = len(ig_dict['SEQUENCE_IMGT']) | |
103 #TODO: Figure out else case | |
104 if motif: | |
105 # print('\n', motif.group()) | |
106 aa_end = motif.start() - len(jgerm) + 3 | |
107 # Add fields to dict | |
108 ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] | |
109 ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) | |
110 | |
111 return ig_dict | |
112 | |
113 | |
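As a side note, the nucleotide regex used by getIMGTJunc() above encodes the conserved (F|W)-G-X-G amino-acid motif that closes the IMGT junction. A small hedged example with a made-up germline J fragment:

```python
import re

# Codons matched: TTT/TTC (F) or TGG (W), then GGN (G), NNN (X), GGN (G).
motif_re = re.compile(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]')

jgerm = 'ACTACTTTGACTACTGGGGCCAGGGAACC'  # hypothetical germline J fragment
m = motif_re.search(jgerm)
if m:
    # As above, the junction end is expressed relative to the end of jgerm
    aa_end = m.start() - len(jgerm) + 3
    print(m.group(), aa_end)  # -> TGGGGCCAGGGA -12
```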
114 def getRegions(ig_dict): | |
115 """ | |
116 Identify FWR and CDR regions by IMGT definition | |
117 | |
118 Arguments: | |
119 ig_dict : Dictionary of parsed alignment output | |
120 | |
121 Returns: | |
122 dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | |
123 CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields | |
124 """ | |
125 try: | |
126 seq_len = len(ig_dict['SEQUENCE_IMGT']) | |
127 ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] | |
128 except (KeyError, IndexError): | |
129 return ig_dict | |
130 | |
131 try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] | |
132 except (IndexError): return ig_dict | |
133 | |
134 try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] | |
135 except (IndexError): return ig_dict | |
136 | |
137 try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] | |
138 except (IndexError): return ig_dict | |
139 | |
140 try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] | |
141 except (IndexError): return ig_dict | |
142 | |
143 try: | |
144 cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] | |
145 ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] | |
146 ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] | |
147 except (KeyError, IndexError): | |
148 return ig_dict | |
149 | |
150 return ig_dict | |
151 | |
152 | |
153 def getSeqforIgBlast(seq_file): | |
154 """ | |
155 Fetch input sequences for IgBlast queries | |
156 | |
157 Arguments: | |
158 seq_file = a fasta file of sequences input to IgBlast | |
159 | |
160 Returns: | |
161 a dictionary of {ID:Seq} | |
162 """ | |
163 | |
164 seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) | |
165 | |
166 # Create a seq_dict ID translation using IDs truncated at the first space or 50 chars |
167 seqs = {} | |
168 for seq in seq_dict.values(): | |
169 seqs.update({seq.description:str(seq.seq)}) | |
170 | |
171 return seqs | |
172 | |
173 | |
174 def findLine(handle, query): | |
175 """ | |
176 Finds line with query string in file | |
177 | |
178 Arguments: | |
179 handle = file handle in which to search for line | |
180 query = query string for which to search in file | |
181 | |
182 Returns: | |
183 line from handle in which query string was found | |
184 """ | |
185 for line in handle: | |
186 if(re.match(query, line)): | |
187 return line | |
188 | |
189 | |
190 def extractIMGT(imgt_output): | |
191 """ | |
192 Extract necessary files from IMGT results, zipped or unzipped | |
193 | 75 |
194 Arguments: | 76 Arguments: |
195 imgt_output = zipped file or unzipped folder output by IMGT | 77 db : an iterator of IgRecord objects containing alignment data. |
196 | 78 fields : a list of ordered field names to write. |
79 file_prefix : directory and prefix for CLIP tab-delim file. | |
80 total_count : number of records (for progress bar). | |
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. | |
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
83 partial : if True put incomplete alignments in the pass file. | |
84 out_args : common output argument dictionary from parseCommonArgs. | |
85 | |
197 Returns: | 86 Returns: |
198 sorted list of filenames from which information will be read | 87 None |
199 """ | 88 """ |
200 #file_ext = os.path.splitext(imgt_output)[1].lower() | 89 # Function to check for valid records strictly |
201 imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') | 90 def _pass_strict(rec): |
202 temp_dir = mkdtemp() | 91 valid = [rec.v_call and rec.v_call != 'None', |
203 if zipfile.is_zipfile(imgt_output): | 92 rec.j_call and rec.j_call != 'None', |
204 # Open zip file | 93 rec.functional is not None, |
205 imgt_zip = zipfile.ZipFile(imgt_output, 'r') | 94 rec.seq_vdj, |
206 # Extract required files | 95 rec.junction] |
207 imgt_files = sorted([n for n in imgt_zip.namelist() \ | 96 return all(valid) |
208 if os.path.basename(n).startswith(imgt_flags)]) | 97 |
209 imgt_zip.extractall(temp_dir, imgt_files) | 98 # Function to check for valid records loosely |
210 # Define file list | 99 def _pass_gentle(rec): |
211 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | 100 valid = [rec.v_call and rec.v_call != 'None', |
212 elif os.path.isdir(imgt_output): | 101 rec.d_call and rec.d_call != 'None', |
213 # Find required files in folder | 102 rec.j_call and rec.j_call != 'None'] |
214 folder_files = [] | 103 return any(valid) |
215 for root, dirs, files in os.walk(imgt_output): | 104 |
216 folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) | 105 # Set pass criteria |
217 # Define file list | 106 _pass = _pass_gentle if partial else _pass_strict |
218 imgt_files = sorted([n for n in folder_files \ | 107 |
219 if os.path.basename(n).startswith(imgt_flags)]) | 108 # Define output file names |
220 elif tarfile.is_tarfile(imgt_output): | 109 pass_file = '%s_db-pass.tab' % file_prefix |
221 # Open zip file | 110 fail_file = '%s_db-fail.tab' % file_prefix |
222 imgt_tar = tarfile.open(imgt_output, 'r') | 111 |
223 # Extract required files | 112 # Initiate handles, writers and counters |
224 imgt_files = sorted([n for n in imgt_tar.getnames() \ | |
225 if os.path.basename(n).startswith(imgt_flags)]) | |
226 imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) | |
227 # Define file list | |
228 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | |
229 else: | |
230 sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') |
231 | |
232 if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files | |
233 sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) | |
234 elif len(imgt_files) < len(imgt_flags): | |
235 sys.exit('ERROR: Missing necessary file in IMGT output %s.' % imgt_output) |
236 | |
237 return temp_dir, imgt_files | |
238 | |
239 | |
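For readers unfamiliar with the archive handling in extractIMGT() above, a minimal sketch of the member filtering, assuming a hypothetical archive named imgt_results.zip:

```python
import os
import zipfile

# Keep only the IMGT result files whose basenames start with the required prefixes.
imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
with zipfile.ZipFile('imgt_results.zip', 'r') as imgt_zip:  # hypothetical archive
    wanted = sorted(n for n in imgt_zip.namelist()
                    if os.path.basename(n).startswith(imgt_flags))
    print(wanted)
```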
240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks | |
241 def readOneIgBlastResult(block): | |
242 """ | |
243 Parse a single IgBLAST query result | |
244 | |
245 Arguments: | |
246 block = itertools groupby object of single result | |
247 | |
248 Returns: | |
249 None if no results, otherwise list of DataFrames for each result block | |
250 """ | |
251 results = list() | |
252 i = 0 | |
253 for match, subblock in groupby(block, lambda l: l=='\n'): | |
254 if not match: | |
255 # Strip whitespace and comments | |
256 sub = [s.strip() for s in subblock if not s.startswith('#')] | |
257 | |
258 # Continue on empty block | |
259 if not sub: continue | |
260 else: i += 1 | |
261 | |
262 # Split by tabs | |
263 sub = [s.split('\t') for s in sub] | |
264 | |
265 # Append list for "V-(D)-J rearrangement summary" (i == 1) | |
266 # And "V-(D)-J junction details" (i == 2) | |
267 # Otherwise append DataFrame of subblock | |
268 if i == 1 or i == 2: | |
269 results.append(sub[0]) | |
270 else: | |
271 df = pd.DataFrame(sub) | |
272 if not df.empty: results.append(df) | |
273 | |
274 return results if results else None | |
275 | |
276 | |
277 # TODO: needs more speeds. pandas is probably to blame. | |
278 def readIgBlast(igblast_output, seq_dict, repo_dict, | |
279 score_fields=False, region_fields=False): | |
280 """ | |
281 Reads IgBlast output | |
282 | |
283 Arguments: | |
284 igblast_output = IgBlast output file (format 7) | |
285 seq_dict = a dictionary of {ID:Seq} from input fasta file | |
286 repo_dict = dictionary of IMGT gapped germline sequences | |
287 score_fields = if True parse alignment scores | |
288 region_fields = if True add FWR and CDR region fields | |
289 | |
290 Returns: | |
291 a generator of dictionaries containing alignment data | |
292 """ | |
293 | |
294 # Open IgBlast output file | |
295 with open(igblast_output) as f: | |
296 # Iterate over individual results (separated by # IGBLASTN) | |
297 for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): | |
298 block = list(block) | |
299 if not k1: | |
300 # TODO: move query name extraction into block parser readOneIgBlastResult(). | |
301 # Extract sequence ID | |
302 query_name = ' '.join(block[0].strip().split(' ')[2:]) | |
303 # Initialize db_gen to have ID and input sequence | |
304 db_gen = {'SEQUENCE_ID': query_name, | |
305 'SEQUENCE_INPUT': seq_dict[query_name]} | |
306 | |
307 # Parse further sub-blocks | |
308 block_list = readOneIgBlastResult(block) | |
309 | |
310 # TODO: this is indented pretty far. should be a separate function. or several functions. | |
311 # If results exist, parse further to obtain full db_gen | |
312 if block_list is not None: | |
313 # Parse quality information | |
314 db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' | |
315 db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' | |
316 db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' | |
317 if block_list[0][-1] == '-': | |
318 db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], | |
319 IUPAC.ambiguous_dna).reverse_complement()) | |
320 | |
321 # Parse V, D, and J calls | |
322 call_str = ' '.join(block_list[0]) | |
323 v_call = parseAllele(call_str, v_allele_regex, action='list') | |
324 d_call = parseAllele(call_str, d_allele_regex, action='list') | |
325 j_call = parseAllele(call_str, j_allele_regex, action='list') | |
326 db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' | |
327 db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' | |
328 db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' | |
329 | |
330 # Parse junction sequence | |
331 # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) | |
332 # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) | |
333 | |
334 # TODO: IgBLAST sometimes fails to output block #3. why? |
335 # TODO: maybe we should fail these. they look craptastic. | |
336 #pd.set_option('display.width', 500) | |
337 #print query_name, len(block_list), hit_idx | |
338 #for i, x in enumerate(block_list): | |
339 # print '[%i]' % i | |
340 # print x | |
341 | |
342 # Parse segment start and stop positions | |
343 hit_df = block_list[-1] | |
344 | |
345 # Alignment info block | |
346 # 0: segment | |
347 # 1: query id | |
348 # 2: subject id | |
349 # 3: % identity | |
350 # 4: alignment length | |
351 # 5: mismatches | |
352 # 6: gap opens | |
353 # 7: gaps | |
354 # 8: q. start | |
355 # 9: q. end | |
356 # 10: s. start | |
357 # 11: s. end | |
358 # 12: evalue | |
359 # 13: bit score | |
360 # 14: query seq | |
361 # 15: subject seq | |
362 # 16: btop | |
363 | |
364 # If V call exists, parse V alignment information | |
365 seq_vdj = '' | |
366 if v_call is not None: | |
367 v_align = hit_df[hit_df[0] == 'V'].iloc[0] | |
368 # Germline positions | |
369 db_gen['V_GERM_START_VDJ'] = int(v_align[10]) | |
370 db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 | |
371 # Query sequence positions | |
372 db_gen['V_SEQ_START'] = int(v_align[8]) | |
373 db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 | |
374 | |
375 if int(v_align[6]) == 0: | |
376 db_gen['INDELS'] = 'F' | |
377 else: | |
378 db_gen['INDELS'] = 'T' | |
379 # Set functional to none so record gets tossed (junction will be wrong) | |
380 # db_gen['FUNCTIONAL'] = None | |
381 | |
382 # V alignment scores | |
383 if score_fields: | |
384 try: db_gen['V_SCORE'] = float(v_align[13]) | |
385 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | |
386 | |
387 try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 | |
388 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | |
389 | |
390 try: db_gen['V_EVALUE'] = float(v_align[12]) | |
391 except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' | |
392 | |
393 try: db_gen['V_BTOP'] = v_align[16] | |
394 except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' | |
395 | |
396 # Update VDJ sequence, removing insertions | |
397 start = 0 | |
398 for m in re.finditer(r'-', v_align[15]): | |
399 ins = m.start() | |
400 seq_vdj += v_align[14][start:ins] | |
401 start = ins + 1 | |
402 seq_vdj += v_align[14][start:] | |
403 | |
404 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. | |
405 # If D call exists, parse D alignment information | |
406 if d_call is not None: | |
407 d_align = hit_df[hit_df[0] == 'D'].iloc[0] | |
408 | |
409 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. | |
410 # Determine N-region length and amount of J overlap with V or D alignment | |
411 overlap = 0 | |
412 if v_call is not None: | |
413 n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | |
414 if n1_len < 0: | |
415 db_gen['N1_LENGTH'] = 0 | |
416 overlap = abs(n1_len) | |
417 else: | |
418 db_gen['N1_LENGTH'] = n1_len | |
419 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) | |
420 n1_end = int(d_align[8])-1 | |
421 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | |
422 | |
423 # Query sequence positions | |
424 db_gen['D_SEQ_START'] = int(d_align[8]) + overlap | |
425 db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) | |
426 | |
427 # Germline positions | |
428 db_gen['D_GERM_START'] = int(d_align[10]) + overlap | |
429 db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) | |
430 | |
431 # Update VDJ sequence, removing insertions | |
432 start = overlap | |
433 for m in re.finditer(r'-', d_align[15]): | |
434 ins = m.start() | |
435 seq_vdj += d_align[14][start:ins] | |
436 start = ins + 1 | |
437 seq_vdj += d_align[14][start:] | |
438 | |
439 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. | |
440 # If J call exists, parse J alignment information | |
441 if j_call is not None: | |
442 j_align = hit_df[hit_df[0] == 'J'].iloc[0] | |
443 | |
444 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. | |
445 # Determine N-region length and amount of J overlap with V or D alignment | |
446 overlap = 0 | |
447 if d_call is not None: | |
448 n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) | |
449 if n2_len < 0: | |
450 db_gen['N2_LENGTH'] = 0 | |
451 overlap = abs(n2_len) | |
452 else: | |
453 db_gen['N2_LENGTH'] = n2_len | |
454 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) | |
455 n2_end = int(j_align[8])-1 | |
456 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] | |
457 elif v_call is not None: | |
458 n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | |
459 if n1_len < 0: | |
460 db_gen['N1_LENGTH'] = 0 | |
461 overlap = abs(n1_len) | |
462 else: | |
463 db_gen['N1_LENGTH'] = n1_len | |
464 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) | |
465 n1_end = int(j_align[8])-1 | |
466 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | |
467 else: | |
468 db_gen['N1_LENGTH'] = 0 | |
469 | |
470 # Query positions | |
471 db_gen['J_SEQ_START'] = int(j_align[8]) + overlap | |
472 db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) | |
473 | |
474 # Germline positions | |
475 db_gen['J_GERM_START'] = int(j_align[10]) + overlap | |
476 db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) | |
477 | |
478 # J alignment scores | |
479 if score_fields: | |
480 try: db_gen['J_SCORE'] = float(j_align[13]) | |
481 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | |
482 | |
483 try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 | |
484 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | |
485 | |
486 try: db_gen['J_EVALUE'] = float(j_align[12]) | |
487 except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' | |
488 | |
489 try: db_gen['J_BTOP'] = j_align[16] | |
490 except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' | |
491 | |
492 # Update VDJ sequence, removing insertions | |
493 start = overlap | |
494 for m in re.finditer(r'-', j_align[15]): | |
495 ins = m.start() | |
496 seq_vdj += j_align[14][start:ins] | |
497 start = ins + 1 | |
498 seq_vdj += j_align[14][start:] | |
499 | |
500 db_gen['SEQUENCE_VDJ'] = seq_vdj | |
501 | |
502 # Create IMGT-gapped sequence and infer IMGT junction | |
503 if v_call is not None: | |
504 db_gen = gapV(db_gen, repo_dict) | |
505 if j_call is not None: | |
506 db_gen = getIMGTJunc(db_gen, repo_dict) | |
507 | |
508 # FWR and CDR regions | |
509 if region_fields: getRegions(db_gen) | |
510 | |
511 yield IgRecord(db_gen) | |
512 | |
513 | |
514 # TODO: should be more readable | |
515 def readIMGT(imgt_files, score_fields=False, region_fields=False): | |
516 """ | |
517 Reads IMGT/HighV-Quest output | |
518 | |
519 Arguments: | |
520 imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 | |
521 score_fields = if True parse alignment scores | |
522 region_fields = if True add FWR and CDR region fields | |
523 | |
524 Returns: | |
525 a generator of dictionaries containing alignment data | |
526 """ | |
527 imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] | |
528 # Create a dictionary for each sequence alignment and yield its generator | |
529 for sm, gp, nt, jn in zip(*imgt_iters): | |
530 if len(set([sm['Sequence ID'], | |
531 gp['Sequence ID'], | |
532 nt['Sequence ID'], | |
533 jn['Sequence ID']])) != 1: | |
534 sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ | |
535 % sm['Sequence ID']) | |
536 | |
537 db_gen = {'SEQUENCE_ID': sm['Sequence ID'], | |
538 'SEQUENCE_INPUT': sm['Sequence']} | |
539 | |
540 if 'No results' not in sm['Functionality']: | |
541 db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + | |
542 ('unprod' in sm['Functionality'])] | |
543 db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + | |
544 ('out-of-frame' in sm['JUNCTION frame'])] |
545 db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + | |
546 ('unprod' in sm['Functionality'])] | |
547 db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], | |
548 'missing' in sm['V-REGION potential ins/del']))) + | |
549 ('unprod' in sm['Functionality'])] | |
550 db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], | |
551 sm['V-REGION insertions'], | |
552 sm['V-REGION deletions']))] | |
553 | |
554 db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] | |
555 db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] | |
556 | |
557 db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) | |
558 db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) | |
559 db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) | |
560 | |
561 v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 | |
562 db_gen['V_SEQ_START'] = nt['V-REGION start'] | |
563 db_gen['V_SEQ_LENGTH'] = v_seq_length | |
564 db_gen['V_GERM_START_IMGT'] = 1 | |
565 db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 | |
566 | |
567 db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], | |
568 jn['N-REGION-nt nb'], | |
569 jn['N1-REGION-nt nb'], | |
570 jn["P5'D-nt nb"]] if i) | |
571 db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, | |
572 jn["P3'V-nt nb"], | |
573 jn['N-REGION-nt nb'], | |
574 jn['N1-REGION-nt nb'], | |
575 jn["P5'D-nt nb"]] if i) | |
576 db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | |
577 db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 | |
578 db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | |
579 db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], | |
580 jn['N2-REGION-nt nb'], | |
581 jn["P5'J-nt nb"]] if i) | |
582 | |
583 db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, | |
584 jn["P3'V-nt nb"], | |
585 jn['N-REGION-nt nb'], | |
586 jn['N1-REGION-nt nb'], | |
587 jn["P5'D-nt nb"], | |
588 jn["D-REGION-nt nb"], | |
589 jn["P3'D-nt nb"], | |
590 jn['N2-REGION-nt nb'], | |
591 jn["P5'J-nt nb"]] if i) | |
592 db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 | |
593 db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 | |
594 db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 | |
595 | |
596 db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 | |
597 db_gen['JUNCTION'] = jn['JUNCTION'] | |
598 | |
599 # Alignment scores | |
600 if score_fields: | |
601 try: db_gen['V_SCORE'] = float(sm['V-REGION score']) | |
602 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | |
603 | |
604 try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 | |
605 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | |
606 | |
607 try: db_gen['J_SCORE'] = float(sm['J-REGION score']) | |
608 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | |
609 | |
610 try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 | |
611 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | |
612 | |
613 # FWR and CDR regions | |
614 if region_fields: getRegions(db_gen) | |
615 else: | |
616 db_gen['V_CALL'] = 'None' | |
617 db_gen['D_CALL'] = 'None' | |
618 db_gen['J_CALL'] = 'None' | |
619 | |
620 yield IgRecord(db_gen) | |
621 | |
622 | |
623 def getIDforIMGT(seq_file): | |
624 """ | |
625 Create a sequence ID translation using IMGT truncation | |
626 | |
627 Arguments: | |
628 seq_file = a fasta file of sequences input to IMGT | |
629 | |
630 Returns: | |
631 a dictionary of {truncated ID: full seq description} | |
632 """ | |
633 | |
634 # Create a seq_dict ID translation using IDs truncated at the first space or 50 chars |
635 ids = {} | |
636 for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): | |
637 if len(rec.description) <= 50: | |
638 id_key = rec.description | |
639 else: | |
640 id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) | |
641 ids.update({id_key:rec.description}) | |
642 | |
643 return ids | |
644 | |
645 | |
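A small hedged illustration of the truncation rule that getIDforIMGT() above compensates for (the header string is hypothetical): IMGT keeps only the first 50 characters of a description and problem characters are replaced, so the mapping is used to restore the full header later.

```python
import re

desc = 'M01234:56:000000000-ABCDE:1:1101:12345:6789 1:N:0:1 sample=patient1'  # hypothetical header
id_key = desc if len(desc) <= 50 else re.sub(r'\||\s|!|&|\*|<|>|\?', '_', desc[:50])
print(id_key)          # truncated to 50 chars with spaces replaced by '_'
print({id_key: desc})  # the translation entry stored by getIDforIMGT()
```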
646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, | |
647 score_fields=False, region_fields=False, out_args=default_out_args): | |
648 """ | |
649 Writes tab-delimited database file in output directory | |
650 | |
651 Arguments: | |
652 db_gen = a generator of IgRecord objects containing alignment data | |
653 file_prefix = directory and prefix for CLIP tab-delim file | |
654 total_count = number of records (for progress bar) | |
655 id_dict = a dictionary of {IMGT ID: full seq description} | |
656 no_parse = if ID is to be parsed for pRESTO output with default delimiters | |
657 score_fields = if True add alignment score fields to output file | |
658 region_fields = if True add FWR and CDR region fields to output file | |
659 out_args = common output argument dictionary from parseCommonArgs | |
660 | |
661 Returns: | |
662 None | |
663 """ | |
664 pass_file = "%s_db-pass.tab" % file_prefix | |
665 fail_file = "%s_db-fail.tab" % file_prefix | |
666 ordered_fields = ['SEQUENCE_ID', | |
667 'SEQUENCE_INPUT', | |
668 'FUNCTIONAL', | |
669 'IN_FRAME', | |
670 'STOP', | |
671 'MUTATED_INVARIANT', | |
672 'INDELS', | |
673 'V_CALL', | |
674 'D_CALL', | |
675 'J_CALL', | |
676 'SEQUENCE_VDJ', | |
677 'SEQUENCE_IMGT', | |
678 'V_SEQ_START', | |
679 'V_SEQ_LENGTH', | |
680 'V_GERM_START_VDJ', | |
681 'V_GERM_LENGTH_VDJ', | |
682 'V_GERM_START_IMGT', | |
683 'V_GERM_LENGTH_IMGT', | |
684 'N1_LENGTH', | |
685 'D_SEQ_START', | |
686 'D_SEQ_LENGTH', | |
687 'D_GERM_START', | |
688 'D_GERM_LENGTH', | |
689 'N2_LENGTH', | |
690 'J_SEQ_START', | |
691 'J_SEQ_LENGTH', | |
692 'J_GERM_START', | |
693 'J_GERM_LENGTH', | |
694 'JUNCTION_LENGTH', | |
695 'JUNCTION'] | |
696 | |
697 if score_fields: | |
698 ordered_fields.extend(['V_SCORE', | |
699 'V_IDENTITY', | |
700 'V_EVALUE', | |
701 'V_BTOP', | |
702 'J_SCORE', | |
703 'J_IDENTITY', | |
704 'J_EVALUE', | |
705 'J_BTOP']) | |
706 | |
707 if region_fields: | |
708 ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', | |
709 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) | |
710 | |
711 | |
712 # TODO: This is not the best approach. should pass in output fields. | |
713 # Initiate passed handle | |
714 pass_handle = None | 113 pass_handle = None |
715 | 114 fail_handle = None |
716 # Open failed file | |
717 if out_args['failed']: | |
718 fail_handle = open(fail_file, 'wt') | |
719 fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) | |
720 else: | |
721 fail_handle = None | |
722 fail_writer = None | |
723 | |
724 # Initialize counters and file | |
725 pass_writer = None | 115 pass_writer = None |
116 fail_writer = None | |
726 start_time = time() | 117 start_time = time() |
727 rec_count = pass_count = fail_count = 0 | 118 rec_count = pass_count = fail_count = 0 |
728 for record in db_gen: | 119 |
729 #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | 120 # Validate and write output |
730 printProgress(rec_count, total_count, 0.05, start_time) | 121 printProgress(0, total_count, 0.05, start_time) |
731 rec_count += 1 | 122 for i, record in enumerate(db, start=1): |
732 | 123 |
733 # Count pass or fail | 124 # Replace sequence description with full string, if required |
734 if (record.v_call == 'None' and record.j_call == 'None') or \ | 125 if id_dict is not None and record.id in id_dict: |
735 record.functional is None or \ | |
736 not record.seq_vdj or \ | |
737 not record.junction: | |
738 # print(record.v_call, record.j_call, record.functional, record.junction) | |
739 fail_count += 1 | |
740 if fail_writer is not None: fail_writer.writerow(record.toDict()) | |
741 continue | |
742 else: | |
743 pass_count += 1 | |
744 | |
745 # Build sample sequence description | |
746 if record.id in id_dict: | |
747 record.id = id_dict[record.id] | 126 record.id = id_dict[record.id] |
748 | 127 |
749 # Parse sequence description into new columns | 128 # Parse sequence description into new columns |
750 if not no_parse: | 129 if not no_parse: |
751 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | 130 try: |
752 record.id = record.annotations['ID'] | 131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) |
753 del record.annotations['ID'] | 132 record.id = record.annotations['ID'] |
754 | 133 del record.annotations['ID'] |
755 # TODO: This is not the best approach. should pass in output fields. | 134 |
756 # If first sequence, use parsed description to create new columns and initialize writer | 135 # TODO: This is not the best approach. should pass in output fields. |
757 if pass_writer is None: | 136 # If first record, use parsed description to define extra columns |
758 if not no_parse: ordered_fields.extend(list(record.annotations.keys())) | 137 if i == 1: fields.extend(list(record.annotations.keys())) |
759 pass_handle = open(pass_file, 'wt') | 138 except IndexError: |
760 pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) | 139 # Could not parse pRESTO-style annotations so fall back to no parse |
761 | 140 no_parse = True |
762 # Write row to tab-delim CLIP file | 141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') |
763 pass_writer.writerow(record.toDict()) | 142 |
764 | 143 # Count pass or fail and write to appropriate file |
765 # Print log | 144 if _pass(record): |
766 #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | 145 # Open pass file |
767 printProgress(rec_count, total_count, 0.05, start_time) | 146 if pass_writer is None: |
768 | 147 pass_handle = open(pass_file, 'wt') |
148 pass_writer = getDbWriter(pass_handle, add_fields=fields) | |
149 | |
150 # Write row to pass file | |
151 pass_count += 1 | |
152 pass_writer.writerow(record.toDict()) | |
153 else: | |
154 # Open failed file | |
155 if out_args['failed'] and fail_writer is None: | |
156 fail_handle = open(fail_file, 'wt') | |
157 fail_writer = getDbWriter(fail_handle, add_fields=fields) | |
158 | |
159 # Write row to fail file if specified | |
160 fail_count += 1 | |
161 if fail_writer is not None: | |
162 fail_writer.writerow(record.toDict()) | |
163 | |
164 # Print progress | |
165 printProgress(i, total_count, 0.05, start_time) | |
166 | |
167 # Print console log |
769 log = OrderedDict() | 168 log = OrderedDict() |
770 log['OUTPUT'] = pass_file | 169 log['OUTPUT'] = pass_file |
771 log['PASS'] = pass_count | 170 log['PASS'] = pass_count |
772 log['FAIL'] = fail_count | 171 log['FAIL'] = fail_count |
773 log['END'] = 'MakeDb' | 172 log['END'] = 'MakeDb' |
775 | 174 |
776 if pass_handle is not None: pass_handle.close() | 175 if pass_handle is not None: pass_handle.close() |
777 if fail_handle is not None: fail_handle.close() | 176 if fail_handle is not None: fail_handle.close() |
778 | 177 |
779 | 178 |
780 # TODO: may be able to merge with parseIMGT | 179 # TODO: may be able to merge with other mains |
781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, | 180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, |
782 region_fields=False, out_args=default_out_args): | 181 parse_scores=False, parse_regions=False, parse_junction=False, |
783 """ | 182 out_args=default_out_args): |
784 Main for IgBlast aligned sample sequences | 183 """ |
184 Main for IMGT aligned sample sequences. | |
785 | 185 |
786 Arguments: | 186 Arguments: |
787 igblast_output = IgBlast output file to process | 187 aligner_output : zipped file or unzipped folder output by IMGT. |
788 seq_file = fasta file input to IgBlast (from which to get sequence) | 188 seq_file : FASTA file input to IMGT (from which to get seqID). |
789 repo = folder with germline repertoire files | 189 no_parse : if ID is to be parsed for pRESTO output with default delimiters. |
790 no_parse = if ID is to be parsed for pRESTO output with default delimiters | 190 partial : If True put incomplete alignments in the pass file. |
791 score_fields = if True add alignment score fields to output file | 191 parse_scores : if True add alignment score fields to output file. |
792 region_fields = if True add FWR and CDR region fields to output file | 192 parse_regions : if True add FWR and CDR region fields to output file. |
793 out_args = common output argument dictionary from parseCommonArgs | 193 out_args : common output argument dictionary from parseCommonArgs. |
794 | 194 |
795 Returns: | 195 Returns: |
796 None | 196 None |
197 """ | |
198 # Print parameter info | |
199 log = OrderedDict() | |
200 log['START'] = 'MakeDb' | |
201 log['ALIGNER'] = 'IMGT' | |
202 log['ALIGNER_OUTPUT'] = aligner_output | |
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | |
204 log['NO_PARSE'] = no_parse | |
205 log['PARTIAL'] = partial | |
206 log['SCORES'] = parse_scores | |
207 log['REGIONS'] = parse_regions | |
208 log['JUNCTION'] = parse_junction | |
209 printLog(log) | |
210 | |
211 start_time = time() | |
212 printMessage('Loading sequence files', start_time=start_time, width=25) | |
213 # Extract IMGT files | |
214 temp_dir, imgt_files = extractIMGT(aligner_output) | |
215 # Count records in IMGT files | |
216 total_count = countDbFile(imgt_files['summary']) | |
217 # Get (parsed) IDs from fasta file submitted to IMGT | |
218 id_dict = getIDforIMGT(seq_file) if seq_file else {} | |
219 printMessage('Done', start_time=start_time, end=True, width=25) | |
220 | |
221 # Parse IMGT output and write db | |
222 with open(imgt_files['summary'], 'r') as summary_handle, \ | |
223 open(imgt_files['gapped'], 'r') as gapped_handle, \ | |
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \ | |
225 open(imgt_files['junction'], 'r') as junction_handle: | |
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, | |
227 parse_scores=parse_scores, parse_regions=parse_regions, | |
228 parse_junction=parse_junction) | |
229 file_prefix = getFilePrefix(aligner_output, out_args) | |
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, | |
231 no_parse=no_parse, partial=partial, out_args=out_args) | |
232 | |
233 # Cleanup temp directory | |
234 temp_dir.cleanup() | |
235 | |
236 return None | |
237 | |
238 | |
239 # TODO: may be able to merge with other mains | |
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, | |
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, | |
242 out_args=default_out_args): | |
243 """ | |
244 Main for IgBLAST aligned sample sequences. | |
245 | |
246 Arguments: | |
247 aligner_output : IgBLAST output file to process. | |
248 seq_file : fasta file input to IgBlast (from which to get sequence). | |
249 repo : folder with germline repertoire files. | |
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
251 partial : If True put incomplete alignments in the pass file. | |
252 parse_regions : if True add FWR and CDR fields to output file. | |
253 parse_scores : if True add alignment score fields to output file. | |
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST | |
255 out_args : common output argument dictionary from parseCommonArgs. | |
256 | |
257 Returns: | |
258 None | |
797 """ | 259 """ |
798 # Print parameter info | 260 # Print parameter info |
799 log = OrderedDict() | 261 log = OrderedDict() |
800 log['START'] = 'MakeDB' | 262 log['START'] = 'MakeDB' |
801 log['ALIGNER'] = 'IgBlast' | 263 log['ALIGNER'] = 'IgBlast' |
802 log['ALIGN_RESULTS'] = os.path.basename(igblast_output) | 264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) |
803 log['SEQ_FILE'] = os.path.basename(seq_file) | 265 log['SEQ_FILE'] = os.path.basename(seq_file) |
804 log['NO_PARSE'] = no_parse | 266 log['NO_PARSE'] = no_parse |
805 log['SCORE_FIELDS'] = score_fields | 267 log['PARTIAL'] = partial |
806 log['REGION_FIELDS'] = region_fields | 268 log['SCORES'] = parse_scores |
269 log['REGIONS'] = parse_regions | |
807 printLog(log) | 270 printLog(log) |
808 | 271 |
272 start_time = time() | |
273 printMessage('Loading sequence files', start_time=start_time, width=25) | |
274 # Count records in sequence file | |
275 total_count = countSeqFile(seq_file) | |
809 # Get input sequence dictionary | 276 # Get input sequence dictionary |
810 seq_dict = getSeqforIgBlast(seq_file) | 277 seq_dict = getSeqDict(seq_file) |
811 | 278 # Create germline repo dictionary |
812 # Formalize out_dir and file-prefix | 279 repo_dict = readRepo(repo) |
813 if not out_args['out_dir']: | 280 printMessage('Done', start_time=start_time, end=True, width=25) |
814 out_dir = os.path.split(igblast_output)[0] | 281 |
815 else: | 282 # Parse and write output |
816 out_dir = os.path.abspath(out_args['out_dir']) | 283 with open(aligner_output, 'r') as f: |
817 if not os.path.exists(out_dir): os.mkdir(out_dir) | 284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict, |
818 if out_args['out_name']: | 285 parse_scores=parse_scores, parse_regions=parse_regions, |
819 file_prefix = out_args['out_name'] | 286 parse_igblast_cdr3=parse_igblast_cdr3) |
820 else: | 287 file_prefix = getFilePrefix(aligner_output, out_args) |
821 file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) | 288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, |
822 file_prefix = os.path.join(out_dir, file_prefix) | 289 no_parse=no_parse, partial=partial, out_args=out_args) |
823 | 290 |
824 total_count = countSeqFile(seq_file) | 291 return None |
825 | 292 |
826 # Create germline repo dictionary and parse IgBLAST output | 293 |
827 repo_dict = getRepo(repo) | 294 # TODO: may be able to merge with other mains |
828 igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, | 295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, |
829 score_fields=score_fields, region_fields=region_fields) | 296 parse_scores=False, parse_regions=False, out_args=default_out_args): |
830 writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, | 297 """ |
831 score_fields=score_fields, region_fields=region_fields, out_args=out_args) | 298 Main for iHMMuneAlign aligned sample sequences. |
832 | |
833 | |
834 # TODO: may be able to merge with parseIgBlast | |
835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, | |
836 region_fields=False, out_args=default_out_args): | |
837 """ | |
838 Main for IMGT aligned sample sequences | |
839 | 299 |
840 Arguments: | 300 Arguments: |
841 imgt_output = zipped file or unzipped folder output by IMGT | 301 aligner_output : iHMMune-Align output file to process. |
842 seq_file = FASTA file input to IMGT (from which to get seqID) | 302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence). |
843 no_parse = if ID is to be parsed for pRESTO output with default delimiters | 303 repo : folder with germline repertoire files. |
844 score_fields = if True add alignment score fields to output file | 304 no_parse : if ID is to be parsed for pRESTO output with default delimiters. |
845 region_fields = if True add FWR and CDR region fields to output file | 305 partial : If True put incomplete alignments in the pass file. |
846 out_args = common output argument dictionary from parseCommonArgs | 306 parse_scores : if True parse alignment scores. |
847 | 307 parse_regions : if True add FWR and CDR region fields. |
848 Returns: | 308 out_args : common output argument dictionary from parseCommonArgs. |
849 None | 309 |
310 Returns: | |
311 None | |
850 """ | 312 """ |
851 # Print parameter info | 313 # Print parameter info |
852 log = OrderedDict() | 314 log = OrderedDict() |
853 log['START'] = 'MakeDb' | 315 log['START'] = 'MakeDB' |
854 log['ALIGNER'] = 'IMGT' | 316 log['ALIGNER'] = 'iHMMune-Align' |
855 log['ALIGN_RESULTS'] = imgt_output | 317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) |
856 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | 318 log['SEQ_FILE'] = os.path.basename(seq_file) |
857 log['NO_PARSE'] = no_parse | 319 log['NO_PARSE'] = no_parse |
858 log['SCORE_FIELDS'] = score_fields | 320 log['PARTIAL'] = partial |
859 log['REGION_FIELDS'] = region_fields | 321 log['SCORES'] = parse_scores |
322 log['REGIONS'] = parse_regions | |
860 printLog(log) | 323 printLog(log) |
861 | 324 |
862 # Get individual IMGT result files | 325 start_time = time() |
863 temp_dir, imgt_files = extractIMGT(imgt_output) | 326 printMessage('Loading sequence files', start_time=start_time, width=25) |
864 | 327 # Count records in sequence file |
865 # Formalize out_dir and file-prefix | 328 total_count = countSeqFile(seq_file) |
866 if not out_args['out_dir']: | 329 # Get input sequence dictionary |
867 out_dir = os.path.dirname(os.path.abspath(imgt_output)) | 330 seq_dict = getSeqDict(seq_file) |
868 else: | 331 # Create germline repo dictionary |
869 out_dir = os.path.abspath(out_args['out_dir']) | 332 repo_dict = readRepo(repo) |
870 if not os.path.exists(out_dir): os.mkdir(out_dir) | 333 printMessage('Done', start_time=start_time, end=True, width=25) |
871 if out_args['out_name']: | 334 |
872 file_prefix = out_args['out_name'] | 335 # Parse and write output |
873 else: | 336 with open(aligner_output, 'r') as f: |
874 file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] | 337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict, |
875 file_prefix = os.path.join(out_dir, file_prefix) | 338 parse_scores=parse_scores, parse_regions=parse_regions) |
876 | 339 file_prefix = getFilePrefix(aligner_output, out_args) |
877 total_count = countDbFile(imgt_files[0]) | 340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, |
878 | 341 no_parse=no_parse, partial=partial, out_args=out_args) |
879 # Get (parsed) IDs from fasta file submitted to IMGT | 342 |
880 id_dict = getIDforIMGT(seq_file) if seq_file else {} | 343 return None |
881 | |
882 # Create IMGT record iterator |
883 imgt_dict = readIMGT(imgt_files, score_fields=score_fields, | |
884 region_fields=region_fields) | |
885 writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, | |
886 score_fields=score_fields, region_fields=region_fields, out_args=out_args) | |
887 | |
888 # Delete temp directory | |
889 rmtree(temp_dir) | |
890 | 344 |
891 | 345 |
892 def getArgParser(): | 346 def getArgParser(): |
893 """ | 347 """ |
894 Defines the ArgumentParser | 348 Defines the ArgumentParser. |
895 | 349 |
896 Arguments: | |
897 None | |
898 | |
899 Returns: | 350 Returns: |
900 an ArgumentParser object | 351 argparse.ArgumentParser |
901 """ | 352 """ |
902 fields = dedent( | 353 fields = dedent( |
903 ''' | 354 ''' |
904 output files: | 355 output files: |
905 db-pass | 356 db-pass |
906 database of parsed alignment records. | 357 database of alignment records with functionality information, |
358 V and J calls, and a junction region. | |
907 db-fail | 359 db-fail |
908 database with records failing alignment. | 360 database with records that fail due to no functionality information |
909 | 361 (did not pass IMGT), no V call, no J call, or no junction region. |
910 output fields: | 362 |
911 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, | 363 universal output fields: |
912 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, | 364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, |
913 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, | 365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, |
914 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, | 366 V_CALL, D_CALL, J_CALL, |
915 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, | 367 V_SEQ_START, V_SEQ_LENGTH, |
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, | |
916 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | 369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, |
917 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, | 370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, |
918 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, | 371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, |
919 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | 372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT |
373 | |
374 imgt specific output fields: | |
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, | |
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, | |
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, | |
378 | |
379 igblast specific output fields: | |
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, | |
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP, |
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA | |
384 | |
385 ihmm specific output fields: | |
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
387 HMM_SCORE | |
920 ''') | 388 ''') |
921 | 389 |
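Since the epilog above describes the output database, a minimal sketch of consuming it, assuming a file named sample_db-pass.tab produced by this script: the database is a plain tab-delimited table whose columns are the fields listed above.

```python
import csv

with open('sample_db-pass.tab') as handle:  # hypothetical output file
    reader = csv.DictReader(handle, delimiter='\t')
    for row in reader:
        print(row['SEQUENCE_ID'], row['V_CALL'], row['JUNCTION_LENGTH'])
```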
922 # Define ArgumentParser | 390 # Define ArgumentParser |
923 parser = ArgumentParser(description=__doc__, epilog=fields, | 391 parser = ArgumentParser(description=__doc__, epilog=fields, |
924 formatter_class=CommonHelpFormatter) | 392 formatter_class=CommonHelpFormatter) |
931 | 399 |
932 # Parent parser | 400 # Parent parser |
933 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | 401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) |
934 | 402 |
935 # IgBlast Aligner | 403 # IgBlast Aligner |
936 parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', | 404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], |
937 parents=[parser_parent], | 405 formatter_class=CommonHelpFormatter, |
938 formatter_class=CommonHelpFormatter) | 406 help='Process IgBLAST output.', |
939 parser_igblast.set_defaults(func=parseIgBlast) | 407 description='Process IgBLAST output.') |
940 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', | 408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', |
941 required=True, | 409 required=True, |
942 help='''IgBLAST output files in format 7 with query sequence | 410 help='''IgBLAST output files in format 7 with query sequence |
943 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | 411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') |
944 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | 412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, |
945 help='''List of folders and/or fasta files containing | 413 help='''List of folders and/or fasta files containing |
946 IMGT-gapped germline sequences corresponding to the | 414 IMGT-gapped germline sequences corresponding to the |
947 set of germlines used in the IgBLAST alignment.''') | 415 set of germlines used in the IgBLAST alignment.''') |
948 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | 416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', |
949 required=True, | 417 required=True, |
950 help='List of input FASTA files containing sequences') | 418 help='''List of input FASTA files (with .fasta, .fna or .fa |
419 extension), containing sequences.''') | |
951 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | 420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', |
952 help='''Specify if input IDs should not be parsed to add | 421 help='''Specify to prevent input sequence headers from being parsed |
953 new columns to database.''') | 422 to add new columns to database. Parsing of sequence headers requires |
954 parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', | 423 headers to be in the pRESTO annotation format, so this should be specified |
424 when sequence headers are incompatible with the pRESTO annotation scheme. | |
425 Note, unrecognized header formats will default to this behavior.''') | |
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial', | |
427 help='''If specified, include incomplete V(D)J alignments in | |
428 the pass file instead of the fail file.''') | |
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', | |
955 help='''Specify if alignment score metrics should be | 430 help='''Specify if alignment score metrics should be |
956 included in the output. Adds the V_SCORE, V_IDENTITY, | 431 included in the output. Adds the V_SCORE, V_IDENTITY, |
957 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | 432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, |
958 J_BTOP, and J_EVALUE columns.''') | 433 J_BTOP, and J_EVALUE columns.''') |
959 parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', | 434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', |
960 help='''Specify if IMGT framework and CDR regions should be | 435 help='''Specify if IMGT FWR and CDRs should be |
961 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, |
962 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and |
963 CDR3_IMGT columns.''') | 438 CDR3_IMGT columns.''') |
964 | 439 parser_igblast.add_argument('--cdr3', action='store_true', |
440 dest='parse_igblast_cdr3', | |
441 help='''Specify if the CDR3 sequences generated by IgBLAST | |
442 should be included in the output. Adds the columns | |
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST | |
444 version 1.5 or greater.''') | |
445 parser_igblast.set_defaults(func=parseIgBLAST) | |
446 | |
965 # IMGT aligner | 447 # IMGT aligner |
966 parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', | 448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], |
967 parents=[parser_parent], | 449 formatter_class=CommonHelpFormatter, |
968 formatter_class=CommonHelpFormatter) | 450 help='''Process IMGT/HighV-Quest output |
969 imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True) | 451 (does not work with V-QUEST).''', |
970 imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', | 452 description='''Process IMGT/HighV-Quest output |
971 help='''Either zipped IMGT output files (.zip) or a folder | 453 (does not work with V-QUEST).''') |
972 containing unzipped IMGT output files (which must | 454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', |
973 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | 455 help='''Either zipped IMGT output files (.zip or .txz) or a |
974 and 6_Junction).''') | 456 folder containing unzipped IMGT output files (which must |
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | |
458 and 6_Junction).''') | |
975 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | 459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', |
976 required=False, | 460 required=False, |
977 help='List of input FASTA files containing sequences') | 461 help='''List of input FASTA files (with .fasta, .fna or .fa |
462 extension) containing sequences.''') | |
978 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | 463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', |
979 help='''Specify if input IDs should not be parsed to add new | 464 help='''Specify to prevent input sequence headers from being parsed |
980 columns to database.''') | 465 to add new columns to database. Parsing of sequence headers requires |
981 parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', | 466 headers to be in the pRESTO annotation format, so this should be specified |
467 when sequence headers are incompatible with the pRESTO annotation scheme. | |
468 Note, unrecognized header formats will default to this behavior.''') | |
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial', | |
470 help='''If specified, include incomplete V(D)J alignments in | |
471 the pass file instead of the fail file.''') | |
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', | |
982 help='''Specify if alignment score metrics should be | 473 help='''Specify if alignment score metrics should be |
983 included in the output. Adds the V_SCORE, V_IDENTITY, | 474 included in the output. Adds the V_SCORE, V_IDENTITY, |
984 J_SCORE and J_IDENTITY. Note, this will also add | 475 J_SCORE and J_IDENTITY.''') |
985 the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP, | 476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', |
986 but they will be empty for IMGT output.''') | 477 help='''Specify if IMGT FWRs and CDRs should be |
987 parser_imgt.add_argument('--regions', action='store_true', dest='region_fields', | |
988 help='''Specify if IMGT framework and CDR regions should be | |
989 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, |
990 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and |
991 CDR3_IMGT columns.''') | 480 CDR3_IMGT columns.''') |
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', | |
482 help='''Specify if detailed junction fields should be | |
483 included in the output. Adds the columns | |
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, | |
485 P5J_LENGTH, D_FRAME.''') | |
992 parser_imgt.set_defaults(func=parseIMGT) | 486 parser_imgt.set_defaults(func=parseIMGT) |
487 | |
488 # iHMMuneAlign Aligner | |
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], | |
490 formatter_class=CommonHelpFormatter, | |
491 help='Process iHMMune-Align output.', | |
492 description='Process iHMMune-Align output.') | |
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
494 required=True, | |
495 help='''iHMMune-Align output file.''') | |
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
497 help='''List of folders and/or FASTA files containing | |
498 IMGT-gapped germline sequences corresponding to the | |
499 set of germlines used in the IgBLAST alignment.''') | |
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
501 required=True, | |
502 help='''List of input FASTA files (with .fasta, .fna or .fa | |
503 extension) containing sequences.''') | |
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', | |
505 help='''Specify to prevent input sequence headers from being parsed | |
506 to add new columns to database. Parsing of sequence headers requires | |
507 headers to be in the pRESTO annotation format, so this should be specified | |
508 when sequence headers are incompatible with the pRESTO annotation scheme. | |
509 Note, unrecognized header formats will default to this behavior.''') | |
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial', | |
511 help='''If specified, include incomplete V(D)J alignments in | |
512 the pass file instead of the fail file.''') | |
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', | |
514 help='''Specify if alignment score metrics should be | |
515 included in the output. Adds the path score of the | |
516 iHMMune-Align hidden Markov model to HMM_SCORE.''') | |
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', | |
518 help='''Specify if IMGT FWRs and CDRs should be | |
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
521 CDR3_IMGT columns.''') | |
522 parser_ihmm.set_defaults(func=parseIHMM) | |
993 | 523 |
994 return parser | 524 return parser |
995 | 525 |
996 | 526 |
997 if __name__ == "__main__": | 527 if __name__ == "__main__": |
998 """ | 528 """ |
999 Parses command line arguments and calls main | 529 Parses command line arguments and calls main |
1000 """ | 530 """ |
1001 parser = getArgParser() | 531 parser = getArgParser() |
1002 args = parser.parse_args() | 532 args = parser.parse_args() |
1003 args_dict = parseCommonArgs(args, in_arg='aligner_files') | 533 args_dict = parseCommonArgs(args, in_arg='aligner_outputs') |
1004 | 534 |
1005 # Set no ID parsing if sequence files are not provided | 535 # Set no ID parsing if sequence files are not provided |
1006 if 'seq_files' in args_dict and not args_dict['seq_files']: | 536 if 'seq_files' in args_dict and not args_dict['seq_files']: |
1007 args_dict['no_parse'] = True | 537 args_dict['no_parse'] = True |
1008 | 538 |
1009 # Delete | 539 # Delete |
1010 if 'seq_files' in args_dict: del args_dict['seq_files'] | 540 if 'seq_files' in args_dict: del args_dict['seq_files'] |
1011 if 'aligner_files' in args_dict: del args_dict['aligner_files'] | 541 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] |
1012 if 'command' in args_dict: del args_dict['command'] | 542 if 'command' in args_dict: del args_dict['command'] |
1013 if 'func' in args_dict: del args_dict['func'] | 543 if 'func' in args_dict: del args_dict['func'] |
1014 | 544 |
1015 if args.command == 'imgt': | 545 if args.command == 'imgt': |
1016 for i in range(len(args.__dict__['aligner_files'])): | 546 for i in range(len(args.__dict__['aligner_outputs'])): |
1017 args_dict['imgt_output'] = args.__dict__['aligner_files'][i] | 547 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] |
1018 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | 548 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ |
1019 if args.__dict__['seq_files'] else None | 549 if args.__dict__['seq_files'] else None |
1020 args.func(**args_dict) | 550 args.func(**args_dict) |
1021 elif args.command == 'igblast': | 551 elif args.command == 'igblast' or args.command == 'ihmm': |
1022 for i in range(len(args.__dict__['aligner_files'])): | 552 for i in range(len(args.__dict__['aligner_outputs'])): |
1023 args_dict['igblast_output'] = args.__dict__['aligner_files'][i] | 553 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] |
1024 args_dict['seq_file'] = args.__dict__['seq_files'][i] | 554 args_dict['seq_file'] = args.__dict__['seq_files'][i] |
1025 args.func(**args_dict) | 555 args.func(**args_dict) |
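Finally, a hypothetical usage sketch of the dispatch above: parsing an imgt subcommand exactly as the __main__ block would before it strips the helper keys and calls args.func(**args_dict), which resolves to parseIMGT(). File names are placeholders.

```python
parser = getArgParser()
args = parser.parse_args(['imgt',
                          '-i', 'highvquest_results.zip',  # hypothetical IMGT archive
                          '-s', 'reads.fasta',             # hypothetical input FASTA
                          '--scores', '--regions'])
print(args.command, args.aligner_outputs, args.seq_files)
# -> imgt ['highvquest_results.zip'] ['reads.fasta']
```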