Mercurial > repos > davidvanzessen > shm_csr
diff change_o/MakeDb.py @ 52:22dddabe3637 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 23 May 2017 08:32:58 -0400 |
parents | c33d93683a09 |
children |
line wrap: on
line diff
--- a/change_o/MakeDb.py Tue May 16 08:52:48 2017 -0400 +++ b/change_o/MakeDb.py Tue May 23 08:32:58 2017 -0400 @@ -7,765 +7,164 @@ from changeo import __version__, __date__ # Imports -import csv import os -import re import sys -import pandas as pd -import tarfile -import zipfile from argparse import ArgumentParser from collections import OrderedDict -from itertools import groupby -from shutil import rmtree -from tempfile import mkdtemp from textwrap import dedent from time import time from Bio import SeqIO -from Bio.Seq import Seq -from Bio.Alphabet import IUPAC # Presto and changeo imports from presto.Defaults import default_out_args from presto.Annotation import parseAnnotation -from presto.IO import countSeqFile, printLog, printProgress +from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs -from changeo.IO import getDbWriter, countDbFile, getRepo -from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ - j_allele_regex - -# Default parameters -default_delimiter = ('\t', ',', '-') - - -def gapV(ig_dict, repo_dict): - """ - Insert gaps into V region and update alignment information - - Arguments: - ig_dict : Dictionary of parsed IgBlast output - repo_dict : Dictionary of IMGT gapped germline sequences - - Returns: - dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields - """ - - seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] - - # Find gapped germline V segment - vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') - vkey = (vgene, ) - #TODO: Figure out else case - if vkey in repo_dict: - vgap = repo_dict[vkey] - # Iterate over gaps in the germline segment - gaps = re.finditer(r'\.', vgap) - gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 - for gap in gaps: - i = gap.start() - # Break if gap begins after V region - if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: - break - # Insert gap into IMGT sequence - seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] - # Update gap counter - gapcount += 1 - ig_dict['SEQUENCE_IMGT'] = seq_imgt - # Update IMGT positioning information for V - ig_dict['V_GERM_START_IMGT'] = 1 - ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount - - return ig_dict +from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo +from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT -def getIMGTJunc(ig_dict, repo_dict): - """ - Identify junction region by IMGT definition - - Arguments: - ig_dict : Dictionary of parsed IgBlast output - repo_dict : Dictionary of IMGT gapped germline sequences - - Returns: - dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields - """ - # Find germline J segment - jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') - jkey = (jgene, ) - #TODO: Figure out else case - if jkey in repo_dict: - # Get germline J sequence - jgerm = repo_dict[jkey] - jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] - # Look for (F|W)GXG aa motif in nt sequence - motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) - aa_end = len(ig_dict['SEQUENCE_IMGT']) - #TODO: Figure out else case - if motif: - # print('\n', motif.group()) - aa_end = motif.start() - len(jgerm) + 3 - # Add fields to dict - ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] - ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) - - return ig_dict - - -def getRegions(ig_dict): +def getFilePrefix(aligner_output, out_args): """ - Identify FWR and CDR regions by IMGT definition - - Arguments: - ig_dict : Dictionary of parsed alignment output - - Returns: - dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, - CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields - """ - try: - seq_len = len(ig_dict['SEQUENCE_IMGT']) - ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] - except (KeyError, IndexError): - return ig_dict - - try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] - except (IndexError): return ig_dict - - try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] - except (IndexError): return ig_dict - - try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] - except (IndexError): return ig_dict - - try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] - except (IndexError): return ig_dict - - try: - cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] - ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] - ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] - except (KeyError, IndexError): - return ig_dict - - return ig_dict - - -def getSeqforIgBlast(seq_file): - """ - Fetch input sequences for IgBlast queries + Get file name prefix and create output directory Arguments: - seq_file = a fasta file of sequences input to IgBlast + aligner_output : aligner output file or directory. + out_args : dictionary of output arguments. Returns: - a dictionary of {ID:Seq} + str : file name prefix. """ - - seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) - - # Create a seq_dict ID translation using IDs truncate up to space or 50 chars - seqs = {} - for seq in seq_dict.values(): - seqs.update({seq.description:str(seq.seq)}) - - return seqs - + # Determine output directory + if not out_args['out_dir']: + out_dir = os.path.dirname(os.path.abspath(aligner_output)) + else: + out_dir = os.path.abspath(out_args['out_dir']) + if not os.path.exists(out_dir): + os.mkdir(out_dir) -def findLine(handle, query): - """ - Finds line with query string in file + # Determine file prefix + if out_args['out_name']: + file_prefix = out_args['out_name'] + else: + file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] - Arguments: - handle = file handle in which to search for line - query = query string for which to search in file - - Returns: - line from handle in which query string was found - """ - for line in handle: - if(re.match(query, line)): - return line + return os.path.join(out_dir, file_prefix) -def extractIMGT(imgt_output): - """ - Extract necessary files from IMGT results, zipped or unzipped - - Arguments: - imgt_output = zipped file or unzipped folder output by IMGT - - Returns: - sorted list of filenames from which information will be read +def getSeqDict(seq_file): """ - #file_ext = os.path.splitext(imgt_output)[1].lower() - imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') - temp_dir = mkdtemp() - if zipfile.is_zipfile(imgt_output): - # Open zip file - imgt_zip = zipfile.ZipFile(imgt_output, 'r') - # Extract required files - imgt_files = sorted([n for n in imgt_zip.namelist() \ - if os.path.basename(n).startswith(imgt_flags)]) - imgt_zip.extractall(temp_dir, imgt_files) - # Define file list - imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] - elif os.path.isdir(imgt_output): - # Find required files in folder - folder_files = [] - for root, dirs, files in os.walk(imgt_output): - folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) - # Define file list - imgt_files = sorted([n for n in folder_files \ - if os.path.basename(n).startswith(imgt_flags)]) - elif tarfile.is_tarfile(imgt_output): - # Open zip file - imgt_tar = tarfile.open(imgt_output, 'r') - # Extract required files - imgt_files = sorted([n for n in imgt_tar.getnames() \ - if os.path.basename(n).startswith(imgt_flags)]) - imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) - # Define file list - imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] - else: - sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') - - if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files - sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) - elif len(imgt_files) < len(imgt_flags): - sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output) - - return temp_dir, imgt_files - - -# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks -def readOneIgBlastResult(block): - """ - Parse a single IgBLAST query result + Create a dictionary from a sequence file. Arguments: - block = itertools groupby object of single result - - Returns: - None if no results, otherwise list of DataFrames for each result block - """ - results = list() - i = 0 - for match, subblock in groupby(block, lambda l: l=='\n'): - if not match: - # Strip whitespace and comments - sub = [s.strip() for s in subblock if not s.startswith('#')] - - # Continue on empty block - if not sub: continue - else: i += 1 - - # Split by tabs - sub = [s.split('\t') for s in sub] - - # Append list for "V-(D)-J rearrangement summary" (i == 1) - # And "V-(D)-J junction details" (i == 2) - # Otherwise append DataFrame of subblock - if i == 1 or i == 2: - results.append(sub[0]) - else: - df = pd.DataFrame(sub) - if not df.empty: results.append(df) - - return results if results else None - - -# TODO: needs more speeds. pandas is probably to blame. -def readIgBlast(igblast_output, seq_dict, repo_dict, - score_fields=False, region_fields=False): - """ - Reads IgBlast output - - Arguments: - igblast_output = IgBlast output file (format 7) - seq_dict = a dictionary of {ID:Seq} from input fasta file - repo_dict = dictionary of IMGT gapped germline sequences - score_fields = if True parse alignment scores - region_fields = if True add FWR and CDR region fields + seq_file : sequence file. Returns: - a generator of dictionaries containing alignment data + dict : sequence description as keys with Bio.SeqRecords as values. """ - - # Open IgBlast output file - with open(igblast_output) as f: - # Iterate over individual results (separated by # IGBLASTN) - for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): - block = list(block) - if not k1: - # TODO: move query name extraction into block parser readOneIgBlastResult(). - # Extract sequence ID - query_name = ' '.join(block[0].strip().split(' ')[2:]) - # Initialize db_gen to have ID and input sequence - db_gen = {'SEQUENCE_ID': query_name, - 'SEQUENCE_INPUT': seq_dict[query_name]} - - # Parse further sub-blocks - block_list = readOneIgBlastResult(block) - - # TODO: this is indented pretty far. should be a separate function. or several functions. - # If results exist, parse further to obtain full db_gen - if block_list is not None: - # Parse quality information - db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' - db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' - db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' - if block_list[0][-1] == '-': - db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], - IUPAC.ambiguous_dna).reverse_complement()) - - # Parse V, D, and J calls - call_str = ' '.join(block_list[0]) - v_call = parseAllele(call_str, v_allele_regex, action='list') - d_call = parseAllele(call_str, d_allele_regex, action='list') - j_call = parseAllele(call_str, j_allele_regex, action='list') - db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' - db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' - db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' - - # Parse junction sequence - # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) - # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) - - # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why? - # TODO: maybe we should fail these. they look craptastic. - #pd.set_option('display.width', 500) - #print query_name, len(block_list), hit_idx - #for i, x in enumerate(block_list): - # print '[%i]' % i - # print x - - # Parse segment start and stop positions - hit_df = block_list[-1] - - # Alignment info block - # 0: segment - # 1: query id - # 2: subject id - # 3: % identity - # 4: alignment length - # 5: mismatches - # 6: gap opens - # 7: gaps - # 8: q. start - # 9: q. end - # 10: s. start - # 11: s. end - # 12: evalue - # 13: bit score - # 14: query seq - # 15: subject seq - # 16: btop - - # If V call exists, parse V alignment information - seq_vdj = '' - if v_call is not None: - v_align = hit_df[hit_df[0] == 'V'].iloc[0] - # Germline positions - db_gen['V_GERM_START_VDJ'] = int(v_align[10]) - db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 - # Query sequence positions - db_gen['V_SEQ_START'] = int(v_align[8]) - db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 - - if int(v_align[6]) == 0: - db_gen['INDELS'] = 'F' - else: - db_gen['INDELS'] = 'T' - # Set functional to none so record gets tossed (junction will be wrong) - # db_gen['FUNCTIONAL'] = None - - # V alignment scores - if score_fields: - try: db_gen['V_SCORE'] = float(v_align[13]) - except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' - - try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 - except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' - - try: db_gen['V_EVALUE'] = float(v_align[12]) - except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' - - try: db_gen['V_BTOP'] = v_align[16] - except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' + seq_dict = SeqIO.to_dict(readSeqFile(seq_file), + key_function=lambda x: x.description) - # Update VDJ sequence, removing insertions - start = 0 - for m in re.finditer(r'-', v_align[15]): - ins = m.start() - seq_vdj += v_align[14][start:ins] - start = ins + 1 - seq_vdj += v_align[14][start:] - - # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. - # If D call exists, parse D alignment information - if d_call is not None: - d_align = hit_df[hit_df[0] == 'D'].iloc[0] - - # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. - # Determine N-region length and amount of J overlap with V or D alignment - overlap = 0 - if v_call is not None: - n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) - if n1_len < 0: - db_gen['N1_LENGTH'] = 0 - overlap = abs(n1_len) - else: - db_gen['N1_LENGTH'] = n1_len - n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) - n1_end = int(d_align[8])-1 - seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] - - # Query sequence positions - db_gen['D_SEQ_START'] = int(d_align[8]) + overlap - db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) - - # Germline positions - db_gen['D_GERM_START'] = int(d_align[10]) + overlap - db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) - - # Update VDJ sequence, removing insertions - start = overlap - for m in re.finditer(r'-', d_align[15]): - ins = m.start() - seq_vdj += d_align[14][start:ins] - start = ins + 1 - seq_vdj += d_align[14][start:] - - # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. - # If J call exists, parse J alignment information - if j_call is not None: - j_align = hit_df[hit_df[0] == 'J'].iloc[0] - - # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. - # Determine N-region length and amount of J overlap with V or D alignment - overlap = 0 - if d_call is not None: - n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) - if n2_len < 0: - db_gen['N2_LENGTH'] = 0 - overlap = abs(n2_len) - else: - db_gen['N2_LENGTH'] = n2_len - n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) - n2_end = int(j_align[8])-1 - seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] - elif v_call is not None: - n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) - if n1_len < 0: - db_gen['N1_LENGTH'] = 0 - overlap = abs(n1_len) - else: - db_gen['N1_LENGTH'] = n1_len - n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) - n1_end = int(j_align[8])-1 - seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] - else: - db_gen['N1_LENGTH'] = 0 - - # Query positions - db_gen['J_SEQ_START'] = int(j_align[8]) + overlap - db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) - - # Germline positions - db_gen['J_GERM_START'] = int(j_align[10]) + overlap - db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) - - # J alignment scores - if score_fields: - try: db_gen['J_SCORE'] = float(j_align[13]) - except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' - - try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 - except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' - - try: db_gen['J_EVALUE'] = float(j_align[12]) - except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' - - try: db_gen['J_BTOP'] = j_align[16] - except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' - - # Update VDJ sequence, removing insertions - start = overlap - for m in re.finditer(r'-', j_align[15]): - ins = m.start() - seq_vdj += j_align[14][start:ins] - start = ins + 1 - seq_vdj += j_align[14][start:] - - db_gen['SEQUENCE_VDJ'] = seq_vdj - - # Create IMGT-gapped sequence and infer IMGT junction - if v_call is not None: - db_gen = gapV(db_gen, repo_dict) - if j_call is not None: - db_gen = getIMGTJunc(db_gen, repo_dict) - - # FWR and CDR regions - if region_fields: getRegions(db_gen) - - yield IgRecord(db_gen) + return seq_dict -# TODO: should be more readable -def readIMGT(imgt_files, score_fields=False, region_fields=False): - """ - Reads IMGT/HighV-Quest output - - Arguments: - imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 - score_fields = if True parse alignment scores - region_fields = if True add FWR and CDR region fields - - Returns: - a generator of dictionaries containing alignment data +def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, + out_args=default_out_args): """ - imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] - # Create a dictionary for each sequence alignment and yield its generator - for sm, gp, nt, jn in zip(*imgt_iters): - if len(set([sm['Sequence ID'], - gp['Sequence ID'], - nt['Sequence ID'], - jn['Sequence ID']])) != 1: - sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ - % sm['Sequence ID']) - - db_gen = {'SEQUENCE_ID': sm['Sequence ID'], - 'SEQUENCE_INPUT': sm['Sequence']} - - if 'No results' not in sm['Functionality']: - db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + - ('unprod' in sm['Functionality'])] - db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + - ('out-of-frame' in sm['JUNCTION frame'])], - db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + - ('unprod' in sm['Functionality'])] - db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], - 'missing' in sm['V-REGION potential ins/del']))) + - ('unprod' in sm['Functionality'])] - db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], - sm['V-REGION insertions'], - sm['V-REGION deletions']))] - - db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] - db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] - - db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) - db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) - db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) - - v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 - db_gen['V_SEQ_START'] = nt['V-REGION start'] - db_gen['V_SEQ_LENGTH'] = v_seq_length - db_gen['V_GERM_START_IMGT'] = 1 - db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 - - db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], - jn['N-REGION-nt nb'], - jn['N1-REGION-nt nb'], - jn["P5'D-nt nb"]] if i) - db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, - jn["P3'V-nt nb"], - jn['N-REGION-nt nb'], - jn['N1-REGION-nt nb'], - jn["P5'D-nt nb"]] if i) - db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) - db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 - db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) - db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], - jn['N2-REGION-nt nb'], - jn["P5'J-nt nb"]] if i) - - db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, - jn["P3'V-nt nb"], - jn['N-REGION-nt nb'], - jn['N1-REGION-nt nb'], - jn["P5'D-nt nb"], - jn["D-REGION-nt nb"], - jn["P3'D-nt nb"], - jn['N2-REGION-nt nb'], - jn["P5'J-nt nb"]] if i) - db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 - db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 - db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 - - db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 - db_gen['JUNCTION'] = jn['JUNCTION'] - - # Alignment scores - if score_fields: - try: db_gen['V_SCORE'] = float(sm['V-REGION score']) - except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' - - try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 - except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' - - try: db_gen['J_SCORE'] = float(sm['J-REGION score']) - except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' - - try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 - except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' - - # FWR and CDR regions - if region_fields: getRegions(db_gen) - else: - db_gen['V_CALL'] = 'None' - db_gen['D_CALL'] = 'None' - db_gen['J_CALL'] = 'None' - - yield IgRecord(db_gen) - - -def getIDforIMGT(seq_file): - """ - Create a sequence ID translation using IMGT truncation - - Arguments: - seq_file = a fasta file of sequences input to IMGT - - Returns: - a dictionary of {truncated ID: full seq description} - """ - - # Create a seq_dict ID translation using IDs truncate up to space or 50 chars - ids = {} - for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): - if len(rec.description) <= 50: - id_key = rec.description - else: - id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) - ids.update({id_key:rec.description}) - - return ids - - -def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, - score_fields=False, region_fields=False, out_args=default_out_args): - """ - Writes tab-delimited database file in output directory + Writes tab-delimited database file in output directory. Arguments: - db_gen = a generator of IgRecord objects containing alignment data - file_prefix = directory and prefix for CLIP tab-delim file - total_count = number of records (for progress bar) - id_dict = a dictionary of {IMGT ID: full seq description} - no_parse = if ID is to be parsed for pRESTO output with default delimiters - score_fields = if True add alignment score fields to output file - region_fields = if True add FWR and CDR region fields to output file - out_args = common output argument dictionary from parseCommonArgs + db : a iterator of IgRecord objects containing alignment data. + fields : a list of ordered field names to write. + file_prefix : directory and prefix for CLIP tab-delim file. + total_count : number of records (for progress bar). + id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : if True put incomplete alignments in the pass file. + out_args : common output argument dictionary from parseCommonArgs. Returns: - None + None """ - pass_file = "%s_db-pass.tab" % file_prefix - fail_file = "%s_db-fail.tab" % file_prefix - ordered_fields = ['SEQUENCE_ID', - 'SEQUENCE_INPUT', - 'FUNCTIONAL', - 'IN_FRAME', - 'STOP', - 'MUTATED_INVARIANT', - 'INDELS', - 'V_CALL', - 'D_CALL', - 'J_CALL', - 'SEQUENCE_VDJ', - 'SEQUENCE_IMGT', - 'V_SEQ_START', - 'V_SEQ_LENGTH', - 'V_GERM_START_VDJ', - 'V_GERM_LENGTH_VDJ', - 'V_GERM_START_IMGT', - 'V_GERM_LENGTH_IMGT', - 'N1_LENGTH', - 'D_SEQ_START', - 'D_SEQ_LENGTH', - 'D_GERM_START', - 'D_GERM_LENGTH', - 'N2_LENGTH', - 'J_SEQ_START', - 'J_SEQ_LENGTH', - 'J_GERM_START', - 'J_GERM_LENGTH', - 'JUNCTION_LENGTH', - 'JUNCTION'] + # Function to check for valid records strictly + def _pass_strict(rec): + valid = [rec.v_call and rec.v_call != 'None', + rec.j_call and rec.j_call != 'None', + rec.functional is not None, + rec.seq_vdj, + rec.junction] + return all(valid) - if score_fields: - ordered_fields.extend(['V_SCORE', - 'V_IDENTITY', - 'V_EVALUE', - 'V_BTOP', - 'J_SCORE', - 'J_IDENTITY', - 'J_EVALUE', - 'J_BTOP']) + # Function to check for valid records loosely + def _pass_gentle(rec): + valid = [rec.v_call and rec.v_call != 'None', + rec.d_call and rec.d_call != 'None', + rec.j_call and rec.j_call != 'None'] + return any(valid) - if region_fields: - ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', - 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) - + # Set pass criteria + _pass = _pass_gentle if partial else _pass_strict - # TODO: This is not the best approach. should pass in output fields. - # Initiate passed handle - pass_handle = None + # Define output file names + pass_file = '%s_db-pass.tab' % file_prefix + fail_file = '%s_db-fail.tab' % file_prefix - # Open failed file - if out_args['failed']: - fail_handle = open(fail_file, 'wt') - fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) - else: - fail_handle = None - fail_writer = None - - # Initialize counters and file + # Initiate handles, writers and counters + pass_handle = None + fail_handle = None pass_writer = None + fail_writer = None start_time = time() rec_count = pass_count = fail_count = 0 - for record in db_gen: - #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) - printProgress(rec_count, total_count, 0.05, start_time) - rec_count += 1 - # Count pass or fail - if (record.v_call == 'None' and record.j_call == 'None') or \ - record.functional is None or \ - not record.seq_vdj or \ - not record.junction: - # print(record.v_call, record.j_call, record.functional, record.junction) - fail_count += 1 - if fail_writer is not None: fail_writer.writerow(record.toDict()) - continue - else: - pass_count += 1 - - # Build sample sequence description - if record.id in id_dict: + # Validate and write output + printProgress(0, total_count, 0.05, start_time) + for i, record in enumerate(db, start=1): + + # Replace sequence description with full string, if required + if id_dict is not None and record.id in id_dict: record.id = id_dict[record.id] # Parse sequence description into new columns if not no_parse: - record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) - record.id = record.annotations['ID'] - del record.annotations['ID'] + try: + record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) + record.id = record.annotations['ID'] + del record.annotations['ID'] + + # TODO: This is not the best approach. should pass in output fields. + # If first record, use parsed description to define extra columns + if i == 1: fields.extend(list(record.annotations.keys())) + except IndexError: + # Could not parse pRESTO-style annotations so fall back to no parse + no_parse = True + sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') - # TODO: This is not the best approach. should pass in output fields. - # If first sequence, use parsed description to create new columns and initialize writer - if pass_writer is None: - if not no_parse: ordered_fields.extend(list(record.annotations.keys())) - pass_handle = open(pass_file, 'wt') - pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) + # Count pass or fail and write to appropriate file + if _pass(record): + # Open pass file + if pass_writer is None: + pass_handle = open(pass_file, 'wt') + pass_writer = getDbWriter(pass_handle, add_fields=fields) - # Write row to tab-delim CLIP file - pass_writer.writerow(record.toDict()) - - # Print log - #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) - printProgress(rec_count, total_count, 0.05, start_time) + # Write row to pass file + pass_count += 1 + pass_writer.writerow(record.toDict()) + else: + # Open failed file + if out_args['failed'] and fail_writer is None: + fail_handle = open(fail_file, 'wt') + fail_writer = getDbWriter(fail_handle, add_fields=fields) + # Write row to fail file if specified + fail_count += 1 + if fail_writer is not None: + fail_writer.writerow(record.toDict()) + + # Print progress + printProgress(i, total_count, 0.05, start_time) + + # Print consol log log = OrderedDict() log['OUTPUT'] = pass_file log['PASS'] = pass_count @@ -777,146 +176,215 @@ if fail_handle is not None: fail_handle.close() -# TODO: may be able to merge with parseIMGT -def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, - region_fields=False, out_args=default_out_args): +# TODO: may be able to merge with other mains +def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, + parse_scores=False, parse_regions=False, parse_junction=False, + out_args=default_out_args): """ - Main for IgBlast aligned sample sequences + Main for IMGT aligned sample sequences. Arguments: - igblast_output = IgBlast output file to process - seq_file = fasta file input to IgBlast (from which to get sequence) - repo = folder with germline repertoire files - no_parse = if ID is to be parsed for pRESTO output with default delimiters - score_fields = if True add alignment score fields to output file - region_fields = if True add FWR and CDR region fields to output file - out_args = common output argument dictionary from parseCommonArgs + aligner_output : zipped file or unzipped folder output by IMGT. + seq_file : FASTA file input to IMGT (from which to get seqID). + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : If True put incomplete alignments in the pass file. + parse_scores : if True add alignment score fields to output file. + parse_regions : if True add FWR and CDR region fields to output file. + out_args : common output argument dictionary from parseCommonArgs. Returns: - None + None + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'MakeDb' + log['ALIGNER'] = 'IMGT' + log['ALIGNER_OUTPUT'] = aligner_output + log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' + log['NO_PARSE'] = no_parse + log['PARTIAL'] = partial + log['SCORES'] = parse_scores + log['REGIONS'] = parse_regions + log['JUNCTION'] = parse_junction + printLog(log) + + start_time = time() + printMessage('Loading sequence files', start_time=start_time, width=25) + # Extract IMGT files + temp_dir, imgt_files = extractIMGT(aligner_output) + # Count records in IMGT files + total_count = countDbFile(imgt_files['summary']) + # Get (parsed) IDs from fasta file submitted to IMGT + id_dict = getIDforIMGT(seq_file) if seq_file else {} + printMessage('Done', start_time=start_time, end=True, width=25) + + # Parse IMGT output and write db + with open(imgt_files['summary'], 'r') as summary_handle, \ + open(imgt_files['gapped'], 'r') as gapped_handle, \ + open(imgt_files['ntseq'], 'r') as ntseq_handle, \ + open(imgt_files['junction'], 'r') as junction_handle: + parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, + parse_scores=parse_scores, parse_regions=parse_regions, + parse_junction=parse_junction) + file_prefix = getFilePrefix(aligner_output, out_args) + writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, + no_parse=no_parse, partial=partial, out_args=out_args) + + # Cleanup temp directory + temp_dir.cleanup() + + return None + + +# TODO: may be able to merge with other mains +def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, + parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, + out_args=default_out_args): + """ + Main for IgBLAST aligned sample sequences. + + Arguments: + aligner_output : IgBLAST output file to process. + seq_file : fasta file input to IgBlast (from which to get sequence). + repo : folder with germline repertoire files. + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : If True put incomplete alignments in the pass file. + parse_regions : if True add FWR and CDR fields to output file. + parse_scores : if True add alignment score fields to output file. + parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['ALIGNER'] = 'IgBlast' - log['ALIGN_RESULTS'] = os.path.basename(igblast_output) + log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse - log['SCORE_FIELDS'] = score_fields - log['REGION_FIELDS'] = region_fields + log['PARTIAL'] = partial + log['SCORES'] = parse_scores + log['REGIONS'] = parse_regions printLog(log) + start_time = time() + printMessage('Loading sequence files', start_time=start_time, width=25) + # Count records in sequence file + total_count = countSeqFile(seq_file) # Get input sequence dictionary - seq_dict = getSeqforIgBlast(seq_file) + seq_dict = getSeqDict(seq_file) + # Create germline repo dictionary + repo_dict = readRepo(repo) + printMessage('Done', start_time=start_time, end=True, width=25) - # Formalize out_dir and file-prefix - if not out_args['out_dir']: - out_dir = os.path.split(igblast_output)[0] - else: - out_dir = os.path.abspath(out_args['out_dir']) - if not os.path.exists(out_dir): os.mkdir(out_dir) - if out_args['out_name']: - file_prefix = out_args['out_name'] - else: - file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) - file_prefix = os.path.join(out_dir, file_prefix) + # Parse and write output + with open(aligner_output, 'r') as f: + parse_iter = IgBLASTReader(f, seq_dict, repo_dict, + parse_scores=parse_scores, parse_regions=parse_regions, + parse_igblast_cdr3=parse_igblast_cdr3) + file_prefix = getFilePrefix(aligner_output, out_args) + writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, + no_parse=no_parse, partial=partial, out_args=out_args) - total_count = countSeqFile(seq_file) - - # Create - repo_dict = getRepo(repo) - igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, - score_fields=score_fields, region_fields=region_fields) - writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, - score_fields=score_fields, region_fields=region_fields, out_args=out_args) + return None -# TODO: may be able to merge with parseIgBlast -def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, - region_fields=False, out_args=default_out_args): +# TODO: may be able to merge with other mains +def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, + parse_scores=False, parse_regions=False, out_args=default_out_args): """ - Main for IMGT aligned sample sequences + Main for iHMMuneAlign aligned sample sequences. Arguments: - imgt_output = zipped file or unzipped folder output by IMGT - seq_file = FASTA file input to IMGT (from which to get seqID) - no_parse = if ID is to be parsed for pRESTO output with default delimiters - score_fields = if True add alignment score fields to output file - region_fields = if True add FWR and CDR region fields to output file - out_args = common output argument dictionary from parseCommonArgs - - Returns: - None + aligner_output : iHMMune-Align output file to process. + seq_file : fasta file input to iHMMuneAlign (from which to get sequence). + repo : folder with germline repertoire files. + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : If True put incomplete alignments in the pass file. + parse_scores : if True parse alignment scores. + parse_regions : if True add FWR and CDR region fields. + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + None """ # Print parameter info log = OrderedDict() - log['START'] = 'MakeDb' - log['ALIGNER'] = 'IMGT' - log['ALIGN_RESULTS'] = imgt_output - log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' + log['START'] = 'MakeDB' + log['ALIGNER'] = 'iHMMune-Align' + log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) + log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse - log['SCORE_FIELDS'] = score_fields - log['REGION_FIELDS'] = region_fields + log['PARTIAL'] = partial + log['SCORES'] = parse_scores + log['REGIONS'] = parse_regions printLog(log) - - # Get individual IMGT result files - temp_dir, imgt_files = extractIMGT(imgt_output) - - # Formalize out_dir and file-prefix - if not out_args['out_dir']: - out_dir = os.path.dirname(os.path.abspath(imgt_output)) - else: - out_dir = os.path.abspath(out_args['out_dir']) - if not os.path.exists(out_dir): os.mkdir(out_dir) - if out_args['out_name']: - file_prefix = out_args['out_name'] - else: - file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] - file_prefix = os.path.join(out_dir, file_prefix) - total_count = countDbFile(imgt_files[0]) - - # Get (parsed) IDs from fasta file submitted to IMGT - id_dict = getIDforIMGT(seq_file) if seq_file else {} - - # Create - imgt_dict = readIMGT(imgt_files, score_fields=score_fields, - region_fields=region_fields) - writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, - score_fields=score_fields, region_fields=region_fields, out_args=out_args) + start_time = time() + printMessage('Loading sequence files', start_time=start_time, width=25) + # Count records in sequence file + total_count = countSeqFile(seq_file) + # Get input sequence dictionary + seq_dict = getSeqDict(seq_file) + # Create germline repo dictionary + repo_dict = readRepo(repo) + printMessage('Done', start_time=start_time, end=True, width=25) - # Delete temp directory - rmtree(temp_dir) + # Parse and write output + with open(aligner_output, 'r') as f: + parse_iter = IHMMuneReader(f, seq_dict, repo_dict, + parse_scores=parse_scores, parse_regions=parse_regions) + file_prefix = getFilePrefix(aligner_output, out_args) + writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, + no_parse=no_parse, partial=partial, out_args=out_args) + + return None def getArgParser(): """ - Defines the ArgumentParser + Defines the ArgumentParser. - Arguments: - None - Returns: - an ArgumentParser object + argparse.ArgumentParser """ fields = dedent( ''' output files: db-pass - database of parsed alignment records. + database of alignment records with functionality information, + V and J calls, and a junction region. db-fail - database with records failing alignment. + database with records that fail due to no functionality information + (did not pass IMGT), no V call, no J call, or no junction region. - output fields: - SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, - INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, - V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, - V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, - D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, + universal output fields: + SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, + FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, + V_CALL, D_CALL, J_CALL, + V_SEQ_START, V_SEQ_LENGTH, + D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, - JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, - J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, - FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT + JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, + FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, + CDR1_IMGT, CDR2_IMGT, CDR3_IMGT + + imgt specific output fields: + V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, + N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, + D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, + + igblast specific output fields: + V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, + V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, + J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP. + CDR3_IGBLAST_NT, CDR3_IGBLAST_AA + + ihmm specific output fields: + V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, + HMM_SCORE ''') # Define ArgumentParser @@ -933,11 +401,11 @@ parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) # IgBlast Aligner - parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', - parents=[parser_parent], - formatter_class=CommonHelpFormatter) - parser_igblast.set_defaults(func=parseIgBlast) - parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', + parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='Process IgBLAST output.', + description='Process IgBLAST output.') + parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', required=True, help='''IgBLAST output files in format 7 with query sequence (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') @@ -947,50 +415,112 @@ set of germlines used in the IgBLAST alignment.''') parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True, - help='List of input FASTA files containing sequences') + help='''List of input FASTA files (with .fasta, .fna or .fa + extension), containing sequences.''') parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', - help='''Specify if input IDs should not be parsed to add - new columns to database.''') - parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + parser_igblast.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file.''') + parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', help='''Specify if alignment score metrics should be included in the output. Adds the V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, J_BTOP, and J_EVALUE columns.''') - parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', - help='''Specify if IMGT framework and CDR regions should be + parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', + help='''Specify if IMGT FWR and CDRs should be included in the output. Adds the FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT columns.''') - + parser_igblast.add_argument('--cdr3', action='store_true', + dest='parse_igblast_cdr3', + help='''Specify if the CDR3 sequences generated by IgBLAST + should be included in the output. Adds the columns + CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST + version 1.5 or greater.''') + parser_igblast.set_defaults(func=parseIgBLAST) + # IMGT aligner - parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', - parents=[parser_parent], - formatter_class=CommonHelpFormatter) - imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True) - imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', - help='''Either zipped IMGT output files (.zip) or a folder - containing unzipped IMGT output files (which must - include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, - and 6_Junction).''') + parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='''Process IMGT/HighV-Quest output + (does not work with V-QUEST).''', + description='''Process IMGT/HighV-Quest output + (does not work with V-QUEST).''') + parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', + help='''Either zipped IMGT output files (.zip or .txz) or a + folder containing unzipped IMGT output files (which must + include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, + and 6_Junction).''') parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False, - help='List of input FASTA files containing sequences') + help='''List of input FASTA files (with .fasta, .fna or .fa + extension) containing sequences.''') parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', - help='''Specify if input IDs should not be parsed to add new - columns to database.''') - parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + parser_imgt.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file.''') + parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', help='''Specify if alignment score metrics should be included in the output. Adds the V_SCORE, V_IDENTITY, - J_SCORE and J_IDENTITY. Note, this will also add - the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP, - but they will be empty for IMGT output.''') - parser_imgt.add_argument('--regions', action='store_true', dest='region_fields', - help='''Specify if IMGT framework and CDR regions should be + J_SCORE and J_IDENTITY.''') + parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', + help='''Specify if IMGT FWRs and CDRs should be included in the output. Adds the FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT columns.''') + parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', + help='''Specify if detailed junction fields should be + included in the output. Adds the columns + N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, + P5J_LENGTH, D_FRAME.''') parser_imgt.set_defaults(func=parseIMGT) + # iHMMuneAlign Aligner + parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='Process iHMMune-Align output.', + description='Process iHMMune-Align output.') + parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', + required=True, + help='''iHMMune-Align output file.''') + parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, + help='''List of folders and/or FASTA files containing + IMGT-gapped germline sequences corresponding to the + set of germlines used in the IgBLAST alignment.''') + parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', + required=True, + help='''List of input FASTA files (with .fasta, .fna or .fa + extension) containing sequences.''') + parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + parser_ihmm.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file.''') + parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', + help='''Specify if alignment score metrics should be + included in the output. Adds the path score of the + iHMMune-Align hidden Markov model to HMM_SCORE.''') + parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', + help='''Specify if IMGT FWRs and CDRs should be + included in the output. Adds the FWR1_IMGT, FWR2_IMGT, + FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and + CDR3_IMGT columns.''') + parser_ihmm.set_defaults(func=parseIHMM) + return parser @@ -1000,7 +530,7 @@ """ parser = getArgParser() args = parser.parse_args() - args_dict = parseCommonArgs(args, in_arg='aligner_files') + args_dict = parseCommonArgs(args, in_arg='aligner_outputs') # Set no ID parsing if sequence files are not provided if 'seq_files' in args_dict and not args_dict['seq_files']: @@ -1008,18 +538,18 @@ # Delete if 'seq_files' in args_dict: del args_dict['seq_files'] - if 'aligner_files' in args_dict: del args_dict['aligner_files'] + if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] if 'command' in args_dict: del args_dict['command'] if 'func' in args_dict: del args_dict['func'] if args.command == 'imgt': - for i in range(len(args.__dict__['aligner_files'])): - args_dict['imgt_output'] = args.__dict__['aligner_files'][i] + for i in range(len(args.__dict__['aligner_outputs'])): + args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] args_dict['seq_file'] = args.__dict__['seq_files'][i] \ if args.__dict__['seq_files'] else None args.func(**args_dict) - elif args.command == 'igblast': - for i in range(len(args.__dict__['aligner_files'])): - args_dict['igblast_output'] = args.__dict__['aligner_files'][i] + elif args.command == 'igblast' or args.command == 'ihmm': + for i in range(len(args.__dict__['aligner_outputs'])): + args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] args_dict['seq_file'] = args.__dict__['seq_files'][i] args.func(**args_dict)