shm_csr: change_o/MakeDb.py comparison

comparison change_o/MakeDb.py @ 0:c33d93683a09 draft

Uploaded

author	davidvanzessen
date	Thu, 13 Oct 2016 10:52:24 -0400
parents
children	22dddabe3637

comparison

equal deleted inserted replaced

--1:000000000000
+:c33d93683a09
+#!/usr/bin/env python3
+"""
+Create tab-delimited database file to store sequence alignment information
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
+from changeo import __version__, __date__
+# Imports
+import csv
+import os
+import re
+import sys
+import pandas as pd
+import tarfile
+import zipfile
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import groupby
+from shutil import rmtree
+from tempfile import mkdtemp
+from textwrap import dedent
+from time import time
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+# Presto and changeo imports
+from presto.Defaults import default_out_args
+from presto.Annotation import parseAnnotation
+from presto.IO import countSeqFile, printLog, printProgress
+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
+from changeo.IO import getDbWriter, countDbFile, getRepo
+from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
+j_allele_regex
# Default parameters
# Three delimiter characters: tab, comma and hyphen.
# NOTE(review): this name is not referenced anywhere else in the visible part
# of this file; presumably it mirrors the pRESTO annotation delimiters --
# confirm against callers before changing or removing it.
default_delimiter = ('\t', ',', '-')
def gapV(ig_dict, repo_dict):
    """
    Build the IMGT-gapped sequence of a record from its gapped germline V segment

    Arguments:
      ig_dict : Dictionary of parsed IgBlast output. The keys V_GERM_START_VDJ,
                V_GERM_LENGTH_VDJ, SEQUENCE_VDJ and V_CALL are read.
      repo_dict : Dictionary of IMGT gapped germline sequences, looked up with
                  a 1-tuple holding the first V allele of V_CALL.

    Returns:
      dict : The same ig_dict object. When the allele is present in repo_dict
             the fields SEQUENCE_IMGT, V_GERM_START_IMGT and V_GERM_LENGTH_IMGT
             are added; otherwise the dictionary is returned untouched.
    """
    # Germline positions that precede the start of the alignment are padded
    # with '.' so that sequence coordinates line up with the germline.
    lead = int(ig_dict['V_GERM_START_VDJ']) - 1
    gapped_seq = '.' * lead + ig_dict['SEQUENCE_VDJ']

    # Find gapped germline V segment
    germ_key = (parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first'), )
    #TODO: Figure out else case
    if germ_key not in repo_dict:
        return ig_dict

    # Walk the '.' positions of the germline in order, copying each gap into
    # the sequence until the gaps fall beyond the aligned V region.
    n_gaps = lead
    for dot in re.finditer(r'\.', repo_dict[germ_key]):
        pos = dot.start()
        if pos >= ig_dict['V_GERM_LENGTH_VDJ'] + n_gaps:
            break
        gapped_seq = gapped_seq[:pos] + '.' + gapped_seq[pos:]
        n_gaps += 1

    ig_dict['SEQUENCE_IMGT'] = gapped_seq
    # Update IMGT positioning information for V
    ig_dict['V_GERM_START_IMGT'] = 1
    ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + n_gaps
    return ig_dict
def getIMGTJunc(ig_dict, repo_dict):
    """
    Cut the junction out of the IMGT-gapped sequence

    Arguments:
      ig_dict : Dictionary of parsed IgBlast output. The keys J_CALL,
                J_GERM_START, J_GERM_LENGTH and SEQUENCE_IMGT are read.
      repo_dict : Dictionary of IMGT gapped germline sequences, looked up with
                  a 1-tuple holding the first J allele of J_CALL.

    Returns:
      dict : The same ig_dict object. When the allele is present in repo_dict
             the fields JUNCTION and JUNCTION_LENGTH are added; otherwise the
             dictionary is returned untouched.
    """
    # Find germline J segment
    germ_key = (parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first'), )
    #TODO: Figure out else case
    if germ_key not in repo_dict:
        return ig_dict

    # Germline J sequence, cut at the end of the aligned portion
    aligned_end = ig_dict['J_GERM_START'] + ig_dict['J_GERM_LENGTH'] - 1
    j_germline = repo_dict[germ_key][:aligned_end]

    # Look for (F|W)GXG aa motif in nt sequence
    hit = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', j_germline)

    # Without a motif the junction runs to the end of the sequence
    junction_end = len(ig_dict['SEQUENCE_IMGT'])
    #TODO: Figure out else case
    if hit is not None:
        # Negative offset counted from the end of the truncated germline:
        # three nucleotides past the start of the motif.
        junction_end = hit.start() - len(j_germline) + 3

    # Add fields to dict; the junction slice always starts at index 309
    junction = ig_dict['SEQUENCE_IMGT'][309:junction_end]
    ig_dict['JUNCTION'] = junction
    ig_dict['JUNCTION_LENGTH'] = len(junction)
    return ig_dict
def getRegions(ig_dict):
    """
    Slice the IMGT-gapped sequence into framework and CDR regions

    Arguments:
      ig_dict : Dictionary of parsed alignment output. SEQUENCE_IMGT is
                required for any region to be added; JUNCTION_LENGTH is
                required for CDR3_IMGT and FWR4_IMGT.

    Returns:
      dict : The same ig_dict object, updated with FWR1_IMGT, CDR1_IMGT,
             FWR2_IMGT, CDR2_IMGT and FWR3_IMGT when SEQUENCE_IMGT is present,
             and with CDR3_IMGT and FWR4_IMGT when JUNCTION_LENGTH is present
             as well.
    """
    try:
        gapped = ig_dict['SEQUENCE_IMGT']
        total = len(gapped)
    except (KeyError, IndexError):
        return ig_dict

    # Fixed (start, stop) slice bounds of the V-region segments
    v_regions = (('FWR1_IMGT', 0, 78),
                 ('CDR1_IMGT', 78, 114),
                 ('FWR2_IMGT', 114, 165),
                 ('CDR2_IMGT', 165, 195),
                 ('FWR3_IMGT', 195, 312))
    for field, begin, end in v_regions:
        ig_dict[field] = gapped[begin:min(end, total)]

    # CDR3 and FWR4 depend on the junction length
    try:
        cdr3_stop = 306 + ig_dict['JUNCTION_LENGTH']
        ig_dict['CDR3_IMGT'] = gapped[312:cdr3_stop]
        ig_dict['FWR4_IMGT'] = gapped[cdr3_stop:]
    except (KeyError, IndexError):
        pass

    return ig_dict
def getSeqforIgBlast(seq_file):
    """
    Load the sequences that were submitted to IgBlast

    Arguments:
      seq_file = a fasta file of sequences input to IgBlast

    Returns:
      a dictionary mapping each record's full description line to its
      sequence as a plain string
    """
    records = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
    # Keyed on the complete description, not on the (shorter) record id
    return {rec.description: str(rec.seq) for rec in records.values()}
def findLine(handle, query):
    """
    Return the first line of an iterable that matches a pattern at its start

    Arguments:
      handle = file handle (or any iterable of lines) to search
      query = regular expression applied with re.match to each line

    Returns:
      the first matching line, or None when no line matches; iteration of
      handle stops at the matching line
    """
    return next((text for text in handle if re.match(query, text)), None)
def extractIMGT(imgt_output):
    """
    Extract necessary files from IMGT results, zipped or unzipped

    Arguments:
      imgt_output = zip archive, tar archive or unzipped folder output by IMGT

    Returns:
      a tuple (temp_dir, imgt_files): temp_dir is a freshly created temporary
      directory (archives are extracted into it; the caller removes it) and
      imgt_files is the sorted list of the four required file paths

    Exits:
      calls sys.exit with an error message when the input is none of the
      supported kinds, or when it does not hold exactly one file for each of
      the four required name prefixes
    """
    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')

    def _required(names):
        # Keep only entries whose file name starts with a required prefix
        return sorted(n for n in names
                      if os.path.basename(n).startswith(imgt_flags))

    temp_dir = mkdtemp()
    # NOTE(review): archive member names are trusted here; a member path
    # containing '..' would be extracted outside temp_dir. Only feed this
    # function archives from a trusted source.
    if zipfile.is_zipfile(imgt_output):
        # Context manager closes the archive even if extraction fails
        with zipfile.ZipFile(imgt_output, 'r') as imgt_zip:
            members = _required(imgt_zip.namelist())
            imgt_zip.extractall(temp_dir, members)
        imgt_files = [os.path.join(temp_dir, f) for f in members]
    elif os.path.isdir(imgt_output):
        # Find required files anywhere below the folder
        folder_files = []
        for root, dirs, files in os.walk(imgt_output):
            folder_files.extend(os.path.join(os.path.abspath(root), f)
                                for f in files)
        imgt_files = _required(folder_files)
    elif tarfile.is_tarfile(imgt_output):
        with tarfile.open(imgt_output, 'r') as imgt_tar:
            members = _required(imgt_tar.getnames())
            imgt_tar.extractall(temp_dir,
                                [imgt_tar.getmember(n) for n in members])
        imgt_files = [os.path.join(temp_dir, f) for f in members]
    else:
        # Do not leave the empty temporary directory behind on failure
        rmtree(temp_dir, ignore_errors=True)
        sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')

    if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
        rmtree(temp_dir, ignore_errors=True)
        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
    elif len(imgt_files) < len(imgt_flags):
        rmtree(temp_dir, ignore_errors=True)
        sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)

    return temp_dir, imgt_files
+# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
def readOneIgBlastResult(block):
    """
    Parse the lines belonging to a single IgBLAST query

    Arguments:
      block = iterable of the raw lines of one query result

    Returns:
      None if the block holds no data lines. Otherwise a list with one entry
      per non-empty section (sections are separated by blank lines and
      comment lines are ignored): the first and second sections contribute
      their first row as a list of tab-separated fields, every later section
      contributes a DataFrame of all its rows.
    """
    parsed = []
    section_no = 0
    for is_blank, lines in groupby(block, lambda l: l == '\n'):
        if is_blank:
            continue

        # Drop comment lines, strip whitespace and split the rest by tabs
        rows = [text.strip().split('\t')
                for text in lines if not text.startswith('#')]
        # Sections consisting only of comments do not count
        if not rows:
            continue
        section_no += 1

        # Section 1 is the "V-(D)-J rearrangement summary" and section 2 the
        # "V-(D)-J junction details"; both are kept as a single row.
        if section_no in (1, 2):
            parsed.append(rows[0])
            continue

        frame = pd.DataFrame(rows)
        if not frame.empty:
            parsed.append(frame)

    return parsed or None
+# TODO:  needs more speeds. pandas is probably to blame.
def readIgBlast(igblast_output, seq_dict, repo_dict,
                score_fields=False, region_fields=False):
    """
    Reads IgBlast output

    Each query in the file becomes one record. Queries for which
    readOneIgBlastResult() finds no data still yield a record holding only
    SEQUENCE_ID and SEQUENCE_INPUT.

    Arguments:
      igblast_output = IgBlast output file (format 7)
      seq_dict = a dictionary of {ID:Seq} from input fasta file; a query name
                 missing from it raises KeyError
      repo_dict = dictionary of IMGT gapped germline sequences
      score_fields = if True parse alignment scores
      region_fields = if True add FWR and CDR region fields

    Returns:
      a generator of IgRecord objects built from dictionaries containing
      alignment data

    NOTE(review): this view of the file has lost its indentation; the nesting
    below is reconstructed from the data dependencies between statements and
    should be confirmed against the repository copy.
    """
    # Open IgBlast output file
    with open(igblast_output) as f:
        # Iterate over individual results (separated by # IGBLASTN).
        # The key is a match object for header lines and None otherwise, so
        # the lines between two headers form one falsy-keyed group.
        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
            block = list(block)
            if not k1:
                # TODO: move query name extraction into block parser readOneIgBlastResult().
                # Extract sequence ID: everything after the first two
                # space-separated tokens of the group's first line
                query_name = ' '.join(block[0].strip().split(' ')[2:])
                # Initialize db_gen to have ID and input sequence
                db_gen = {'SEQUENCE_ID':     query_name,
                          'SEQUENCE_INPUT':  seq_dict[query_name]}

                # Parse further sub-blocks
                block_list = readOneIgBlastResult(block)

                # TODO: this is indented pretty far.  should be a separate function. or several functions.
                # If results exist, parse further to obtain full db_gen
                if block_list is not None:
                    # Parse quality information from the last four columns of
                    # the rearrangement summary row
                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
                    # A '-' in the last column means the input is stored
                    # reverse-complemented
                    if block_list[0][-1] == '-':
                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
                                                           IUPAC.ambiguous_dna).reverse_complement())

                    # Parse V, D, and J calls
                    call_str = ' '.join(block_list[0])
                    v_call = parseAllele(call_str, v_allele_regex, action='list')
                    d_call = parseAllele(call_str, d_allele_regex, action='list')
                    j_call = parseAllele(call_str, j_allele_regex, action='list')
                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'

                    # The junction row (block_list[1]) is currently not parsed.
                    # TODO:  IgBLAST sometimes omits block #3; find out why.
                    # TODO:  maybe such records should be failed.

                    # Parse segment start and stop positions from the last
                    # parsed section
                    hit_df = block_list[-1]

                    # Alignment info block
                    #  0:  segment
                    #  1:  query id
                    #  2:  subject id
                    #  3:  % identity
                    #  4:  alignment length
                    #  5:  mismatches
                    #  6:  gap opens
                    #  7:  gaps
                    #  8:  q. start
                    #  9:  q. end
                    # 10:  s. start
                    # 11:  s. end
                    # 12:  evalue
                    # 13:  bit score
                    # 14:  query seq
                    # 15:  subject seq
                    # 16:  btop

                    # If V call exists, parse V alignment information
                    seq_vdj = ''
                    if v_call is not None:
                        # First 'V' row of the hit table
                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
                        # Germline positions
                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
                        # Query sequence positions
                        db_gen['V_SEQ_START'] = int(v_align[8])
                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1

                        # INDELS reflects whether the V hit has any gap opens
                        if int(v_align[6]) == 0:
                            db_gen['INDELS'] = 'F'
                        else:
                            db_gen['INDELS'] = 'T'
                            # Set functional to none so record gets tossed (junction will be wrong)
                            # db_gen['FUNCTIONAL'] = None

                        # V alignment scores
                        if score_fields:
                            try: db_gen['V_SCORE'] = float(v_align[13])
                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'

                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'

                            try: db_gen['V_EVALUE'] = float(v_align[12])
                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'

                            try: db_gen['V_BTOP'] = v_align[16]
                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'

                        # Update VDJ sequence, removing insertions: query
                        # characters aligned to a '-' in the subject are
                        # skipped
                        start = 0
                        for m in re.finditer(r'-', v_align[15]):
                            ins = m.start()
                            seq_vdj += v_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += v_align[14][start:]

                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
                    # If D call exists, parse D alignment information
                    if d_call is not None:
                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]

                        # TODO:  find a cleaner way to resolve the alignment overlap problem.
                        # Determine N-region length and amount of D overlap with the V alignment
                        overlap = 0
                        if v_call is not None:
                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
                            if n1_len < 0:
                                # D starts inside V: no N region, trim D by the overlap
                                db_gen['N1_LENGTH'] = 0
                                overlap = abs(n1_len)
                            else:
                                db_gen['N1_LENGTH'] = n1_len
                                # Copy the unaligned bases between V and D
                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
                                n1_end = int(d_align[8])-1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]

                        # Query sequence positions
                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)

                        # Germline positions
                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)

                        # Update VDJ sequence, removing insertions
                        start = overlap
                        for m in re.finditer(r'-', d_align[15]):
                            ins = m.start()
                            seq_vdj += d_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += d_align[14][start:]

                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
                    # If J call exists, parse J alignment information
                    if j_call is not None:
                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]

                        # TODO:  find a cleaner way to resolve the alignment overlap problem.
                        # Determine N-region length and amount of J overlap with V or D alignment
                        overlap = 0
                        if d_call is not None:
                            # N2 lies between D and J
                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
                            if n2_len < 0:
                                db_gen['N2_LENGTH'] = 0
                                overlap = abs(n2_len)
                            else:
                                db_gen['N2_LENGTH'] = n2_len
                                n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
                                n2_end = int(j_align[8])-1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
                        elif v_call is not None:
                            # No D call: N1 lies between V and J
                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
                            if n1_len < 0:
                                db_gen['N1_LENGTH'] = 0
                                overlap = abs(n1_len)
                            else:
                                db_gen['N1_LENGTH'] = n1_len
                                n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
                                n1_end = int(j_align[8])-1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
                        else:
                            db_gen['N1_LENGTH'] = 0

                        # Query positions
                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)

                        # Germline positions
                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)

                        # J alignment scores
                        if score_fields:
                            try: db_gen['J_SCORE'] = float(j_align[13])
                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'

                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'

                            try: db_gen['J_EVALUE'] = float(j_align[12])
                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'

                            try: db_gen['J_BTOP'] = j_align[16]
                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'

                        # Update VDJ sequence, removing insertions
                        start = overlap
                        for m in re.finditer(r'-', j_align[15]):
                            ins = m.start()
                            seq_vdj += j_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += j_align[14][start:]

                    db_gen['SEQUENCE_VDJ'] = seq_vdj

                    # Create IMGT-gapped sequence and infer IMGT junction
                    # NOTE(review): the J step is nested under the V step
                    # here because getIMGTJunc() reads SEQUENCE_IMGT, which
                    # only gapV() sets -- confirm the nesting upstream.
                    if v_call is not None:
                        db_gen = gapV(db_gen, repo_dict)
                        if j_call is not None:
                            db_gen = getIMGTJunc(db_gen, repo_dict)

                    # FWR and CDR regions
                    if region_fields: getRegions(db_gen)

                yield IgRecord(db_gen)
+# TODO:  should be more readable
def readIMGT(imgt_files, score_fields=False, region_fields=False):
    """
    Reads IMGT/HighV-Quest output

    The four files are read in lockstep, one row from each per sequence.

    Arguments:
      imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6, in that order
                   (summary, IMGT-gapped, nt-sequences, junction)
      score_fields = if True parse alignment scores
      region_fields = if True add FWR and CDR region fields

    Returns:
      a generator of IgRecord objects built from dictionaries containing
      alignment data; rows whose Functionality reads 'No results' yield a
      record with only the IDs, input sequence and 'None' gene calls

    Exits:
      calls sys.exit when the four files disagree on the Sequence ID of a row
    """
    handles = []
    try:
        # Plain 'r' is what 'rU' meant on Python 3; the 'U' flag itself is
        # rejected with ValueError from Python 3.11 onwards.
        for f in imgt_files:
            handles.append(open(f, 'r'))
        imgt_iters = [csv.DictReader(h, delimiter='\t') for h in handles]

        # Create a dictionary for each sequence alignment and yield its generator
        for sm, gp, nt, jn in zip(*imgt_iters):
            if len(set([sm['Sequence ID'],
                        gp['Sequence ID'],
                        nt['Sequence ID'],
                        jn['Sequence ID']])) != 1:
                sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
                         % sm['Sequence ID'])

            db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
                      'SEQUENCE_INPUT': sm['Sequence']}

            if 'No results' not in sm['Functionality']:
                # The flag lists are indexed by a sum of booleans. Note that
                # 'unproductive' also contains 'productive' (index 2), and
                # likewise 'out-of-frame' contains 'in-frame'.
                db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
                                                     ('unprod' in sm['Functionality'])]
                # Fixed: a trailing comma used to turn this value into a
                # 1-tuple instead of a string.
                db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
                                                   ('out-of-frame' in sm['JUNCTION frame'])]
                db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
                                               ('unprod' in sm['Functionality'])]
                db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
                                                                  'missing' in sm['V-REGION potential ins/del']))) +
                                                            ('unprod' in sm['Functionality'])]
                # 'T' when any of the three indel columns is non-empty
                db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
                                                  sm['V-REGION insertions'],
                                                  sm['V-REGION deletions']))]

                # Prefer the V-D-J columns and fall back to V-J
                db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
                db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']

                # Drop commas, then turn ' or ' separators into commas
                db_gen['V_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
                db_gen['D_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
                db_gen['J_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))

                v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
                db_gen['V_SEQ_START'] = nt['V-REGION start']
                db_gen['V_SEQ_LENGTH'] = v_seq_length
                db_gen['V_GERM_START_IMGT'] = 1
                db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0

                # Empty junction columns are skipped by the 'if i' filters
                db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
                                                           jn['N-REGION-nt nb'],
                                                           jn['N1-REGION-nt nb'],
                                                           jn["P5'D-nt nb"]] if i)
                db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
                                                             jn["P3'V-nt nb"],
                                                             jn['N-REGION-nt nb'],
                                                             jn['N1-REGION-nt nb'],
                                                             jn["P5'D-nt nb"]] if i)
                db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
                db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
                db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
                db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
                                                           jn['N2-REGION-nt nb'],
                                                           jn["P5'J-nt nb"]] if i)

                # NOTE(review): writeDb() lists the output column as
                # 'J_SEQ_START', not 'J_SEQ_START_IMGT' -- confirm which key
                # IgRecord expects before renaming.
                db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
                                                                  jn["P3'V-nt nb"],
                                                                  jn['N-REGION-nt nb'],
                                                                  jn['N1-REGION-nt nb'],
                                                                  jn["P5'D-nt nb"],
                                                                  jn["D-REGION-nt nb"],
                                                                  jn["P3'D-nt nb"],
                                                                  jn['N2-REGION-nt nb'],
                                                                  jn["P5'J-nt nb"]] if i)
                db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
                db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
                db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0

                db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
                db_gen['JUNCTION'] = jn['JUNCTION']

                # Alignment scores; unparsable values become the string 'None'
                if score_fields:
                    try:  db_gen['V_SCORE'] = float(sm['V-REGION score'])
                    except (TypeError, ValueError):  db_gen['V_SCORE'] = 'None'

                    try:  db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
                    except (TypeError, ValueError):  db_gen['V_IDENTITY'] = 'None'

                    try:  db_gen['J_SCORE'] = float(sm['J-REGION score'])
                    except (TypeError, ValueError):  db_gen['J_SCORE'] = 'None'

                    try:  db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
                    except (TypeError, ValueError):  db_gen['J_IDENTITY'] = 'None'

                # FWR and CDR regions
                if region_fields: getRegions(db_gen)
            else:
                db_gen['V_CALL'] = 'None'
                db_gen['D_CALL'] = 'None'
                db_gen['J_CALL'] = 'None'

            yield IgRecord(db_gen)
    finally:
        # Runs when the generator is exhausted, closed, or fails
        for h in handles:
            h.close()
def getIDforIMGT(seq_file):
    """
    Create a sequence ID translation using IMGT truncation

    Descriptions of at most 50 characters are used as keys unchanged. Longer
    ones are cut to their first 50 characters, with each whitespace character
    and each of | ! & * < > ? replaced by an underscore.

    Arguments:
      seq_file = a fasta file of sequences input to IMGT

    Returns:
      a dictionary of {truncated ID: full seq description}; when two
      descriptions produce the same key the later record wins
    """
    ids = {}
    for rec in SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna):
        if len(rec.description) <= 50:
            id_key = rec.description
        else:
            # Raw string: same pattern as before, without the invalid
            # escape sequences that newer Python versions warn about
            id_key = re.sub(r'\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
        ids[id_key] = rec.description

    return ids
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
            score_fields=False, region_fields=False, out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory

    Records are split into a pass file (<file_prefix>_db-pass.tab) and,
    when out_args['failed'] is set, a fail file (<file_prefix>_db-fail.tab).
    A record fails when it has neither a V nor a J call, when its functional
    flag is None, or when its VDJ sequence or junction is empty. The pass
    file is only created once the first passing record is seen.

    Arguments:
      db_gen = a generator of IgRecord objects containing alignment data
      file_prefix = directory and prefix for CLIP tab-delim file
      total_count = number of records (for progress bar)
      id_dict = a dictionary of {IMGT ID: full seq description}; only read,
                never modified here
      no_parse = if ID is to be parsed for pRESTO output with default delimiters
      score_fields = if True add alignment score fields to output file
      region_fields = if True add FWR and CDR region fields to output file
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      None
    """
    pass_file = "%s_db-pass.tab" % file_prefix
    fail_file = "%s_db-fail.tab" % file_prefix
    ordered_fields = ['SEQUENCE_ID',
                      'SEQUENCE_INPUT',
                      'FUNCTIONAL',
                      'IN_FRAME',
                      'STOP',
                      'MUTATED_INVARIANT',
                      'INDELS',
                      'V_CALL',
                      'D_CALL',
                      'J_CALL',
                      'SEQUENCE_VDJ',
                      'SEQUENCE_IMGT',
                      'V_SEQ_START',
                      'V_SEQ_LENGTH',
                      'V_GERM_START_VDJ',
                      'V_GERM_LENGTH_VDJ',
                      'V_GERM_START_IMGT',
                      'V_GERM_LENGTH_IMGT',
                      'N1_LENGTH',
                      'D_SEQ_START',
                      'D_SEQ_LENGTH',
                      'D_GERM_START',
                      'D_GERM_LENGTH',
                      'N2_LENGTH',
                      'J_SEQ_START',
                      'J_SEQ_LENGTH',
                      'J_GERM_START',
                      'J_GERM_LENGTH',
                      'JUNCTION_LENGTH',
                      'JUNCTION']

    if score_fields:
        ordered_fields.extend(['V_SCORE',
                               'V_IDENTITY',
                               'V_EVALUE',
                               'V_BTOP',
                               'J_SCORE',
                               'J_IDENTITY',
                               'J_EVALUE',
                               'J_BTOP'])

    if region_fields:
        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])

    # TODO:  This is not the best approach. should pass in output fields.
    pass_handle = None
    fail_handle = None
    # try/finally guarantees both files are closed even when the record
    # generator or one of the writers raises part-way through
    try:
        # Open failed file
        if out_args['failed']:
            fail_handle = open(fail_file, 'wt')
            fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
        else:
            fail_writer = None

        # Initialize counters and file
        pass_writer = None
        start_time = time()
        rec_count = pass_count = fail_count = 0
        for record in db_gen:
            printProgress(rec_count, total_count, 0.05, start_time)
            rec_count += 1

            # Count pass or fail
            if (record.v_call == 'None' and record.j_call == 'None') or \
                    record.functional is None or \
                    not record.seq_vdj or \
                    not record.junction:
                fail_count += 1
                if fail_writer is not None: fail_writer.writerow(record.toDict())
                continue
            else:
                pass_count += 1

            # Build sample sequence description
            if record.id in id_dict:
                record.id = id_dict[record.id]

            # Parse sequence description into new columns
            if not no_parse:
                record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
                record.id = record.annotations['ID']
                del record.annotations['ID']

            # TODO:  This is not the best approach. should pass in output fields.
            # If first sequence, use parsed description to create new columns and initialize writer
            if pass_writer is None:
                if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
                pass_handle = open(pass_file, 'wt')
                pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)

            # Write row to tab-delim CLIP file
            pass_writer.writerow(record.toDict())

        # Print log
        printProgress(rec_count, total_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = pass_file
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'MakeDb'
        printLog(log)
    finally:
        if pass_handle is not None: pass_handle.close()
        if fail_handle is not None: fail_handle.close()
+# TODO:  may be able to merge with parseIMGT
def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
                 region_fields=False, out_args=default_out_args):
    """
    Main for IgBlast aligned sample sequences

    Arguments:
      igblast_output = IgBlast output file to process
      seq_file = fasta file input to IgBlast (from which to get sequence)
      repo = folder with germline repertoire files
      no_parse = if ID is to be parsed for pRESTO output with default delimiters
      score_fields = if True add alignment score fields to output file
      region_fields = if True add FWR and CDR region fields to output file
      out_args = common output argument dictionary from parseCommonArgs;
                 the keys 'out_dir' and 'out_name' are read here

    Returns:
      None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDB'
    log['ALIGNER'] = 'IgBlast'
    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get input sequence dictionary
    seq_dict = getSeqforIgBlast(seq_file)

    # Formalize out_dir and file-prefix
    if not out_args['out_dir']:
        # abspath first: for a bare file name the directory part would
        # otherwise be '' and could not be created or joined sensibly
        out_dir = os.path.dirname(os.path.abspath(igblast_output))
    else:
        out_dir = os.path.abspath(out_args['out_dir'])
    # Creates missing parent directories too; a no-op when it exists
    os.makedirs(out_dir, exist_ok=True)

    if out_args['out_name']:
        file_prefix = out_args['out_name']
    else:
        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
    file_prefix = os.path.join(out_dir, file_prefix)

    total_count = countSeqFile(seq_file)

    # Create the lazy record generator and write it out
    repo_dict = getRepo(repo)
    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
                               score_fields=score_fields, region_fields=region_fields)
    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+# TODO:  may be able to merge with parseIgBlast
def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
              region_fields=False, out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences

    Arguments:
      imgt_output = zipped file or unzipped folder output by IMGT
      seq_file = FASTA file input to IMGT (from which to get seqID)
      no_parse = if ID is to be parsed for pRESTO output with default delimiters
      score_fields = if True add alignment score fields to output file
      region_fields = if True add FWR and CDR region fields to output file
      out_args = common output argument dictionary from parseCommonArgs;
                 the keys 'out_dir' and 'out_name' are read here

    Returns:
      None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['ALIGNER'] = 'IMGT'
    log['ALIGN_RESULTS'] = imgt_output
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get individual IMGT result files
    temp_dir, imgt_files = extractIMGT(imgt_output)

    # From here on the temporary directory exists; remove it on every exit
    # path, including failures while reading or writing
    try:
        # Formalize out_dir and file-prefix
        if not out_args['out_dir']:
            out_dir = os.path.dirname(os.path.abspath(imgt_output))
        else:
            out_dir = os.path.abspath(out_args['out_dir'])
        # Creates missing parent directories too; a no-op when it exists
        os.makedirs(out_dir, exist_ok=True)

        if out_args['out_name']:
            file_prefix = out_args['out_name']
        else:
            file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
        file_prefix = os.path.join(out_dir, file_prefix)

        # Record count is taken from the first (summary) file
        total_count = countDbFile(imgt_files[0])

        # Get (parsed) IDs from fasta file submitted to IMGT
        id_dict = getIDforIMGT(seq_file) if seq_file else {}

        # Create the lazy record generator and write it out
        imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
                             region_fields=region_fields)
        writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
                score_fields=score_fields, region_fields=region_fields, out_args=out_args)
    finally:
        # Delete temp directory
        rmtree(temp_dir)
def getArgParser():
    """
    Defines the ArgumentParser

    The parser has two required subcommands, 'igblast' and 'imgt'; each one
    stores the function that handles it (parseIgBlast or parseIMGT) as the
    'func' default.

    Arguments:
      None

    Returns:
      an ArgumentParser object
    """
    # NOTE(review): the multi-line string literals in this function are
    # reproduced flush-left, exactly as they appear in this view of the file;
    # the repository copy may indent their continuation lines -- confirm
    # before relying on the rendered help layout.
    fields = dedent(
             '''
output files:
db-pass
database of parsed alignment records.
db-fail
database with records failing alignment.
output fields:
SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
''')

    # Define ArgumentParser; the module docstring is the description and the
    # field list above is shown after the argument help
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            formatter_class=CommonHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
    subparsers = parser.add_subparsers(title='subcommands', dest='command',
                                       help='Aligner used', metavar='')
    # TODO:  This is a temporary fix for Python issue 9253
    subparsers.required = True

    # Parent parser supplying the arguments shared by both subcommands
    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)

    # IgBlast Aligner
    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
                                           parents=[parser_parent],
                                           formatter_class=CommonHelpFormatter)
    parser_igblast.set_defaults(func=parseIgBlast)
    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
                                required=True,
                                help='''IgBLAST output files in format 7 with query sequence
(IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
                                help='''List of folders and/or fasta files containing
IMGT-gapped germline sequences corresponding to the
set of germlines used in the IgBLAST alignment.''')
    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
                                required=True,
                                help='List of input FASTA files containing sequences')
    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
                                help='''Specify if input IDs should not be parsed to add
new columns to database.''')
    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
                                help='''Specify if alignment score metrics should be
included in the output. Adds the V_SCORE, V_IDENTITY,
V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
J_BTOP, and J_EVALUE columns.''')
    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
                                help='''Specify if IMGT framework and CDR regions should be
included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
CDR3_IMGT columns.''')

    # IMGT aligner
    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
                                        parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter)
    # A required group with a single member: it makes '-i' mandatory
    imgt_arg_group =  parser_imgt.add_mutually_exclusive_group(required=True)
    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
                                help='''Either zipped IMGT output files (.zip) or a folder
containing unzipped IMGT output files (which must
include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
and 6_Junction).''')
    # Unlike for igblast, the FASTA files are optional here
    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
                             required=False,
                             help='List of input FASTA files containing sequences')
    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
                             help='''Specify if input IDs should not be parsed to add new
columns to database.''')
    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
                             help='''Specify if alignment score metrics should be
included in the output. Adds the V_SCORE, V_IDENTITY,
J_SCORE and J_IDENTITY. Note, this will also add
the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
but they will be empty for IMGT output.''')
    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
                             help='''Specify if IMGT framework and CDR regions should be
included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
CDR3_IMGT columns.''')
    parser_imgt.set_defaults(func=parseIMGT)

    return parser
if __name__ == "__main__":
    # Parses command line arguments and calls the handler of the chosen
    # subcommand once per aligner output file.
    parser = getArgParser()
    args = parser.parse_args()
    args_dict = parseCommonArgs(args, in_arg='aligner_files')

    # Set no ID parsing if sequence files are not provided
    if 'seq_files' in args_dict and not args_dict['seq_files']:
        args_dict['no_parse'] = True

    # Remove the entries that are not parameters of the handler functions;
    # the per-file arguments are filled in inside the loops below
    for key in ('seq_files', 'aligner_files', 'command', 'func'):
        args_dict.pop(key, None)

    aligner_files = args.aligner_files
    seq_files = args.seq_files or []

    # Files are paired by position, so report a short '-s' list as a usage
    # error instead of failing later with an IndexError
    if seq_files and len(seq_files) < len(aligner_files):
        parser.error('%i aligner output file(s) given with -i but only %i sequence file(s) given with -s'
                     % (len(aligner_files), len(seq_files)))

    if args.command == 'imgt':
        for i, aligner_file in enumerate(aligner_files):
            args_dict['imgt_output'] = aligner_file
            # Sequence files are optional for IMGT
            args_dict['seq_file'] = seq_files[i] if seq_files else None
            args.func(**args_dict)
    elif args.command == 'igblast':
        for aligner_file, seq_file in zip(aligner_files, seq_files):
            args_dict['igblast_output'] = aligner_file
            args_dict['seq_file'] = seq_file
            args.func(**args_dict)

Mercurial > repos > davidvanzessen > shm_csr

comparison change_o/MakeDb.py @ 0:c33d93683a09 draft