comparison change_o/MakeDb.py @ 52:22dddabe3637 draft

Uploaded
author davidvanzessen
date Tue, 23 May 2017 08:32:58 -0400
parents c33d93683a09
children
comparison
equal deleted inserted replaced
51:8fa8836bd605 52:22dddabe3637
5 # Info 5 # Info
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__ 7 from changeo import __version__, __date__
8 8
9 # Imports 9 # Imports
10 import csv
11 import os 10 import os
12 import re
13 import sys 11 import sys
14 import pandas as pd
15 import tarfile
16 import zipfile
17 from argparse import ArgumentParser 12 from argparse import ArgumentParser
18 from collections import OrderedDict 13 from collections import OrderedDict
19 from itertools import groupby
20 from shutil import rmtree
21 from tempfile import mkdtemp
22 from textwrap import dedent 14 from textwrap import dedent
23 from time import time 15 from time import time
24 from Bio import SeqIO 16 from Bio import SeqIO
25 from Bio.Seq import Seq
26 from Bio.Alphabet import IUPAC
27 17
28 # Presto and changeo imports 18 # Presto and changeo imports
29 from presto.Defaults import default_out_args 19 from presto.Defaults import default_out_args
30 from presto.Annotation import parseAnnotation 20 from presto.Annotation import parseAnnotation
31 from presto.IO import countSeqFile, printLog, printProgress 21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs 22 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
33 from changeo.IO import getDbWriter, countDbFile, getRepo 23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ 24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
35 j_allele_regex 25
36 26
37 # Default parameters 27 def getFilePrefix(aligner_output, out_args):
38 default_delimiter = ('\t', ',', '-') 28 """
39 29 Get file name prefix and create output directory
40
41 def gapV(ig_dict, repo_dict):
42 """
43 Insert gaps into V region and update alignment information
44 30
45 Arguments: 31 Arguments:
46 ig_dict : Dictionary of parsed IgBlast output 32 aligner_output : aligner output file or directory.
47 repo_dict : Dictionary of IMGT gapped germline sequences 33 out_args : dictionary of output arguments.
48 34
49 Returns: 35 Returns:
50 dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields 36 str : file name prefix.
51 """ 37 """
52 38 # Determine output directory
53 seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] 39 if not out_args['out_dir']:
54 40 out_dir = os.path.dirname(os.path.abspath(aligner_output))
55 # Find gapped germline V segment 41 else:
56 vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') 42 out_dir = os.path.abspath(out_args['out_dir'])
57 vkey = (vgene, ) 43 if not os.path.exists(out_dir):
58 #TODO: Figure out else case 44 os.mkdir(out_dir)
59 if vkey in repo_dict: 45
60 vgap = repo_dict[vkey] 46 # Determine file prefix
61 # Iterate over gaps in the germline segment 47 if out_args['out_name']:
62 gaps = re.finditer(r'\.', vgap) 48 file_prefix = out_args['out_name']
63 gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 49 else:
64 for gap in gaps: 50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
65 i = gap.start() 51
66 # Break if gap begins after V region 52 return os.path.join(out_dir, file_prefix)
67 if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: 53
68 break 54
69 # Insert gap into IMGT sequence 55 def getSeqDict(seq_file):
70 seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] 56 """
71 # Update gap counter 57 Create a dictionary from a sequence file.
72 gapcount += 1
73 ig_dict['SEQUENCE_IMGT'] = seq_imgt
74 # Update IMGT positioning information for V
75 ig_dict['V_GERM_START_IMGT'] = 1
76 ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
77
78 return ig_dict
79
80
81 def getIMGTJunc(ig_dict, repo_dict):
82 """
83 Identify junction region by IMGT definition
84 58
85 Arguments: 59 Arguments:
86 ig_dict : Dictionary of parsed IgBlast output 60 seq_file : sequence file.
87 repo_dict : Dictionary of IMGT gapped germline sequences
88 61
89 Returns: 62 Returns:
90 dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields 63 dict : sequence description as keys with Bio.SeqRecords as values.
91 """ 64 """
92 # Find germline J segment 65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
93 jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') 66 key_function=lambda x: x.description)
94 jkey = (jgene, ) 67
95 #TODO: Figure out else case 68 return seq_dict
96 if jkey in repo_dict: 69
97 # Get germline J sequence 70
98 jgerm = repo_dict[jkey] 71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
99 jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] 72 out_args=default_out_args):
100 # Look for (F|W)GXG aa motif in nt sequence 73 """
101 motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) 74 Writes tab-delimited database file in output directory.
102 aa_end = len(ig_dict['SEQUENCE_IMGT'])
103 #TODO: Figure out else case
104 if motif:
105 # print('\n', motif.group())
106 aa_end = motif.start() - len(jgerm) + 3
107 # Add fields to dict
108 ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
109 ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
110
111 return ig_dict
112
113
114 def getRegions(ig_dict):
115 """
116 Identify FWR and CDR regions by IMGT definition
117
118 Arguments:
119 ig_dict : Dictionary of parsed alignment output
120
121 Returns:
122 dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
123 CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
124 """
125 try:
126 seq_len = len(ig_dict['SEQUENCE_IMGT'])
127 ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
128 except (KeyError, IndexError):
129 return ig_dict
130
131 try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
132 except (IndexError): return ig_dict
133
134 try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
135 except (IndexError): return ig_dict
136
137 try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
138 except (IndexError): return ig_dict
139
140 try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
141 except (IndexError): return ig_dict
142
143 try:
144 cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
145 ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
146 ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
147 except (KeyError, IndexError):
148 return ig_dict
149
150 return ig_dict
151
152
153 def getSeqforIgBlast(seq_file):
154 """
155 Fetch input sequences for IgBlast queries
156
157 Arguments:
158 seq_file = a fasta file of sequences input to IgBlast
159
160 Returns:
161 a dictionary of {ID:Seq}
162 """
163
164 seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
165
166 # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
167 seqs = {}
168 for seq in seq_dict.values():
169 seqs.update({seq.description:str(seq.seq)})
170
171 return seqs
172
173
174 def findLine(handle, query):
175 """
176 Finds line with query string in file
177
178 Arguments:
179 handle = file handle in which to search for line
180 query = query string for which to search in file
181
182 Returns:
183 line from handle in which query string was found
184 """
185 for line in handle:
186 if(re.match(query, line)):
187 return line
188
189
190 def extractIMGT(imgt_output):
191 """
192 Extract necessary files from IMGT results, zipped or unzipped
193 75
194 Arguments: 76 Arguments:
195 imgt_output = zipped file or unzipped folder output by IMGT 77 db : a iterator of IgRecord objects containing alignment data.
196 78 fields : a list of ordered field names to write.
79 file_prefix : directory and prefix for CLIP tab-delim file.
80 total_count : number of records (for progress bar).
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
83 partial : if True put incomplete alignments in the pass file.
84 out_args : common output argument dictionary from parseCommonArgs.
85
197 Returns: 86 Returns:
198 sorted list of filenames from which information will be read 87 None
199 """ 88 """
200 #file_ext = os.path.splitext(imgt_output)[1].lower() 89 # Function to check for valid records strictly
201 imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') 90 def _pass_strict(rec):
202 temp_dir = mkdtemp() 91 valid = [rec.v_call and rec.v_call != 'None',
203 if zipfile.is_zipfile(imgt_output): 92 rec.j_call and rec.j_call != 'None',
204 # Open zip file 93 rec.functional is not None,
205 imgt_zip = zipfile.ZipFile(imgt_output, 'r') 94 rec.seq_vdj,
206 # Extract required files 95 rec.junction]
207 imgt_files = sorted([n for n in imgt_zip.namelist() \ 96 return all(valid)
208 if os.path.basename(n).startswith(imgt_flags)]) 97
209 imgt_zip.extractall(temp_dir, imgt_files) 98 # Function to check for valid records loosely
210 # Define file list 99 def _pass_gentle(rec):
211 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] 100 valid = [rec.v_call and rec.v_call != 'None',
212 elif os.path.isdir(imgt_output): 101 rec.d_call and rec.d_call != 'None',
213 # Find required files in folder 102 rec.j_call and rec.j_call != 'None']
214 folder_files = [] 103 return any(valid)
215 for root, dirs, files in os.walk(imgt_output): 104
216 folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) 105 # Set pass criteria
217 # Define file list 106 _pass = _pass_gentle if partial else _pass_strict
218 imgt_files = sorted([n for n in folder_files \ 107
219 if os.path.basename(n).startswith(imgt_flags)]) 108 # Define output file names
220 elif tarfile.is_tarfile(imgt_output): 109 pass_file = '%s_db-pass.tab' % file_prefix
221 # Open zip file 110 fail_file = '%s_db-fail.tab' % file_prefix
222 imgt_tar = tarfile.open(imgt_output, 'r') 111
223 # Extract required files 112 # Initiate handles, writers and counters
224 imgt_files = sorted([n for n in imgt_tar.getnames() \
225 if os.path.basename(n).startswith(imgt_flags)])
226 imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
227 # Define file list
228 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
229 else:
230 sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
231
232 if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
233 sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
234 elif len(imgt_files) < len(imgt_flags):
235 sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)
236
237 return temp_dir, imgt_files
238
239
240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
241 def readOneIgBlastResult(block):
242 """
243 Parse a single IgBLAST query result
244
245 Arguments:
246 block = itertools groupby object of single result
247
248 Returns:
249 None if no results, otherwise list of DataFrames for each result block
250 """
251 results = list()
252 i = 0
253 for match, subblock in groupby(block, lambda l: l=='\n'):
254 if not match:
255 # Strip whitespace and comments
256 sub = [s.strip() for s in subblock if not s.startswith('#')]
257
258 # Continue on empty block
259 if not sub: continue
260 else: i += 1
261
262 # Split by tabs
263 sub = [s.split('\t') for s in sub]
264
265 # Append list for "V-(D)-J rearrangement summary" (i == 1)
266 # And "V-(D)-J junction details" (i == 2)
267 # Otherwise append DataFrame of subblock
268 if i == 1 or i == 2:
269 results.append(sub[0])
270 else:
271 df = pd.DataFrame(sub)
272 if not df.empty: results.append(df)
273
274 return results if results else None
275
276
277 # TODO: needs more speeds. pandas is probably to blame.
278 def readIgBlast(igblast_output, seq_dict, repo_dict,
279 score_fields=False, region_fields=False):
280 """
281 Reads IgBlast output
282
283 Arguments:
284 igblast_output = IgBlast output file (format 7)
285 seq_dict = a dictionary of {ID:Seq} from input fasta file
286 repo_dict = dictionary of IMGT gapped germline sequences
287 score_fields = if True parse alignment scores
288 region_fields = if True add FWR and CDR region fields
289
290 Returns:
291 a generator of dictionaries containing alignment data
292 """
293
294 # Open IgBlast output file
295 with open(igblast_output) as f:
296 # Iterate over individual results (separated by # IGBLASTN)
297 for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
298 block = list(block)
299 if not k1:
300 # TODO: move query name extraction into block parser readOneIgBlastResult().
301 # Extract sequence ID
302 query_name = ' '.join(block[0].strip().split(' ')[2:])
303 # Initialize db_gen to have ID and input sequence
304 db_gen = {'SEQUENCE_ID': query_name,
305 'SEQUENCE_INPUT': seq_dict[query_name]}
306
307 # Parse further sub-blocks
308 block_list = readOneIgBlastResult(block)
309
310 # TODO: this is indented pretty far. should be a separate function. or several functions.
311 # If results exist, parse further to obtain full db_gen
312 if block_list is not None:
313 # Parse quality information
314 db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
315 db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
316 db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
317 if block_list[0][-1] == '-':
318 db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
319 IUPAC.ambiguous_dna).reverse_complement())
320
321 # Parse V, D, and J calls
322 call_str = ' '.join(block_list[0])
323 v_call = parseAllele(call_str, v_allele_regex, action='list')
324 d_call = parseAllele(call_str, d_allele_regex, action='list')
325 j_call = parseAllele(call_str, j_allele_regex, action='list')
326 db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
327 db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
328 db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
329
330 # Parse junction sequence
331 # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
332 # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
333
334 # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why?
335 # TODO: maybe we should fail these. they look craptastic.
336 #pd.set_option('display.width', 500)
337 #print query_name, len(block_list), hit_idx
338 #for i, x in enumerate(block_list):
339 # print '[%i]' % i
340 # print x
341
342 # Parse segment start and stop positions
343 hit_df = block_list[-1]
344
345 # Alignment info block
346 # 0: segment
347 # 1: query id
348 # 2: subject id
349 # 3: % identity
350 # 4: alignment length
351 # 5: mismatches
352 # 6: gap opens
353 # 7: gaps
354 # 8: q. start
355 # 9: q. end
356 # 10: s. start
357 # 11: s. end
358 # 12: evalue
359 # 13: bit score
360 # 14: query seq
361 # 15: subject seq
362 # 16: btop
363
364 # If V call exists, parse V alignment information
365 seq_vdj = ''
366 if v_call is not None:
367 v_align = hit_df[hit_df[0] == 'V'].iloc[0]
368 # Germline positions
369 db_gen['V_GERM_START_VDJ'] = int(v_align[10])
370 db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
371 # Query sequence positions
372 db_gen['V_SEQ_START'] = int(v_align[8])
373 db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
374
375 if int(v_align[6]) == 0:
376 db_gen['INDELS'] = 'F'
377 else:
378 db_gen['INDELS'] = 'T'
379 # Set functional to none so record gets tossed (junction will be wrong)
380 # db_gen['FUNCTIONAL'] = None
381
382 # V alignment scores
383 if score_fields:
384 try: db_gen['V_SCORE'] = float(v_align[13])
385 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
386
387 try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
388 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
389
390 try: db_gen['V_EVALUE'] = float(v_align[12])
391 except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
392
393 try: db_gen['V_BTOP'] = v_align[16]
394 except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
395
396 # Update VDJ sequence, removing insertions
397 start = 0
398 for m in re.finditer(r'-', v_align[15]):
399 ins = m.start()
400 seq_vdj += v_align[14][start:ins]
401 start = ins + 1
402 seq_vdj += v_align[14][start:]
403
404 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
405 # If D call exists, parse D alignment information
406 if d_call is not None:
407 d_align = hit_df[hit_df[0] == 'D'].iloc[0]
408
409 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
410 # Determine N-region length and amount of J overlap with V or D alignment
411 overlap = 0
412 if v_call is not None:
413 n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
414 if n1_len < 0:
415 db_gen['N1_LENGTH'] = 0
416 overlap = abs(n1_len)
417 else:
418 db_gen['N1_LENGTH'] = n1_len
419 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
420 n1_end = int(d_align[8])-1
421 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
422
423 # Query sequence positions
424 db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
425 db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
426
427 # Germline positions
428 db_gen['D_GERM_START'] = int(d_align[10]) + overlap
429 db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
430
431 # Update VDJ sequence, removing insertions
432 start = overlap
433 for m in re.finditer(r'-', d_align[15]):
434 ins = m.start()
435 seq_vdj += d_align[14][start:ins]
436 start = ins + 1
437 seq_vdj += d_align[14][start:]
438
439 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
440 # If J call exists, parse J alignment information
441 if j_call is not None:
442 j_align = hit_df[hit_df[0] == 'J'].iloc[0]
443
444 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
445 # Determine N-region length and amount of J overlap with V or D alignment
446 overlap = 0
447 if d_call is not None:
448 n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
449 if n2_len < 0:
450 db_gen['N2_LENGTH'] = 0
451 overlap = abs(n2_len)
452 else:
453 db_gen['N2_LENGTH'] = n2_len
454 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
455 n2_end = int(j_align[8])-1
456 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
457 elif v_call is not None:
458 n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
459 if n1_len < 0:
460 db_gen['N1_LENGTH'] = 0
461 overlap = abs(n1_len)
462 else:
463 db_gen['N1_LENGTH'] = n1_len
464 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
465 n1_end = int(j_align[8])-1
466 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
467 else:
468 db_gen['N1_LENGTH'] = 0
469
470 # Query positions
471 db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
472 db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
473
474 # Germline positions
475 db_gen['J_GERM_START'] = int(j_align[10]) + overlap
476 db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
477
478 # J alignment scores
479 if score_fields:
480 try: db_gen['J_SCORE'] = float(j_align[13])
481 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
482
483 try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
484 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
485
486 try: db_gen['J_EVALUE'] = float(j_align[12])
487 except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
488
489 try: db_gen['J_BTOP'] = j_align[16]
490 except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
491
492 # Update VDJ sequence, removing insertions
493 start = overlap
494 for m in re.finditer(r'-', j_align[15]):
495 ins = m.start()
496 seq_vdj += j_align[14][start:ins]
497 start = ins + 1
498 seq_vdj += j_align[14][start:]
499
500 db_gen['SEQUENCE_VDJ'] = seq_vdj
501
502 # Create IMGT-gapped sequence and infer IMGT junction
503 if v_call is not None:
504 db_gen = gapV(db_gen, repo_dict)
505 if j_call is not None:
506 db_gen = getIMGTJunc(db_gen, repo_dict)
507
508 # FWR and CDR regions
509 if region_fields: getRegions(db_gen)
510
511 yield IgRecord(db_gen)
512
513
514 # TODO: should be more readable
515 def readIMGT(imgt_files, score_fields=False, region_fields=False):
516 """
517 Reads IMGT/HighV-Quest output
518
519 Arguments:
520 imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
521 score_fields = if True parse alignment scores
522 region_fields = if True add FWR and CDR region fields
523
524 Returns:
525 a generator of dictionaries containing alignment data
526 """
527 imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
528 # Create a dictionary for each sequence alignment and yield its generator
529 for sm, gp, nt, jn in zip(*imgt_iters):
530 if len(set([sm['Sequence ID'],
531 gp['Sequence ID'],
532 nt['Sequence ID'],
533 jn['Sequence ID']])) != 1:
534 sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
535 % sm['Sequence ID'])
536
537 db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
538 'SEQUENCE_INPUT': sm['Sequence']}
539
540 if 'No results' not in sm['Functionality']:
541 db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
542 ('unprod' in sm['Functionality'])]
543 db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
544 ('out-of-frame' in sm['JUNCTION frame'])],
545 db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
546 ('unprod' in sm['Functionality'])]
547 db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
548 'missing' in sm['V-REGION potential ins/del']))) +
549 ('unprod' in sm['Functionality'])]
550 db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
551 sm['V-REGION insertions'],
552 sm['V-REGION deletions']))]
553
554 db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
555 db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
556
557 db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
558 db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
559 db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
560
561 v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
562 db_gen['V_SEQ_START'] = nt['V-REGION start']
563 db_gen['V_SEQ_LENGTH'] = v_seq_length
564 db_gen['V_GERM_START_IMGT'] = 1
565 db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
566
567 db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
568 jn['N-REGION-nt nb'],
569 jn['N1-REGION-nt nb'],
570 jn["P5'D-nt nb"]] if i)
571 db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
572 jn["P3'V-nt nb"],
573 jn['N-REGION-nt nb'],
574 jn['N1-REGION-nt nb'],
575 jn["P5'D-nt nb"]] if i)
576 db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
577 db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
578 db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
579 db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
580 jn['N2-REGION-nt nb'],
581 jn["P5'J-nt nb"]] if i)
582
583 db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
584 jn["P3'V-nt nb"],
585 jn['N-REGION-nt nb'],
586 jn['N1-REGION-nt nb'],
587 jn["P5'D-nt nb"],
588 jn["D-REGION-nt nb"],
589 jn["P3'D-nt nb"],
590 jn['N2-REGION-nt nb'],
591 jn["P5'J-nt nb"]] if i)
592 db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
593 db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
594 db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
595
596 db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
597 db_gen['JUNCTION'] = jn['JUNCTION']
598
599 # Alignment scores
600 if score_fields:
601 try: db_gen['V_SCORE'] = float(sm['V-REGION score'])
602 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
603
604 try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
605 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
606
607 try: db_gen['J_SCORE'] = float(sm['J-REGION score'])
608 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
609
610 try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
611 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
612
613 # FWR and CDR regions
614 if region_fields: getRegions(db_gen)
615 else:
616 db_gen['V_CALL'] = 'None'
617 db_gen['D_CALL'] = 'None'
618 db_gen['J_CALL'] = 'None'
619
620 yield IgRecord(db_gen)
621
622
623 def getIDforIMGT(seq_file):
624 """
625 Create a sequence ID translation using IMGT truncation
626
627 Arguments:
628 seq_file = a fasta file of sequences input to IMGT
629
630 Returns:
631 a dictionary of {truncated ID: full seq description}
632 """
633
634 # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
635 ids = {}
636 for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
637 if len(rec.description) <= 50:
638 id_key = rec.description
639 else:
640 id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50])
641 ids.update({id_key:rec.description})
642
643 return ids
644
645
646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
647 score_fields=False, region_fields=False, out_args=default_out_args):
648 """
649 Writes tab-delimited database file in output directory
650
651 Arguments:
652 db_gen = a generator of IgRecord objects containing alignment data
653 file_prefix = directory and prefix for CLIP tab-delim file
654 total_count = number of records (for progress bar)
655 id_dict = a dictionary of {IMGT ID: full seq description}
656 no_parse = if ID is to be parsed for pRESTO output with default delimiters
657 score_fields = if True add alignment score fields to output file
658 region_fields = if True add FWR and CDR region fields to output file
659 out_args = common output argument dictionary from parseCommonArgs
660
661 Returns:
662 None
663 """
664 pass_file = "%s_db-pass.tab" % file_prefix
665 fail_file = "%s_db-fail.tab" % file_prefix
666 ordered_fields = ['SEQUENCE_ID',
667 'SEQUENCE_INPUT',
668 'FUNCTIONAL',
669 'IN_FRAME',
670 'STOP',
671 'MUTATED_INVARIANT',
672 'INDELS',
673 'V_CALL',
674 'D_CALL',
675 'J_CALL',
676 'SEQUENCE_VDJ',
677 'SEQUENCE_IMGT',
678 'V_SEQ_START',
679 'V_SEQ_LENGTH',
680 'V_GERM_START_VDJ',
681 'V_GERM_LENGTH_VDJ',
682 'V_GERM_START_IMGT',
683 'V_GERM_LENGTH_IMGT',
684 'N1_LENGTH',
685 'D_SEQ_START',
686 'D_SEQ_LENGTH',
687 'D_GERM_START',
688 'D_GERM_LENGTH',
689 'N2_LENGTH',
690 'J_SEQ_START',
691 'J_SEQ_LENGTH',
692 'J_GERM_START',
693 'J_GERM_LENGTH',
694 'JUNCTION_LENGTH',
695 'JUNCTION']
696
697 if score_fields:
698 ordered_fields.extend(['V_SCORE',
699 'V_IDENTITY',
700 'V_EVALUE',
701 'V_BTOP',
702 'J_SCORE',
703 'J_IDENTITY',
704 'J_EVALUE',
705 'J_BTOP'])
706
707 if region_fields:
708 ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
709 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
710
711
712 # TODO: This is not the best approach. should pass in output fields.
713 # Initiate passed handle
714 pass_handle = None 113 pass_handle = None
715 114 fail_handle = None
716 # Open failed file
717 if out_args['failed']:
718 fail_handle = open(fail_file, 'wt')
719 fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
720 else:
721 fail_handle = None
722 fail_writer = None
723
724 # Initialize counters and file
725 pass_writer = None 115 pass_writer = None
116 fail_writer = None
726 start_time = time() 117 start_time = time()
727 rec_count = pass_count = fail_count = 0 118 rec_count = pass_count = fail_count = 0
728 for record in db_gen: 119
729 #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) 120 # Validate and write output
730 printProgress(rec_count, total_count, 0.05, start_time) 121 printProgress(0, total_count, 0.05, start_time)
731 rec_count += 1 122 for i, record in enumerate(db, start=1):
732 123
733 # Count pass or fail 124 # Replace sequence description with full string, if required
734 if (record.v_call == 'None' and record.j_call == 'None') or \ 125 if id_dict is not None and record.id in id_dict:
735 record.functional is None or \
736 not record.seq_vdj or \
737 not record.junction:
738 # print(record.v_call, record.j_call, record.functional, record.junction)
739 fail_count += 1
740 if fail_writer is not None: fail_writer.writerow(record.toDict())
741 continue
742 else:
743 pass_count += 1
744
745 # Build sample sequence description
746 if record.id in id_dict:
747 record.id = id_dict[record.id] 126 record.id = id_dict[record.id]
748 127
749 # Parse sequence description into new columns 128 # Parse sequence description into new columns
750 if not no_parse: 129 if not no_parse:
751 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) 130 try:
752 record.id = record.annotations['ID'] 131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
753 del record.annotations['ID'] 132 record.id = record.annotations['ID']
754 133 del record.annotations['ID']
755 # TODO: This is not the best approach. should pass in output fields. 134
756 # If first sequence, use parsed description to create new columns and initialize writer 135 # TODO: This is not the best approach. should pass in output fields.
757 if pass_writer is None: 136 # If first record, use parsed description to define extra columns
758 if not no_parse: ordered_fields.extend(list(record.annotations.keys())) 137 if i == 1: fields.extend(list(record.annotations.keys()))
759 pass_handle = open(pass_file, 'wt') 138 except IndexError:
760 pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) 139 # Could not parse pRESTO-style annotations so fall back to no parse
761 140 no_parse = True
762 # Write row to tab-delim CLIP file 141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
763 pass_writer.writerow(record.toDict()) 142
764 143 # Count pass or fail and write to appropriate file
765 # Print log 144 if _pass(record):
766 #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) 145 # Open pass file
767 printProgress(rec_count, total_count, 0.05, start_time) 146 if pass_writer is None:
768 147 pass_handle = open(pass_file, 'wt')
148 pass_writer = getDbWriter(pass_handle, add_fields=fields)
149
150 # Write row to pass file
151 pass_count += 1
152 pass_writer.writerow(record.toDict())
153 else:
154 # Open failed file
155 if out_args['failed'] and fail_writer is None:
156 fail_handle = open(fail_file, 'wt')
157 fail_writer = getDbWriter(fail_handle, add_fields=fields)
158
159 # Write row to fail file if specified
160 fail_count += 1
161 if fail_writer is not None:
162 fail_writer.writerow(record.toDict())
163
164 # Print progress
165 printProgress(i, total_count, 0.05, start_time)
166
167 # Print consol log
769 log = OrderedDict() 168 log = OrderedDict()
770 log['OUTPUT'] = pass_file 169 log['OUTPUT'] = pass_file
771 log['PASS'] = pass_count 170 log['PASS'] = pass_count
772 log['FAIL'] = fail_count 171 log['FAIL'] = fail_count
773 log['END'] = 'MakeDb' 172 log['END'] = 'MakeDb'
775 174
776 if pass_handle is not None: pass_handle.close() 175 if pass_handle is not None: pass_handle.close()
777 if fail_handle is not None: fail_handle.close() 176 if fail_handle is not None: fail_handle.close()
778 177
779 178
780 # TODO: may be able to merge with parseIMGT 179 # TODO: may be able to merge with other mains
781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, 180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
782 region_fields=False, out_args=default_out_args): 181 parse_scores=False, parse_regions=False, parse_junction=False,
783 """ 182 out_args=default_out_args):
784 Main for IgBlast aligned sample sequences 183 """
184 Main for IMGT aligned sample sequences.
785 185
786 Arguments: 186 Arguments:
787 igblast_output = IgBlast output file to process 187 aligner_output : zipped file or unzipped folder output by IMGT.
788 seq_file = fasta file input to IgBlast (from which to get sequence) 188 seq_file : FASTA file input to IMGT (from which to get seqID).
789 repo = folder with germline repertoire files 189 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
790 no_parse = if ID is to be parsed for pRESTO output with default delimiters 190 partial : If True put incomplete alignments in the pass file.
791 score_fields = if True add alignment score fields to output file 191 parse_scores : if True add alignment score fields to output file.
792 region_fields = if True add FWR and CDR region fields to output file 192 parse_regions : if True add FWR and CDR region fields to output file.
793 out_args = common output argument dictionary from parseCommonArgs 193 out_args : common output argument dictionary from parseCommonArgs.
794 194
795 Returns: 195 Returns:
796 None 196 None
197 """
198 # Print parameter info
199 log = OrderedDict()
200 log['START'] = 'MakeDb'
201 log['ALIGNER'] = 'IMGT'
202 log['ALIGNER_OUTPUT'] = aligner_output
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
204 log['NO_PARSE'] = no_parse
205 log['PARTIAL'] = partial
206 log['SCORES'] = parse_scores
207 log['REGIONS'] = parse_regions
208 log['JUNCTION'] = parse_junction
209 printLog(log)
210
211 start_time = time()
212 printMessage('Loading sequence files', start_time=start_time, width=25)
213 # Extract IMGT files
214 temp_dir, imgt_files = extractIMGT(aligner_output)
215 # Count records in IMGT files
216 total_count = countDbFile(imgt_files['summary'])
217 # Get (parsed) IDs from fasta file submitted to IMGT
218 id_dict = getIDforIMGT(seq_file) if seq_file else {}
219 printMessage('Done', start_time=start_time, end=True, width=25)
220
221 # Parse IMGT output and write db
222 with open(imgt_files['summary'], 'r') as summary_handle, \
223 open(imgt_files['gapped'], 'r') as gapped_handle, \
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
225 open(imgt_files['junction'], 'r') as junction_handle:
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
227 parse_scores=parse_scores, parse_regions=parse_regions,
228 parse_junction=parse_junction)
229 file_prefix = getFilePrefix(aligner_output, out_args)
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
231 no_parse=no_parse, partial=partial, out_args=out_args)
232
233 # Cleanup temp directory
234 temp_dir.cleanup()
235
236 return None
237
238
239 # TODO: may be able to merge with other mains
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
242 out_args=default_out_args):
243 """
244 Main for IgBLAST aligned sample sequences.
245
246 Arguments:
247 aligner_output : IgBLAST output file to process.
248 seq_file : fasta file input to IgBlast (from which to get sequence).
249 repo : folder with germline repertoire files.
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
251 partial : If True put incomplete alignments in the pass file.
252 parse_regions : if True add FWR and CDR fields to output file.
253 parse_scores : if True add alignment score fields to output file.
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
255 out_args : common output argument dictionary from parseCommonArgs.
256
257 Returns:
258 None
797 """ 259 """
798 # Print parameter info 260 # Print parameter info
799 log = OrderedDict() 261 log = OrderedDict()
800 log['START'] = 'MakeDB' 262 log['START'] = 'MakeDB'
801 log['ALIGNER'] = 'IgBlast' 263 log['ALIGNER'] = 'IgBlast'
802 log['ALIGN_RESULTS'] = os.path.basename(igblast_output) 264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
803 log['SEQ_FILE'] = os.path.basename(seq_file) 265 log['SEQ_FILE'] = os.path.basename(seq_file)
804 log['NO_PARSE'] = no_parse 266 log['NO_PARSE'] = no_parse
805 log['SCORE_FIELDS'] = score_fields 267 log['PARTIAL'] = partial
806 log['REGION_FIELDS'] = region_fields 268 log['SCORES'] = parse_scores
269 log['REGIONS'] = parse_regions
807 printLog(log) 270 printLog(log)
808 271
272 start_time = time()
273 printMessage('Loading sequence files', start_time=start_time, width=25)
274 # Count records in sequence file
275 total_count = countSeqFile(seq_file)
809 # Get input sequence dictionary 276 # Get input sequence dictionary
810 seq_dict = getSeqforIgBlast(seq_file) 277 seq_dict = getSeqDict(seq_file)
811 278 # Create germline repo dictionary
812 # Formalize out_dir and file-prefix 279 repo_dict = readRepo(repo)
813 if not out_args['out_dir']: 280 printMessage('Done', start_time=start_time, end=True, width=25)
814 out_dir = os.path.split(igblast_output)[0] 281
815 else: 282 # Parse and write output
816 out_dir = os.path.abspath(out_args['out_dir']) 283 with open(aligner_output, 'r') as f:
817 if not os.path.exists(out_dir): os.mkdir(out_dir) 284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
818 if out_args['out_name']: 285 parse_scores=parse_scores, parse_regions=parse_regions,
819 file_prefix = out_args['out_name'] 286 parse_igblast_cdr3=parse_igblast_cdr3)
820 else: 287 file_prefix = getFilePrefix(aligner_output, out_args)
821 file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) 288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
822 file_prefix = os.path.join(out_dir, file_prefix) 289 no_parse=no_parse, partial=partial, out_args=out_args)
823 290
824 total_count = countSeqFile(seq_file) 291 return None
825 292
826 # Create 293
827 repo_dict = getRepo(repo) 294 # TODO: may be able to merge with other mains
828 igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, 295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
829 score_fields=score_fields, region_fields=region_fields) 296 parse_scores=False, parse_regions=False, out_args=default_out_args):
830 writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, 297 """
831 score_fields=score_fields, region_fields=region_fields, out_args=out_args) 298 Main for iHMMuneAlign aligned sample sequences.
832
833
834 # TODO: may be able to merge with parseIgBlast
835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
836 region_fields=False, out_args=default_out_args):
837 """
838 Main for IMGT aligned sample sequences
839 299
840 Arguments: 300 Arguments:
841 imgt_output = zipped file or unzipped folder output by IMGT 301 aligner_output : iHMMune-Align output file to process.
842 seq_file = FASTA file input to IMGT (from which to get seqID) 302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
843 no_parse = if ID is to be parsed for pRESTO output with default delimiters 303 repo : folder with germline repertoire files.
844 score_fields = if True add alignment score fields to output file 304 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
845 region_fields = if True add FWR and CDR region fields to output file 305 partial : If True put incomplete alignments in the pass file.
846 out_args = common output argument dictionary from parseCommonArgs 306 parse_scores : if True parse alignment scores.
847 307 parse_regions : if True add FWR and CDR region fields.
848 Returns: 308 out_args : common output argument dictionary from parseCommonArgs.
849 None 309
310 Returns:
311 None
850 """ 312 """
851 # Print parameter info 313 # Print parameter info
852 log = OrderedDict() 314 log = OrderedDict()
853 log['START'] = 'MakeDb' 315 log['START'] = 'MakeDB'
854 log['ALIGNER'] = 'IMGT' 316 log['ALIGNER'] = 'iHMMune-Align'
855 log['ALIGN_RESULTS'] = imgt_output 317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
856 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' 318 log['SEQ_FILE'] = os.path.basename(seq_file)
857 log['NO_PARSE'] = no_parse 319 log['NO_PARSE'] = no_parse
858 log['SCORE_FIELDS'] = score_fields 320 log['PARTIAL'] = partial
859 log['REGION_FIELDS'] = region_fields 321 log['SCORES'] = parse_scores
322 log['REGIONS'] = parse_regions
860 printLog(log) 323 printLog(log)
861 324
862 # Get individual IMGT result files 325 start_time = time()
863 temp_dir, imgt_files = extractIMGT(imgt_output) 326 printMessage('Loading sequence files', start_time=start_time, width=25)
864 327 # Count records in sequence file
865 # Formalize out_dir and file-prefix 328 total_count = countSeqFile(seq_file)
866 if not out_args['out_dir']: 329 # Get input sequence dictionary
867 out_dir = os.path.dirname(os.path.abspath(imgt_output)) 330 seq_dict = getSeqDict(seq_file)
868 else: 331 # Create germline repo dictionary
869 out_dir = os.path.abspath(out_args['out_dir']) 332 repo_dict = readRepo(repo)
870 if not os.path.exists(out_dir): os.mkdir(out_dir) 333 printMessage('Done', start_time=start_time, end=True, width=25)
871 if out_args['out_name']: 334
872 file_prefix = out_args['out_name'] 335 # Parse and write output
873 else: 336 with open(aligner_output, 'r') as f:
874 file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] 337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
875 file_prefix = os.path.join(out_dir, file_prefix) 338 parse_scores=parse_scores, parse_regions=parse_regions)
876 339 file_prefix = getFilePrefix(aligner_output, out_args)
877 total_count = countDbFile(imgt_files[0]) 340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
878 341 no_parse=no_parse, partial=partial, out_args=out_args)
879 # Get (parsed) IDs from fasta file submitted to IMGT 342
880 id_dict = getIDforIMGT(seq_file) if seq_file else {} 343 return None
881
882 # Create
883 imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
884 region_fields=region_fields)
885 writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
886 score_fields=score_fields, region_fields=region_fields, out_args=out_args)
887
888 # Delete temp directory
889 rmtree(temp_dir)
890 344
891 345
892 def getArgParser(): 346 def getArgParser():
893 """ 347 """
894 Defines the ArgumentParser 348 Defines the ArgumentParser.
895 349
896 Arguments:
897 None
898
899 Returns: 350 Returns:
900 an ArgumentParser object 351 argparse.ArgumentParser
901 """ 352 """
902 fields = dedent( 353 fields = dedent(
903 ''' 354 '''
904 output files: 355 output files:
905 db-pass 356 db-pass
906 database of parsed alignment records. 357 database of alignment records with functionality information,
358 V and J calls, and a junction region.
907 db-fail 359 db-fail
908 database with records failing alignment. 360 database with records that fail due to no functionality information
909 361 (did not pass IMGT), no V call, no J call, or no junction region.
910 output fields: 362
911 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, 363 universal output fields:
912 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, 364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
913 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, 365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
914 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, 366 V_CALL, D_CALL, J_CALL,
915 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, 367 V_SEQ_START, V_SEQ_LENGTH,
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
916 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, 369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
917 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, 370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
918 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, 371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
919 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT 372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
373
374 imgt specific output fields:
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
378
379 igblast specific output fields:
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
384
385 ihmm specific output fields:
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
387 HMM_SCORE
920 ''') 388 ''')
921 389
922 # Define ArgumentParser 390 # Define ArgumentParser
923 parser = ArgumentParser(description=__doc__, epilog=fields, 391 parser = ArgumentParser(description=__doc__, epilog=fields,
924 formatter_class=CommonHelpFormatter) 392 formatter_class=CommonHelpFormatter)
931 399
932 # Parent parser 400 # Parent parser
933 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) 401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
934 402
935 # IgBlast Aligner 403 # IgBlast Aligner
936 parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', 404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
937 parents=[parser_parent], 405 formatter_class=CommonHelpFormatter,
938 formatter_class=CommonHelpFormatter) 406 help='Process IgBLAST output.',
939 parser_igblast.set_defaults(func=parseIgBlast) 407 description='Process IgBLAST output.')
940 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', 408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
941 required=True, 409 required=True,
942 help='''IgBLAST output files in format 7 with query sequence 410 help='''IgBLAST output files in format 7 with query sequence
943 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') 411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
944 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, 412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
945 help='''List of folders and/or fasta files containing 413 help='''List of folders and/or fasta files containing
946 IMGT-gapped germline sequences corresponding to the 414 IMGT-gapped germline sequences corresponding to the
947 set of germlines used in the IgBLAST alignment.''') 415 set of germlines used in the IgBLAST alignment.''')
948 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', 416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
949 required=True, 417 required=True,
950 help='List of input FASTA files containing sequences') 418 help='''List of input FASTA files (with .fasta, .fna or .fa
419 extension), containing sequences.''')
951 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', 420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
952 help='''Specify if input IDs should not be parsed to add 421 help='''Specify to prevent input sequence headers from being parsed
953 new columns to database.''') 422 to add new columns to database. Parsing of sequence headers requires
954 parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', 423 headers to be in the pRESTO annotation format, so this should be specified
424 when sequence headers are incompatible with the pRESTO annotation scheme.
425 Note, unrecognized header formats will default to this behavior.''')
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial',
427 help='''If specified, include incomplete V(D)J alignments in
428 the pass file instead of the fail file.''')
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
955 help='''Specify if alignment score metrics should be 430 help='''Specify if alignment score metrics should be
956 included in the output. Adds the V_SCORE, V_IDENTITY, 431 included in the output. Adds the V_SCORE, V_IDENTITY,
957 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, 432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
958 J_BTOP, and J_EVALUE columns.''') 433 J_BTOP, and J_EVALUE columns.''')
959 parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', 434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
960 help='''Specify if IMGT framework and CDR regions should be 435 help='''Specify if IMGT FWR and CDRs should be
961 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, 436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
962 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and 437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
963 CDR3_IMGT columns.''') 438 CDR3_IMGT columns.''')
964 439 parser_igblast.add_argument('--cdr3', action='store_true',
440 dest='parse_igblast_cdr3',
441 help='''Specify if the CDR3 sequences generated by IgBLAST
442 should be included in the output. Adds the columns
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
444 version 1.5 or greater.''')
445 parser_igblast.set_defaults(func=parseIgBLAST)
446
965 # IMGT aligner 447 # IMGT aligner
966 parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', 448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
967 parents=[parser_parent], 449 formatter_class=CommonHelpFormatter,
968 formatter_class=CommonHelpFormatter) 450 help='''Process IMGT/HighV-Quest output
969 imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True) 451 (does not work with V-QUEST).''',
970 imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', 452 description='''Process IMGT/HighV-Quest output
971 help='''Either zipped IMGT output files (.zip) or a folder 453 (does not work with V-QUEST).''')
972 containing unzipped IMGT output files (which must 454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
973 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, 455 help='''Either zipped IMGT output files (.zip or .txz) or a
974 and 6_Junction).''') 456 folder containing unzipped IMGT output files (which must
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
458 and 6_Junction).''')
975 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', 459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
976 required=False, 460 required=False,
977 help='List of input FASTA files containing sequences') 461 help='''List of input FASTA files (with .fasta, .fna or .fa
462 extension) containing sequences.''')
978 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', 463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
979 help='''Specify if input IDs should not be parsed to add new 464 help='''Specify to prevent input sequence headers from being parsed
980 columns to database.''') 465 to add new columns to database. Parsing of sequence headers requires
981 parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', 466 headers to be in the pRESTO annotation format, so this should be specified
467 when sequence headers are incompatible with the pRESTO annotation scheme.
468 Note, unrecognized header formats will default to this behavior.''')
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial',
470 help='''If specified, include incomplete V(D)J alignments in
471 the pass file instead of the fail file.''')
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
982 help='''Specify if alignment score metrics should be 473 help='''Specify if alignment score metrics should be
983 included in the output. Adds the V_SCORE, V_IDENTITY, 474 included in the output. Adds the V_SCORE, V_IDENTITY,
984 J_SCORE and J_IDENTITY. Note, this will also add 475 J_SCORE and J_IDENTITY.''')
985 the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP, 476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
986 but they will be empty for IMGT output.''') 477 help='''Specify if IMGT FWRs and CDRs should be
987 parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
988 help='''Specify if IMGT framework and CDR regions should be
989 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, 478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
990 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and 479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
991 CDR3_IMGT columns.''') 480 CDR3_IMGT columns.''')
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
482 help='''Specify if detailed junction fields should be
483 included in the output. Adds the columns
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
485 P5J_LENGTH, D_FRAME.''')
992 parser_imgt.set_defaults(func=parseIMGT) 486 parser_imgt.set_defaults(func=parseIMGT)
487
488 # iHMMuneAlign Aligner
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
490 formatter_class=CommonHelpFormatter,
491 help='Process iHMMune-Align output.',
492 description='Process iHMMune-Align output.')
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
494 required=True,
495 help='''iHMMune-Align output file.''')
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
497 help='''List of folders and/or FASTA files containing
498 IMGT-gapped germline sequences corresponding to the
499 set of germlines used in the IgBLAST alignment.''')
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
501 required=True,
502 help='''List of input FASTA files (with .fasta, .fna or .fa
503 extension) containing sequences.''')
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
505 help='''Specify to prevent input sequence headers from being parsed
506 to add new columns to database. Parsing of sequence headers requires
507 headers to be in the pRESTO annotation format, so this should be specified
508 when sequence headers are incompatible with the pRESTO annotation scheme.
509 Note, unrecognized header formats will default to this behavior.''')
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
511 help='''If specified, include incomplete V(D)J alignments in
512 the pass file instead of the fail file.''')
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
514 help='''Specify if alignment score metrics should be
515 included in the output. Adds the path score of the
516 iHMMune-Align hidden Markov model to HMM_SCORE.''')
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
518 help='''Specify if IMGT FWRs and CDRs should be
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
521 CDR3_IMGT columns.''')
522 parser_ihmm.set_defaults(func=parseIHMM)
993 523
994 return parser 524 return parser
995 525
996 526
997 if __name__ == "__main__": 527 if __name__ == "__main__":
998 """ 528 """
999 Parses command line arguments and calls main 529 Parses command line arguments and calls main
1000 """ 530 """
1001 parser = getArgParser() 531 parser = getArgParser()
1002 args = parser.parse_args() 532 args = parser.parse_args()
1003 args_dict = parseCommonArgs(args, in_arg='aligner_files') 533 args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
1004 534
1005 # Set no ID parsing if sequence files are not provided 535 # Set no ID parsing if sequence files are not provided
1006 if 'seq_files' in args_dict and not args_dict['seq_files']: 536 if 'seq_files' in args_dict and not args_dict['seq_files']:
1007 args_dict['no_parse'] = True 537 args_dict['no_parse'] = True
1008 538
1009 # Delete 539 # Delete
1010 if 'seq_files' in args_dict: del args_dict['seq_files'] 540 if 'seq_files' in args_dict: del args_dict['seq_files']
1011 if 'aligner_files' in args_dict: del args_dict['aligner_files'] 541 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
1012 if 'command' in args_dict: del args_dict['command'] 542 if 'command' in args_dict: del args_dict['command']
1013 if 'func' in args_dict: del args_dict['func'] 543 if 'func' in args_dict: del args_dict['func']
1014 544
1015 if args.command == 'imgt': 545 if args.command == 'imgt':
1016 for i in range(len(args.__dict__['aligner_files'])): 546 for i in range(len(args.__dict__['aligner_outputs'])):
1017 args_dict['imgt_output'] = args.__dict__['aligner_files'][i] 547 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
1018 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ 548 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
1019 if args.__dict__['seq_files'] else None 549 if args.__dict__['seq_files'] else None
1020 args.func(**args_dict) 550 args.func(**args_dict)
1021 elif args.command == 'igblast': 551 elif args.command == 'igblast' or args.command == 'ihmm':
1022 for i in range(len(args.__dict__['aligner_files'])): 552 for i in range(len(args.__dict__['aligner_outputs'])):
1023 args_dict['igblast_output'] = args.__dict__['aligner_files'][i] 553 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
1024 args_dict['seq_file'] = args.__dict__['seq_files'][i] 554 args_dict['seq_file'] = args.__dict__['seq_files'][i]
1025 args.func(**args_dict) 555 args.func(**args_dict)