comparison change_o/MakeDb.py @ 0:c33d93683a09 draft

Uploaded
author davidvanzessen
date Thu, 13 Oct 2016 10:52:24 -0400
parents
children 22dddabe3637
comparison
equal deleted inserted replaced
-1:000000000000 0:c33d93683a09
1 #!/usr/bin/env python3
2 """
3 Create tab-delimited database file to store sequence alignment information
4 """
5 # Info
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__
8
9 # Imports
10 import csv
11 import os
12 import re
13 import sys
14 import pandas as pd
15 import tarfile
16 import zipfile
17 from argparse import ArgumentParser
18 from collections import OrderedDict
19 from itertools import groupby
20 from shutil import rmtree
21 from tempfile import mkdtemp
22 from textwrap import dedent
23 from time import time
24 from Bio import SeqIO
25 from Bio.Seq import Seq
26 from Bio.Alphabet import IUPAC
27
28 # Presto and changeo imports
29 from presto.Defaults import default_out_args
30 from presto.Annotation import parseAnnotation
31 from presto.IO import countSeqFile, printLog, printProgress
32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
33 from changeo.IO import getDbWriter, countDbFile, getRepo
34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
35 j_allele_regex
36
37 # Default parameters
38 default_delimiter = ('\t', ',', '-')
39
40
41 def gapV(ig_dict, repo_dict):
42 """
43 Insert gaps into V region and update alignment information
44
45 Arguments:
46 ig_dict : Dictionary of parsed IgBlast output
47 repo_dict : Dictionary of IMGT gapped germline sequences
48
49 Returns:
50 dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
51 """
52
53 seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
54
55 # Find gapped germline V segment
56 vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
57 vkey = (vgene, )
58 #TODO: Figure out else case
59 if vkey in repo_dict:
60 vgap = repo_dict[vkey]
61 # Iterate over gaps in the germline segment
62 gaps = re.finditer(r'\.', vgap)
63 gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
64 for gap in gaps:
65 i = gap.start()
66 # Break if gap begins after V region
67 if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
68 break
69 # Insert gap into IMGT sequence
70 seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
71 # Update gap counter
72 gapcount += 1
73 ig_dict['SEQUENCE_IMGT'] = seq_imgt
74 # Update IMGT positioning information for V
75 ig_dict['V_GERM_START_IMGT'] = 1
76 ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
77
78 return ig_dict
79
80
81 def getIMGTJunc(ig_dict, repo_dict):
82 """
83 Identify junction region by IMGT definition
84
85 Arguments:
86 ig_dict : Dictionary of parsed IgBlast output
87 repo_dict : Dictionary of IMGT gapped germline sequences
88
89 Returns:
90 dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
91 """
92 # Find germline J segment
93 jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
94 jkey = (jgene, )
95 #TODO: Figure out else case
96 if jkey in repo_dict:
97 # Get germline J sequence
98 jgerm = repo_dict[jkey]
99 jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
100 # Look for (F|W)GXG aa motif in nt sequence
101 motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
102 aa_end = len(ig_dict['SEQUENCE_IMGT'])
103 #TODO: Figure out else case
104 if motif:
105 # print('\n', motif.group())
106 aa_end = motif.start() - len(jgerm) + 3
107 # Add fields to dict
108 ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
109 ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
110
111 return ig_dict
112
113
114 def getRegions(ig_dict):
115 """
116 Identify FWR and CDR regions by IMGT definition
117
118 Arguments:
119 ig_dict : Dictionary of parsed alignment output
120
121 Returns:
122 dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
123 CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
124 """
125 try:
126 seq_len = len(ig_dict['SEQUENCE_IMGT'])
127 ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
128 except (KeyError, IndexError):
129 return ig_dict
130
131 try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
132 except (IndexError): return ig_dict
133
134 try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
135 except (IndexError): return ig_dict
136
137 try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
138 except (IndexError): return ig_dict
139
140 try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
141 except (IndexError): return ig_dict
142
143 try:
144 cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
145 ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
146 ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
147 except (KeyError, IndexError):
148 return ig_dict
149
150 return ig_dict
151
152
153 def getSeqforIgBlast(seq_file):
154 """
155 Fetch input sequences for IgBlast queries
156
157 Arguments:
158 seq_file = a fasta file of sequences input to IgBlast
159
160 Returns:
161 a dictionary of {ID:Seq}
162 """
163
164 seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
165
166 # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
167 seqs = {}
168 for seq in seq_dict.values():
169 seqs.update({seq.description:str(seq.seq)})
170
171 return seqs
172
173
174 def findLine(handle, query):
175 """
176 Finds line with query string in file
177
178 Arguments:
179 handle = file handle in which to search for line
180 query = query string for which to search in file
181
182 Returns:
183 line from handle in which query string was found
184 """
185 for line in handle:
186 if(re.match(query, line)):
187 return line
188
189
190 def extractIMGT(imgt_output):
191 """
192 Extract necessary files from IMGT results, zipped or unzipped
193
194 Arguments:
195 imgt_output = zipped file or unzipped folder output by IMGT
196
197 Returns:
198 sorted list of filenames from which information will be read
199 """
200 #file_ext = os.path.splitext(imgt_output)[1].lower()
201 imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
202 temp_dir = mkdtemp()
203 if zipfile.is_zipfile(imgt_output):
204 # Open zip file
205 imgt_zip = zipfile.ZipFile(imgt_output, 'r')
206 # Extract required files
207 imgt_files = sorted([n for n in imgt_zip.namelist() \
208 if os.path.basename(n).startswith(imgt_flags)])
209 imgt_zip.extractall(temp_dir, imgt_files)
210 # Define file list
211 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
212 elif os.path.isdir(imgt_output):
213 # Find required files in folder
214 folder_files = []
215 for root, dirs, files in os.walk(imgt_output):
216 folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
217 # Define file list
218 imgt_files = sorted([n for n in folder_files \
219 if os.path.basename(n).startswith(imgt_flags)])
220 elif tarfile.is_tarfile(imgt_output):
221 # Open zip file
222 imgt_tar = tarfile.open(imgt_output, 'r')
223 # Extract required files
224 imgt_files = sorted([n for n in imgt_tar.getnames() \
225 if os.path.basename(n).startswith(imgt_flags)])
226 imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
227 # Define file list
228 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
229 else:
230 sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
231
232 if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
233 sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
234 elif len(imgt_files) < len(imgt_flags):
235 sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)
236
237 return temp_dir, imgt_files
238
239
240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
241 def readOneIgBlastResult(block):
242 """
243 Parse a single IgBLAST query result
244
245 Arguments:
246 block = itertools groupby object of single result
247
248 Returns:
249 None if no results, otherwise list of DataFrames for each result block
250 """
251 results = list()
252 i = 0
253 for match, subblock in groupby(block, lambda l: l=='\n'):
254 if not match:
255 # Strip whitespace and comments
256 sub = [s.strip() for s in subblock if not s.startswith('#')]
257
258 # Continue on empty block
259 if not sub: continue
260 else: i += 1
261
262 # Split by tabs
263 sub = [s.split('\t') for s in sub]
264
265 # Append list for "V-(D)-J rearrangement summary" (i == 1)
266 # And "V-(D)-J junction details" (i == 2)
267 # Otherwise append DataFrame of subblock
268 if i == 1 or i == 2:
269 results.append(sub[0])
270 else:
271 df = pd.DataFrame(sub)
272 if not df.empty: results.append(df)
273
274 return results if results else None
275
276
277 # TODO: needs more speeds. pandas is probably to blame.
278 def readIgBlast(igblast_output, seq_dict, repo_dict,
279 score_fields=False, region_fields=False):
280 """
281 Reads IgBlast output
282
283 Arguments:
284 igblast_output = IgBlast output file (format 7)
285 seq_dict = a dictionary of {ID:Seq} from input fasta file
286 repo_dict = dictionary of IMGT gapped germline sequences
287 score_fields = if True parse alignment scores
288 region_fields = if True add FWR and CDR region fields
289
290 Returns:
291 a generator of dictionaries containing alignment data
292 """
293
294 # Open IgBlast output file
295 with open(igblast_output) as f:
296 # Iterate over individual results (separated by # IGBLASTN)
297 for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
298 block = list(block)
299 if not k1:
300 # TODO: move query name extraction into block parser readOneIgBlastResult().
301 # Extract sequence ID
302 query_name = ' '.join(block[0].strip().split(' ')[2:])
303 # Initialize db_gen to have ID and input sequence
304 db_gen = {'SEQUENCE_ID': query_name,
305 'SEQUENCE_INPUT': seq_dict[query_name]}
306
307 # Parse further sub-blocks
308 block_list = readOneIgBlastResult(block)
309
310 # TODO: this is indented pretty far. should be a separate function. or several functions.
311 # If results exist, parse further to obtain full db_gen
312 if block_list is not None:
313 # Parse quality information
314 db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
315 db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
316 db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
317 if block_list[0][-1] == '-':
318 db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
319 IUPAC.ambiguous_dna).reverse_complement())
320
321 # Parse V, D, and J calls
322 call_str = ' '.join(block_list[0])
323 v_call = parseAllele(call_str, v_allele_regex, action='list')
324 d_call = parseAllele(call_str, d_allele_regex, action='list')
325 j_call = parseAllele(call_str, j_allele_regex, action='list')
326 db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
327 db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
328 db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
329
330 # Parse junction sequence
331 # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
332 # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
333
334 # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why?
335 # TODO: maybe we should fail these. they look craptastic.
336 #pd.set_option('display.width', 500)
337 #print query_name, len(block_list), hit_idx
338 #for i, x in enumerate(block_list):
339 # print '[%i]' % i
340 # print x
341
342 # Parse segment start and stop positions
343 hit_df = block_list[-1]
344
345 # Alignment info block
346 # 0: segment
347 # 1: query id
348 # 2: subject id
349 # 3: % identity
350 # 4: alignment length
351 # 5: mismatches
352 # 6: gap opens
353 # 7: gaps
354 # 8: q. start
355 # 9: q. end
356 # 10: s. start
357 # 11: s. end
358 # 12: evalue
359 # 13: bit score
360 # 14: query seq
361 # 15: subject seq
362 # 16: btop
363
364 # If V call exists, parse V alignment information
365 seq_vdj = ''
366 if v_call is not None:
367 v_align = hit_df[hit_df[0] == 'V'].iloc[0]
368 # Germline positions
369 db_gen['V_GERM_START_VDJ'] = int(v_align[10])
370 db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
371 # Query sequence positions
372 db_gen['V_SEQ_START'] = int(v_align[8])
373 db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
374
375 if int(v_align[6]) == 0:
376 db_gen['INDELS'] = 'F'
377 else:
378 db_gen['INDELS'] = 'T'
379 # Set functional to none so record gets tossed (junction will be wrong)
380 # db_gen['FUNCTIONAL'] = None
381
382 # V alignment scores
383 if score_fields:
384 try: db_gen['V_SCORE'] = float(v_align[13])
385 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
386
387 try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
388 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
389
390 try: db_gen['V_EVALUE'] = float(v_align[12])
391 except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
392
393 try: db_gen['V_BTOP'] = v_align[16]
394 except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
395
396 # Update VDJ sequence, removing insertions
397 start = 0
398 for m in re.finditer(r'-', v_align[15]):
399 ins = m.start()
400 seq_vdj += v_align[14][start:ins]
401 start = ins + 1
402 seq_vdj += v_align[14][start:]
403
404 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
405 # If D call exists, parse D alignment information
406 if d_call is not None:
407 d_align = hit_df[hit_df[0] == 'D'].iloc[0]
408
409 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
410 # Determine N-region length and amount of J overlap with V or D alignment
411 overlap = 0
412 if v_call is not None:
413 n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
414 if n1_len < 0:
415 db_gen['N1_LENGTH'] = 0
416 overlap = abs(n1_len)
417 else:
418 db_gen['N1_LENGTH'] = n1_len
419 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
420 n1_end = int(d_align[8])-1
421 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
422
423 # Query sequence positions
424 db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
425 db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
426
427 # Germline positions
428 db_gen['D_GERM_START'] = int(d_align[10]) + overlap
429 db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
430
431 # Update VDJ sequence, removing insertions
432 start = overlap
433 for m in re.finditer(r'-', d_align[15]):
434 ins = m.start()
435 seq_vdj += d_align[14][start:ins]
436 start = ins + 1
437 seq_vdj += d_align[14][start:]
438
439 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
440 # If J call exists, parse J alignment information
441 if j_call is not None:
442 j_align = hit_df[hit_df[0] == 'J'].iloc[0]
443
444 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
445 # Determine N-region length and amount of J overlap with V or D alignment
446 overlap = 0
447 if d_call is not None:
448 n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
449 if n2_len < 0:
450 db_gen['N2_LENGTH'] = 0
451 overlap = abs(n2_len)
452 else:
453 db_gen['N2_LENGTH'] = n2_len
454 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
455 n2_end = int(j_align[8])-1
456 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
457 elif v_call is not None:
458 n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
459 if n1_len < 0:
460 db_gen['N1_LENGTH'] = 0
461 overlap = abs(n1_len)
462 else:
463 db_gen['N1_LENGTH'] = n1_len
464 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
465 n1_end = int(j_align[8])-1
466 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
467 else:
468 db_gen['N1_LENGTH'] = 0
469
470 # Query positions
471 db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
472 db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
473
474 # Germline positions
475 db_gen['J_GERM_START'] = int(j_align[10]) + overlap
476 db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
477
478 # J alignment scores
479 if score_fields:
480 try: db_gen['J_SCORE'] = float(j_align[13])
481 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
482
483 try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
484 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
485
486 try: db_gen['J_EVALUE'] = float(j_align[12])
487 except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
488
489 try: db_gen['J_BTOP'] = j_align[16]
490 except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
491
492 # Update VDJ sequence, removing insertions
493 start = overlap
494 for m in re.finditer(r'-', j_align[15]):
495 ins = m.start()
496 seq_vdj += j_align[14][start:ins]
497 start = ins + 1
498 seq_vdj += j_align[14][start:]
499
500 db_gen['SEQUENCE_VDJ'] = seq_vdj
501
502 # Create IMGT-gapped sequence and infer IMGT junction
503 if v_call is not None:
504 db_gen = gapV(db_gen, repo_dict)
505 if j_call is not None:
506 db_gen = getIMGTJunc(db_gen, repo_dict)
507
508 # FWR and CDR regions
509 if region_fields: getRegions(db_gen)
510
511 yield IgRecord(db_gen)
512
513
514 # TODO: should be more readable
515 def readIMGT(imgt_files, score_fields=False, region_fields=False):
516 """
517 Reads IMGT/HighV-Quest output
518
519 Arguments:
520 imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
521 score_fields = if True parse alignment scores
522 region_fields = if True add FWR and CDR region fields
523
524 Returns:
525 a generator of dictionaries containing alignment data
526 """
527 imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
528 # Create a dictionary for each sequence alignment and yield its generator
529 for sm, gp, nt, jn in zip(*imgt_iters):
530 if len(set([sm['Sequence ID'],
531 gp['Sequence ID'],
532 nt['Sequence ID'],
533 jn['Sequence ID']])) != 1:
534 sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
535 % sm['Sequence ID'])
536
537 db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
538 'SEQUENCE_INPUT': sm['Sequence']}
539
540 if 'No results' not in sm['Functionality']:
541 db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
542 ('unprod' in sm['Functionality'])]
543 db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
544 ('out-of-frame' in sm['JUNCTION frame'])],
545 db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
546 ('unprod' in sm['Functionality'])]
547 db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
548 'missing' in sm['V-REGION potential ins/del']))) +
549 ('unprod' in sm['Functionality'])]
550 db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
551 sm['V-REGION insertions'],
552 sm['V-REGION deletions']))]
553
554 db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
555 db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
556
557 db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
558 db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
559 db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
560
561 v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
562 db_gen['V_SEQ_START'] = nt['V-REGION start']
563 db_gen['V_SEQ_LENGTH'] = v_seq_length
564 db_gen['V_GERM_START_IMGT'] = 1
565 db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
566
567 db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
568 jn['N-REGION-nt nb'],
569 jn['N1-REGION-nt nb'],
570 jn["P5'D-nt nb"]] if i)
571 db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
572 jn["P3'V-nt nb"],
573 jn['N-REGION-nt nb'],
574 jn['N1-REGION-nt nb'],
575 jn["P5'D-nt nb"]] if i)
576 db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
577 db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
578 db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
579 db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
580 jn['N2-REGION-nt nb'],
581 jn["P5'J-nt nb"]] if i)
582
583 db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
584 jn["P3'V-nt nb"],
585 jn['N-REGION-nt nb'],
586 jn['N1-REGION-nt nb'],
587 jn["P5'D-nt nb"],
588 jn["D-REGION-nt nb"],
589 jn["P3'D-nt nb"],
590 jn['N2-REGION-nt nb'],
591 jn["P5'J-nt nb"]] if i)
592 db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
593 db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
594 db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
595
596 db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
597 db_gen['JUNCTION'] = jn['JUNCTION']
598
599 # Alignment scores
600 if score_fields:
601 try: db_gen['V_SCORE'] = float(sm['V-REGION score'])
602 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
603
604 try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
605 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
606
607 try: db_gen['J_SCORE'] = float(sm['J-REGION score'])
608 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
609
610 try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
611 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
612
613 # FWR and CDR regions
614 if region_fields: getRegions(db_gen)
615 else:
616 db_gen['V_CALL'] = 'None'
617 db_gen['D_CALL'] = 'None'
618 db_gen['J_CALL'] = 'None'
619
620 yield IgRecord(db_gen)
621
622
623 def getIDforIMGT(seq_file):
624 """
625 Create a sequence ID translation using IMGT truncation
626
627 Arguments:
628 seq_file = a fasta file of sequences input to IMGT
629
630 Returns:
631 a dictionary of {truncated ID: full seq description}
632 """
633
634 # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
635 ids = {}
636 for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
637 if len(rec.description) <= 50:
638 id_key = rec.description
639 else:
640 id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50])
641 ids.update({id_key:rec.description})
642
643 return ids
644
645
646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
647 score_fields=False, region_fields=False, out_args=default_out_args):
648 """
649 Writes tab-delimited database file in output directory
650
651 Arguments:
652 db_gen = a generator of IgRecord objects containing alignment data
653 file_prefix = directory and prefix for CLIP tab-delim file
654 total_count = number of records (for progress bar)
655 id_dict = a dictionary of {IMGT ID: full seq description}
656 no_parse = if ID is to be parsed for pRESTO output with default delimiters
657 score_fields = if True add alignment score fields to output file
658 region_fields = if True add FWR and CDR region fields to output file
659 out_args = common output argument dictionary from parseCommonArgs
660
661 Returns:
662 None
663 """
664 pass_file = "%s_db-pass.tab" % file_prefix
665 fail_file = "%s_db-fail.tab" % file_prefix
666 ordered_fields = ['SEQUENCE_ID',
667 'SEQUENCE_INPUT',
668 'FUNCTIONAL',
669 'IN_FRAME',
670 'STOP',
671 'MUTATED_INVARIANT',
672 'INDELS',
673 'V_CALL',
674 'D_CALL',
675 'J_CALL',
676 'SEQUENCE_VDJ',
677 'SEQUENCE_IMGT',
678 'V_SEQ_START',
679 'V_SEQ_LENGTH',
680 'V_GERM_START_VDJ',
681 'V_GERM_LENGTH_VDJ',
682 'V_GERM_START_IMGT',
683 'V_GERM_LENGTH_IMGT',
684 'N1_LENGTH',
685 'D_SEQ_START',
686 'D_SEQ_LENGTH',
687 'D_GERM_START',
688 'D_GERM_LENGTH',
689 'N2_LENGTH',
690 'J_SEQ_START',
691 'J_SEQ_LENGTH',
692 'J_GERM_START',
693 'J_GERM_LENGTH',
694 'JUNCTION_LENGTH',
695 'JUNCTION']
696
697 if score_fields:
698 ordered_fields.extend(['V_SCORE',
699 'V_IDENTITY',
700 'V_EVALUE',
701 'V_BTOP',
702 'J_SCORE',
703 'J_IDENTITY',
704 'J_EVALUE',
705 'J_BTOP'])
706
707 if region_fields:
708 ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
709 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
710
711
712 # TODO: This is not the best approach. should pass in output fields.
713 # Initiate passed handle
714 pass_handle = None
715
716 # Open failed file
717 if out_args['failed']:
718 fail_handle = open(fail_file, 'wt')
719 fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
720 else:
721 fail_handle = None
722 fail_writer = None
723
724 # Initialize counters and file
725 pass_writer = None
726 start_time = time()
727 rec_count = pass_count = fail_count = 0
728 for record in db_gen:
729 #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
730 printProgress(rec_count, total_count, 0.05, start_time)
731 rec_count += 1
732
733 # Count pass or fail
734 if (record.v_call == 'None' and record.j_call == 'None') or \
735 record.functional is None or \
736 not record.seq_vdj or \
737 not record.junction:
738 # print(record.v_call, record.j_call, record.functional, record.junction)
739 fail_count += 1
740 if fail_writer is not None: fail_writer.writerow(record.toDict())
741 continue
742 else:
743 pass_count += 1
744
745 # Build sample sequence description
746 if record.id in id_dict:
747 record.id = id_dict[record.id]
748
749 # Parse sequence description into new columns
750 if not no_parse:
751 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
752 record.id = record.annotations['ID']
753 del record.annotations['ID']
754
755 # TODO: This is not the best approach. should pass in output fields.
756 # If first sequence, use parsed description to create new columns and initialize writer
757 if pass_writer is None:
758 if not no_parse: ordered_fields.extend(list(record.annotations.keys()))
759 pass_handle = open(pass_file, 'wt')
760 pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
761
762 # Write row to tab-delim CLIP file
763 pass_writer.writerow(record.toDict())
764
765 # Print log
766 #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
767 printProgress(rec_count, total_count, 0.05, start_time)
768
769 log = OrderedDict()
770 log['OUTPUT'] = pass_file
771 log['PASS'] = pass_count
772 log['FAIL'] = fail_count
773 log['END'] = 'MakeDb'
774 printLog(log)
775
776 if pass_handle is not None: pass_handle.close()
777 if fail_handle is not None: fail_handle.close()
778
779
780 # TODO: may be able to merge with parseIMGT
781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
782 region_fields=False, out_args=default_out_args):
783 """
784 Main for IgBlast aligned sample sequences
785
786 Arguments:
787 igblast_output = IgBlast output file to process
788 seq_file = fasta file input to IgBlast (from which to get sequence)
789 repo = folder with germline repertoire files
790 no_parse = if ID is to be parsed for pRESTO output with default delimiters
791 score_fields = if True add alignment score fields to output file
792 region_fields = if True add FWR and CDR region fields to output file
793 out_args = common output argument dictionary from parseCommonArgs
794
795 Returns:
796 None
797 """
798 # Print parameter info
799 log = OrderedDict()
800 log['START'] = 'MakeDB'
801 log['ALIGNER'] = 'IgBlast'
802 log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
803 log['SEQ_FILE'] = os.path.basename(seq_file)
804 log['NO_PARSE'] = no_parse
805 log['SCORE_FIELDS'] = score_fields
806 log['REGION_FIELDS'] = region_fields
807 printLog(log)
808
809 # Get input sequence dictionary
810 seq_dict = getSeqforIgBlast(seq_file)
811
812 # Formalize out_dir and file-prefix
813 if not out_args['out_dir']:
814 out_dir = os.path.split(igblast_output)[0]
815 else:
816 out_dir = os.path.abspath(out_args['out_dir'])
817 if not os.path.exists(out_dir): os.mkdir(out_dir)
818 if out_args['out_name']:
819 file_prefix = out_args['out_name']
820 else:
821 file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
822 file_prefix = os.path.join(out_dir, file_prefix)
823
824 total_count = countSeqFile(seq_file)
825
826 # Create
827 repo_dict = getRepo(repo)
828 igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
829 score_fields=score_fields, region_fields=region_fields)
830 writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
831 score_fields=score_fields, region_fields=region_fields, out_args=out_args)
832
833
834 # TODO: may be able to merge with parseIgBlast
835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
836 region_fields=False, out_args=default_out_args):
837 """
838 Main for IMGT aligned sample sequences
839
840 Arguments:
841 imgt_output = zipped file or unzipped folder output by IMGT
842 seq_file = FASTA file input to IMGT (from which to get seqID)
843 no_parse = if ID is to be parsed for pRESTO output with default delimiters
844 score_fields = if True add alignment score fields to output file
845 region_fields = if True add FWR and CDR region fields to output file
846 out_args = common output argument dictionary from parseCommonArgs
847
848 Returns:
849 None
850 """
851 # Print parameter info
852 log = OrderedDict()
853 log['START'] = 'MakeDb'
854 log['ALIGNER'] = 'IMGT'
855 log['ALIGN_RESULTS'] = imgt_output
856 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
857 log['NO_PARSE'] = no_parse
858 log['SCORE_FIELDS'] = score_fields
859 log['REGION_FIELDS'] = region_fields
860 printLog(log)
861
862 # Get individual IMGT result files
863 temp_dir, imgt_files = extractIMGT(imgt_output)
864
865 # Formalize out_dir and file-prefix
866 if not out_args['out_dir']:
867 out_dir = os.path.dirname(os.path.abspath(imgt_output))
868 else:
869 out_dir = os.path.abspath(out_args['out_dir'])
870 if not os.path.exists(out_dir): os.mkdir(out_dir)
871 if out_args['out_name']:
872 file_prefix = out_args['out_name']
873 else:
874 file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
875 file_prefix = os.path.join(out_dir, file_prefix)
876
877 total_count = countDbFile(imgt_files[0])
878
879 # Get (parsed) IDs from fasta file submitted to IMGT
880 id_dict = getIDforIMGT(seq_file) if seq_file else {}
881
882 # Create
883 imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
884 region_fields=region_fields)
885 writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
886 score_fields=score_fields, region_fields=region_fields, out_args=out_args)
887
888 # Delete temp directory
889 rmtree(temp_dir)
890
891
892 def getArgParser():
893 """
894 Defines the ArgumentParser
895
896 Arguments:
897 None
898
899 Returns:
900 an ArgumentParser object
901 """
902 fields = dedent(
903 '''
904 output files:
905 db-pass
906 database of parsed alignment records.
907 db-fail
908 database with records failing alignment.
909
910 output fields:
911 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
912 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
913 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
914 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
915 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
916 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
917 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
918 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
919 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
920 ''')
921
922 # Define ArgumentParser
923 parser = ArgumentParser(description=__doc__, epilog=fields,
924 formatter_class=CommonHelpFormatter)
925 parser.add_argument('--version', action='version',
926 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
927 subparsers = parser.add_subparsers(title='subcommands', dest='command',
928 help='Aligner used', metavar='')
929 # TODO: This is a temporary fix for Python issue 9253
930 subparsers.required = True
931
932 # Parent parser
933 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
934
935 # IgBlast Aligner
936 parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
937 parents=[parser_parent],
938 formatter_class=CommonHelpFormatter)
939 parser_igblast.set_defaults(func=parseIgBlast)
940 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
941 required=True,
942 help='''IgBLAST output files in format 7 with query sequence
943 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
944 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
945 help='''List of folders and/or fasta files containing
946 IMGT-gapped germline sequences corresponding to the
947 set of germlines used in the IgBLAST alignment.''')
948 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
949 required=True,
950 help='List of input FASTA files containing sequences')
951 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
952 help='''Specify if input IDs should not be parsed to add
953 new columns to database.''')
954 parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
955 help='''Specify if alignment score metrics should be
956 included in the output. Adds the V_SCORE, V_IDENTITY,
957 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
958 J_BTOP, and J_EVALUE columns.''')
959 parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
960 help='''Specify if IMGT framework and CDR regions should be
961 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
962 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
963 CDR3_IMGT columns.''')
964
965 # IMGT aligner
966 parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
967 parents=[parser_parent],
968 formatter_class=CommonHelpFormatter)
969 imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
970 imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
971 help='''Either zipped IMGT output files (.zip) or a folder
972 containing unzipped IMGT output files (which must
973 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
974 and 6_Junction).''')
975 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
976 required=False,
977 help='List of input FASTA files containing sequences')
978 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
979 help='''Specify if input IDs should not be parsed to add new
980 columns to database.''')
981 parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
982 help='''Specify if alignment score metrics should be
983 included in the output. Adds the V_SCORE, V_IDENTITY,
984 J_SCORE and J_IDENTITY. Note, this will also add
985 the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
986 but they will be empty for IMGT output.''')
987 parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
988 help='''Specify if IMGT framework and CDR regions should be
989 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
990 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
991 CDR3_IMGT columns.''')
992 parser_imgt.set_defaults(func=parseIMGT)
993
994 return parser
995
996
997 if __name__ == "__main__":
998 """
999 Parses command line arguments and calls main
1000 """
1001 parser = getArgParser()
1002 args = parser.parse_args()
1003 args_dict = parseCommonArgs(args, in_arg='aligner_files')
1004
1005 # Set no ID parsing if sequence files are not provided
1006 if 'seq_files' in args_dict and not args_dict['seq_files']:
1007 args_dict['no_parse'] = True
1008
1009 # Delete
1010 if 'seq_files' in args_dict: del args_dict['seq_files']
1011 if 'aligner_files' in args_dict: del args_dict['aligner_files']
1012 if 'command' in args_dict: del args_dict['command']
1013 if 'func' in args_dict: del args_dict['func']
1014
1015 if args.command == 'imgt':
1016 for i in range(len(args.__dict__['aligner_files'])):
1017 args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
1018 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
1019 if args.__dict__['seq_files'] else None
1020 args.func(**args_dict)
1021 elif args.command == 'igblast':
1022 for i in range(len(args.__dict__['aligner_files'])):
1023 args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
1024 args_dict['seq_file'] = args.__dict__['seq_files'][i]
1025 args.func(**args_dict)