#!/usr/bin/env python3
"""
Create tab-delimited database file to store sequence alignment information
"""
# Info
__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
from changeo import __version__, __date__

# Imports
import csv
import os
import re
import sys
import pandas as pd
import tarfile
import zipfile
from argparse import ArgumentParser
from collections import OrderedDict
from itertools import groupby
from shutil import rmtree
from tempfile import mkdtemp
from textwrap import dedent
from time import time
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Presto and changeo imports
from presto.Defaults import default_out_args
from presto.Annotation import parseAnnotation
from presto.IO import countSeqFile, printLog, printProgress
from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
from changeo.IO import getDbWriter, countDbFile, getRepo
from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
                             j_allele_regex

# Default parameters
default_delimiter = ('\t', ',', '-')


def gapV(ig_dict, repo_dict):
    """
    Insert gaps into V region and update alignment information

    Arguments:
    ig_dict : Dictionary of parsed IgBlast output
    repo_dict : Dictionary of IMGT gapped germline sequences

    Returns:
    dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
    """

    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ']) - 1) + ig_dict['SEQUENCE_VDJ']

    # Find gapped germline V segment
    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
    vkey = (vgene, )
    # TODO: Figure out else case
    if vkey in repo_dict:
        vgap = repo_dict[vkey]
        # Iterate over gaps in the germline segment
        gaps = re.finditer(r'\.', vgap)
        gapcount = int(ig_dict['V_GERM_START_VDJ']) - 1
        for gap in gaps:
            i = gap.start()
            # Break if gap begins after V region
            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
                break
            # Insert gap into IMGT sequence
            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
            # Update gap counter
            gapcount += 1
        ig_dict['SEQUENCE_IMGT'] = seq_imgt
        # Update IMGT positioning information for V
        ig_dict['V_GERM_START_IMGT'] = 1
        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount

    return ig_dict


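# Illustration of gapV() with hypothetical values: if the IMGT-gapped germline V contains
# '.' gap characters before the end of the aligned V region, each of those gaps is inserted
# at the same offset into the query, so SEQUENCE_IMGT shares the gapped germline coordinate
# system and V_GERM_START_IMGT can simply be set to 1.
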
def getIMGTJunc(ig_dict, repo_dict):
    """
    Identify junction region by IMGT definition

    Arguments:
    ig_dict : Dictionary of parsed IgBlast output
    repo_dict : Dictionary of IMGT gapped germline sequences

    Returns:
    dict : Updated with JUNCTION and JUNCTION_LENGTH fields
    """
    # Find germline J segment
    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
    jkey = (jgene, )
    # TODO: Figure out else case
    if jkey in repo_dict:
        # Get germline J sequence
        jgerm = repo_dict[jkey]
        jgerm = jgerm[:ig_dict['J_GERM_START'] + ig_dict['J_GERM_LENGTH'] - 1]
        # Look for (F|W)GXG aa motif in nt sequence
        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
        aa_end = len(ig_dict['SEQUENCE_IMGT'])
        # TODO: Figure out else case
        if motif:
            # print('\n', motif.group())
            aa_end = motif.start() - len(jgerm) + 3
        # Add fields to dict
        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])

    return ig_dict


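# Note on getIMGTJunc() coordinates: in IMGT unique numbering the junction begins at
# codon 104 (the conserved 2nd-CYS), i.e. nucleotide 310 of the gapped sequence, hence
# the 0-based slice starting at index 309. When the J motif is found, aa_end is a negative
# offset from the end of the sequence, which Python slicing handles directly.
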
def getRegions(ig_dict):
    """
    Identify FWR and CDR regions by IMGT definition

    Arguments:
    ig_dict : Dictionary of parsed alignment output

    Returns:
    dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
           CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
    """
    try:
        seq_len = len(ig_dict['SEQUENCE_IMGT'])
        ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78, seq_len)]
    except (KeyError, IndexError):
        return ig_dict

    try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
    except (IndexError): return ig_dict

    try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
    except (IndexError): return ig_dict

    try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
    except (IndexError): return ig_dict

    try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
    except (IndexError): return ig_dict

    try:
        cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
        ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
        ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
    except (KeyError, IndexError):
        return ig_dict

    return ig_dict


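# Region boundaries used by getRegions(), as 0-based nucleotide slices of SEQUENCE_IMGT
# (IMGT unique numbering: codons 1-26 = FR1, 27-38 = CDR1, 39-55 = FR2, 56-65 = CDR2,
# 66-104 = FR3):
#   FWR1 [0:78)     CDR1 [78:114)    FWR2 [114:165)   CDR2 [165:195)
#   FWR3 [195:312)  CDR3 [312:306 + JUNCTION_LENGTH)  FWR4 the remainder
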
def getSeqforIgBlast(seq_file):
    """
    Fetch input sequences for IgBlast queries

    Arguments:
    seq_file = a fasta file of sequences input to IgBlast

    Returns:
    a dictionary of {ID:Seq}
    """

    seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)

    # Build a lookup of {full sequence description: sequence string}
    seqs = {}
    for seq in seq_dict.values():
        seqs.update({seq.description: str(seq.seq)})

    return seqs


def findLine(handle, query):
    """
    Finds line with query string in file

    Arguments:
    handle = file handle in which to search for line
    query = query string for which to search in file

    Returns:
    line from handle in which query string was found
    """
    for line in handle:
        if re.match(query, line):
            return line


def extractIMGT(imgt_output):
    """
    Extract necessary files from IMGT results, zipped or unzipped

    Arguments:
    imgt_output = zipped file or unzipped folder output by IMGT

    Returns:
    tuple of (temporary directory, sorted list of filenames from which information will be read)
    """
    #file_ext = os.path.splitext(imgt_output)[1].lower()
    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
    temp_dir = mkdtemp()
    if zipfile.is_zipfile(imgt_output):
        # Open zip file
        imgt_zip = zipfile.ZipFile(imgt_output, 'r')
        # Extract required files
        imgt_files = sorted([n for n in imgt_zip.namelist() \
                             if os.path.basename(n).startswith(imgt_flags)])
        imgt_zip.extractall(temp_dir, imgt_files)
        # Define file list
        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
    elif os.path.isdir(imgt_output):
        # Find required files in folder
        folder_files = []
        for root, dirs, files in os.walk(imgt_output):
            folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
        # Define file list
        imgt_files = sorted([n for n in folder_files \
                             if os.path.basename(n).startswith(imgt_flags)])
    elif tarfile.is_tarfile(imgt_output):
        # Open tar file
        imgt_tar = tarfile.open(imgt_output, 'r')
        # Extract required files
        imgt_files = sorted([n for n in imgt_tar.getnames() \
                             if os.path.basename(n).startswith(imgt_flags)])
        imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
        # Define file list
        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
    else:
        sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')

    if len(imgt_files) > len(imgt_flags):  # e.g. multiple 1_Summary files
        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
    elif len(imgt_files) < len(imgt_flags):
        sys.exit('ERROR: Missing necessary file in IMGT output %s.' % imgt_output)

    return temp_dir, imgt_files


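# Note: extractIMGT() returns the temporary directory along with the file list so the
# caller can clean it up; parseIMGT() removes it with rmtree() after the database is written.
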
# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
def readOneIgBlastResult(block):
    """
    Parse a single IgBLAST query result

    Arguments:
    block = itertools groupby object of single result

    Returns:
    None if no results, otherwise list of DataFrames for each result block
    """
    results = list()
    i = 0
    for match, subblock in groupby(block, lambda l: l == '\n'):
        if not match:
            # Strip whitespace and comments
            sub = [s.strip() for s in subblock if not s.startswith('#')]

            # Continue on empty block
            if not sub: continue
            else: i += 1

            # Split by tabs
            sub = [s.split('\t') for s in sub]

            # Append list for "V-(D)-J rearrangement summary" (i == 1)
            # And "V-(D)-J junction details" (i == 2)
            # Otherwise append DataFrame of subblock
            if i == 1 or i == 2:
                results.append(sub[0])
            else:
                df = pd.DataFrame(sub)
                if not df.empty: results.append(df)

    return results if results else None


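# Sketch of the IgBLAST '-outfmt 7' layout assumed by readIgBlast() and
# readOneIgBlastResult() (exact comment lines may vary by IgBLAST version):
#   # IGBLASTN <version>
#   # Query: <sequence id>
#   ...comment lines...
#   <V-(D)-J rearrangement summary>   -> sub-block 1 (single tab-delimited row)
#   <V-(D)-J junction details>        -> sub-block 2 (single tab-delimited row)
#   <hit table>                       -> final sub-block, read into a pandas DataFrame
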
# TODO: needs more speed. pandas is probably to blame.
def readIgBlast(igblast_output, seq_dict, repo_dict,
                score_fields=False, region_fields=False):
    """
    Reads IgBlast output

    Arguments:
    igblast_output = IgBlast output file (format 7)
    seq_dict = a dictionary of {ID:Seq} from input fasta file
    repo_dict = dictionary of IMGT gapped germline sequences
    score_fields = if True parse alignment scores
    region_fields = if True add FWR and CDR region fields

    Returns:
    a generator of dictionaries containing alignment data
    """

    # Open IgBlast output file
    with open(igblast_output) as f:
        # Iterate over individual results (separated by # IGBLASTN)
        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
            block = list(block)
            if not k1:
                # TODO: move query name extraction into block parser readOneIgBlastResult().
                # Extract sequence ID
                query_name = ' '.join(block[0].strip().split(' ')[2:])
                # Initialize db_gen to have ID and input sequence
                db_gen = {'SEQUENCE_ID': query_name,
                          'SEQUENCE_INPUT': seq_dict[query_name]}

                # Parse further sub-blocks
                block_list = readOneIgBlastResult(block)

                # TODO: this is indented pretty far. should be a separate function. or several functions.
                # If results exist, parse further to obtain full db_gen
                if block_list is not None:
                    # Parse quality information
                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
                    if block_list[0][-1] == '-':
                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
                                                           IUPAC.ambiguous_dna).reverse_complement())

                    # Parse V, D, and J calls
                    call_str = ' '.join(block_list[0])
                    v_call = parseAllele(call_str, v_allele_regex, action='list')
                    d_call = parseAllele(call_str, d_allele_regex, action='list')
                    j_call = parseAllele(call_str, j_allele_regex, action='list')
                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'

                    # Parse junction sequence
                    # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
                    # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])

                    # TODO: IgBLAST sometimes fails to output block #3. why?
                    # TODO: maybe we should fail these. they look craptastic.
                    #pd.set_option('display.width', 500)
                    #print query_name, len(block_list), hit_idx
                    #for i, x in enumerate(block_list):
                    #    print '[%i]' % i
                    #    print x

                    # Parse segment start and stop positions
                    hit_df = block_list[-1]

                    # Alignment info block
                    #  0: segment
                    #  1: query id
                    #  2: subject id
                    #  3: % identity
                    #  4: alignment length
                    #  5: mismatches
                    #  6: gap opens
                    #  7: gaps
                    #  8: q. start
                    #  9: q. end
                    # 10: s. start
                    # 11: s. end
                    # 12: evalue
                    # 13: bit score
                    # 14: query seq
                    # 15: subject seq
                    # 16: btop

                    # If V call exists, parse V alignment information
                    seq_vdj = ''
                    if v_call is not None:
                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
                        # Germline positions
                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
                        # Query sequence positions
                        db_gen['V_SEQ_START'] = int(v_align[8])
                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1

                        if int(v_align[6]) == 0:
                            db_gen['INDELS'] = 'F'
                        else:
                            db_gen['INDELS'] = 'T'
                            # Set functional to none so record gets tossed (junction will be wrong)
                            # db_gen['FUNCTIONAL'] = None

                        # V alignment scores
                        if score_fields:
                            try: db_gen['V_SCORE'] = float(v_align[13])
                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'

                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'

                            try: db_gen['V_EVALUE'] = float(v_align[12])
                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'

                            try: db_gen['V_BTOP'] = v_align[16]
                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'

                        # Update VDJ sequence, removing insertions
                        start = 0
                        for m in re.finditer(r'-', v_align[15]):
                            ins = m.start()
                            seq_vdj += v_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += v_align[14][start:]

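                    # N-region bookkeeping for the D and J blocks below: the N length is the
                    # gap between the end of the upstream alignment and the start of the
                    # current one in the query. A negative gap means the alignments overlap,
                    # in which case the N length is set to 0 and the overlap is trimmed from
                    # the start of the downstream segment via the 'overlap' offset.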
                    # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
                    # If D call exists, parse D alignment information
                    if d_call is not None:
                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]

                        # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
                        # Determine N-region length and amount of D overlap with the V alignment
                        overlap = 0
                        if v_call is not None:
                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
                            if n1_len < 0:
                                db_gen['N1_LENGTH'] = 0
                                overlap = abs(n1_len)
                            else:
                                db_gen['N1_LENGTH'] = n1_len
                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'] - 1)
                                n1_end = int(d_align[8]) - 1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]

                        # Query sequence positions
                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)

                        # Germline positions
                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)

                        # Update VDJ sequence, removing insertions
                        start = overlap
                        for m in re.finditer(r'-', d_align[15]):
                            ins = m.start()
                            seq_vdj += d_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += d_align[14][start:]

                    # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
                    # If J call exists, parse J alignment information
                    if j_call is not None:
                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]

                        # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
                        # Determine N-region length and amount of J overlap with V or D alignment
                        overlap = 0
                        if d_call is not None:
                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
                            if n2_len < 0:
                                db_gen['N2_LENGTH'] = 0
                                overlap = abs(n2_len)
                            else:
                                db_gen['N2_LENGTH'] = n2_len
                                n2_start = (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'] - 1)
                                n2_end = int(j_align[8]) - 1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
                        elif v_call is not None:
                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
                            if n1_len < 0:
                                db_gen['N1_LENGTH'] = 0
                                overlap = abs(n1_len)
                            else:
                                db_gen['N1_LENGTH'] = n1_len
                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'] - 1)
                                n1_end = int(j_align[8]) - 1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
                        else:
                            db_gen['N1_LENGTH'] = 0

                        # Query positions
                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)

                        # Germline positions
                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)

                        # J alignment scores
                        if score_fields:
                            try: db_gen['J_SCORE'] = float(j_align[13])
                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'

                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'

                            try: db_gen['J_EVALUE'] = float(j_align[12])
                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'

                            try: db_gen['J_BTOP'] = j_align[16]
                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'

                        # Update VDJ sequence, removing insertions
                        start = overlap
                        for m in re.finditer(r'-', j_align[15]):
                            ins = m.start()
                            seq_vdj += j_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += j_align[14][start:]

                    db_gen['SEQUENCE_VDJ'] = seq_vdj

                    # Create IMGT-gapped sequence and infer IMGT junction
                    if v_call is not None:
                        db_gen = gapV(db_gen, repo_dict)
                    if j_call is not None:
                        db_gen = getIMGTJunc(db_gen, repo_dict)

                    # FWR and CDR regions
                    if region_fields: getRegions(db_gen)

                yield IgRecord(db_gen)


# TODO: should be more readable
def readIMGT(imgt_files, score_fields=False, region_fields=False):
    """
    Reads IMGT/HighV-Quest output

    Arguments:
    imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
    score_fields = if True parse alignment scores
    region_fields = if True add FWR and CDR region fields

    Returns:
    a generator of dictionaries containing alignment data
    """
    imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
    # Create a dictionary for each sequence alignment and yield its generator
    for sm, gp, nt, jn in zip(*imgt_iters):
        if len(set([sm['Sequence ID'],
                    gp['Sequence ID'],
                    nt['Sequence ID'],
                    jn['Sequence ID']])) != 1:
            sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
                     % sm['Sequence ID'])

        db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
                  'SEQUENCE_INPUT': sm['Sequence']}

        if 'No results' not in sm['Functionality']:
            db_gen['FUNCTIONAL'] = ['?', 'T', 'F'][('productive' in sm['Functionality']) +
                                                   ('unprod' in sm['Functionality'])]
            db_gen['IN_FRAME'] = ['?', 'T', 'F'][('in-frame' in sm['JUNCTION frame']) +
                                                 ('out-of-frame' in sm['JUNCTION frame'])]
            db_gen['STOP'] = ['F', '?', 'T'][('stop codon' in sm['Functionality comment']) +
                                             ('unprod' in sm['Functionality'])]
            db_gen['MUTATED_INVARIANT'] = ['F', '?', 'T'][(any(('missing' in sm['Functionality comment'],
                                                                'missing' in sm['V-REGION potential ins/del']))) +
                                                          ('unprod' in sm['Functionality'])]
            db_gen['INDELS'] = ['F', 'T'][any((sm['V-REGION potential ins/del'],
                                               sm['V-REGION insertions'],
                                               sm['V-REGION deletions']))]

            db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
            db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']

            db_gen['V_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
            db_gen['D_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
            db_gen['J_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))

            v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
            db_gen['V_SEQ_START'] = nt['V-REGION start']
            db_gen['V_SEQ_LENGTH'] = v_seq_length
            db_gen['V_GERM_START_IMGT'] = 1
            db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0

            db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
                                                       jn['N-REGION-nt nb'],
                                                       jn['N1-REGION-nt nb'],
                                                       jn["P5'D-nt nb"]] if i)
            db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
                                                         jn["P3'V-nt nb"],
                                                         jn['N-REGION-nt nb'],
                                                         jn['N1-REGION-nt nb'],
                                                         jn["P5'D-nt nb"]] if i)
            db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
            db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
            db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
            db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
                                                       jn['N2-REGION-nt nb'],
                                                       jn["P5'J-nt nb"]] if i)

            db_gen['J_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
                                                         jn["P3'V-nt nb"],
                                                         jn['N-REGION-nt nb'],
                                                         jn['N1-REGION-nt nb'],
                                                         jn["P5'D-nt nb"],
                                                         jn["D-REGION-nt nb"],
                                                         jn["P3'D-nt nb"],
                                                         jn['N2-REGION-nt nb'],
                                                         jn["P5'J-nt nb"]] if i)
            db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
            db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
            db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0

            db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
            db_gen['JUNCTION'] = jn['JUNCTION']

            # Alignment scores
            if score_fields:
                try: db_gen['V_SCORE'] = float(sm['V-REGION score'])
                except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'

                try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
                except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'

                try: db_gen['J_SCORE'] = float(sm['J-REGION score'])
                except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'

                try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
                except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'

            # FWR and CDR regions
            if region_fields: getRegions(db_gen)
        else:
            db_gen['V_CALL'] = 'None'
            db_gen['D_CALL'] = 'None'
            db_gen['J_CALL'] = 'None'

        yield IgRecord(db_gen)


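# Note on readIMGT() coordinates: IMGT reports 1-based positions and per-segment lengths,
# so start positions are rebuilt by summation. For example, D_SEQ_START is
# 1 + len(V-REGION) + P3'V + N/N1 + P5'D, i.e. the first query position after every
# junction segment upstream of the D-REGION; empty fields are skipped by the 'if i' filter.
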
def getIDforIMGT(seq_file):
    """
    Create a sequence ID translation using IMGT truncation

    Arguments:
    seq_file = a fasta file of sequences input to IMGT

    Returns:
    a dictionary of {truncated ID: full seq description}
    """

    # Create a seq_dict ID translation using IDs truncated up to space or 50 chars
    ids = {}
    for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
        if len(rec.description) <= 50:
            id_key = rec.description
        else:
            id_key = re.sub(r'\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
        ids.update({id_key: rec.description})

    return ids


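# Note: getIDforIMGT() mimics the identifier truncation applied by IMGT/HighV-QUEST
# (50 characters, special characters replaced with '_') so that IMGT records can be
# mapped back to the full pRESTO sequence descriptions in writeDb().
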
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
            score_fields=False, region_fields=False, out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory

    Arguments:
    db_gen = a generator of IgRecord objects containing alignment data
    file_prefix = directory and prefix for CLIP tab-delim file
    total_count = number of records (for progress bar)
    id_dict = a dictionary of {IMGT ID: full seq description}
    no_parse = if True, do not parse sequence IDs for pRESTO annotations with default delimiters
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    pass_file = "%s_db-pass.tab" % file_prefix
    fail_file = "%s_db-fail.tab" % file_prefix
    ordered_fields = ['SEQUENCE_ID',
                      'SEQUENCE_INPUT',
                      'FUNCTIONAL',
                      'IN_FRAME',
                      'STOP',
                      'MUTATED_INVARIANT',
                      'INDELS',
                      'V_CALL',
                      'D_CALL',
                      'J_CALL',
                      'SEQUENCE_VDJ',
                      'SEQUENCE_IMGT',
                      'V_SEQ_START',
                      'V_SEQ_LENGTH',
                      'V_GERM_START_VDJ',
                      'V_GERM_LENGTH_VDJ',
                      'V_GERM_START_IMGT',
                      'V_GERM_LENGTH_IMGT',
                      'N1_LENGTH',
                      'D_SEQ_START',
                      'D_SEQ_LENGTH',
                      'D_GERM_START',
                      'D_GERM_LENGTH',
                      'N2_LENGTH',
                      'J_SEQ_START',
                      'J_SEQ_LENGTH',
                      'J_GERM_START',
                      'J_GERM_LENGTH',
                      'JUNCTION_LENGTH',
                      'JUNCTION']

    if score_fields:
        ordered_fields.extend(['V_SCORE',
                               'V_IDENTITY',
                               'V_EVALUE',
                               'V_BTOP',
                               'J_SCORE',
                               'J_IDENTITY',
                               'J_EVALUE',
                               'J_BTOP'])

    if region_fields:
        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])


    # TODO: This is not the best approach. should pass in output fields.
    # Initiate passed handle
    pass_handle = None

    # Open failed file
    if out_args['failed']:
        fail_handle = open(fail_file, 'wt')
        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
    else:
        fail_handle = None
        fail_writer = None

    # Initialize counters and file
    pass_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for record in db_gen:
        #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # Count pass or fail
        if (record.v_call == 'None' and record.j_call == 'None') or \
                record.functional is None or \
                not record.seq_vdj or \
                not record.junction:
            # print(record.v_call, record.j_call, record.functional, record.junction)
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(record.toDict())
            continue
        else:
            pass_count += 1

        # Build sample sequence description
        if record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
            record.id = record.annotations['ID']
            del record.annotations['ID']

        # TODO: This is not the best approach. should pass in output fields.
        # If first sequence, use parsed description to create new columns and initialize writer
        if pass_writer is None:
            if not no_parse: ordered_fields.extend(list(record.annotations.keys()))
            pass_handle = open(pass_file, 'wt')
            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)

        # Write row to tab-delim CLIP file
        pass_writer.writerow(record.toDict())

    # Print log
    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
    printProgress(rec_count, total_count, 0.05, start_time)

    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)

    if pass_handle is not None: pass_handle.close()
    if fail_handle is not None: fail_handle.close()


# TODO: may be able to merge with parseIMGT
def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
                 region_fields=False, out_args=default_out_args):
    """
    Main for IgBlast aligned sample sequences

    Arguments:
    igblast_output = IgBlast output file to process
    seq_file = fasta file input to IgBlast (from which to get sequence)
    repo = folder with germline repertoire files
    no_parse = if True, do not parse sequence IDs for pRESTO annotations
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['ALIGNER'] = 'IgBlast'
    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get input sequence dictionary
    seq_dict = getSeqforIgBlast(seq_file)

    # Formalize out_dir and file-prefix
    if not out_args['out_dir']:
        out_dir = os.path.split(igblast_output)[0]
    else:
        out_dir = os.path.abspath(out_args['out_dir'])
        if not os.path.exists(out_dir): os.mkdir(out_dir)
    if out_args['out_name']:
        file_prefix = out_args['out_name']
    else:
        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
    file_prefix = os.path.join(out_dir, file_prefix)

    total_count = countSeqFile(seq_file)

    # Create germline repository and parse IgBlast output into the database
    repo_dict = getRepo(repo)
    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
                               score_fields=score_fields, region_fields=region_fields)
    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
            score_fields=score_fields, region_fields=region_fields, out_args=out_args)


# TODO: may be able to merge with parseIgBlast
def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
              region_fields=False, out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences

    Arguments:
    imgt_output = zipped file or unzipped folder output by IMGT
    seq_file = FASTA file input to IMGT (from which to get seqID)
    no_parse = if True, do not parse sequence IDs for pRESTO annotations
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['ALIGNER'] = 'IMGT'
    log['ALIGN_RESULTS'] = imgt_output
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get individual IMGT result files
    temp_dir, imgt_files = extractIMGT(imgt_output)

    # Formalize out_dir and file-prefix
    if not out_args['out_dir']:
        out_dir = os.path.dirname(os.path.abspath(imgt_output))
    else:
        out_dir = os.path.abspath(out_args['out_dir'])
        if not os.path.exists(out_dir): os.mkdir(out_dir)
    if out_args['out_name']:
        file_prefix = out_args['out_name']
    else:
        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
    file_prefix = os.path.join(out_dir, file_prefix)

    total_count = countDbFile(imgt_files[0])

    # Get (parsed) IDs from fasta file submitted to IMGT
    id_dict = getIDforIMGT(seq_file) if seq_file else {}

    # Parse IMGT output into the database
    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
                         region_fields=region_fields)
    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
            score_fields=score_fields, region_fields=region_fields, out_args=out_args)

    # Delete temp directory
    rmtree(temp_dir)


def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
    None

    Returns:
    an ArgumentParser object
    """
    fields = dedent(
             '''
              output files:
                  db-pass
                      database of parsed alignment records.
                  db-fail
                      database with records failing alignment.

              output fields:
                  SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
                  INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
                  V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
                  V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
                  J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
                  JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
                  J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
                  FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
              ''')

    # Define ArgumentParser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            formatter_class=CommonHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='%(prog)s:' + ' %s-%s' % (__version__, __date__))
    subparsers = parser.add_subparsers(title='subcommands', dest='command',
                                       help='Aligner used', metavar='')
    # TODO: This is a temporary fix for Python issue 9253
    subparsers.required = True

    # Parent parser
    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)

    # IgBlast Aligner
    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
                                           parents=[parser_parent],
                                           formatter_class=CommonHelpFormatter)
    parser_igblast.set_defaults(func=parseIgBlast)
    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
                                required=True,
                                help='''IgBLAST output files in format 7 with query sequence
                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
                                help='''List of folders and/or fasta files containing
                                     IMGT-gapped germline sequences corresponding to the
                                     set of germlines used in the IgBLAST alignment.''')
    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
                                required=True,
                                help='List of input FASTA files containing sequences')
    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
                                help='''Specify if input IDs should not be parsed to add
                                     new columns to database.''')
    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
                                help='''Specify if alignment score metrics should be
                                     included in the output. Adds the V_SCORE, V_IDENTITY,
                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
                                     J_EVALUE, and J_BTOP columns.''')
    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
                                help='''Specify if IMGT framework and CDR regions should be
                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                     CDR3_IMGT columns.''')

    # IMGT aligner
    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
                                        parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter)
    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
                                help='''Either zipped IMGT output files (.zip) or a folder
                                     containing unzipped IMGT output files (which must
                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
                                     and 6_Junction).''')
    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
                             required=False,
                             help='List of input FASTA files containing sequences')
    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
                             help='''Specify if input IDs should not be parsed to add new
                                  columns to database.''')
    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
                             help='''Specify if alignment score metrics should be
                                  included in the output. Adds the V_SCORE, V_IDENTITY,
                                  J_SCORE and J_IDENTITY columns. Note, this will also add
                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
                                  but they will be empty for IMGT output.''')
    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
                             help='''Specify if IMGT framework and CDR regions should be
                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                  CDR3_IMGT columns.''')
    parser_imgt.set_defaults(func=parseIMGT)

    return parser


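# Example invocations (illustrative only; file and folder names are placeholders, and the
# script is assumed to be installed as MakeDb.py):
#   MakeDb.py igblast -i sample.fmt7 -s sample.fasta -r IMGT_gapped_germlines/ --scores --regions
#   MakeDb.py imgt -i sample_imgt.zip -s sample.fasta --regions
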
if __name__ == "__main__":
    """
    Parses command line arguments and calls main
    """
    parser = getArgParser()
    args = parser.parse_args()
    args_dict = parseCommonArgs(args, in_arg='aligner_files')

    # Set no ID parsing if sequence files are not provided
    if 'seq_files' in args_dict and not args_dict['seq_files']:
        args_dict['no_parse'] = True

    # Delete arguments that are not passed to the processing functions
    if 'seq_files' in args_dict: del args_dict['seq_files']
    if 'aligner_files' in args_dict: del args_dict['aligner_files']
    if 'command' in args_dict: del args_dict['command']
    if 'func' in args_dict: del args_dict['func']

    if args.command == 'imgt':
        for i in range(len(args.__dict__['aligner_files'])):
            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
                                    if args.__dict__['seq_files'] else None
            args.func(**args_dict)
    elif args.command == 'igblast':
        for i in range(len(args.__dict__['aligner_files'])):
            args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
            args_dict['seq_file'] = args.__dict__['seq_files'][i]
            args.func(**args_dict)