shm_csr > comparison: change_o/MakeDb.py @ 52:22dddabe3637 (draft)

Uploaded

author | davidvanzessen |
---|---|
date | Tue, 23 May 2017 08:32:58 -0400 |
parents | c33d93683a09 |
children | |
comparison
51:8fa8836bd605 (previous revision) | 52:22dddabe3637 (this revision) |
---|---|
5 # Info | 5 # Info |
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' |
7 from changeo import __version__, __date__ | 7 from changeo import __version__, __date__ |
8 | 8 |
9 # Imports | 9 # Imports |
10 import csv | |
11 import os | 10 import os |
12 import re | |
13 import sys | 11 import sys |
14 import pandas as pd | |
15 import tarfile | |
16 import zipfile | |
17 from argparse import ArgumentParser | 12 from argparse import ArgumentParser |
18 from collections import OrderedDict | 13 from collections import OrderedDict |
19 from itertools import groupby | |
20 from shutil import rmtree | |
21 from tempfile import mkdtemp | |
22 from textwrap import dedent | 14 from textwrap import dedent |
23 from time import time | 15 from time import time |
24 from Bio import SeqIO | 16 from Bio import SeqIO |
25 from Bio.Seq import Seq | |
26 from Bio.Alphabet import IUPAC | |
27 | 17 |
28 # Presto and changeo imports | 18 # Presto and changeo imports |
29 from presto.Defaults import default_out_args | 19 from presto.Defaults import default_out_args |
30 from presto.Annotation import parseAnnotation | 20 from presto.Annotation import parseAnnotation |
31 from presto.IO import countSeqFile, printLog, printProgress | 21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile |
32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs | 22 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs |
33 from changeo.IO import getDbWriter, countDbFile, getRepo | 23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo |
34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ | 24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT |
35 j_allele_regex | 25 |
36 | 26 |
37 # Default parameters | 27 def getFilePrefix(aligner_output, out_args): |
38 default_delimiter = ('\t', ',', '-') | 28 """ |
39 | 29 Get file name prefix and create output directory |
40 | |
41 def gapV(ig_dict, repo_dict): | |
42 """ | |
43 Insert gaps into V region and update alignment information | |
44 | 30 |
45 Arguments: | 31 Arguments: |
46 ig_dict : Dictionary of parsed IgBlast output | 32 aligner_output : aligner output file or directory. |
47 repo_dict : Dictionary of IMGT gapped germline sequences | 33 out_args : dictionary of output arguments. |
48 | 34 |
49 Returns: | 35 Returns: |
50 dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields | 36 str : file name prefix. |
51 """ | 37 """ |
52 | 38 # Determine output directory |
53 seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] | 39 if not out_args['out_dir']: |
54 | 40 out_dir = os.path.dirname(os.path.abspath(aligner_output)) |
55 # Find gapped germline V segment | 41 else: |
56 vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') | 42 out_dir = os.path.abspath(out_args['out_dir']) |
57 vkey = (vgene, ) | 43 if not os.path.exists(out_dir): |
58 #TODO: Figure out else case | 44 os.mkdir(out_dir) |
59 if vkey in repo_dict: | 45 |
60 vgap = repo_dict[vkey] | 46 # Determine file prefix |
61 # Iterate over gaps in the germline segment | 47 if out_args['out_name']: |
62 gaps = re.finditer(r'\.', vgap) | 48 file_prefix = out_args['out_name'] |
63 gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 | 49 else: |
64 for gap in gaps: | 50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] |
65 i = gap.start() | 51 |
66 # Break if gap begins after V region | 52 return os.path.join(out_dir, file_prefix) |
67 if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: | 53 |
68 break | 54 |
69 # Insert gap into IMGT sequence | 55 def getSeqDict(seq_file): |
70 seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] | 56 """ |
71 # Update gap counter | 57 Create a dictionary from a sequence file. |
72 gapcount += 1 | |
73 ig_dict['SEQUENCE_IMGT'] = seq_imgt | |
74 # Update IMGT positioning information for V | |
75 ig_dict['V_GERM_START_IMGT'] = 1 | |
76 ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount | |
77 | |
78 return ig_dict | |
79 | |
80 | |
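For orientation, a minimal standalone sketch of the gap-propagation idea used by gapV() above: '.' gaps found in the IMGT-gapped germline are re-inserted into the ungapped query at the corresponding positions. The sequences are made up, and the sketch assumes no indels so the aligned germline length equals the query length.

```python
import re

# Simplified sketch of gapV(): copy '.' gaps from an IMGT-gapped germline
# into an ungapped query that starts at germline position germ_start (1-based).
# Assumes no indels, so the aligned germline length equals len(query).
def gap_query(query, gapped_germline, germ_start=1):
    seq_imgt = '.' * (germ_start - 1) + query
    gapcount = germ_start - 1
    for gap in re.finditer(r'\.', gapped_germline):
        i = gap.start()
        # Stop once the gap falls beyond the aligned V region
        if i >= len(query) + gapcount:
            break
        seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
        gapcount += 1
    return seq_imgt

print(gap_query('ACGTAC', 'AC.GT.AC'))  # -> AC.GT.AC
```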
81 def getIMGTJunc(ig_dict, repo_dict): | |
82 """ | |
83 Identify junction region by IMGT definition | |
84 | 58 |
85 Arguments: | 59 Arguments: |
86 ig_dict : Dictionary of parsed IgBlast output | 60 seq_file : sequence file. |
87 repo_dict : Dictionary of IMGT gapped germline sequences | |
88 | 61 |
89 Returns: | 62 Returns: |
90 dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields | 63 dict : sequence description as keys with Bio.SeqRecords as values. |
91 """ | 64 """ |
92 # Find germline J segment | 65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), |
93 jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') | 66 key_function=lambda x: x.description) |
94 jkey = (jgene, ) | 67 |
95 #TODO: Figure out else case | 68 return seq_dict |
96 if jkey in repo_dict: | 69 |
97 # Get germline J sequence | 70 |
98 jgerm = repo_dict[jkey] | 71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, |
99 jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] | 72 out_args=default_out_args): |
100 # Look for (F|W)GXG aa motif in nt sequence | 73 """ |
101 motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) | 74 Writes tab-delimited database file in output directory. |
102 aa_end = len(ig_dict['SEQUENCE_IMGT']) | |
103 #TODO: Figure out else case | |
104 if motif: | |
105 # print('\n', motif.group()) | |
106 aa_end = motif.start() - len(jgerm) + 3 | |
107 # Add fields to dict | |
108 ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] | |
109 ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) | |
110 | |
111 return ig_dict | |
112 | |
113 | |
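As a side note, the nucleotide regex used by getIMGTJunc() above encodes the conserved (F|W)-G-X-G amino-acid motif that closes the IMGT junction. A small hedged example with a made-up germline J fragment:

```python
import re

# Codons matched: TTT/TTC (F) or TGG (W), then GGN (G), NNN (X), GGN (G).
motif_re = re.compile(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]')

jgerm = 'ACTACTTTGACTACTGGGGCCAGGGAACC'  # hypothetical germline J fragment
m = motif_re.search(jgerm)
if m:
    # As above, the junction end is expressed relative to the end of jgerm
    aa_end = m.start() - len(jgerm) + 3
    print(m.group(), aa_end)  # -> TGGGGCCAGGGA -12
```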
114 def getRegions(ig_dict): | |
115 """ | |
116 Identify FWR and CDR regions by IMGT definition | |
117 | |
118 Arguments: | |
119 ig_dict : Dictionary of parsed alignment output | |
120 | |
121 Returns: | |
122 dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | |
123 CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields | |
124 """ | |
125 try: | |
126 seq_len = len(ig_dict['SEQUENCE_IMGT']) | |
127 ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] | |
128 except (KeyError, IndexError): | |
129 return ig_dict | |
130 | |
131 try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] | |
132 except (IndexError): return ig_dict | |
133 | |
134 try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] | |
135 except (IndexError): return ig_dict | |
136 | |
137 try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] | |
138 except (IndexError): return ig_dict | |
139 | |
140 try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] | |
141 except (IndexError): return ig_dict | |
142 | |
143 try: | |
144 cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] | |
145 ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] | |
146 ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] | |
147 except (KeyError, IndexError): | |
148 return ig_dict | |
149 | |
150 return ig_dict | |
151 | |
152 | |
153 def getSeqforIgBlast(seq_file): | |
154 """ | |
155 Fetch input sequences for IgBlast queries | |
156 | |
157 Arguments: | |
158 seq_file = a fasta file of sequences input to IgBlast | |
159 | |
160 Returns: | |
161 a dictionary of {ID:Seq} | |
162 """ | |
163 | |
164 seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) | |
165 | |
166 # Create a seq_dict ID translation using IDs truncated at the first space or 50 chars |
167 seqs = {} | |
168 for seq in seq_dict.values(): | |
169 seqs.update({seq.description:str(seq.seq)}) | |
170 | |
171 return seqs | |
172 | |
173 | |
174 def findLine(handle, query): | |
175 """ | |
176 Finds line with query string in file | |
177 | |
178 Arguments: | |
179 handle = file handle in which to search for line | |
180 query = query string for which to search in file | |
181 | |
182 Returns: | |
183 line from handle in which query string was found | |
184 """ | |
185 for line in handle: | |
186 if(re.match(query, line)): | |
187 return line | |
188 | |
189 | |
190 def extractIMGT(imgt_output): | |
191 """ | |
192 Extract necessary files from IMGT results, zipped or unzipped | |
193 | 75 |
194 Arguments: | 76 Arguments: |
195 imgt_output = zipped file or unzipped folder output by IMGT | 77 db : an iterator of IgRecord objects containing alignment data. |
196 | 78 fields : a list of ordered field names to write. |
79 file_prefix : directory and prefix for CLIP tab-delim file. | |
80 total_count : number of records (for progress bar). | |
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. | |
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
83 partial : if True put incomplete alignments in the pass file. | |
84 out_args : common output argument dictionary from parseCommonArgs. | |
85 | |
197 Returns: | 86 Returns: |
198 sorted list of filenames from which information will be read | 87 None |
199 """ | 88 """ |
200 #file_ext = os.path.splitext(imgt_output)[1].lower() | 89 # Function to check for valid records strictly |
201 imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') | 90 def _pass_strict(rec): |
202 temp_dir = mkdtemp() | 91 valid = [rec.v_call and rec.v_call != 'None', |
203 if zipfile.is_zipfile(imgt_output): | 92 rec.j_call and rec.j_call != 'None', |
204 # Open zip file | 93 rec.functional is not None, |
205 imgt_zip = zipfile.ZipFile(imgt_output, 'r') | 94 rec.seq_vdj, |
206 # Extract required files | 95 rec.junction] |
207 imgt_files = sorted([n for n in imgt_zip.namelist() \ | 96 return all(valid) |
208 if os.path.basename(n).startswith(imgt_flags)]) | 97 |
209 imgt_zip.extractall(temp_dir, imgt_files) | 98 # Function to check for valid records loosely |
210 # Define file list | 99 def _pass_gentle(rec): |
211 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | 100 valid = [rec.v_call and rec.v_call != 'None', |
212 elif os.path.isdir(imgt_output): | 101 rec.d_call and rec.d_call != 'None', |
213 # Find required files in folder | 102 rec.j_call and rec.j_call != 'None'] |
214 folder_files = [] | 103 return any(valid) |
215 for root, dirs, files in os.walk(imgt_output): | 104 |
216 folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) | 105 # Set pass criteria |
217 # Define file list | 106 _pass = _pass_gentle if partial else _pass_strict |
218 imgt_files = sorted([n for n in folder_files \ | 107 |
219 if os.path.basename(n).startswith(imgt_flags)]) | 108 # Define output file names |
220 elif tarfile.is_tarfile(imgt_output): | 109 pass_file = '%s_db-pass.tab' % file_prefix |
221 # Open zip file | 110 fail_file = '%s_db-fail.tab' % file_prefix |
222 imgt_tar = tarfile.open(imgt_output, 'r') | 111 |
223 # Extract required files | 112 # Initiate handles, writers and counters |
224 imgt_files = sorted([n for n in imgt_tar.getnames() \ | |
225 if os.path.basename(n).startswith(imgt_flags)]) | |
226 imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) | |
227 # Define file list | |
228 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | |
229 else: | |
230 sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') |
231 | |
232 if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files | |
233 sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) | |
234 elif len(imgt_files) < len(imgt_flags): | |
235 sys.exit('ERROR: Missing necessary file in IMGT output %s.' % imgt_output) |
236 | |
237 return temp_dir, imgt_files | |
238 | |
239 | |
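For readers unfamiliar with the archive handling in extractIMGT() above, a minimal sketch of the member filtering, assuming a hypothetical archive named imgt_results.zip:

```python
import os
import zipfile

# Keep only the IMGT result files whose basenames start with the required prefixes.
imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
with zipfile.ZipFile('imgt_results.zip', 'r') as imgt_zip:  # hypothetical archive
    wanted = sorted(n for n in imgt_zip.namelist()
                    if os.path.basename(n).startswith(imgt_flags))
    print(wanted)
```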
240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks | |
241 def readOneIgBlastResult(block): | |
242 """ | |
243 Parse a single IgBLAST query result | |
244 | |
245 Arguments: | |
246 block = itertools groupby object of single result | |
247 | |
248 Returns: | |
249 None if no results, otherwise list of DataFrames for each result block | |
250 """ | |
251 results = list() | |
252 i = 0 | |
253 for match, subblock in groupby(block, lambda l: l=='\n'): | |
254 if not match: | |
255 # Strip whitespace and comments | |
256 sub = [s.strip() for s in subblock if not s.startswith('#')] | |
257 | |
258 # Continue on empty block | |
259 if not sub: continue | |
260 else: i += 1 | |
261 | |
262 # Split by tabs | |
263 sub = [s.split('\t') for s in sub] | |
264 | |
265 # Append list for "V-(D)-J rearrangement summary" (i == 1) | |
266 # And "V-(D)-J junction details" (i == 2) | |
267 # Otherwise append DataFrame of subblock | |
268 if i == 1 or i == 2: | |
269 results.append(sub[0]) | |
270 else: | |
271 df = pd.DataFrame(sub) | |
272 if not df.empty: results.append(df) | |
273 | |
274 return results if results else None | |
275 | |
276 | |
277 # TODO: needs more speeds. pandas is probably to blame. | |
278 def readIgBlast(igblast_output, seq_dict, repo_dict, | |
279 score_fields=False, region_fields=False): | |
280 """ | |
281 Reads IgBlast output | |
282 | |
283 Arguments: | |
284 igblast_output = IgBlast output file (format 7) | |
285 seq_dict = a dictionary of {ID:Seq} from input fasta file | |
286 repo_dict = dictionary of IMGT gapped germline sequences | |
287 score_fields = if True parse alignment scores | |
288 region_fields = if True add FWR and CDR region fields | |
289 | |
290 Returns: | |
291 a generator of dictionaries containing alignment data | |
292 """ | |
293 | |
294 # Open IgBlast output file | |
295 with open(igblast_output) as f: | |
296 # Iterate over individual results (separated by # IGBLASTN) | |
297 for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): | |
298 block = list(block) | |
299 if not k1: | |
300 # TODO: move query name extraction into block parser readOneIgBlastResult(). | |
301 # Extract sequence ID | |
302 query_name = ' '.join(block[0].strip().split(' ')[2:]) | |
303 # Initialize db_gen to have ID and input sequence | |
304 db_gen = {'SEQUENCE_ID': query_name, | |
305 'SEQUENCE_INPUT': seq_dict[query_name]} | |
306 | |
307 # Parse further sub-blocks | |
308 block_list = readOneIgBlastResult(block) | |
309 | |
310 # TODO: this is indented pretty far. should be a separate function. or several functions. | |
311 # If results exist, parse further to obtain full db_gen | |
312 if block_list is not None: | |
313 # Parse quality information | |
314 db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' | |
315 db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' | |
316 db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' | |
317 if block_list[0][-1] == '-': | |
318 db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], | |
319 IUPAC.ambiguous_dna).reverse_complement()) | |
320 | |
321 # Parse V, D, and J calls | |
322 call_str = ' '.join(block_list[0]) | |
323 v_call = parseAllele(call_str, v_allele_regex, action='list') | |
324 d_call = parseAllele(call_str, d_allele_regex, action='list') | |
325 j_call = parseAllele(call_str, j_allele_regex, action='list') | |
326 db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' | |
327 db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' | |
328 db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' | |
329 | |
330 # Parse junction sequence | |
331 # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) | |
332 # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) | |
333 | |
334 # TODO: IgBLAST sometimes fails to output block #3. why? |
335 # TODO: maybe we should fail these. they look craptastic. | |
336 #pd.set_option('display.width', 500) | |
337 #print query_name, len(block_list), hit_idx | |
338 #for i, x in enumerate(block_list): | |
339 # print '[%i]' % i | |
340 # print x | |
341 | |
342 # Parse segment start and stop positions | |
343 hit_df = block_list[-1] | |
344 | |
345 # Alignment info block | |
346 # 0: segment | |
347 # 1: query id | |
348 # 2: subject id | |
349 # 3: % identity | |
350 # 4: alignment length | |
351 # 5: mismatches | |
352 # 6: gap opens | |
353 # 7: gaps | |
354 # 8: q. start | |
355 # 9: q. end | |
356 # 10: s. start | |
357 # 11: s. end | |
358 # 12: evalue | |
359 # 13: bit score | |
360 # 14: query seq | |
361 # 15: subject seq | |
362 # 16: btop | |
363 | |
364 # If V call exists, parse V alignment information | |
365 seq_vdj = '' | |
366 if v_call is not None: | |
367 v_align = hit_df[hit_df[0] == 'V'].iloc[0] | |
368 # Germline positions | |
369 db_gen['V_GERM_START_VDJ'] = int(v_align[10]) | |
370 db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 | |
371 # Query sequence positions | |
372 db_gen['V_SEQ_START'] = int(v_align[8]) | |
373 db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 | |
374 | |
375 if int(v_align[6]) == 0: | |
376 db_gen['INDELS'] = 'F' | |
377 else: | |
378 db_gen['INDELS'] = 'T' | |
379 # Set functional to none so record gets tossed (junction will be wrong) | |
380 # db_gen['FUNCTIONAL'] = None | |
381 | |
382 # V alignment scores | |
383 if score_fields: | |
384 try: db_gen['V_SCORE'] = float(v_align[13]) | |
385 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | |
386 | |
387 try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 | |
388 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | |
389 | |
390 try: db_gen['V_EVALUE'] = float(v_align[12]) | |
391 except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' | |
392 | |
393 try: db_gen['V_BTOP'] = v_align[16] | |
394 except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' | |
395 | |
396 # Update VDJ sequence, removing insertions | |
397 start = 0 | |
398 for m in re.finditer(r'-', v_align[15]): | |
399 ins = m.start() | |
400 seq_vdj += v_align[14][start:ins] | |
401 start = ins + 1 | |
402 seq_vdj += v_align[14][start:] | |
403 | |
404 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. | |
405 # If D call exists, parse D alignment information | |
406 if d_call is not None: | |
407 d_align = hit_df[hit_df[0] == 'D'].iloc[0] | |
408 | |
409 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. | |
410 # Determine N-region length and amount of J overlap with V or D alignment | |
411 overlap = 0 | |
412 if v_call is not None: | |
413 n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | |
414 if n1_len < 0: | |
415 db_gen['N1_LENGTH'] = 0 | |
416 overlap = abs(n1_len) | |
417 else: | |
418 db_gen['N1_LENGTH'] = n1_len | |
419 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) | |
420 n1_end = int(d_align[8])-1 | |
421 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | |
422 | |
423 # Query sequence positions | |
424 db_gen['D_SEQ_START'] = int(d_align[8]) + overlap | |
425 db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) | |
426 | |
427 # Germline positions | |
428 db_gen['D_GERM_START'] = int(d_align[10]) + overlap | |
429 db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) | |
430 | |
431 # Update VDJ sequence, removing insertions | |
432 start = overlap | |
433 for m in re.finditer(r'-', d_align[15]): | |
434 ins = m.start() | |
435 seq_vdj += d_align[14][start:ins] | |
436 start = ins + 1 | |
437 seq_vdj += d_align[14][start:] | |
438 | |
439 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. | |
440 # If J call exists, parse J alignment information | |
441 if j_call is not None: | |
442 j_align = hit_df[hit_df[0] == 'J'].iloc[0] | |
443 | |
444 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. | |
445 # Determine N-region length and amount of J overlap with V or D alignment | |
446 overlap = 0 | |
447 if d_call is not None: | |
448 n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) | |
449 if n2_len < 0: | |
450 db_gen['N2_LENGTH'] = 0 | |
451 overlap = abs(n2_len) | |
452 else: | |
453 db_gen['N2_LENGTH'] = n2_len | |
454 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) | |
455 n2_end = int(j_align[8])-1 | |
456 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] | |
457 elif v_call is not None: | |
458 n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | |
459 if n1_len < 0: | |
460 db_gen['N1_LENGTH'] = 0 | |
461 overlap = abs(n1_len) | |
462 else: | |
463 db_gen['N1_LENGTH'] = n1_len | |
464 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) | |
465 n1_end = int(j_align[8])-1 | |
466 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | |
467 else: | |
468 db_gen['N1_LENGTH'] = 0 | |
469 | |
470 # Query positions | |
471 db_gen['J_SEQ_START'] = int(j_align[8]) + overlap | |
472 db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) | |
473 | |
474 # Germline positions | |
475 db_gen['J_GERM_START'] = int(j_align[10]) + overlap | |
476 db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) | |
477 | |
478 # J alignment scores | |
479 if score_fields: | |
480 try: db_gen['J_SCORE'] = float(j_align[13]) | |
481 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | |
482 | |
483 try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 | |
484 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | |
485 | |
486 try: db_gen['J_EVALUE'] = float(j_align[12]) | |
487 except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' | |
488 | |
489 try: db_gen['J_BTOP'] = j_align[16] | |
490 except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' | |
491 | |
492 # Update VDJ sequence, removing insertions | |
493 start = overlap | |
494 for m in re.finditer(r'-', j_align[15]): | |
495 ins = m.start() | |
496 seq_vdj += j_align[14][start:ins] | |
497 start = ins + 1 | |
498 seq_vdj += j_align[14][start:] | |
499 | |
500 db_gen['SEQUENCE_VDJ'] = seq_vdj | |
501 | |
502 # Create IMGT-gapped sequence and infer IMGT junction | |
503 if v_call is not None: | |
504 db_gen = gapV(db_gen, repo_dict) | |
505 if j_call is not None: | |
506 db_gen = getIMGTJunc(db_gen, repo_dict) | |
507 | |
508 # FWR and CDR regions | |
509 if region_fields: getRegions(db_gen) | |
510 | |
511 yield IgRecord(db_gen) | |
512 | |
513 | |
514 # TODO: should be more readable | |
515 def readIMGT(imgt_files, score_fields=False, region_fields=False): | |
516 """ | |
517 Reads IMGT/HighV-Quest output | |
518 | |
519 Arguments: | |
520 imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 | |
521 score_fields = if True parse alignment scores | |
522 region_fields = if True add FWR and CDR region fields | |
523 | |
524 Returns: | |
525 a generator of dictionaries containing alignment data | |
526 """ | |
527 imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] | |
528 # Create a dictionary for each sequence alignment and yield its generator | |
529 for sm, gp, nt, jn in zip(*imgt_iters): | |
530 if len(set([sm['Sequence ID'], | |
531 gp['Sequence ID'], | |
532 nt['Sequence ID'], | |
533 jn['Sequence ID']])) != 1: | |
534 sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ | |
535 % sm['Sequence ID']) | |
536 | |
537 db_gen = {'SEQUENCE_ID': sm['Sequence ID'], | |
538 'SEQUENCE_INPUT': sm['Sequence']} | |
539 | |
540 if 'No results' not in sm['Functionality']: | |
541 db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + | |
542 ('unprod' in sm['Functionality'])] | |
543 db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + | |
544 ('out-of-frame' in sm['JUNCTION frame'])] |
545 db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + | |
546 ('unprod' in sm['Functionality'])] | |
547 db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], | |
548 'missing' in sm['V-REGION potential ins/del']))) + | |
549 ('unprod' in sm['Functionality'])] | |
550 db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], | |
551 sm['V-REGION insertions'], | |
552 sm['V-REGION deletions']))] | |
553 | |
554 db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] | |
555 db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] | |
556 | |
557 db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) | |
558 db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) | |
559 db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) | |
560 | |
561 v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 | |
562 db_gen['V_SEQ_START'] = nt['V-REGION start'] | |
563 db_gen['V_SEQ_LENGTH'] = v_seq_length | |
564 db_gen['V_GERM_START_IMGT'] = 1 | |
565 db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 | |
566 | |
567 db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], | |
568 jn['N-REGION-nt nb'], | |
569 jn['N1-REGION-nt nb'], | |
570 jn["P5'D-nt nb"]] if i) | |
571 db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, | |
572 jn["P3'V-nt nb"], | |
573 jn['N-REGION-nt nb'], | |
574 jn['N1-REGION-nt nb'], | |
575 jn["P5'D-nt nb"]] if i) | |
576 db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | |
577 db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 | |
578 db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | |
579 db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], | |
580 jn['N2-REGION-nt nb'], | |
581 jn["P5'J-nt nb"]] if i) | |
582 | |
583 db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, | |
584 jn["P3'V-nt nb"], | |
585 jn['N-REGION-nt nb'], | |
586 jn['N1-REGION-nt nb'], | |
587 jn["P5'D-nt nb"], | |
588 jn["D-REGION-nt nb"], | |
589 jn["P3'D-nt nb"], | |
590 jn['N2-REGION-nt nb'], | |
591 jn["P5'J-nt nb"]] if i) | |
592 db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 | |
593 db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 | |
594 db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 | |
595 | |
596 db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 | |
597 db_gen['JUNCTION'] = jn['JUNCTION'] | |
598 | |
599 # Alignment scores | |
600 if score_fields: | |
601 try: db_gen['V_SCORE'] = float(sm['V-REGION score']) | |
602 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | |
603 | |
604 try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 | |
605 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | |
606 | |
607 try: db_gen['J_SCORE'] = float(sm['J-REGION score']) | |
608 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | |
609 | |
610 try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 | |
611 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | |
612 | |
613 # FWR and CDR regions | |
614 if region_fields: getRegions(db_gen) | |
615 else: | |
616 db_gen['V_CALL'] = 'None' | |
617 db_gen['D_CALL'] = 'None' | |
618 db_gen['J_CALL'] = 'None' | |
619 | |
620 yield IgRecord(db_gen) | |
621 | |
622 | |
623 def getIDforIMGT(seq_file): | |
624 """ | |
625 Create a sequence ID translation using IMGT truncation | |
626 | |
627 Arguments: | |
628 seq_file = a fasta file of sequences input to IMGT | |
629 | |
630 Returns: | |
631 a dictionary of {truncated ID: full seq description} | |
632 """ | |
633 | |
634 # Create a seq_dict ID translation using IDs truncated at the first space or 50 chars |
635 ids = {} | |
636 for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): | |
637 if len(rec.description) <= 50: | |
638 id_key = rec.description | |
639 else: | |
640 id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) | |
641 ids.update({id_key:rec.description}) | |
642 | |
643 return ids | |
644 | |
645 | |
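A small hedged illustration of the truncation rule that getIDforIMGT() above compensates for (the header string is hypothetical): IMGT keeps only the first 50 characters of a description and problem characters are replaced, so the mapping is used to restore the full header later.

```python
import re

desc = 'M01234:56:000000000-ABCDE:1:1101:12345:6789 1:N:0:1 sample=patient1'  # hypothetical header
id_key = desc if len(desc) <= 50 else re.sub(r'\||\s|!|&|\*|<|>|\?', '_', desc[:50])
print(id_key)          # truncated to 50 chars with spaces replaced by '_'
print({id_key: desc})  # the translation entry stored by getIDforIMGT()
```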
646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, | |
647 score_fields=False, region_fields=False, out_args=default_out_args): | |
648 """ | |
649 Writes tab-delimited database file in output directory | |
650 | |
651 Arguments: | |
652 db_gen = a generator of IgRecord objects containing alignment data | |
653 file_prefix = directory and prefix for CLIP tab-delim file | |
654 total_count = number of records (for progress bar) | |
655 id_dict = a dictionary of {IMGT ID: full seq description} | |
656 no_parse = if ID is to be parsed for pRESTO output with default delimiters | |
657 score_fields = if True add alignment score fields to output file | |
658 region_fields = if True add FWR and CDR region fields to output file | |
659 out_args = common output argument dictionary from parseCommonArgs | |
660 | |
661 Returns: | |
662 None | |
663 """ | |
664 pass_file = "%s_db-pass.tab" % file_prefix | |
665 fail_file = "%s_db-fail.tab" % file_prefix | |
666 ordered_fields = ['SEQUENCE_ID', | |
667 'SEQUENCE_INPUT', | |
668 'FUNCTIONAL', | |
669 'IN_FRAME', | |
670 'STOP', | |
671 'MUTATED_INVARIANT', | |
672 'INDELS', | |
673 'V_CALL', | |
674 'D_CALL', | |
675 'J_CALL', | |
676 'SEQUENCE_VDJ', | |
677 'SEQUENCE_IMGT', | |
678 'V_SEQ_START', | |
679 'V_SEQ_LENGTH', | |
680 'V_GERM_START_VDJ', | |
681 'V_GERM_LENGTH_VDJ', | |
682 'V_GERM_START_IMGT', | |
683 'V_GERM_LENGTH_IMGT', | |
684 'N1_LENGTH', | |
685 'D_SEQ_START', | |
686 'D_SEQ_LENGTH', | |
687 'D_GERM_START', | |
688 'D_GERM_LENGTH', | |
689 'N2_LENGTH', | |
690 'J_SEQ_START', | |
691 'J_SEQ_LENGTH', | |
692 'J_GERM_START', | |
693 'J_GERM_LENGTH', | |
694 'JUNCTION_LENGTH', | |
695 'JUNCTION'] | |
696 | |
697 if score_fields: | |
698 ordered_fields.extend(['V_SCORE', | |
699 'V_IDENTITY', | |
700 'V_EVALUE', | |
701 'V_BTOP', | |
702 'J_SCORE', | |
703 'J_IDENTITY', | |
704 'J_EVALUE', | |
705 'J_BTOP']) | |
706 | |
707 if region_fields: | |
708 ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', | |
709 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) | |
710 | |
711 | |
712 # TODO: This is not the best approach. should pass in output fields. | |
713 # Initiate passed handle | |
714 pass_handle = None | 113 pass_handle = None |
715 | 114 fail_handle = None |
716 # Open failed file | |
717 if out_args['failed']: | |
718 fail_handle = open(fail_file, 'wt') | |
719 fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) | |
720 else: | |
721 fail_handle = None | |
722 fail_writer = None | |
723 | |
724 # Initialize counters and file | |
725 pass_writer = None | 115 pass_writer = None |
116 fail_writer = None | |
726 start_time = time() | 117 start_time = time() |
727 rec_count = pass_count = fail_count = 0 | 118 rec_count = pass_count = fail_count = 0 |
728 for record in db_gen: | 119 |
729 #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | 120 # Validate and write output |
730 printProgress(rec_count, total_count, 0.05, start_time) | 121 printProgress(0, total_count, 0.05, start_time) |
731 rec_count += 1 | 122 for i, record in enumerate(db, start=1): |
732 | 123 |
733 # Count pass or fail | 124 # Replace sequence description with full string, if required |
734 if (record.v_call == 'None' and record.j_call == 'None') or \ | 125 if id_dict is not None and record.id in id_dict: |
735 record.functional is None or \ | |
736 not record.seq_vdj or \ | |
737 not record.junction: | |
738 # print(record.v_call, record.j_call, record.functional, record.junction) | |
739 fail_count += 1 | |
740 if fail_writer is not None: fail_writer.writerow(record.toDict()) | |
741 continue | |
742 else: | |
743 pass_count += 1 | |
744 | |
745 # Build sample sequence description | |
746 if record.id in id_dict: | |
747 record.id = id_dict[record.id] | 126 record.id = id_dict[record.id] |
748 | 127 |
749 # Parse sequence description into new columns | 128 # Parse sequence description into new columns |
750 if not no_parse: | 129 if not no_parse: |
751 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | 130 try: |
752 record.id = record.annotations['ID'] | 131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) |
753 del record.annotations['ID'] | 132 record.id = record.annotations['ID'] |
754 | 133 del record.annotations['ID'] |
755 # TODO: This is not the best approach. should pass in output fields. | 134 |
756 # If first sequence, use parsed description to create new columns and initialize writer | 135 # TODO: This is not the best approach. should pass in output fields. |
757 if pass_writer is None: | 136 # If first record, use parsed description to define extra columns |
758 if not no_parse: ordered_fields.extend(list(record.annotations.keys())) | 137 if i == 1: fields.extend(list(record.annotations.keys())) |
759 pass_handle = open(pass_file, 'wt') | 138 except IndexError: |
760 pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) | 139 # Could not parse pRESTO-style annotations so fall back to no parse |
761 | 140 no_parse = True |
762 # Write row to tab-delim CLIP file | 141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') |
763 pass_writer.writerow(record.toDict()) | 142 |
764 | 143 # Count pass or fail and write to appropriate file |
765 # Print log | 144 if _pass(record): |
766 #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | 145 # Open pass file |
767 printProgress(rec_count, total_count, 0.05, start_time) | 146 if pass_writer is None: |
768 | 147 pass_handle = open(pass_file, 'wt') |
148 pass_writer = getDbWriter(pass_handle, add_fields=fields) | |
149 | |
150 # Write row to pass file | |
151 pass_count += 1 | |
152 pass_writer.writerow(record.toDict()) | |
153 else: | |
154 # Open failed file | |
155 if out_args['failed'] and fail_writer is None: | |
156 fail_handle = open(fail_file, 'wt') | |
157 fail_writer = getDbWriter(fail_handle, add_fields=fields) | |
158 | |
159 # Write row to fail file if specified | |
160 fail_count += 1 | |
161 if fail_writer is not None: | |
162 fail_writer.writerow(record.toDict()) | |
163 | |
164 # Print progress | |
165 printProgress(i, total_count, 0.05, start_time) | |
166 | |
167 # Print console log |
769 log = OrderedDict() | 168 log = OrderedDict() |
770 log['OUTPUT'] = pass_file | 169 log['OUTPUT'] = pass_file |
771 log['PASS'] = pass_count | 170 log['PASS'] = pass_count |
772 log['FAIL'] = fail_count | 171 log['FAIL'] = fail_count |
773 log['END'] = 'MakeDb' | 172 log['END'] = 'MakeDb' |
775 | 174 |
776 if pass_handle is not None: pass_handle.close() | 175 if pass_handle is not None: pass_handle.close() |
777 if fail_handle is not None: fail_handle.close() | 176 if fail_handle is not None: fail_handle.close() |
778 | 177 |
779 | 178 |
780 # TODO: may be able to merge with parseIMGT | 179 # TODO: may be able to merge with other mains |
781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, | 180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, |
782 region_fields=False, out_args=default_out_args): | 181 parse_scores=False, parse_regions=False, parse_junction=False, |
783 """ | 182 out_args=default_out_args): |
784 Main for IgBlast aligned sample sequences | 183 """ |
184 Main for IMGT aligned sample sequences. | |
785 | 185 |
786 Arguments: | 186 Arguments: |
787 igblast_output = IgBlast output file to process | 187 aligner_output : zipped file or unzipped folder output by IMGT. |
788 seq_file = fasta file input to IgBlast (from which to get sequence) | 188 seq_file : FASTA file input to IMGT (from which to get seqID). |
789 repo = folder with germline repertoire files | 189 no_parse : if ID is to be parsed for pRESTO output with default delimiters. |
790 no_parse = if ID is to be parsed for pRESTO output with default delimiters | 190 partial : If True put incomplete alignments in the pass file. |
791 score_fields = if True add alignment score fields to output file | 191 parse_scores : if True add alignment score fields to output file. |
792 region_fields = if True add FWR and CDR region fields to output file | 192 parse_regions : if True add FWR and CDR region fields to output file. |
793 out_args = common output argument dictionary from parseCommonArgs | 193 out_args : common output argument dictionary from parseCommonArgs. |
794 | 194 |
795 Returns: | 195 Returns: |
796 None | 196 None |
197 """ | |
198 # Print parameter info | |
199 log = OrderedDict() | |
200 log['START'] = 'MakeDb' | |
201 log['ALIGNER'] = 'IMGT' | |
202 log['ALIGNER_OUTPUT'] = aligner_output | |
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | |
204 log['NO_PARSE'] = no_parse | |
205 log['PARTIAL'] = partial | |
206 log['SCORES'] = parse_scores | |
207 log['REGIONS'] = parse_regions | |
208 log['JUNCTION'] = parse_junction | |
209 printLog(log) | |
210 | |
211 start_time = time() | |
212 printMessage('Loading sequence files', start_time=start_time, width=25) | |
213 # Extract IMGT files | |
214 temp_dir, imgt_files = extractIMGT(aligner_output) | |
215 # Count records in IMGT files | |
216 total_count = countDbFile(imgt_files['summary']) | |
217 # Get (parsed) IDs from fasta file submitted to IMGT | |
218 id_dict = getIDforIMGT(seq_file) if seq_file else {} | |
219 printMessage('Done', start_time=start_time, end=True, width=25) | |
220 | |
221 # Parse IMGT output and write db | |
222 with open(imgt_files['summary'], 'r') as summary_handle, \ | |
223 open(imgt_files['gapped'], 'r') as gapped_handle, \ | |
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \ | |
225 open(imgt_files['junction'], 'r') as junction_handle: | |
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, | |
227 parse_scores=parse_scores, parse_regions=parse_regions, | |
228 parse_junction=parse_junction) | |
229 file_prefix = getFilePrefix(aligner_output, out_args) | |
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, | |
231 no_parse=no_parse, partial=partial, out_args=out_args) | |
232 | |
233 # Cleanup temp directory | |
234 temp_dir.cleanup() | |
235 | |
236 return None | |
237 | |
238 | |
239 # TODO: may be able to merge with other mains | |
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, | |
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, | |
242 out_args=default_out_args): | |
243 """ | |
244 Main for IgBLAST aligned sample sequences. | |
245 | |
246 Arguments: | |
247 aligner_output : IgBLAST output file to process. | |
248 seq_file : fasta file input to IgBlast (from which to get sequence). | |
249 repo : folder with germline repertoire files. | |
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters. | |
251 partial : If True put incomplete alignments in the pass file. | |
252 parse_regions : if True add FWR and CDR fields to output file. | |
253 parse_scores : if True add alignment score fields to output file. | |
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST | |
255 out_args : common output argument dictionary from parseCommonArgs. | |
256 | |
257 Returns: | |
258 None | |
797 """ | 259 """ |
798 # Print parameter info | 260 # Print parameter info |
799 log = OrderedDict() | 261 log = OrderedDict() |
800 log['START'] = 'MakeDB' | 262 log['START'] = 'MakeDB' |
801 log['ALIGNER'] = 'IgBlast' | 263 log['ALIGNER'] = 'IgBlast' |
802 log['ALIGN_RESULTS'] = os.path.basename(igblast_output) | 264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) |
803 log['SEQ_FILE'] = os.path.basename(seq_file) | 265 log['SEQ_FILE'] = os.path.basename(seq_file) |
804 log['NO_PARSE'] = no_parse | 266 log['NO_PARSE'] = no_parse |
805 log['SCORE_FIELDS'] = score_fields | 267 log['PARTIAL'] = partial |
806 log['REGION_FIELDS'] = region_fields | 268 log['SCORES'] = parse_scores |
269 log['REGIONS'] = parse_regions | |
807 printLog(log) | 270 printLog(log) |
808 | 271 |
272 start_time = time() | |
273 printMessage('Loading sequence files', start_time=start_time, width=25) | |
274 # Count records in sequence file | |
275 total_count = countSeqFile(seq_file) | |
809 # Get input sequence dictionary | 276 # Get input sequence dictionary |
810 seq_dict = getSeqforIgBlast(seq_file) | 277 seq_dict = getSeqDict(seq_file) |
811 | 278 # Create germline repo dictionary |
812 # Formalize out_dir and file-prefix | 279 repo_dict = readRepo(repo) |
813 if not out_args['out_dir']: | 280 printMessage('Done', start_time=start_time, end=True, width=25) |
814 out_dir = os.path.split(igblast_output)[0] | 281 |
815 else: | 282 # Parse and write output |
816 out_dir = os.path.abspath(out_args['out_dir']) | 283 with open(aligner_output, 'r') as f: |
817 if not os.path.exists(out_dir): os.mkdir(out_dir) | 284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict, |
818 if out_args['out_name']: | 285 parse_scores=parse_scores, parse_regions=parse_regions, |
819 file_prefix = out_args['out_name'] | 286 parse_igblast_cdr3=parse_igblast_cdr3) |
820 else: | 287 file_prefix = getFilePrefix(aligner_output, out_args) |
821 file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) | 288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, |
822 file_prefix = os.path.join(out_dir, file_prefix) | 289 no_parse=no_parse, partial=partial, out_args=out_args) |
823 | 290 |
824 total_count = countSeqFile(seq_file) | 291 return None |
825 | 292 |
826 # Create germline repo dictionary and parse IgBLAST output | 293 |
827 repo_dict = getRepo(repo) | 294 # TODO: may be able to merge with other mains |
828 igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, | 295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, |
829 score_fields=score_fields, region_fields=region_fields) | 296 parse_scores=False, parse_regions=False, out_args=default_out_args): |
830 writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, | 297 """ |
831 score_fields=score_fields, region_fields=region_fields, out_args=out_args) | 298 Main for iHMMuneAlign aligned sample sequences. |
832 | |
833 | |
834 # TODO: may be able to merge with parseIgBlast | |
835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, | |
836 region_fields=False, out_args=default_out_args): | |
837 """ | |
838 Main for IMGT aligned sample sequences | |
839 | 299 |
840 Arguments: | 300 Arguments: |
841 imgt_output = zipped file or unzipped folder output by IMGT | 301 aligner_output : iHMMune-Align output file to process. |
842 seq_file = FASTA file input to IMGT (from which to get seqID) | 302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence). |
843 no_parse = if ID is to be parsed for pRESTO output with default delimiters | 303 repo : folder with germline repertoire files. |
844 score_fields = if True add alignment score fields to output file | 304 no_parse : if ID is to be parsed for pRESTO output with default delimiters. |
845 region_fields = if True add FWR and CDR region fields to output file | 305 partial : If True put incomplete alignments in the pass file. |
846 out_args = common output argument dictionary from parseCommonArgs | 306 parse_scores : if True parse alignment scores. |
847 | 307 parse_regions : if True add FWR and CDR region fields. |
848 Returns: | 308 out_args : common output argument dictionary from parseCommonArgs. |
849 None | 309 |
310 Returns: | |
311 None | |
850 """ | 312 """ |
851 # Print parameter info | 313 # Print parameter info |
852 log = OrderedDict() | 314 log = OrderedDict() |
853 log['START'] = 'MakeDb' | 315 log['START'] = 'MakeDB' |
854 log['ALIGNER'] = 'IMGT' | 316 log['ALIGNER'] = 'iHMMune-Align' |
855 log['ALIGN_RESULTS'] = imgt_output | 317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) |
856 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | 318 log['SEQ_FILE'] = os.path.basename(seq_file) |
857 log['NO_PARSE'] = no_parse | 319 log['NO_PARSE'] = no_parse |
858 log['SCORE_FIELDS'] = score_fields | 320 log['PARTIAL'] = partial |
859 log['REGION_FIELDS'] = region_fields | 321 log['SCORES'] = parse_scores |
322 log['REGIONS'] = parse_regions | |
860 printLog(log) | 323 printLog(log) |
861 | 324 |
862 # Get individual IMGT result files | 325 start_time = time() |
863 temp_dir, imgt_files = extractIMGT(imgt_output) | 326 printMessage('Loading sequence files', start_time=start_time, width=25) |
864 | 327 # Count records in sequence file |
865 # Formalize out_dir and file-prefix | 328 total_count = countSeqFile(seq_file) |
866 if not out_args['out_dir']: | 329 # Get input sequence dictionary |
867 out_dir = os.path.dirname(os.path.abspath(imgt_output)) | 330 seq_dict = getSeqDict(seq_file) |
868 else: | 331 # Create germline repo dictionary |
869 out_dir = os.path.abspath(out_args['out_dir']) | 332 repo_dict = readRepo(repo) |
870 if not os.path.exists(out_dir): os.mkdir(out_dir) | 333 printMessage('Done', start_time=start_time, end=True, width=25) |
871 if out_args['out_name']: | 334 |
872 file_prefix = out_args['out_name'] | 335 # Parse and write output |
873 else: | 336 with open(aligner_output, 'r') as f: |
874 file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] | 337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict, |
875 file_prefix = os.path.join(out_dir, file_prefix) | 338 parse_scores=parse_scores, parse_regions=parse_regions) |
876 | 339 file_prefix = getFilePrefix(aligner_output, out_args) |
877 total_count = countDbFile(imgt_files[0]) | 340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, |
878 | 341 no_parse=no_parse, partial=partial, out_args=out_args) |
879 # Get (parsed) IDs from fasta file submitted to IMGT | 342 |
880 id_dict = getIDforIMGT(seq_file) if seq_file else {} | 343 return None |
881 | |
882 # Create IMGT record iterator |
883 imgt_dict = readIMGT(imgt_files, score_fields=score_fields, | |
884 region_fields=region_fields) | |
885 writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, | |
886 score_fields=score_fields, region_fields=region_fields, out_args=out_args) | |
887 | |
888 # Delete temp directory | |
889 rmtree(temp_dir) | |
890 | 344 |
891 | 345 |
892 def getArgParser(): | 346 def getArgParser(): |
893 """ | 347 """ |
894 Defines the ArgumentParser | 348 Defines the ArgumentParser. |
895 | 349 |
896 Arguments: | |
897 None | |
898 | |
899 Returns: | 350 Returns: |
900 an ArgumentParser object | 351 argparse.ArgumentParser |
901 """ | 352 """ |
902 fields = dedent( | 353 fields = dedent( |
903 ''' | 354 ''' |
904 output files: | 355 output files: |
905 db-pass | 356 db-pass |
906 database of parsed alignment records. | 357 database of alignment records with functionality information, |
358 V and J calls, and a junction region. | |
907 db-fail | 359 db-fail |
908 database with records failing alignment. | 360 database with records that fail due to no functionality information |
909 | 361 (did not pass IMGT), no V call, no J call, or no junction region. |
910 output fields: | 362 |
911 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, | 363 universal output fields: |
912 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, | 364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, |
913 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, | 365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, |
914 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, | 366 V_CALL, D_CALL, J_CALL, |
915 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, | 367 V_SEQ_START, V_SEQ_LENGTH, |
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, | |
916 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | 369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, |
917 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, | 370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, |
918 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, | 371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, |
919 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | 372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT |
373 | |
374 imgt specific output fields: | |
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, | |
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, | |
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, | |
378 | |
379 igblast specific output fields: | |
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, | |
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP, |
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA | |
384 | |
385 ihmm specific output fields: | |
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, | |
387 HMM_SCORE | |
920 ''') | 388 ''') |
921 | 389 |
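Since the epilog above describes the output database, a minimal sketch of consuming it, assuming a file named sample_db-pass.tab produced by this script: the database is a plain tab-delimited table whose columns are the fields listed above.

```python
import csv

with open('sample_db-pass.tab') as handle:  # hypothetical output file
    reader = csv.DictReader(handle, delimiter='\t')
    for row in reader:
        print(row['SEQUENCE_ID'], row['V_CALL'], row['JUNCTION_LENGTH'])
```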
922 # Define ArgumentParser | 390 # Define ArgumentParser |
923 parser = ArgumentParser(description=__doc__, epilog=fields, | 391 parser = ArgumentParser(description=__doc__, epilog=fields, |
924 formatter_class=CommonHelpFormatter) | 392 formatter_class=CommonHelpFormatter) |
931 | 399 |
932 # Parent parser | 400 # Parent parser |
933 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | 401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) |
934 | 402 |
935 # IgBlast Aligner | 403 # IgBlast Aligner |
936 parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', | 404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], |
937 parents=[parser_parent], | 405 formatter_class=CommonHelpFormatter, |
938 formatter_class=CommonHelpFormatter) | 406 help='Process IgBLAST output.', |
939 parser_igblast.set_defaults(func=parseIgBlast) | 407 description='Process IgBLAST output.') |
940 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', | 408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', |
941 required=True, | 409 required=True, |
942 help='''IgBLAST output files in format 7 with query sequence | 410 help='''IgBLAST output files in format 7 with query sequence |
943 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | 411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') |
944 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | 412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, |
945 help='''List of folders and/or fasta files containing | 413 help='''List of folders and/or fasta files containing |
946 IMGT-gapped germline sequences corresponding to the | 414 IMGT-gapped germline sequences corresponding to the |
947 set of germlines used in the IgBLAST alignment.''') | 415 set of germlines used in the IgBLAST alignment.''') |
948 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | 416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', |
949 required=True, | 417 required=True, |
950 help='List of input FASTA files containing sequences') | 418 help='''List of input FASTA files (with .fasta, .fna or .fa |
419 extension), containing sequences.''') | |
951 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | 420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', |
952 help='''Specify if input IDs should not be parsed to add | 421 help='''Specify to prevent input sequence headers from being parsed |
953 new columns to database.''') | 422 to add new columns to database. Parsing of sequence headers requires |
954 parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', | 423 headers to be in the pRESTO annotation format, so this should be specified |
424 when sequence headers are incompatible with the pRESTO annotation scheme. | |
425 Note, unrecognized header formats will default to this behavior.''') | |
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial', | |
427 help='''If specified, include incomplete V(D)J alignments in | |
428 the pass file instead of the fail file.''') | |
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', | |
955 help='''Specify if alignment score metrics should be | 430 help='''Specify if alignment score metrics should be |
956 included in the output. Adds the V_SCORE, V_IDENTITY, | 431 included in the output. Adds the V_SCORE, V_IDENTITY, |
957 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | 432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, |
958 J_BTOP, and J_EVALUE columns.''') | 433 J_BTOP, and J_EVALUE columns.''') |
959 parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', | 434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', |
960 help='''Specify if IMGT framework and CDR regions should be | 435 help='''Specify if IMGT FWR and CDRs should be |
961 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, |
962 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and |
963 CDR3_IMGT columns.''') | 438 CDR3_IMGT columns.''') |
964 | 439 parser_igblast.add_argument('--cdr3', action='store_true', |
440 dest='parse_igblast_cdr3', | |
441 help='''Specify if the CDR3 sequences generated by IgBLAST | |
442 should be included in the output. Adds the columns | |
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST | |
444 version 1.5 or greater.''') | |
445 parser_igblast.set_defaults(func=parseIgBLAST) | |
446 | |
965 # IMGT aligner | 447 # IMGT aligner |
966 parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', | 448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], |
967 parents=[parser_parent], | 449 formatter_class=CommonHelpFormatter, |
968 formatter_class=CommonHelpFormatter) | 450 help='''Process IMGT/HighV-Quest output |
969 imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True) | 451 (does not work with V-QUEST).''', |
970 imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', | 452 description='''Process IMGT/HighV-Quest output |
971 help='''Either zipped IMGT output files (.zip) or a folder | 453 (does not work with V-QUEST).''') |
972 containing unzipped IMGT output files (which must | 454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', |
973 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | 455 help='''Either zipped IMGT output files (.zip or .txz) or a |
974 and 6_Junction).''') | 456 folder containing unzipped IMGT output files (which must |
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | |
458 and 6_Junction).''') | |
975 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | 459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', |
976 required=False, | 460 required=False, |
977 help='List of input FASTA files containing sequences') | 461 help='''List of input FASTA files (with .fasta, .fna or .fa |
462 extension) containing sequences.''') | |
978 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | 463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', |
979 help='''Specify if input IDs should not be parsed to add new | 464 help='''Specify to prevent input sequence headers from being parsed |
980 columns to database.''') | 465 to add new columns to database. Parsing of sequence headers requires |
981 parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', | 466 headers to be in the pRESTO annotation format, so this should be specified |
467 when sequence headers are incompatible with the pRESTO annotation scheme. | |
468 Note, unrecognized header formats will default to this behavior.''') | |
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial', | |
470 help='''If specified, include incomplete V(D)J alignments in | |
471 the pass file instead of the fail file.''') | |
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', | |
982 help='''Specify if alignment score metrics should be | 473 help='''Specify if alignment score metrics should be |
983 included in the output. Adds the V_SCORE, V_IDENTITY, | 474 included in the output. Adds the V_SCORE, V_IDENTITY, |
984 J_SCORE and J_IDENTITY. Note, this will also add | 475 J_SCORE and J_IDENTITY.''') |
985 the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP, | 476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', |
986 but they will be empty for IMGT output.''') | 477 help='''Specify if IMGT FWRs and CDRs should be |
987 parser_imgt.add_argument('--regions', action='store_true', dest='region_fields', | |
988 help='''Specify if IMGT framework and CDR regions should be | |
989 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, |
990 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and |
991 CDR3_IMGT columns.''') | 480 CDR3_IMGT columns.''') |
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', | |
482 help='''Specify if detailed junction fields should be | |
483 included in the output. Adds the columns | |
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, | |
485 P5J_LENGTH, D_FRAME.''') | |
992 parser_imgt.set_defaults(func=parseIMGT) | 486 parser_imgt.set_defaults(func=parseIMGT) |
487 | |
488 # iHMMuneAlign Aligner | |
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], | |
490 formatter_class=CommonHelpFormatter, | |
491 help='Process iHMMune-Align output.', | |
492 description='Process iHMMune-Align output.') | |
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', | |
494 required=True, | |
495 help='''iHMMune-Align output file.''') | |
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
497 help='''List of folders and/or FASTA files containing | |
498 IMGT-gapped germline sequences corresponding to the | |
499 set of germlines used in the IgBLAST alignment.''') | |
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
501 required=True, | |
502 help='''List of input FASTA files (with .fasta, .fna or .fa | |
503 extension) containing sequences.''') | |
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', | |
505 help='''Specify to prevent input sequence headers from being parsed | |
506 to add new columns to database. Parsing of sequence headers requires | |
507 headers to be in the pRESTO annotation format, so this should be specified | |
508 when sequence headers are incompatible with the pRESTO annotation scheme. | |
509 Note, unrecognized header formats will default to this behavior.''') | |
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial', | |
511 help='''If specified, include incomplete V(D)J alignments in | |
512 the pass file instead of the fail file.''') | |
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', | |
514 help='''Specify if alignment score metrics should be | |
515 included in the output. Adds the path score of the | |
516 iHMMune-Align hidden Markov model to HMM_SCORE.''') | |
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', | |
518 help='''Specify if IMGT FWRs and CDRs should be | |
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
521 CDR3_IMGT columns.''') | |
522 parser_ihmm.set_defaults(func=parseIHMM) | |
993 | 523 |
994 return parser | 524 return parser |
995 | 525 |
996 | 526 |
997 if __name__ == "__main__": | 527 if __name__ == "__main__": |
998 """ | 528 """ |
999 Parses command line arguments and calls main | 529 Parses command line arguments and calls main |
1000 """ | 530 """ |
1001 parser = getArgParser() | 531 parser = getArgParser() |
1002 args = parser.parse_args() | 532 args = parser.parse_args() |
1003 args_dict = parseCommonArgs(args, in_arg='aligner_files') | 533 args_dict = parseCommonArgs(args, in_arg='aligner_outputs') |
1004 | 534 |
1005 # Set no ID parsing if sequence files are not provided | 535 # Set no ID parsing if sequence files are not provided |
1006 if 'seq_files' in args_dict and not args_dict['seq_files']: | 536 if 'seq_files' in args_dict and not args_dict['seq_files']: |
1007 args_dict['no_parse'] = True | 537 args_dict['no_parse'] = True |
1008 | 538 |
1009 # Delete | 539 # Delete |
1010 if 'seq_files' in args_dict: del args_dict['seq_files'] | 540 if 'seq_files' in args_dict: del args_dict['seq_files'] |
1011 if 'aligner_files' in args_dict: del args_dict['aligner_files'] | 541 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] |
1012 if 'command' in args_dict: del args_dict['command'] | 542 if 'command' in args_dict: del args_dict['command'] |
1013 if 'func' in args_dict: del args_dict['func'] | 543 if 'func' in args_dict: del args_dict['func'] |
1014 | 544 |
1015 if args.command == 'imgt': | 545 if args.command == 'imgt': |
1016 for i in range(len(args.__dict__['aligner_files'])): | 546 for i in range(len(args.__dict__['aligner_outputs'])): |
1017 args_dict['imgt_output'] = args.__dict__['aligner_files'][i] | 547 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] |
1018 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | 548 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ |
1019 if args.__dict__['seq_files'] else None | 549 if args.__dict__['seq_files'] else None |
1020 args.func(**args_dict) | 550 args.func(**args_dict) |
1021 elif args.command == 'igblast': | 551 elif args.command == 'igblast' or args.command == 'ihmm': |
1022 for i in range(len(args.__dict__['aligner_files'])): | 552 for i in range(len(args.__dict__['aligner_outputs'])): |
1023 args_dict['igblast_output'] = args.__dict__['aligner_files'][i] | 553 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] |
1024 args_dict['seq_file'] = args.__dict__['seq_files'][i] | 554 args_dict['seq_file'] = args.__dict__['seq_files'][i] |
1025 args.func(**args_dict) | 555 args.func(**args_dict) |
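Finally, a hypothetical usage sketch of the dispatch above: parsing an imgt subcommand exactly as the __main__ block would before it strips the helper keys and calls args.func(**args_dict), which resolves to parseIMGT(). File names are placeholders.

```python
parser = getArgParser()
args = parser.parse_args(['imgt',
                          '-i', 'highvquest_results.zip',  # hypothetical IMGT archive
                          '-s', 'reads.fasta',             # hypothetical input FASTA
                          '--scores', '--regions'])
print(args.command, args.aligner_outputs, args.seq_files)
# -> imgt ['highvquest_results.zip'] ['reads.fasta']
```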