argalaxy_tools: change_o/MakeDb.py @ changeset 4:5ffd52fc35c4 (draft)

author       davidvanzessen
date         Mon, 12 Dec 2016 05:22:37 -0500
parent       3:beaa487ecf43
description  Uploaded
#!/usr/bin/env python3
"""
Create tab-delimited database file to store sequence alignment information
"""
# Info
__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
from changeo import __version__, __date__

# Imports
import csv
import os
import re
import sys
import pandas as pd
import tarfile
import zipfile
from argparse import ArgumentParser
from collections import OrderedDict
from itertools import groupby
from shutil import rmtree
from tempfile import mkdtemp
from textwrap import dedent
from time import time
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Presto and changeo imports
from presto.Defaults import default_out_args
from presto.Annotation import parseAnnotation
from presto.IO import countSeqFile, printLog, printProgress
from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
from changeo.IO import getDbWriter, countDbFile, getRepo
from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
                             j_allele_regex

# Default parameters
default_delimiter = ('\t', ',', '-')


def gapV(ig_dict, repo_dict):
    """
    Insert gaps into V region and update alignment information

    Arguments:
    ig_dict : Dictionary of parsed IgBlast output
    repo_dict : Dictionary of IMGT gapped germline sequences

    Returns:
    dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
    """

    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ']) - 1) + ig_dict['SEQUENCE_VDJ']

    # Find gapped germline V segment
    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
    vkey = (vgene, )
    # TODO: Figure out else case
    if vkey in repo_dict:
        vgap = repo_dict[vkey]
        # Iterate over gaps in the germline segment
        gaps = re.finditer(r'\.', vgap)
        gapcount = int(ig_dict['V_GERM_START_VDJ']) - 1
        for gap in gaps:
            i = gap.start()
            # Break if gap begins after V region
            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
                break
            # Insert gap into IMGT sequence
            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
            # Update gap counter
            gapcount += 1
        ig_dict['SEQUENCE_IMGT'] = seq_imgt
        # Update IMGT positioning information for V
        ig_dict['V_GERM_START_IMGT'] = 1
        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount

    return ig_dict

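# Illustrative sketch (not part of the original tool): the gap-insertion loop in
# gapV() reduced to plain strings. With a query that aligns from ungapped germline
# position 4 and a gapped germline 'CAG...GTGCAG' (gaps at indices 3-5), every
# germline gap inside the aligned V region is copied into the query, shifting it
# into IMGT coordinates:
#
#   seq = '.' * (4 - 1) + 'GTGCAG'              # '...GTGCAG'
#   for m in re.finditer(r'\.', 'CAG...GTGCAG'):
#       seq = seq[:m.start()] + '.' + seq[m.start():]
#   # seq is now '......GTGCAG', i.e. the query starts at IMGT position 7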

def getIMGTJunc(ig_dict, repo_dict):
    """
    Identify junction region by IMGT definition

    Arguments:
    ig_dict : Dictionary of parsed IgBlast output
    repo_dict : Dictionary of IMGT gapped germline sequences

    Returns:
    dict : Updated with JUNCTION_LENGTH and JUNCTION fields
    """
    # Find germline J segment
    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
    jkey = (jgene, )
    # TODO: Figure out else case
    if jkey in repo_dict:
        # Get germline J sequence
        jgerm = repo_dict[jkey]
        jgerm = jgerm[:ig_dict['J_GERM_START'] + ig_dict['J_GERM_LENGTH'] - 1]
        # Look for (F|W)GXG aa motif in nt sequence
        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
        aa_end = len(ig_dict['SEQUENCE_IMGT'])
        # TODO: Figure out else case
        if motif:
            # print('\n', motif.group())
            aa_end = motif.start() - len(jgerm) + 3
        # Add fields to dict
        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])

    return ig_dict

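# Illustrative note (hypothetical numbers): IMGT position 310 (0-based index 309) is
# the first nucleotide of codon 104, the conserved 2nd-CYS, so the junction always
# starts at SEQUENCE_IMGT[309:]. When the (F|W)GXG motif is found,
# aa_end = motif.start() - len(jgerm) + 3 is a negative offset from the end of the
# sequence: if the trimmed germline J is 39 nt and the motif starts at its index 21,
# aa_end = 21 - 39 + 3 = -15, so the junction ends 15 nt before the end of
# SEQUENCE_IMGT, just after the conserved J-TRP/J-PHE codon (position 118).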

def getRegions(ig_dict):
    """
    Identify FWR and CDR regions by IMGT definition

    Arguments:
    ig_dict : Dictionary of parsed alignment output

    Returns:
    dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
           CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
    """
    # IMGT nucleotide boundaries: FWR1 1-78, CDR1 79-114, FWR2 115-165,
    # CDR2 166-195, FWR3 196-312, CDR3 from 313
    try:
        seq_len = len(ig_dict['SEQUENCE_IMGT'])
        ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78, seq_len)]
    except (KeyError, IndexError):
        return ig_dict

    try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
    except IndexError: return ig_dict

    try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
    except IndexError: return ig_dict

    try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
    except IndexError: return ig_dict

    try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
    except IndexError: return ig_dict

    try:
        cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
        ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
        ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
    except (KeyError, IndexError):
        return ig_dict

    return ig_dict


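# Illustrative worked example (hypothetical numbers, not part of the original tool):
# for a 400 nt SEQUENCE_IMGT with JUNCTION_LENGTH = 45, getRegions() slices
#   FWR1_IMGT = seq[0:78]      CDR1_IMGT = seq[78:114]
#   FWR2_IMGT = seq[114:165]   CDR2_IMGT = seq[165:195]
#   FWR3_IMGT = seq[195:312]   CDR3_IMGT = seq[312:306 + 45] = seq[312:351]
#   FWR4_IMGT = seq[351:]
# i.e. CDR3 is the junction minus the two conserved codons 104 and 118 (45 - 6 = 39 nt).
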
def getSeqforIgBlast(seq_file):
    """
    Fetch input sequences for IgBlast queries

    Arguments:
    seq_file = a fasta file of sequences input to IgBlast

    Returns:
    a dictionary of {ID:Seq}
    """

    seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)

    # Build a dictionary of {full sequence description: sequence string}
    seqs = {}
    for seq in seq_dict.values():
        seqs.update({seq.description: str(seq.seq)})

    return seqs


def findLine(handle, query):
    """
    Finds line with query string in file

    Arguments:
    handle = file handle in which to search for line
    query = query string for which to search in file

    Returns:
    line from handle in which query string was found
    """
    for line in handle:
        if re.match(query, line):
            return line


def extractIMGT(imgt_output):
    """
    Extract necessary files from IMGT results, zipped or unzipped

    Arguments:
    imgt_output = zipped file or unzipped folder output by IMGT

    Returns:
    a tuple of (temporary directory, sorted list of filenames from which information will be read)
    """
    #file_ext = os.path.splitext(imgt_output)[1].lower()
    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
    temp_dir = mkdtemp()
    if zipfile.is_zipfile(imgt_output):
        # Open zip file
        imgt_zip = zipfile.ZipFile(imgt_output, 'r')
        # Extract required files
        imgt_files = sorted([n for n in imgt_zip.namelist() \
                             if os.path.basename(n).startswith(imgt_flags)])
        imgt_zip.extractall(temp_dir, imgt_files)
        # Define file list
        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
    elif os.path.isdir(imgt_output):
        # Find required files in folder
        folder_files = []
        for root, dirs, files in os.walk(imgt_output):
            folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
        # Define file list
        imgt_files = sorted([n for n in folder_files \
                             if os.path.basename(n).startswith(imgt_flags)])
    elif tarfile.is_tarfile(imgt_output):
        # Open tar file
        imgt_tar = tarfile.open(imgt_output, 'r')
        # Extract required files
        imgt_files = sorted([n for n in imgt_tar.getnames() \
                             if os.path.basename(n).startswith(imgt_flags)])
        imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
        # Define file list
        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
    else:
        sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), an LZMA compressed tarfile (.txz) or a folder.')

    if len(imgt_files) > len(imgt_flags):  # e.g. multiple 1_Summary files
        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
    elif len(imgt_files) < len(imgt_flags):
        sys.exit('ERROR: Missing necessary file in IMGT output %s.' % imgt_output)

    return temp_dir, imgt_files


# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
def readOneIgBlastResult(block):
    """
    Parse a single IgBLAST query result

    Arguments:
    block = itertools groupby object of single result

    Returns:
    None if no results, otherwise list of DataFrames for each result block
    """
    results = list()
    i = 0
    for match, subblock in groupby(block, lambda l: l == '\n'):
        if not match:
            # Strip whitespace and comments
            sub = [s.strip() for s in subblock if not s.startswith('#')]

            # Continue on empty block
            if not sub: continue
            else: i += 1

            # Split by tabs
            sub = [s.split('\t') for s in sub]

            # Append list for "V-(D)-J rearrangement summary" (i == 1)
            # And "V-(D)-J junction details" (i == 2)
            # Otherwise append DataFrame of subblock
            if i == 1 or i == 2:
                results.append(sub[0])
            else:
                df = pd.DataFrame(sub)
                if not df.empty: results.append(df)

    return results if results else None

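# Illustrative sketch (hypothetical, heavily abbreviated IgBLAST '-outfmt 7' output,
# not part of the original tool): each query block contains comment-delimited
# sections separated by blank lines, which readOneIgBlastResult() splits apart, e.g.
#
#   # IGBLASTN 2.2.28+
#   # Query: seq1
#   # V-(D)-J rearrangement summary for query sequence ...
#   IGHV1-2*02  IGHD3-10*01  IGHJ4*02  VH  No  In-frame  Yes  +
#
#   # Hit table ...
#   V  seq1  IGHV1-2*02   98.9  296  ...
#   D  seq1  IGHD3-10*01  100.0  16  ...
#   J  seq1  IGHJ4*02      97.9  48  ...
#
# (columns are tab-separated in the real file). The first one or two non-comment
# sections become flat lists; later sections become pandas DataFrames.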
276 | |
# TODO: needs more speed. pandas is probably to blame.
def readIgBlast(igblast_output, seq_dict, repo_dict,
                score_fields=False, region_fields=False):
    """
    Reads IgBlast output

    Arguments:
    igblast_output = IgBlast output file (format 7)
    seq_dict = a dictionary of {ID:Seq} from input fasta file
    repo_dict = dictionary of IMGT gapped germline sequences
    score_fields = if True parse alignment scores
    region_fields = if True add FWR and CDR region fields

    Returns:
    a generator of dictionaries containing alignment data
    """

    # Open IgBlast output file
    with open(igblast_output) as f:
        # Iterate over individual results (separated by # IGBLASTN)
        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
            block = list(block)
            if not k1:
                # TODO: move query name extraction into block parser readOneIgBlastResult().
                # Extract sequence ID
                query_name = ' '.join(block[0].strip().split(' ')[2:])
                # Initialize db_gen to have ID and input sequence
                db_gen = {'SEQUENCE_ID': query_name,
                          'SEQUENCE_INPUT': seq_dict[query_name]}

                # Parse further sub-blocks
                block_list = readOneIgBlastResult(block)

                # TODO: this is indented pretty far. should be a separate function. or several functions.
                # If results exist, parse further to obtain full db_gen
                if block_list is not None:
                    # Parse quality information
                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
                    if block_list[0][-1] == '-':
                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
                                                           IUPAC.ambiguous_dna).reverse_complement())

                    # Parse V, D, and J calls
                    call_str = ' '.join(block_list[0])
                    v_call = parseAllele(call_str, v_allele_regex, action='list')
                    d_call = parseAllele(call_str, d_allele_regex, action='list')
                    j_call = parseAllele(call_str, j_allele_regex, action='list')
                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'

                    # Parse junction sequence
                    # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
                    # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])

                    # TODO: IgBLAST sometimes fails to output block #3. why?
                    # TODO: maybe we should fail these records. they look unreliable.
                    #pd.set_option('display.width', 500)
                    #print query_name, len(block_list), hit_idx
                    #for i, x in enumerate(block_list):
                    #    print '[%i]' % i
                    #    print x

                    # Parse segment start and stop positions
                    hit_df = block_list[-1]

                    # Alignment info block
                    # 0: segment
                    # 1: query id
                    # 2: subject id
                    # 3: % identity
                    # 4: alignment length
                    # 5: mismatches
                    # 6: gap opens
                    # 7: gaps
                    # 8: q. start
                    # 9: q. end
                    # 10: s. start
                    # 11: s. end
                    # 12: evalue
                    # 13: bit score
                    # 14: query seq
                    # 15: subject seq
                    # 16: btop

                    # If V call exists, parse V alignment information
                    seq_vdj = ''
                    if v_call is not None:
                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
                        # Germline positions
                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
                        # Query sequence positions
                        db_gen['V_SEQ_START'] = int(v_align[8])
                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1

                        if int(v_align[6]) == 0:
                            db_gen['INDELS'] = 'F'
                        else:
                            db_gen['INDELS'] = 'T'
                            # Set functional to None so record gets tossed (junction will be wrong)
                            # db_gen['FUNCTIONAL'] = None

                        # V alignment scores
                        if score_fields:
                            try: db_gen['V_SCORE'] = float(v_align[13])
                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'

                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'

                            try: db_gen['V_EVALUE'] = float(v_align[12])
                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'

                            try: db_gen['V_BTOP'] = v_align[16]
                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'

                        # Update VDJ sequence, removing insertions
                        start = 0
                        for m in re.finditer(r'-', v_align[15]):
                            ins = m.start()
                            seq_vdj += v_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += v_align[14][start:]

                    # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
                    # If D call exists, parse D alignment information
                    if d_call is not None:
                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]

                        # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
                        # Determine N-region length and amount of D overlap with the V alignment
                        overlap = 0
                        if v_call is not None:
                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
                            if n1_len < 0:
                                db_gen['N1_LENGTH'] = 0
                                overlap = abs(n1_len)
                            else:
                                db_gen['N1_LENGTH'] = n1_len
                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'] - 1)
                                n1_end = int(d_align[8]) - 1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]

                        # Query sequence positions
                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)

                        # Germline positions
                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)

                        # Update VDJ sequence, removing insertions
                        start = overlap
                        for m in re.finditer(r'-', d_align[15]):
                            ins = m.start()
                            seq_vdj += d_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += d_align[14][start:]

                    # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
                    # If J call exists, parse J alignment information
                    if j_call is not None:
                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]

                        # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
                        # Determine N-region length and amount of J overlap with V or D alignment
                        overlap = 0
                        if d_call is not None:
                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
                            if n2_len < 0:
                                db_gen['N2_LENGTH'] = 0
                                overlap = abs(n2_len)
                            else:
                                db_gen['N2_LENGTH'] = n2_len
                                n2_start = (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'] - 1)
                                n2_end = int(j_align[8]) - 1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
                        elif v_call is not None:
                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
                            if n1_len < 0:
                                db_gen['N1_LENGTH'] = 0
                                overlap = abs(n1_len)
                            else:
                                db_gen['N1_LENGTH'] = n1_len
                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'] - 1)
                                n1_end = int(j_align[8]) - 1
                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
                        else:
                            db_gen['N1_LENGTH'] = 0

                        # Query positions
                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)

                        # Germline positions
                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)

                        # J alignment scores
                        if score_fields:
                            try: db_gen['J_SCORE'] = float(j_align[13])
                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'

                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'

                            try: db_gen['J_EVALUE'] = float(j_align[12])
                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'

                            try: db_gen['J_BTOP'] = j_align[16]
                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'

                        # Update VDJ sequence, removing insertions
                        start = overlap
                        for m in re.finditer(r'-', j_align[15]):
                            ins = m.start()
                            seq_vdj += j_align[14][start:ins]
                            start = ins + 1
                        seq_vdj += j_align[14][start:]

                    db_gen['SEQUENCE_VDJ'] = seq_vdj

                    # Create IMGT-gapped sequence and infer IMGT junction
                    if v_call is not None:
                        db_gen = gapV(db_gen, repo_dict)
                    if j_call is not None:
                        db_gen = getIMGTJunc(db_gen, repo_dict)

                    # FWR and CDR regions
                    if region_fields: getRegions(db_gen)

                yield IgRecord(db_gen)

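# Illustrative usage sketch (hypothetical file names, not part of the original tool):
# readIgBlast() is a generator that is normally consumed by writeDb(), but it can be
# driven directly, e.g.
#
#   seqs = getSeqforIgBlast('sample.fasta')
#   repo = getRepo(['germlines/imgt_gapped_human_ig/'])
#   for rec in readIgBlast('sample_igblast.fmt7', seqs, repo, score_fields=True):
#       print(rec.id, rec.v_call, rec.junction)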
513 | |
# TODO: should be more readable
def readIMGT(imgt_files, score_fields=False, region_fields=False):
    """
    Reads IMGT/HighV-Quest output

    Arguments:
    imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
    score_fields = if True parse alignment scores
    region_fields = if True add FWR and CDR region fields

    Returns:
    a generator of dictionaries containing alignment data
    """
    imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
    # Create a dictionary for each sequence alignment and yield its generator
    for sm, gp, nt, jn in zip(*imgt_iters):
        if len(set([sm['Sequence ID'],
                    gp['Sequence ID'],
                    nt['Sequence ID'],
                    jn['Sequence ID']])) != 1:
            sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
                     % sm['Sequence ID'])

        db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
                  'SEQUENCE_INPUT': sm['Sequence']}

        if 'No results' not in sm['Functionality']:
            db_gen['FUNCTIONAL'] = ['?', 'T', 'F'][('productive' in sm['Functionality']) +
                                                   ('unprod' in sm['Functionality'])]
            db_gen['IN_FRAME'] = ['?', 'T', 'F'][('in-frame' in sm['JUNCTION frame']) +
                                                 ('out-of-frame' in sm['JUNCTION frame'])]
            db_gen['STOP'] = ['F', '?', 'T'][('stop codon' in sm['Functionality comment']) +
                                             ('unprod' in sm['Functionality'])]
            db_gen['MUTATED_INVARIANT'] = ['F', '?', 'T'][(any(('missing' in sm['Functionality comment'],
                                                                'missing' in sm['V-REGION potential ins/del']))) +
                                                          ('unprod' in sm['Functionality'])]
            db_gen['INDELS'] = ['F', 'T'][any((sm['V-REGION potential ins/del'],
                                               sm['V-REGION insertions'],
                                               sm['V-REGION deletions']))]

            db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
            db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']

            db_gen['V_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
            db_gen['D_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
            db_gen['J_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))

            v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
            db_gen['V_SEQ_START'] = nt['V-REGION start']
            db_gen['V_SEQ_LENGTH'] = v_seq_length
            db_gen['V_GERM_START_IMGT'] = 1
            db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0

            db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
                                                       jn['N-REGION-nt nb'],
                                                       jn['N1-REGION-nt nb'],
                                                       jn["P5'D-nt nb"]] if i)
            db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
                                                         jn["P3'V-nt nb"],
                                                         jn['N-REGION-nt nb'],
                                                         jn['N1-REGION-nt nb'],
                                                         jn["P5'D-nt nb"]] if i)
            db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
            db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
            db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
            db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
                                                       jn['N2-REGION-nt nb'],
                                                       jn["P5'J-nt nb"]] if i)

            db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
                                                              jn["P3'V-nt nb"],
                                                              jn['N-REGION-nt nb'],
                                                              jn['N1-REGION-nt nb'],
                                                              jn["P5'D-nt nb"],
                                                              jn["D-REGION-nt nb"],
                                                              jn["P3'D-nt nb"],
                                                              jn['N2-REGION-nt nb'],
                                                              jn["P5'J-nt nb"]] if i)
            db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
            db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
            db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0

            db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
            db_gen['JUNCTION'] = jn['JUNCTION']

            # Alignment scores
            if score_fields:
                try: db_gen['V_SCORE'] = float(sm['V-REGION score'])
                except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'

                try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
                except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'

                try: db_gen['J_SCORE'] = float(sm['J-REGION score'])
                except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'

                try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
                except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'

            # FWR and CDR regions
            if region_fields: getRegions(db_gen)
        else:
            db_gen['V_CALL'] = 'None'
            db_gen['D_CALL'] = 'None'
            db_gen['J_CALL'] = 'None'

        yield IgRecord(db_gen)

622 | |
def getIDforIMGT(seq_file):
    """
    Create a sequence ID translation using IMGT truncation

    Arguments:
    seq_file = a fasta file of sequences input to IMGT

    Returns:
    a dictionary of {truncated ID: full seq description}
    """

    # Create an ID translation dictionary; IMGT truncates IDs to 50 characters
    # and replaces special characters with '_'
    ids = {}
    for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
        if len(rec.description) <= 50:
            id_key = rec.description
        else:
            id_key = re.sub(r'\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
        ids.update({id_key: rec.description})

    return ids

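# Illustrative example (hypothetical FASTA header, not part of the original tool):
# a description longer than 50 characters is cut to its first 50 characters and the
# characters | whitespace ! & * < > ? are replaced with '_', matching how
# IMGT/HighV-Quest rewrites query IDs, e.g. for the header
#
#   >seq_001|SAMPLE=day7|ISOTYPE=IgM|COUNT=42 additional free-text comment
#
#   key   -> 'seq_001_SAMPLE=day7_ISOTYPE=IgM_COUNT=42_additiona'
#   value -> 'seq_001|SAMPLE=day7|ISOTYPE=IgM|COUNT=42 additional free-text comment'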
645 | |
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
            score_fields=False, region_fields=False, out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory

    Arguments:
    db_gen = a generator of IgRecord objects containing alignment data
    file_prefix = directory and prefix for CLIP tab-delim file
    total_count = number of records (for progress bar)
    id_dict = a dictionary of {IMGT ID: full seq description}
    no_parse = if ID is to be parsed for pRESTO output with default delimiters
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    pass_file = "%s_db-pass.tab" % file_prefix
    fail_file = "%s_db-fail.tab" % file_prefix
    ordered_fields = ['SEQUENCE_ID',
                      'SEQUENCE_INPUT',
                      'FUNCTIONAL',
                      'IN_FRAME',
                      'STOP',
                      'MUTATED_INVARIANT',
                      'INDELS',
                      'V_CALL',
                      'D_CALL',
                      'J_CALL',
                      'SEQUENCE_VDJ',
                      'SEQUENCE_IMGT',
                      'V_SEQ_START',
                      'V_SEQ_LENGTH',
                      'V_GERM_START_VDJ',
                      'V_GERM_LENGTH_VDJ',
                      'V_GERM_START_IMGT',
                      'V_GERM_LENGTH_IMGT',
                      'N1_LENGTH',
                      'D_SEQ_START',
                      'D_SEQ_LENGTH',
                      'D_GERM_START',
                      'D_GERM_LENGTH',
                      'N2_LENGTH',
                      'J_SEQ_START',
                      'J_SEQ_LENGTH',
                      'J_GERM_START',
                      'J_GERM_LENGTH',
                      'JUNCTION_LENGTH',
                      'JUNCTION']

    if score_fields:
        ordered_fields.extend(['V_SCORE',
                               'V_IDENTITY',
                               'V_EVALUE',
                               'V_BTOP',
                               'J_SCORE',
                               'J_IDENTITY',
                               'J_EVALUE',
                               'J_BTOP'])

    if region_fields:
        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])

    # TODO: This is not the best approach. should pass in output fields.
    # Initiate passed handle
    pass_handle = None

    # Open failed file
    if out_args['failed']:
        fail_handle = open(fail_file, 'wt')
        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
    else:
        fail_handle = None
        fail_writer = None

    # Initialize counters and file
    pass_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for record in db_gen:
        #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # Count pass or fail
        if (record.v_call == 'None' and record.j_call == 'None') or \
                record.functional is None or \
                not record.seq_vdj or \
                not record.junction:
            # print(record.v_call, record.j_call, record.functional, record.junction)
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(record.toDict())
            continue
        else:
            pass_count += 1

        # Build sample sequence description
        if record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
            record.id = record.annotations['ID']
            del record.annotations['ID']

        # TODO: This is not the best approach. should pass in output fields.
        # If first sequence, use parsed description to create new columns and initialize writer
        if pass_writer is None:
            if not no_parse: ordered_fields.extend(list(record.annotations.keys()))
            pass_handle = open(pass_file, 'wt')
            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)

        # Write row to tab-delim CLIP file
        pass_writer.writerow(record.toDict())

    # Print log
    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
    printProgress(rec_count, total_count, 0.05, start_time)

    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)

    if pass_handle is not None: pass_handle.close()
    if fail_handle is not None: fail_handle.close()

779 | |
# TODO: may be able to merge with parseIMGT
def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
                 region_fields=False, out_args=default_out_args):
    """
    Main for IgBlast aligned sample sequences

    Arguments:
    igblast_output = IgBlast output file to process
    seq_file = fasta file input to IgBlast (from which to get sequence)
    repo = folder with germline repertoire files
    no_parse = if ID is to be parsed for pRESTO output with default delimiters
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['ALIGNER'] = 'IgBlast'
    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get input sequence dictionary
    seq_dict = getSeqforIgBlast(seq_file)

    # Formalize out_dir and file-prefix
    if not out_args['out_dir']:
        out_dir = os.path.split(igblast_output)[0]
    else:
        out_dir = os.path.abspath(out_args['out_dir'])
        if not os.path.exists(out_dir): os.mkdir(out_dir)
    if out_args['out_name']:
        file_prefix = out_args['out_name']
    else:
        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
    file_prefix = os.path.join(out_dir, file_prefix)

    total_count = countSeqFile(seq_file)

    # Create germline repertoire dictionary, parse IgBlast output and write the database
    repo_dict = getRepo(repo)
    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
                               score_fields=score_fields, region_fields=region_fields)
    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
            score_fields=score_fields, region_fields=region_fields, out_args=out_args)


# TODO: may be able to merge with parseIgBlast
def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
              region_fields=False, out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences

    Arguments:
    imgt_output = zipped file or unzipped folder output by IMGT
    seq_file = FASTA file input to IMGT (from which to get seqID)
    no_parse = if ID is to be parsed for pRESTO output with default delimiters
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['ALIGNER'] = 'IMGT'
    log['ALIGN_RESULTS'] = imgt_output
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get individual IMGT result files
    temp_dir, imgt_files = extractIMGT(imgt_output)

    # Formalize out_dir and file-prefix
    if not out_args['out_dir']:
        out_dir = os.path.dirname(os.path.abspath(imgt_output))
    else:
        out_dir = os.path.abspath(out_args['out_dir'])
        if not os.path.exists(out_dir): os.mkdir(out_dir)
    if out_args['out_name']:
        file_prefix = out_args['out_name']
    else:
        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
    file_prefix = os.path.join(out_dir, file_prefix)

    total_count = countDbFile(imgt_files[0])

    # Get (parsed) IDs from fasta file submitted to IMGT
    id_dict = getIDforIMGT(seq_file) if seq_file else {}

    # Parse IMGT output and write the database
    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
                         region_fields=region_fields)
    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
            score_fields=score_fields, region_fields=region_fields, out_args=out_args)

    # Delete temp directory
    rmtree(temp_dir)


def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
    None

    Returns:
    an ArgumentParser object
    """
    fields = dedent(
             '''
              output files:
                  db-pass
                      database of parsed alignment records.
                  db-fail
                      database with records failing alignment.

              output fields:
                  SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
                  INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
                  V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
                  V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
                  J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
                  JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
                  J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
                  FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
              ''')

    # Define ArgumentParser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            formatter_class=CommonHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='%(prog)s:' + ' %s-%s' % (__version__, __date__))
    subparsers = parser.add_subparsers(title='subcommands', dest='command',
                                       help='Aligner used', metavar='')
    # TODO: This is a temporary fix for Python issue 9253
    subparsers.required = True

    # Parent parser
    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)

    # IgBlast Aligner
    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
                                           parents=[parser_parent],
                                           formatter_class=CommonHelpFormatter)
    parser_igblast.set_defaults(func=parseIgBlast)
    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
                                required=True,
                                help='''IgBLAST output files in format 7 with query sequence
                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
                                help='''List of folders and/or fasta files containing
                                     IMGT-gapped germline sequences corresponding to the
                                     set of germlines used in the IgBLAST alignment.''')
    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
                                required=True,
                                help='List of input FASTA files containing sequences')
    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
                                help='''Specify if input IDs should not be parsed to add
                                     new columns to database.''')
    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
                                help='''Specify if alignment score metrics should be
                                     included in the output. Adds the V_SCORE, V_IDENTITY,
                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
                                     J_BTOP, and J_EVALUE columns.''')
    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
                                help='''Specify if IMGT framework and CDR regions should be
                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                     CDR3_IMGT columns.''')

    # IMGT aligner
    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
                                        parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter)
    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
                                help='''Either zipped IMGT output files (.zip) or a folder
                                     containing unzipped IMGT output files (which must
                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
                                     and 6_Junction).''')
    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
                             required=False,
                             help='List of input FASTA files containing sequences')
    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
                             help='''Specify if input IDs should not be parsed to add new
                                  columns to database.''')
    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
                             help='''Specify if alignment score metrics should be
                                  included in the output. Adds the V_SCORE, V_IDENTITY,
                                  J_SCORE and J_IDENTITY. Note, this will also add
                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
                                  but they will be empty for IMGT output.''')
    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
                             help='''Specify if IMGT framework and CDR regions should be
                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                  CDR3_IMGT columns.''')
    parser_imgt.set_defaults(func=parseIMGT)

    return parser


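# Example invocations (illustrative; all file and folder names are hypothetical):
#
#   MakeDb.py igblast -i sample_igblast.fmt7 -s sample.fasta \
#       -r germlines/imgt_gapped_human_ig/ --scores --regions
#   MakeDb.py imgt -i sample_imgt.zip -s sample.fasta --scores
#
# Additional output options (output directory, file prefix, failed-record output,
# annotation delimiters) come from the common presto/changeo argument parser.
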
if __name__ == "__main__":
    """
    Parses command line arguments and calls main
    """
    parser = getArgParser()
    args = parser.parse_args()
    args_dict = parseCommonArgs(args, in_arg='aligner_files')

    # Set no ID parsing if sequence files are not provided
    if 'seq_files' in args_dict and not args_dict['seq_files']:
        args_dict['no_parse'] = True

    # Delete arguments that are not passed to the aligner-specific main function
    if 'seq_files' in args_dict: del args_dict['seq_files']
    if 'aligner_files' in args_dict: del args_dict['aligner_files']
    if 'command' in args_dict: del args_dict['command']
    if 'func' in args_dict: del args_dict['func']

    if args.command == 'imgt':
        for i in range(len(args.__dict__['aligner_files'])):
            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
                                    if args.__dict__['seq_files'] else None
            args.func(**args_dict)
    elif args.command == 'igblast':
        for i in range(len(args.__dict__['aligner_files'])):
            args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
            args_dict['seq_file'] = args.__dict__['seq_files'][i]
            args.func(**args_dict)