annotate MakeDb.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
2 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
3 Create tab-delimited database file to store sequence alignment information
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
4 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
5 # Info
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
7 from changeo import __version__, __date__
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
8
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
9 # Imports
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
10 import os
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
11 import sys
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
12 from argparse import ArgumentParser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
13 from collections import OrderedDict
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
14 from textwrap import dedent
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
15 from time import time
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
16 from Bio import SeqIO
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
17
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
18 # Presto and changeo imports
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
19 from presto.Defaults import default_out_args
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
20 from presto.Annotation import parseAnnotation
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
22 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
25
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
26
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
27 def getFilePrefix(aligner_output, out_args):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
28 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
29 Get file name prefix and create output directory
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
30
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
31 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
32 aligner_output : aligner output file or directory.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
33 out_args : dictionary of output arguments.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
34
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
35 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
36 str : file name prefix.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
37 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
38 # Determine output directory
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
39 if not out_args['out_dir']:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
40 out_dir = os.path.dirname(os.path.abspath(aligner_output))
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
41 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
42 out_dir = os.path.abspath(out_args['out_dir'])
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
43 if not os.path.exists(out_dir):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
44 os.mkdir(out_dir)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
45
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
46 # Determine file prefix
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
47 if out_args['out_name']:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
48 file_prefix = out_args['out_name']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
49 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
51
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
52 return os.path.join(out_dir, file_prefix)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
53
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
54
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
55 def getSeqDict(seq_file):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
56 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
57 Create a dictionary from a sequence file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
58
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
59 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
60 seq_file : sequence file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
61
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
62 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
63 dict : sequence description as keys with Bio.SeqRecords as values.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
64 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
66 key_function=lambda x: x.description)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
67
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
68 return seq_dict
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
69
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
70
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
72 out_args=default_out_args):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
73 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
74 Writes tab-delimited database file in output directory.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
75
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
76 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
77 db : a iterator of IgRecord objects containing alignment data.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
78 fields : a list of ordered field names to write.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
79 file_prefix : directory and prefix for CLIP tab-delim file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
80 total_count : number of records (for progress bar).
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
83 partial : if True put incomplete alignments in the pass file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
84 out_args : common output argument dictionary from parseCommonArgs.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
85
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
86 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
87 None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
88 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
89 # Function to check for valid records strictly
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
90 def _pass_strict(rec):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
91 valid = [rec.v_call and rec.v_call != 'None',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
92 rec.j_call and rec.j_call != 'None',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
93 rec.functional is not None,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
94 rec.seq_vdj,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
95 rec.junction]
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
96 return all(valid)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
97
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
98 # Function to check for valid records loosely
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
99 def _pass_gentle(rec):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
100 valid = [rec.v_call and rec.v_call != 'None',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
101 rec.d_call and rec.d_call != 'None',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
102 rec.j_call and rec.j_call != 'None']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
103 return any(valid)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
104
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
105 # Set pass criteria
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
106 _pass = _pass_gentle if partial else _pass_strict
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
107
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
108 # Define output file names
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
109 pass_file = '%s_db-pass.tab' % file_prefix
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
110 fail_file = '%s_db-fail.tab' % file_prefix
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
111
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
112 # Initiate handles, writers and counters
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
113 pass_handle = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
114 fail_handle = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
115 pass_writer = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
116 fail_writer = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
117 start_time = time()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
118 rec_count = pass_count = fail_count = 0
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
119
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
120 # Validate and write output
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
121 printProgress(0, total_count, 0.05, start_time)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
122 for i, record in enumerate(db, start=1):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
123
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
124 # Replace sequence description with full string, if required
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
125 if id_dict is not None and record.id in id_dict:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
126 record.id = id_dict[record.id]
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
127
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
128 # Parse sequence description into new columns
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
129 if not no_parse:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
130 try:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
132 record.id = record.annotations['ID']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
133 del record.annotations['ID']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
134
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
135 # TODO: This is not the best approach. should pass in output fields.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
136 # If first record, use parsed description to define extra columns
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
137 if i == 1: fields.extend(list(record.annotations.keys()))
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
138 except IndexError:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
139 # Could not parse pRESTO-style annotations so fall back to no parse
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
140 no_parse = True
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
142
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
143 # Count pass or fail and write to appropriate file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
144 if _pass(record):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
145 # Open pass file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
146 if pass_writer is None:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
147 pass_handle = open(pass_file, 'wt')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
148 pass_writer = getDbWriter(pass_handle, add_fields=fields)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
149
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
150 # Write row to pass file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
151 pass_count += 1
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
152 pass_writer.writerow(record.toDict())
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
153 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
154 # Open failed file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
155 if out_args['failed'] and fail_writer is None:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
156 fail_handle = open(fail_file, 'wt')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
157 fail_writer = getDbWriter(fail_handle, add_fields=fields)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
158
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
159 # Write row to fail file if specified
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
160 fail_count += 1
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
161 if fail_writer is not None:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
162 fail_writer.writerow(record.toDict())
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
163
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
164 # Print progress
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
165 printProgress(i, total_count, 0.05, start_time)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
166
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
167 # Print consol log
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
168 log = OrderedDict()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
169 log['OUTPUT'] = pass_file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
170 log['PASS'] = pass_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
171 log['FAIL'] = fail_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
172 log['END'] = 'MakeDb'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
173 printLog(log)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
174
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
175 if pass_handle is not None: pass_handle.close()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
176 if fail_handle is not None: fail_handle.close()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
177
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
178
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
179 # TODO: may be able to merge with other mains
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
181 parse_scores=False, parse_regions=False, parse_junction=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
182 out_args=default_out_args):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
183 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
184 Main for IMGT aligned sample sequences.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
185
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
186 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
187 aligner_output : zipped file or unzipped folder output by IMGT.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
188 seq_file : FASTA file input to IMGT (from which to get seqID).
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
189 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
190 partial : If True put incomplete alignments in the pass file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
191 parse_scores : if True add alignment score fields to output file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
192 parse_regions : if True add FWR and CDR region fields to output file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
193 out_args : common output argument dictionary from parseCommonArgs.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
194
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
195 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
196 None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
197 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
198 # Print parameter info
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
199 log = OrderedDict()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
200 log['START'] = 'MakeDb'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
201 log['ALIGNER'] = 'IMGT'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
202 log['ALIGNER_OUTPUT'] = aligner_output
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
204 log['NO_PARSE'] = no_parse
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
205 log['PARTIAL'] = partial
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
206 log['SCORES'] = parse_scores
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
207 log['REGIONS'] = parse_regions
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
208 log['JUNCTION'] = parse_junction
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
209 printLog(log)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
210
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
211 start_time = time()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
212 printMessage('Loading sequence files', start_time=start_time, width=25)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
213 # Extract IMGT files
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
214 temp_dir, imgt_files = extractIMGT(aligner_output)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
215 # Count records in IMGT files
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
216 total_count = countDbFile(imgt_files['summary'])
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
217 # Get (parsed) IDs from fasta file submitted to IMGT
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
218 id_dict = getIDforIMGT(seq_file) if seq_file else {}
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
219 printMessage('Done', start_time=start_time, end=True, width=25)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
220
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
221 # Parse IMGT output and write db
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
222 with open(imgt_files['summary'], 'r') as summary_handle, \
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
223 open(imgt_files['gapped'], 'r') as gapped_handle, \
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
225 open(imgt_files['junction'], 'r') as junction_handle:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
227 parse_scores=parse_scores, parse_regions=parse_regions,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
228 parse_junction=parse_junction)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
229 file_prefix = getFilePrefix(aligner_output, out_args)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
231 no_parse=no_parse, partial=partial, out_args=out_args)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
232
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
233 # Cleanup temp directory
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
234 temp_dir.cleanup()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
235
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
236 return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
237
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
238
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
239 # TODO: may be able to merge with other mains
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
242 out_args=default_out_args):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
243 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
244 Main for IgBLAST aligned sample sequences.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
245
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
246 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
247 aligner_output : IgBLAST output file to process.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
248 seq_file : fasta file input to IgBlast (from which to get sequence).
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
249 repo : folder with germline repertoire files.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
251 partial : If True put incomplete alignments in the pass file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
252 parse_regions : if True add FWR and CDR fields to output file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
253 parse_scores : if True add alignment score fields to output file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
255 out_args : common output argument dictionary from parseCommonArgs.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
256
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
257 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
258 None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
259 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
260 # Print parameter info
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
261 log = OrderedDict()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
262 log['START'] = 'MakeDB'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
263 log['ALIGNER'] = 'IgBlast'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
265 log['SEQ_FILE'] = os.path.basename(seq_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
266 log['NO_PARSE'] = no_parse
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
267 log['PARTIAL'] = partial
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
268 log['SCORES'] = parse_scores
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
269 log['REGIONS'] = parse_regions
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
270 printLog(log)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
271
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
272 start_time = time()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
273 printMessage('Loading sequence files', start_time=start_time, width=25)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
274 # Count records in sequence file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
275 total_count = countSeqFile(seq_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
276 # Get input sequence dictionary
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
277 seq_dict = getSeqDict(seq_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
278 # Create germline repo dictionary
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
279 repo_dict = readRepo(repo)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
280 printMessage('Done', start_time=start_time, end=True, width=25)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
281
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
282 # Parse and write output
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
283 with open(aligner_output, 'r') as f:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
285 parse_scores=parse_scores, parse_regions=parse_regions,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
286 parse_igblast_cdr3=parse_igblast_cdr3)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
287 file_prefix = getFilePrefix(aligner_output, out_args)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
289 no_parse=no_parse, partial=partial, out_args=out_args)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
290
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
291 return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
292
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
293
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
294 # TODO: may be able to merge with other mains
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
296 parse_scores=False, parse_regions=False, out_args=default_out_args):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
297 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
298 Main for iHMMuneAlign aligned sample sequences.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
299
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
300 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
301 aligner_output : iHMMune-Align output file to process.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
303 repo : folder with germline repertoire files.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
304 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
305 partial : If True put incomplete alignments in the pass file.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
306 parse_scores : if True parse alignment scores.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
307 parse_regions : if True add FWR and CDR region fields.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
308 out_args : common output argument dictionary from parseCommonArgs.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
309
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
310 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
311 None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
312 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
313 # Print parameter info
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
314 log = OrderedDict()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
315 log['START'] = 'MakeDB'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
316 log['ALIGNER'] = 'iHMMune-Align'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
318 log['SEQ_FILE'] = os.path.basename(seq_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
319 log['NO_PARSE'] = no_parse
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
320 log['PARTIAL'] = partial
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
321 log['SCORES'] = parse_scores
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
322 log['REGIONS'] = parse_regions
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
323 printLog(log)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
324
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
325 start_time = time()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
326 printMessage('Loading sequence files', start_time=start_time, width=25)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
327 # Count records in sequence file
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
328 total_count = countSeqFile(seq_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
329 # Get input sequence dictionary
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
330 seq_dict = getSeqDict(seq_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
331 # Create germline repo dictionary
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
332 repo_dict = readRepo(repo)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
333 printMessage('Done', start_time=start_time, end=True, width=25)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
334
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
335 # Parse and write output
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
336 with open(aligner_output, 'r') as f:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
338 parse_scores=parse_scores, parse_regions=parse_regions)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
339 file_prefix = getFilePrefix(aligner_output, out_args)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
341 no_parse=no_parse, partial=partial, out_args=out_args)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
342
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
343 return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
344
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
345
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
346 def getArgParser():
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
347 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
348 Defines the ArgumentParser.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
349
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
350 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
351 argparse.ArgumentParser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
352 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
353 fields = dedent(
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
354 '''
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
355 output files:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
356 db-pass
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
357 database of alignment records with functionality information,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
358 V and J calls, and a junction region.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
359 db-fail
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
360 database with records that fail due to no functionality information
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
361 (did not pass IMGT), no V call, no J call, or no junction region.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
362
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
363 universal output fields:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
366 V_CALL, D_CALL, J_CALL,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
367 V_SEQ_START, V_SEQ_LENGTH,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
373
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
374 imgt specific output fields:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
378
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
379 igblast specific output fields:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
384
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
385 ihmm specific output fields:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
387 HMM_SCORE
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
388 ''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
389
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
390 # Define ArgumentParser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
391 parser = ArgumentParser(description=__doc__, epilog=fields,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
392 formatter_class=CommonHelpFormatter)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
393 parser.add_argument('--version', action='version',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
394 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
395 subparsers = parser.add_subparsers(title='subcommands', dest='command',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
396 help='Aligner used', metavar='')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
397 # TODO: This is a temporary fix for Python issue 9253
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
398 subparsers.required = True
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
399
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
400 # Parent parser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
402
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
403 # IgBlast Aligner
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
405 formatter_class=CommonHelpFormatter,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
406 help='Process IgBLAST output.',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
407 description='Process IgBLAST output.')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
409 required=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
410 help='''IgBLAST output files in format 7 with query sequence
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
413 help='''List of folders and/or fasta files containing
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
414 IMGT-gapped germline sequences corresponding to the
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
415 set of germlines used in the IgBLAST alignment.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
417 required=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
418 help='''List of input FASTA files (with .fasta, .fna or .fa
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
419 extension), containing sequences.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
421 help='''Specify to prevent input sequence headers from being parsed
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
422 to add new columns to database. Parsing of sequence headers requires
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
423 headers to be in the pRESTO annotation format, so this should be specified
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
424 when sequence headers are incompatible with the pRESTO annotation scheme.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
425 Note, unrecognized header formats will default to this behavior.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
427 help='''If specified, include incomplete V(D)J alignments in
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
428 the pass file instead of the fail file.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
430 help='''Specify if alignment score metrics should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
431 included in the output. Adds the V_SCORE, V_IDENTITY,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
433 J_BTOP, and J_EVALUE columns.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
435 help='''Specify if IMGT FWR and CDRs should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
438 CDR3_IMGT columns.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
439 parser_igblast.add_argument('--cdr3', action='store_true',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
440 dest='parse_igblast_cdr3',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
441 help='''Specify if the CDR3 sequences generated by IgBLAST
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
442 should be included in the output. Adds the columns
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
444 version 1.5 or greater.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
445 parser_igblast.set_defaults(func=parseIgBLAST)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
446
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
447 # IMGT aligner
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
449 formatter_class=CommonHelpFormatter,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
450 help='''Process IMGT/HighV-Quest output
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
451 (does not work with V-QUEST).''',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
452 description='''Process IMGT/HighV-Quest output
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
453 (does not work with V-QUEST).''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
455 help='''Either zipped IMGT output files (.zip or .txz) or a
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
456 folder containing unzipped IMGT output files (which must
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
458 and 6_Junction).''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
460 required=False,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
461 help='''List of input FASTA files (with .fasta, .fna or .fa
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
462 extension) containing sequences.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
464 help='''Specify to prevent input sequence headers from being parsed
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
465 to add new columns to database. Parsing of sequence headers requires
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
466 headers to be in the pRESTO annotation format, so this should be specified
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
467 when sequence headers are incompatible with the pRESTO annotation scheme.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
468 Note, unrecognized header formats will default to this behavior.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
470 help='''If specified, include incomplete V(D)J alignments in
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
471 the pass file instead of the fail file.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
473 help='''Specify if alignment score metrics should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
474 included in the output. Adds the V_SCORE, V_IDENTITY,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
475 J_SCORE and J_IDENTITY.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
477 help='''Specify if IMGT FWRs and CDRs should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
480 CDR3_IMGT columns.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
482 help='''Specify if detailed junction fields should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
483 included in the output. Adds the columns
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
485 P5J_LENGTH, D_FRAME.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
486 parser_imgt.set_defaults(func=parseIMGT)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
487
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
488 # iHMMuneAlign Aligner
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
490 formatter_class=CommonHelpFormatter,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
491 help='Process iHMMune-Align output.',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
492 description='Process iHMMune-Align output.')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
494 required=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
495 help='''iHMMune-Align output file.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
497 help='''List of folders and/or FASTA files containing
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
498 IMGT-gapped germline sequences corresponding to the
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
499 set of germlines used in the IgBLAST alignment.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
501 required=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
502 help='''List of input FASTA files (with .fasta, .fna or .fa
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
503 extension) containing sequences.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
505 help='''Specify to prevent input sequence headers from being parsed
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
506 to add new columns to database. Parsing of sequence headers requires
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
507 headers to be in the pRESTO annotation format, so this should be specified
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
508 when sequence headers are incompatible with the pRESTO annotation scheme.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
509 Note, unrecognized header formats will default to this behavior.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
511 help='''If specified, include incomplete V(D)J alignments in
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
512 the pass file instead of the fail file.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
514 help='''Specify if alignment score metrics should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
515 included in the output. Adds the path score of the
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
516 iHMMune-Align hidden Markov model to HMM_SCORE.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
518 help='''Specify if IMGT FWRs and CDRs should be
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
521 CDR3_IMGT columns.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
522 parser_ihmm.set_defaults(func=parseIHMM)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
523
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
524 return parser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
525
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
526
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
527 if __name__ == "__main__":
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
528 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
529 Parses command line arguments and calls main
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
530 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
531 parser = getArgParser()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
532 checkArgs(parser)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
533 args = parser.parse_args()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
534 args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
535
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
536 # Set no ID parsing if sequence files are not provided
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
537 if 'seq_files' in args_dict and not args_dict['seq_files']:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
538 args_dict['no_parse'] = True
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
539
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
540 # Delete
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
541 if 'seq_files' in args_dict: del args_dict['seq_files']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
542 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
543 if 'command' in args_dict: del args_dict['command']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
544 if 'func' in args_dict: del args_dict['func']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
545
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
546 if args.command == 'imgt':
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
547 for i in range(len(args.__dict__['aligner_outputs'])):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
548 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
549 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
550 if args.__dict__['seq_files'] else None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
551 args.func(**args_dict)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
552 elif args.command == 'igblast' or args.command == 'ihmm':
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
553 for i in range(len(args.__dict__['aligner_outputs'])):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
554 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
555 args_dict['seq_file'] = args.__dict__['seq_files'][i]
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
556 args.func(**args_dict)