annotate change_o/MakeDb.py @ 78:aff3ba86ef7a draft

Uploaded
author davidvanzessen
date Mon, 31 Aug 2020 11:20:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
78
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
2 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
3 Create tab-delimited database file to store sequence alignment information
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
4 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
5
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
6 # Info
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
7 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
8 from changeo import __version__, __date__
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
9
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
10 # Imports
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
11 import os
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
12 import re
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
13 import csv
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
14 from argparse import ArgumentParser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
15 from collections import OrderedDict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
16 from textwrap import dedent
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
17 from time import time
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
18 from Bio import SeqIO
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
19
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
20 # Presto and changeo imports
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
21 from presto.Annotation import parseAnnotation
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
22 from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
23 from changeo.Defaults import default_format, default_out_args
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
24 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
25 from changeo.Alignment import RegionDefinition
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
26 from changeo.Gene import buildGermline
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
27 from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
28 AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
29 from changeo.Receptor import ChangeoSchema, AIRRSchema
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
30
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
31 # 10X Receptor attributes
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
32 cellranger_base = ['cell', 'c_call', 'conscount', 'umicount']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
33 cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
34 'v_call_10x', 'd_call_10x', 'j_call_10x',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
35 'junction_10x', 'junction_10x_aa']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
36
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
37 def readCellRanger(cellranger_file, fields=cellranger_base):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
38 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
39 Load a Cell Ranger annotation table
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
40
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
41 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
42 cellranger_file (str): path to the annotation file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
43 fields (list): list of fields to keep.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
44
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
45 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
46 dict: dict of dicts with contig_id as the primary key.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
47 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
48 # Mapping of 10X annotations to Receptor attributes
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
49 cellranger_map = {'cell': 'barcode',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
50 'c_call': 'c_gene',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
51 'locus': 'chain',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
52 'conscount': 'reads',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
53 'umicount': 'umis',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
54 'v_call_10x': 'v_gene',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
55 'd_call_10x': 'd_gene',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
56 'j_call_10x': 'j_gene',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
57 'junction_10x': 'cdr3_nt',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
58 'junction_10x_aa': 'cdr3'}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
59
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
60 # Function to parse individual fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
61 def _parse(x):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
62 return '' if x == 'None' else x
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
63
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
64 # Generate annotation dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
65 ann_dict = {}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
66 with open(cellranger_file) as csv_file:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
67 # Detect delimiters
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
68 dialect = csv.Sniffer().sniff(csv_file.readline())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
69 csv_file.seek(0)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
70 # Read in annotation file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
71 csv_reader = csv.DictReader(csv_file, dialect=dialect)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
72
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
73 # Generate annotation dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
74 for row in csv_reader:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
75 ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
76
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
77 return ann_dict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
78
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
79
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
80 def addGermline(receptor, references, amino_acid=False):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
81 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
82 Add full length germline to Receptor object
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
83
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
84 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
85 receptor (changeo.Receptor.Receptor): Receptor object to modify.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
86 references (dict): dictionary of IMGT-gapped references sequences.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
87 amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
88
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
89 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
90 changeo.Receptor.Receptor: modified Receptor with the germline sequence added.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
91 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
92 if amino_acid:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
93 __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
94 amino_acid=True)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
95 germline_seq = None if germlines is None else germlines['full']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
96 receptor.setField('germline_aa_imgt', germline_seq)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
97 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
98 __, germlines, __ = buildGermline(receptor, references, amino_acid=False)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
99 germline_seq = None if germlines is None else germlines['full']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
100 receptor.setField('germline_imgt', germline_seq)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
101
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
102 return receptor
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
103
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
104
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
105 def getIDforIMGT(seq_file):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
106 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
107 Create a sequence ID translation using IMGT truncation.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
108
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
109 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
110 seq_file : a fasta file of sequences input to IMGT.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
111
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
112 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
113 dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
114 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
115
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
116 # Create a sequence ID translation using IDs truncate up to space or 50 chars
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
117 ids = {}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
118 for rec in readSeqFile(seq_file):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
119 if len(rec.description) <= 50:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
120 id_key = rec.description
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
121 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
122 id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
123 ids.update({id_key: rec.description})
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
124
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
125 return ids
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
126
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
127
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
128 def getSeqDict(seq_file):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
129 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
130 Create a dictionary from a sequence file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
131
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
132 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
133 seq_file : sequence file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
134
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
135 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
136 dict : sequence description as keys with Bio.SeqRecords as values.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
137 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
138 seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
139
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
140 return seq_dict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
141
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
142
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
143 def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
144 amino_acid=False, partial=False, asis_id=True, regions='default',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
145 writer=AIRRWriter, out_file=None, out_args=default_out_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
146 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
147 Writes parsed records to an output file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
148
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
149 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
150 records : a iterator of Receptor objects containing alignment data.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
151 fields : a list of ordered field names to write.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
152 aligner_file : input file name.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
153 total_count : number of records (for progress bar).
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
154 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
155 annotations : additional annotation dictionary.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
156 amino_acid : if True do verification on amino acid fields.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
157 partial : if True put incomplete alignments in the pass file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
158 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
159 regions (str): name of the IMGT FWR/CDR region definitions to use.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
160 writer : writer class.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
161 out_file : output file name. Automatically generated from the input file if None.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
162 out_args : common output argument dictionary from parseCommonArgs.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
163
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
164 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
165 None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
166 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
167 # Wrapper for opening handles and writers
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
168 def _open(x, f, writer=writer, out_file=out_file):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
169 if out_file is not None and x == 'pass':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
170 handle = open(out_file, 'w')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
171 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
172 handle = getOutputHandle(aligner_file,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
173 out_label='db-%s' % x,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
174 out_dir=out_args['out_dir'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
175 out_name=out_args['out_name'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
176 out_type=out_args['out_type'])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
177 return handle, writer(handle, fields=f)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
178
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
179 # Function to convert fasta header annotations to changeo columns
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
180 def _changeo(f, header):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
181 h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
182 f.extend(h)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
183 return f
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
184
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
185 def _airr(f, header):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
186 h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
187 f.extend(h)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
188 return f
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
189
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
190 # Function to verify IMGT-gapped sequence and junction concur
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
191 def _imgt_check(rec):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
192 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
193 if amino_acid:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
194 rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
195 x, y = rd.positions['junction']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
196 check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
197 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
198 rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
199 x, y = rd.positions['junction']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
200 check = (rec.junction == rec.sequence_imgt[x:y])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
201 except (TypeError, AttributeError):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
202 check = False
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
203 return check
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
204
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
205 # Function to check for valid records strictly
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
206 def _strict(rec):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
207 if amino_acid:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
208 valid = [rec.v_call and rec.v_call != 'None',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
209 rec.j_call and rec.j_call != 'None',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
210 rec.functional is not None,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
211 rec.sequence_aa_imgt,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
212 rec.junction_aa,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
213 _imgt_check(rec)]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
214 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
215 valid = [rec.v_call and rec.v_call != 'None',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
216 rec.j_call and rec.j_call != 'None',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
217 rec.functional is not None,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
218 rec.sequence_imgt,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
219 rec.junction,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
220 _imgt_check(rec)]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
221 return all(valid)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
222
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
223 # Function to check for valid records loosely
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
224 def _gentle(rec):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
225 valid = [rec.v_call and rec.v_call != 'None',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
226 rec.d_call and rec.d_call != 'None',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
227 rec.j_call and rec.j_call != 'None']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
228 return any(valid)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
229
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
230 # Set writer class and annotation conversion function
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
231 if writer == ChangeoWriter:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
232 _annotate = _changeo
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
233 elif writer == AIRRWriter:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
234 _annotate = _airr
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
235 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
236 printError('Invalid output writer.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
237
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
238 # Additional annotation (e.g. 10X cell calls)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
239 # _append_table = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
240 # if cellranger_file is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
241 # with open(cellranger_file) as csv_file:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
242 # # Read in annotation file (use Sniffer to discover file delimiters)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
243 # dialect = csv.Sniffer().sniff(csv_file.readline())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
244 # csv_file.seek(0)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
245 # csv_reader = csv.DictReader(csv_file, dialect = dialect)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
246 #
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
247 # # Generate annotation dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
248 # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
249 # for field in cellranger_map.keys()} for entry in csv_reader}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
250 #
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
251 # fields = _annotate(fields, cellranger_map.values())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
252 # _append_table = lambda sequence_id: anntab_dict[sequence_id]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
253
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
254 # Set pass criteria
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
255 _pass = _gentle if partial else _strict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
256
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
257 # Define log handle
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
258 if out_args['log_file'] is None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
259 log_handle = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
260 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
261 log_handle = open(out_args['log_file'], 'w')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
262
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
263 # Initialize handles, writers and counters
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
264 pass_handle, pass_writer = None, None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
265 fail_handle, fail_writer = None, None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
266 pass_count, fail_count = 0, 0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
267 start_time = time()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
268
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
269 # Validate and write output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
270 printProgress(0, total_count, 0.05, start_time=start_time)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
271 for i, record in enumerate(records, start=1):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
272 # Replace sequence description with full string, if required
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
273 if id_dict is not None and record.sequence_id in id_dict:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
274 record.sequence_id = id_dict[record.sequence_id]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
275
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
276 # Parse sequence description into new columns
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
277 if not asis_id:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
278 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
279 ann_raw = parseAnnotation(record.sequence_id)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
280 record.sequence_id = ann_raw.pop('ID')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
281
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
282 # Convert to Receptor fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
283 ann_parsed = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
284 for k, v in ann_raw.items():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
285 ann_parsed[ChangeoSchema.toReceptor(k)] = v
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
286
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
287 # Add annotations to Receptor and update field list
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
288 record.setDict(ann_parsed, parse=True)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
289 if i == 1: fields = _annotate(fields, ann_parsed.keys())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
290 except IndexError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
291 # Could not parse pRESTO-style annotations so fall back to no parse
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
292 asis_id = True
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
293 printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
294
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
295 # Add supplemental annotation fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
296 # if _append_table is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
297 # record.setDict(_append_table(record.sequence_id), parse=True)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
298 if annotations is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
299 record.setDict(annotations[record.sequence_id], parse=True)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
300 if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
301
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
302 # Count pass or fail and write to appropriate file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
303 if _pass(record):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
304 pass_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
305 # Write row to pass file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
306 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
307 pass_writer.writeReceptor(record)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
308 except AttributeError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
309 # Open pass file and writer
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
310 pass_handle, pass_writer = _open('pass', fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
311 pass_writer.writeReceptor(record)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
312 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
313 fail_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
314 # Write row to fail file if specified
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
315 if out_args['failed']:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
316 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
317 fail_writer.writeReceptor(record)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
318 except AttributeError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
319 # Open fail file and writer
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
320 fail_handle, fail_writer = _open('fail', fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
321 fail_writer.writeReceptor(record)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
322
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
323 # Write log
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
324 if log_handle is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
325 log = OrderedDict([('ID', record.sequence_id),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
326 ('V_CALL', record.v_call),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
327 ('D_CALL', record.d_call),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
328 ('J_CALL', record.j_call),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
329 ('PRODUCTIVE', record.functional)])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
330 if not _imgt_check(record) and not amino_acid:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
331 log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
332 printLog(log, log_handle)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
333
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
334 # Print progress
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
335 printProgress(i, total_count, 0.05, start_time=start_time)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
336
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
337 # Print console log
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
338 log = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
339 log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
340 log['PASS'] = pass_count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
341 log['FAIL'] = fail_count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
342 log['END'] = 'MakeDb'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
343 printLog(log)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
344
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
345 # Close file handles
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
346 output = {'pass': None, 'fail': None}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
347 if pass_handle is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
348 output['pass'] = pass_handle.name
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
349 pass_handle.close()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
350 if fail_handle is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
351 output['fail'] = fail_handle.name
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
352 fail_handle.close()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
353
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
354 return output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
355
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
356
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
357 def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
358 extended=False, format=default_format, out_file=None, out_args=default_out_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
359 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
360 Main for IMGT aligned sample sequences.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
361
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
362 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
363 aligner_file : zipped file or unzipped folder output by IMGT.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
364 seq_file : FASTA file input to IMGT (from which to get seqID).
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
365 repo : folder with germline repertoire files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
366 partial : If True put incomplete alignments in the pass file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
367 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
368 extended : if True add alignment score, FWR, CDR and junction fields to output file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
369 format : output format. one of 'changeo' or 'airr'.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
370 out_file : output file name. Automatically generated from the input file if None.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
371 out_args : common output argument dictionary from parseCommonArgs.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
372
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
373 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
374 dict : names of the 'pass' and 'fail' output files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
375 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
376 # Print parameter info
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
377 log = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
378 log['START'] = 'MakeDb'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
379 log['COMMAND'] = 'imgt'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
380 log['ALIGNER_FILE'] = aligner_file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
381 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
382 log['ASIS_ID'] = asis_id
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
383 log['PARTIAL'] = partial
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
384 log['EXTENDED'] = extended
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
385 printLog(log)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
386
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
387 start_time = time()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
388 printMessage('Loading files', start_time=start_time, width=20)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
389
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
390 # Extract IMGT files
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
391 temp_dir, imgt_files = extractIMGT(aligner_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
392
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
393 # Count records in IMGT files
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
394 total_count = countDbFile(imgt_files['summary'])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
395
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
396 # Get (parsed) IDs from fasta file submitted to IMGT
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
397 id_dict = getIDforIMGT(seq_file) if seq_file else {}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
398
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
399 # Load supplementary annotation table
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
400 if cellranger_file is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
401 f = cellranger_extended if extended else cellranger_base
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
402 annotations = readCellRanger(cellranger_file, fields=f)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
403 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
404 annotations = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
405
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
406 printMessage('Done', start_time=start_time, end=True, width=20)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
407
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
408 # Define format operators
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
409 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
410 __, writer, schema = getFormatOperators(format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
411 except ValueError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
412 printError('Invalid format %s.' % format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
413 out_args['out_type'] = schema.out_type
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
414
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
415 # Define output fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
416 fields = list(schema.required)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
417 if extended:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
418 custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
419 fields.extend(custom)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
420
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
421 # Parse IMGT output and write db
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
422 with open(imgt_files['summary'], 'r') as summary_handle, \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
423 open(imgt_files['gapped'], 'r') as gapped_handle, \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
424 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
425 open(imgt_files['junction'], 'r') as junction_handle:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
426
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
427 # Open parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
428 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
429
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
430 # Add germline sequence
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
431 if repo is None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
432 germ_iter = parse_iter
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
433 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
434 references = readGermlines(repo)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
435 # Check for IMGT-gaps in germlines
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
436 if all('...' not in x for x in references.values()):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
437 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
438 germ_iter = (addGermline(x, references) for x in parse_iter)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
439
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
440 # Write db
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
441 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
442 annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
443 writer=writer, out_file=out_file, out_args=out_args)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
444
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
445 # Cleanup temp directory
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
446 temp_dir.cleanup()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
447
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
448 return output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
449
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
450
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
451 def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
452 asis_id=True, asis_calls=False, extended=False, regions='default',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
453 format='changeo', out_file=None, out_args=default_out_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
454 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
455 Main for IgBLAST aligned sample sequences.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
456
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
457 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
458 aligner_file (str): IgBLAST output file to process.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
459 seq_file (str): fasta file input to IgBlast (from which to get sequence).
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
460 repo (str): folder with germline repertoire files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
461 amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
462 partial : If True put incomplete alignments in the pass file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
463 asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
464 asis_calls (bool): if True do not parse gene calls for allele names.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
465 extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
466 regions (str): name of the IMGT FWR/CDR region definitions to use.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
467 format (str): output format. one of 'changeo' or 'airr'.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
468 out_file (str): output file name. Automatically generated from the input file if None.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
469 out_args (dict): common output argument dictionary from parseCommonArgs.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
470
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
471 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
472 dict : names of the 'pass' and 'fail' output files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
473 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
474 # Print parameter info
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
475 log = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
476 log['START'] = 'MakeDB'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
477 log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
478 log['ALIGNER_FILE'] = os.path.basename(aligner_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
479 log['SEQ_FILE'] = os.path.basename(seq_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
480 log['ASIS_ID'] = asis_id
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
481 log['ASIS_CALLS'] = asis_calls
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
482 log['PARTIAL'] = partial
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
483 log['EXTENDED'] = extended
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
484 printLog(log)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
485
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
486 # Set amino acid conditions
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
487 if amino_acid:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
488 format = '%s-aa' % format
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
489 parser = IgBLASTReaderAA
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
490 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
491 parser = IgBLASTReader
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
492
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
493 # Start
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
494 start_time = time()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
495 printMessage('Loading files', start_time=start_time, width=20)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
496
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
497 # Count records in sequence file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
498 total_count = countSeqFile(seq_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
499
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
500 # Get input sequence dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
501 seq_dict = getSeqDict(seq_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
502
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
503 # Create germline repo dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
504 references = readGermlines(repo, asis=asis_calls)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
505
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
506 # Load supplementary annotation table
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
507 if cellranger_file is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
508 f = cellranger_extended if extended else cellranger_base
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
509 annotations = readCellRanger(cellranger_file, fields=f)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
510 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
511 annotations = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
512
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
513 printMessage('Done', start_time=start_time, end=True, width=20)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
514
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
515 # Check for IMGT-gaps in germlines
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
516 if all('...' not in x for x in references.values()):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
517 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
518
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
519 # Define format operators
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
520 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
521 __, writer, schema = getFormatOperators(format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
522 except ValueError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
523 printError('Invalid format %s.' % format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
524 out_args['out_type'] = schema.out_type
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
525
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
526 # Define output fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
527 fields = list(schema.required)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
528 if extended:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
529 custom = parser.customFields(schema=schema)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
530 fields.extend(custom)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
531
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
532 # Parse and write output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
533 with open(aligner_file, 'r') as f:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
534 parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
535 germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
536 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
537 annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
538 regions=regions, writer=writer, out_file=out_file, out_args=out_args)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
539
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
540 return output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
541
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
542
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
543 def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
544 extended=False, format=default_format, out_file=None, out_args=default_out_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
545 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
546 Main for iHMMuneAlign aligned sample sequences.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
547
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
548 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
549 aligner_file : iHMMune-Align output file to process.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
550 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
551 repo : folder with germline repertoire files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
552 partial : If True put incomplete alignments in the pass file.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
553 asis_id : if ID is to be parsed for pRESTO output with default delimiters.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
554 extended : if True parse alignment scores, FWR and CDR region fields.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
555 format : output format. One of 'changeo' or 'airr'.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
556 out_file : output file name. Automatically generated from the input file if None.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
557 out_args : common output argument dictionary from parseCommonArgs.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
558
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
559 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
560 dict : names of the 'pass' and 'fail' output files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
561 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
562 # Print parameter info
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
563 log = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
564 log['START'] = 'MakeDB'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
565 log['COMMAND'] = 'ihmm'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
566 log['ALIGNER_FILE'] = os.path.basename(aligner_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
567 log['SEQ_FILE'] = os.path.basename(seq_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
568 log['ASIS_ID'] = asis_id
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
569 log['PARTIAL'] = partial
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
570 log['EXTENDED'] = extended
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
571 printLog(log)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
572
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
573 start_time = time()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
574 printMessage('Loading files', start_time=start_time, width=20)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
575
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
576 # Count records in sequence file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
577 total_count = countSeqFile(seq_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
578
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
579 # Get input sequence dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
580 seq_dict = getSeqDict(seq_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
581
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
582 # Create germline repo dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
583 references = readGermlines(repo)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
584
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
585 # Load supplementary annotation table
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
586 if cellranger_file is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
587 f = cellranger_extended if extended else cellranger_base
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
588 annotations = readCellRanger(cellranger_file, fields=f)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
589 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
590 annotations = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
591
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
592 printMessage('Done', start_time=start_time, end=True, width=20)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
593
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
594 # Check for IMGT-gaps in germlines
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
595 if all('...' not in x for x in references.values()):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
596 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
597
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
598 # Define format operators
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
599 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
600 __, writer, schema = getFormatOperators(format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
601 except ValueError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
602 printError('Invalid format %s.' % format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
603 out_args['out_type'] = schema.out_type
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
604
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
605 # Define output fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
606 fields = list(schema.required)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
607 if extended:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
608 custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
609 fields.extend(custom)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
610
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
611 # Parse and write output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
612 with open(aligner_file, 'r') as f:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
613 parse_iter = IHMMuneReader(f, seq_dict, references)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
614 germ_iter = (addGermline(x, references) for x in parse_iter)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
615 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
616 annotations=annotations, asis_id=asis_id, partial=partial,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
617 writer=writer, out_file=out_file, out_args=out_args)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
618
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
619 return output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
620
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
621
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
622 def getArgParser():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
623 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
624 Defines the ArgumentParser.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
625
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
626 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
627 argparse.ArgumentParser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
628 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
629 fields = dedent(
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
630 '''
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
631 output files:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
632 db-pass
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
633 database of alignment records with functionality information,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
634 V and J calls, and a junction region.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
635 db-fail
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
636 database with records that fail due to no productivity information,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
637 no gene V assignment, no J assignment, or no junction region.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
638
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
639 universal output fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
640 sequence_id, sequence, sequence_alignment, germline_alignment,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
641 rev_comp, productive, stop_codon, vj_in_frame, locus,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
642 v_call, d_call, j_call, junction, junction_length, junction_aa,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
643 v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
644 d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
645 j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
646 np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
647
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
648 imgt specific output fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
649 n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
650 d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
651
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
652 igblast specific output fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
653 v_score, v_identity, v_support, v_cigar,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
654 d_score, d_identity, d_support, d_cigar,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
655 j_score, j_identity, j_support, j_cigar
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
656
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
657 ihmm specific output fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
658 vdj_score
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
659
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
660 10X specific output fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
661 cell_id, c_call, consensus_count, umi_count,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
662 v_call_10x, d_call_10x, j_call_10x,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
663 junction_10x, junction_10x_aa
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
664 ''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
665
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
666 # Define ArgumentParser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
667 parser = ArgumentParser(description=__doc__, epilog=fields,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
668 formatter_class=CommonHelpFormatter, add_help=False)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
669 group_help = parser.add_argument_group('help')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
670 group_help.add_argument('--version', action='version',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
671 version='%(prog)s:' + ' %s %s' %(__version__, __date__))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
672 group_help.add_argument('-h', '--help', action='help', help='show this help message and exit')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
673 subparsers = parser.add_subparsers(title='subcommands', dest='command',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
674 help='Aligner used', metavar='')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
675 # TODO: This is a temporary fix for Python issue 9253
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
676 subparsers.required = True
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
677
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
678 # Parent parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
679 parser_parent = getCommonArgParser(db_in=False)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
680
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
681 # igblastn output parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
682 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
683 formatter_class=CommonHelpFormatter, add_help=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
684 help='Process igblastn output.',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
685 description='Process igblastn output.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
686 group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
687 group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
688 required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
689 help='''IgBLAST output files in format 7 with query sequence
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
690 (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
691 group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
692 help='''List of folders and/or fasta files containing
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
693 the same germline set used in the IgBLAST alignment. These
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
694 reference sequences must contain IMGT-numbering spacers (gaps)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
695 in the V segment.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
696 group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
697 required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
698 help='''List of input FASTA files (with .fasta, .fna or .fa
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
699 extension), containing sequences.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
700 group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
701 help='''Table file containing 10X annotations (with .csv or .tsv
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
702 extension).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
703 group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
704 help='''Specify to prevent input sequence headers from being parsed
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
705 to add new columns to database. Parsing of sequence headers requires
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
706 headers to be in the pRESTO annotation format, so this should be specified
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
707 when sequence headers are incompatible with the pRESTO annotation scheme.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
708 Note, unrecognized header formats will default to this behavior.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
709 group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
710 help='''Specify to prevent gene calls from being parsed into standard allele names
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
711 in both the IgBLAST output and reference database. Note, this requires
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
712 the sequence identifiers in the reference sequence set and the IgBLAST
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
713 database to be exact string matches.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
714 group_igblast.add_argument('--partial', action='store_true', dest='partial',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
715 help='''If specified, include incomplete V(D)J alignments in
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
716 the pass file instead of the fail file. An incomplete alignment
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
717 is defined as a record for which a valid IMGT-gapped sequence
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
718 cannot be built or that is missing a V gene assignment,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
719 J gene assignment, junction region, or productivity call.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
720 group_igblast.add_argument('--extended', action='store_true', dest='extended',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
721 help='''Specify to include additional aligner specific fields in the output.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
722 Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
723 fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
724 group_igblast.add_argument('--regions', action='store', dest='regions',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
725 choices=('default', 'rhesus-igl'), default='default',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
726 help='''IMGT CDR and FWR boundary definition to use.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
727 parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
728
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
729 # igblastp output parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
730 parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
731 formatter_class=CommonHelpFormatter, add_help=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
732 help='Process igblastp output.',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
733 description='Process igblastp output.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
734 group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
735 group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
736 required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
737 help='''IgBLAST output files in format 7 with query sequence
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
738 (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
739 group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
740 help='''List of folders and/or fasta files containing
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
741 the same germline set used in the IgBLAST alignment. These
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
742 reference sequences must contain IMGT-numbering spacers (gaps)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
743 in the V segment.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
744 group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
745 help='''List of input FASTA files (with .fasta, .fna or .fa
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
746 extension), containing sequences.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
747 group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
748 help='''Table file containing 10X annotations (with .csv or .tsv extension).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
749 group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
750 help='''Specify to prevent input sequence headers from being parsed
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
751 to add new columns to database. Parsing of sequence headers requires
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
752 headers to be in the pRESTO annotation format, so this should be specified
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
753 when sequence headers are incompatible with the pRESTO annotation scheme.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
754 Note, unrecognized header formats will default to this behavior.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
755 group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
756 help='''Specify to prevent gene calls from being parsed into standard allele names
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
757 in both the IgBLAST output and reference database. Note, this requires
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
758 the sequence identifiers in the reference sequence set and the IgBLAST
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
759 database to be exact string matches.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
760 group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
761 help='''Specify to include additional aligner specific fields in the output.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
762 Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
763 group_igblast_aa.add_argument('--regions', action='store', dest='regions',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
764 choices=('default', 'rhesus-igl'), default='default',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
765 help='''IMGT CDR and FWR boundary definition to use.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
766 parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
767
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
768
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
769 # IMGT aligner
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
770 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
771 formatter_class=CommonHelpFormatter, add_help=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
772 help='''Process IMGT/HighV-Quest output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
773 (does not work with V-QUEST).''',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
774 description='''Process IMGT/HighV-Quest output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
775 (does not work with V-QUEST).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
776 group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
777 group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
778 help='''Either zipped IMGT output files (.zip or .txz) or a
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
779 folder containing unzipped IMGT output files (which must
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
780 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
781 and 6_Junction).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
782 group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
783 help='''List of FASTA files (with .fasta, .fna or .fa
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
784 extension) that were submitted to IMGT/HighV-QUEST.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
785 If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
786 will not be corrected.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
787 group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
788 help='''List of folders and/or fasta files containing
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
789 the germline sequence set used by IMGT/HighV-QUEST.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
790 These reference sequences must contain IMGT-numbering spacers (gaps)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
791 in the V segment. If unspecified, the germline sequence reconstruction
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
792 will not be included in the output.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
793 group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
794 help='''Table file containing 10X annotations (with .csv or .tsv
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
795 extension).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
796 group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
797 help='''Specify to prevent input sequence headers from being parsed
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
798 to add new columns to database. Parsing of sequence headers requires
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
799 headers to be in the pRESTO annotation format, so this should be specified
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
800 when sequence headers are incompatible with the pRESTO annotation scheme.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
801 Note, unrecognized header formats will default to this behavior.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
802 group_imgt.add_argument('--partial', action='store_true', dest='partial',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
803 help='''If specified, include incomplete V(D)J alignments in
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
804 the pass file instead of the fail file. An incomplete alignment
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
805 is defined as a record that is missing a V gene assignment,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
806 J gene assignment, junction region, or productivity call.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
807 group_imgt.add_argument('--extended', action='store_true', dest='extended',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
808 help='''Specify to include additional aligner specific fields in the output.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
809 Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
810 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
811 p3d_length, p5j_length and d_frame.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
812 parser_imgt.set_defaults(func=parseIMGT)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
813
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
814 # iHMMuneAlign Aligner
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
815 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
816 formatter_class=CommonHelpFormatter, add_help=False,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
817 help='Process iHMMune-Align output.',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
818 description='Process iHMMune-Align output.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
819 group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
820 group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
821 required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
822 help='''iHMMune-Align output file.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
823 group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
824 help='''List of folders and/or FASTA files containing
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
825 the set of germline sequences used by iHMMune-Align. These
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
826 reference sequences must contain IMGT-numbering spacers (gaps)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
827 in the V segment.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
828 group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
829 required=True,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
830 help='''List of input FASTA files (with .fasta, .fna or .fa
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
831 extension) containing sequences.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
832 group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
833 help='''Table file containing 10X annotations (with .csv or .tsv
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
834 extension).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
835 group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
836 help='''Specify to prevent input sequence headers from being parsed
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
837 to add new columns to database. Parsing of sequence headers requires
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
838 headers to be in the pRESTO annotation format, so this should be specified
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
839 when sequence headers are incompatible with the pRESTO annotation scheme.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
840 Note, unrecognized header formats will default to this behavior.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
841 group_ihmm.add_argument('--partial', action='store_true', dest='partial',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
842 help='''If specified, include incomplete V(D)J alignments in
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
843 the pass file instead of the fail file. An incomplete alignment
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
844 is defined as a record for which a valid IMGT-gapped sequence
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
845 cannot be built or that is missing a V gene assignment,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
846 J gene assignment, junction region, or productivity call.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
847 group_ihmm.add_argument('--extended', action='store_true', dest='extended',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
848 help='''Specify to include additional aligner specific fields in the output.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
849 Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
850 adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
851 parser_ihmm.set_defaults(func=parseIHMM)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
852
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
853 return parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
854
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
855
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
856 if __name__ == "__main__":
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
857 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
858 Parses command line arguments and calls main
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
859 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
860 parser = getArgParser()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
861 checkArgs(parser)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
862 args = parser.parse_args()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
863 args_dict = parseCommonArgs(args, in_arg='aligner_files')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
864
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
865 # Set no ID parsing if sequence files are not provided
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
866 if 'seq_files' in args_dict and not args_dict['seq_files']:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
867 args_dict['asis_id'] = True
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
868
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
869 # Delete
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
870 if 'aligner_files' in args_dict: del args_dict['aligner_files']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
871 if 'seq_files' in args_dict: del args_dict['seq_files']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
872 if 'out_files' in args_dict: del args_dict['out_files']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
873 if 'command' in args_dict: del args_dict['command']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
874 if 'func' in args_dict: del args_dict['func']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
875
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
876 # Call main
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
877 for i, f in enumerate(args.__dict__['aligner_files']):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
878 args_dict['aligner_file'] = f
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
879 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
880 if args.__dict__['seq_files'] else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
881 args_dict['out_file'] = args.__dict__['out_files'][i] \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
882 if args.__dict__['out_files'] else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
883 args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
884 if args.__dict__['cellranger_file'] else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
885 args.func(**args_dict)