#!/usr/bin/env python3
"""
Assign Ig sequences into clones
"""

# Info
__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
from changeo import __version__, __date__

# Imports
import os
import re
import sys
from argparse import ArgumentParser
from collections import OrderedDict
from itertools import chain
from textwrap import dedent
from time import time
from Bio.Seq import translate

# Presto and changeo imports
from presto.Defaults import default_out_args
from presto.IO import printLog, printProgress, printCount, printWarning, printError
from presto.Multiprocessing import manageProcesses
from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field
from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from changeo.Distance import distance_models, calcDistances, formClusters
from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \
                       AIRRWriter, ChangeoWriter
from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue

# Defaults
default_translate = False
default_distance = 0.0
default_index_mode = 'gene'
default_index_action = 'set'
default_distance_model = 'ham'
default_norm = 'len'
default_sym = 'avg'
default_linkage = 'single'
default_max_missing = 0
choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f',
                          'mk_rs1nf', 'mk_rs5nf',
                          'hs1f_compat', 'm1n_compat')


def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,
                  j_field=default_j_field, max_missing=default_max_missing):
    """
    Splits a set of sequences into passed and failed groups based on the number
    of missing characters in the sequence

    Arguments:
      data : changeo.Multiprocessing.DbData object.
      seq_field : sequence field to filter on.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.

    Returns:
      changeo.Multiprocessing.DbResult : object containing filtered records.
    """
    # Function to validate the sequence string
    def _pass(seq):
        if len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing:
            return True
        else:
            return False
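
    # A small illustration of the _pass() rule (hypothetical junction strings):
    # with max_missing=1, _pass('TGTGCGAGANT') is True (one non-ACGT character),
    # _pass('TGT--CGANT') is False (three non-ACGT characters), and
    # _pass('') is False (empty sequences never pass).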

    # Define result object for iteration and get data records
    result = DbResult(data.id, data.data)

    if not data:
        result.data_pass = []
        result.data_fail = data.data
        return result

    result.data_pass = []
    result.data_fail = []
    for rec in data.data:
        seq = rec.getField(seq_field)
        if _pass(seq): result.data_pass.append(rec)
        else: result.data_fail.append(rec)

    # Add V(D)J to log
    result.log['ID'] = ','.join([str(x) for x in data.id])
    result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data]))
    result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data]))
    result.log['JUNCLEN'] = ','.join(set([str(len(r.junction)) if r.junction is not None else '0'
                                          for r in data.data]))
    result.log['CLONED'] = len(result.data_pass)
    result.log['FILTERED'] = len(result.data_fail)

    return result


def indexByIdentity(index, key, rec, group_fields=None):
    """
    Updates a preclone index with a simple key

    Arguments:
      index : preclone index from groupByGene
      key : index key
      rec : Receptor to add to the index
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V, J and junction length

    Returns:
      None : Updates index with new key and records.
    """
    index.setdefault(tuple(key), []).append(rec)
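
# A minimal sketch of the index structure this builds (hypothetical values), as
# used with action='first' where each key component is a single call:
#   index = {}
#   indexByIdentity(index, ['IGHV1-2', 'IGHJ4', 45], rec)
#   # index -> {('IGHV1-2', 'IGHJ4', 45): [rec]}
# Records with an identical (V, J, junction length) key append to the same list.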


def indexByUnion(index, key, rec, group_fields=None):
    """
    Updates a preclone index with the union of nested keys

    Arguments:
      index : preclone index from groupByGene
      key : index key
      rec : Receptor to add to the index
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V, J and junction length

    Returns:
      None : Updates index with new key and records.
    """
    # List of values for this/new key
    val = [rec]
    f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0)))

    # See if field/junction length combination exists in index
    outer_dict = index
    for field in f_range:
        try:
            outer_dict = outer_dict[key[field]]
        except KeyError:
            outer_dict = None
            break
    # If field combination exists, look through Js
    j_matches = []
    if outer_dict is not None:
        for j in outer_dict.keys():
            if not set(key[1]).isdisjoint(set(j)):
                key[1] = tuple(set(key[1]).union(set(j)))
                j_matches += [j]
    # If J overlap exists, look through Vs for each J
    for j in j_matches:
        v_matches = []
        # Collect V matches for this J
        for v in outer_dict[j].keys():
            if not set(key[0]).isdisjoint(set(v)):
                key[0] = tuple(set(key[0]).union(set(v)))
                v_matches += [v]
        # If there are V overlaps for this J, pop them out
        if v_matches:
            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
        # If the J dict is now empty, remove it
        if not outer_dict[j]:
            outer_dict.pop(j, None)

    # Add value(s) into the nested index dictionary.
    # outer_dict is a reference into index, so the assignments below update index in place.
    outer_dict = index
    for field in f_range:
        outer_dict.setdefault(key[field], {})
        outer_dict = outer_dict[key[field]]
    # Add J, then V into index
    if key[1] in outer_dict:
        outer_dict[key[1]].update({key[0]: val})
    else:
        outer_dict[key[1]] = {key[0]: val}
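
# A minimal sketch of the union behavior (hypothetical gene sets): a new record
# keyed by V=('IGHV1-2',), J=('IGHJ4', 'IGHJ5') merges with an existing entry
# keyed by V=('IGHV1-2', 'IGHV1-69'), J=('IGHJ4',), because both the V and J
# sets overlap. The merged entry is re-keyed by the unions, V=('IGHV1-2',
# 'IGHV1-69') and J=('IGHJ4', 'IGHJ5'), and holds the records from both entries.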


def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field,
                mode=default_index_mode, action=default_index_action):
    """
    Identifies preclonal groups by V, J and junction length

    Arguments:
      db_iter : an iterator of Receptor objects defined by ChangeoReader
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V, J and junction length
      mode : specificity of alignment call to use for assigning preclones;
             one of ('allele', 'gene')
      action : how to handle multiple value fields when assigning preclones;
               one of ('first', 'set')

    Returns:
      dict : dictionary of {(V, J, junction length): [Receptor]}
    """
    # Define functions for grouping keys
    if mode == 'allele' and group_fields is None:
        def _get_key(rec, act):
            return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
                    None if rec.junction is None else len(rec.junction)]
    elif mode == 'gene' and group_fields is None:
        def _get_key(rec, act):
            return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
                    None if rec.junction is None else len(rec.junction)]
    elif mode == 'allele' and group_fields is not None:
        def _get_key(rec, act):
            vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
                   None if rec.junction is None else len(rec.junction)]
            ann = [rec.getField(k) for k in group_fields]
            return list(chain(vdj, ann))
    elif mode == 'gene' and group_fields is not None:
        def _get_key(rec, act):
            vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
                   None if rec.junction is None else len(rec.junction)]
            ann = [rec.getField(k) for k in group_fields]
            return list(chain(vdj, ann))

    # Function to flatten nested dictionary
    def _flatten_dict(d, parent_key=None):
        items = []
        for k, v in d.items():
            new_key = parent_key + [k] if parent_key else [k]
            if isinstance(v, dict):
                items.extend(_flatten_dict(v, new_key).items())
            else:
                items.append((new_key, v))
        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
        return flat_dict
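
    # A small sketch of the flattening (hypothetical keys): the nested index
    #   {45: {('IGHJ4',): {('IGHV1-2',): [rec]}}}
    # becomes
    #   {(45, ('IGHJ4',), ('IGHV1-2',)): [rec]}
    # and any branch whose key path contains None collapses to the single key None.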

    if action == 'first':
        index_func = indexByIdentity
    elif action == 'set':
        index_func = indexByUnion
    else:
        printError('Unrecognized action: %s.' % action)

    start_time = time()
    clone_index = {}
    rec_count = 0
    for rec in db_iter:
        key = _get_key(rec, action)

        # Print progress
        printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences')
        rec_count += 1

        # Assign passed preclone records to their key and failed records to the None key
        if all([k is not None and k != '' for k in key]):
            # Update index dictionary
            index_func(clone_index, key, rec, group_fields)
        else:
            clone_index.setdefault(None, []).append(rec)

    printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True)

    if action == 'set':
        clone_index = _flatten_dict(clone_index)

    return clone_index
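
# A minimal usage sketch (hypothetical file name; assumes a Change-O formatted
# table readable by changeo.IO.ChangeoReader):
#   from changeo.IO import ChangeoReader
#   with open('data_db-pass.tab') as handle:
#       groups = groupByGene(ChangeoReader(handle), mode='gene', action='first')
# groups maps (V, J, junction length) tuples to lists of Receptor objects, with
# records missing any key component collected under the None key.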


def distanceClones(result, seq_field=default_junction_field, model=default_distance_model,
                   distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym,
                   linkage=default_linkage):
    """
    Separates a set of Receptor objects into clones

    Arguments:
      result : a changeo.Multiprocessing.DbResult object with filtered records to clone
      seq_field : sequence field used to calculate distance between records
      model : substitution model used to calculate distance
      distance : the distance threshold to assign clonal groups
      dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances
      norm : normalization method
      sym : symmetry method
      linkage : type of linkage

    Returns:
      changeo.Multiprocessing.DbResult : an updated DbResult object
    """
    # Get distance matrix if not provided
    if dist_mat is None:
        try:
            dist_mat = distance_models[model]
        except KeyError:
            printError('Unrecognized distance model: %s' % model)

    # TODO: can be cleaned up with abstract model class
    # Determine length of n-mers
    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
        nmer_len = 1
    elif model in ['hh_s5f', 'mk_rs5nf']:
        nmer_len = 5
    else:
        printError('Unrecognized distance model: %s.' % model)

    # Define unique junction mapping
    seq_map = {}
    for rec in result.data_pass:
        seq = rec.getField(seq_field)
        seq = re.sub(r'[\.-]', 'N', seq)
        if model == 'aa': seq = translate(seq)
        seq_map.setdefault(seq, []).append(rec)

    # Define sequences
    sequences = list(seq_map.keys())

    # Zero record case
    if not sequences:
        result.valid = False
        result.log['CLONES'] = 0
        return result

    # Single record case
    if len(sequences) == 1:
        result.results = {1: result.data_pass}
        result.valid = True
        result.log['CLONES'] = 1
        return result

    # Calculate pairwise distance matrix
    dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm)

    # Perform hierarchical clustering
    clusters = formClusters(dists, linkage, distance)
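
    # For example (hypothetical values): with model='ham', norm='len',
    # linkage='single' and distance=0.2, two length-10 junctions differing at
    # one position are 0.1 apart and fall into the same cluster, while
    # junctions differing at three positions (0.3 apart) do not.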

    # Turn clusters into clone dictionary
    clone_dict = {}
    for i, c in enumerate(clusters):
        clone_dict.setdefault(c, []).extend(seq_map[sequences[i]])

    if clone_dict:
        result.results = clone_dict
        result.valid = True
        result.log['CLONES'] = len(clone_dict)
    else:
        result.log['CLONES'] = 0

    return result


def collectQueue(alive, result_queue, collect_queue, db_file, fields,
                 writer=ChangeoWriter, out_file=None, out_args=default_out_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing continues;
              if False exit process
      result_queue : a multiprocessing.Queue holding processQueue results
      collect_queue : a multiprocessing.Queue to store collector return values
      db_file : the input database file name
      fields : list of output field names
      writer : writer class.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      None : Adds a dictionary with key/value pairs to collect_queue containing
             'log' defining a log object along with the 'pass' and 'fail' output file names.
    """
    # Wrapper for opening handles and writers
    def _open(x, f, writer=writer, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(db_file,
                                     out_label='clone-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle, writer(handle, fields=f)

    # Open log file
    try:
        # Count input records
        result_count = countDbFile(db_file)

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        # Initialize handles, writers and counters
        pass_handle, pass_writer = None, None
        fail_handle, fail_writer = None, None
        rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0
        start_time = time()

        # Iterate over results queue until sentinel object reached
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration and update record count
            printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        pass_count += 1
                        rec.setField('clone', str(clone_count))
                        result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction
                        try:
                            pass_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open pass file and define writer object
                            pass_handle, pass_writer = _open('pass', fields)
                            pass_writer.writeReceptor(rec)

                # Write failed sequences from passing sets
                if result.data_fail:
                    for i, rec in enumerate(result.data_fail, start=1):
                        fail_count += 1
                        result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction
                        if out_args['failed']:
                            try:
                                fail_writer.writeReceptor(rec)
                            except AttributeError:
                                # Open fail file and define writer object
                                fail_handle, fail_writer = _open('fail', fields)
                                fail_writer.writeReceptor(rec)
            else:
                # Write failing records
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    result.log['CLONE0-%i' % i] = rec.junction
                    if out_args['failed']:
                        try:
                            fail_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open fail file and define writer object
                            fail_handle, fail_writer = _open('fail', fields)
                            fail_writer.writeReceptor(rec)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            # Reached only when alive.value goes False before the sentinel arrives,
            # which signals a failure in a sibling process
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')

        # Build return log
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data
        collect_dict = {'log': log, 'pass': None, 'fail': None}
        if pass_handle is not None:
            collect_dict['pass'] = pass_handle.name
            pass_handle.close()
        if fail_handle is not None:
            collect_dict['fail'] = fail_handle.name
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None


def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
                 j_field=default_j_field, max_missing=default_max_missing,
                 group_fields=None, group_func=groupByGene, group_args={},
                 clone_func=distanceClones, clone_args={},
                 format=default_format, out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
      db_file : filename of input database.
      seq_field : sequence field used to determine clones.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V, J and junction length.
      group_func : the function to use for assigning preclones.
      group_args : a dictionary of arguments to pass to group_func.
      clone_func : the function to use for determining clones within preclonal groups.
      clone_args : a dictionary of arguments to pass to clone_func.
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.

    Returns:
      dict : dictionary of output pass and fail files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['J_FIELD'] = j_field
    log['MAX_MISSING'] = max_missing
    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
    for k in sorted(group_args):
        log[k.upper()] = group_args[k]
    for k in sorted(clone_args):
        if k != 'dist_mat': log[k.upper()] = clone_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Translate to Receptor attribute names
    seq_field = schema.toReceptor(seq_field)
    v_field = schema.toReceptor(v_field)
    j_field = schema.toReceptor(j_field)
    if group_fields is not None:
        group_fields = [schema.toReceptor(f) for f in group_fields]

    # Define feeder function and arguments
    group_args['group_fields'] = group_fields
    group_args['v_field'] = v_field
    group_args['j_field'] = j_field
    feed_args = {'db_file': db_file,
                 'reader': reader,
                 'group_func': group_func,
                 'group_args': group_args}

    # Define worker function and arguments
    filter_args = {'seq_field': seq_field,
                   'v_field': v_field,
                   'j_field': j_field,
                   'max_missing': max_missing}
    clone_args['seq_field'] = seq_field
    work_args = {'process_func': clone_func,
                 'process_args': clone_args,
                 'filter_func': filterMissing,
                 'filter_args': filter_args}

    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_args = {'db_file': db_file,
                    'fields': out_fields,
                    'writer': writer,
                    'out_file': out_file,
                    'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue,
                             feed_args=feed_args, work_args=work_args, collect_args=collect_args,
                             nproc=nproc, queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output
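
# A minimal invocation sketch (hypothetical file name and threshold; mirrors the
# argument handling in the __main__ block below):
#   clone_args = {'model': 'ham', 'distance': 0.16, 'norm': 'len', 'sym': 'avg',
#                 'linkage': 'single', 'dist_mat': distance_models['ham']}
#   defineClones('data_db-pass.tsv', format='airr', clone_args=clone_args)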


def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
      None

    Returns:
      an ArgumentParser object
    """
    # Define input and output fields
    fields = dedent(
             '''
             output files:
                 clone-pass
                     database with assigned clonal group numbers.
                 clone-fail
                     database with records failing clonal grouping.

             required fields:
                 SEQUENCE_ID, V_CALL, J_CALL, JUNCTION

             output fields:
                 CLONE
             ''')
    # Define argument parser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            parents=[getCommonArgParser(format=False, multiproc=True)],
                            formatter_class=CommonHelpFormatter, add_help=False)

    # Distance cloning method
    group = parser.add_argument_group('cloning arguments')
    group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field,
                       help='Field to be used to calculate distance between records.')
    group.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
                       help='Field containing the germline V segment call.')
    group.add_argument('--jf', action='store', dest='j_field', default=default_j_field,
                       help='Field containing the germline J segment call.')
    group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None,
                       help='Additional fields to use for grouping clones aside from V, J and junction length.')
    group.add_argument('--mode', action='store', dest='mode',
                       choices=('allele', 'gene'), default=default_index_mode,
                       help='''Specifies whether to use the V(D)J allele or gene for
                            initial grouping.''')
    group.add_argument('--act', action='store', dest='action',
                       choices=('first', 'set'), default=default_index_action,
                       help='''Specifies how to handle multiple V(D)J assignments for initial grouping.
                            The "first" action will use only the first gene listed.
                            The "set" action will use all gene assignments and construct a larger gene
                            grouping composed of any sequences sharing an assignment or linked to another
                            sequence by a common assignment (similar to single-linkage).''')
    group.add_argument('--model', action='store', dest='model',
                       choices=choices_distance_model,
                       default=default_distance_model,
                       help='''Specifies which substitution model to use for calculating distance
                            between sequences. The "ham" model is nucleotide Hamming distance and
                            "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
                            human specific single nucleotide and 5-mer content models, respectively,
                            from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
                            mouse specific single nucleotide and 5-mer content models, respectively,
                            from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
                            deprecated models provided for backwards compatibility with the "m1n" and
                            "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
                            5-mer models should be considered experimental.''')
    group.add_argument('--dist', action='store', dest='distance', type=float,
                       default=default_distance,
                       help='The distance threshold for clonal grouping.')
    group.add_argument('--norm', action='store', dest='norm',
                       choices=('len', 'mut', 'none'), default=default_norm,
                       help='''Specifies how to normalize distances. One of none
                            (do not normalize), len (normalize by length),
                            or mut (normalize by number of mutations between sequences).''')
    group.add_argument('--sym', action='store', dest='sym',
                       choices=('avg', 'min'), default=default_sym,
                       help='''Specifies how to combine asymmetric distances. One of avg
                            (average of A->B and B->A) or min (minimum of A->B and B->A).''')
    group.add_argument('--link', action='store', dest='linkage',
                       choices=('single', 'average', 'complete'), default=default_linkage,
                       help='''Type of linkage to use for hierarchical clustering.''')
    group.add_argument('--maxmiss', action='store', dest='max_missing', type=int,
                       default=default_max_missing,
                       help='''The maximum number of non-ACGT characters (gaps or Ns) to
                            permit in the junction sequence before excluding the record
                            from clonal assignment. Note, under single linkage
                            non-informative positions can create artifactual links
                            between unrelated sequences. Use with caution.''')
    parser.set_defaults(group_func=groupByGene)
    parser.set_defaults(clone_func=distanceClones)

    return parser
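
# An example command line (hypothetical input file and threshold), assuming the
# common argument parser supplies -d for the input database file(s):
#   DefineClones.py -d data_db-pass.tsv --model ham --dist 0.16 --norm len --act set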


if __name__ == '__main__':
    """
    Parses command line arguments and calls main function
    """
    # Parse arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # # Set default fields if not specified.
    # default_fields = {'seq_field': default_junction_field,
    #                   'v_field': default_v_field,
    #                   'j_field': default_j_field}
    #
    # # Default Change-O fields
    # if args_dict['format'] == 'changeo':
    #     for f in default_fields:
    #         if args_dict[f] is None: args_dict[f] = default_fields[f]
    #         else: args_dict[f] = args_dict[f].upper()
    #
    # # Default AIRR fields
    # if args_dict['format'] == 'airr':
    #     for f in default_fields:
    #         if args_dict[f] is None: args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])
    #         else: args_dict[f] = args_dict[f].lower()

    # Define grouping and cloning function arguments
    args_dict['group_args'] = {'action': args_dict['action'],
                               'mode': args_dict['mode']}
    args_dict['clone_args'] = {'model': args_dict['model'],
                               'distance': args_dict['distance'],
                               'norm': args_dict['norm'],
                               'sym': args_dict['sym'],
                               'linkage': args_dict['linkage']}

    # Get distance matrix
    try:
        args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
    except KeyError:
        printError('Unrecognized distance model: %s' % args_dict['model'])

    # Clean cloning arguments from the argument dictionary
    del args_dict['action']
    del args_dict['mode']
    del args_dict['model']
    del args_dict['distance']
    del args_dict['norm']
    del args_dict['sym']
    del args_dict['linkage']

    # Clean input/output arguments from the argument dictionary
    del args_dict['db_files']
    if 'out_files' in args_dict: del args_dict['out_files']

    # Call main function for each input file
    for i, f in enumerate(args.__dict__['db_files']):
        args_dict['db_file'] = f
        args_dict['out_file'] = args.__dict__['out_files'][i] \
                                if args.__dict__['out_files'] else None
        defineClones(**args_dict)
|