comparison: change_o/DefineClones.py @ 78:aff3ba86ef7a (draft)

description: Uploaded
author: davidvanzessen
date: Mon, 31 Aug 2020 11:20:08 -0400
comparison: 77:58d2377b507d -> 78:aff3ba86ef7a
#!/usr/bin/env python3
"""
Assign Ig sequences into clones
"""

# Info
__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
from changeo import __version__, __date__

# Imports
import os
import re
import sys
from argparse import ArgumentParser
from collections import OrderedDict
from itertools import chain
from textwrap import dedent
from time import time
from Bio.Seq import translate

# Presto and changeo imports
from presto.Defaults import default_out_args
from presto.IO import printLog, printProgress, printCount, printWarning, printError
from presto.Multiprocessing import manageProcesses
from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field
from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from changeo.Distance import distance_models, calcDistances, formClusters
from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \
                       AIRRWriter, ChangeoWriter
from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue

# Defaults
default_translate = False
default_distance = 0.0
default_index_mode = 'gene'
default_index_action = 'set'
default_distance_model = 'ham'
default_norm = 'len'
default_sym = 'avg'
default_linkage = 'single'
default_max_missing = 0
choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f',
                          'mk_rs1nf', 'mk_rs5nf',
                          'hs1f_compat', 'm1n_compat')

def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,
                  j_field=default_j_field, max_missing=default_max_missing):
    """
    Splits a set of sequences into passed and failed groups based on the number
    of missing characters in the sequence

    Arguments:
        data : changeo.Multiprocessing.DbData object.
        seq_field : sequence field to filter on.
        v_field : field containing the V call.
        j_field : field containing the J call.
        max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.

    Returns:
        changeo.Multiprocessing.DbResult : object containing filtered records.
    """
    # Function to validate the sequence string
    def _pass(seq):
        if len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing:
            return True
        else:
            return False

    # Define result object for iteration and get data records
    result = DbResult(data.id, data.data)

    if not data:
        result.data_pass = []
        result.data_fail = data.data
        return result

    result.data_pass = []
    result.data_fail = []
    for rec in data.data:
        seq = rec.getField(seq_field)
        if _pass(seq):  result.data_pass.append(rec)
        else:  result.data_fail.append(rec)

    # Add V(D)J to log
    result.log['ID'] = ','.join([str(x) for x in data.id])
    result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data]))
    result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data]))
    result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) if r.junction is not None else '0') for r in data.data]))
    result.log['CLONED'] = len(result.data_pass)
    result.log['FILTERED'] = len(result.data_fail)

    return result

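# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the original module): the
# pass/fail rule above counts non-ACGT characters in the junction, so gap
# characters ('.', '-') and Ns both count toward max_missing. With the
# default max_missing=0, a single N fails the record:
#
#   >>> import re
#   >>> def _pass(seq, max_missing=0):
#   ...     return len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing
#   >>> _pass('TGTGCAAGA')
#   True
#   >>> _pass('TGTGCNAGA')
#   False
#   >>> _pass('TGTGCNAGA', max_missing=1)
#   True
# ----------------------------------------------------------------------------
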
def indexByIdentity(index, key, rec, group_fields=None):
    """
    Updates a preclone index with a simple key

    Arguments:
        index : preclone index from groupByGene
        key : index key
        rec : Receptor to add to the index
        group_fields : additional annotation fields to use to group preclones;
                       if None use only V, J and junction length

    Returns:
        None : Updates index with new key and records.
    """
    index.setdefault(tuple(key), []).append(rec)

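# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note): indexByIdentity treats the whole
# (V, J, junction length, ...) key as one hashable tuple, so records only
# group together when every key element matches exactly:
#
#   >>> index = {}
#   >>> index.setdefault(('IGHV1-2', 'IGHJ4', 48), []).append('rec1')
#   >>> index.setdefault(('IGHV1-2', 'IGHJ4', 48), []).append('rec2')
#   >>> index
#   {('IGHV1-2', 'IGHJ4', 48): ['rec1', 'rec2']}
# ----------------------------------------------------------------------------
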
def indexByUnion(index, key, rec, group_fields=None):
    """
    Updates a preclone index with the union of nested keys

    Arguments:
        index : preclone index from groupByGene
        key : index key
        rec : Receptor to add to the index
        group_fields : additional annotation fields to use to group preclones;
                       if None use only V, J and junction length

    Returns:
        None : Updates index with new key and records.
    """
    # List of values for this/new key
    val = [rec]
    f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0)))

    # See if field/junction length combination exists in index
    outer_dict = index
    for field in f_range:
        try:
            outer_dict = outer_dict[key[field]]
        except KeyError:
            outer_dict = None
            break
    # If field combination exists, look through Js
    j_matches = []
    if outer_dict is not None:
        for j in outer_dict.keys():
            if not set(key[1]).isdisjoint(set(j)):
                key[1] = tuple(set(key[1]).union(set(j)))
                j_matches += [j]
    # If J overlap exists, look through Vs for each J
    for j in j_matches:
        v_matches = []
        # Collect V matches for this J
        for v in outer_dict[j].keys():
            if not set(key[0]).isdisjoint(set(v)):
                key[0] = tuple(set(key[0]).union(set(v)))
                v_matches += [v]
        # If there are V overlaps for this J, pop them out
        if v_matches:
            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
        # If the J dict is now empty, remove it
        if not outer_dict[j]:
            outer_dict.pop(j, None)

    # Add value(s) into the nested index dictionary.
    # Note: this relies on dicts being shared by reference, so mutating
    # outer_dict below also updates the index passed in by the caller.
    # Add field dictionaries into index
    outer_dict = index
    for field in f_range:
        outer_dict.setdefault(key[field], {})
        outer_dict = outer_dict[key[field]]
    # Add J, then V into index
    if key[1] in outer_dict:
        outer_dict[key[1]].update({key[0]: val})
    else:
        outer_dict[key[1]] = {key[0]: val}

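# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note): indexByUnion nests the index as
# index[junction length][...group fields][J set][V set] and merges entries
# whose V and J assignment sets overlap. Two records sharing only one of
# several ambiguous gene calls still land in the same preclone group:
#
#   >>> set(('IGHV1-2', 'IGHV1-46')).isdisjoint(set(('IGHV1-46',)))
#   False
#
# Because the sets overlap, the key is replaced by their union and the record
# lists are concatenated; this is what makes the 'set' action behave like
# single-linkage over shared gene assignments.
# ----------------------------------------------------------------------------
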
def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field,
                mode=default_index_mode, action=default_index_action):
    """
    Identifies preclonal groups by V, J and junction length

    Arguments:
        db_iter : an iterator of Receptor objects defined by ChangeoReader
        group_fields : additional annotation fields to use to group preclones;
                       if None use only V, J and junction length
        mode : specificity of alignment call to use for assigning preclones;
               one of ('allele', 'gene')
        action : how to handle multiple value fields when assigning preclones;
                 one of ('first', 'set')

    Returns:
        dict: dictionary of {(V, J, junction length):[Receptor]}
    """
    # Define functions for grouping keys
    if mode == 'allele' and group_fields is None:
        def _get_key(rec, act):
            return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
                    None if rec.junction is None else len(rec.junction)]
    elif mode == 'gene' and group_fields is None:
        def _get_key(rec, act):
            return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
                    None if rec.junction is None else len(rec.junction)]
    elif mode == 'allele' and group_fields is not None:
        def _get_key(rec, act):
            vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
                   None if rec.junction is None else len(rec.junction)]
            ann = [rec.getField(k) for k in group_fields]
            return list(chain(vdj, ann))
    elif mode == 'gene' and group_fields is not None:
        def _get_key(rec, act):
            vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
                   None if rec.junction is None else len(rec.junction)]
            ann = [rec.getField(k) for k in group_fields]
            return list(chain(vdj, ann))

    # Function to flatten nested dictionary
    def _flatten_dict(d, parent_key=''):
        items = []
        for k, v in d.items():
            new_key = parent_key + [k] if parent_key else [k]
            if isinstance(v, dict):
                items.extend(_flatten_dict(v, new_key).items())
            else:
                items.append((new_key, v))
        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
        return flat_dict

    if action == 'first':
        index_func = indexByIdentity
    elif action == 'set':
        index_func = indexByUnion
    else:
        printError('Unrecognized action: %s.' % action)

    start_time = time()
    clone_index = {}
    rec_count = 0
    for rec in db_iter:
        key = _get_key(rec, action)

        # Print progress
        printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences')
        rec_count += 1

        # Assign passed preclone records to their key and failed records to the None key
        if all([k is not None and k != '' for k in key]):
            # Update index dictionary
            index_func(clone_index, key, rec, group_fields)
        else:
            clone_index.setdefault(None, []).append(rec)

    printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True)

    if action == 'set':
        clone_index = _flatten_dict(clone_index)

    return clone_index

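# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note): with the default mode='gene' and no
# group_fields, the grouping key is [V gene, J gene, junction length], and a
# record missing any element (e.g. a junction of None) falls into the None
# group. The flattening step then turns the nested 'set' index back into a
# single flat dictionary for downstream code:
#
#   nested 'set' index:  {48: {('IGHJ4',): {('IGHV1-2',): [rec1, rec2]}}}
#   flattened result:    {(48, ('IGHJ4',), ('IGHV1-2',)): [rec1, rec2]}
# ----------------------------------------------------------------------------
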
def distanceClones(result, seq_field=default_junction_field, model=default_distance_model,
                   distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym,
                   linkage=default_linkage):
    """
    Separates a set of Receptor objects into clones

    Arguments:
        result : a changeo.Multiprocessing.DbResult object with filtered records to clone
        seq_field : sequence field used to calculate distance between records
        model : substitution model used to calculate distance
        distance : the distance threshold to assign clonal groups
        dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances
        norm : normalization method
        sym : symmetry method
        linkage : type of linkage

    Returns:
        changeo.Multiprocessing.DbResult : an updated DbResult object
    """
    # Get distance matrix if not provided
    if dist_mat is None:
        try:
            dist_mat = distance_models[model]
        except KeyError:
            printError('Unrecognized distance model: %s' % model)

    # TODO: can be cleaned up with abstract model class
    # Determine length of n-mers
    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
        nmer_len = 1
    elif model in ['hh_s5f', 'mk_rs5nf']:
        nmer_len = 5
    else:
        printError('Unrecognized distance model: %s.\n' % model)

    # Define unique junction mapping
    seq_map = {}
    for rec in result.data_pass:
        seq = rec.getField(seq_field)
        seq = re.sub(r'[.-]', 'N', seq)
        if model == 'aa':  seq = translate(seq)
        seq_map.setdefault(seq, []).append(rec)

    # Define sequences
    sequences = list(seq_map.keys())

    # Zero record case
    if not sequences:
        result.valid = False
        result.log['CLONES'] = 0
        return result

    # Single record case
    if len(sequences) == 1:
        result.results = {1: result.data_pass}
        result.valid = True
        result.log['CLONES'] = 1
        return result

    # Calculate pairwise distance matrix
    dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm)

    # Perform hierarchical clustering
    clusters = formClusters(dists, linkage, distance)

    # Turn clusters into clone dictionary
    clone_dict = {}
    for i, c in enumerate(clusters):
        clone_dict.setdefault(c, []).extend(seq_map[sequences[i]])

    if clone_dict:
        result.results = clone_dict
        result.valid = True
        result.log['CLONES'] = len(clone_dict)
    else:
        result.log['CLONES'] = 0

    return result

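# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note): for the default model='ham' with
# norm='len', the distance between two junctions is the Hamming distance
# divided by the sequence length, and clones are formed by cutting the
# hierarchical clustering tree at the --dist threshold. A minimal version of
# the per-pair calculation, assuming equal-length sequences:
#
#   >>> s1, s2 = 'TGTGCAAGA', 'TGTGCGAGA'
#   >>> sum(a != b for a, b in zip(s1, s2)) / len(s1)
#   0.1111111111111111
#
# At a threshold of --dist 0.16 these two junctions would fall in the same
# clone; at --dist 0.1 they would not.
# ----------------------------------------------------------------------------
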
def collectQueue(alive, result_queue, collect_queue, db_file, fields,
                 writer=ChangeoWriter, out_file=None, out_args=default_out_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
        alive : a multiprocessing.Value boolean controlling whether processing continues;
                if False exit process
        result_queue : a multiprocessing.Queue holding processQueue results
        collect_queue : a multiprocessing.Queue to store collector return values
        db_file : the input database file name
        fields : list of output field names
        writer : writer class.
        out_file : output file name. Automatically generated from the input file if None.
        out_args : common output argument dictionary from parseCommonArgs

    Returns:
        None : Adds a dictionary with key value pairs to collect_queue containing
               'log' defining a log object along with the 'pass' and 'fail' output file names.
    """
    # Wrapper for opening handles and writers
    def _open(x, f, writer=writer, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(db_file,
                                     out_label='clone-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle, writer(handle, fields=f)

    # Open log file
    try:
        # Count input records
        result_count = countDbFile(db_file)

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        # Initialize handles, writers and counters
        pass_handle, pass_writer = None, None
        fail_handle, fail_writer = None, None
        rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0
        start_time = time()

        # Iterate over the results queue until the sentinel object is reached
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration and update record count
            printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        pass_count += 1
                        rec.setField('clone', str(clone_count))
                        result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction
                        try:
                            pass_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open pass file and define writer object
                            pass_handle, pass_writer = _open('pass', fields)
                            pass_writer.writeReceptor(rec)

                # Write failed sequences from passing sets
                if result.data_fail:
                    # Write failed sequences
                    for i, rec in enumerate(result.data_fail, start=1):
                        fail_count += 1
                        result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction
                        if out_args['failed']:
                            try:
                                fail_writer.writeReceptor(rec)
                            except AttributeError:
                                # Open fail file and define writer object
                                fail_handle, fail_writer = _open('fail', fields)
                                fail_writer.writeReceptor(rec)
            else:
                # Write failing records
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    result.log['CLONE0-%i' % (i)] = rec.junction
                    if out_args['failed']:
                        try:
                            fail_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open fail file and define writer object
                            fail_handle, fail_writer = _open('fail', fields)
                            fail_writer.writeReceptor(rec)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data
        collect_dict = {'log': log, 'pass': None, 'fail': None}
        if pass_handle is not None:
            collect_dict['pass'] = pass_handle.name
            pass_handle.close()
        if fail_handle is not None:
            collect_dict['fail'] = fail_handle.name
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None

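# ----------------------------------------------------------------------------
# Editor's note on the try/except AttributeError pattern above: the pass and
# fail writers start as None and are created on first use, so no output file
# is touched unless at least one record is actually written to it. A minimal
# sketch of the same idiom:
#
#   >>> writer = None
#   >>> try:
#   ...     writer.write('x')        # AttributeError: writer is still None
#   ... except AttributeError:
#   ...     import io
#   ...     writer = io.StringIO()   # open lazily, then retry
#   ...     writer.write('x')
#   1
# ----------------------------------------------------------------------------
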
def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
                 j_field=default_j_field, max_missing=default_max_missing,
                 group_fields=None, group_func=groupByGene, group_args={},
                 clone_func=distanceClones, clone_args={},
                 format=default_format, out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
        db_file : filename of input database.
        seq_field : sequence field used to determine clones.
        v_field : field containing the V call.
        j_field : field containing the J call.
        max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
        group_fields : additional annotation fields to use to group preclones;
                       if None use only V, J and junction length.
        group_func : the function to use for assigning preclones.
        group_args : a dictionary of arguments to pass to group_func.
        clone_func : the function to use for determining clones within preclonal groups.
        clone_args : a dictionary of arguments to pass to clone_func.
        format : input and output format.
        out_file : output file name. Automatically generated from the input file if None.
        out_args : common output argument dictionary from parseCommonArgs.
        nproc : the number of processQueue processes;
                if None defaults to the number of CPUs.
        queue_size : maximum size of the argument queue;
                     if None defaults to 2*nproc.

    Returns:
        dict : dictionary of output pass and fail files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['J_FIELD'] = j_field
    log['MAX_MISSING'] = max_missing
    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
    for k in sorted(group_args):
        log[k.upper()] = group_args[k]
    for k in sorted(clone_args):
        if k != 'dist_mat':  log[k.upper()] = clone_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Translate to Receptor attribute names
    seq_field = schema.toReceptor(seq_field)
    v_field = schema.toReceptor(v_field)
    j_field = schema.toReceptor(j_field)
    if group_fields is not None:
        group_fields = [schema.toReceptor(f) for f in group_fields]

    # Define feeder function and arguments
    group_args['group_fields'] = group_fields
    group_args['v_field'] = v_field
    group_args['j_field'] = j_field
    feed_args = {'db_file': db_file,
                 'reader': reader,
                 'group_func': group_func,
                 'group_args': group_args}

    # Define worker function and arguments
    filter_args = {'seq_field': seq_field,
                   'v_field': v_field,
                   'j_field': j_field,
                   'max_missing': max_missing}
    clone_args['seq_field'] = seq_field
    work_args = {'process_func': clone_func,
                 'process_args': clone_args,
                 'filter_func': filterMissing,
                 'filter_args': filter_args}

    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_args = {'db_file': db_file,
                    'fields': out_fields,
                    'writer': writer,
                    'out_file': out_file,
                    'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue,
                             feed_args=feed_args, work_args=work_args, collect_args=collect_args,
                             nproc=nproc, queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output

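# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note): a minimal programmatic call, assuming a
# Change-O tab-delimited database 'sample_db.tab' with SEQUENCE_ID, V_CALL,
# J_CALL and JUNCTION columns (the file name and threshold are hypothetical):
#
#   >>> from changeo.Distance import distance_models
#   >>> out = defineClones('sample_db.tab',
#   ...                    group_args={'action': 'set', 'mode': 'gene'},
#   ...                    clone_args={'model': 'ham', 'distance': 0.16,
#   ...                                'norm': 'len', 'sym': 'avg',
#   ...                                'linkage': 'single',
#   ...                                'dist_mat': distance_models['ham']},
#   ...                    format='changeo')
#
# 'out' is then a dict with 'pass' and 'fail' file names, mirroring what the
# command line entry point below assembles from parsed arguments.
# ----------------------------------------------------------------------------
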
def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
        None

    Returns:
        an ArgumentParser object
    """
    # Define input and output fields
    fields = dedent(
             '''
             output files:
                 clone-pass
                     database with assigned clonal group numbers.
                 clone-fail
                     database with records failing clonal grouping.

             required fields:
                 SEQUENCE_ID, V_CALL, J_CALL, JUNCTION

             output fields:
                 CLONE
             ''')
    # Define argument parser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            parents=[getCommonArgParser(format=False, multiproc=True)],
                            formatter_class=CommonHelpFormatter, add_help=False)

    # Distance cloning method
    group = parser.add_argument_group('cloning arguments')
    group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field,
                       help='Field to be used to calculate distance between records.')
    group.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
                       help='Field containing the germline V segment call.')
    group.add_argument('--jf', action='store', dest='j_field', default=default_j_field,
                       help='Field containing the germline J segment call.')
    group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None,
                       help='Additional fields to use for grouping clones aside from V, J and junction length.')
    group.add_argument('--mode', action='store', dest='mode',
                       choices=('allele', 'gene'), default=default_index_mode,
                       help='''Specifies whether to use the V(D)J allele or gene for
                            initial grouping.''')
    group.add_argument('--act', action='store', dest='action',
                       choices=('first', 'set'), default=default_index_action,
                       help='''Specifies how to handle multiple V(D)J assignments for initial grouping.
                            The "first" action will use only the first gene listed.
                            The "set" action will use all gene assignments and construct a larger gene
                            grouping composed of any sequences sharing an assignment or linked to another
                            sequence by a common assignment (similar to single-linkage).''')
    group.add_argument('--model', action='store', dest='model',
                       choices=choices_distance_model,
                       default=default_distance_model,
                       help='''Specifies which substitution model to use for calculating distance
                            between sequences. The "ham" model is nucleotide Hamming distance and
                            "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
                            human specific single nucleotide and 5-mer content models, respectively,
                            from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
                            mouse specific single nucleotide and 5-mer content models, respectively,
                            from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
                            deprecated models provided for backwards compatibility with the "m1n" and
                            "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
                            5-mer models should be considered experimental.''')
    group.add_argument('--dist', action='store', dest='distance', type=float,
                       default=default_distance,
                       help='The distance threshold for clonal grouping.')
    group.add_argument('--norm', action='store', dest='norm',
                       choices=('len', 'mut', 'none'), default=default_norm,
                       help='''Specifies how to normalize distances. One of none
                            (do not normalize), len (normalize by length),
                            or mut (normalize by number of mutations between sequences).''')
    group.add_argument('--sym', action='store', dest='sym',
                       choices=('avg', 'min'), default=default_sym,
                       help='''Specifies how to combine asymmetric distances. One of avg
                            (average of A->B and B->A) or min (minimum of A->B and B->A).''')
    group.add_argument('--link', action='store', dest='linkage',
                       choices=('single', 'average', 'complete'), default=default_linkage,
                       help='''Type of linkage to use for hierarchical clustering.''')
    group.add_argument('--maxmiss', action='store', dest='max_missing', type=int,
                       default=default_max_missing,
                       help='''The maximum number of non-ACGT characters (gaps or Ns) to
                            permit in the junction sequence before excluding the record
                            from clonal assignment. Note, under single linkage
                            non-informative positions can create artifactual links
                            between unrelated sequences. Use with caution.''')
    parser.set_defaults(group_func=groupByGene)
    parser.set_defaults(clone_func=distanceClones)

    return parser

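# ----------------------------------------------------------------------------
# Illustrative sketch (editor's note): a typical command line built from the
# arguments defined above (the file name and threshold are examples, not
# values taken from this changeset):
#
#   DefineClones.py -d sample_db.tab --act set --model ham \
#       --norm len --dist 0.16 --outname sample
#
# -d and --outname come from the common argument parser added via
# getCommonArgParser; the cloning-specific flags are the ones defined above.
# ----------------------------------------------------------------------------
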
if __name__ == '__main__':
    """
    Parses command line arguments and calls main function
    """
    # Parse arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # # Set default fields if not specified.
    # default_fields = {'seq_field': default_junction_field,
    #                   'v_field': default_v_field,
    #                   'j_field': default_j_field}
    #
    # # Default Change-O fields
    # if args_dict['format'] == 'changeo':
    #     for f in default_fields:
    #         if args_dict[f] is None:  args_dict[f] = default_fields[f]
    #         else:  args_dict[f] = args_dict[f].upper()
    #
    # # Default AIRR fields
    # if args_dict['format'] == 'airr':
    #     for f in default_fields:
    #         if args_dict[f] is None:  args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])
    #         else:  args_dict[f] = args_dict[f].lower()

    # Define grouping and cloning function arguments
    args_dict['group_args'] = {'action': args_dict['action'],
                               'mode': args_dict['mode']}
    args_dict['clone_args'] = {'model': args_dict['model'],
                               'distance': args_dict['distance'],
                               'norm': args_dict['norm'],
                               'sym': args_dict['sym'],
                               'linkage': args_dict['linkage']}

    # Get distance matrix
    try:
        args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
    except KeyError:
        printError('Unrecognized distance model: %s' % args_dict['model'])

    # Clean argument dictionary of cloning arguments now stored in group_args/clone_args
    del args_dict['action']
    del args_dict['mode']
    del args_dict['model']
    del args_dict['distance']
    del args_dict['norm']
    del args_dict['sym']
    del args_dict['linkage']

    # Clean argument dictionary of file arguments handled per input file below
    del args_dict['db_files']
    if 'out_files' in args_dict:  del args_dict['out_files']

    # Call main function for each input file
    for i, f in enumerate(args.__dict__['db_files']):
        args_dict['db_file'] = f
        args_dict['out_file'] = args.__dict__['out_files'][i] \
                                if args.__dict__['out_files'] else None
        defineClones(**args_dict)