comparison change_o/DefineClones.py @ 78:aff3ba86ef7a draft

Uploaded
author davidvanzessen
date Mon, 31 Aug 2020 11:20:08 -0400
parents
children
comparison
equal deleted inserted replaced
77:58d2377b507d 78:aff3ba86ef7a
1 #!/usr/bin/env python3
2 """
3 Assign Ig sequences into clones
4 """
5
6 # Info
7 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
8 from changeo import __version__, __date__
9
10 # Imports
11 import os
12 import re
13 import sys
14 from argparse import ArgumentParser
15 from collections import OrderedDict
16 from itertools import chain
17 from textwrap import dedent
18 from time import time
19 from Bio.Seq import translate
20
21 # Presto and changeo imports
22 from presto.Defaults import default_out_args
23 from presto.IO import printLog, printProgress, printCount, printWarning, printError
24 from presto.Multiprocessing import manageProcesses
25 from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field
26 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
27 from changeo.Distance import distance_models, calcDistances, formClusters
28 from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \
29 AIRRWriter, ChangeoWriter
30 from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue
31
# Defaults
default_translate = False
default_distance = 0.0            # distance threshold for clonal grouping (--dist)
default_index_mode = 'gene'       # V(D)J call specificity for preclone grouping (--mode)
default_index_action = 'set'      # handling of multi-valued V(D)J calls (--act)
default_distance_model = 'ham'    # substitution model for distance calculation (--model)
default_norm = 'len'              # distance normalization method (--norm)
default_sym = 'avg'               # asymmetric distance combination method (--sym)
default_linkage = 'single'        # hierarchical clustering linkage (--link)
default_max_missing = 0           # max non-ACGT characters allowed in the junction (--maxmiss)
choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f',
                          'mk_rs1nf', 'mk_rs5nf',
                          'hs1f_compat', 'm1n_compat')
45
46
def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,
                  j_field=default_j_field, max_missing=default_max_missing):
    """
    Splits a set of sequences into passed and failed groups based on the number
    of missing characters in the sequence

    Arguments:
      data : changeo.Multiprocessing.DbData object.
      seq_field : sequence field to filter on.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.

    Returns:
      changeo.Multiprocessing.DbResult : object containing filtered records.
    """
    # Function to validate the sequence string: must be non-empty with at most
    # max_missing non-ACGT characters. A None/empty sequence always fails
    # (previously len(None) raised TypeError).
    def _pass(seq):
        return bool(seq) and len(re.findall(r'[^ACGT]', seq)) <= max_missing

    # Define result object for iteration and get data records
    result = DbResult(data.id, data.data)

    # An invalid DbData object fails the entire group without logging V(D)J info
    if not data:
        result.data_pass = []
        result.data_fail = data.data
        return result

    result.data_pass = []
    result.data_fail = []
    for rec in data.data:
        if _pass(rec.getField(seq_field)):
            result.data_pass.append(rec)
        else:
            result.data_fail.append(rec)

    # Add V(D)J to log
    result.log['ID'] = ','.join([str(x) for x in data.id])
    result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data]))
    result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data]))
    # Fixed: previous form `str(len(r.junction)) or '0'` raised TypeError for a
    # None junction and the `or '0'` branch was unreachable (str(int) is never empty).
    result.log['JUNCLEN'] = ','.join(set(['0' if r.junction is None else str(len(r.junction))
                                          for r in data.data]))
    result.log['CLONED'] = len(result.data_pass)
    result.log['FILTERED'] = len(result.data_fail)

    return result
94
95
def indexByIdentity(index, key, rec, group_fields=None):
    """
    Updates a preclone index with a simple key

    Arguments:
      index : preclone index from groupByGene
      key : index key
      rec : Receptor to add to the index
      group_fields : additional annotation fields to use to group preclones;
          if None use only V, J and junction length. Unused here; accepted
          for interface parity with indexByUnion.

    Returns:
      None : Updates index with new key and records.
    """
    # Lists are unhashable, so the key is frozen into a tuple before lookup
    hashable_key = tuple(key)
    if hashable_key not in index:
        index[hashable_key] = []
    index[hashable_key].append(rec)
111
112
def indexByUnion(index, key, rec, group_fields=None):
    """
    Updates a preclone index with the union of nested keys

    The index is a nested dictionary keyed first by junction length and any
    additional grouping-field values, then by J assignment set, then by V
    assignment set. Any existing entries whose V/J sets overlap the new key
    are merged into it (single-linkage style), and `key` is mutated in place
    so key[0] / key[1] hold the unioned V / J tuples.

    Arguments:
      index : preclone index from groupByGene
      key : index key; key[0] is the V set, key[1] the J set, key[2] the
            junction length, followed by any group_fields values
      rec : Receptor to add to the index
      group_fields : additional annotation fields to use to group preclones;
          if None use only V, J and junction length

    Returns:
      None : Updates index with new key and records.
    """
    # List of values for this/new key
    val = [rec]
    # Indices of the non-V/J key components: junction length plus extra fields
    f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0)))

    # See if field/junction length combination exists in index
    outer_dict = index
    for field in f_range:
        try:
            outer_dict = outer_dict[key[field]]
        except KeyError:
            outer_dict = None
            break
    # If field combination exists, look through Js
    j_matches = []
    if outer_dict is not None:
        for j in outer_dict.keys():
            # Any shared J assignment links the groups; union the J sets
            if not set(key[1]).isdisjoint(set(j)):
                key[1] = tuple(set(key[1]).union(set(j)))
                j_matches += [j]
    # If J overlap exists, look through Vs for each J
    for j in j_matches:
        v_matches = []
        # Collect V matches for this J
        for v in outer_dict[j].keys():
            if not set(key[0]).isdisjoint(set(v)):
                key[0] = tuple(set(key[0]).union(set(v)))
                v_matches += [v]
        # If there are V overlaps for this J, pop them out
        # (their records are absorbed into val for re-insertion under the merged key)
        if v_matches:
            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
        # If the J dict is now empty, remove it
        if not outer_dict[j]:
            outer_dict.pop(j, None)

    # Add value(s) into index nested dictionary
    # OMG Python pointers are the best!
    # Add field dictionaries into index
    outer_dict = index
    for field in f_range:
        outer_dict.setdefault(key[field], {})
        outer_dict = outer_dict[key[field]]
    # Add J, then V into index
    if key[1] in outer_dict:
        outer_dict[key[1]].update({key[0]: val})
    else:
        outer_dict[key[1]] = {key[0]: val}
173
174
def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field,
                mode=default_index_mode, action=default_index_action):
    """
    Identifies preclonal groups by V, J and junction length

    Arguments:
      db_iter : an iterator of Receptor objects defined by ChangeoReader
      group_fields : additional annotation fields to use to group preclones;
          if None use only V, J and junction length
      mode : specificity of alignment call to use for assigning preclones;
             one of ('allele', 'gene')
      action : how to handle multiple value fields when assigning preclones;
               one of ('first', 'set')

    Returns:
      dict : dictionary of {(V, J, junction length):[Receptor]}
    """
    # Define functions for grouping keys
    if mode == 'allele' and group_fields is None:
        def _get_key(rec, act):
            return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
                    None if rec.junction is None else len(rec.junction)]
    elif mode == 'gene' and group_fields is None:
        def _get_key(rec, act):
            return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
                    None if rec.junction is None else len(rec.junction)]
    elif mode == 'allele' and group_fields is not None:
        def _get_key(rec, act):
            vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
                   None if rec.junction is None else len(rec.junction)]
            ann = [rec.getField(k) for k in group_fields]
            return list(chain(vdj, ann))
    elif mode == 'gene' and group_fields is not None:
        def _get_key(rec, act):
            vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
                   None if rec.junction is None else len(rec.junction)]
            ann = [rec.getField(k) for k in group_fields]
            return list(chain(vdj, ann))
    else:
        # Fail fast; previously an invalid mode left _get_key undefined and
        # raised UnboundLocalError on the first record.
        printError('Unrecognized mode: %s.' % mode)

    # Function to flatten the nested dictionary produced by the 'set' action
    # into {tuple(key path): [Receptor]}; a key path containing None maps to None.
    def _flatten_dict(d, parent_key=''):
        items = []
        for k, v in d.items():
            new_key = parent_key + [k] if parent_key else [k]
            if isinstance(v, dict):
                items.extend(_flatten_dict(v, new_key).items())
            else:
                items.append((new_key, v))
        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
        return flat_dict

    # Select indexing implementation for the chosen action
    if action == 'first':
        index_func = indexByIdentity
    elif action == 'set':
        index_func = indexByUnion
    else:
        # Fixed: previously only wrote a warning to stderr and then raised a
        # confusing NameError on index_func below; printError exits immediately.
        printError('Unrecognized action: %s.' % action)

    start_time = time()
    clone_index = {}
    rec_count = 0
    for rec in db_iter:
        key = _get_key(rec, action)

        # Print progress
        printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences')
        rec_count += 1

        # Assign passed preclone records to key and failed records to index None
        if all([k is not None and k != '' for k in key]):
            # Update index dictionary
            index_func(clone_index, key, rec, group_fields)
        else:
            clone_index.setdefault(None, []).append(rec)

    printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True)

    # The 'set' action builds a nested index that must be flattened to tuples
    if action == 'set':
        clone_index = _flatten_dict(clone_index)

    return clone_index
257
258
def distanceClones(result, seq_field=default_junction_field, model=default_distance_model,
                   distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym,
                   linkage=default_linkage):
    """
    Separates a set of Receptor objects into clones

    Arguments:
      result : a changeo.Multiprocessing.DbResult object with filtered records to clone
      seq_field : sequence field used to calculate distance between records
      model : substitution model used to calculate distance
      distance : the distance threshold to assign clonal groups
      dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances
      norm : normalization method
      sym : symmetry method
      linkage : type of linkage

    Returns:
      changeo.Multiprocessing.DbResult : an updated DbResult object
    """
    # Get distance matrix if not provided
    if dist_mat is None:
        try:
            dist_mat = distance_models[model]
        except KeyError:
            # Fixed: previously referenced the undefined global args_dict here,
            # raising NameError instead of reporting the bad model name.
            printError('Unrecognized distance model: %s' % model)

    # TODO: can be cleaned up with abstract model class
    # Determine length of n-mers
    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
        nmer_len = 1
    elif model in ['hh_s5f', 'mk_rs5nf']:
        nmer_len = 5
    else:
        printError('Unrecognized distance model: %s.\n' % model)

    # Define unique junction mapping; gaps are masked as N so identical
    # junctions collapse to one distance calculation
    seq_map = {}
    for rec in result.data_pass:
        seq = rec.getField(seq_field)
        seq = re.sub(r'[\.-]', 'N', seq)
        if model == 'aa':  seq = translate(seq)
        seq_map.setdefault(seq, []).append(rec)

    # Define sequences
    sequences = list(seq_map.keys())

    # Zero record case
    if not sequences:
        result.valid = False
        result.log['CLONES'] = 0
        return result

    # Single record case: trivially one clone, no distance calculation needed
    if len(sequences) == 1:
        result.results = {1: result.data_pass}
        result.valid = True
        result.log['CLONES'] = 1
        return result

    # Calculate pairwise distance matrix
    dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm)

    # Perform hierarchical clustering
    clusters = formClusters(dists, linkage, distance)

    # Turn clusters into clone dictionary
    clone_dict = {}
    for i, c in enumerate(clusters):
        clone_dict.setdefault(c, []).extend(seq_map[sequences[i]])

    if clone_dict:
        result.results = clone_dict
        result.valid = True
        result.log['CLONES'] = len(clone_dict)
    else:
        result.log['CLONES'] = 0

    return result
337
338
def collectQueue(alive, result_queue, collect_queue, db_file, fields,
                 writer=ChangeoWriter, out_file=None, out_args=default_out_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing continues
              if False exit process
      result_queue : a multiprocessing.Queue holding processQueue results
      collect_queue : a multiprocessing.Queue to store collector return values
      db_file : the input database file name
      fields : list of output field names
      writer : writer class.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
          'log' defining a log object along with the 'pass' and 'fail' output file names.
    """
    # Wrapper for opening handles and writers. An explicit out_file is honored
    # only for the 'pass' output; the 'fail' output always gets a generated name.
    def _open(x, f, writer=writer, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(db_file,
                                     out_label='clone-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle, writer(handle, fields=f)

    # Open log file
    try:
        # Count input records
        result_count = countDbFile(db_file)

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        # Initialize handles, writers and counters.
        # Output files are opened lazily on the first record written (see the
        # AttributeError handlers below), so empty outputs are never created.
        pass_handle, pass_writer = None, None
        fail_handle, fail_writer = None, None
        rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0
        start_time = time()

        # Iterate over results queue until sentinel object reached
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel (None marks end of results)
            if result is None:  break

            # Print progress for previous iteration and update record count
            printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences, assigning sequential clone numbers
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        pass_count += 1
                        rec.setField('clone', str(clone_count))
                        result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction
                        try:
                            pass_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open pass file and define writer object
                            pass_handle, pass_writer = _open('pass', fields)
                            pass_writer.writeReceptor(rec)

                # Write failed sequences from passing sets
                if result.data_fail:
                    # Write failed sequences
                    for i, rec in enumerate(result.data_fail, start=1):
                        fail_count += 1
                        result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction
                        if out_args['failed']:
                            try:
                                fail_writer.writeReceptor(rec)
                            except AttributeError:
                                # Open fail file and define writer object
                                fail_handle, fail_writer = _open('fail', fields)
                                fail_writer.writeReceptor(rec)
            else:
                # Write failing records (entire group failed cloning)
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    result.log['CLONE0-%i' % (i)] = rec.junction
                    if out_args['failed']:
                        try:
                            fail_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open fail file and define writer object
                            fail_handle, fail_writer = _open('fail', fields)
                            fail_writer.writeReceptor(rec)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            # while/else: loop ended because alive became False (a sibling
            # process errored) rather than via the sentinel break
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data
        collect_dict = {'log': log, 'pass': None, 'fail': None}
        if pass_handle is not None:
            collect_dict['pass'] = pass_handle.name
            pass_handle.close()
        if fail_handle is not None:
            collect_dict['fail'] = fail_handle.name
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    return None
482
483
def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
                 j_field=default_j_field, max_missing=default_max_missing,
                 group_fields=None, group_func=groupByGene, group_args=None,
                 clone_func=distanceClones, clone_args=None,
                 format=default_format, out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
      db_file : filename of input database.
      seq_field : sequence field used to determine clones.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
      group_fields : additional annotation fields to use to group preclones;
          if None use only V and J.
      group_func : the function to use for assigning preclones.
      group_args : a dictionary of arguments to pass to group_func; if None an empty dict.
      clone_func : the function to use for determining clones within preclonal groups.
      clone_args : a dictionary of arguments to pass to clone_func; if None an empty dict.
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
          if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
          if None defaults to 2*nproc.

    Returns:
      dict : dictionary of output pass and fail files.
    """
    # Fixed: group_args/clone_args previously defaulted to shared mutable dicts
    # ({}) that this function mutates below, leaking state across calls and
    # into callers' dictionaries. Copy so neither the defaults nor the callers'
    # dicts are ever modified.
    group_args = {} if group_args is None else dict(group_args)
    clone_args = {} if clone_args is None else dict(clone_args)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['J_FIELD'] = j_field
    log['MAX_MISSING'] = max_missing
    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
    for k in sorted(group_args):
        log[k.upper()] = group_args[k]
    for k in sorted(clone_args):
        # dist_mat is a large matrix; omit it from the log
        if k != 'dist_mat':  log[k.upper()] = clone_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Translate column names to Receptor attribute names
    seq_field = schema.toReceptor(seq_field)
    v_field = schema.toReceptor(v_field)
    j_field = schema.toReceptor(j_field)
    if group_fields is not None:
        group_fields = [schema.toReceptor(f) for f in group_fields]

    # Define feeder function and arguments
    group_args['group_fields'] = group_fields
    group_args['v_field'] = v_field
    group_args['j_field'] = j_field
    feed_args = {'db_file': db_file,
                 'reader': reader,
                 'group_func': group_func,
                 'group_args': group_args}

    # Define worker function and arguments
    filter_args = {'seq_field': seq_field,
                   'v_field': v_field,
                   'j_field': j_field,
                   'max_missing': max_missing}
    clone_args['seq_field'] = seq_field
    work_args = {'process_func': clone_func,
                 'process_args': clone_args,
                 'filter_func': filterMissing,
                 'filter_args': filter_args}

    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_args = {'db_file': db_file,
                    'fields': out_fields,
                    'writer': writer,
                    'out_file': out_file,
                    'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue,
                             feed_args=feed_args, work_args=work_args, collect_args=collect_args,
                             nproc=nproc, queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output
585
586
def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
      None

    Returns:
      an ArgumentParser object
    """
    # Define input and output fields (shown in the epilog of --help)
    fields = dedent(
             '''
             output files:
                 clone-pass
                     database with assigned clonal group numbers.
                 clone-fail
                     database with records failing clonal grouping.

             required fields:
                 SEQUENCE_ID, V_CALL, J_CALL, JUNCTION

             output fields:
                 CLONE
             ''')
    # Define argument parser; common I/O and multiprocessing options are
    # inherited from the shared parent parser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            parents=[getCommonArgParser(format=False, multiproc=True)],
                            formatter_class=CommonHelpFormatter, add_help=False)

    # Distance cloning method
    group = parser.add_argument_group('cloning arguments')
    group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field,
                       help='Field to be used to calculate distance between records.')
    group.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
                       help='Field containing the germline V segment call.')
    group.add_argument('--jf', action='store', dest='j_field', default=default_j_field,
                       help='Field containing the germline J segment call.')
    group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None,
                       help='Additional fields to use for grouping clones aside from V, J and junction length.')
    group.add_argument('--mode', action='store', dest='mode',
                       choices=('allele', 'gene'), default=default_index_mode,
                       help='''Specifies whether to use the V(D)J allele or gene for
                            initial grouping.''')
    group.add_argument('--act', action='store', dest='action',
                       choices=('first', 'set'), default=default_index_action,
                       help='''Specifies how to handle multiple V(D)J assignments for initial grouping.
                            The "first" action will use only the first gene listed.
                            The "set" action will use all gene assignments and construct a larger gene
                            grouping composed of any sequences sharing an assignment or linked to another
                            sequence by a common assignment (similar to single-linkage).''')
    group.add_argument('--model', action='store', dest='model',
                       choices=choices_distance_model,
                       default=default_distance_model,
                       help='''Specifies which substitution model to use for calculating distance
                            between sequences. The "ham" model is nucleotide Hamming distance and
                            "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
                            human specific single nucleotide and 5-mer content models, respectively,
                            from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
                            mouse specific single nucleotide and 5-mer content models, respectively,
                            from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
                            deprecated models provided backwards compatibility with the "m1n" and
                            "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
                            5-mer models should be considered experimental.''')
    group.add_argument('--dist', action='store', dest='distance', type=float,
                       default=default_distance,
                       help='The distance threshold for clonal grouping')
    group.add_argument('--norm', action='store', dest='norm',
                       choices=('len', 'mut', 'none'), default=default_norm,
                       help='''Specifies how to normalize distances. One of none
                            (do not normalize), len (normalize by length),
                            or mut (normalize by number of mutations between sequences).''')
    group.add_argument('--sym', action='store', dest='sym',
                       choices=('avg', 'min'), default=default_sym,
                       help='''Specifies how to combine asymmetric distances. One of avg
                            (average of A->B and B->A) or min (minimum of A->B and B->A).''')
    group.add_argument('--link', action='store', dest='linkage',
                       choices=('single', 'average', 'complete'), default=default_linkage,
                       help='''Type of linkage to use for hierarchical clustering.''')
    group.add_argument('--maxmiss', action='store', dest='max_missing', type=int,
                       default=default_max_missing,
                       help='''The maximum number of non-ACGT characters (gaps or Ns) to
                            permit in the junction sequence before excluding the record
                            from clonal assignment. Note, under single linkage
                            non-informative positions can create artifactual links
                            between unrelated sequences. Use with caution.''')
    # Bind the grouping and cloning implementations used by the __main__ driver
    parser.set_defaults(group_func=groupByGene)
    parser.set_defaults(clone_func=distanceClones)

    return parser
677
678
if __name__ == '__main__':
    """
    Parses command line arguments and calls main function
    """
    # Parse and validate command line arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # Fold the grouping and cloning options into the argument dictionaries
    # expected by defineClones
    args_dict['group_args'] = {'action': args_dict['action'],
                               'mode': args_dict['mode']}
    args_dict['clone_args'] = {'model': args_dict['model'],
                               'distance': args_dict['distance'],
                               'norm': args_dict['norm'],
                               'sym': args_dict['sym'],
                               'linkage': args_dict['linkage']}

    # Get distance matrix
    try:
        args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
    except KeyError:
        printError('Unrecognized distance model: %s' % args_dict['model'])

    # Remove the options that were folded into group_args/clone_args above
    for opt in ('action', 'mode', 'model', 'distance', 'norm', 'sym', 'linkage'):
        del args_dict[opt]

    # Remove the file arguments handled by the per-file loop below
    del args_dict['db_files']
    args_dict.pop('out_files', None)

    # Call main function for each input file
    namespace = args.__dict__
    for index, db in enumerate(namespace['db_files']):
        args_dict['db_file'] = db
        args_dict['out_file'] = namespace['out_files'][index] \
                                if namespace['out_files'] else None
        defineClones(**args_dict)