annotate change_o/DefineClones.py @ 78:aff3ba86ef7a draft

Uploaded
author davidvanzessen
date Mon, 31 Aug 2020 11:20:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
78
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
2 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
3 Assign Ig sequences into clones
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
4 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
5
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
6 # Info
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
7 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
8 from changeo import __version__, __date__
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
9
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
10 # Imports
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
11 import os
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
12 import re
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
13 import sys
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
14 from argparse import ArgumentParser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
15 from collections import OrderedDict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
16 from itertools import chain
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
17 from textwrap import dedent
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
18 from time import time
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
19 from Bio.Seq import translate
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
20
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
21 # Presto and changeo imports
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
22 from presto.Defaults import default_out_args
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
23 from presto.IO import printLog, printProgress, printCount, printWarning, printError
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
24 from presto.Multiprocessing import manageProcesses
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
25 from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
26 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
27 from changeo.Distance import distance_models, calcDistances, formClusters
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
28 from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
29 AIRRWriter, ChangeoWriter
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
30 from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
31
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
32 # Defaults
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
33 default_translate = False
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
34 default_distance = 0.0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
35 default_index_mode = 'gene'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
36 default_index_action = 'set'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
37 default_distance_model = 'ham'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
38 default_norm = 'len'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
39 default_sym = 'avg'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
40 default_linkage = 'single'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
41 default_max_missing=0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
42 choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
43 'mk_rs1nf', 'mk_rs5nf',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
44 'hs1f_compat', 'm1n_compat')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
45
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
46
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
47 def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
48 j_field=default_j_field, max_missing=default_max_missing):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
49 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
50 Splits a set of sequence into passed and failed groups based on the number
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
51 of missing characters in the sequence
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
52
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
53 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
54 data : changeo.Multiprocessing.DbData object.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
55 seq_field : sequence field to filter on.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
56 v_field : field containing the V call.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
57 j_field : field containing the J call.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
58 max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
59
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
60 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
61 changeo.Multiprocessing.DbResult : objected containing filtered records.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
62 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
63 # Function to validate the sequence string
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
64 def _pass(seq):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
65 if len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
66 return True
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
67 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
68 return False
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
69
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
70 # Define result object for iteration and get data records
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
71 result = DbResult(data.id, data.data)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
72
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
73 if not data:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
74 result.data_pass = []
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
75 result.data_fail = data.data
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
76 return result
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
77
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
78 result.data_pass = []
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
79 result.data_fail = []
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
80 for rec in data.data:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
81 seq = rec.getField(seq_field)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
82 if _pass(seq): result.data_pass.append(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
83 else: result.data_fail.append(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
84
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
85 # Add V(D)J to log
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
86 result.log['ID'] = ','.join([str(x) for x in data.id])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
87 result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data]))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
88 result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data]))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
89 result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in data.data]))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
90 result.log['CLONED'] = len(result.data_pass)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
91 result.log['FILTERED'] = len(result.data_fail)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
92
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
93 return result
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
94
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
95
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
96 def indexByIdentity(index, key, rec, group_fields=None):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
97 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
98 Updates a preclone index with a simple key
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
99
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
100 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
101 index : preclone index from groupByGene
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
102 key : index key
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
103 rec : Receptor to add to the index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
104 group_fields : additional annotation fields to use to group preclones;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
105 if None use only V, J and junction length
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
106
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
107 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
108 None : Updates index with new key and records.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
109 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
110 index.setdefault(tuple(key), []).append(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
111
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
112
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
113 def indexByUnion(index, key, rec, group_fields=None):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
114 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
115 Updates a preclone index with the union of nested keys
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
116
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
117 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
118 index : preclone index from groupByGene
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
119 key : index key
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
120 rec : Receptor to add to the index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
121 group_fields : additional annotation fields to use to group preclones;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
122 if None use only V, J and junction length
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
123
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
124 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
125 None : Updates index with new key and records.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
126 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
127 # List of values for this/new key
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
128 val = [rec]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
129 f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0)))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
130
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
131 # See if field/junction length combination exists in index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
132 outer_dict = index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
133 for field in f_range:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
134 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
135 outer_dict = outer_dict[key[field]]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
136 except KeyError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
137 outer_dict = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
138 break
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
139 # If field combination exists, look through Js
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
140 j_matches = []
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
141 if outer_dict is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
142 for j in outer_dict.keys():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
143 if not set(key[1]).isdisjoint(set(j)):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
144 key[1] = tuple(set(key[1]).union(set(j)))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
145 j_matches += [j]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
146 # If J overlap exists, look through Vs for each J
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
147 for j in j_matches:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
148 v_matches = []
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
149 # Collect V matches for this J
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
150 for v in outer_dict[j].keys():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
151 if not set(key[0]).isdisjoint(set(v)):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
152 key[0] = tuple(set(key[0]).union(set(v)))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
153 v_matches += [v]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
154 # If there are V overlaps for this J, pop them out
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
155 if v_matches:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
156 val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
157 # If the J dict is now empty, remove it
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
158 if not outer_dict[j]:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
159 outer_dict.pop(j, None)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
160
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
161 # Add value(s) into index nested dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
162 # OMG Python pointers are the best!
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
163 # Add field dictionaries into index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
164 outer_dict = index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
165 for field in f_range:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
166 outer_dict.setdefault(key[field], {})
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
167 outer_dict = outer_dict[key[field]]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
168 # Add J, then V into index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
169 if key[1] in outer_dict:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
170 outer_dict[key[1]].update({key[0]: val})
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
171 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
172 outer_dict[key[1]] = {key[0]: val}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
173
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
174
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
175 def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
176 mode=default_index_mode, action=default_index_action):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
177 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
178 Identifies preclonal groups by V, J and junction length
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
179
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
180 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
181 db_iter : an iterator of Receptor objects defined by ChangeoReader
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
182 group_fields : additional annotation fields to use to group preclones;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
183 if None use only V, J and junction length
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
184 mode : specificity of alignment call to use for assigning preclones;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
185 one of ('allele', 'gene')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
186 action : how to handle multiple value fields when assigning preclones;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
187 one of ('first', 'set')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
188
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
189 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
190 dict: dictionary of {(V, J, junction length):[Receptor]}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
191 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
192 # print(fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
193 # Define functions for grouping keys
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
194 if mode == 'allele' and group_fields is None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
195 def _get_key(rec, act):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
196 return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
197 None if rec.junction is None else len(rec.junction)]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
198 elif mode == 'gene' and group_fields is None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
199 def _get_key(rec, act):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
200 return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
201 None if rec.junction is None else len(rec.junction)]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
202 elif mode == 'allele' and group_fields is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
203 def _get_key(rec, act):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
204 vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
205 None if rec.junction is None else len(rec.junction)]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
206 ann = [rec.getField(k) for k in group_fields]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
207 return list(chain(vdj, ann))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
208 elif mode == 'gene' and group_fields is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
209 def _get_key(rec, act):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
210 vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
211 None if rec.junction is None else len(rec.junction)]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
212 ann = [rec.getField(k) for k in group_fields]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
213 return list(chain(vdj, ann))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
214
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
215 # Function to flatten nested dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
216 def _flatten_dict(d, parent_key=''):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
217 items = []
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
218 for k, v in d.items():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
219 new_key = parent_key + [k] if parent_key else [k]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
220 if isinstance(v, dict):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
221 items.extend(_flatten_dict(v, new_key).items())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
222 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
223 items.append((new_key, v))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
224 flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
225 return flat_dict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
226
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
227 if action == 'first':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
228 index_func = indexByIdentity
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
229 elif action == 'set':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
230 index_func = indexByUnion
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
231 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
232 sys.stderr.write('Unrecognized action: %s.\n' % action)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
233
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
234 start_time = time()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
235 clone_index = {}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
236 rec_count = 0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
237 for rec in db_iter:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
238 key = _get_key(rec, action)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
239
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
240 # Print progress
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
241 printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
242 rec_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
243
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
244 # Assigned passed preclone records to key and failed to index None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
245 if all([k is not None and k != '' for k in key]):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
246 # Update index dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
247 index_func(clone_index, key, rec, group_fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
248 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
249 clone_index.setdefault(None, []).append(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
250
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
251 printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
252
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
253 if action == 'set':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
254 clone_index = _flatten_dict(clone_index)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
255
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
256 return clone_index
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
257
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
258
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
259 def distanceClones(result, seq_field=default_junction_field, model=default_distance_model,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
260 distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
261 linkage=default_linkage):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
262 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
263 Separates a set of Receptor objects into clones
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
264
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
265 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
266 result : a changeo.Multiprocessing.DbResult object with filtered records to clone
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
267 seq_field : sequence field used to calculate distance between records
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
268 model : substitution model used to calculate distance
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
269 distance : the distance threshold to assign clonal groups
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
270 dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
271 norm : normalization method
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
272 sym : symmetry method
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
273 linkage : type of linkage
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
274
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
275 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
276 changeo.Multiprocessing.DbResult : an updated DbResult object
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
277 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
278 # Get distance matrix if not provided
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
279 if dist_mat is None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
280 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
281 dist_mat = distance_models[model]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
282 except KeyError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
283 printError('Unrecognized distance model: %s' % args_dict['model'])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
284
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
285 # TODO: can be cleaned up with abstract model class
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
286 # Determine length of n-mers
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
287 if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
288 nmer_len = 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
289 elif model in ['hh_s5f', 'mk_rs5nf']:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
290 nmer_len = 5
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
291 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
292 printError('Unrecognized distance model: %s.\n' % model)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
293
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
294 # Define unique junction mapping
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
295 seq_map = {}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
296 for rec in result.data_pass:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
297 seq = rec.getField(seq_field)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
298 seq = re.sub('[\.-]', 'N', seq)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
299 if model == 'aa': seq = translate(seq)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
300 seq_map.setdefault(seq, []).append(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
301
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
302 # Define sequences
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
303 sequences = list(seq_map.keys())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
304
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
305 # Zero record case
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
306 if not sequences:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
307 result.valid = False
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
308 result.log['CLONES'] = 0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
309 return result
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
310
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
311 # Single record case
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
312 if len(sequences) == 1:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
313 result.results = {1: result.data_pass}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
314 result.valid = True
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
315 result.log['CLONES'] = 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
316 return result
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
317
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
318 # Calculate pairwise distance matrix
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
319 dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
320
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
321 # Perform hierarchical clustering
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
322 clusters = formClusters(dists, linkage, distance)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
323
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
324 # Turn clusters into clone dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
325 clone_dict = {}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
326 for i, c in enumerate(clusters):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
327 clone_dict.setdefault(c, []).extend(seq_map[sequences[i]])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
328
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
329 if clone_dict:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
330 result.results = clone_dict
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
331 result.valid = True
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
332 result.log['CLONES'] = len(clone_dict)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
333 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
334 result.log['CLONES'] = 0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
335
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
336 return result
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
337
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
338
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
339 def collectQueue(alive, result_queue, collect_queue, db_file, fields,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
340 writer=ChangeoWriter, out_file=None, out_args=default_out_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
341 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
342 Assembles results from a queue of individual sequence results and manages log/file I/O
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
343
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
344 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
345 alive = a multiprocessing.Value boolean controlling whether processing continues
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
346 if False exit process
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
347 result_queue : a multiprocessing.Queue holding processQueue results
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
348 collect_queue : a multiprocessing.Queue to store collector return values
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
349 db_file : the input database file name
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
350 fields : list of output field names
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
351 writer : writer class.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
352 out_file : output file name. Automatically generated from the input file if None.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
353 out_args : common output argument dictionary from parseCommonArgs
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
354
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
355 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
356 None : Adds a dictionary with key value pairs to collect_queue containing
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
357 'log' defining a log object along with the 'pass' and 'fail' output file names.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
358 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
359 # Wrapper for opening handles and writers
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
360 def _open(x, f, writer=writer, out_file=out_file):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
361 if out_file is not None and x == 'pass':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
362 handle = open(out_file, 'w')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
363 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
364 handle = getOutputHandle(db_file,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
365 out_label='clone-%s' % x,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
366 out_dir=out_args['out_dir'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
367 out_name=out_args['out_name'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
368 out_type=out_args['out_type'])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
369 return handle, writer(handle, fields=f)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
370
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
371 # Open log file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
372 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
373 # Count input records
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
374 result_count = countDbFile(db_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
375
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
376 # Define log handle
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
377 if out_args['log_file'] is None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
378 log_handle = None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
379 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
380 log_handle = open(out_args['log_file'], 'w')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
381 except:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
382 #sys.stderr.write('Exception in collector file opening step\n')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
383 alive.value = False
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
384 raise
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
385
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
386 # Get results from queue and write to files
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
387 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
388 # Initialize handles, writers and counters
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
389 pass_handle, pass_writer = None, None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
390 fail_handle, fail_writer = None, None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
391 rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
392 start_time = time()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
393
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
394 # Iterator over results queue until sentinel object reached
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
395 while alive.value:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
396 # Get result from queue
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
397 if result_queue.empty(): continue
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
398 else: result = result_queue.get()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
399 # Exit upon reaching sentinel
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
400 if result is None: break
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
401
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
402 # Print progress for previous iteration and update record count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
403 printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
404 rec_count += len(result.data)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
405
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
406 # Write passed and failed records
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
407 if result:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
408 # Writing passing sequences
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
409 for clone in result.results.values():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
410 clone_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
411 for i, rec in enumerate(clone, start=1):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
412 pass_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
413 rec.setField('clone', str(clone_count))
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
414 result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
415 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
416 pass_writer.writeReceptor(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
417 except AttributeError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
418 # Open pass file and define writer object
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
419 pass_handle, pass_writer = _open('pass', fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
420 pass_writer.writeReceptor(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
421
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
422 # Write failed sequences from passing sets
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
423 if result.data_fail:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
424 # Write failed sequences
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
425 for i, rec in enumerate(result.data_fail, start=1):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
426 fail_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
427 result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
428 if out_args['failed']:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
429 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
430 fail_writer.writeReceptor(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
431 except AttributeError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
432 # Open fail file and define writer object
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
433 fail_handle, fail_writer = _open('fail', fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
434 fail_writer.writeReceptor(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
435 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
436 # Write failing records
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
437 for i, rec in enumerate(result.data, start=1):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
438 fail_count += 1
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
439 result.log['CLONE0-%i' % (i)] = rec.junction
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
440 if out_args['failed']:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
441 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
442 fail_writer.writeReceptor(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
443 except AttributeError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
444 # Open fail file and define writer object
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
445 fail_handle, fail_writer = _open('fail', fields)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
446 fail_writer.writeReceptor(rec)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
447
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
448 # Write log
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
449 printLog(result.log, handle=log_handle)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
450 else:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
451 sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
452 % os.getpid())
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
453 return None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
454
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
455 # Print total counts
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
456 printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
457
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
458 # Update return list
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
459 log = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
460 log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
461 log['CLONES'] = clone_count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
462 log['RECORDS'] = rec_count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
463 log['PASS'] = pass_count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
464 log['FAIL'] = fail_count
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
465
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
466 # Close file handles and generate return data
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
467 collect_dict = {'log': log, 'pass': None, 'fail': None}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
468 if pass_handle is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
469 collect_dict['pass'] = pass_handle.name
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
470 pass_handle.close()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
471 if fail_handle is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
472 collect_dict['fail'] = fail_handle.name
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
473 fail_handle.close()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
474 if log_handle is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
475 log_handle.close()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
476 collect_queue.put(collect_dict)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
477 except:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
478 alive.value = False
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
479 raise
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
480
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
481 return None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
482
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
483
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
484 def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
485 j_field=default_j_field, max_missing=default_max_missing,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
486 group_fields=None, group_func=groupByGene, group_args={},
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
487 clone_func=distanceClones, clone_args={},
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
488 format=default_format, out_file=None, out_args=default_out_args,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
489 nproc=None, queue_size=None):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
490 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
491 Define clonally related sequences
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
492
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
493 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
494 db_file : filename of input database.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
495 seq_field : sequence field used to determine clones.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
496 v_field : field containing the V call.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
497 j_field : field containing the J call.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
498 max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
499 group_fields : additional annotation fields to use to group preclones;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
500 if None use only V and J.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
501 group_func : the function to use for assigning preclones.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
502 group_args : a dictionary of arguments to pass to group_func.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
503 clone_func : the function to use for determining clones within preclonal groups.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
504 clone_args : a dictionary of arguments to pass to clone_func.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
505 format : input and output format.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
506 out_file : output file name. Automatically generated from the input file if None.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
507 out_args : common output argument dictionary from parseCommonArgs.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
508 nproc : the number of processQueue processes;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
509 if None defaults to the number of CPUs.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
510 queue_size : maximum size of the argument queue;
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
511 if None defaults to 2*nproc.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
512
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
513 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
514 dict: dictionary of output pass and fail files.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
515 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
516 # Print parameter info
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
517 log = OrderedDict()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
518 log['START'] = 'DefineClones'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
519 log['FILE'] = os.path.basename(db_file)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
520 log['SEQ_FIELD'] = seq_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
521 log['V_FIELD'] = v_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
522 log['J_FIELD'] = j_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
523 log['MAX_MISSING'] = max_missing
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
524 log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
525 for k in sorted(group_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
526 log[k.upper()] = group_args[k]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
527 for k in sorted(clone_args):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
528 if k != 'dist_mat': log[k.upper()] = clone_args[k]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
529 log['NPROC'] = nproc
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
530 printLog(log)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
531
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
532 # Define format operators
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
533 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
534 reader, writer, schema = getFormatOperators(format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
535 except ValueError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
536 printError('Invalid format %s.' % format)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
537
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
538 # Translate to Receptor attribute names
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
539 seq_field = schema.toReceptor(seq_field)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
540 v_field = schema.toReceptor(v_field)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
541 j_field = schema.toReceptor(j_field)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
542 if group_fields is not None:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
543 group_fields = [schema.toReceptor(f) for f in group_fields]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
544
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
545 # Define feeder function and arguments
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
546 group_args['group_fields'] = group_fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
547 group_args['v_field'] = v_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
548 group_args['j_field'] = j_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
549 feed_args = {'db_file': db_file,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
550 'reader': reader,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
551 'group_func': group_func,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
552 'group_args': group_args}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
553
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
554 # Define worker function and arguments
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
555 filter_args = {'seq_field': seq_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
556 'v_field': v_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
557 'j_field': j_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
558 'max_missing': max_missing}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
559 clone_args['seq_field'] = seq_field
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
560 work_args = {'process_func': clone_func,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
561 'process_args': clone_args,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
562 'filter_func': filterMissing,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
563 'filter_args': filter_args}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
564
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
565 # Define collector function and arguments
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
566 out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
567 out_args['out_type'] = schema.out_type
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
568 collect_args = {'db_file': db_file,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
569 'fields': out_fields,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
570 'writer': writer,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
571 'out_file': out_file,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
572 'out_args': out_args}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
573
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
574 # Call process manager
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
575 result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
576 feed_args=feed_args, work_args=work_args, collect_args=collect_args,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
577 nproc=nproc, queue_size=queue_size)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
578
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
579 # Print log
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
580 result['log']['END'] = 'DefineClones'
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
581 printLog(result['log'])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
582 output = {k: v for k, v in result.items() if k in ('pass', 'fail')}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
583
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
584 return output
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
585
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
586
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
587 def getArgParser():
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
588 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
589 Defines the ArgumentParser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
590
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
591 Arguments:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
592 None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
593
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
594 Returns:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
595 an ArgumentParser object
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
596 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
597 # Define input and output fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
598 fields = dedent(
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
599 '''
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
600 output files:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
601 clone-pass
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
602 database with assigned clonal group numbers.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
603 clone-fail
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
604 database with records failing clonal grouping.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
605
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
606 required fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
607 SEQUENCE_ID, V_CALL, J_CALL, JUNCTION
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
608
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
609 output fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
610 CLONE
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
611 ''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
612 # Define argument parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
613 parser = ArgumentParser(description=__doc__, epilog=fields,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
614 parents=[getCommonArgParser(format=False, multiproc=True)],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
615 formatter_class=CommonHelpFormatter, add_help=False)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
616
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
617 # Distance cloning method
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
618 group = parser.add_argument_group('cloning arguments')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
619 group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
620 help='Field to be used to calculate distance between records.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
621 group.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
622 help='Field containing the germline V segment call.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
623 group.add_argument('--jf', action='store', dest='j_field', default=default_j_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
624 help='Field containing the germline J segment call.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
625 group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
626 help='Additional fields to use for grouping clones aside from V, J and junction length.')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
627 group.add_argument('--mode', action='store', dest='mode',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
628 choices=('allele', 'gene'), default=default_index_mode,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
629 help='''Specifies whether to use the V(D)J allele or gene for
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
630 initial grouping.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
631 group.add_argument('--act', action='store', dest='action',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
632 choices=('first', 'set'), default=default_index_action,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
633 help='''Specifies how to handle multiple V(D)J assignments for initial grouping.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
634 The "first" action will use only the first gene listed.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
635 The "set" action will use all gene assignments and construct a larger gene
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
636 grouping composed of any sequences sharing an assignment or linked to another
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
637 sequence by a common assignment (similar to single-linkage).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
638 group.add_argument('--model', action='store', dest='model',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
639 choices=choices_distance_model,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
640 default=default_distance_model,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
641 help='''Specifies which substitution model to use for calculating distance
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
642 between sequences. The "ham" model is nucleotide Hamming distance and
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
643 "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
644 human specific single nucleotide and 5-mer content models, respectively,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
645 from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
646 mouse specific single nucleotide and 5-mer content models, respectively,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
647 from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
648 deprecated models provided backwards compatibility with the "m1n" and
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
649 "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
650 5-mer models should be considered experimental.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
651 group.add_argument('--dist', action='store', dest='distance', type=float,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
652 default=default_distance,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
653 help='The distance threshold for clonal grouping')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
654 group.add_argument('--norm', action='store', dest='norm',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
655 choices=('len', 'mut', 'none'), default=default_norm,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
656 help='''Specifies how to normalize distances. One of none
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
657 (do not normalize), len (normalize by length),
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
658 or mut (normalize by number of mutations between sequences).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
659 group.add_argument('--sym', action='store', dest='sym',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
660 choices=('avg', 'min'), default=default_sym,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
661 help='''Specifies how to combine asymmetric distances. One of avg
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
662 (average of A->B and B->A) or min (minimum of A->B and B->A).''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
663 group.add_argument('--link', action='store', dest='linkage',
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
664 choices=('single', 'average', 'complete'), default=default_linkage,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
665 help='''Type of linkage to use for hierarchical clustering.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
666 group.add_argument('--maxmiss', action='store', dest='max_missing', type=int,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
667 default=default_max_missing,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
668 help='''The maximum number of non-ACGT characters (gaps or Ns) to
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
669 permit in the junction sequence before excluding the record
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
670 from clonal assignment. Note, under single linkage
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
671 non-informative positions can create artifactual links
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
672 between unrelated sequences. Use with caution.''')
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
673 parser.set_defaults(group_func=groupByGene)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
674 parser.set_defaults(clone_func=distanceClones)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
675
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
676 return parser
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
677
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
678
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
679 if __name__ == '__main__':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
680 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
681 Parses command line arguments and calls main function
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
682 """
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
683 # Parse arguments
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
684 parser = getArgParser()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
685 checkArgs(parser)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
686 args = parser.parse_args()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
687 args_dict = parseCommonArgs(args)
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
688
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
689 # # Set default fields if not specified.
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
690 # default_fields = {'seq_field': default_junction_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
691 # 'v_field': default_v_field,
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
692 # 'j_field': default_j_field}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
693 #
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
694 # # Default Change-O fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
695 # if args_dict['format'] == 'changeo':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
696 # for f in default_fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
697 # if args_dict[f] is None: args_dict[f] = default_fields[f]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
698 # else: args_dict[f] = args_dict[f].upper()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
699 #
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
700 # # Default AIRR fields
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
701 # if args_dict['format'] == 'airr':
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
702 # for f in default_fields:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
703 # if args_dict[f] is None: args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
704 # else: args_dict[f] = args_dict[f].lower()
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
705
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
706 # Define grouping and cloning function arguments
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
707 args_dict['group_args'] = {'action': args_dict['action'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
708 'mode':args_dict['mode']}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
709 args_dict['clone_args'] = {'model': args_dict['model'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
710 'distance': args_dict['distance'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
711 'norm': args_dict['norm'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
712 'sym': args_dict['sym'],
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
713 'linkage': args_dict['linkage']}
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
714
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
715 # Get distance matrix
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
716 try:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
717 args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
718 except KeyError:
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
719 printError('Unrecognized distance model: %s' % args_dict['model'])
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
720
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
721 # Clean argument dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
722 del args_dict['action']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
723 del args_dict['mode']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
724 del args_dict['model']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
725 del args_dict['distance']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
726 del args_dict['norm']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
727 del args_dict['sym']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
728 del args_dict['linkage']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
729
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
730 # Clean arguments dictionary
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
731 del args_dict['db_files']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
732 if 'out_files' in args_dict: del args_dict['out_files']
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
733
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
734 # Call main function for each input file
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
735 for i, f in enumerate(args.__dict__['db_files']):
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
736 args_dict['db_file'] = f
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
737 args_dict['out_file'] = args.__dict__['out_files'][i] \
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
738 if args.__dict__['out_files'] else None
aff3ba86ef7a Uploaded
davidvanzessen
parents:
diff changeset
739 defineClones(**args_dict)