annotate DefineClones.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
2 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
3 Assign Ig sequences into clones
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
4 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
5 # Info
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
7 from changeo import __version__, __date__
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
8
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
9 # Imports
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
10 import os
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
11 import re
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
12 import sys
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
13 import csv
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
14 import numpy as np
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
15 from argparse import ArgumentParser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
16 from collections import OrderedDict
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
17 from itertools import chain
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
18 from textwrap import dedent
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
19 from time import time
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
20 from Bio import pairwise2
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
21 from Bio.Seq import translate
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
22
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
23 # Presto and changeo imports
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
24 from presto.Defaults import default_out_args
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
25 from presto.IO import getFileType, getOutputHandle, printLog, printProgress
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
26 from presto.Multiprocessing import manageProcesses
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
27 from presto.Sequence import getDNAScoreDict
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
28 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
29 from changeo.Distance import distance_models, calcDistances, formClusters
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
30 from changeo.IO import getDbWriter, readDbFile, countDbFile
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
31 from changeo.Multiprocessing import DbData, DbResult
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
32
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
# Defaults
# Module-level default values referenced as keyword defaults by the
# functions in this file.
default_translate = False
default_distance = 0.0
# Specificity of the V/J call used for grouping; indexJunctions documents
# the accepted values as 'allele' or 'gene'.
default_index_mode = 'gene'
# Handling of multi-valued calls; indexJunctions documents the accepted
# values as 'first' or 'set'.
default_index_action = 'set'
default_bygroup_model = 'ham'
default_hclust_model = 'chen2010'
# Record field holding the sequence that distances are computed on.
default_seq_field = 'JUNCTION'
default_norm = 'len'
default_sym = 'avg'
default_linkage = 'single'
# Distance model names accepted by distanceClones.
choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
45
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
46
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def indexByIdentity(index, key, rec, fields=None):
    """
    Updates a preclone index with a simple key

    Arguments:
      index = preclone index from indexJunctions; a dictionary that is
              modified in place
      key = index key; any iterable of hashable values, stored as a tuple
      rec = IgRecord to add to the index
      fields = additional annotation fields to use to group preclones;
               if None use only V, J and junction length.
               Not read by this function; it is accepted so the signature
               matches indexByUnion.

    Returns:
      None. Updates index with new key and records.
    """
    # Lists are unhashable, so the key is frozen into a tuple before it is
    # used as the dictionary key.
    index.setdefault(tuple(key), []).append(rec)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
62
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
63
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def indexByUnion(index, key, rec, fields=None):
    """
    Updates a preclone index with the union of nested keys

    The index is a nested dictionary keyed, from the outside in, by
    junction length (key[2]), each additional field value (key[3:]),
    the J call tuple (key[1]) and finally the V call tuple (key[0]).
    Existing entries whose J and V tuples both overlap the new key are
    merged with it into a single entry keyed by the unions.

    Arguments:
      index = preclone index from indexJunctions; modified in place
      key = index key; a sequence laid out as
            [V calls, J calls, junction length, field values...].
            The caller's object is not modified.
      rec = IgRecord to add to the index
      fields = additional annotation fields to use to group preclones;
               if None use only V, J and junction length

    Returns:
      None. Updates index with new key and records.
    """
    # Work on a private copy: the V and J entries are widened below and the
    # caller's key must not change underneath it (this also lets callers
    # pass a tuple).
    key = list(key)

    # List of values for this/new key
    val = [rec]
    # Positions in key of the outer grouping levels: junction length,
    # followed by one position per additional field.
    f_range = list(range(2, 3 + (len(fields) if fields else 0)))

    # See if field/junction length combination exists in index
    outer_dict = index
    for field in f_range:
        try:
            outer_dict = outer_dict[key[field]]
        except KeyError:
            outer_dict = None
            break

    # If field combination exists, look through Js
    j_matches = []
    if outer_dict is not None:
        for j in outer_dict:
            if not set(key[1]).isdisjoint(j):
                # Widen the J key as matches accumulate, so later Js are
                # compared against the growing union.
                key[1] = tuple(set(key[1]).union(j))
                j_matches.append(j)

    # If J overlap exists, look through Vs for each J
    for j in j_matches:
        # Collect V matches for this J
        v_matches = []
        for v in outer_dict[j]:
            if not set(key[0]).isdisjoint(v):
                key[0] = tuple(set(key[0]).union(v))
                v_matches.append(v)
        # If there are V overlaps for this J, pop them out
        if v_matches:
            val.extend(chain.from_iterable(outer_dict[j].pop(v)
                                           for v in v_matches))
            # If the J dict is now empty, remove it
            if not outer_dict[j]:
                outer_dict.pop(j, None)

    # Add field dictionaries into index, creating missing levels
    outer_dict = index
    for field in f_range:
        outer_dict = outer_dict.setdefault(key[field], {})

    # Add J, then V into index
    if key[1] in outer_dict:
        outer_dict[key[1]].update({key[0]: val})
    else:
        outer_dict[key[1]] = {key[0]: val}
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
124
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
125
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def indexJunctions(db_iter, fields=None, mode=default_index_mode,
                   action=default_index_action):
    """
    Identifies preclonal groups by V, J and junction length

    Arguments:
      db_iter = an iterator of IgRecords defined by readDbFile
      fields = additional annotation fields to use to group preclones;
               if None use only V, J and junction length
      mode = specificity of alignment call to use for assigning preclones;
             one of ('allele', 'gene')
      action = how to handle multiple value fields when assigning preclones;
               one of ('first', 'set')

    Returns:
      a dictionary of {key tuple: [IgRecords]}.
      With action 'first' the key tuple is
      (V, J, junction length, field values...).
      With action 'set' the nested index built by indexByUnion is flattened,
      so the key tuple is (junction length, field values..., J, V).
      Records with a None or empty key component are collected under the
      key None.

    Raises:
      SystemExit if mode or action is not one of the recognized values.
    """
    # Select the V/J call accessors for the requested specificity. Exit
    # immediately on an unknown mode rather than failing with a NameError
    # on the first record.
    if mode == 'allele':
        def _get_calls(rec, act):
            return rec.getVAllele(act), rec.getJAllele(act)
    elif mode == 'gene':
        def _get_calls(rec, act):
            return rec.getVGene(act), rec.getJGene(act)
    else:
        sys.exit('Unrecognized mode: %s.' % mode)

    # Select the index update function; an unknown action is fatal because
    # there is no index function to fall back on.
    if action == 'first':
        index_func = indexByIdentity
    elif action == 'set':
        index_func = indexByUnion
    else:
        sys.exit('Unrecognized action: %s.' % action)

    # Function to build the grouping key for one record
    def _get_key(rec, act):
        v_call, j_call = _get_calls(rec, act)
        key = [v_call, j_call,
               None if rec.junction is None else len(rec.junction)]
        if fields is not None:
            # Build the record dictionary once instead of once per field
            annotations = rec.toDict()
            key.extend(annotations.get(k, None) for k in fields)
        return key

    # Function to flatten nested dictionary
    def _flatten_dict(d, parent_key=None):
        items = []
        for k, v in d.items():
            new_key = parent_key + [k] if parent_key else [k]
            if isinstance(v, dict):
                items.extend(_flatten_dict(v, new_key).items())
            else:
                items.append((new_key, v))
        # Any path containing None collapses to the single key None
        return {None if None in path else tuple(path): recs
                for path, recs in items}

    start_time = time()
    clone_index = {}
    rec_count = 0
    for rec in db_iter:
        key = _get_key(rec, action)

        # Print progress
        if rec_count == 0:
            print('PROGRESS> Grouping sequences')

        printProgress(rec_count, step=1000, start_time=start_time)
        rec_count += 1

        # Assigned passed preclone records to key and failed to index None
        if all(k is not None and k != '' for k in key):
            # Update index dictionary
            index_func(clone_index, key, rec, fields)
        else:
            clone_index.setdefault(None, []).append(rec)

    printProgress(rec_count, step=1000, start_time=start_time, end=True)

    # indexByUnion builds a nested index; collapse it to one level so both
    # actions return a flat {key: records} dictionary.
    if action == 'set':
        clone_index = _flatten_dict(clone_index)

    return clone_index
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
211
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
212
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def distanceClones(records, model=default_bygroup_model, distance=default_distance,
                   dist_mat=None, norm=default_norm, sym=default_sym,
                   linkage=default_linkage, seq_field=default_seq_field):
    """
    Separates a set of IgRecords into clones

    Arguments:
      records = an iterator of IgRecords
      model = substitution model used to calculate distance
      distance = the distance threshold to assign clonal groups
      dist_mat = pandas DataFrame of pairwise nucleotide or amino acid distances;
                 if None it is looked up in distance_models by model
      norm = normalization method
      sym = symmetry method
      linkage = type of linkage
      seq_field = sequence field used to calculate distance between records

    Returns:
      a dictionary of lists defining {clone number: [IgRecords clonal group]},
      or None if any record has an empty sequence in seq_field

    Raises:
      SystemExit if model is not a recognized distance model.
    """
    # Get distance matrix if not provided
    if dist_mat is None:
        try:
            dist_mat = distance_models[model]
        except KeyError:
            # Report the local argument; args_dict does not exist in this
            # scope unless the module is run as a script.
            sys.exit('Unrecognized distance model: %s' % model)

    # TODO: can be cleaned up with abstract model class
    # Determine length of n-mers
    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
        nmer_len = 1
    elif model in ['hh_s5f', 'mk_rs5nf']:
        nmer_len = 5
    else:
        sys.exit('Unrecognized distance model: %s.\n' % model)

    # Materialize the input so a generator is not already exhausted when the
    # records are returned in the single-sequence case below.
    records = list(records)

    # Define unique junction mapping
    seq_map = {}
    for ig in records:
        seq = ig.getSeqField(seq_field)
        # Check if sequence length is 0
        if len(seq) == 0:
            return None

        # Gap and missing characters are treated as ambiguous bases
        seq = re.sub(r'[\.-]', 'N', str(seq))
        if model == 'aa':
            seq = translate(seq)

        seq_map.setdefault(seq, []).append(ig)

    # Process records: a single unique sequence is a single clone
    if len(seq_map) == 1:
        return {1: records}

    # Define sequences
    seqs = list(seq_map.keys())

    # Calculate pairwise distance matrix
    dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm)

    # Perform hierarchical clustering
    clusters = formClusters(dists, linkage, distance)

    # Turn clusters into clone dictionary; clusters[i] is used as the
    # cluster id of seqs[i]
    clone_dict = {}
    for i, c in enumerate(clusters):
        clone_dict.setdefault(c, []).extend(seq_map[seqs[i]])

    return clone_dict
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
280
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
281
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def distChen2010(records):
    """
    Calculate pairwise distances as defined in Chen 2010

    Arguments:
      records = list of IgRecords where first is query to be compared to others in list;
                a mapping is also accepted, in which case its values are used
                in iteration order. The input is not modified.

    Returns:
      list of distances, one per non-query record in input order;
      an empty list if there is nothing to compare the query to

    Raises:
      ZeroDivisionError if both the query and a compared record have an
      empty CDR3 (junction of six characters or fewer).
    """
    # The previous code mixed OrderedDict.popitem with positional indexing;
    # normalize both input shapes to a plain list without touching the input.
    recs = list(records.values()) if hasattr(records, 'values') else list(records)
    if len(recs) < 2:
        return []

    # Pull out query sequence and V/J information
    query, others = recs[0], recs[1:]
    # The CDR3 is taken as the junction without its first and last three characters
    query_cdr3 = query.junction[3:-3]
    query_v_allele = query.getVAllele()
    query_v_gene = query.getVGene()
    query_v_family = query.getVFamily()
    query_j_allele = query.getJAllele()
    query_j_gene = query.getJGene()
    # Create alignment scoring dictionary
    score_dict = getDNAScoreDict()

    scores = []
    for rec in others:
        cdr3 = rec.junction[3:-3]
        # score_only=True returns the numeric alignment score; without it
        # globalds returns a list of alignments that cannot be added to.
        # NOTE(review): this is whatever score score_dict defines - confirm
        # it is the quantity Chen 2010 intends as a distance.
        ld = pairwise2.align.globalds(query_cdr3, cdr3, score_dict, -1, -1,
                                      score_only=True)
        # Check V similarity: the penalty grows as the shared level gets coarser
        if rec.getVAllele() == query_v_allele:
            ld += 0
        elif rec.getVGene() == query_v_gene:
            ld += 1
        elif rec.getVFamily() == query_v_family:
            ld += 3
        else:
            ld += 5
        # Check J similarity
        if rec.getJAllele() == query_j_allele:
            ld += 0
        elif rec.getJGene() == query_j_gene:
            ld += 1
        else:
            ld += 3
        # Divide by the length of the longer CDR3
        scores.append(ld / max(len(cdr3), len(query_cdr3)))

    return scores
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
320
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
321
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def distAdemokun2011(records):
    """
    Calculate pairwise distances as defined in Ademokun 2011

    Arguments:
      records = list of IgRecords where first is query to be compared to others in list;
                a mapping is also accepted, in which case its values are used
                in iteration order. The input is not modified.

    Returns:
      list of distances, one per non-query record in input order;
      an empty list if there is nothing to compare the query to

    Raises:
      ZeroDivisionError if an aligned pair includes an empty CDR3
      (junction of six characters or fewer).
    """
    # The previous code mixed OrderedDict.popitem with positional indexing;
    # normalize both input shapes to a plain list without touching the input.
    recs = list(records.values()) if hasattr(records, 'values') else list(records)
    if len(recs) < 2:
        return []

    # Pull out query sequence and V family information
    query, others = recs[0], recs[1:]
    # The CDR3 is taken as the junction without its first and last three characters
    query_cdr3 = query.junction[3:-3]
    query_v_family = query.getVFamily()
    # Create alignment scoring dictionary
    score_dict = getDNAScoreDict()

    scores = []
    for rec in others:
        cdr3 = rec.junction[3:-3]
        if abs(len(query_cdr3) - len(cdr3)) > 10:
            # CDR3 lengths differ by more than 10: maximum distance, no alignment
            scores.append(1)
        elif query_v_family != rec.getVFamily():
            # Different V family: maximum distance, no alignment
            scores.append(1)
        else:
            # score_only=True returns the numeric alignment score; without it
            # globalds returns a list of alignments that cannot be divided.
            # NOTE(review): this is whatever score score_dict defines -
            # confirm it is the quantity Ademokun 2011 intends as a distance.
            ld = pairwise2.align.globalds(query_cdr3, cdr3, score_dict, -1, -1,
                                          score_only=True)
            # Divide by the length of the shorter CDR3
            scores.append(ld / min(len(cdr3), len(query_cdr3)))

    return scores
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
352
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
353
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def hierClust(dist_mat, method='chen2010'):
    """
    Assign cluster ids from a distance matrix using a named clustering recipe

    Arguments:
    dist_mat = square-formed distance matrix of pairwise CDR3 comparisons
    method = 'chen2010' calls formClusters with ('average', 0.32);
             'ademokun2011' calls formClusters with ('complete', 0.25);
             any other value places every row in a single cluster

    Returns:
    list of cluster ids
    """
    # NOTE(review): the string/number pair handed to formClusters is presumably a
    # linkage name and a distance cutoff - confirm against formClusters.
    if method == 'chen2010':
        return formClusters(dist_mat, 'average', 0.32)
    if method == 'ademokun2011':
        return formClusters(dist_mat, 'complete', 0.25)

    # Unrecognized method: one id (1.0) per row of the matrix
    return np.ones(dist_mat.shape[0])
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
371
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
372 # TODO: Merge duplicate feed, process and collect functions.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def feedQueue(alive, data_queue, db_file, group_func, group_args={}):
    """
    Groups the records of a database file and pushes each group onto the data queue

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    data_queue = a multiprocessing.Queue to hold data for processing
    db_file = the Ig record database file
    group_func = the function to use for assigning preclones
    group_args = a dictionary of arguments to pass to group_func

    Returns:
    None
    """
    # Grouping step; on any failure flag the sibling processes, then propagate
    try:
        clone_dict = group_func(readDbFile(db_file), **group_args)
    except:
        alive.value = False
        raise

    # Feeding step
    try:
        pending = iter(clone_dict.items())
        while True:
            # A sibling process cleared the flag: report and stop feeding
            if not alive.value:
                sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                                 % os.getpid())
                return None

            # Poll again while the queue has no room
            if data_queue.full():
                continue

            group = next(pending, None)
            # All groups have been queued
            if group is None:
                break

            # Each item is a (key, records) pair wrapped in a DbData
            data_queue.put(DbData(*group))
    except:
        alive.value = False
        raise

    return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
423
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
424
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def feedQueueClust(alive, data_queue, db_file, group_func=None, group_args={}):
    """
    Feeds the data queue with Ig records

    Records are ordered by junction length. For every record, one item is queued
    whose data is a list holding that record followed by all records ordered
    after it.

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    data_queue = a multiprocessing.Queue to hold data for processing
    db_file = the Ig record database file
    group_func = not used by this function
    group_args = not used by this function

    Returns:
    None
    """
    # Open input file and perform grouping
    try:
        # Index records by id (a repeated id keeps the last record read)
        records = {}
        for rec in readDbFile(db_file):
            records[rec.id] = rec
        # Order by junction length
        records = OrderedDict(sorted(records.items(), key=lambda i: i[1].junction_length))

        # Build query-first record lists. OrderedDict keeps the feed order equal to
        # the junction-length order regardless of interpreter version.
        dist_dict = OrderedDict()
        while records:
            k, v = records.popitem(last=False)
            # Concatenate rather than list.append(): append returns None and would
            # nest the remaining records as a single element.
            dist_dict[k] = [v] + list(records.values())
    except:
        # Flag sibling processes before propagating any failure
        alive.value = False
        raise

    # Add groups to data queue
    try:
        # Iterate over groups and feed data queue
        dist_iter = iter(dist_dict.items())
        while alive.value:
            # Poll again while the queue has no room
            if data_queue.full():
                continue
            data = next(dist_iter, None)
            # Exit upon reaching end of iterator
            if data is None:
                break

            # Feed queue
            data_queue.put(DbData(*data))
        else:
            # Loop ended because alive was cleared by a sibling process
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        raise

    return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
480
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
481
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def processQueue(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Takes record groups off the data queue, assigns clones, and queues the results

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    data_queue = a multiprocessing.Queue holding data to process
    result_queue = a multiprocessing.Queue to hold processed results
    clone_func = the function to call for clonal assignment
    clone_args = a dictionary of arguments to pass to clone_func

    Returns:
    None
    """
    def _distinct(values):
        # Comma-joined distinct values
        return ','.join(set(values))

    try:
        while True:
            # A sibling process cleared the flag: report and stop working
            if not alive.value:
                sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                                 % os.getpid())
                return None

            # Poll again while there is nothing to process
            if data_queue.empty():
                continue

            data = data_queue.get()
            # A None item is the sentinel that ends processing
            if data is None:
                break

            # Result container for this group
            records = data.data
            result = DbResult(data.id, records)

            # Invalid data (failed indexing) is passed on as a failed result
            if not data:
                result_queue.put(result)
                continue

            # Log the V(D)J calls, junction lengths and group size
            result.log['ID'] = ','.join(str(x) for x in data.id)
            result.log['VALLELE'] = _distinct([(r.getVAllele() or '') for r in records])
            result.log['DALLELE'] = _distinct([(r.getDAllele() or '') for r in records])
            result.log['JALLELE'] = _distinct([(r.getJAllele() or '') for r in records])
            result.log['JUNCLEN'] = _distinct([(str(len(r.junction)) or '0') for r in records])
            result.log['SEQUENCES'] = len(records)

            # Assign clones unless the preclone step failed
            if data:
                clones = clone_func(records, **clone_args)
            else:
                clones = None

            if clones is None:
                result.log['CLONES'] = 0
            else:
                result.results = clones
                result.valid = True
                result.log['CLONES'] = len(clones)

            # Hand the result to the collector
            result_queue.put(result)
    except:
        # Flag sibling processes before propagating any failure
        alive.value = False
        raise

    return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
551
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
552
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Takes record lists off the data queue, computes a distance row, and queues the results

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    data_queue = a multiprocessing.Queue holding data to process
    result_queue = a multiprocessing.Queue to hold processed results
    clone_func = the function to call for calculating pairwise distances between sequences
    clone_args = a dictionary of arguments to pass to clone_func

    Returns:
    None
    """
    try:
        while True:
            # A sibling process cleared the flag: report and stop working
            if not alive.value:
                sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                                 % os.getpid())
                return None

            # Poll again while there is nothing to process
            if data_queue.empty():
                continue

            data = data_queue.get()
            # A None item is the sentinel that ends processing
            if data is None:
                break

            # Result container for this list of records
            records = data.data
            result = DbResult(data.id, records)

            # Compute the distance row only for valid data
            dist_row = None
            if data:
                dist_row = clone_func(records, **clone_args)

            # The result is marked valid only when a row was produced
            if dist_row is not None:
                result.results = dist_row
                result.valid = True

            # Hand the result to the collector
            result_queue.put(result)
    except:
        # Flag sibling processes before propagating any failure
        alive.value = False
        raise

    return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
602
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
603
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
604 def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
605 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
606 Assembles results from a queue of individual sequence results and manages log/file I/O
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
607
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
608 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
609 alive = a multiprocessing.Value boolean controlling whether processing continues
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
610 if False exit process
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
611 result_queue = a multiprocessing.Queue holding processQueue results
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
612 collect_queue = a multiprocessing.Queue to store collector return values
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
613 db_file = the input database file name
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
614 out_args = common output argument dictionary from parseCommonArgs
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
615 cluster_func = the function to call for carrying out clustering on distance matrix
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
616 cluster_args = a dictionary of arguments to pass to cluster_func
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
617
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
618 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
619 None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
620 (adds 'log' and 'out_files' to collect_dict)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
621 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
622 # Open output files
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
623 try:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
624 # Count records and define output format
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
625 out_type = getFileType(db_file) if out_args['out_type'] is None \
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
626 else out_args['out_type']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
627 result_count = countDbFile(db_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
628
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
629 # Defined successful output handle
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
630 pass_handle = getOutputHandle(db_file,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
631 out_label='clone-pass',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
632 out_dir=out_args['out_dir'],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
633 out_name=out_args['out_name'],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
634 out_type=out_type)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
635 pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
636
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
637 # Defined failed alignment output handle
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
638 if out_args['failed']:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
639 fail_handle = getOutputHandle(db_file,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
640 out_label='clone-fail',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
641 out_dir=out_args['out_dir'],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
642 out_name=out_args['out_name'],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
643 out_type=out_type)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
644 fail_writer = getDbWriter(fail_handle, db_file)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
645 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
646 fail_handle = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
647 fail_writer = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
648
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
649 # Define log handle
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
650 if out_args['log_file'] is None:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
651 log_handle = None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
652 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
653 log_handle = open(out_args['log_file'], 'w')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
654 except:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
655 #sys.stderr.write('Exception in collector file opening step\n')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
656 alive.value = False
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
657 raise
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
658
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
659 # Get results from queue and write to files
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
660 try:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
661 #print 'START COLLECT', alive.value
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
662 # Iterator over results queue until sentinel object reached
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
663 start_time = time()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
664 rec_count = clone_count = pass_count = fail_count = 0
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
665 while alive.value:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
666 # Get result from queue
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
667 if result_queue.empty(): continue
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
668 else: result = result_queue.get()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
669 # Exit upon reaching sentinel
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
670 if result is None: break
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
671 #print "COLLECT", alive.value, result['id']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
672
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
673 # Print progress for previous iteration and update record count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
674 if rec_count == 0:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
675 print('PROGRESS> Assigning clones')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
676 printProgress(rec_count, result_count, 0.05, start_time)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
677 rec_count += len(result.data)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
678
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
679 # Write passed and failed records
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
680 if result:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
681 for clone in result.results.values():
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
682 clone_count += 1
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
683 for i, rec in enumerate(clone):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
684 rec.annotations['CLONE'] = clone_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
685 pass_writer.writerow(rec.toDict())
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
686 pass_count += 1
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
687 result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
688
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
689 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
690 for i, rec in enumerate(result.data):
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
691 if fail_writer is not None: fail_writer.writerow(rec.toDict())
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
692 fail_count += 1
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
693 result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
694
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
695 # Write log
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
696 printLog(result.log, handle=log_handle)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
697 else:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
698 sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
699 % os.getpid())
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
700 return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
701
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
702 # Print total counts
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
703 printProgress(rec_count, result_count, 0.05, start_time)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
704
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
705 # Close file handles
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
706 pass_handle.close()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
707 if fail_handle is not None: fail_handle.close()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
708 if log_handle is not None: log_handle.close()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
709
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
710 # Update return list
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
711 log = OrderedDict()
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
712 log['OUTPUT'] = os.path.basename(pass_handle.name)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
713 log['CLONES'] = clone_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
714 log['RECORDS'] = rec_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
715 log['PASS'] = pass_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
716 log['FAIL'] = fail_count
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
717 collect_dict = {'log':log, 'out_files': [pass_handle.name]}
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
718 collect_queue.put(collect_dict)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
719 except:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
720 #sys.stderr.write('Exception in collector result processing step\n')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
721 alive.value = False
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
722 raise
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
723
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
724 return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
725
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
726
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def _closeHandles(*handles):
    """
    Closes each open file handle that is given

    Arguments:
    handles = file handles to close; None entries and handles that report
              themselves as already closed are skipped

    Returns:
    None
    """
    for handle in handles:
        if handle is not None and not getattr(handle, 'closed', False):
            handle.close()


def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func

    Returns:
    None
    (puts a dictionary with the keys 'log' and 'out_files' into collect_queue
     when collection completes; nothing is queued if alive becomes False first)
    """
    # Load records and open output files
    try:
        # Iterate over Ig records to count and order by junction length
        result_count = 0
        records = {}
        for rec in readDbFile(db_file):
            records[rec.id] = rec
            result_count += 1
        records = OrderedDict(sorted(records.items(), key=lambda i: i[1].junction_length))

        # Define empty matrix to store assembled results
        dist_mat = np.zeros((result_count, result_count))

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Defined successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Defined failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Open log file
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        row_count = rec_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break

            # Print progress for previous iteration
            if row_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(row_count, result_count, 0.05, start_time)

            # Update counts for iteration
            row_count += 1
            rec_count += len(result)

            # Add result row to distance matrix
            if result:
                offset = result_count - len(result)
                dist_mat[list(range(offset, result_count)), offset] = result.results
        else:
            # The loop ended because alive became False rather than via the sentinel
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Calculate linkage and carry out clustering
        clusters = cluster_func(dist_mat, **cluster_args)

        # Group records by cluster label; label i refers to the i-th record
        # in junction length order
        ordered_records = list(records.values())
        clones = {}
        for i, c in enumerate(clusters):
            clones.setdefault(c, []).append(ordered_records[i])

        # Write passed and failed records
        clone_count = pass_count = fail_count = 0
        if clones:
            for clone in clones.values():
                clone_count += 1
                for rec in clone:
                    rec.annotations['CLONE'] = clone_count
                    pass_writer.writerow(rec.toDict())
                    pass_count += 1
        else:
            # No clusters were produced, so every loaded record fails. The
            # records are taken from the loaded database because the last
            # queue item seen at this point is the None sentinel.
            for rec in ordered_records:
                if fail_writer is not None:
                    fail_writer.writerow(rec.toDict())
                fail_count += 1

        # Print final progress
        printProgress(row_count, result_count, 0.05, start_time)

        # Close file handles before the output file names are reported
        _closeHandles(pass_handle, fail_handle, log_handle)

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise
    finally:
        # Release handles on the early return and error paths as well
        _closeHandles(pass_handle, fail_handle, log_handle)

    return None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
870
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
871
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None,
                 group_func=None, group_args=None, clone_args=None, cluster_args=None,
                 out_args=default_out_args, nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
    db_file = filename of input database
    feed_func = the function that feeds the queue
    work_func = the worker function that will run on each CPU
    collect_func = the function that collects results from the workers
    clone_func = the function to use for determining clones within preclonal groups
    cluster_func = the function to call for carrying out clustering on distance matrix;
                   if None no clustering information is logged
    group_func = the function to use for assigning preclones
    group_args = a dictionary of arguments to pass to group_func;
                 if None an empty dictionary is used
    clone_args = a dictionary of arguments to pass to clone_func;
                 if None an empty dictionary is used
    cluster_args = a dictionary of arguments to pass to cluster_func;
                   if None an empty dictionary is used
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc

    Returns:
    a list of successful output file names
    """
    # Use a fresh dictionary per call instead of a shared mutable default
    group_args = {} if group_args is None else group_args
    clone_args = {} if clone_args is None else clone_args
    cluster_args = {} if cluster_args is None else cluster_args

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['DB_FILE'] = os.path.basename(db_file)
    if group_func is not None:
        log['GROUP_FUNC'] = group_func.__name__
        log['GROUP_ARGS'] = group_args
    log['CLONE_FUNC'] = clone_func.__name__

    # TODO: this is yucky, but can be fixed by using a model class
    # Log a copy of the clone arguments with any 'dist_mat' entry left out;
    # the caller's dictionary is not modified
    clone_log = clone_args.copy()
    clone_log.pop('dist_mat', None)
    log['CLONE_ARGS'] = clone_log

    if cluster_func is not None:
        log['CLUSTER_FUNC'] = cluster_func.__name__
        log['CLUSTER_ARGS'] = cluster_args
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    feed_args = {'db_file': db_file,
                 'group_func': group_func,
                 'group_args': group_args}
    # Define worker function and arguments
    work_args = {'clone_func': clone_func,
                 'clone_args': clone_args}
    # Define collector function and arguments
    collect_args = {'db_file': db_file,
                    'out_args': out_args,
                    'cluster_func': cluster_func,
                    'cluster_args': cluster_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])

    return result['out_files']
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
939
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
940
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
941 def getArgParser():
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
942 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
943 Defines the ArgumentParser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
944
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
945 Arguments:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
946 None
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
947
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
948 Returns:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
949 an ArgumentParser object
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
950 """
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
951 # Define input and output fields
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
952 fields = dedent(
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
953 '''
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
954 output files:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
955 clone-pass
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
956 database with assigned clonal group numbers.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
957 clone-fail
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
958 database with records failing clonal grouping.
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
959
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
960 required fields:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
961 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
962
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
963 <field>
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
964 sequence field specified by the --sf parameter
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
965
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
966 output fields:
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
967 CLONE
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
968 ''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
969
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
970 # Define ArgumentParser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
971 parser = ArgumentParser(description=__doc__, epilog=fields,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
972 formatter_class=CommonHelpFormatter)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
973 parser.add_argument('--version', action='version',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
974 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
975 subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
976 help='Cloning method')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
977 # TODO: This is a temporary fix for Python issue 9253
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
978 subparsers.required = True
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
979
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
980 # Parent parser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
981 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
982 multiproc=True)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
983
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
984 # Distance cloning method
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
985 parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
986 formatter_class=CommonHelpFormatter,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
987 help='''Defines clones as having same V assignment,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
988 J assignment, and junction length with
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
989 specified substitution distance model.''',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
990 description='''Defines clones as having same V assignment,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
991 J assignment, and junction length with
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
992 specified substitution distance model.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
993 parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
994 help='Additional fields to use for grouping clones (non VDJ)')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
995 parser_bygroup.add_argument('--mode', action='store', dest='mode',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
996 choices=('allele', 'gene'), default=default_index_mode,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
997 help='''Specifies whether to use the V(D)J allele or gene for
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
998 initial grouping.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
999 parser_bygroup.add_argument('--act', action='store', dest='action',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1000 choices=('first', 'set'), default=default_index_action,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1001 help='''Specifies how to handle multiple V(D)J assignments
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1002 for initial grouping.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1003 parser_bygroup.add_argument('--model', action='store', dest='model',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1004 choices=choices_bygroup_model,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1005 default=default_bygroup_model,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1006 help='''Specifies which substitution model to use for calculating distance
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1007 between sequences. The "ham" model is nucleotide Hamming distance and
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1008 "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1009 human specific single nucleotide and 5-mer content models, respectively,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1010 from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1011 mouse specific single nucleotide and 5-mer content models, respectively,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1012 from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1013 deprecated models provided backwards compatibility with the "m1n" and
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1014 "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1015 5-mer models should be considered experimental.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1016 parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1017 default=default_distance,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1018 help='The distance threshold for clonal grouping')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1019 parser_bygroup.add_argument('--norm', action='store', dest='norm',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1020 choices=('len', 'mut', 'none'), default=default_norm,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1021 help='''Specifies how to normalize distances. One of none
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1022 (do not normalize), len (normalize by length),
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1023 or mut (normalize by number of mutations between sequences).''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1024 parser_bygroup.add_argument('--sym', action='store', dest='sym',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1025 choices=('avg', 'min'), default=default_sym,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1026 help='''Specifies how to combine asymmetric distances. One of avg
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1027 (average of A->B and B->A) or min (minimum of A->B and B->A).''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1028 parser_bygroup.add_argument('--link', action='store', dest='linkage',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1029 choices=('single', 'average', 'complete'), default=default_linkage,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1030 help='''Type of linkage to use for hierarchical clustering.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1031 parser_bygroup.add_argument('--sf', action='store', dest='seq_field',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1032 default=default_seq_field,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1033 help='''The name of the field to be used to calculate
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1034 distance between records''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1035 parser_bygroup.set_defaults(feed_func=feedQueue)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1036 parser_bygroup.set_defaults(work_func=processQueue)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1037 parser_bygroup.set_defaults(collect_func=collectQueue)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1038 parser_bygroup.set_defaults(group_func=indexJunctions)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1039 parser_bygroup.set_defaults(clone_func=distanceClones)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1040
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1041 # Chen2010
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1042 parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1043 formatter_class=CommonHelpFormatter,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1044 help='''Defines clones by method specified in Chen, 2010.''',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1045 description='''Defines clones by method specified in Chen, 2010.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1046 parser_chen.set_defaults(feed_func=feedQueueClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1047 parser_chen.set_defaults(work_func=processQueueClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1048 parser_chen.set_defaults(collect_func=collectQueueClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1049 parser_chen.set_defaults(cluster_func=hierClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1050
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1051 # Ademokun2011
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1052 parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent],
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1053 formatter_class=CommonHelpFormatter,
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1054 help='''Defines clones by method specified in Ademokun, 2011.''',
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1055 description='''Defines clones by method specified in Ademokun, 2011.''')
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1056 parser_ade.set_defaults(feed_func=feedQueueClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1057 parser_ade.set_defaults(work_func=processQueueClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1058 parser_ade.set_defaults(collect_func=collectQueueClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1059 parser_ade.set_defaults(cluster_func=hierClust)
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1060
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1061 return parser
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1062
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
1063
183edf446dcf Uploaded
davidvanzessen
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: read the command line, repackage the parsed
    # options into the keyword arguments passed to defineClones, and call
    # defineClones once per input database file.

    # Build the parser, run checkArgs on it, then parse the arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # Upper-case the sequence field and the grouping field names
    if 'seq_field' in args_dict:
        args_dict['seq_field'] = args_dict['seq_field'].upper()
    if args_dict.get('fields') is not None:
        args_dict['fields'] = [name.upper() for name in args_dict['fields']]

    # bygroup: bundle the flat options into group_args / clone_args
    if args.command == 'bygroup':
        group_keys = ('fields', 'action', 'mode')
        clone_keys = ('model', 'distance', 'norm', 'sym', 'linkage', 'seq_field')

        args_dict['group_args'] = {key: args_dict[key] for key in group_keys}
        clone_args = {key: args_dict[key] for key in clone_keys}
        args_dict['clone_args'] = clone_args

        # Look up the distance matrix for the selected model; exit with a
        # message when the model name is not present in distance_models
        model_name = args_dict['model']
        try:
            clone_args['dist_mat'] = distance_models[model_name]
        except KeyError:
            sys.exit('Unrecognized distance model: %s' % model_name)

        # The bundled options must not also be passed as top-level keywords
        for key in group_keys + clone_keys:
            del args_dict[key]

    # chen2010 / ademokun2011: choose the distance function and record the
    # method name for the clustering step
    if args.command == 'chen2010':
        args_dict['clone_func'] = distChen2010
        args_dict['cluster_args'] = dict(method=args.command)
    elif args.command == 'ademokun2011':
        args_dict['clone_func'] = distAdemokun2011
        args_dict['cluster_args'] = dict(method=args.command)

    # Drop the entries that are not passed on, then call defineClones for
    # each database file given on the command line
    for key in ('command', 'db_files'):
        del args_dict[key]
    for db_path in vars(args)['db_files']:
        args_dict['db_file'] = db_path
        defineClones(**args_dict)