Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-0.2.0/src/squid/weight.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff1768533a07 |
---|---|
1 /***************************************************************** | |
2 * SQUID - a library of functions for biological sequence analysis | |
3 * Copyright (C) 1992-2002 Washington University School of Medicine | |
4 * | |
5 * This source code is freely distributed under the terms of the | |
6 * GNU General Public License. See the files COPYRIGHT and LICENSE | |
7 * for details. | |
8 *****************************************************************/ | |
9 | |
10 /* weight.c | |
11 * SRE, Thu Mar 3 07:56:01 1994 | |
12 * | |
13 * Calculate weights for sequences in an alignment. | |
14 * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp) | |
15 */ | |
16 | |
17 #include <ctype.h> | |
18 #include <string.h> | |
19 #include "squid.h" | |
20 #include "sre_random.h" | |
21 | |
22 static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node); | |
23 static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, | |
24 float *fwt, int node); | |
25 static float simple_distance(char *s1, char *s2); | |
26 static int simple_diffmx(char **aseqs,int num, float ***ret_dmx); | |
27 | |
28 /* Function: GSCWeights() | |
29 * | |
30 * Purpose: Use Erik's tree-based algorithm to set weights for | |
31 * sequences in an alignment. upweight() and downweight() | |
32 * are derived from Graeme Mitchison's code. | |
33 * | |
34 * Args: aseq - array of (0..nseq-1) aligned sequences | |
35 * nseq - number of seqs in alignment | |
36 * alen - length of alignment | |
37 * wgt - allocated [0..nseq-1] array of weights to be returned | |
38 * | |
39 * Return: (void) | |
40 * wgt is filled in. | |
41 */ | |
42 void | |
43 GSCWeights(char **aseq, int nseq, int alen, float *wgt) | |
44 { | |
45 float **dmx; /* distance (difference) matrix */ | |
46 struct phylo_s *tree; | |
47 float *lwt, *rwt; /* weight on left, right of this tree node */ | |
48 float *fwt; /* final weight assigned to this node */ | |
49 int i; | |
50 | |
51 /* Sanity check first | |
52 */ | |
53 if (nseq == 1) { wgt[0] = 1.0; return; } | |
54 | |
55 /* I use a simple fractional difference matrix derived by | |
56 * pairwise identity. Perhaps I should include a Poisson | |
57 * distance correction. | |
58 */ | |
59 MakeDiffMx(aseq, nseq, &dmx); | |
60 if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed"); | |
61 | |
62 /* Allocations | |
63 */ | |
64 lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); | |
65 rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); | |
66 fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); | |
67 | |
68 /* lwt and rwt are the total branch weight to the left and | |
69 * right of a node or sequence. They are 0..2N-2. 0..N-1 are | |
70 * the sequences; these have weight 0. N..2N-2 are the actual | |
71 * tree nodes. | |
72 */ | |
73 for (i = 0; i < nseq; i++) | |
74 lwt[i] = rwt[i] = 0.0; | |
75 /* recursively calculate rwt, lwt, starting | |
76 at node nseq (the root) */ | |
77 upweight(tree, nseq, lwt, rwt, nseq); | |
78 | |
79 /* recursively distribute weight across the | |
80 tree */ | |
81 fwt[nseq] = nseq; | |
82 downweight(tree, nseq, lwt, rwt, fwt, nseq); | |
83 /* collect the weights */ | |
84 for (i = 0; i < nseq; i++) | |
85 wgt[i] = fwt[i]; | |
86 | |
87 FMX2Free(dmx); | |
88 FreePhylo(tree, nseq); | |
89 free(lwt); free(rwt); free(fwt); | |
90 } | |
91 | |
92 static void | |
93 upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node) | |
94 { | |
95 int ld,rd; | |
96 | |
97 ld = tree[node-nseq].left; | |
98 if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld); | |
99 rd = tree[node-nseq].right; | |
100 if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd); | |
101 lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen; | |
102 rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen; | |
103 } | |
104 | |
105 | |
106 static void | |
107 downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node) | |
108 { | |
109 int ld,rd; | |
110 float lnum, rnum; | |
111 | |
112 ld = tree[node-nseq].left; | |
113 rd = tree[node-nseq].right; | |
114 if (lwt[node] + rwt[node] > 0.0) | |
115 { | |
116 fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node])); | |
117 fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node])); | |
118 } | |
119 else | |
120 { | |
121 lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0; | |
122 rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0; | |
123 fwt[ld] = fwt[node] * lnum / (lnum + rnum); | |
124 fwt[rd] = fwt[node] * rnum / (lnum + rnum); | |
125 } | |
126 | |
127 if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld); | |
128 if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd); | |
129 } | |
130 | |
131 | |
132 | |
133 | |
134 /* Function: VoronoiWeights() | |
135 * | |
136 * Purpose: Calculate weights using the scheme of Sibbald & | |
137 * Argos (JMB 216:813-818 1990). The scheme is | |
138 * slightly modified because the original algorithm | |
139 * actually doesn't work on gapped alignments. | |
140 * The sequences are assumed to be protein. | |
141 * | |
142 * Args: aseq - array of (0..nseq-1) aligned sequences | |
143 * nseq - number of sequences | |
144 * alen - length of alignment | |
145 * wgt - allocated [0..nseq-1] array of weights to be returned | |
146 * | |
147 * Return: void | |
148 * wgt is filled in. | |
149 */ | |
150 void | |
151 VoronoiWeights(char **aseq, int nseq, int alen, float *wgt) | |
152 { | |
153 float **dmx; /* distance (difference) matrix */ | |
154 float *halfmin; /* 1/2 minimum distance to other seqs */ | |
155 char **psym; /* symbols seen in each column */ | |
156 int *nsym; /* # syms seen in each column */ | |
157 int symseen[27]; /* flags for observed syms */ | |
158 char *randseq; /* randomly generated sequence */ | |
159 int acol; /* pos in aligned columns */ | |
160 int idx; /* index in sequences */ | |
161 int symidx; /* 0..25 index for symbol */ | |
162 int i; /* generic counter */ | |
163 float min; /* minimum distance */ | |
164 float dist; /* distance between random and real */ | |
165 float challenge, champion; /* for resolving ties */ | |
166 int itscale; /* how many iterations per seq */ | |
167 int iteration; | |
168 int best; /* index of nearest real sequence */ | |
169 | |
170 /* Sanity check first | |
171 */ | |
172 if (nseq == 1) { wgt[0] = 1.0; return; } | |
173 | |
174 itscale = 50; | |
175 | |
176 /* Precalculate 1/2 minimum distance to other | |
177 * sequences for each sequence | |
178 */ | |
179 if (! simple_diffmx(aseq, nseq, &dmx)) | |
180 Die("simple_diffmx() failed"); | |
181 halfmin = MallocOrDie (sizeof(float) * nseq); | |
182 for (idx = 0; idx < nseq; idx++) | |
183 { | |
184 for (min = 1.0, i = 0; i < nseq; i++) | |
185 { | |
186 if (i == idx) continue; | |
187 if (dmx[idx][i] < min) min = dmx[idx][i]; | |
188 } | |
189 halfmin[idx] = min / 2.0; | |
190 } | |
191 Free2DArray((void **) dmx, nseq); | |
192 | |
193 /* Set up the random sequence generating model. | |
194 */ | |
195 psym = MallocOrDie (alen * sizeof(char *)); | |
196 nsym = MallocOrDie (alen * sizeof(int)); | |
197 for (acol = 0; acol < alen; acol++) | |
198 psym[acol] = MallocOrDie (27 * sizeof(char)); | |
199 | |
200 /* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */ | |
201 for (acol = 0; acol < alen; acol++) | |
202 { | |
203 memset(symseen, 0, sizeof(int) * 27); | |
204 for (idx = 0; idx < nseq; idx++) | |
205 if (! isgap(aseq[idx][acol])) | |
206 { | |
207 if (isupper((int) aseq[idx][acol])) | |
208 symidx = aseq[idx][acol] - 'A'; | |
209 else | |
210 symidx = aseq[idx][acol] - 'a'; | |
211 if (symidx >= 0 && symidx < 26) | |
212 symseen[symidx] = 1; | |
213 } | |
214 else | |
215 symseen[26] = 1; /* a gap */ | |
216 | |
217 for (nsym[acol] = 0, i = 0; i < 26; i++) | |
218 if (symseen[i]) | |
219 { | |
220 psym[acol][nsym[acol]] = 'A'+i; | |
221 nsym[acol]++; | |
222 } | |
223 if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; } | |
224 } | |
225 /* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */ | |
226 | |
227 /* Note: the original Sibbald&Argos algorithm calls for | |
228 * bounding the sampled space using a template-like random | |
229 * sequence generator. However, this leads to one minor | |
230 * and one major problem. The minor problem is that | |
231 * exceptional amino acids in a column can have a | |
232 * significant effect by altering the amount of sampled | |
233 * sequence space; the larger the data set, the worse | |
234 * this problem becomes. The major problem is that | |
235 * there is no reasonable way to deal with gaps. | |
236 * Gapped sequences simply inhabit a different dimensionality | |
237 * and it's pretty painful to imagine calculating Voronoi | |
238 * volumes when the N in your N-space is varying. | |
239 * Note that all the examples shown by Sibbald and Argos | |
240 * are *ungapped* examples. | |
241 * | |
242 * The best way I've found to circumvent this problem is | |
243 * just not to bound the sampled space; count gaps as | |
244 * symbols and generate completely random sequences. | |
245 */ | |
246 #ifdef ALL_SEQUENCE_SPACE | |
247 for (acol = 0; acol < alen; acol++) | |
248 { | |
249 strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY "); | |
250 nsym[acol] = 21; | |
251 } | |
252 #endif | |
253 | |
254 /* Sibbald and Argos algorithm: | |
255 * 1) assign all seqs weight 0. | |
256 * 2) generate a "random" sequence | |
257 * 3) calculate distance to every other sequence | |
258 * (if we get a distance < 1/2 minimum distance | |
259 * to other real seqs, we can stop) | |
260 * 4) if unique closest sequence, increment its weight 1. | |
261 * if multiple closest seq, choose one randomly | |
262 * 5) repeat 2-4 for lots of iterations | |
263 * 6) normalize all weights to sum to nseq. | |
264 */ | |
265 randseq = MallocOrDie ((alen+1) * sizeof(char)); | |
266 | |
267 best = 42.; /* solely to silence GCC uninit warnings. */ | |
268 FSet(wgt, nseq, 0.0); | |
269 for (iteration = 0; iteration < itscale * nseq; iteration++) | |
270 { | |
271 for (acol = 0; acol < alen; acol++) | |
272 randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])]; | |
273 randseq[acol] = '\0'; | |
274 | |
275 champion = sre_random(); | |
276 for (min = 1.0, idx = 0; idx < nseq; idx++) | |
277 { | |
278 dist = simple_distance(aseq[idx], randseq); | |
279 if (dist < halfmin[idx]) | |
280 { | |
281 best = idx; | |
282 break; | |
283 } | |
284 if (dist < min) | |
285 { champion = sre_random(); best = idx; min = dist; } | |
286 else if (dist == min) | |
287 { | |
288 challenge = sre_random(); | |
289 if (challenge > champion) | |
290 { champion = challenge; best = idx; min = dist; } | |
291 } | |
292 } | |
293 wgt[best] += 1.0; | |
294 } | |
295 | |
296 for (idx = 0; idx < nseq; idx++) | |
297 wgt[idx] = wgt[idx] / (float) itscale; | |
298 | |
299 free(randseq); | |
300 free(nsym); | |
301 free(halfmin); | |
302 Free2DArray((void **) psym, alen); | |
303 } | |
304 | |
305 | |
306 /* Function: simple_distance() | |
307 * | |
308 * Purpose: For two identical-length null-terminated strings, return | |
309 * the fractional difference between them. (0..1) | |
310 * (Gaps don't count toward anything.) | |
311 */ | |
312 static float | |
313 simple_distance(char *s1, char *s2) | |
314 { | |
315 int diff = 0; | |
316 int valid = 0; | |
317 | |
318 for (; *s1 != '\0'; s1++, s2++) | |
319 { | |
320 if (isgap(*s1) || isgap(*s2)) continue; | |
321 if (*s1 != *s2) diff++; | |
322 valid++; | |
323 } | |
324 return (valid > 0 ? ((float) diff / (float) valid) : 0.0); | |
325 } | |
326 | |
327 /* Function: simple_diffmx() | |
328 * | |
329 * Purpose: Given a set of flushed, aligned sequences, construct | |
330 * an NxN fractional difference matrix using the | |
331 * simple_distance rule. | |
332 * | |
333 * Args: aseqs - flushed, aligned sequences | |
334 * num - number of aseqs | |
335 * ret_dmx - RETURN: difference matrix (caller must free) | |
336 * | |
337 * Return: 1 on success, 0 on failure. | |
338 */ | |
339 static int | |
340 simple_diffmx(char **aseqs, | |
341 int num, | |
342 float ***ret_dmx) | |
343 { | |
344 float **dmx; /* RETURN: distance matrix */ | |
345 int i,j; /* counters over sequences */ | |
346 | |
347 /* Allocate | |
348 */ | |
349 if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL) | |
350 Die("malloc failed"); | |
351 for (i = 0; i < num; i++) | |
352 if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL) | |
353 Die("malloc failed"); | |
354 | |
355 /* Calculate distances, symmetric matrix | |
356 */ | |
357 for (i = 0; i < num; i++) | |
358 for (j = i; j < num; j++) | |
359 dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]); | |
360 | |
361 /* Return | |
362 */ | |
363 *ret_dmx = dmx; | |
364 return 1; | |
365 } | |
366 | |
367 | |
368 | |
369 /* Function: BlosumWeights() | |
370 * Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis) | |
371 * | |
372 * Purpose: Assign weights to a set of aligned sequences | |
373 * using the BLOSUM rule: | |
374 * - do single linkage clustering at some pairwise identity | |
375 * - in each cluster, give each sequence 1/clustsize | |
376 * total weight. | |
377 * | |
378 * The clusters have no pairwise link >= maxid. | |
379 * | |
380 * O(N) in memory. Probably ~O(NlogN) in time; O(N^2) | |
381 * in worst case, which is no links between sequences | |
382 * (e.g., values of maxid near 1.0). | |
383 * | |
384 * Args: aseqs - alignment | |
385 * nseq - number of seqs in alignment | |
386 * alen - # of columns in alignment | |
387 * maxid - fractional identity (e.g. 0.62 for BLOSUM62) | |
388 * wgt - [0..nseq-1] array of weights to be returned | |
389 */ | |
390 void | |
391 BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt) | |
392 { | |
393 int *c, nc; | |
394 int *nmem; /* number of seqs in each cluster */ | |
395 int i; /* loop counter */ | |
396 | |
397 SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc); | |
398 | |
399 FSet(wgt, nseq, 1.0); | |
400 nmem = MallocOrDie(sizeof(int) * nc); | |
401 | |
402 for (i = 0; i < nc; i++) nmem[i] = 0; | |
403 for (i = 0; i < nseq; i++) nmem[c[i]]++; | |
404 for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]]; | |
405 | |
406 free(nmem); | |
407 free(c); | |
408 return; | |
409 } | |
410 | |
411 | |
412 /* Function: PositionBasedWeights() | |
413 * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis] | |
414 * | |
415 * Purpose: Implementation of Henikoff and Henikoff position-based | |
416 * weights (JMB 243:574-578, 1994) [Henikoff94b]. | |
417 * | |
418 * A significant advantage of this approach that Steve and Jorja | |
419 * don't point out is that it is O(N) in memory, unlike | |
420 * many other approaches like GSC weights or Voronoi. | |
421 * | |
422 * A potential disadvantage that they don't point out | |
423 * is that in the theoretical limit of infinite sequences | |
424 * in the alignment, weights go flat: eventually every | |
425 * column has at least one representative of each of 20 aa (or 4 nt) | |
426 * in it. | |
427 * | |
428 * They also don't give a rule for how to handle gaps. | |
429 * The rule used here seems the obvious and sensible one | |
430 * (ignore them). This means that longer sequences | |
431 * initially get more weight; hence a "double | |
432 * normalization" in which the weights are first divided | |
433 * by sequence length (to compensate for that effect), | |
434 * then normalized to sum to nseq. | |
435 * | |
436 * Limitations: | |
437 * Implemented in a way that's alphabet-independent: | |
438 * it uses the 26 upper case letters as "residues". | |
439 * Any alphabetic character in aseq is interpreted as | |
440 * a unique "residue" (case insensitively; lower case | |
441 * mapped to upper case). All other characters are | |
442 * interpreted as gaps. | |
443 * | |
444 * This way, we don't have to pass around any alphabet | |
445 * type info (DNA vs. RNA vs. protein) and don't have | |
446 * to deal with remapping IUPAC degenerate codes | |
447 * probabilistically. However, on the down side, | |
448 * a sequence with a lot of degenerate IUPAC characters | |
449 * will get an artifactually high PB weight. | |
450 * | |
451 * Args: aseq - sequence alignment to weight | |
452 * nseq - number of sequences in alignment | |
453 * alen - length of alignment | |
454 * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1) | |
455 * | |
456 * Returns: (void) | |
457 * wgt is allocated (0..nseq-1) by caller, and filled in here. | |
458 */ | |
459 void | |
460 PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt) | |
461 { | |
462 int rescount[26]; /* count of A-Z residues in a column */ | |
463 int nres; /* number of different residues in col */ | |
464 int idx, pos; /* indices into aseq */ | |
465 int x; | |
466 float norm; | |
467 | |
468 FSet(wgt, nseq, 0.0); | |
469 for (pos = 0; pos < alen; pos++) | |
470 { | |
471 for (x = 0; x < 26; x++) rescount[x] = 0; | |
472 for (idx = 0; idx < nseq; idx++) | |
473 if (isalpha(aseq[idx][pos])) | |
474 rescount[toupper(aseq[idx][pos]) - 'A'] ++; | |
475 | |
476 nres = 0; | |
477 for (x = 0; x < 26; x++) | |
478 if (rescount[x] > 0) nres++; | |
479 | |
480 for (idx = 0; idx < nseq; idx++) | |
481 if (isalpha(aseq[idx][pos])) | |
482 wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']); | |
483 } | |
484 | |
485 for (idx = 0; idx < nseq; idx++) | |
486 wgt[idx] /= (float) DealignedLength(aseq[idx]); | |
487 norm = (float) nseq / FSum(wgt, nseq); | |
488 FScale(wgt, nseq, norm); | |
489 return; | |
490 } | |
491 | |
492 | |
493 | |
494 | |
495 /* Function: FilterAlignment() | |
496 * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis] | |
497 * | |
498 * Purpose: Constructs a new alignment by removing near-identical | |
499 * sequences from a given alignment (where identity is | |
500 * calculated *based on the alignment*). | |
501 * Does not affect the given alignment. | |
502 * Keeps earlier sequence, discards later one. | |
503 * | |
504 * Usually called as an ad hoc sequence "weighting" mechanism. | |
505 * | |
506 * Limitations: | |
507 * Unparsed Stockholm markup is not propagated into the | |
508 * new alignment. | |
509 * | |
510 * Args: msa -- original alignment | |
511 * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id. | |
512 * ret_new -- RETURN: new MSA, usually w/ fewer sequences | |
513 * | |
514 * Return: (void) | |
515 * ret_new must be free'd by caller: MSAFree(). | |
516 */ | |
517 void | |
518 FilterAlignment(MSA *msa, float cutoff, MSA **ret_new) | |
519 { | |
520 int nnew; /* number of seqs in new alignment */ | |
521 int *list; | |
522 int *useme; | |
523 float ident; | |
524 int i,j; | |
525 int remove; | |
526 | |
527 /* find which seqs to keep (list) */ | |
528 /* diff matrix; allow ragged ends */ | |
529 list = MallocOrDie (sizeof(int) * msa->nseq); | |
530 useme = MallocOrDie (sizeof(int) * msa->nseq); | |
531 for (i = 0; i < msa->nseq; i++) useme[i] = FALSE; | |
532 | |
533 nnew = 0; | |
534 for (i = 0; i < msa->nseq; i++) | |
535 { | |
536 remove = FALSE; | |
537 for (j = 0; j < nnew; j++) | |
538 { | |
539 ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]); | |
540 if (ident > cutoff) | |
541 { | |
542 remove = TRUE; | |
543 printf("removing %12s -- fractional identity %.2f to %s\n", | |
544 msa->sqname[i], ident, | |
545 msa->sqname[list[j]]); | |
546 break; | |
547 } | |
548 } | |
549 if (remove == FALSE) { | |
550 list[nnew++] = i; | |
551 useme[i] = TRUE; | |
552 } | |
553 } | |
554 | |
555 MSASmallerAlignment(msa, useme, ret_new); | |
556 free(list); | |
557 free(useme); | |
558 return; | |
559 } | |
560 | |
561 | |
562 /* Function: SampleAlignment() | |
563 * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis] | |
564 * | |
565 * Purpose: Constructs a new, smaller alignment by sampling a given | |
566 * number of sequences at random. Does not change the | |
567 * alignment nor the order of the sequences. | |
568 * | |
569 * If you ask for a sample that is larger than nseqs, | |
570 * it silently returns the original alignment. | |
571 * | |
572 * Not really a weighting method, but this is as good | |
573 * a place as any to keep it, since it's similar in | |
574 * construction to FilterAlignment(). | |
575 * | |
576 * Args: msa -- original alignment | |
577 * sample -- number of sequences in new alignment (0 < sample <= nseq) | |
578 * ret_new -- RETURN: new MSA | |
579 * | |
580 * Return: (void) | |
581 * ret_new must be free'd by caller: MSAFree(). | |
582 */ | |
583 void | |
584 SampleAlignment(MSA *msa, int sample, MSA **ret_new) | |
585 { | |
586 int *list; /* array for random selection w/o replace */ | |
587 int *useme; /* array of flags 0..nseq-1: TRUE to use */ | |
588 int i, idx; | |
589 int len; | |
590 | |
591 /* Allocations | |
592 */ | |
593 list = (int *) MallocOrDie (sizeof(int) * msa->nseq); | |
594 useme = (int *) MallocOrDie (sizeof(int) * msa->nseq); | |
595 for (i = 0; i < msa->nseq; i++) | |
596 { | |
597 list[i] = i; | |
598 useme[i] = FALSE; | |
599 } | |
600 | |
601 /* Sanity check. | |
602 */ | |
603 if (sample >= msa->nseq) sample = msa->nseq; | |
604 | |
605 /* random selection w/o replacement */ | |
606 for (len = msa->nseq, i = 0; i < sample; i++) | |
607 { | |
608 idx = CHOOSE(len); | |
609 printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]); | |
610 useme[list[idx]] = TRUE; | |
611 list[idx] = list[--len]; | |
612 } | |
613 | |
614 MSASmallerAlignment(msa, useme, ret_new); | |
615 free(list); | |
616 free(useme); | |
617 return; | |
618 } | |
619 | |
620 | |
621 /* Function: SingleLinkCluster() | |
622 * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis] | |
623 * | |
624 * Purpose: Perform simple single link clustering of seqs in a | |
625 * sequence alignment. A pairwise identity threshold | |
626 * defines whether two sequences are linked or not. | |
627 * | |
628 * Important: runs in O(N) memory, unlike standard | |
629 * graph decomposition algorithms that use O(N^2) | |
630 * adjacency matrices or adjacency lists. Requires | |
631 * O(N^2) time in worst case (which is when you have | |
632 * no links at all), O(NlogN) in "average" | |
633 * case, and O(N) in best case (when there is just | |
634 * one cluster in a completely connected graph. | |
635 * | |
636 * (Developed because hmmbuild could no longer deal | |
637 * with GP120, a 16,013 sequence alignment.) | |
638 * | |
639 * Limitations: | |
640 * CASE-SENSITIVE. Assumes aseq have been put into | |
641 * either all lower or all upper case; or at least, | |
642 * within a column, there's no mixed case. | |
643 * | |
644 * Algorithm: | |
645 * I don't know if this algorithm is published. I | |
646 * haven't seen it in graph theory books, but that might | |
647 * be because it's so obvious that nobody's bothered. | |
648 * | |
649 * In brief, we're going to do a breadth-first search | |
650 * of the graph, and we're going to calculate links | |
651 * on the fly rather than precalculating them into | |
652 * some sort of standard adjacency structure. | |
653 * | |
654 * While working, we keep two stacks of maximum length N: | |
655 * a : list of vertices that are still unconnected. | |
656 * b : list of vertices that we've connected to | |
657 * in our current breadth level, but we haven't | |
658 * yet tested for other connections to a. | |
659 * The current length (number of elements in) a and b are | |
660 * kept in na, nb. | |
661 * | |
662 * We store our results in an array of length N: | |
663 * c : assigns each vertex to a component. for example | |
664 * c[4] = 1 means that vertex 4 is in component 1. | |
665 * nc is the number of components. Components | |
666 * are numbered from 0 to nc-1. We return c and nc | |
667 * to our caller. | |
668 * | |
669 * The algorithm is: | |
670 * | |
671 * Initialisation: | |
672 * a <-- all the vertices | |
673 * na <-- N | |
674 * b <-- empty set | |
675 * nb <-- 0 | |
676 * nc <-- 0 | |
677 * | |
678 * Then: | |
679 * while (a is not empty) | |
680 * pop a vertex off a, push onto b | |
681 * while (b is not empty) | |
682 * pop vertex v off b | |
683 * assign c[v] = nc | |
684 * for each vertex w in a: | |
685 * compare v,w. If w is linked to v, remove w | |
686 * from a, push onto b. | |
687 * nc++ | |
688 * q.e.d. :) | |
689 * | |
690 * Args: aseq - aligned sequences | |
691 * nseq - number of sequences in aseq | |
692 * alen - alignment length | |
693 * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked | |
694 * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters) | |
695 * ret_nc - RETURN: number of components | |
696 * | |
697 * Returns: void. | |
698 * ret_c is allocated here. Caller free's with free(*ret_c) | |
699 */ | |
700 void | |
701 SingleLinkCluster(char **aseq, int nseq, int alen, float maxid, | |
702 int **ret_c, int *ret_nc) | |
703 { | |
704 int *a, na; /* stack of available vertices */ | |
705 int *b, nb; /* stack of working vertices */ | |
706 int *c; /* array of results */ | |
707 int nc; /* total number of components */ | |
708 int v,w; /* index of a working vertices */ | |
709 int i; /* loop counter */ | |
710 | |
711 /* allocations and initializations | |
712 */ | |
713 a = MallocOrDie (sizeof(int) * nseq); | |
714 b = MallocOrDie (sizeof(int) * nseq); | |
715 c = MallocOrDie (sizeof(int) * nseq); | |
716 for (i = 0; i < nseq; i++) a[i] = i; | |
717 na = nseq; | |
718 nb = 0; | |
719 nc = 0; | |
720 | |
721 /* Main algorithm | |
722 */ | |
723 while (na > 0) | |
724 { | |
725 v = a[na-1]; na--; /* pop a vertex off a, */ | |
726 b[nb] = v; nb++; /* and push onto b */ | |
727 while (nb > 0) | |
728 { | |
729 v = b[nb-1]; nb--; /* pop vertex off b */ | |
730 c[v] = nc; /* assign it to component nc */ | |
731 for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/ | |
732 if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */ | |
733 { | |
734 w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */ | |
735 b[nb] = w; nb++; /* push w onto b */ | |
736 } | |
737 } | |
738 nc++; | |
739 } | |
740 | |
741 /* Cleanup and return | |
742 */ | |
743 free(a); | |
744 free(b); | |
745 *ret_c = c; | |
746 *ret_nc = nc; | |
747 return; | |
748 } |