comparison clustalomega/clustal-omega-0.2.0/src/squid/weight.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 /* weight.c
11 * SRE, Thu Mar 3 07:56:01 1994
12 *
13 * Calculate weights for sequences in an alignment.
14 * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
15 */
16
17 #include <ctype.h>
18 #include <string.h>
19 #include "squid.h"
20 #include "sre_random.h"
21
22 static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
23 static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
24 float *fwt, int node);
25 static float simple_distance(char *s1, char *s2);
26 static int simple_diffmx(char **aseqs,int num, float ***ret_dmx);
27
28 /* Function: GSCWeights()
29 *
30 * Purpose: Use Erik's tree-based algorithm to set weights for
31 * sequences in an alignment. upweight() and downweight()
32 * are derived from Graeme Mitchison's code.
33 *
34 * Args: aseq - array of (0..nseq-1) aligned sequences
35 * nseq - number of seqs in alignment
36 * alen - length of alignment
37 * wgt - allocated [0..nseq-1] array of weights to be returned
38 *
39 * Return: (void)
40 * wgt is filled in.
41 */
42 void
43 GSCWeights(char **aseq, int nseq, int alen, float *wgt)
44 {
45 float **dmx; /* distance (difference) matrix */
46 struct phylo_s *tree;
47 float *lwt, *rwt; /* weight on left, right of this tree node */
48 float *fwt; /* final weight assigned to this node */
49 int i;
50
51 /* Sanity check first
52 */
53 if (nseq == 1) { wgt[0] = 1.0; return; }
54
55 /* I use a simple fractional difference matrix derived by
56 * pairwise identity. Perhaps I should include a Poisson
57 * distance correction.
58 */
59 MakeDiffMx(aseq, nseq, &dmx);
60 if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed");
61
62 /* Allocations
63 */
64 lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
65 rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
66 fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
67
68 /* lwt and rwt are the total branch weight to the left and
69 * right of a node or sequence. They are 0..2N-2. 0..N-1 are
70 * the sequences; these have weight 0. N..2N-2 are the actual
71 * tree nodes.
72 */
73 for (i = 0; i < nseq; i++)
74 lwt[i] = rwt[i] = 0.0;
75 /* recursively calculate rwt, lwt, starting
76 at node nseq (the root) */
77 upweight(tree, nseq, lwt, rwt, nseq);
78
79 /* recursively distribute weight across the
80 tree */
81 fwt[nseq] = nseq;
82 downweight(tree, nseq, lwt, rwt, fwt, nseq);
83 /* collect the weights */
84 for (i = 0; i < nseq; i++)
85 wgt[i] = fwt[i];
86
87 FMX2Free(dmx);
88 FreePhylo(tree, nseq);
89 free(lwt); free(rwt); free(fwt);
90 }
91
92 static void
93 upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
94 {
95 int ld,rd;
96
97 ld = tree[node-nseq].left;
98 if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
99 rd = tree[node-nseq].right;
100 if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
101 lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
102 rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
103 }
104
105
106 static void
107 downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
108 {
109 int ld,rd;
110 float lnum, rnum;
111
112 ld = tree[node-nseq].left;
113 rd = tree[node-nseq].right;
114 if (lwt[node] + rwt[node] > 0.0)
115 {
116 fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
117 fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
118 }
119 else
120 {
121 lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
122 rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
123 fwt[ld] = fwt[node] * lnum / (lnum + rnum);
124 fwt[rd] = fwt[node] * rnum / (lnum + rnum);
125 }
126
127 if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
128 if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
129 }
130
131
132
133
134 /* Function: VoronoiWeights()
135 *
136 * Purpose: Calculate weights using the scheme of Sibbald &
137 * Argos (JMB 216:813-818 1990). The scheme is
138 * slightly modified because the original algorithm
139 * actually doesn't work on gapped alignments.
140 * The sequences are assumed to be protein.
141 *
142 * Args: aseq - array of (0..nseq-1) aligned sequences
143 * nseq - number of sequences
144 * alen - length of alignment
145 * wgt - allocated [0..nseq-1] array of weights to be returned
146 *
147 * Return: void
148 * wgt is filled in.
149 */
150 void
151 VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
152 {
153 float **dmx; /* distance (difference) matrix */
154 float *halfmin; /* 1/2 minimum distance to other seqs */
155 char **psym; /* symbols seen in each column */
156 int *nsym; /* # syms seen in each column */
157 int symseen[27]; /* flags for observed syms */
158 char *randseq; /* randomly generated sequence */
159 int acol; /* pos in aligned columns */
160 int idx; /* index in sequences */
161 int symidx; /* 0..25 index for symbol */
162 int i; /* generic counter */
163 float min; /* minimum distance */
164 float dist; /* distance between random and real */
165 float challenge, champion; /* for resolving ties */
166 int itscale; /* how many iterations per seq */
167 int iteration;
168 int best; /* index of nearest real sequence */
169
170 /* Sanity check first
171 */
172 if (nseq == 1) { wgt[0] = 1.0; return; }
173
174 itscale = 50;
175
176 /* Precalculate 1/2 minimum distance to other
177 * sequences for each sequence
178 */
179 if (! simple_diffmx(aseq, nseq, &dmx))
180 Die("simple_diffmx() failed");
181 halfmin = MallocOrDie (sizeof(float) * nseq);
182 for (idx = 0; idx < nseq; idx++)
183 {
184 for (min = 1.0, i = 0; i < nseq; i++)
185 {
186 if (i == idx) continue;
187 if (dmx[idx][i] < min) min = dmx[idx][i];
188 }
189 halfmin[idx] = min / 2.0;
190 }
191 Free2DArray((void **) dmx, nseq);
192
193 /* Set up the random sequence generating model.
194 */
195 psym = MallocOrDie (alen * sizeof(char *));
196 nsym = MallocOrDie (alen * sizeof(int));
197 for (acol = 0; acol < alen; acol++)
198 psym[acol] = MallocOrDie (27 * sizeof(char));
199
200 /* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
201 for (acol = 0; acol < alen; acol++)
202 {
203 memset(symseen, 0, sizeof(int) * 27);
204 for (idx = 0; idx < nseq; idx++)
205 if (! isgap(aseq[idx][acol]))
206 {
207 if (isupper((int) aseq[idx][acol]))
208 symidx = aseq[idx][acol] - 'A';
209 else
210 symidx = aseq[idx][acol] - 'a';
211 if (symidx >= 0 && symidx < 26)
212 symseen[symidx] = 1;
213 }
214 else
215 symseen[26] = 1; /* a gap */
216
217 for (nsym[acol] = 0, i = 0; i < 26; i++)
218 if (symseen[i])
219 {
220 psym[acol][nsym[acol]] = 'A'+i;
221 nsym[acol]++;
222 }
223 if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
224 }
225 /* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
226
227 /* Note: the original Sibbald&Argos algorithm calls for
228 * bounding the sampled space using a template-like random
229 * sequence generator. However, this leads to one minor
230 * and one major problem. The minor problem is that
231 * exceptional amino acids in a column can have a
232 * significant effect by altering the amount of sampled
233 * sequence space; the larger the data set, the worse
234 * this problem becomes. The major problem is that
235 * there is no reasonable way to deal with gaps.
236 * Gapped sequences simply inhabit a different dimensionality
237 * and it's pretty painful to imagine calculating Voronoi
238 * volumes when the N in your N-space is varying.
239 * Note that all the examples shown by Sibbald and Argos
240 * are *ungapped* examples.
241 *
242 * The best way I've found to circumvent this problem is
243 * just not to bound the sampled space; count gaps as
244 * symbols and generate completely random sequences.
245 */
246 #ifdef ALL_SEQUENCE_SPACE
247 for (acol = 0; acol < alen; acol++)
248 {
249 strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
250 nsym[acol] = 21;
251 }
252 #endif
253
254 /* Sibbald and Argos algorithm:
255 * 1) assign all seqs weight 0.
256 * 2) generate a "random" sequence
257 * 3) calculate distance to every other sequence
258 * (if we get a distance < 1/2 minimum distance
259 * to other real seqs, we can stop)
260 * 4) if unique closest sequence, increment its weight 1.
261 * if multiple closest seq, choose one randomly
262 * 5) repeat 2-4 for lots of iterations
263 * 6) normalize all weights to sum to nseq.
264 */
265 randseq = MallocOrDie ((alen+1) * sizeof(char));
266
267 best = 42.; /* solely to silence GCC uninit warnings. */
268 FSet(wgt, nseq, 0.0);
269 for (iteration = 0; iteration < itscale * nseq; iteration++)
270 {
271 for (acol = 0; acol < alen; acol++)
272 randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
273 randseq[acol] = '\0';
274
275 champion = sre_random();
276 for (min = 1.0, idx = 0; idx < nseq; idx++)
277 {
278 dist = simple_distance(aseq[idx], randseq);
279 if (dist < halfmin[idx])
280 {
281 best = idx;
282 break;
283 }
284 if (dist < min)
285 { champion = sre_random(); best = idx; min = dist; }
286 else if (dist == min)
287 {
288 challenge = sre_random();
289 if (challenge > champion)
290 { champion = challenge; best = idx; min = dist; }
291 }
292 }
293 wgt[best] += 1.0;
294 }
295
296 for (idx = 0; idx < nseq; idx++)
297 wgt[idx] = wgt[idx] / (float) itscale;
298
299 free(randseq);
300 free(nsym);
301 free(halfmin);
302 Free2DArray((void **) psym, alen);
303 }
304
305
306 /* Function: simple_distance()
307 *
308 * Purpose: For two identical-length null-terminated strings, return
309 * the fractional difference between them. (0..1)
310 * (Gaps don't count toward anything.)
311 */
312 static float
313 simple_distance(char *s1, char *s2)
314 {
315 int diff = 0;
316 int valid = 0;
317
318 for (; *s1 != '\0'; s1++, s2++)
319 {
320 if (isgap(*s1) || isgap(*s2)) continue;
321 if (*s1 != *s2) diff++;
322 valid++;
323 }
324 return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
325 }
326
327 /* Function: simple_diffmx()
328 *
329 * Purpose: Given a set of flushed, aligned sequences, construct
330 * an NxN fractional difference matrix using the
331 * simple_distance rule.
332 *
333 * Args: aseqs - flushed, aligned sequences
334 * num - number of aseqs
335 * ret_dmx - RETURN: difference matrix (caller must free)
336 *
337 * Return: 1 on success, 0 on failure.
338 */
339 static int
340 simple_diffmx(char **aseqs,
341 int num,
342 float ***ret_dmx)
343 {
344 float **dmx; /* RETURN: distance matrix */
345 int i,j; /* counters over sequences */
346
347 /* Allocate
348 */
349 if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
350 Die("malloc failed");
351 for (i = 0; i < num; i++)
352 if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
353 Die("malloc failed");
354
355 /* Calculate distances, symmetric matrix
356 */
357 for (i = 0; i < num; i++)
358 for (j = i; j < num; j++)
359 dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
360
361 /* Return
362 */
363 *ret_dmx = dmx;
364 return 1;
365 }
366
367
368
369 /* Function: BlosumWeights()
370 * Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
371 *
372 * Purpose: Assign weights to a set of aligned sequences
373 * using the BLOSUM rule:
374 * - do single linkage clustering at some pairwise identity
375 * - in each cluster, give each sequence 1/clustsize
376 * total weight.
377 *
378 * The clusters have no pairwise link >= maxid.
379 *
380 * O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
381 * in worst case, which is no links between sequences
382 * (e.g., values of maxid near 1.0).
383 *
384 * Args: aseqs - alignment
385 * nseq - number of seqs in alignment
386 * alen - # of columns in alignment
387 * maxid - fractional identity (e.g. 0.62 for BLOSUM62)
388 * wgt - [0..nseq-1] array of weights to be returned
389 */
390 void
391 BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
392 {
393 int *c, nc;
394 int *nmem; /* number of seqs in each cluster */
395 int i; /* loop counter */
396
397 SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
398
399 FSet(wgt, nseq, 1.0);
400 nmem = MallocOrDie(sizeof(int) * nc);
401
402 for (i = 0; i < nc; i++) nmem[i] = 0;
403 for (i = 0; i < nseq; i++) nmem[c[i]]++;
404 for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
405
406 free(nmem);
407 free(c);
408 return;
409 }
410
411
412 /* Function: PositionBasedWeights()
413 * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
414 *
415 * Purpose: Implementation of Henikoff and Henikoff position-based
416 * weights (JMB 243:574-578, 1994) [Henikoff94b].
417 *
418 * A significant advantage of this approach that Steve and Jorja
419 * don't point out is that it is O(N) in memory, unlike
420 * many other approaches like GSC weights or Voronoi.
421 *
422 * A potential disadvantage that they don't point out
423 * is that in the theoretical limit of infinite sequences
424 * in the alignment, weights go flat: eventually every
425 * column has at least one representative of each of 20 aa (or 4 nt)
426 * in it.
427 *
428 * They also don't give a rule for how to handle gaps.
429 * The rule used here seems the obvious and sensible one
430 * (ignore them). This means that longer sequences
431 * initially get more weight; hence a "double
432 * normalization" in which the weights are first divided
433 * by sequence length (to compensate for that effect),
434 * then normalized to sum to nseq.
435 *
436 * Limitations:
437 * Implemented in a way that's alphabet-independent:
438 * it uses the 26 upper case letters as "residues".
439 * Any alphabetic character in aseq is interpreted as
440 * a unique "residue" (case insensitively; lower case
441 * mapped to upper case). All other characters are
442 * interpreted as gaps.
443 *
444 * This way, we don't have to pass around any alphabet
445 * type info (DNA vs. RNA vs. protein) and don't have
446 * to deal with remapping IUPAC degenerate codes
447 * probabilistically. However, on the down side,
448 * a sequence with a lot of degenerate IUPAC characters
449 * will get an artifactually high PB weight.
450 *
451 * Args: aseq - sequence alignment to weight
452 * nseq - number of sequences in alignment
453 * alen - length of alignment
454 * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1)
455 *
456 * Returns: (void)
457 * wgt is allocated (0..nseq-1) by caller, and filled in here.
458 */
459 void
460 PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
461 {
462 int rescount[26]; /* count of A-Z residues in a column */
463 int nres; /* number of different residues in col */
464 int idx, pos; /* indices into aseq */
465 int x;
466 float norm;
467
468 FSet(wgt, nseq, 0.0);
469 for (pos = 0; pos < alen; pos++)
470 {
471 for (x = 0; x < 26; x++) rescount[x] = 0;
472 for (idx = 0; idx < nseq; idx++)
473 if (isalpha(aseq[idx][pos]))
474 rescount[toupper(aseq[idx][pos]) - 'A'] ++;
475
476 nres = 0;
477 for (x = 0; x < 26; x++)
478 if (rescount[x] > 0) nres++;
479
480 for (idx = 0; idx < nseq; idx++)
481 if (isalpha(aseq[idx][pos]))
482 wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
483 }
484
485 for (idx = 0; idx < nseq; idx++)
486 wgt[idx] /= (float) DealignedLength(aseq[idx]);
487 norm = (float) nseq / FSum(wgt, nseq);
488 FScale(wgt, nseq, norm);
489 return;
490 }
491
492
493
494
495 /* Function: FilterAlignment()
496 * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
497 *
498 * Purpose: Constructs a new alignment by removing near-identical
499 * sequences from a given alignment (where identity is
500 * calculated *based on the alignment*).
501 * Does not affect the given alignment.
502 * Keeps earlier sequence, discards later one.
503 *
504 * Usually called as an ad hoc sequence "weighting" mechanism.
505 *
506 * Limitations:
507 * Unparsed Stockholm markup is not propagated into the
508 * new alignment.
509 *
510 * Args: msa -- original alignment
511 * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id.
512 * ret_new -- RETURN: new MSA, usually w/ fewer sequences
513 *
514 * Return: (void)
515 * ret_new must be free'd by caller: MSAFree().
516 */
517 void
518 FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
519 {
520 int nnew; /* number of seqs in new alignment */
521 int *list;
522 int *useme;
523 float ident;
524 int i,j;
525 int remove;
526
527 /* find which seqs to keep (list) */
528 /* diff matrix; allow ragged ends */
529 list = MallocOrDie (sizeof(int) * msa->nseq);
530 useme = MallocOrDie (sizeof(int) * msa->nseq);
531 for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
532
533 nnew = 0;
534 for (i = 0; i < msa->nseq; i++)
535 {
536 remove = FALSE;
537 for (j = 0; j < nnew; j++)
538 {
539 ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
540 if (ident > cutoff)
541 {
542 remove = TRUE;
543 printf("removing %12s -- fractional identity %.2f to %s\n",
544 msa->sqname[i], ident,
545 msa->sqname[list[j]]);
546 break;
547 }
548 }
549 if (remove == FALSE) {
550 list[nnew++] = i;
551 useme[i] = TRUE;
552 }
553 }
554
555 MSASmallerAlignment(msa, useme, ret_new);
556 free(list);
557 free(useme);
558 return;
559 }
560
561
562 /* Function: SampleAlignment()
563 * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
564 *
565 * Purpose: Constructs a new, smaller alignment by sampling a given
566 * number of sequences at random. Does not change the
567 * alignment nor the order of the sequences.
568 *
569 * If you ask for a sample that is larger than nseqs,
570 * it silently returns the original alignment.
571 *
572 * Not really a weighting method, but this is as good
573 * a place as any to keep it, since it's similar in
574 * construction to FilterAlignment().
575 *
576 * Args: msa -- original alignment
577 * sample -- number of sequences in new alignment (0 < sample <= nseq)
578 * ret_new -- RETURN: new MSA
579 *
580 * Return: (void)
581 * ret_new must be free'd by caller: MSAFree().
582 */
583 void
584 SampleAlignment(MSA *msa, int sample, MSA **ret_new)
585 {
586 int *list; /* array for random selection w/o replace */
587 int *useme; /* array of flags 0..nseq-1: TRUE to use */
588 int i, idx;
589 int len;
590
591 /* Allocations
592 */
593 list = (int *) MallocOrDie (sizeof(int) * msa->nseq);
594 useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
595 for (i = 0; i < msa->nseq; i++)
596 {
597 list[i] = i;
598 useme[i] = FALSE;
599 }
600
601 /* Sanity check.
602 */
603 if (sample >= msa->nseq) sample = msa->nseq;
604
605 /* random selection w/o replacement */
606 for (len = msa->nseq, i = 0; i < sample; i++)
607 {
608 idx = CHOOSE(len);
609 printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
610 useme[list[idx]] = TRUE;
611 list[idx] = list[--len];
612 }
613
614 MSASmallerAlignment(msa, useme, ret_new);
615 free(list);
616 free(useme);
617 return;
618 }
619
620
621 /* Function: SingleLinkCluster()
622 * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
623 *
624 * Purpose: Perform simple single link clustering of seqs in a
625 * sequence alignment. A pairwise identity threshold
626 * defines whether two sequences are linked or not.
627 *
628 * Important: runs in O(N) memory, unlike standard
629 * graph decomposition algorithms that use O(N^2)
630 * adjacency matrices or adjacency lists. Requires
631 * O(N^2) time in worst case (which is when you have
632 * no links at all), O(NlogN) in "average"
633 * case, and O(N) in best case (when there is just
634 * one cluster in a completely connected graph.
635 *
636 * (Developed because hmmbuild could no longer deal
637 * with GP120, a 16,013 sequence alignment.)
638 *
639 * Limitations:
640 * CASE-SENSITIVE. Assumes aseq have been put into
641 * either all lower or all upper case; or at least,
642 * within a column, there's no mixed case.
643 *
644 * Algorithm:
645 * I don't know if this algorithm is published. I
646 * haven't seen it in graph theory books, but that might
647 * be because it's so obvious that nobody's bothered.
648 *
649 * In brief, we're going to do a breadth-first search
650 * of the graph, and we're going to calculate links
651 * on the fly rather than precalculating them into
652 * some sort of standard adjacency structure.
653 *
654 * While working, we keep two stacks of maximum length N:
655 * a : list of vertices that are still unconnected.
656 * b : list of vertices that we've connected to
657 * in our current breadth level, but we haven't
658 * yet tested for other connections to a.
659 * The current length (number of elements in) a and b are
660 * kept in na, nb.
661 *
662 * We store our results in an array of length N:
663 * c : assigns each vertex to a component. for example
664 * c[4] = 1 means that vertex 4 is in component 1.
665 * nc is the number of components. Components
666 * are numbered from 0 to nc-1. We return c and nc
667 * to our caller.
668 *
669 * The algorithm is:
670 *
671 * Initialisation:
672 * a <-- all the vertices
673 * na <-- N
674 * b <-- empty set
675 * nb <-- 0
676 * nc <-- 0
677 *
678 * Then:
679 * while (a is not empty)
680 * pop a vertex off a, push onto b
681 * while (b is not empty)
682 * pop vertex v off b
683 * assign c[v] = nc
684 * for each vertex w in a:
685 * compare v,w. If w is linked to v, remove w
686 * from a, push onto b.
687 * nc++
688 * q.e.d. :)
689 *
690 * Args: aseq - aligned sequences
691 * nseq - number of sequences in aseq
692 * alen - alignment length
693 * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked
694 * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
695 * ret_nc - RETURN: number of components
696 *
697 * Returns: void.
698 * ret_c is allocated here. Caller free's with free(*ret_c)
699 */
700 void
701 SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
702 int **ret_c, int *ret_nc)
703 {
704 int *a, na; /* stack of available vertices */
705 int *b, nb; /* stack of working vertices */
706 int *c; /* array of results */
707 int nc; /* total number of components */
708 int v,w; /* index of a working vertices */
709 int i; /* loop counter */
710
711 /* allocations and initializations
712 */
713 a = MallocOrDie (sizeof(int) * nseq);
714 b = MallocOrDie (sizeof(int) * nseq);
715 c = MallocOrDie (sizeof(int) * nseq);
716 for (i = 0; i < nseq; i++) a[i] = i;
717 na = nseq;
718 nb = 0;
719 nc = 0;
720
721 /* Main algorithm
722 */
723 while (na > 0)
724 {
725 v = a[na-1]; na--; /* pop a vertex off a, */
726 b[nb] = v; nb++; /* and push onto b */
727 while (nb > 0)
728 {
729 v = b[nb-1]; nb--; /* pop vertex off b */
730 c[v] = nc; /* assign it to component nc */
731 for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
732 if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
733 {
734 w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */
735 b[nb] = w; nb++; /* push w onto b */
736 }
737 }
738 nc++;
739 }
740
741 /* Cleanup and return
742 */
743 free(a);
744 free(b);
745 *ret_c = c;
746 *ret_nc = nc;
747 return;
748 }