Mercurial > repos > clustalomega > clustalomega

diff clustalomega/clustal-omega-0.2.0/src/squid/weight.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author: clustalomega
date: Tue, 07 Jun 2011 17:04:25 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/clustalomega/clustal-omega-0.2.0/src/squid/weight.c	Tue Jun 07 17:04:25 2011 -0400
@@ -0,0 +1,748 @@
+/*****************************************************************
+ * SQUID - a library of functions for biological sequence analysis
+ * Copyright (C) 1992-2002 Washington University School of Medicine
+ * 
+ *     This source code is freely distributed under the terms of the
+ *     GNU General Public License. See the files COPYRIGHT and LICENSE
+ *     for details.
+ *****************************************************************/
+
+/* weight.c
+ * SRE, Thu Mar  3 07:56:01 1994
+ * 
+ * Calculate weights for sequences in an alignment.
+ * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include "squid.h"
+#include "sre_random.h"
+
+static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
+static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, 
+		       float *fwt, int node);
+static float simple_distance(char *s1, char *s2);
+static int    simple_diffmx(char **aseqs,int num, float ***ret_dmx);
+
+/* Function: GSCWeights()
+ * 
+ * Purpose:  Use Erik's tree-based algorithm to set weights for
+ *           sequences in an alignment. upweight() and downweight()
+ *           are derived from Graeme Mitchison's code.
+ *           
+ * Args:     aseq        - array of (0..nseq-1) aligned sequences
+ *           nseq        - number of seqs in alignment  
+ *           alen        - length of alignment
+ *           wgt         - allocated [0..nseq-1] array of weights to be returned
+ *           
+ * Return:   (void)
+ *           wgt is filled in.
+ */
+void
+GSCWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+  float **dmx;                 /* distance (difference) matrix */
+  struct phylo_s *tree;
+  float  *lwt, *rwt;           /* weight on left, right of this tree node */
+  float  *fwt;                 /* final weight assigned to this node */
+  int      i;
+  
+  /* Sanity check first
+   */
+  if (nseq == 1) { wgt[0] = 1.0; return; }
+
+  /* I use a simple fractional difference matrix derived by
+   * pairwise identity. Perhaps I should include a Poisson
+   * distance correction.
+   */
+  MakeDiffMx(aseq, nseq, &dmx);
+  if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree))  Die("Cluster() failed");
+  
+  /* Allocations
+   */
+  lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+  rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+  fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+  
+  /* lwt and rwt are the total branch weight to the left and
+   * right of a node or sequence. They are 0..2N-2.  0..N-1 are 
+   * the sequences; these have weight 0. N..2N-2 are the actual
+   * tree nodes.
+   */
+  for (i = 0; i < nseq; i++)
+    lwt[i] = rwt[i] = 0.0;
+				/* recursively calculate rwt, lwt, starting
+				   at node nseq (the root) */
+  upweight(tree, nseq, lwt, rwt, nseq);
+
+				/* recursively distribute weight across the
+				   tree */
+  fwt[nseq] = nseq;
+  downweight(tree, nseq, lwt, rwt, fwt, nseq);
+				/* collect the weights */
+  for (i = 0; i < nseq; i++)
+    wgt[i]  = fwt[i];
+
+  FMX2Free(dmx);
+  FreePhylo(tree, nseq);
+  free(lwt); free(rwt); free(fwt);
+}
+
+static void 
+upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
+{
+  int ld,rd;
+
+  ld = tree[node-nseq].left;
+  if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
+  rd = tree[node-nseq].right;
+  if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
+  lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
+  rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
+}
+
+
+static void 
+downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
+{
+  int ld,rd;
+  float lnum, rnum;
+
+  ld = tree[node-nseq].left;
+  rd = tree[node-nseq].right;
+  if (lwt[node] + rwt[node] > 0.0)
+    {
+      fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
+      fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
+    }
+  else
+    {
+      lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
+      rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
+      fwt[ld] = fwt[node] * lnum / (lnum + rnum);
+      fwt[rd] = fwt[node] * rnum / (lnum + rnum);
+    }
+
+  if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
+  if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
+}
+
+
+
+
+/* Function: VoronoiWeights()
+ * 
+ * Purpose:  Calculate weights using the scheme of Sibbald &
+ *           Argos (JMB 216:813-818 1990). The scheme is
+ *           slightly modified because the original algorithm
+ *           actually doesn't work on gapped alignments.
+ *           The sequences are assumed to be protein.
+ *           
+ * Args:     aseq  - array of (0..nseq-1) aligned sequences
+ *           nseq  - number of sequences
+ *           alen  - length of alignment
+ *           wgt   - allocated [0..nseq-1] array of weights to be returned
+ *
+ * Return:   void
+ *           wgt is filled in.
+ */
+void
+VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+  float **dmx;                 /* distance (difference) matrix    */
+  float  *halfmin;             /* 1/2 minimum distance to other seqs */
+  char   **psym;                /* symbols seen in each column     */
+  int     *nsym;                /* # syms seen in each column      */
+  int      symseen[27];         /* flags for observed syms         */
+  char    *randseq;             /* randomly generated sequence     */
+  int      acol;		/* pos in aligned columns          */
+  int      idx;                 /* index in sequences              */
+  int      symidx;              /* 0..25 index for symbol          */
+  int      i;			/* generic counter                 */
+  float   min;			/* minimum distance                */
+  float   dist;		/* distance between random and real */
+  float   challenge, champion; /* for resolving ties              */
+  int     itscale;		/* how many iterations per seq     */
+  int     iteration;           
+  int     best;			/* index of nearest real sequence  */
+
+  /* Sanity check first
+   */
+  if (nseq == 1) { wgt[0] = 1.0; return; }
+
+  itscale = 50;
+
+  /* Precalculate 1/2 minimum distance to other
+   * sequences for each sequence
+   */
+  if (! simple_diffmx(aseq, nseq, &dmx)) 
+    Die("simple_diffmx() failed");
+  halfmin = MallocOrDie (sizeof(float) * nseq);
+  for (idx = 0; idx < nseq; idx++)
+    {
+      for (min = 1.0, i = 0; i < nseq; i++)
+	{
+	  if (i == idx) continue;
+	  if (dmx[idx][i] < min) min = dmx[idx][i];
+	}
+      halfmin[idx] = min / 2.0;
+    }
+  Free2DArray((void **) dmx, nseq);
+
+  /* Set up the random sequence generating model.
+   */
+  psym = MallocOrDie (alen * sizeof(char *));
+  nsym = MallocOrDie (alen * sizeof(int));
+  for (acol = 0; acol < alen; acol++)
+    psym[acol] = MallocOrDie (27 * sizeof(char));
+
+/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+  for (acol = 0; acol < alen; acol++)
+    {
+      memset(symseen, 0, sizeof(int) * 27);
+      for (idx = 0; idx < nseq; idx++)
+	if (! isgap(aseq[idx][acol]))
+	  {
+	    if (isupper((int) aseq[idx][acol])) 
+	      symidx = aseq[idx][acol] - 'A';
+	    else
+	      symidx = aseq[idx][acol] - 'a';
+	    if (symidx >= 0 && symidx < 26)
+	      symseen[symidx] = 1;
+	  }
+	else
+	  symseen[26] = 1;	/* a gap */
+
+      for (nsym[acol] = 0, i = 0; i < 26; i++)
+	if (symseen[i]) 
+	  {
+	    psym[acol][nsym[acol]] = 'A'+i;
+	    nsym[acol]++;
+	  }
+      if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
+    }
+/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+
+  /* Note: the original Sibbald&Argos algorithm calls for
+   * bounding the sampled space using a template-like random
+   * sequence generator. However, this leads to one minor
+   * and one major problem. The minor problem is that
+   * exceptional amino acids in a column can have a
+   * significant effect by altering the amount of sampled
+   * sequence space; the larger the data set, the worse
+   * this problem becomes. The major problem is that 
+   * there is no reasonable way to deal with gaps.
+   * Gapped sequences simply inhabit a different dimensionality
+   * and it's pretty painful to imagine calculating Voronoi
+   * volumes when the N in your N-space is varying.
+   * Note that all the examples shown by Sibbald and Argos
+   * are *ungapped* examples.
+   * 
+   * The best way I've found to circumvent this problem is
+   * just not to bound the sampled space; count gaps as
+   * symbols and generate completely random sequences.
+   */
+#ifdef ALL_SEQUENCE_SPACE
+  for (acol = 0; acol < alen; acol++)
+    {
+      strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
+      nsym[acol] = 21;
+    }
+#endif
+  
+  /* Sibbald and Argos algorithm:
+   *   1) assign all seqs weight 0.
+   *   2) generate a "random" sequence
+   *   3) calculate distance to every other sequence
+   *      (if we get a distance < 1/2 minimum distance
+   *       to other real seqs, we can stop)
+   *   4) if unique closest sequence, increment its weight 1.
+   *      if multiple closest seq, choose one randomly    
+   *   5) repeat 2-4 for lots of iterations
+   *   6) normalize all weights to sum to nseq.
+   */
+  randseq = MallocOrDie ((alen+1) * sizeof(char));
+
+  best = 42.;			/* solely to silence GCC uninit warnings. */
+  FSet(wgt, nseq, 0.0);
+  for (iteration = 0; iteration < itscale * nseq; iteration++)
+    {
+      for (acol = 0; acol < alen; acol++)
+	randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
+      randseq[acol] = '\0';
+
+      champion = sre_random();
+      for (min = 1.0, idx = 0; idx < nseq; idx++)
+	{
+	  dist = simple_distance(aseq[idx], randseq);
+	  if (dist < halfmin[idx]) 
+	    { 
+	      best = idx; 
+	      break;      
+	    } 
+	  if (dist < min)          
+	    { champion = sre_random(); best = idx; min = dist; } 
+	  else if (dist == min)    
+	    { 
+	      challenge = sre_random(); 
+	      if (challenge > champion)
+		{ champion = challenge; best = idx; min = dist; }
+	    }
+	}
+      wgt[best] += 1.0;
+    }
+
+  for (idx = 0; idx < nseq; idx++)
+    wgt[idx] = wgt[idx] / (float) itscale;
+
+  free(randseq);
+  free(nsym);
+  free(halfmin);
+  Free2DArray((void **) psym, alen);
+}
+
+
+/* Function: simple_distance()
+ * 
+ * Purpose:  For two identical-length null-terminated strings, return
+ *           the fractional difference between them. (0..1)
+ *           (Gaps don't count toward anything.)
+ */
+static float
+simple_distance(char *s1, char *s2)
+{
+  int diff  = 0;
+  int valid = 0;
+
+  for (; *s1 != '\0'; s1++, s2++)
+    {
+      if (isgap(*s1) || isgap(*s2)) continue;
+      if (*s1 != *s2) diff++;
+      valid++;
+    }
+  return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
+}
+    
+/* Function: simple_diffmx()
+ * 
+ * Purpose:  Given a set of flushed, aligned sequences, construct
+ *           an NxN fractional difference matrix using the
+ *           simple_distance rule.
+ *           
+ * Args:     aseqs        - flushed, aligned sequences
+ *           num          - number of aseqs
+ *           ret_dmx      - RETURN: difference matrix (caller must free)
+ *           
+ * Return:   1 on success, 0 on failure.
+ */
+static int
+simple_diffmx(char    **aseqs,
+	      int       num,
+	      float ***ret_dmx)
+{
+  float **dmx;                 /* RETURN: distance matrix           */
+  int      i,j;			/* counters over sequences           */
+
+  /* Allocate
+   */
+  if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
+    Die("malloc failed");
+  for (i = 0; i < num; i++)
+    if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
+      Die("malloc failed");
+
+  /* Calculate distances, symmetric matrix
+   */
+  for (i = 0; i < num; i++)
+    for (j = i; j < num; j++)
+      dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
+
+  /* Return
+   */
+  *ret_dmx = dmx;
+  return 1;
+}
+
+
+
+/* Function: BlosumWeights()
+ * Date:     SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
+ * 
+ * Purpose:  Assign weights to a set of aligned sequences
+ *           using the BLOSUM rule:
+ *             - do single linkage clustering at some pairwise identity
+ *             - in each cluster, give each sequence 1/clustsize
+ *               total weight.
+ *
+ *           The clusters have no pairwise link >= maxid. 
+ *           
+ *           O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
+ *           in worst case, which is no links between sequences
+ *           (e.g., values of maxid near 1.0).
+ * 
+ * Args:     aseqs - alignment
+ *           nseq  - number of seqs in alignment
+ *           alen  - # of columns in alignment
+ *           maxid - fractional identity (e.g. 0.62 for BLOSUM62)
+ *           wgt   - [0..nseq-1] array of weights to be returned
+ */               
+void
+BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
+{
+  int  *c, nc;
+  int  *nmem;        /* number of seqs in each cluster */
+  int   i;           /* loop counter */
+
+  SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
+
+  FSet(wgt, nseq, 1.0);
+  nmem = MallocOrDie(sizeof(int) * nc);
+
+  for (i = 0; i < nc;   i++) nmem[i] = 0;
+  for (i = 0; i < nseq; i++) nmem[c[i]]++;
+  for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
+
+  free(nmem);
+  free(c);
+  return;
+}
+
+
+/* Function: PositionBasedWeights()
+ * Date:     SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
+ *
+ * Purpose:  Implementation of Henikoff and Henikoff position-based
+ *           weights (JMB 243:574-578, 1994) [Henikoff94b].
+ *           
+ *           A significant advantage of this approach that Steve and Jorja
+ *           don't point out is that it is O(N) in memory, unlike
+ *           many other approaches like GSC weights or Voronoi.
+ *           
+ *           A potential disadvantage that they don't point out
+ *           is that in the theoretical limit of infinite sequences
+ *           in the alignment, weights go flat: eventually every
+ *           column has at least one representative of each of 20 aa (or 4 nt)
+ *           in it.
+ *           
+ *           They also don't give a rule for how to handle gaps.
+ *           The rule used here seems the obvious and sensible one
+ *           (ignore them). This means that longer sequences
+ *           initially get more weight; hence a "double
+ *           normalization" in which the weights are first divided
+ *           by sequence length (to compensate for that effect),
+ *           then normalized to sum to nseq.
+ *           
+ * Limitations:
+ *           Implemented in a way that's alphabet-independent:
+ *           it uses the 26 upper case letters as "residues".
+ *           Any alphabetic character in aseq is interpreted as
+ *           a unique "residue" (case insensitively; lower case
+ *           mapped to upper case). All other characters are
+ *           interpreted as gaps.
+ *           
+ *           This way, we don't have to pass around any alphabet
+ *           type info (DNA vs. RNA vs. protein) and don't have
+ *           to deal with remapping IUPAC degenerate codes
+ *           probabilistically. However, on the down side,
+ *           a sequence with a lot of degenerate IUPAC characters
+ *           will get an artifactually high PB weight.
+ *           
+ * Args:     aseq   - sequence alignment to weight
+ *           nseq   - number of sequences in alignment
+ *           alen   - length of alignment
+ *           wgt    - RETURN: weights filled in (pre-allocated 0..nseq-1)
+ *
+ * Returns:  (void)
+ *           wgt is allocated (0..nseq-1) by caller, and filled in here.
+ */
+void
+PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+  int  rescount[26];		/* count of A-Z residues in a column   */
+  int  nres;			/* number of different residues in col */
+  int  idx, pos;                /* indices into aseq                   */
+  int  x;
+  float norm;
+
+  FSet(wgt, nseq, 0.0);
+  for (pos = 0; pos < alen; pos++)
+    {
+      for (x = 0; x < 26; x++) rescount[x] = 0;
+      for (idx = 0; idx < nseq; idx++)
+	if (isalpha(aseq[idx][pos]))
+	  rescount[toupper(aseq[idx][pos]) - 'A'] ++;
+
+      nres = 0;
+      for (x = 0; x < 26; x++) 
+	if (rescount[x] > 0) nres++;
+
+      for (idx = 0; idx < nseq; idx++)
+	if (isalpha(aseq[idx][pos]))
+	  wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
+    }
+
+  for (idx = 0; idx < nseq; idx++)
+    wgt[idx] /= (float) DealignedLength(aseq[idx]);
+  norm = (float) nseq / FSum(wgt, nseq);
+  FScale(wgt, nseq, norm);
+  return;
+}
+
+
+
+
+/* Function: FilterAlignment()
+ * Date:     SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
+ * 
+ * Purpose:  Constructs a new alignment by removing near-identical 
+ *           sequences from a given alignment (where identity is 
+ *           calculated *based on the alignment*).
+ *           Does not affect the given alignment.
+ *           Keeps earlier sequence, discards later one. 
+ *           
+ *           Usually called as an ad hoc sequence "weighting" mechanism.
+ *           
+ * Limitations:
+ *           Unparsed Stockholm markup is not propagated into the
+ *           new alignment.
+ *           
+ * Args:     msa      -- original alignment
+ *           cutoff   -- fraction identity cutoff. 0.8 removes sequences > 80% id.
+ *           ret_new  -- RETURN: new MSA, usually w/ fewer sequences
+ *                         
+ * Return:   (void)
+ *           ret_new must be free'd by caller: MSAFree().
+ */
+void
+FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
+{
+  int     nnew;			/* number of seqs in new alignment */
+  int    *list;
+  int    *useme;
+  float   ident;
+  int     i,j;
+  int     remove;
+
+				/* find which seqs to keep (list) */
+				/* diff matrix; allow ragged ends */
+  list  = MallocOrDie (sizeof(int) * msa->nseq);
+  useme = MallocOrDie (sizeof(int) * msa->nseq);
+  for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
+
+  nnew = 0;
+  for (i = 0; i < msa->nseq; i++)
+    {
+      remove = FALSE;
+      for (j = 0; j < nnew; j++)
+	{
+	  ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
+	  if (ident > cutoff)
+	    { 
+	      remove = TRUE; 
+	      printf("removing %12s -- fractional identity %.2f to %s\n", 
+		     msa->sqname[i], ident,
+		     msa->sqname[list[j]]);
+	      break; 
+	    }
+	}
+      if (remove == FALSE) {
+	list[nnew++] = i;
+	useme[i]     = TRUE;
+      }
+    }
+
+  MSASmallerAlignment(msa, useme, ret_new);
+  free(list);
+  free(useme);
+  return;
+}
+
+
+/* Function: SampleAlignment()
+ * Date:     SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
+ * 
+ * Purpose:  Constructs a new, smaller alignment by sampling a given
+ *           number of sequences at random. Does not change the
+ *           alignment nor the order of the sequences.
+ *           
+ *           If you ask for a sample that is larger than nseqs,
+ *           it silently returns the original alignment.
+ *           
+ *           Not really a weighting method, but this is as good
+ *           a place as any to keep it, since it's similar in
+ *           construction to FilterAlignment().
+ *           
+ * Args:     msa      -- original alignment
+ *           sample   -- number of sequences in new alignment (0 < sample <= nseq)
+ *           ret_new  -- RETURN: new MSA 
+ *                         
+ * Return:   (void)
+ *           ret_new must be free'd by caller: MSAFree().
+ */
+void
+SampleAlignment(MSA *msa, int sample, MSA **ret_new)
+{
+  int    *list;                 /* array for random selection w/o replace */
+  int    *useme;                /* array of flags 0..nseq-1: TRUE to use */
+  int     i, idx;
+  int     len;
+
+  /* Allocations
+   */
+  list  = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+  useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+  for (i = 0; i < msa->nseq; i++)
+    {
+      list[i]  = i;
+      useme[i] = FALSE;
+    }
+
+  /* Sanity check.
+   */
+  if (sample >= msa->nseq) sample = msa->nseq;
+
+				/* random selection w/o replacement */
+  for (len = msa->nseq, i = 0; i < sample; i++)
+    {
+      idx = CHOOSE(len);
+      printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
+      useme[list[idx]] = TRUE;
+      list[idx] = list[--len];
+    }
+
+  MSASmallerAlignment(msa, useme, ret_new);
+  free(list);
+  free(useme);
+  return;
+}
+
+
+/* Function: SingleLinkCluster()
+ * Date:     SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
+ *
+ * Purpose:  Perform simple single link clustering of seqs in a
+ *           sequence alignment. A pairwise identity threshold
+ *           defines whether two sequences are linked or not.
+ *           
+ *           Important: runs in O(N) memory, unlike standard
+ *           graph decomposition algorithms that use O(N^2)
+ *           adjacency matrices or adjacency lists. Requires
+ *           O(N^2) time in worst case (which is when you have
+ *           no links at all), O(NlogN) in "average"
+ *           case, and O(N) in best case (when there is just
+ *           one cluster in a completely connected graph.
+ *           
+ *           (Developed because hmmbuild could no longer deal
+ *           with GP120, a 16,013 sequence alignment.)
+ *           
+ * Limitations: 
+ *           CASE-SENSITIVE. Assumes aseq have been put into
+ *           either all lower or all upper case; or at least,
+ *           within a column, there's no mixed case.
+ *           
+ * Algorithm: 
+ *           I don't know if this algorithm is published. I 
+ *           haven't seen it in graph theory books, but that might
+ *           be because it's so obvious that nobody's bothered.
+ *           
+ *           In brief, we're going to do a breadth-first search
+ *           of the graph, and we're going to calculate links
+ *           on the fly rather than precalculating them into
+ *           some sort of standard adjacency structure.
+ *           
+ *           While working, we keep two stacks of maximum length N:
+ *                a : list of vertices that are still unconnected.
+ *                b : list of vertices that we've connected to 
+ *                    in our current breadth level, but we haven't
+ *                    yet tested for other connections to a.
+ *           The current length (number of elements in) a and b are
+ *           kept in na, nb.
+ *                    
+ *           We store our results in an array of length N:
+ *                c : assigns each vertex to a component. for example
+ *                    c[4] = 1 means that vertex 4 is in component 1.
+ *                    nc is the number of components. Components
+ *                    are numbered from 0 to nc-1. We return c and nc
+ *                    to our caller.
+ *                    
+ *           The algorithm is:
+ *           
+ *           Initialisation: 
+ *                a  <-- all the vertices
+ *                na <-- N
+ *                b  <-- empty set
+ *                nb <-- 0
+ *                nc <-- 0
+ *                
+ *           Then:
+ *                while (a is not empty)
+ *                  pop a vertex off a, push onto b
+ *                  while (b is not empty)
+ *                    pop vertex v off b
+ *                    assign c[v] = nc
+ *                    for each vertex w in a:
+ *                       compare v,w. If w is linked to v, remove w
+ *                       from a, push onto b.
+ *                  nc++     
+ *           q.e.d. :)       
+ *
+ * Args:     aseq   - aligned sequences
+ *           nseq   - number of sequences in aseq
+ *           alen   - alignment length
+ *           maxid  - fractional identity threshold 0..1. if id >= maxid, seqs linked
+ *           ret_c  - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
+ *           ret_nc - RETURN: number of components
+ *
+ * Returns:  void.
+ *           ret_c is allocated here. Caller free's with free(*ret_c)
+ */
+void
+SingleLinkCluster(char **aseq, int nseq, int alen, float maxid, 
+		  int **ret_c, int *ret_nc)
+{
+  int *a, na;                   /* stack of available vertices */
+  int *b, nb;                   /* stack of working vertices   */
+  int *c;                       /* array of results            */
+  int  nc;			/* total number of components  */
+  int  v,w;			/* index of a working vertices */
+  int  i;			/* loop counter */
+
+  /* allocations and initializations
+   */
+  a = MallocOrDie (sizeof(int) * nseq);
+  b = MallocOrDie (sizeof(int) * nseq);
+  c = MallocOrDie (sizeof(int) * nseq);
+  for (i = 0; i < nseq; i++) a[i] = i;
+  na = nseq;
+  nb = 0;
+  nc = 0;
+
+  /* Main algorithm
+   */
+  while (na > 0)
+    {
+      v = a[na-1]; na--;	/* pop a vertex off a, */
+      b[nb] = v;   nb++;	/* and push onto b     */
+      while (nb > 0)
+	{
+	  v    = b[nb-1]; nb--;	/* pop vertex off b          */
+	  c[v] = nc;		/* assign it to component nc */
+	  for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
+	    if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
+	      {			
+		w = a[i]; a[i] = a[na-1]; na--;	/* delete w from a (note swap) */
+		b[nb] = w; nb++;                /* push w onto b */
+	      }
+	}
+      nc++;
+    }
+
+  /* Cleanup and return
+   */
+  free(a);
+  free(b);
+  *ret_c  = c;
+  *ret_nc = nc;
+  return;
+}
author	clustalomega
date	Tue, 07 Jun 2011 17:04:25 -0400
parents
children