Mercurial > repos > clustalomega > clustalomega

/*****************************************************************
 * SQUID - a library of functions for biological sequence analysis
 * Copyright (C) 1992-2002 Washington University School of Medicine
 *
 *     This source code is freely distributed under the terms of the
 *     GNU General Public License. See the files COPYRIGHT and LICENSE
 *     for details.
 *****************************************************************/

/* weight.c
 * SRE, Thu Mar  3 07:56:01 1994
 *
 * Calculate weights for sequences in an alignment.
 * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
 */

#include <ctype.h>
#include <string.h>
#include "squid.h"
#include "sre_random.h"

static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
		       float *fwt, int node);
static float simple_distance(char *s1, char *s2);
static int    simple_diffmx(char **aseqs,int num, float ***ret_dmx);

/* Function: GSCWeights()
 *
 * Purpose:  Use Erik's tree-based algorithm to set weights for
 *           sequences in an alignment. upweight() and downweight()
 *           are derived from Graeme Mitchison's code.
 *
 * Args:     aseq        - array of (0..nseq-1) aligned sequences
 *           nseq        - number of seqs in alignment
 *           alen        - length of alignment
 *           wgt         - allocated [0..nseq-1] array of weights to be returned
 *
 * Return:   (void)
 *           wgt is filled in.
 */
void
GSCWeights(char **aseq, int nseq, int alen, float *wgt)
{
  float **dmx;                 /* distance (difference) matrix */
  struct phylo_s *tree;
  float  *lwt, *rwt;           /* weight on left, right of this tree node */
  float  *fwt;                 /* final weight assigned to this node */
  int      i;

  /* Sanity check first
   */
  if (nseq == 1) { wgt[0] = 1.0; return; }

  /* I use a simple fractional difference matrix derived by
   * pairwise identity. Perhaps I should include a Poisson
   * distance correction.
   */
  MakeDiffMx(aseq, nseq, &dmx);
  if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree))  Die("Cluster() failed");

  /* Allocations
   */
  lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
  rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
  fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));

  /* lwt and rwt are the total branch weight to the left and
   * right of a node or sequence. They are 0..2N-2.  0..N-1 are
   * the sequences; these have weight 0. N..2N-2 are the actual
   * tree nodes.
   */
  for (i = 0; i < nseq; i++)
    lwt[i] = rwt[i] = 0.0;
				/* recursively calculate rwt, lwt, starting
				   at node nseq (the root) */
  upweight(tree, nseq, lwt, rwt, nseq);

				/* recursively distribute weight across the
				   tree */
  fwt[nseq] = nseq;
  downweight(tree, nseq, lwt, rwt, fwt, nseq);
				/* collect the weights */
  for (i = 0; i < nseq; i++)
    wgt[i]  = fwt[i];

  FMX2Free(dmx);
  FreePhylo(tree, nseq);
  free(lwt); free(rwt); free(fwt);
}

static void
upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
{
  int ld,rd;

  ld = tree[node-nseq].left;
  if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
  rd = tree[node-nseq].right;
  if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
  lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
  rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
}


static void
downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
{
  int ld,rd;
  float lnum, rnum;

  ld = tree[node-nseq].left;
  rd = tree[node-nseq].right;
  if (lwt[node] + rwt[node] > 0.0)
    {
      fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
      fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
    }
  else
    {
      lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
      rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
      fwt[ld] = fwt[node] * lnum / (lnum + rnum);
      fwt[rd] = fwt[node] * rnum / (lnum + rnum);
    }

  if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
  if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
}


/* Function: VoronoiWeights()
 *
 * Purpose:  Calculate weights using the scheme of Sibbald &
 *           Argos (JMB 216:813-818 1990). The scheme is
 *           slightly modified because the original algorithm
 *           actually doesn't work on gapped alignments.
 *           The sequences are assumed to be protein.
 *
 * Args:     aseq  - array of (0..nseq-1) aligned sequences
 *           nseq  - number of sequences
 *           alen  - length of alignment
 *           wgt   - allocated [0..nseq-1] array of weights to be returned
 *
 * Return:   void
 *           wgt is filled in.
 */
void
VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
{
  float **dmx;                 /* distance (difference) matrix    */
  float  *halfmin;             /* 1/2 minimum distance to other seqs */
  char   **psym;                /* symbols seen in each column     */
  int     *nsym;                /* # syms seen in each column      */
  int      symseen[27];         /* flags for observed syms         */
  char    *randseq;             /* randomly generated sequence     */
  int      acol;		/* pos in aligned columns          */
  int      idx;                 /* index in sequences              */
  int      symidx;              /* 0..25 index for symbol          */
  int      i;			/* generic counter                 */
  float   min;			/* minimum distance                */
  float   dist;		/* distance between random and real */
  float   challenge, champion; /* for resolving ties              */
  int     itscale;		/* how many iterations per seq     */
  int     iteration;
  int     best;			/* index of nearest real sequence  */

  /* Sanity check first
   */
  if (nseq == 1) { wgt[0] = 1.0; return; }

  itscale = 50;

  /* Precalculate 1/2 minimum distance to other
   * sequences for each sequence
   */
  if (! simple_diffmx(aseq, nseq, &dmx))
    Die("simple_diffmx() failed");
  halfmin = MallocOrDie (sizeof(float) * nseq);
  for (idx = 0; idx < nseq; idx++)
    {
      for (min = 1.0, i = 0; i < nseq; i++)
	{
	  if (i == idx) continue;
	  if (dmx[idx][i] < min) min = dmx[idx][i];
	}
      halfmin[idx] = min / 2.0;
    }
  Free2DArray((void **) dmx, nseq);

  /* Set up the random sequence generating model.
   */
  psym = MallocOrDie (alen * sizeof(char *));
  nsym = MallocOrDie (alen * sizeof(int));
  for (acol = 0; acol < alen; acol++)
    psym[acol] = MallocOrDie (27 * sizeof(char));

/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
  for (acol = 0; acol < alen; acol++)
    {
      memset(symseen, 0, sizeof(int) * 27);
      for (idx = 0; idx < nseq; idx++)
	if (! isgap(aseq[idx][acol]))
	  {
	    if (isupper((int) aseq[idx][acol]))
	      symidx = aseq[idx][acol] - 'A';
	    else
	      symidx = aseq[idx][acol] - 'a';
	    if (symidx >= 0 && symidx < 26)
	      symseen[symidx] = 1;
	  }
	else
	  symseen[26] = 1;	/* a gap */

      for (nsym[acol] = 0, i = 0; i < 26; i++)
	if (symseen[i])
	  {
	    psym[acol][nsym[acol]] = 'A'+i;
	    nsym[acol]++;
	  }
      if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
    }
/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */

  /* Note: the original Sibbald&Argos algorithm calls for
   * bounding the sampled space using a template-like random
   * sequence generator. However, this leads to one minor
   * and one major problem. The minor problem is that
   * exceptional amino acids in a column can have a
   * significant effect by altering the amount of sampled
   * sequence space; the larger the data set, the worse
   * this problem becomes. The major problem is that
   * there is no reasonable way to deal with gaps.
   * Gapped sequences simply inhabit a different dimensionality
   * and it's pretty painful to imagine calculating Voronoi
   * volumes when the N in your N-space is varying.
   * Note that all the examples shown by Sibbald and Argos
   * are *ungapped* examples.
   *
   * The best way I've found to circumvent this problem is
   * just not to bound the sampled space; count gaps as
   * symbols and generate completely random sequences.
   */
#ifdef ALL_SEQUENCE_SPACE
  for (acol = 0; acol < alen; acol++)
    {
      strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
      nsym[acol] = 21;
    }
#endif

  /* Sibbald and Argos algorithm:
   *   1) assign all seqs weight 0.
   *   2) generate a "random" sequence
   *   3) calculate distance to every other sequence
   *      (if we get a distance < 1/2 minimum distance
   *       to other real seqs, we can stop)
   *   4) if unique closest sequence, increment its weight 1.
   *      if multiple closest seq, choose one randomly
   *   5) repeat 2-4 for lots of iterations
   *   6) normalize all weights to sum to nseq.
   */
  randseq = MallocOrDie ((alen+1) * sizeof(char));

  best = 42.;			/* solely to silence GCC uninit warnings. */
  FSet(wgt, nseq, 0.0);
  for (iteration = 0; iteration < itscale * nseq; iteration++)
    {
      for (acol = 0; acol < alen; acol++)
	randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
      randseq[acol] = '\0';

      champion = sre_random();
      for (min = 1.0, idx = 0; idx < nseq; idx++)
	{
	  dist = simple_distance(aseq[idx], randseq);
	  if (dist < halfmin[idx])
	    {
	      best = idx;
	      break;
	    }
	  if (dist < min)
	    { champion = sre_random(); best = idx; min = dist; }
	  else if (dist == min)
	    {
	      challenge = sre_random();
	      if (challenge > champion)
		{ champion = challenge; best = idx; min = dist; }
	    }
	}
      wgt[best] += 1.0;
    }

  for (idx = 0; idx < nseq; idx++)
    wgt[idx] = wgt[idx] / (float) itscale;

  free(randseq);
  free(nsym);
  free(halfmin);
  Free2DArray((void **) psym, alen);
}


/* Function: simple_distance()
 *
 * Purpose:  For two identical-length null-terminated strings, return
 *           the fractional difference between them. (0..1)
 *           (Gaps don't count toward anything.)
 */
static float
simple_distance(char *s1, char *s2)
{
  int diff  = 0;
  int valid = 0;

  for (; *s1 != '\0'; s1++, s2++)
    {
      if (isgap(*s1) || isgap(*s2)) continue;
      if (*s1 != *s2) diff++;
      valid++;
    }
  return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
}

/* Function: simple_diffmx()
 *
 * Purpose:  Given a set of flushed, aligned sequences, construct
 *           an NxN fractional difference matrix using the
 *           simple_distance rule.
 *
 * Args:     aseqs        - flushed, aligned sequences
 *           num          - number of aseqs
 *           ret_dmx      - RETURN: difference matrix (caller must free)
 *
 * Return:   1 on success, 0 on failure.
 */
static int
simple_diffmx(char    **aseqs,
	      int       num,
	      float ***ret_dmx)
{
  float **dmx;                 /* RETURN: distance matrix           */
  int      i,j;			/* counters over sequences           */

  /* Allocate
   */
  if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
    Die("malloc failed");
  for (i = 0; i < num; i++)
    if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
      Die("malloc failed");

  /* Calculate distances, symmetric matrix
   */
  for (i = 0; i < num; i++)
    for (j = i; j < num; j++)
      dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);

  /* Return
   */
  *ret_dmx = dmx;
  return 1;
}


/* Function: BlosumWeights()
 * Date:     SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
 *
 * Purpose:  Assign weights to a set of aligned sequences
 *           using the BLOSUM rule:
 *             - do single linkage clustering at some pairwise identity
 *             - in each cluster, give each sequence 1/clustsize
 *               total weight.
 *
 *           The clusters have no pairwise link >= maxid.
 *
 *           O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
 *           in worst case, which is no links between sequences
 *           (e.g., values of maxid near 1.0).
 *
 * Args:     aseqs - alignment
 *           nseq  - number of seqs in alignment
 *           alen  - # of columns in alignment
 *           maxid - fractional identity (e.g. 0.62 for BLOSUM62)
 *           wgt   - [0..nseq-1] array of weights to be returned
 */
void
BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
{
  int  *c, nc;
  int  *nmem;        /* number of seqs in each cluster */
  int   i;           /* loop counter */

  SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);

  FSet(wgt, nseq, 1.0);
  nmem = MallocOrDie(sizeof(int) * nc);

  for (i = 0; i < nc;   i++) nmem[i] = 0;
  for (i = 0; i < nseq; i++) nmem[c[i]]++;
  for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];

  free(nmem);
  free(c);
  return;
}


/* Function: PositionBasedWeights()
 * Date:     SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
 *
 * Purpose:  Implementation of Henikoff and Henikoff position-based
 *           weights (JMB 243:574-578, 1994) [Henikoff94b].
 *
 *           A significant advantage of this approach that Steve and Jorja
 *           don't point out is that it is O(N) in memory, unlike
 *           many other approaches like GSC weights or Voronoi.
 *
 *           A potential disadvantage that they don't point out
 *           is that in the theoretical limit of infinite sequences
 *           in the alignment, weights go flat: eventually every
 *           column has at least one representative of each of 20 aa (or 4 nt)
 *           in it.
 *
 *           They also don't give a rule for how to handle gaps.
 *           The rule used here seems the obvious and sensible one
 *           (ignore them). This means that longer sequences
 *           initially get more weight; hence a "double
 *           normalization" in which the weights are first divided
 *           by sequence length (to compensate for that effect),
 *           then normalized to sum to nseq.
 *
 * Limitations:
 *           Implemented in a way that's alphabet-independent:
 *           it uses the 26 upper case letters as "residues".
 *           Any alphabetic character in aseq is interpreted as
 *           a unique "residue" (case insensitively; lower case
 *           mapped to upper case). All other characters are
 *           interpreted as gaps.
 *
 *           This way, we don't have to pass around any alphabet
 *           type info (DNA vs. RNA vs. protein) and don't have
 *           to deal with remapping IUPAC degenerate codes
 *           probabilistically. However, on the down side,
 *           a sequence with a lot of degenerate IUPAC characters
 *           will get an artifactually high PB weight.
 *
 * Args:     aseq   - sequence alignment to weight
 *           nseq   - number of sequences in alignment
 *           alen   - length of alignment
 *           wgt    - RETURN: weights filled in (pre-allocated 0..nseq-1)
 *
 * Returns:  (void)
 *           wgt is allocated (0..nseq-1) by caller, and filled in here.
 */
void
PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
{
  int  rescount[26];		/* count of A-Z residues in a column   */
  int  nres;			/* number of different residues in col */
  int  idx, pos;                /* indices into aseq                   */
  int  x;
  float norm;

  FSet(wgt, nseq, 0.0);
  for (pos = 0; pos < alen; pos++)
    {
      for (x = 0; x < 26; x++) rescount[x] = 0;
      for (idx = 0; idx < nseq; idx++)
	if (isalpha(aseq[idx][pos]))
	  rescount[toupper(aseq[idx][pos]) - 'A'] ++;

      nres = 0;
      for (x = 0; x < 26; x++)
	if (rescount[x] > 0) nres++;

      for (idx = 0; idx < nseq; idx++)
	if (isalpha(aseq[idx][pos]))
	  wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
    }

  for (idx = 0; idx < nseq; idx++)
    wgt[idx] /= (float) DealignedLength(aseq[idx]);
  norm = (float) nseq / FSum(wgt, nseq);
  FScale(wgt, nseq, norm);
  return;
}


/* Function: FilterAlignment()
 * Date:     SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
 *
 * Purpose:  Constructs a new alignment by removing near-identical
 *           sequences from a given alignment (where identity is
 *           calculated *based on the alignment*).
 *           Does not affect the given alignment.
 *           Keeps earlier sequence, discards later one.
 *
 *           Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *           Unparsed Stockholm markup is not propagated into the
 *           new alignment.
 *
 * Args:     msa      -- original alignment
 *           cutoff   -- fraction identity cutoff. 0.8 removes sequences > 80% id.
 *           ret_new  -- RETURN: new MSA, usually w/ fewer sequences
 *
 * Return:   (void)
 *           ret_new must be free'd by caller: MSAFree().
 */
void
FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
{
  int     nnew;			/* number of seqs in new alignment */
  int    *list;
  int    *useme;
  float   ident;
  int     i,j;
  int     remove;

				/* find which seqs to keep (list) */
				/* diff matrix; allow ragged ends */
  list  = MallocOrDie (sizeof(int) * msa->nseq);
  useme = MallocOrDie (sizeof(int) * msa->nseq);
  for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;

  nnew = 0;
  for (i = 0; i < msa->nseq; i++)
    {
      remove = FALSE;
      for (j = 0; j < nnew; j++)
	{
	  ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
	  if (ident > cutoff)
	    {
	      remove = TRUE;
	      printf("removing %12s -- fractional identity %.2f to %s\n",
		     msa->sqname[i], ident,
		     msa->sqname[list[j]]);
	      break;
	    }
	}
      if (remove == FALSE) {
	list[nnew++] = i;
	useme[i]     = TRUE;
      }
    }

  MSASmallerAlignment(msa, useme, ret_new);
  free(list);
  free(useme);
  return;
}


/* Function: SampleAlignment()
 * Date:     SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
 *
 * Purpose:  Constructs a new, smaller alignment by sampling a given
 *           number of sequences at random. Does not change the
 *           alignment nor the order of the sequences.
 *
 *           If you ask for a sample that is larger than nseqs,
 *           it silently returns the original alignment.
 *
 *           Not really a weighting method, but this is as good
 *           a place as any to keep it, since it's similar in
 *           construction to FilterAlignment().
 *
 * Args:     msa      -- original alignment
 *           sample   -- number of sequences in new alignment (0 < sample <= nseq)
 *           ret_new  -- RETURN: new MSA
 *
 * Return:   (void)
 *           ret_new must be free'd by caller: MSAFree().
 */
void
SampleAlignment(MSA *msa, int sample, MSA **ret_new)
{
  int    *list;                 /* array for random selection w/o replace */
  int    *useme;                /* array of flags 0..nseq-1: TRUE to use */
  int     i, idx;
  int     len;

  /* Allocations
   */
  list  = (int *) MallocOrDie (sizeof(int) * msa->nseq);
  useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
  for (i = 0; i < msa->nseq; i++)
    {
      list[i]  = i;
      useme[i] = FALSE;
    }

  /* Sanity check.
   */
  if (sample >= msa->nseq) sample = msa->nseq;

				/* random selection w/o replacement */
  for (len = msa->nseq, i = 0; i < sample; i++)
    {
      idx = CHOOSE(len);
      printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
      useme[list[idx]] = TRUE;
      list[idx] = list[--len];
    }

  MSASmallerAlignment(msa, useme, ret_new);
  free(list);
  free(useme);
  return;
}


/* Function: SingleLinkCluster()
 * Date:     SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
 *
 * Purpose:  Perform simple single link clustering of seqs in a
 *           sequence alignment. A pairwise identity threshold
 *           defines whether two sequences are linked or not.
 *
 *           Important: runs in O(N) memory, unlike standard
 *           graph decomposition algorithms that use O(N^2)
 *           adjacency matrices or adjacency lists. Requires
 *           O(N^2) time in worst case (which is when you have
 *           no links at all), O(NlogN) in "average"
 *           case, and O(N) in best case (when there is just
 *           one cluster in a completely connected graph.
 *
 *           (Developed because hmmbuild could no longer deal
 *           with GP120, a 16,013 sequence alignment.)
 *
 * Limitations:
 *           CASE-SENSITIVE. Assumes aseq have been put into
 *           either all lower or all upper case; or at least,
 *           within a column, there's no mixed case.
 *
 * Algorithm:
 *           I don't know if this algorithm is published. I
 *           haven't seen it in graph theory books, but that might
 *           be because it's so obvious that nobody's bothered.
 *
 *           In brief, we're going to do a breadth-first search
 *           of the graph, and we're going to calculate links
 *           on the fly rather than precalculating them into
 *           some sort of standard adjacency structure.
 *
 *           While working, we keep two stacks of maximum length N:
 *                a : list of vertices that are still unconnected.
 *                b : list of vertices that we've connected to
 *                    in our current breadth level, but we haven't
 *                    yet tested for other connections to a.
 *           The current length (number of elements in) a and b are
 *           kept in na, nb.
 *
 *           We store our results in an array of length N:
 *                c : assigns each vertex to a component. for example
 *                    c[4] = 1 means that vertex 4 is in component 1.
 *                    nc is the number of components. Components
 *                    are numbered from 0 to nc-1. We return c and nc
 *                    to our caller.
 *
 *           The algorithm is:
 *
 *           Initialisation:
 *                a  <-- all the vertices
 *                na <-- N
 *                b  <-- empty set
 *                nb <-- 0
 *                nc <-- 0
 *
 *           Then:
 *                while (a is not empty)
 *                  pop a vertex off a, push onto b
 *                  while (b is not empty)
 *                    pop vertex v off b
 *                    assign c[v] = nc
 *                    for each vertex w in a:
 *                       compare v,w. If w is linked to v, remove w
 *                       from a, push onto b.
 *                  nc++
 *           q.e.d. :)
 *
 * Args:     aseq   - aligned sequences
 *           nseq   - number of sequences in aseq
 *           alen   - alignment length
 *           maxid  - fractional identity threshold 0..1. if id >= maxid, seqs linked
 *           ret_c  - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
 *           ret_nc - RETURN: number of components
 *
 * Returns:  void.
 *           ret_c is allocated here. Caller free's with free(*ret_c)
 */
void
SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
		  int **ret_c, int *ret_nc)
{
  int *a, na;                   /* stack of available vertices */
  int *b, nb;                   /* stack of working vertices   */
  int *c;                       /* array of results            */
  int  nc;			/* total number of components  */
  int  v,w;			/* index of a working vertices */
  int  i;			/* loop counter */

  /* allocations and initializations
   */
  a = MallocOrDie (sizeof(int) * nseq);
  b = MallocOrDie (sizeof(int) * nseq);
  c = MallocOrDie (sizeof(int) * nseq);
  for (i = 0; i < nseq; i++) a[i] = i;
  na = nseq;
  nb = 0;
  nc = 0;

  /* Main algorithm
   */
  while (na > 0)
    {
      v = a[na-1]; na--;	/* pop a vertex off a, */
      b[nb] = v;   nb++;	/* and push onto b     */
      while (nb > 0)
	{
	  v    = b[nb-1]; nb--;	/* pop vertex off b          */
	  c[v] = nc;		/* assign it to component nc */
	  for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
	    if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
	      {
		w = a[i]; a[i] = a[na-1]; na--;	/* delete w from a (note swap) */
		b[nb] = w; nb++;                /* push w onto b */
	      }
	}
      nc++;
    }

  /* Cleanup and return
   */
  free(a);
  free(b);
  *ret_c  = c;
  *ret_nc = nc;
  return;
}
author	clustalomega
date	Tue, 07 Jun 2011 17:04:25 -0400
parents
children