clustalomega: clustalomega/clustal-omega-0.2.0/src/squid/weight.c comparison

comparison clustalomega/clustal-omega-0.2.0/src/squid/weight.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository

author	clustalomega
date	Tue, 07 Jun 2011 17:04:25 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:ff1768533a07
+/*****************************************************************
+* SQUID - a library of functions for biological sequence analysis
+* Copyright (C) 1992-2002 Washington University School of Medicine
+*
+*     This source code is freely distributed under the terms of the
+*     GNU General Public License. See the files COPYRIGHT and LICENSE
+*     for details.
+*****************************************************************/
+/* weight.c
+* SRE, Thu Mar  3 07:56:01 1994
+*
+* Calculate weights for sequences in an alignment.
+* RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
+*/
+#include <ctype.h>
+#include <string.h>
+#include "squid.h"
+#include "sre_random.h"
+static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
+static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
+		       float *fwt, int node);
+static float simple_distance(char *s1, char *s2);
+static int    simple_diffmx(char **aseqs,int num, float ***ret_dmx);
+/* Function: GSCWeights()
+*
+* Purpose:  Use Erik's tree-based algorithm to set weights for
+*           sequences in an alignment. upweight() and downweight()
+*           are derived from Graeme Mitchison's code.
+*
+* Args:     aseq        - array of (0..nseq-1) aligned sequences
+*           nseq        - number of seqs in alignment
+*           alen        - length of alignment
+*           wgt         - allocated [0..nseq-1] array of weights to be returned
+*
+* Return:   (void)
+*           wgt is filled in.
+*/
+void
+GSCWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+float **dmx;                 /* distance (difference) matrix */
+struct phylo_s *tree;
+float  *lwt, *rwt;           /* weight on left, right of this tree node */
+float  *fwt;                 /* final weight assigned to this node */
+int      i;
+/* Sanity check first
+*/
+if (nseq == 1) { wgt[0] = 1.0; return; }
+/* I use a simple fractional difference matrix derived by
+* pairwise identity. Perhaps I should include a Poisson
+* distance correction.
+*/
+MakeDiffMx(aseq, nseq, &dmx);
+if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree))  Die("Cluster() failed");
+/* Allocations
+*/
+lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+/* lwt and rwt are the total branch weight to the left and
+* right of a node or sequence. They are 0..2N-2.  0..N-1 are
+* the sequences; these have weight 0. N..2N-2 are the actual
+* tree nodes.
+*/
+for (i = 0; i < nseq; i++)
+lwt[i] = rwt[i] = 0.0;
+				/* recursively calculate rwt, lwt, starting
+				   at node nseq (the root) */
+upweight(tree, nseq, lwt, rwt, nseq);
+				/* recursively distribute weight across the
+				   tree */
+fwt[nseq] = nseq;
+downweight(tree, nseq, lwt, rwt, fwt, nseq);
+				/* collect the weights */
+for (i = 0; i < nseq; i++)
+wgt[i]  = fwt[i];
+FMX2Free(dmx);
+FreePhylo(tree, nseq);
+free(lwt); free(rwt); free(fwt);
+}
+static void
+upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
+{
+int ld,rd;
+ld = tree[node-nseq].left;
+if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
+rd = tree[node-nseq].right;
+if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
+lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
+rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
+}
+static void
+downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
+{
+int ld,rd;
+float lnum, rnum;
+ld = tree[node-nseq].left;
+rd = tree[node-nseq].right;
+if (lwt[node] + rwt[node] > 0.0)
+{
+fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
+fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
+}
+else
+{
+lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
+rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
+fwt[ld] = fwt[node] * lnum / (lnum + rnum);
+fwt[rd] = fwt[node] * rnum / (lnum + rnum);
+}
+if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
+if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
+}
+/* Function: VoronoiWeights()
+*
+* Purpose:  Calculate weights using the scheme of Sibbald &
+*           Argos (JMB 216:813-818 1990). The scheme is
+*           slightly modified because the original algorithm
+*           actually doesn't work on gapped alignments.
+*           The sequences are assumed to be protein.
+*
+* Args:     aseq  - array of (0..nseq-1) aligned sequences
+*           nseq  - number of sequences
+*           alen  - length of alignment
+*           wgt   - allocated [0..nseq-1] array of weights to be returned
+*
+* Return:   void
+*           wgt is filled in.
+*/
+void
+VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+float **dmx;                 /* distance (difference) matrix    */
+float  *halfmin;             /* 1/2 minimum distance to other seqs */
+char   **psym;                /* symbols seen in each column     */
+int     *nsym;                /* # syms seen in each column      */
+int      symseen[27];         /* flags for observed syms         */
+char    *randseq;             /* randomly generated sequence     */
+int      acol;		/* pos in aligned columns          */
+int      idx;                 /* index in sequences              */
+int      symidx;              /* 0..25 index for symbol          */
+int      i;			/* generic counter                 */
+float   min;			/* minimum distance                */
+float   dist;		/* distance between random and real */
+float   challenge, champion; /* for resolving ties              */
+int     itscale;		/* how many iterations per seq     */
+int     iteration;
+int     best;			/* index of nearest real sequence  */
+/* Sanity check first
+*/
+if (nseq == 1) { wgt[0] = 1.0; return; }
+itscale = 50;
+/* Precalculate 1/2 minimum distance to other
+* sequences for each sequence
+*/
+if (! simple_diffmx(aseq, nseq, &dmx))
+Die("simple_diffmx() failed");
+halfmin = MallocOrDie (sizeof(float) * nseq);
+for (idx = 0; idx < nseq; idx++)
+{
+for (min = 1.0, i = 0; i < nseq; i++)
+	{
+	  if (i == idx) continue;
+	  if (dmx[idx][i] < min) min = dmx[idx][i];
+	}
+halfmin[idx] = min / 2.0;
+}
+Free2DArray((void **) dmx, nseq);
+/* Set up the random sequence generating model.
+*/
+psym = MallocOrDie (alen * sizeof(char *));
+nsym = MallocOrDie (alen * sizeof(int));
+for (acol = 0; acol < alen; acol++)
+psym[acol] = MallocOrDie (27 * sizeof(char));
+/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+for (acol = 0; acol < alen; acol++)
+{
+memset(symseen, 0, sizeof(int) * 27);
+for (idx = 0; idx < nseq; idx++)
+	if (! isgap(aseq[idx][acol]))
+	  {
+	    if (isupper((int) aseq[idx][acol]))
+	      symidx = aseq[idx][acol] - 'A';
+	    else
+	      symidx = aseq[idx][acol] - 'a';
+	    if (symidx >= 0 && symidx < 26)
+	      symseen[symidx] = 1;
+	  }
+	else
+	  symseen[26] = 1;	/* a gap */
+for (nsym[acol] = 0, i = 0; i < 26; i++)
+	if (symseen[i])
+	  {
+	    psym[acol][nsym[acol]] = 'A'+i;
+	    nsym[acol]++;
+	  }
+if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
+}
+/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+/* Note: the original Sibbald&Argos algorithm calls for
+* bounding the sampled space using a template-like random
+* sequence generator. However, this leads to one minor
+* and one major problem. The minor problem is that
+* exceptional amino acids in a column can have a
+* significant effect by altering the amount of sampled
+* sequence space; the larger the data set, the worse
+* this problem becomes. The major problem is that
+* there is no reasonable way to deal with gaps.
+* Gapped sequences simply inhabit a different dimensionality
+* and it's pretty painful to imagine calculating Voronoi
+* volumes when the N in your N-space is varying.
+* Note that all the examples shown by Sibbald and Argos
+* are *ungapped* examples.
+*
+* The best way I've found to circumvent this problem is
+* just not to bound the sampled space; count gaps as
+* symbols and generate completely random sequences.
+*/
+#ifdef ALL_SEQUENCE_SPACE
+for (acol = 0; acol < alen; acol++)
+{
+strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
+nsym[acol] = 21;
+}
+#endif
+/* Sibbald and Argos algorithm:
+*   1) assign all seqs weight 0.
+*   2) generate a "random" sequence
+*   3) calculate distance to every other sequence
+*      (if we get a distance < 1/2 minimum distance
+*       to other real seqs, we can stop)
+*   4) if unique closest sequence, increment its weight 1.
+*      if multiple closest seq, choose one randomly
+*   5) repeat 2-4 for lots of iterations
+*   6) normalize all weights to sum to nseq.
+*/
+randseq = MallocOrDie ((alen+1) * sizeof(char));
+best = 42.;			/* solely to silence GCC uninit warnings. */
+FSet(wgt, nseq, 0.0);
+for (iteration = 0; iteration < itscale * nseq; iteration++)
+{
+for (acol = 0; acol < alen; acol++)
+	randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
+randseq[acol] = '\0';
+champion = sre_random();
+for (min = 1.0, idx = 0; idx < nseq; idx++)
+	{
+	  dist = simple_distance(aseq[idx], randseq);
+	  if (dist < halfmin[idx])
+	    {
+	      best = idx;
+	      break;
+	    }
+	  if (dist < min)
+	    { champion = sre_random(); best = idx; min = dist; }
+	  else if (dist == min)
+	    {
+	      challenge = sre_random();
+	      if (challenge > champion)
+		{ champion = challenge; best = idx; min = dist; }
+	    }
+	}
+wgt[best] += 1.0;
+}
+for (idx = 0; idx < nseq; idx++)
+wgt[idx] = wgt[idx] / (float) itscale;
+free(randseq);
+free(nsym);
+free(halfmin);
+Free2DArray((void **) psym, alen);
+}
+/* Function: simple_distance()
+*
+* Purpose:  For two identical-length null-terminated strings, return
+*           the fractional difference between them. (0..1)
+*           (Gaps don't count toward anything.)
+*/
+static float
+simple_distance(char *s1, char *s2)
+{
+int diff  = 0;
+int valid = 0;
+for (; *s1 != '\0'; s1++, s2++)
+{
+if (isgap(*s1) || isgap(*s2)) continue;
+if (*s1 != *s2) diff++;
+valid++;
+}
+return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
+}
+/* Function: simple_diffmx()
+*
+* Purpose:  Given a set of flushed, aligned sequences, construct
+*           an NxN fractional difference matrix using the
+*           simple_distance rule.
+*
+* Args:     aseqs        - flushed, aligned sequences
+*           num          - number of aseqs
+*           ret_dmx      - RETURN: difference matrix (caller must free)
+*
+* Return:   1 on success, 0 on failure.
+*/
+static int
+simple_diffmx(char    **aseqs,
+	      int       num,
+	      float ***ret_dmx)
+{
+float **dmx;                 /* RETURN: distance matrix           */
+int      i,j;			/* counters over sequences           */
+/* Allocate
+*/
+if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
+Die("malloc failed");
+for (i = 0; i < num; i++)
+if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
+Die("malloc failed");
+/* Calculate distances, symmetric matrix
+*/
+for (i = 0; i < num; i++)
+for (j = i; j < num; j++)
+dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
+/* Return
+*/
+*ret_dmx = dmx;
+return 1;
+}
+/* Function: BlosumWeights()
+* Date:     SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
+*
+* Purpose:  Assign weights to a set of aligned sequences
+*           using the BLOSUM rule:
+*             - do single linkage clustering at some pairwise identity
+*             - in each cluster, give each sequence 1/clustsize
+*               total weight.
+*
+*           The clusters have no pairwise link >= maxid.
+*
+*           O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
+*           in worst case, which is no links between sequences
+*           (e.g., values of maxid near 1.0).
+*
+* Args:     aseqs - alignment
+*           nseq  - number of seqs in alignment
+*           alen  - # of columns in alignment
+*           maxid - fractional identity (e.g. 0.62 for BLOSUM62)
+*           wgt   - [0..nseq-1] array of weights to be returned
+*/
+void
+BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
+{
+int  *c, nc;
+int  *nmem;        /* number of seqs in each cluster */
+int   i;           /* loop counter */
+SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
+FSet(wgt, nseq, 1.0);
+nmem = MallocOrDie(sizeof(int) * nc);
+for (i = 0; i < nc;   i++) nmem[i] = 0;
+for (i = 0; i < nseq; i++) nmem[c[i]]++;
+for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
+free(nmem);
+free(c);
+return;
+}
+/* Function: PositionBasedWeights()
+* Date:     SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
+*
+* Purpose:  Implementation of Henikoff and Henikoff position-based
+*           weights (JMB 243:574-578, 1994) [Henikoff94b].
+*
+*           A significant advantage of this approach that Steve and Jorja
+*           don't point out is that it is O(N) in memory, unlike
+*           many other approaches like GSC weights or Voronoi.
+*
+*           A potential disadvantage that they don't point out
+*           is that in the theoretical limit of infinite sequences
+*           in the alignment, weights go flat: eventually every
+*           column has at least one representative of each of 20 aa (or 4 nt)
+*           in it.
+*
+*           They also don't give a rule for how to handle gaps.
+*           The rule used here seems the obvious and sensible one
+*           (ignore them). This means that longer sequences
+*           initially get more weight; hence a "double
+*           normalization" in which the weights are first divided
+*           by sequence length (to compensate for that effect),
+*           then normalized to sum to nseq.
+*
+* Limitations:
+*           Implemented in a way that's alphabet-independent:
+*           it uses the 26 upper case letters as "residues".
+*           Any alphabetic character in aseq is interpreted as
+*           a unique "residue" (case insensitively; lower case
+*           mapped to upper case). All other characters are
+*           interpreted as gaps.
+*
+*           This way, we don't have to pass around any alphabet
+*           type info (DNA vs. RNA vs. protein) and don't have
+*           to deal with remapping IUPAC degenerate codes
+*           probabilistically. However, on the down side,
+*           a sequence with a lot of degenerate IUPAC characters
+*           will get an artifactually high PB weight.
+*
+* Args:     aseq   - sequence alignment to weight
+*           nseq   - number of sequences in alignment
+*           alen   - length of alignment
+*           wgt    - RETURN: weights filled in (pre-allocated 0..nseq-1)
+*
+* Returns:  (void)
+*           wgt is allocated (0..nseq-1) by caller, and filled in here.
+*/
+void
+PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+int  rescount[26];		/* count of A-Z residues in a column   */
+int  nres;			/* number of different residues in col */
+int  idx, pos;                /* indices into aseq                   */
+int  x;
+float norm;
+FSet(wgt, nseq, 0.0);
+for (pos = 0; pos < alen; pos++)
+{
+for (x = 0; x < 26; x++) rescount[x] = 0;
+for (idx = 0; idx < nseq; idx++)
+	if (isalpha(aseq[idx][pos]))
+	  rescount[toupper(aseq[idx][pos]) - 'A'] ++;
+nres = 0;
+for (x = 0; x < 26; x++)
+	if (rescount[x] > 0) nres++;
+for (idx = 0; idx < nseq; idx++)
+	if (isalpha(aseq[idx][pos]))
+	  wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
+}
+for (idx = 0; idx < nseq; idx++)
+wgt[idx] /= (float) DealignedLength(aseq[idx]);
+norm = (float) nseq / FSum(wgt, nseq);
+FScale(wgt, nseq, norm);
+return;
+}
+/* Function: FilterAlignment()
+* Date:     SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
+*
+* Purpose:  Constructs a new alignment by removing near-identical
+*           sequences from a given alignment (where identity is
+*           calculated *based on the alignment*).
+*           Does not affect the given alignment.
+*           Keeps earlier sequence, discards later one.
+*
+*           Usually called as an ad hoc sequence "weighting" mechanism.
+*
+* Limitations:
+*           Unparsed Stockholm markup is not propagated into the
+*           new alignment.
+*
+* Args:     msa      -- original alignment
+*           cutoff   -- fraction identity cutoff. 0.8 removes sequences > 80% id.
+*           ret_new  -- RETURN: new MSA, usually w/ fewer sequences
+*
+* Return:   (void)
+*           ret_new must be free'd by caller: MSAFree().
+*/
+void
+FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
+{
+int     nnew;			/* number of seqs in new alignment */
+int    *list;
+int    *useme;
+float   ident;
+int     i,j;
+int     remove;
+				/* find which seqs to keep (list) */
+				/* diff matrix; allow ragged ends */
+list  = MallocOrDie (sizeof(int) * msa->nseq);
+useme = MallocOrDie (sizeof(int) * msa->nseq);
+for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
+nnew = 0;
+for (i = 0; i < msa->nseq; i++)
+{
+remove = FALSE;
+for (j = 0; j < nnew; j++)
+	{
+	  ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
+	  if (ident > cutoff)
+	    {
+	      remove = TRUE;
+	      printf("removing %12s -- fractional identity %.2f to %s\n",
+		     msa->sqname[i], ident,
+		     msa->sqname[list[j]]);
+	      break;
+	    }
+	}
+if (remove == FALSE) {
+	list[nnew++] = i;
+	useme[i]     = TRUE;
+}
+}
+MSASmallerAlignment(msa, useme, ret_new);
+free(list);
+free(useme);
+return;
+}
+/* Function: SampleAlignment()
+* Date:     SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
+*
+* Purpose:  Constructs a new, smaller alignment by sampling a given
+*           number of sequences at random. Does not change the
+*           alignment nor the order of the sequences.
+*
+*           If you ask for a sample that is larger than nseqs,
+*           it silently returns the original alignment.
+*
+*           Not really a weighting method, but this is as good
+*           a place as any to keep it, since it's similar in
+*           construction to FilterAlignment().
+*
+* Args:     msa      -- original alignment
+*           sample   -- number of sequences in new alignment (0 < sample <= nseq)
+*           ret_new  -- RETURN: new MSA
+*
+* Return:   (void)
+*           ret_new must be free'd by caller: MSAFree().
+*/
+void
+SampleAlignment(MSA *msa, int sample, MSA **ret_new)
+{
+int    *list;                 /* array for random selection w/o replace */
+int    *useme;                /* array of flags 0..nseq-1: TRUE to use */
+int     i, idx;
+int     len;
+/* Allocations
+*/
+list  = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+for (i = 0; i < msa->nseq; i++)
+{
+list[i]  = i;
+useme[i] = FALSE;
+}
+/* Sanity check.
+*/
+if (sample >= msa->nseq) sample = msa->nseq;
+				/* random selection w/o replacement */
+for (len = msa->nseq, i = 0; i < sample; i++)
+{
+idx = CHOOSE(len);
+printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
+useme[list[idx]] = TRUE;
+list[idx] = list[--len];
+}
+MSASmallerAlignment(msa, useme, ret_new);
+free(list);
+free(useme);
+return;
+}
+/* Function: SingleLinkCluster()
+* Date:     SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
+*
+* Purpose:  Perform simple single link clustering of seqs in a
+*           sequence alignment. A pairwise identity threshold
+*           defines whether two sequences are linked or not.
+*
+*           Important: runs in O(N) memory, unlike standard
+*           graph decomposition algorithms that use O(N^2)
+*           adjacency matrices or adjacency lists. Requires
+*           O(N^2) time in worst case (which is when you have
+*           no links at all), O(NlogN) in "average"
+*           case, and O(N) in best case (when there is just
+*           one cluster in a completely connected graph.
+*
+*           (Developed because hmmbuild could no longer deal
+*           with GP120, a 16,013 sequence alignment.)
+*
+* Limitations:
+*           CASE-SENSITIVE. Assumes aseq have been put into
+*           either all lower or all upper case; or at least,
+*           within a column, there's no mixed case.
+*
+* Algorithm:
+*           I don't know if this algorithm is published. I
+*           haven't seen it in graph theory books, but that might
+*           be because it's so obvious that nobody's bothered.
+*
+*           In brief, we're going to do a breadth-first search
+*           of the graph, and we're going to calculate links
+*           on the fly rather than precalculating them into
+*           some sort of standard adjacency structure.
+*
+*           While working, we keep two stacks of maximum length N:
+*                a : list of vertices that are still unconnected.
+*                b : list of vertices that we've connected to
+*                    in our current breadth level, but we haven't
+*                    yet tested for other connections to a.
+*           The current length (number of elements in) a and b are
+*           kept in na, nb.
+*
+*           We store our results in an array of length N:
+*                c : assigns each vertex to a component. for example
+*                    c[4] = 1 means that vertex 4 is in component 1.
+*                    nc is the number of components. Components
+*                    are numbered from 0 to nc-1. We return c and nc
+*                    to our caller.
+*
+*           The algorithm is:
+*
+*           Initialisation:
+*                a  <-- all the vertices
+*                na <-- N
+*                b  <-- empty set
+*                nb <-- 0
+*                nc <-- 0
+*
+*           Then:
+*                while (a is not empty)
+*                  pop a vertex off a, push onto b
+*                  while (b is not empty)
+*                    pop vertex v off b
+*                    assign c[v] = nc
+*                    for each vertex w in a:
+*                       compare v,w. If w is linked to v, remove w
+*                       from a, push onto b.
+*                  nc++
+*           q.e.d. :)
+*
+* Args:     aseq   - aligned sequences
+*           nseq   - number of sequences in aseq
+*           alen   - alignment length
+*           maxid  - fractional identity threshold 0..1. if id >= maxid, seqs linked
+*           ret_c  - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
+*           ret_nc - RETURN: number of components
+*
+* Returns:  void.
+*           ret_c is allocated here. Caller free's with free(*ret_c)
+*/
+void
+SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
+		  int **ret_c, int *ret_nc)
+{
+int *a, na;                   /* stack of available vertices */
+int *b, nb;                   /* stack of working vertices   */
+int *c;                       /* array of results            */
+int  nc;			/* total number of components  */
+int  v,w;			/* index of a working vertices */
+int  i;			/* loop counter */
+/* allocations and initializations
+*/
+a = MallocOrDie (sizeof(int) * nseq);
+b = MallocOrDie (sizeof(int) * nseq);
+c = MallocOrDie (sizeof(int) * nseq);
+for (i = 0; i < nseq; i++) a[i] = i;
+na = nseq;
+nb = 0;
+nc = 0;
+/* Main algorithm
+*/
+while (na > 0)
+{
+v = a[na-1]; na--;	/* pop a vertex off a, */
+b[nb] = v;   nb++;	/* and push onto b     */
+while (nb > 0)
+	{
+	  v    = b[nb-1]; nb--;	/* pop vertex off b          */
+	  c[v] = nc;		/* assign it to component nc */
+	  for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
+	    if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
+	      {
+		w = a[i]; a[i] = a[na-1]; na--;	/* delete w from a (note swap) */
+		b[nb] = w; nb++;                /* push w onto b */
+	      }
+	}
+nc++;
+}
+/* Cleanup and return
+*/
+free(a);
+free(b);
+*ret_c  = c;
+*ret_nc = nc;
+return;
+}

Mercurial > repos > clustalomega > clustalomega

comparison clustalomega/clustal-omega-0.2.0/src/squid/weight.c @ 0:ff1768533a07