Mercurial > repos > clustalomega > clustalomega
diff clustalomega/clustal-omega-0.2.0/src/squid/weight.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustalomega/clustal-omega-0.2.0/src/squid/weight.c Tue Jun 07 17:04:25 2011 -0400 @@ -0,0 +1,748 @@ +/***************************************************************** + * SQUID - a library of functions for biological sequence analysis + * Copyright (C) 1992-2002 Washington University School of Medicine + * + * This source code is freely distributed under the terms of the + * GNU General Public License. See the files COPYRIGHT and LICENSE + * for details. + *****************************************************************/ + +/* weight.c + * SRE, Thu Mar 3 07:56:01 1994 + * + * Calculate weights for sequences in an alignment. + * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp) + */ + +#include <ctype.h> +#include <string.h> +#include "squid.h" +#include "sre_random.h" + +static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node); +static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, + float *fwt, int node); +static float simple_distance(char *s1, char *s2); +static int simple_diffmx(char **aseqs,int num, float ***ret_dmx); + +/* Function: GSCWeights() + * + * Purpose: Use Erik's tree-based algorithm to set weights for + * sequences in an alignment. upweight() and downweight() + * are derived from Graeme Mitchison's code. + * + * Args: aseq - array of (0..nseq-1) aligned sequences + * nseq - number of seqs in alignment + * alen - length of alignment + * wgt - allocated [0..nseq-1] array of weights to be returned + * + * Return: (void) + * wgt is filled in. + */ +void +GSCWeights(char **aseq, int nseq, int alen, float *wgt) +{ + float **dmx; /* distance (difference) matrix */ + struct phylo_s *tree; + float *lwt, *rwt; /* weight on left, right of this tree node */ + float *fwt; /* final weight assigned to this node */ + int i; + + /* Sanity check first + */ + if (nseq == 1) { wgt[0] = 1.0; return; } + + /* I use a simple fractional difference matrix derived by + * pairwise identity. Perhaps I should include a Poisson + * distance correction. + */ + MakeDiffMx(aseq, nseq, &dmx); + if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed"); + + /* Allocations + */ + lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); + rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); + fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); + + /* lwt and rwt are the total branch weight to the left and + * right of a node or sequence. They are 0..2N-2. 0..N-1 are + * the sequences; these have weight 0. N..2N-2 are the actual + * tree nodes. + */ + for (i = 0; i < nseq; i++) + lwt[i] = rwt[i] = 0.0; + /* recursively calculate rwt, lwt, starting + at node nseq (the root) */ + upweight(tree, nseq, lwt, rwt, nseq); + + /* recursively distribute weight across the + tree */ + fwt[nseq] = nseq; + downweight(tree, nseq, lwt, rwt, fwt, nseq); + /* collect the weights */ + for (i = 0; i < nseq; i++) + wgt[i] = fwt[i]; + + FMX2Free(dmx); + FreePhylo(tree, nseq); + free(lwt); free(rwt); free(fwt); +} + +static void +upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node) +{ + int ld,rd; + + ld = tree[node-nseq].left; + if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld); + rd = tree[node-nseq].right; + if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd); + lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen; + rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen; +} + + +static void +downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node) +{ + int ld,rd; + float lnum, rnum; + + ld = tree[node-nseq].left; + rd = tree[node-nseq].right; + if (lwt[node] + rwt[node] > 0.0) + { + fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node])); + fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node])); + } + else + { + lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0; + rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0; + fwt[ld] = fwt[node] * lnum / (lnum + rnum); + fwt[rd] = fwt[node] * rnum / (lnum + rnum); + } + + if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld); + if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd); +} + + + + +/* Function: VoronoiWeights() + * + * Purpose: Calculate weights using the scheme of Sibbald & + * Argos (JMB 216:813-818 1990). The scheme is + * slightly modified because the original algorithm + * actually doesn't work on gapped alignments. + * The sequences are assumed to be protein. + * + * Args: aseq - array of (0..nseq-1) aligned sequences + * nseq - number of sequences + * alen - length of alignment + * wgt - allocated [0..nseq-1] array of weights to be returned + * + * Return: void + * wgt is filled in. + */ +void +VoronoiWeights(char **aseq, int nseq, int alen, float *wgt) +{ + float **dmx; /* distance (difference) matrix */ + float *halfmin; /* 1/2 minimum distance to other seqs */ + char **psym; /* symbols seen in each column */ + int *nsym; /* # syms seen in each column */ + int symseen[27]; /* flags for observed syms */ + char *randseq; /* randomly generated sequence */ + int acol; /* pos in aligned columns */ + int idx; /* index in sequences */ + int symidx; /* 0..25 index for symbol */ + int i; /* generic counter */ + float min; /* minimum distance */ + float dist; /* distance between random and real */ + float challenge, champion; /* for resolving ties */ + int itscale; /* how many iterations per seq */ + int iteration; + int best; /* index of nearest real sequence */ + + /* Sanity check first + */ + if (nseq == 1) { wgt[0] = 1.0; return; } + + itscale = 50; + + /* Precalculate 1/2 minimum distance to other + * sequences for each sequence + */ + if (! simple_diffmx(aseq, nseq, &dmx)) + Die("simple_diffmx() failed"); + halfmin = MallocOrDie (sizeof(float) * nseq); + for (idx = 0; idx < nseq; idx++) + { + for (min = 1.0, i = 0; i < nseq; i++) + { + if (i == idx) continue; + if (dmx[idx][i] < min) min = dmx[idx][i]; + } + halfmin[idx] = min / 2.0; + } + Free2DArray((void **) dmx, nseq); + + /* Set up the random sequence generating model. + */ + psym = MallocOrDie (alen * sizeof(char *)); + nsym = MallocOrDie (alen * sizeof(int)); + for (acol = 0; acol < alen; acol++) + psym[acol] = MallocOrDie (27 * sizeof(char)); + +/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */ + for (acol = 0; acol < alen; acol++) + { + memset(symseen, 0, sizeof(int) * 27); + for (idx = 0; idx < nseq; idx++) + if (! isgap(aseq[idx][acol])) + { + if (isupper((int) aseq[idx][acol])) + symidx = aseq[idx][acol] - 'A'; + else + symidx = aseq[idx][acol] - 'a'; + if (symidx >= 0 && symidx < 26) + symseen[symidx] = 1; + } + else + symseen[26] = 1; /* a gap */ + + for (nsym[acol] = 0, i = 0; i < 26; i++) + if (symseen[i]) + { + psym[acol][nsym[acol]] = 'A'+i; + nsym[acol]++; + } + if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; } + } +/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */ + + /* Note: the original Sibbald&Argos algorithm calls for + * bounding the sampled space using a template-like random + * sequence generator. However, this leads to one minor + * and one major problem. The minor problem is that + * exceptional amino acids in a column can have a + * significant effect by altering the amount of sampled + * sequence space; the larger the data set, the worse + * this problem becomes. The major problem is that + * there is no reasonable way to deal with gaps. + * Gapped sequences simply inhabit a different dimensionality + * and it's pretty painful to imagine calculating Voronoi + * volumes when the N in your N-space is varying. + * Note that all the examples shown by Sibbald and Argos + * are *ungapped* examples. + * + * The best way I've found to circumvent this problem is + * just not to bound the sampled space; count gaps as + * symbols and generate completely random sequences. + */ +#ifdef ALL_SEQUENCE_SPACE + for (acol = 0; acol < alen; acol++) + { + strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY "); + nsym[acol] = 21; + } +#endif + + /* Sibbald and Argos algorithm: + * 1) assign all seqs weight 0. + * 2) generate a "random" sequence + * 3) calculate distance to every other sequence + * (if we get a distance < 1/2 minimum distance + * to other real seqs, we can stop) + * 4) if unique closest sequence, increment its weight 1. + * if multiple closest seq, choose one randomly + * 5) repeat 2-4 for lots of iterations + * 6) normalize all weights to sum to nseq. + */ + randseq = MallocOrDie ((alen+1) * sizeof(char)); + + best = 42.; /* solely to silence GCC uninit warnings. */ + FSet(wgt, nseq, 0.0); + for (iteration = 0; iteration < itscale * nseq; iteration++) + { + for (acol = 0; acol < alen; acol++) + randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])]; + randseq[acol] = '\0'; + + champion = sre_random(); + for (min = 1.0, idx = 0; idx < nseq; idx++) + { + dist = simple_distance(aseq[idx], randseq); + if (dist < halfmin[idx]) + { + best = idx; + break; + } + if (dist < min) + { champion = sre_random(); best = idx; min = dist; } + else if (dist == min) + { + challenge = sre_random(); + if (challenge > champion) + { champion = challenge; best = idx; min = dist; } + } + } + wgt[best] += 1.0; + } + + for (idx = 0; idx < nseq; idx++) + wgt[idx] = wgt[idx] / (float) itscale; + + free(randseq); + free(nsym); + free(halfmin); + Free2DArray((void **) psym, alen); +} + + +/* Function: simple_distance() + * + * Purpose: For two identical-length null-terminated strings, return + * the fractional difference between them. (0..1) + * (Gaps don't count toward anything.) + */ +static float +simple_distance(char *s1, char *s2) +{ + int diff = 0; + int valid = 0; + + for (; *s1 != '\0'; s1++, s2++) + { + if (isgap(*s1) || isgap(*s2)) continue; + if (*s1 != *s2) diff++; + valid++; + } + return (valid > 0 ? ((float) diff / (float) valid) : 0.0); +} + +/* Function: simple_diffmx() + * + * Purpose: Given a set of flushed, aligned sequences, construct + * an NxN fractional difference matrix using the + * simple_distance rule. + * + * Args: aseqs - flushed, aligned sequences + * num - number of aseqs + * ret_dmx - RETURN: difference matrix (caller must free) + * + * Return: 1 on success, 0 on failure. + */ +static int +simple_diffmx(char **aseqs, + int num, + float ***ret_dmx) +{ + float **dmx; /* RETURN: distance matrix */ + int i,j; /* counters over sequences */ + + /* Allocate + */ + if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL) + Die("malloc failed"); + for (i = 0; i < num; i++) + if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL) + Die("malloc failed"); + + /* Calculate distances, symmetric matrix + */ + for (i = 0; i < num; i++) + for (j = i; j < num; j++) + dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]); + + /* Return + */ + *ret_dmx = dmx; + return 1; +} + + + +/* Function: BlosumWeights() + * Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis) + * + * Purpose: Assign weights to a set of aligned sequences + * using the BLOSUM rule: + * - do single linkage clustering at some pairwise identity + * - in each cluster, give each sequence 1/clustsize + * total weight. + * + * The clusters have no pairwise link >= maxid. + * + * O(N) in memory. Probably ~O(NlogN) in time; O(N^2) + * in worst case, which is no links between sequences + * (e.g., values of maxid near 1.0). + * + * Args: aseqs - alignment + * nseq - number of seqs in alignment + * alen - # of columns in alignment + * maxid - fractional identity (e.g. 0.62 for BLOSUM62) + * wgt - [0..nseq-1] array of weights to be returned + */ +void +BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt) +{ + int *c, nc; + int *nmem; /* number of seqs in each cluster */ + int i; /* loop counter */ + + SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc); + + FSet(wgt, nseq, 1.0); + nmem = MallocOrDie(sizeof(int) * nc); + + for (i = 0; i < nc; i++) nmem[i] = 0; + for (i = 0; i < nseq; i++) nmem[c[i]]++; + for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]]; + + free(nmem); + free(c); + return; +} + + +/* Function: PositionBasedWeights() + * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis] + * + * Purpose: Implementation of Henikoff and Henikoff position-based + * weights (JMB 243:574-578, 1994) [Henikoff94b]. + * + * A significant advantage of this approach that Steve and Jorja + * don't point out is that it is O(N) in memory, unlike + * many other approaches like GSC weights or Voronoi. + * + * A potential disadvantage that they don't point out + * is that in the theoretical limit of infinite sequences + * in the alignment, weights go flat: eventually every + * column has at least one representative of each of 20 aa (or 4 nt) + * in it. + * + * They also don't give a rule for how to handle gaps. + * The rule used here seems the obvious and sensible one + * (ignore them). This means that longer sequences + * initially get more weight; hence a "double + * normalization" in which the weights are first divided + * by sequence length (to compensate for that effect), + * then normalized to sum to nseq. + * + * Limitations: + * Implemented in a way that's alphabet-independent: + * it uses the 26 upper case letters as "residues". + * Any alphabetic character in aseq is interpreted as + * a unique "residue" (case insensitively; lower case + * mapped to upper case). All other characters are + * interpreted as gaps. + * + * This way, we don't have to pass around any alphabet + * type info (DNA vs. RNA vs. protein) and don't have + * to deal with remapping IUPAC degenerate codes + * probabilistically. However, on the down side, + * a sequence with a lot of degenerate IUPAC characters + * will get an artifactually high PB weight. + * + * Args: aseq - sequence alignment to weight + * nseq - number of sequences in alignment + * alen - length of alignment + * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1) + * + * Returns: (void) + * wgt is allocated (0..nseq-1) by caller, and filled in here. + */ +void +PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt) +{ + int rescount[26]; /* count of A-Z residues in a column */ + int nres; /* number of different residues in col */ + int idx, pos; /* indices into aseq */ + int x; + float norm; + + FSet(wgt, nseq, 0.0); + for (pos = 0; pos < alen; pos++) + { + for (x = 0; x < 26; x++) rescount[x] = 0; + for (idx = 0; idx < nseq; idx++) + if (isalpha(aseq[idx][pos])) + rescount[toupper(aseq[idx][pos]) - 'A'] ++; + + nres = 0; + for (x = 0; x < 26; x++) + if (rescount[x] > 0) nres++; + + for (idx = 0; idx < nseq; idx++) + if (isalpha(aseq[idx][pos])) + wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']); + } + + for (idx = 0; idx < nseq; idx++) + wgt[idx] /= (float) DealignedLength(aseq[idx]); + norm = (float) nseq / FSum(wgt, nseq); + FScale(wgt, nseq, norm); + return; +} + + + + +/* Function: FilterAlignment() + * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis] + * + * Purpose: Constructs a new alignment by removing near-identical + * sequences from a given alignment (where identity is + * calculated *based on the alignment*). + * Does not affect the given alignment. + * Keeps earlier sequence, discards later one. + * + * Usually called as an ad hoc sequence "weighting" mechanism. + * + * Limitations: + * Unparsed Stockholm markup is not propagated into the + * new alignment. + * + * Args: msa -- original alignment + * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id. + * ret_new -- RETURN: new MSA, usually w/ fewer sequences + * + * Return: (void) + * ret_new must be free'd by caller: MSAFree(). + */ +void +FilterAlignment(MSA *msa, float cutoff, MSA **ret_new) +{ + int nnew; /* number of seqs in new alignment */ + int *list; + int *useme; + float ident; + int i,j; + int remove; + + /* find which seqs to keep (list) */ + /* diff matrix; allow ragged ends */ + list = MallocOrDie (sizeof(int) * msa->nseq); + useme = MallocOrDie (sizeof(int) * msa->nseq); + for (i = 0; i < msa->nseq; i++) useme[i] = FALSE; + + nnew = 0; + for (i = 0; i < msa->nseq; i++) + { + remove = FALSE; + for (j = 0; j < nnew; j++) + { + ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]); + if (ident > cutoff) + { + remove = TRUE; + printf("removing %12s -- fractional identity %.2f to %s\n", + msa->sqname[i], ident, + msa->sqname[list[j]]); + break; + } + } + if (remove == FALSE) { + list[nnew++] = i; + useme[i] = TRUE; + } + } + + MSASmallerAlignment(msa, useme, ret_new); + free(list); + free(useme); + return; +} + + +/* Function: SampleAlignment() + * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis] + * + * Purpose: Constructs a new, smaller alignment by sampling a given + * number of sequences at random. Does not change the + * alignment nor the order of the sequences. + * + * If you ask for a sample that is larger than nseqs, + * it silently returns the original alignment. + * + * Not really a weighting method, but this is as good + * a place as any to keep it, since it's similar in + * construction to FilterAlignment(). + * + * Args: msa -- original alignment + * sample -- number of sequences in new alignment (0 < sample <= nseq) + * ret_new -- RETURN: new MSA + * + * Return: (void) + * ret_new must be free'd by caller: MSAFree(). + */ +void +SampleAlignment(MSA *msa, int sample, MSA **ret_new) +{ + int *list; /* array for random selection w/o replace */ + int *useme; /* array of flags 0..nseq-1: TRUE to use */ + int i, idx; + int len; + + /* Allocations + */ + list = (int *) MallocOrDie (sizeof(int) * msa->nseq); + useme = (int *) MallocOrDie (sizeof(int) * msa->nseq); + for (i = 0; i < msa->nseq; i++) + { + list[i] = i; + useme[i] = FALSE; + } + + /* Sanity check. + */ + if (sample >= msa->nseq) sample = msa->nseq; + + /* random selection w/o replacement */ + for (len = msa->nseq, i = 0; i < sample; i++) + { + idx = CHOOSE(len); + printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]); + useme[list[idx]] = TRUE; + list[idx] = list[--len]; + } + + MSASmallerAlignment(msa, useme, ret_new); + free(list); + free(useme); + return; +} + + +/* Function: SingleLinkCluster() + * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis] + * + * Purpose: Perform simple single link clustering of seqs in a + * sequence alignment. A pairwise identity threshold + * defines whether two sequences are linked or not. + * + * Important: runs in O(N) memory, unlike standard + * graph decomposition algorithms that use O(N^2) + * adjacency matrices or adjacency lists. Requires + * O(N^2) time in worst case (which is when you have + * no links at all), O(NlogN) in "average" + * case, and O(N) in best case (when there is just + * one cluster in a completely connected graph. + * + * (Developed because hmmbuild could no longer deal + * with GP120, a 16,013 sequence alignment.) + * + * Limitations: + * CASE-SENSITIVE. Assumes aseq have been put into + * either all lower or all upper case; or at least, + * within a column, there's no mixed case. + * + * Algorithm: + * I don't know if this algorithm is published. I + * haven't seen it in graph theory books, but that might + * be because it's so obvious that nobody's bothered. + * + * In brief, we're going to do a breadth-first search + * of the graph, and we're going to calculate links + * on the fly rather than precalculating them into + * some sort of standard adjacency structure. + * + * While working, we keep two stacks of maximum length N: + * a : list of vertices that are still unconnected. + * b : list of vertices that we've connected to + * in our current breadth level, but we haven't + * yet tested for other connections to a. + * The current length (number of elements in) a and b are + * kept in na, nb. + * + * We store our results in an array of length N: + * c : assigns each vertex to a component. for example + * c[4] = 1 means that vertex 4 is in component 1. + * nc is the number of components. Components + * are numbered from 0 to nc-1. We return c and nc + * to our caller. + * + * The algorithm is: + * + * Initialisation: + * a <-- all the vertices + * na <-- N + * b <-- empty set + * nb <-- 0 + * nc <-- 0 + * + * Then: + * while (a is not empty) + * pop a vertex off a, push onto b + * while (b is not empty) + * pop vertex v off b + * assign c[v] = nc + * for each vertex w in a: + * compare v,w. If w is linked to v, remove w + * from a, push onto b. + * nc++ + * q.e.d. :) + * + * Args: aseq - aligned sequences + * nseq - number of sequences in aseq + * alen - alignment length + * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked + * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters) + * ret_nc - RETURN: number of components + * + * Returns: void. + * ret_c is allocated here. Caller free's with free(*ret_c) + */ +void +SingleLinkCluster(char **aseq, int nseq, int alen, float maxid, + int **ret_c, int *ret_nc) +{ + int *a, na; /* stack of available vertices */ + int *b, nb; /* stack of working vertices */ + int *c; /* array of results */ + int nc; /* total number of components */ + int v,w; /* index of a working vertices */ + int i; /* loop counter */ + + /* allocations and initializations + */ + a = MallocOrDie (sizeof(int) * nseq); + b = MallocOrDie (sizeof(int) * nseq); + c = MallocOrDie (sizeof(int) * nseq); + for (i = 0; i < nseq; i++) a[i] = i; + na = nseq; + nb = 0; + nc = 0; + + /* Main algorithm + */ + while (na > 0) + { + v = a[na-1]; na--; /* pop a vertex off a, */ + b[nb] = v; nb++; /* and push onto b */ + while (nb > 0) + { + v = b[nb-1]; nb--; /* pop vertex off b */ + c[v] = nc; /* assign it to component nc */ + for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/ + if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */ + { + w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */ + b[nb] = w; nb++; /* push w onto b */ + } + } + nc++; + } + + /* Cleanup and return + */ + free(a); + free(b); + *ret_c = c; + *ret_nc = nc; + return; +}