view clustalomega/clustal-omega-0.2.0/src/squid/stockholm.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
line wrap: on
line source

/*****************************************************************
 * SQUID - a library of functions for biological sequence analysis
 * Copyright (C) 1992-2002 Washington University School of Medicine
 * 
 *     This source code is freely distributed under the terms of the
 *     GNU General Public License. See the files COPYRIGHT and LICENSE
 *     for details.
 *****************************************************************/

/* stockholm.c
 * SRE, Fri May 28 15:46:41 1999
 * 
 * Reading/writing of Stockholm format multiple sequence alignments.
 * 
 * example of API:
 * 
 * MSA     *msa;
 * FILE    *fp;        -- opened for write with fopen()
 * MSAFILE *afp;       -- opened for read with MSAFileOpen()
 *      
 * while ((msa = ReadStockholm(afp)) != NULL)
 *   {
 *      WriteStockholm(fp, msa);
 *      MSAFree(msa);
 *   }
 * 
 * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
 */
#include <stdio.h>
#include <string.h>
#include "squid.h"
#include "msa.h"

static int  parse_gf(MSA *msa, char *buf);
static int  parse_gs(MSA *msa, char *buf);
static int  parse_gc(MSA *msa, char *buf);
static int  parse_gr(MSA *msa, char *buf);
static int  parse_comment(MSA *msa, char *buf);
static int  parse_sequence(MSA *msa, char *buf);
static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);

#ifdef TESTDRIVE_STOCKHOLM
/*****************************************************************
 * stockholm.c test driver: 
 * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm 
 * 
 */
int
main(int argc, char **argv)
{
  MSAFILE *afp;
  MSA     *msa;
  char    *file;
  
  file = argv[1];

  if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
    Die("Couldn't open %s\n", file);

  while ((msa = ReadStockholm(afp)) != NULL)
    {
      WriteStockholm(stdout, msa);
      MSAFree(msa); 
    }
  
  MSAFileClose(afp);
  exit(0);
}
/******************************************************************/
#endif /* testdriver */


/* Function: ReadStockholm()
 * Date:     SRE, Fri May 21 17:33:10 1999 [St. Louis]
 *
 * Purpose:  Parse the next alignment from an open Stockholm
 *           format alignment file. Return the alignment, or
 *           NULL if there are no more alignments in the file.
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *   - an alignment object. 
 *                     caller responsible for an MSAFree() 
 *           NULL if no more alignments
 *
 * Diagnostics:
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs 
 */
MSA *
ReadStockholm(MSAFILE *afp)
{
  MSA   *msa;
  char  *s;
  int    status;

  if (feof(afp->f)) return NULL;

  /* Initialize allocation of the MSA.
   */
  msa = MSAAlloc(10, 0);

  /* Check the magic Stockholm header line.
   * We have to skip blank lines here, else we perceive
   * trailing blank lines in a file as a format error when
   * reading in multi-record mode.
   */
  do {
    if ((s = MSAFileGetLine(afp)) == NULL) {
      MSAFree(msa);
      return NULL;
    }
  } while (IsBlankline(s));

  if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
    Die("\
File %s doesn't appear to be in Stockholm format.\n\
Assuming there isn't some other problem with your file (it is an\n\
alignment file, right?), please either:\n\
  a) use the Babelfish format autotranslator option (-B, usually);\n\
  b) specify the file's format with the --informat option; or\n\
  a) reformat the alignment to Stockholm format.\n", 
	afp->fname);

  /* Read the alignment file one line at a time.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      while (*s == ' ' || *s == '\t') s++;  /* skip leading whitespace */

      if (*s == '#') {
	if      (strncmp(s, "#=GF", 4) == 0)   status = parse_gf(msa, s);
	else if (strncmp(s, "#=GS", 4) == 0)   status = parse_gs(msa, s);
	else if (strncmp(s, "#=GC", 4) == 0)   status = parse_gc(msa, s);
	else if (strncmp(s, "#=GR", 4) == 0)   status = parse_gr(msa, s);
	else                                   status = parse_comment(msa, s);
      } 
      else if (strncmp(s, "//",   2) == 0)   break;
      else if (*s == '\n')                   continue;
      else                                   status = parse_sequence(msa, s);

      if (status == 0)  
	Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
	    afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
    }

  if (s == NULL && msa->nseq != 0)
    Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);

  if (s == NULL && msa->nseq == 0) {
    				/* probably just some junk at end of file */
      MSAFree(msa); 
      return NULL; 
    }
  
  MSAVerifyParse(msa);
  return msa;
}


/* Function: WriteStockholm()
 * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in standard multi-block 
 *           Stockholm format to an open file. A wrapper
 *           for actually_write_stockholm().
 *
 * Args:     fp  - file that's open for writing
 *           msa - alignment to write    
 *
 * Returns:  (void)
 */
void
WriteStockholm(FILE *fp, MSA *msa)
{
  actually_write_stockholm(fp, msa, 50); /* 50 char per block */
}

/* Function: WriteStockholmOneBlock()
 * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in Pfam's single-block
 *           Stockholm format to an open file. A wrapper
 *           for actually_write_stockholm().
 *
 * Args:     fp  - file that's open for writing
 *           msa - alignment to write    
 *
 * Returns:  (void)
 */
void
WriteStockholmOneBlock(FILE *fp, MSA *msa)
{
  actually_write_stockholm(fp, msa, msa->alen); /* one big block */
}


/* Function: actually_write_stockholm()
 * Date:     SRE, Fri May 21 17:39:22 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in Stockholm format to 
 *           an open file. This is the function that actually
 *           does the work. The API's WriteStockholm()
 *           and WriteStockholmOneBlock() are wrappers.
 *
 * Args:     fp    - file that's open for writing
 *           msa   - alignment to write        
 *           cpl   - characters to write per line in alignment block
 *
 * Returns:  (void)
 */
static void
actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
{
  int  i, j;
  int  len = 0;
  int  namewidth;
  int  typewidth = 0;		/* markup tags are up to 5 chars long */
  int  markupwidth = 0;		/* #=GR, #=GC are four char wide + 1 space */
  char *buf;
  int  currpos;
  char *s, *tok;
  
  /* Figure out how much space we need for name + markup
   * to keep the alignment in register. Required by Stockholm
   * spec, even though our Stockholm parser doesn't care (Erik's does).
   */
  namewidth = 0;
  for (i = 0; i < msa->nseq; i++)
    if ((len = strlen(msa->sqname[i])) > namewidth) 
      namewidth = len;

  /* Figure out how much space we need for markup tags
   *   markupwidth = always 4 if we're doing markup:  strlen("#=GR")
   *   typewidth   = longest markup tag
   */
  if (msa->ss      != NULL) { markupwidth = 4; typewidth = 2; }
  if (msa->sa      != NULL) { markupwidth = 4; typewidth = 2; }
  for (i = 0; i < msa->ngr; i++)
    if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;

  if (msa->rf      != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
  if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
  if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
  for (i = 0; i < msa->ngc; i++)
    if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
  
  buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61)); 

  /* Magic Stockholm header
   */
  fprintf(fp, "# STOCKHOLM 1.0\n");

  /* Free text comments
   */
  for (i = 0;  i < msa->ncomment; i++)
    fprintf(fp, "# %s\n", msa->comment[i]);
  if (msa->ncomment > 0) fprintf(fp, "\n");

  /* GF section: per-file annotation
   */
  if (msa->name  != NULL)       fprintf(fp, "#=GF ID    %s\n", msa->name);
  if (msa->acc   != NULL)       fprintf(fp, "#=GF AC    %s\n", msa->acc);
  if (msa->desc  != NULL)       fprintf(fp, "#=GF DE    %s\n", msa->desc);
  if (msa->au    != NULL)       fprintf(fp, "#=GF AU    %s\n", msa->au);
  
  /* Thresholds are hacky. Pfam has two. Rfam has one.
   */
  if      (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
    fprintf(fp, "#=GF GA    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
  else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
    fprintf(fp, "#=GF GA    %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
  if      (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
    fprintf(fp, "#=GF NC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
  else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
    fprintf(fp, "#=GF NC    %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
  if      (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
    fprintf(fp, "#=GF TC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
  else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
    fprintf(fp, "#=GF TC    %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);

  for (i = 0; i < msa->ngf; i++)
    fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]); 
  fprintf(fp, "\n");


  /* GS section: per-sequence annotation
   */
  if (msa->flags & MSA_SET_WGT) 
    {
      for (i = 0; i < msa->nseq; i++) 
	fprintf(fp, "#=GS %-*.*s WT    %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
      fprintf(fp, "\n");
    }
  if (msa->sqacc != NULL) 
    {
      for (i = 0; i < msa->nseq; i++) 
	if (msa->sqacc[i] != NULL)
	  fprintf(fp, "#=GS %-*.*s AC    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
      fprintf(fp, "\n");
    }
  if (msa->sqdesc != NULL) 
    {
      for (i = 0; i < msa->nseq; i++) 
	if (msa->sqdesc[i] != NULL)
	  fprintf(fp, "#=GS %*.*s DE    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
      fprintf(fp, "\n");
    }
  for (i = 0; i < msa->ngs; i++)
    {
      /* Multiannotated GS tags are possible; for example, 
       *     #=GS foo DR PDB; 1xxx;
       *     #=GS foo DR PDB; 2yyy;
       * These are stored, for example, as:
       *     msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
       * and must be decomposed.
       */
      for (j = 0; j < msa->nseq; j++)
	if (msa->gs[i][j] != NULL)
	  {
	    s = msa->gs[i][j];
	    while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
	      fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
		      msa->sqname[j], msa->gs_tag[i], tok);
	  }
      fprintf(fp, "\n");
    }

  /* Alignment section:
   * contains aligned sequence, #=GR annotation, and #=GC annotation
   */
  for (currpos = 0; currpos < msa->alen; currpos += cpl)
    {
      if (currpos > 0) fprintf(fp, "\n");
      for (i = 0; i < msa->nseq; i++)
	{
	  strncpy(buf, msa->aseq[i] + currpos, cpl);
	  buf[cpl] = '\0';	      
	  fprintf(fp, "%-*.*s  %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth, 
		  msa->sqname[i], buf);

	  if (msa->ss != NULL && msa->ss[i] != NULL) {
	    strncpy(buf, msa->ss[i] + currpos, cpl);
	    buf[cpl] = '\0';	 
	    fprintf(fp, "#=GR %-*.*s SS     %s\n", namewidth, namewidth, msa->sqname[i], buf);
	  }
	  if (msa->sa != NULL && msa->sa[i] != NULL) {
	    strncpy(buf, msa->sa[i] + currpos, cpl);
	    buf[cpl] = '\0';
	    fprintf(fp, "#=GR %-*.*s SA     %s\n", namewidth, namewidth, msa->sqname[i], buf);
	  }
	  for (j = 0; j < msa->ngr; j++)
	    if (msa->gr[j][i] != NULL) {
	      strncpy(buf, msa->gr[j][i] + currpos, cpl);
	      buf[cpl] = '\0';
	      fprintf(fp, "#=GR %-*.*s %5s  %s\n", 
		      namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
	    }
	}
      if (msa->ss_cons != NULL) {
	strncpy(buf, msa->ss_cons + currpos, cpl);
	buf[cpl] = '\0';
	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
      }

      if (msa->sa_cons != NULL) {
	strncpy(buf, msa->sa_cons + currpos, cpl);
	buf[cpl] = '\0';
	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
      }

      if (msa->rf != NULL) {
	strncpy(buf, msa->rf + currpos, cpl);
	buf[cpl] = '\0';
	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
      }
      for (j = 0; j < msa->ngc; j++) {
	strncpy(buf, msa->gc[j] + currpos, cpl);
	buf[cpl] = '\0';
	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, 
		msa->gc_tag[j], buf);
      }
    }
  fprintf(fp, "//\n");
  free(buf);
}





/* Format of a GF line:
 *    #=GF <featurename> <text>
 */
static int
parse_gf(MSA *msa, char *buf)
{
  char *gf;
  char *featurename;
  char *text;
  char *s;

  s = buf;
  if ((gf          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
  while (*text && (*text == ' ' || *text == '\t')) text++;

  if      (strcmp(featurename, "ID") == 0) 
    msa->name                 = sre_strdup(text, -1);
  else if (strcmp(featurename, "AC") == 0) 
    msa->acc                  = sre_strdup(text, -1);
  else if (strcmp(featurename, "DE") == 0) 
    msa->desc                 = sre_strdup(text, -1);
  else if (strcmp(featurename, "AU") == 0) 
    msa->au                   = sre_strdup(text, -1);
  else if (strcmp(featurename, "GA") == 0) 
    {				/* Pfam has GA1, GA2. Rfam just has GA1. */
      s = text;
      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
      msa->cutoff[MSA_CUTOFF_GA1]        = atof(text);
      msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
      if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
	msa->cutoff[MSA_CUTOFF_GA2]        = atof(text);
	msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
      }
    }
  else if (strcmp(featurename, "NC") == 0) 
    {
      s = text;
      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
      msa->cutoff[MSA_CUTOFF_NC1]        = atof(text);
      msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
      if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
	msa->cutoff[MSA_CUTOFF_NC2]        = atof(text);
	msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
      }
    }
  else if (strcmp(featurename, "TC") == 0) 
    {
      s = text;
      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
      msa->cutoff[MSA_CUTOFF_TC1]        = atof(text);
      msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
      if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
	msa->cutoff[MSA_CUTOFF_TC2]        = atof(text);
	msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
      }
    }
  else 
    MSAAddGF(msa, featurename, text);

  return 1;
}


/* Format of a GS line:
 *    #=GS <seqname> <featurename> <text>
 */
static int
parse_gs(MSA *msa, char *buf)
{
  char *gs;
  char *seqname;
  char *featurename;
  char *text; 
  int   seqidx;
  char *s;

  s = buf;
  if ((gs          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
  while (*text && (*text == ' ' || *text == '\t')) text++;
  
  /* GS usually follows another GS; guess lastidx+1
   */
  seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
  msa->lastidx = seqidx;

  if (strcmp(featurename, "WT") == 0)
    {
      msa->wgt[seqidx]          = atof(text);
      msa->flags |= MSA_SET_WGT;
    }

  else if (strcmp(featurename, "AC") == 0)
    MSASetSeqAccession(msa, seqidx, text);

  else if (strcmp(featurename, "DE") == 0)
    MSASetSeqDescription(msa, seqidx, text);

  else				
    MSAAddGS(msa, featurename, seqidx, text);

  return 1;
}

/* Format of a GC line:
 *    #=GC <featurename> <text>
 */
static int 
parse_gc(MSA *msa, char *buf)
{
  char *gc;
  char *featurename;
  char *text; 
  char *s;
  int   len;

  s = buf;
  if ((gc          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
  
  if (strcmp(featurename, "SS_cons") == 0)
    sre_strcat(&(msa->ss_cons), -1, text, len);
  else if (strcmp(featurename, "SA_cons") == 0)
    sre_strcat(&(msa->sa_cons), -1, text, len);
  else if (strcmp(featurename, "RF") == 0)
    sre_strcat(&(msa->rf), -1, text, len);
  else
    MSAAppendGC(msa, featurename, text);

  return 1;
}

/* Format of a GR line:
 *    #=GR <seqname> <featurename> <text>
 */
static int
parse_gr(MSA *msa, char *buf)
{
  char *gr;
  char *seqname;
  char *featurename;
  char *text;
  int   seqidx;
  int   len;
  int   j;
  char *s;

  s = buf;
  if ((gr          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;

  /* GR usually follows sequence it refers to; guess msa->lastidx */
  seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
  msa->lastidx = seqidx;

  if (strcmp(featurename, "SS") == 0) 
    {
      if (msa->ss == NULL)
	{
	  msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
	  msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
	  for (j = 0; j < msa->nseqalloc; j++)
	    {
	      msa->ss[j]    = NULL;
	      msa->sslen[j] = 0;
	    }
	}
      msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
    }
  else if (strcmp(featurename, "SA") == 0)
    {
      if (msa->sa == NULL)
	{
	  msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
	  msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
	  for (j = 0; j < msa->nseqalloc; j++) 
	    {
	      msa->sa[j]    = NULL;
	      msa->salen[j] = 0;
	    }
	}
      msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
    }
  else 
    MSAAppendGR(msa, featurename, seqidx, text);

  return 1;
}


/* comments are simply stored verbatim, not parsed
 */
static int
parse_comment(MSA *msa, char *buf)
{
  char *s;
  char *comment;

  s = buf + 1;			               /* skip leading '#' */
  if (*s == '\n') { *s = '\0'; comment = s; }  /* deal with blank comment */
  else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
  
  MSAAddComment(msa, comment);
  return 1;
}

static int
parse_sequence(MSA *msa, char *buf)
{
  char *s;
  char *seqname;
  char *text;
  int   seqidx;
  int   len;

  s = buf;
  if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
  if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; 
  
  /* seq usually follows another seq; guess msa->lastidx +1 */
  seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
  msa->lastidx = seqidx;

  msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);
  return 1;
}