diff clustalomega/clustal-omega-0.2.0/src/clustal/symmatrix.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/clustalomega/clustal-omega-0.2.0/src/clustal/symmatrix.c	Tue Jun 07 17:04:25 2011 -0400
@@ -0,0 +1,496 @@
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
+
+/*********************************************************************
+ * Clustal Omega - Multiple sequence alignment
+ *
+ * Copyright (C) 2010 University College Dublin
+ *
+ * Clustal-Omega is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This file is part of Clustal-Omega.
+ *
+ ********************************************************************/
+
+/*
+ *  RCS $Id: symmatrix.c 230 2011-04-09 15:37:50Z andreas $
+ *
+ *
+ * Functions for symmetric (square) matrices including diagonal.
+ * supports the notion of non-square sub-matrices of a symmetric
+ * matrix, i.e. where |rows|<|cols|.
+ *
+ * FIXME Allocating one big chunk of memory is probably
+ * much faster and also easier to maintain.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include "symmatrix.h"
+
+
+#if 0
+#define DEBUG
+#endif
+
+#define MAX_BUF_SIZE 65536
+
+/**
+ * @brief Allocates symmat and its members and initialises them. Data
+ * will be calloced, i.e. initialised with zeros.
+ *
+ * @param[out] symmat
+ * newly allocated and initialised symmatrix instance
+ * @param[in] nrows
+ * number of rows
+ * @param[in]
+ * ncols number of columns
+ *
+ * @return: non-zero on error
+ *
+ * @see FreeSymMatrix()
+ *
+ * @note: symmat data will be of fake shape nrows x ncols
+ *
+ */
+int
+NewSymMatrix(symmatrix_t **symmat, int nrows, int ncols)
+{
+    int i; /* aux */
+    
+    assert(nrows>0 && ncols>0 && ncols>=nrows);
+    assert(ncols>0 && ncols>=nrows);
+
+    (*symmat) = (symmatrix_t *) malloc(1*sizeof(symmatrix_t));
+    if (NULL == (*symmat)) {
+        fprintf(stderr, "Couldn't allocate memory (%s|%s)\n",
+                __FILE__, __FUNCTION__);
+        return -1;
+    }
+    
+    (*symmat)->data = (double **) malloc(nrows * sizeof(double *));
+    if (NULL == (*symmat)->data) {
+        fprintf(stderr, "Couldn't allocate memory (%s|%s)\n",
+                __FILE__, __FUNCTION__);
+        free(*symmat);
+        *symmat = NULL;
+        return -1;
+    }
+    for (i=0; i<nrows; i++) {
+        (*symmat)->data[i] = (double *) calloc((ncols-i), sizeof(double));
+        if (NULL == (*symmat)->data[i]) {
+            fprintf(stderr, "Couldn't allocate memory (%s|%s)\n",
+                    __FILE__, __FUNCTION__); 
+            while (0!=--i) {
+                free((*symmat)->data[i]);
+            }           
+            free((*symmat)->data);
+            free(*symmat);
+            *symmat = NULL;
+            return -1;
+        }
+#ifdef TRACE
+        fprintf(stderr, "DEBUG(%s|%s():%d) initialising symmat with the number of the beast\n",
+                __FILE__, __FUNCTION__, __LINE__);
+        {
+            int j;
+            for (j=0; j<ncols-i; j++) {
+                (*symmat)->data[i][j] = -666.0;
+            }
+
+        }
+#endif
+    }
+    
+    (*symmat)->nrows = nrows;
+    (*symmat)->ncols = ncols;
+
+    return 0;
+}
+/***   end: new_symmatrix   ***/
+
+
+/**
+ * @brief Sets symmat data of given index to given value
+ *
+ * @param[in] symmat
+ * symmatrix_t whose data is to be set
+ * @param[in] i
+ * first index 
+ * @param[in] j
+ * second index 
+ * @param[in] value
+ * value used to set data point
+ *
+ * @see SymMatrixGetValue()
+ *
+ * @note This is a convenience function that checks index order.
+ *
+ */
+void
+SymMatrixSetValue(symmatrix_t *symmat, const int i, const int j, const double value)
+{
+    assert(NULL != symmat);
+
+    if (i<=j) {
+        assert(i < symmat->nrows && j < symmat->ncols);
+        symmat->data[i][j-i] = value;
+    } else {
+        assert(j < symmat->nrows && i < symmat->ncols);
+        symmat->data[j][i-j] = value;
+    }
+}
+/***   end: symmatrix_setvalue   ***/
+
+
+
+/**
+ * @brief Returns element of symmat corresponding to given indices
+ *
+ * @param[in] symmat
+ * symmatrix_t of question
+ * @param[in] i
+ * index i
+ * @param[in] j
+ * index j
+ *
+ * @return requested value
+ *
+ * @see SymMatrixSetValue()
+ *
+ * @note This is a convenience function that checks index order.
+ */
+double
+SymMatrixGetValue(symmatrix_t *symmat, const int i, const int j)
+{
+    assert(NULL != symmat);
+
+    if (i<=j) {
+        assert(i < symmat->nrows && j < symmat->ncols);
+        return symmat->data[i][j-i];
+    } else {
+        assert(j < symmat->nrows && i < symmat->ncols);
+        return symmat->data[j][i-j];
+    }
+}
+/***   end: symmatrix_getvalue   ***/
+
+
+
+
+
+/**
+ * @brief Returns a pointer to an element of symmat corresponding to
+ * given indices
+ *
+ * @param[out] val
+ * Value to be set
+ * @param[in] symmat
+ * symmatrix_t of question
+ * @param[in] i
+ * index i
+ * @param[in] j
+ * index j
+ *
+ * @return pointer to value
+ *
+ * @see SymMatrixGetValue()
+ *
+ * @note This is a convenience function that checks index order.
+ *
+ */
+void
+SymMatrixGetValueP(double **val, symmatrix_t *symmat,
+                   const int i, const int j)
+{
+    assert(NULL != symmat);
+    
+    if (i<=j) {
+        assert(i < symmat->nrows && j < symmat->ncols);
+        *val = &(symmat->data[i][j-i]);
+    } else {
+        assert(j < symmat->nrows && i < symmat->ncols);
+        *val = &(symmat->data[j][i-j]);
+    }
+}
+/***   end: symmatrix_getvaluep   ***/
+
+
+/**
+ * @brief Frees memory allocated by data members of symmat and symmat
+ * itself. 
+ *
+ * @param[in] symmat
+ * symmatrix_t to be freed
+ *
+ * @note Use in conjunction with NewSymMatrix()
+ *
+ * @see NewSymMatrix()
+ *
+ */
+void
+FreeSymMatrix(symmatrix_t **symmat)
+{
+    int i;
+    if (NULL != (*symmat)) {
+        if (NULL != (*symmat)->data) {
+            for (i=0; i<(*symmat)->nrows; i++) {
+                free((*symmat)->data[i]);
+            }
+            free((*symmat)->data);
+        }
+    }
+    free(*symmat);
+    *symmat = NULL;
+}
+/***   end: free_symmatrix   ***/
+
+
+
+/**
+ * @brief Print out a symmat in phylip style. Distances are printed on
+ * one line per sequence/object. Since we also support matrices with
+ * rows\<cols, the first line can also be nrows by ncolumns instead
+ * of just nrows
+ *
+ * @param[in] symmat
+ * the symmatrix_t to print
+ * @param[in] labels
+ * sequence/objects labels/names. must be at least of length symmat nrows
+ * @param[in] path
+ * filename or NULL. If NULL stdout will be used.
+ *
+ */
+void
+SymMatrixPrint(symmatrix_t *symmat, char **labels,  const char *path)
+{
+    FILE *fp = NULL;
+    int i, j; /* aux */
+    int max_label_len = 0;
+    
+    if (NULL==symmat || NULL==labels) {
+        fprintf(stderr,
+                "One of the provided arguments is empty or NULL (print_matrix)\n");
+        return;
+    }
+    if (NULL==path) {
+        fp = stdout;
+    } else {
+        if (NULL==(fp=fopen(path, "w"))) {
+            fprintf(stderr, "Couldn't open %s for writing.", path);
+            return;
+        }
+    }
+
+    /* find maximum label length for pretty printing
+     */
+    for (i=0; i<symmat->nrows; i++) {
+        int this_len;
+        assert(NULL != labels[i]);
+        this_len = strlen(labels[i]);
+        if (this_len>max_label_len)
+            max_label_len = this_len;
+    }
+
+    if (symmat->ncols==symmat->nrows) {
+        fprintf(fp, "%u\n", symmat->ncols);
+    } else {
+        /* this is not standard phylip, but necessary to indicate
+           seed matrices */
+        fprintf(fp, "%u x %u\n", symmat->nrows, symmat->ncols);
+    }
+    for (i=0; i<symmat->nrows; i++) {
+        /* phylip restriction is 10 characters. we don't care here
+         */
+        fprintf(fp, "%-*s", max_label_len, labels[i]);
+        /* we are lazy and do it like fastdist: write all distances
+         *  for one seq to one line
+         */
+        for (j=0; j<symmat->ncols; j++) {
+            fprintf(fp, " %f", SymMatrixGetValue(symmat, i, j));
+        }
+        fprintf(fp, "%s", "\n");
+    }
+    if (NULL != path) {
+        (void) fclose(fp);
+    } else {
+        (void) fflush(fp);
+    }
+}
+/***   end: SymMatrixPrint   ***/
+
+
+
+/**
+ *
+ * @brief Read a distance matrix in phylip format
+ *
+ * @param[in] pcFileIn
+ * distance matrix filename 
+ * @param[out] prSymMat_p
+ * the symmatrix_t. will be allocated here.
+ * @return:
+ * non-zero on error
+ *
+ * @note:
+ * FIXME untested code
+ */
+int
+SymMatrixRead(char *pcFileIn, symmatrix_t **prSymMat_p)
+{    
+    FILE *prFilePointer;
+    char *buf;
+    /* number of parsed sequences */
+    int iNParsedSeq = 0; 
+    /* number of parsed distances per sequence */
+    int iNParsedDists = 0;
+    /* total number of sequences/objects */
+    int iTotalNSeq = 0; 
+    int iRetCode = 0;
+
+    assert(NULL!=pcFileIn);
+
+    /* FIXME: test correctness and implement label checking */
+    fprintf(stderr, "WARNING: Reading of distance matrix from file not thoroughly tested!\n");
+    fprintf(stderr, "WARNING: Assuming same order of sequences in sequence file and distance matrix file (matching of labels not implemented)\n");
+
+    if (NULL == (buf = (char *) malloc(MAX_BUF_SIZE * sizeof(char)))) {
+        fprintf(stderr, "ERROR: couldn't allocate memory at %s:%s:%d\n",
+                __FILE__, __FUNCTION__, __LINE__);
+        return -1;
+    }
+    
+    if (NULL == (prFilePointer = fopen(pcFileIn, "r"))) {
+        fprintf(stderr, "ERROR: Couldn't open %s for reading\n", pcFileIn);
+        free(buf);
+        return -1;
+    }
+    
+    /* get number of sequences from first line and allocate memory for
+     * distance matrix
+     *
+     */
+    if (NULL == fgets(buf, MAX_BUF_SIZE, prFilePointer) ) {
+        fprintf(stderr, "Couldn't read first line from %s\n", pcFileIn);
+        iRetCode = -1;
+        goto closefile_and_freebuf;
+    }
+    if (strlen(buf)==MAX_BUF_SIZE-1) {
+        fprintf(stderr, "%s\n", "Looks like I couldn't read complete line. Wrong format (or too small MAX_BUF_SIZE)");
+        iRetCode = -1;
+        goto closefile_and_freebuf;
+    }
+    if (sscanf(buf, "%d", &iTotalNSeq)!=1) {
+        fprintf(stderr, "ERROR: couldn't parse number of sequences from first line of %s\n", pcFileIn); 
+        iRetCode = -1;
+        goto closefile_and_freebuf;
+    }
+
+#if TRACE
+    Log(&rLog, LOG_FORCED_DEBUG, "iTotalNSeq parsed from %s is %d\n", pcFileIn, iTotalNSeq);
+#endif
+    
+    if (NewSymMatrix(prSymMat_p, iTotalNSeq, iTotalNSeq)) {
+        fprintf(stderr, "FATAL %s", "Memory allocation for distance matrix failed");
+        iRetCode = -1;
+        goto closefile_and_freebuf;
+    }
+
+
+    /* parse file line by line
+     *
+     */
+    while (NULL != fgets(buf, MAX_BUF_SIZE, prFilePointer)) {
+        char *szToken;
+        int is_newseq;
+
+        if (MAX_BUF_SIZE-1 == strlen(buf)) {
+            fprintf(stderr, "%s\n", "Looks like I couldn't read complete line. Wrong format (or too small MAX_BUF_SIZE)");
+            iRetCode = -1;
+            goto closefile_and_freebuf;
+        }
+
+#ifdef DEBUG
+        Log(&rLog, LOG_FORCED_DEBUG, "Got line: %s\n", buf);
+#endif
+
+        /* new sequence label at beginning of line?
+         */
+        if (isblank(buf[0])) {
+            is_newseq = 0;
+        } else {
+            is_newseq = 1;
+        }
+
+        /* tokenise line and treat new sequence specially
+         */
+        szToken = strtok(buf, " \t");
+        if (is_newseq==1) {
+            iNParsedSeq++;
+            iNParsedDists=0;
+
+            /* if format is lower dimensional matrix then first
+             * sequence has no distances but might have newline
+             * character at it's end.
+             */
+            while (isspace(szToken[strlen(szToken)-1])) {
+                szToken[strlen(szToken)-1]='\0';
+            }
+            /* FIXME save label? */
+            szToken = strtok(NULL, " \t");
+        }
+        /* from here on it's only parsing of distances */
+        while (szToken != NULL) {
+            double dist;
+            iNParsedDists++;
+
+            /* only parse what's needed */
+            if (iNParsedDists!=iNParsedSeq) {
+                /* parse and store distance
+                 */
+                if (sscanf(szToken, "%lf", &dist)!=1) {
+                    fprintf(stderr, "Couldn't parse float from entry '%s'\n", szToken);
+                    iRetCode = -1;
+                    goto closefile_and_freebuf;
+                }
+#if TRACE
+                Log(&rLog, LOG_FORCED_DEBUG, "Parsed distance %d for seq %d = %f\n", iNParsedDists-1, iNParsedSeq-1, dist);
+#endif
+                SymMatrixSetValue(*prSymMat_p, iNParsedSeq-1, iNParsedDists-1, dist);
+                SymMatrixSetValue(*prSymMat_p, iNParsedDists-1, iNParsedSeq-1, dist);
+            }
+            szToken = strtok(NULL, " \t");
+        }        
+    }
+
+    if (iTotalNSeq!=iNParsedSeq) {
+        fprintf(stderr, "expected %d seqs, but only parsed %d\n", iTotalNSeq, iNParsedSeq);
+        iRetCode = -1;
+        goto closefile_and_freebuf;
+    }
+#if TRACE
+    for (i=0; i<iNParsedSeq; i++) {
+        int j;
+        for (j=0; j<iNParsedSeq; j++) {
+            Log(&rLog, LOG_FORCED_DEBUG, "prSymMat_p[%d][%d]=%f\n", i, j, (*prSymMat_p)[i][j]);
+        }
+    }
+#endif
+
+closefile_and_freebuf:
+    fclose(prFilePointer);
+    free(buf);
+    
+    return iRetCode;
+}
+/***   end: SymMatrixRead   ***/
+