view clustalomega/clustal-omega-0.2.0/src/squid/gki.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
line wrap: on
line source

/*****************************************************************
 * SQUID - a library of functions for biological sequence analysis
 * Copyright (C) 1992-2002 Washington University School of Medicine
 * 
 *     This source code is freely distributed under the terms of the
 *     GNU General Public License. See the files COPYRIGHT and LICENSE
 *     for details.
 *****************************************************************/

/* gki.c
 * SRE, Sat May  1 14:49:08 1999
 * 
 * "generic key index" module: emulation of Perl hashes.
 * Maps keys (ASCII char strings) to array index. Dynamically
 * resizes the hash table. 
 * 
 * Limitations:
 *     - hash table can only grow; no provision for deleting keys
 *       or downsizing the hash table.
 *     - Maximum hash table size set at 100003. Performance 
 *       will degrade for key sets much larger than this.
 *     - Assumes that integers are 32 bits (or greater). 
 * 
 * Defines a typedef'd structure:
 *     gki           - a key index hash table.
 * Provides functions:
 *     GKIInit()     - start a hash table.
 *     GKIStoreKey() - store a new key, get a unique index.                        
 *     GKIKeyIndex() - retrieve an existing key's index.
 *     GKIFree()     - free a hash table.
 *     GKIStatus()   - Debugging: prints internal status of a hash struct
 *            
 *
 * Note that there are no dependencies on squid; the gki.c/gki.h
 * pair are base ANSI C and can be reused anywhere.
 *****************************************************************
 * 
 * API for storing/reading stuff: 
 * moral equivalent of Perl's $foo{$key} = whatever, $bar{$key} = whatever:
 *       #include "gki.h"
 *     
 *       gki  *hash;
 *       int   idx;
 *       char *key;
 *       
 *       hash = GKIInit();
 * (Storing:) 
 *       (foreach key) {
 *          idx = GKIStoreKey(hash, key);       
 *          (reallocate foo, bar as needed)
 *          foo[idx] = whatever;
 *          bar[idx] = whatever;
 *       }     
 * (Reading:)
 *       (foreach key) {
 *          idx = GKIKeyIndex(hash, key);
 *          if (idx == -1) {no_such_key; }
 *          (do something with) foo[idx];
 *          (do something with) bar[idx];
 *       }   
 *       GKIFree();
 *       
 *****************************************************************
 *
 * Timings on wrasse for 45402 keys in /usr/dict/words using
 * Tests/test_gki: 
 *      250 msec store      (6 usec/store)
 *      140 msec retrieve   (3 usec/retrieve)
 * and using the 13408 names of Pfam's GP120.full alignment:
 *       70 msec store      (5 usec/store)
 *       50 msec retrieve   (4 usec/retrieve)     
 * 
 * RCS $Id: gki.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: gki.c,v 1.3 2000/12/21 23:42:59 eddy Exp)
 */



#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "squid.h"
#include "gki.h"

/* 
 *   Best hash table sizes are prime numbers (see Knuth vol 3, Sorting
 * and Searching). 
 *   gki_primes[] defines the ascending order of hash table sizes
 * that we use in upsizing the hash table dynamically.
 *   useful site for testing primes:
 * http://www.idbsu.edu/people/jbrennan/algebra/numbers/sieve.html
 *   Because of the way gki_hashvalue works, the largest number
 * must be < INT_MAX / 128 / 128 : 131072 on a 32 bit machine.
 */
static int gki_primes[]  = { 101, 1009, 10007, 100003 };
#define GKI_NPRIMES      4
#define GKI_ALPHABETSIZE 128

static GKI *gki_alloc(int primelevel);
static int  gki_hashvalue(GKI *hash, char *key);
static int  gki_upsize(GKI *old);


/* Function: GKIInit()
 * Date:     SRE, Sat May  1 11:12:24 1999 [May Day geek-out]
 *
 * Purpose:  Initialize a hash table for key indexing.  
 *           Simply a wrapper around a level 0 gki_alloc().
 *
 * Args:     (void)
 *
 * Returns:  An allocated hash table structure.
 *           Caller frees with GKIFree().
 */
GKI *
GKIInit(void)
{
  GKI *hash;
  hash = gki_alloc(0);
  return hash;
}

/* Function: GKIFree()
 * Date:     SRE, Sat May  1 11:13:26 1999 [May Day geek-out]
 *
 * Purpose:  Free a key index hash table.
 *
 * Args:     hash  - the gki structure
 *
 * Returns:  (void).
 *           hash table is destroyed.
 */
void
GKIFree(GKI *hash)
{
  struct gki_elem *ptr;
  int       i;

  if (hash == NULL) return;	/* tolerate a NULL */

  for (i = 0; i < hash->nhash; i++)
    while (hash->table[i] != NULL)
      {
	ptr = hash->table[i]->nxt;
				/* NULL keys can occur after we've gki_upsize'd */
	if (hash->table[i]->key != NULL) free(hash->table[i]->key);
	free(hash->table[i]);
	hash->table[i] = ptr;
      }
  free(hash->table);
  free(hash);
}

/* Function: GKIStoreKey()
 * Date:     SRE, Sat May  1 11:16:48 1999 [May Day geek-out]
 *
 * Purpose:  Store a key in the key index hash table.
 *           Associate it with a unique "key index", counting
 *           from 0. (It's this index that lets us map
 *           the hashed keys to indexed C arrays, (clumsily)
 *           emulating Perl's hashes.)
 *
 *           Does *not* check to see if the key's already
 *           in the table, so it's possible to store multiple
 *           copies of a key with different indices; probably
 *           not what you want, so if you're not sure the 
 *           key is unique, check the table first with 
 *           GKIKeyIndex().
 *
 * Args:     hash  - GKI structure to store the key in
 *           key   - string to store
 *
 * Returns:  the new key's index. Since it's always the
 *           last one in the current array, this index is
 *           just hash->nkeys-1.
 *           On a malloc failure, returns -1.
 *           hash table is modified.
 */
int
GKIStoreKey(GKI *hash, char *key)
{
  int val;
  struct gki_elem *ptr;

  val = gki_hashvalue(hash, key);
  
  ptr = hash->table[val];
  hash->table[val]      = MallocOrDie(sizeof(struct gki_elem));
  hash->table[val]->key = MallocOrDie(sizeof(char) * (strlen(key)+1));
  strcpy(hash->table[val]->key, key);

  hash->table[val]->idx = hash->nkeys;
  hash->table[val]->nxt = ptr;

  hash->nkeys++;
				/* time to upsize? */
  if (hash->nkeys > 3*hash->nhash && hash->primelevel < GKI_NPRIMES-1)
    gki_upsize(hash);

  return hash->nkeys-1; 
}

/* Function: GKIKeyIndex()
 * Date:     SRE, Sat May  1 11:20:42 1999 [May Day geek-out]
 *
 * Purpose:  Look up a key in the hash table. Return
 *           its index (0..nkeys-1), else -1 if the key
 *           isn't in the hash (yet).
 *           
 * Args:     hash  - the GKI hash table to search in
 *           key   - the key to look up        
 *
 * Returns:  -1 if key is not found;
 *           index of key if it is found (range 0..nkeys-1).
 *           hash table is unchanged.
 */
int
GKIKeyIndex(GKI *hash, char *key)
{
  struct gki_elem *ptr;
  int val;
  
  val = gki_hashvalue(hash, key);
  for (ptr = hash->table[val]; ptr != NULL; ptr = ptr->nxt)
    if (strcmp(key, ptr->key) == 0) return ptr->idx;
  return -1;
}

/* Function: GKIStatus()
 * Date:     SRE, Sat May  1 11:11:13 1999 [St. Louis]
 *
 * Purpose:  (DEBUGGING) How are we doing? Calculate some
 *           simple statistics for the hash table.
 *
 * Args:     hash - the GKI hash table to look at
 *
 * Returns:  (void) 
 *           Prints diagnostics on stdout.
 *           hash table is unchanged.
 */
void
GKIStatus(GKI *hash)
{
  struct gki_elem *ptr;
  int i;
  int nkeys;
  int nempty  = 0;
  int maxkeys = -1;
  int minkeys = INT_MAX;

  for (i = 0; i < hash->nhash; i++)
    {
      nkeys = 0;
      for (ptr = hash->table[i]; ptr != NULL; ptr = ptr->nxt)
	nkeys++;

      if (nkeys == 0)      nempty++;
      if (nkeys > maxkeys) maxkeys = nkeys;
      if (nkeys < minkeys) minkeys = nkeys;
    }

  printf("Total keys:        %d\n", hash->nkeys);
  printf("Hash table size:   %d\n", hash->nhash);
  printf("Average occupancy: %.1f\n", (float) hash->nkeys / (float) hash->nhash);
  printf("Unoccupied slots:  %d\n", nempty);
  printf("Most in one slot:  %d\n", maxkeys);
  printf("Least in one slot: %d\n", minkeys);
  
}


/* Function: gki_alloc()
 * Date:     SRE, Sat May  1 11:55:47 1999 [May Day geek-out]
 *
 * Purpose:  Allocate a hash table structure with the
 *           size given by primelevel.
 *
 * Args:     primelevel - level 0..GKI_NPRIMES-1, specifying
 *                        the size of the table; see gki_primes[]
 *                        array.
 *
 * Returns:  An allocated hash table structure. 
 *           Caller frees with GKIFree().
 */
static GKI *
gki_alloc(int primelevel)
{
  GKI *hash;
  int  i;

  if (primelevel < 0 || primelevel >= GKI_NPRIMES) 
    Die("bad primelevel in gki_alloc()");
  hash = MallocOrDie(sizeof(GKI));

  hash->primelevel = primelevel;
  hash->nhash      = gki_primes[hash->primelevel];
  hash->table      = MallocOrDie(sizeof(struct gki_elem) * hash->nhash);
  for (i = 0; i < hash->nhash; i++)
    hash->table[i] = NULL;
  hash->nkeys = 0;
  return hash;
}  


/* Function: gki_hashvalue()
 * Date:     SRE, Sat May  1 11:14:10 1999 [May Day geek-out]
 *
 * Purpose:  Calculate the hash value for a key. Usually
 *           we expect a one-word key, but the function will
 *           hash any ASCII string effectively. The hash function
 *           is a simple one (see p. 233 of Sedgewick,
 *           Algorithms in C).
 *           Slightly optimized: does two characters at a time
 *           before doing the modulo; this gives us a significant
 *           speedup.  
 *
 * Args:     hash - the gki structure (we need to know the hash table size)
 *           key  - a string to calculate the hash value for       
 *
 * Returns:  a hash value, in the range 0..hash->nhash-1.
 *           hash table is unmodified.
 */
static int
gki_hashvalue(GKI *hash, char *key)
{
  int val = 0;

  for (; *key != '\0'; key++)
    {
      val = GKI_ALPHABETSIZE*val + *key; 
      if (*(++key) == '\0') { val = val % hash->nhash; break; }
      val = (GKI_ALPHABETSIZE*val + *key) % hash->nhash;
    }
  return val;
}

/* Function: gki_upsize()
 * Date:     SRE, Sat May  1 11:46:07 1999 [May Day geek-out]
 *
 * Purpose:  Grow the hash table to the next available size.
 *
 * Args:     old - the GKI hash table to reallocate.
 *
 * Returns:  1 on success (the hash table is changed);
 *           0 on failure; the table is already at its maximum size,
 *              and the hash table is returned unchanged.
 */
static int
gki_upsize(GKI *old)
{
  GKI      *new;
  int       i;
  struct gki_elem *optr;
  struct gki_elem *nptr;
  int       val;

  if (old->primelevel >= GKI_NPRIMES-1)  return 0;
  new = gki_alloc(old->primelevel+1);

  /* Read the old, store in the new, while *not changing*
   * any key indices. Because of the way the lists are
   * treated as LIFO stacks, all the lists are reversed 
   * in the new structure.
   */
  for (i = 0; i < old->nhash; i++)
    {
      optr = old->table[i];
      while (optr != NULL)
	{
	  val = gki_hashvalue(new, optr->key);

	  nptr = new->table[val];
	  new->table[val]      = optr;
	  optr                 = optr->nxt;
	  new->table[val]->nxt = nptr;
	}
    }
  free(old->table);

  /* Now swap within the interior of the structures, so the old
   * structure is updated to the new structure.
   * (nkeys is identical, so we don't need to swap that element.)
   */
  old->primelevel = new->primelevel;
  old->nhash      = new->nhash;
  old->table      = new->table;
  free(new);
  return 1;
}