diff mrfast-2.1.0.4/HashTable.c @ 0:7b3dc85dc7fd

Uploaded mrfast source tarball
author calkan
date Tue, 21 Feb 2012 10:29:47 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mrfast-2.1.0.4/HashTable.c	Tue Feb 21 10:29:47 2012 -0500
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this list
+ * of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or other
+ *   materials provided with the distribution.
+ * - Neither the names of the University of Washington, Simon Fraser University, 
+ *   nor the names of its contributors may be
+ *   used to endorse or promote products derived from this software without specific
+ *   prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+  Authors: 
+	Farhad Hormozdiari
+        Faraz Hach
+	Can Alkan
+  Emails: 
+	farhadh AT uw DOT edu
+	fhach AT cs DOT sfu DOT ca
+        calkan AT uw DOT edu
+*/
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "Common.h"
+#include "RefGenome.h"
+#include "HashTable.h"
+/**********************************************/
+FILE		*_ih_fp					= NULL;
+IHashTable	*_ih_hashTable			= NULL;
+int 		_ih_maxHashTableSize	= 0;
+char		*_ih_refGen				= NULL;
+char		*_ih_refGenName			= NULL;
+long long	_ih_memUsage			= 0;
+int			_ih_refGenOff			= 0;
+/**********************************************/
+
+int hashVal(char *seq)
+{
+	int i=0;
+	int val=0, numericVal=0;
+
+	while(i<WINDOW_SIZE)
+	{
+		switch (seq[i])
+		{
+			case 'A':
+				numericVal = 0;
+				break;
+			case 'C':
+				numericVal = 1;
+				break;
+			case 'G' :
+				numericVal = 2;
+				break;
+			case 'T':
+				numericVal = 3;
+				break;
+			default:
+				return -1;
+				break;
+		}
+		val = (val << 2)|numericVal;
+		i++;
+	}
+	return val;
+}
+/**********************************************/
+void freeIHashTableContent(IHashTable *hashTable, unsigned int maxSize)
+{
+	int i=0;
+	for (i=0; i<maxSize; i++)
+	{
+		if (hashTable[i].locs != NULL)
+		{
+			freeMem(hashTable[i].locs, (hashTable[i].locs[0]+1)*(sizeof(unsigned int)));
+			hashTable[i].locs = NULL;
+		}
+	}
+}
+/**********************************************/
+void initSavingIHashTable(char *fileName)
+{
+	int tmp;
+	_ih_fp = fileOpen(fileName, "w");
+	unsigned char bIndex = 0;				// Bisulfite Index
+	
+	// First Two bytes are indicating the type of the index & window size
+	tmp = fwrite(&bIndex, sizeof(bIndex), 1, _ih_fp);
+	tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp);
+	
+	if (tmp == 0)
+	  fprintf(stderr, "Write error while initializing hash table.\n");
+
+}
+/**********************************************/
+void finalizeSavingIHashTable()
+{
+	fclose(_ih_fp);
+}
+/**********************************************/
+void saveIHashTable(IHashTable *hashTable,  unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset)
+{
+	int tmp;
+	
+	// Every Chunk starts with a byte indicating whether it has extra info;
+	unsigned char extraInfo = 0;
+	tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp);
+
+	short len = strlen(refGenName);
+	tmp = fwrite(&len, sizeof(len), 1, _ih_fp);
+	tmp = fwrite(refGenName, sizeof(char), len, _ih_fp);
+
+	tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp);
+	
+	unsigned int refGenLength = strlen(refGen);
+	tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp);
+	
+	tmp = fwrite(refGen, sizeof(char), refGenLength, _ih_fp);
+	tmp = fwrite(&size, sizeof(size), 1, _ih_fp);
+
+	int i=0,j=0;
+	unsigned char cnt=0;
+	for (i=0; i<maxSize; i++)
+	{
+		if (hashTable[i].locs != NULL)
+		{
+			tmp = fwrite(&i, sizeof(i), 1, _ih_fp);
+
+			if (hashTable[i].locs[0] < 250)
+			{
+				cnt = hashTable[i].locs[0];
+				tmp = fwrite(&cnt, sizeof(cnt), 1, _ih_fp);
+			}
+			else
+			{
+				cnt =0;
+				tmp = fwrite (&cnt, sizeof(cnt), 1, _ih_fp);
+				tmp = fwrite (&(hashTable[i].locs[0]), sizeof(hashTable[i].locs[0]), 1, _ih_fp);
+			}
+
+			for (j=1; j<=hashTable[i].locs[0]; j++)
+				tmp = fwrite(&(hashTable[i].locs[j]), sizeof(hashTable[i].locs[j]), 1, _ih_fp);
+		}
+	}
+
+	if (tmp == 0)
+	  fprintf(stderr, "Write error while saving hash table.\n");
+}
+/**********************************************/
+unsigned int addIHashTableLocation(IHashTable *hashTable, int hv, int location)
+{
+	unsigned int	sizeInc				= 0;
+
+	if (hashTable[hv].locs == NULL)
+	{
+		sizeInc = 1;
+		hashTable[hv].locs = getMem (sizeof(unsigned int)*2);
+		hashTable[hv].locs[0]=1;
+		hashTable[hv].locs[1]=location;
+	}
+	else
+	{
+		int size = hashTable[hv].locs[0];
+		int i;
+		unsigned int *tmp = getMem( (size + 2) * sizeof(unsigned int) );
+
+		for (i = 0; i <= size; i++)
+		{
+			tmp[i] = hashTable[hv].locs[i];
+		}
+		size++;
+		tmp[0] = size;
+		tmp[size] = location;
+		
+		freeMem(hashTable[hv].locs, (hashTable[hv].locs[0]*(sizeof(unsigned int))));
+		hashTable[hv].locs = tmp;
+	}
+	return sizeInc;
+}
+/**********************************************/
+void generateIHashTable(char *fileName, char *indexName)
+{
+	double          startTime           = getTime();
+	unsigned int	hashTableSize		= 0;
+	unsigned int 	hashTableMaxSize	= pow(4, WINDOW_SIZE);
+	IHashTable		*hashTable			= getMem(sizeof(IHashTable)*hashTableMaxSize);
+	char 			*refGenName;
+	char			*refGen;
+	int				refGenOff			= 0;
+	int i, hv, l, flag;
+
+
+	for ( i = 0; i < hashTableMaxSize; i++)
+	{
+		hashTable[i].locs = NULL;
+	}
+
+	//Loading Fasta File
+	if (!initLoadingRefGenome(fileName))
+		return;		
+	initSavingIHashTable(indexName);
+	
+	fprintf(stdout, "Generating Index from %s", fileName);
+	fflush(stdout);
+
+	char *prev = getMem (CONTIG_NAME_SIZE);
+	prev[0]='\0';
+
+	do
+	{
+		flag = 	 loadRefGenome (&refGen, &refGenName, &refGenOff);	
+
+		if ( strcmp(prev, refGenName) != 0)
+		{
+			fprintf(stdout, "\n - %s ", refGenName);
+			fflush(stdout);
+			sprintf(prev, "%s", refGenName);
+		}
+		else
+		{
+			fprintf(stdout, ".");
+			fflush(stdout);
+		}
+		
+		l = strlen(refGen) - WINDOW_SIZE;
+
+		for (i=0; i < l; i++)
+		{
+			hv = hashVal(refGen + i);
+			if (hv != -1)
+			{
+				hashTableSize += addIHashTableLocation (hashTable, hv, i+1);
+			}
+		}
+
+		saveIHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff);
+		freeIHashTableContent(hashTable, hashTableMaxSize);
+		hashTableSize = 0;
+	} while (flag);
+
+	freeMem(prev, CONTIG_NAME_SIZE);
+	freeMem(hashTable, sizeof(IHashTable)*hashTableMaxSize);
+
+	finalizeLoadingRefGenome();
+	finalizeSavingIHashTable();
+
+	fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime));
+}
+
+/**********************************************/
+void finalizeLoadingIHashTable()
+{
+	freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize);
+	freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize);
+	freeMem(_ih_refGen, strlen(_ih_refGen)+1) ;
+	freeMem(_ih_refGenName, strlen(_ih_refGenName)+1);
+	fclose(_ih_fp);
+}
+
+/**********************************************/
+int  loadIHashTable(double *loadTime, int errThreshould)
+{
+	double startTime = getTime();
+	unsigned char extraInfo = 0;
+	short len;
+	unsigned int refGenLength;
+	unsigned int hashTableSize;
+	unsigned int tmpSize;
+	int tmp;
+	int i=0,j=0;
+
+	if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) )
+		return 0;
+
+	freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize);
+	freeMem(_ih_refGen, strlen(_ih_refGen)+1) ;
+	freeMem(_ih_refGenName, strlen(_ih_refGenName)+1);
+
+	// Reading Chr Name
+	tmp = fread(&len, sizeof(len), 1, _ih_fp);
+	if (tmp == 0){
+	  fprintf(stderr, "Read error while loading hash table.\n");
+	  exit(0);
+	}
+
+	_ih_refGenName = getMem(sizeof(char)* (len+1));
+	tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp);
+	_ih_refGenName [len] ='\0';
+
+	tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp);
+
+	// Reading Size and Content of Ref Genome
+	tmp = fread(&refGenLength, sizeof(refGenLength), 1, _ih_fp);
+	_ih_refGen = getMem(sizeof(char)*(refGenLength+1));
+	tmp = fread(_ih_refGen, sizeof(char), refGenLength, _ih_fp);
+	_ih_refGen[refGenLength]='\0';
+
+	//Reading Hashtable Size and Content
+	tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp);
+	
+	unsigned int hv;
+	unsigned char cnt=0;
+	unsigned int  index = 1;
+	unsigned int  tmpValue = 0;
+	for (i=0; i<hashTableSize; i++)
+	{
+		index = 1;
+		tmp = fread(&hv, sizeof(hv), 1, _ih_fp);
+		tmp = fread(&cnt, sizeof(cnt), 1, _ih_fp);
+
+		if (cnt>0)
+		{
+			tmpSize = cnt;
+		}
+		else
+		{
+			tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp);
+		}
+		
+		_ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) );
+		_ih_hashTable[hv].locs[0] = tmpSize;
+
+/*		for (j=1; j<=tmpSize; j++) 
+			tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp);
+*/	
+		
+		if(tmpSize != 0)
+		{
+			tmp = fread(&(_ih_hashTable[hv].locs[index]), sizeof(_ih_hashTable[hv].locs[index]), 1, _ih_fp);
+			index++;
+		}
+
+		for (j=2; j<=tmpSize; j++)	
+		{
+			tmp = fread(&tmpValue, sizeof(_ih_hashTable[hv].locs[index-1]), 1, _ih_fp);
+			if(abs(tmpValue-_ih_hashTable[hv].locs[index-1]) >  errThreshould)
+			{
+				_ih_hashTable[hv].locs[index] = tmpValue;
+				index++;
+			}
+		}
+               _ih_hashTable[hv].locs[0] = index-1;
+	}
+	*loadTime = getTime()-startTime;
+	return 1;
+}
+/**********************************************/
+unsigned int *getIHashTableCandidates(int hv)
+{
+	if ( hv != -1 )
+		return _ih_hashTable[hv].locs;
+	else 
+		return NULL;
+}
+/**********************************************/
+/**********************************************/
+/**********************************************/
+void configHashTable()
+{
+	if (WINDOW_SIZE <= 15)
+	{
+		generateHashTable = &generateIHashTable;
+		loadHashTable = &loadIHashTable;
+		finalizeLoadingHashTable = &finalizeLoadingIHashTable;
+		getCandidates = &getIHashTableCandidates;
+	}
+	else
+	{
+
+
+	}
+}
+/**********************************************/
+int initLoadingHashTable(char *fileName)
+{
+	int i;	
+	unsigned char bsIndex;
+	int tmp; 
+
+	_ih_fp = fileOpen(fileName, "r");	
+
+	if (_ih_fp == NULL)
+		return 0;
+
+	tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp);
+
+	if (tmp == 0){
+	  fprintf(stderr, "Read error while inilializing hash table loading.\n");
+	  exit(0);
+	}
+
+	if (bsIndex)
+	{
+		fprintf(stdout, "Error: Wrong Type of Index indicated");
+		return 0;
+	}
+	
+	tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp);
+
+	configHashTable();
+
+	if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE))
+	{
+
+		if (_ih_hashTable != NULL)
+		{
+			freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize);
+			freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize);
+			freeMem(_ih_refGen, strlen(_ih_refGen)+1) ;
+			freeMem(_ih_refGenName, strlen(_ih_refGenName)+1);
+		}
+
+		_ih_maxHashTableSize = pow(4, WINDOW_SIZE);
+
+		_ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize);
+		for (i=0; i<_ih_maxHashTableSize; i++)
+			_ih_hashTable[i].locs = NULL;
+		_ih_refGen = getMem(1);
+		_ih_refGen[0]='\0';
+		_ih_refGenName = getMem(1);
+		_ih_refGenName[0] = '\0';
+	}
+
+	return 1;
+}
+/**********************************************/
+char *getRefGenome()
+{
+	return _ih_refGen;
+
+}
+/**********************************************/
+char *getRefGenomeName()
+{
+	return _ih_refGenName;
+
+}
+/**********************************************/
+int getRefGenomeOffset()
+{
+	return _ih_refGenOff;
+
+}
+/**********************************************/
+HashTable *getHashTable()
+{
+	return NULL;
+}