Mercurial > repos > calkan > mrfast
view mrfast-2.1.0.4/HashTable.c @ 0:7b3dc85dc7fd
Uploaded mrfast source tarball
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:29:47 -0500 |
parents | |
children |
line wrap: on
line source
/* * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list * of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, this * list of conditions and the following disclaimer in the documentation and/or other * materials provided with the distribution. * - Neither the names of the University of Washington, Simon Fraser University, * nor the names of its contributors may be * used to endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Authors: Farhad Hormozdiari Faraz Hach Can Alkan Emails: farhadh AT uw DOT edu fhach AT cs DOT sfu DOT ca calkan AT uw DOT edu */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include "Common.h" #include "RefGenome.h" #include "HashTable.h" /**********************************************/ FILE *_ih_fp = NULL; IHashTable *_ih_hashTable = NULL; int _ih_maxHashTableSize = 0; char *_ih_refGen = NULL; char *_ih_refGenName = NULL; long long _ih_memUsage = 0; int _ih_refGenOff = 0; /**********************************************/ int hashVal(char *seq) { int i=0; int val=0, numericVal=0; while(i<WINDOW_SIZE) { switch (seq[i]) { case 'A': numericVal = 0; break; case 'C': numericVal = 1; break; case 'G' : numericVal = 2; break; case 'T': numericVal = 3; break; default: return -1; break; } val = (val << 2)|numericVal; i++; } return val; } /**********************************************/ void freeIHashTableContent(IHashTable *hashTable, unsigned int maxSize) { int i=0; for (i=0; i<maxSize; i++) { if (hashTable[i].locs != NULL) { freeMem(hashTable[i].locs, (hashTable[i].locs[0]+1)*(sizeof(unsigned int))); hashTable[i].locs = NULL; } } } /**********************************************/ void initSavingIHashTable(char *fileName) { int tmp; _ih_fp = fileOpen(fileName, "w"); unsigned char bIndex = 0; // Bisulfite Index // First Two bytes are indicating the type of the index & window size tmp = fwrite(&bIndex, sizeof(bIndex), 1, _ih_fp); tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); if (tmp == 0) fprintf(stderr, "Write error while initializing hash table.\n"); } /**********************************************/ void finalizeSavingIHashTable() { fclose(_ih_fp); } /**********************************************/ void saveIHashTable(IHashTable *hashTable, unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset) { int tmp; // Every Chunk starts with a byte indicating whether it has extra info; unsigned char extraInfo = 0; tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp); short len = strlen(refGenName); tmp = fwrite(&len, sizeof(len), 1, _ih_fp); tmp = fwrite(refGenName, sizeof(char), len, _ih_fp); tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp); unsigned int refGenLength = strlen(refGen); tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp); tmp = fwrite(refGen, sizeof(char), refGenLength, _ih_fp); tmp = fwrite(&size, sizeof(size), 1, _ih_fp); int i=0,j=0; unsigned char cnt=0; for (i=0; i<maxSize; i++) { if (hashTable[i].locs != NULL) { tmp = fwrite(&i, sizeof(i), 1, _ih_fp); if (hashTable[i].locs[0] < 250) { cnt = hashTable[i].locs[0]; tmp = fwrite(&cnt, sizeof(cnt), 1, _ih_fp); } else { cnt =0; tmp = fwrite (&cnt, sizeof(cnt), 1, _ih_fp); tmp = fwrite (&(hashTable[i].locs[0]), sizeof(hashTable[i].locs[0]), 1, _ih_fp); } for (j=1; j<=hashTable[i].locs[0]; j++) tmp = fwrite(&(hashTable[i].locs[j]), sizeof(hashTable[i].locs[j]), 1, _ih_fp); } } if (tmp == 0) fprintf(stderr, "Write error while saving hash table.\n"); } /**********************************************/ unsigned int addIHashTableLocation(IHashTable *hashTable, int hv, int location) { unsigned int sizeInc = 0; if (hashTable[hv].locs == NULL) { sizeInc = 1; hashTable[hv].locs = getMem (sizeof(unsigned int)*2); hashTable[hv].locs[0]=1; hashTable[hv].locs[1]=location; } else { int size = hashTable[hv].locs[0]; int i; unsigned int *tmp = getMem( (size + 2) * sizeof(unsigned int) ); for (i = 0; i <= size; i++) { tmp[i] = hashTable[hv].locs[i]; } size++; tmp[0] = size; tmp[size] = location; freeMem(hashTable[hv].locs, (hashTable[hv].locs[0]*(sizeof(unsigned int)))); hashTable[hv].locs = tmp; } return sizeInc; } /**********************************************/ void generateIHashTable(char *fileName, char *indexName) { double startTime = getTime(); unsigned int hashTableSize = 0; unsigned int hashTableMaxSize = pow(4, WINDOW_SIZE); IHashTable *hashTable = getMem(sizeof(IHashTable)*hashTableMaxSize); char *refGenName; char *refGen; int refGenOff = 0; int i, hv, l, flag; for ( i = 0; i < hashTableMaxSize; i++) { hashTable[i].locs = NULL; } //Loading Fasta File if (!initLoadingRefGenome(fileName)) return; initSavingIHashTable(indexName); fprintf(stdout, "Generating Index from %s", fileName); fflush(stdout); char *prev = getMem (CONTIG_NAME_SIZE); prev[0]='\0'; do { flag = loadRefGenome (&refGen, &refGenName, &refGenOff); if ( strcmp(prev, refGenName) != 0) { fprintf(stdout, "\n - %s ", refGenName); fflush(stdout); sprintf(prev, "%s", refGenName); } else { fprintf(stdout, "."); fflush(stdout); } l = strlen(refGen) - WINDOW_SIZE; for (i=0; i < l; i++) { hv = hashVal(refGen + i); if (hv != -1) { hashTableSize += addIHashTableLocation (hashTable, hv, i+1); } } saveIHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff); freeIHashTableContent(hashTable, hashTableMaxSize); hashTableSize = 0; } while (flag); freeMem(prev, CONTIG_NAME_SIZE); freeMem(hashTable, sizeof(IHashTable)*hashTableMaxSize); finalizeLoadingRefGenome(); finalizeSavingIHashTable(); fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime)); } /**********************************************/ void finalizeLoadingIHashTable() { freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); fclose(_ih_fp); } /**********************************************/ int loadIHashTable(double *loadTime, int errThreshould) { double startTime = getTime(); unsigned char extraInfo = 0; short len; unsigned int refGenLength; unsigned int hashTableSize; unsigned int tmpSize; int tmp; int i=0,j=0; if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) ) return 0; freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); // Reading Chr Name tmp = fread(&len, sizeof(len), 1, _ih_fp); if (tmp == 0){ fprintf(stderr, "Read error while loading hash table.\n"); exit(0); } _ih_refGenName = getMem(sizeof(char)* (len+1)); tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp); _ih_refGenName [len] ='\0'; tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp); // Reading Size and Content of Ref Genome tmp = fread(&refGenLength, sizeof(refGenLength), 1, _ih_fp); _ih_refGen = getMem(sizeof(char)*(refGenLength+1)); tmp = fread(_ih_refGen, sizeof(char), refGenLength, _ih_fp); _ih_refGen[refGenLength]='\0'; //Reading Hashtable Size and Content tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp); unsigned int hv; unsigned char cnt=0; unsigned int index = 1; unsigned int tmpValue = 0; for (i=0; i<hashTableSize; i++) { index = 1; tmp = fread(&hv, sizeof(hv), 1, _ih_fp); tmp = fread(&cnt, sizeof(cnt), 1, _ih_fp); if (cnt>0) { tmpSize = cnt; } else { tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp); } _ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) ); _ih_hashTable[hv].locs[0] = tmpSize; /* for (j=1; j<=tmpSize; j++) tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp); */ if(tmpSize != 0) { tmp = fread(&(_ih_hashTable[hv].locs[index]), sizeof(_ih_hashTable[hv].locs[index]), 1, _ih_fp); index++; } for (j=2; j<=tmpSize; j++) { tmp = fread(&tmpValue, sizeof(_ih_hashTable[hv].locs[index-1]), 1, _ih_fp); if(abs(tmpValue-_ih_hashTable[hv].locs[index-1]) > errThreshould) { _ih_hashTable[hv].locs[index] = tmpValue; index++; } } _ih_hashTable[hv].locs[0] = index-1; } *loadTime = getTime()-startTime; return 1; } /**********************************************/ unsigned int *getIHashTableCandidates(int hv) { if ( hv != -1 ) return _ih_hashTable[hv].locs; else return NULL; } /**********************************************/ /**********************************************/ /**********************************************/ void configHashTable() { if (WINDOW_SIZE <= 15) { generateHashTable = &generateIHashTable; loadHashTable = &loadIHashTable; finalizeLoadingHashTable = &finalizeLoadingIHashTable; getCandidates = &getIHashTableCandidates; } else { } } /**********************************************/ int initLoadingHashTable(char *fileName) { int i; unsigned char bsIndex; int tmp; _ih_fp = fileOpen(fileName, "r"); if (_ih_fp == NULL) return 0; tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp); if (tmp == 0){ fprintf(stderr, "Read error while inilializing hash table loading.\n"); exit(0); } if (bsIndex) { fprintf(stdout, "Error: Wrong Type of Index indicated"); return 0; } tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); configHashTable(); if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE)) { if (_ih_hashTable != NULL) { freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); } _ih_maxHashTableSize = pow(4, WINDOW_SIZE); _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize); for (i=0; i<_ih_maxHashTableSize; i++) _ih_hashTable[i].locs = NULL; _ih_refGen = getMem(1); _ih_refGen[0]='\0'; _ih_refGenName = getMem(1); _ih_refGenName[0] = '\0'; } return 1; } /**********************************************/ char *getRefGenome() { return _ih_refGen; } /**********************************************/ char *getRefGenomeName() { return _ih_refGenName; } /**********************************************/ int getRefGenomeOffset() { return _ih_refGenOff; } /**********************************************/ HashTable *getHashTable() { return NULL; }