comparison mrsfast-2.3.0.2/HashTable.c @ 0:ec628ba33878 default tip

Uploaded source code for mrsFAST
author calkan
date Tue, 21 Feb 2012 10:39:28 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ec628ba33878
1 /*
2 * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without modification,
6 * are permitted provided that the following conditions are met:
7 *
8 * Redistributions of source code must retain the above copyright notice, this list
9 * of conditions and the following disclaimer.
10 * - Redistributions in binary form must reproduce the above copyright notice, this
11 * list of conditions and the following disclaimer in the documentation and/or other
12 * materials provided with the distribution.
13 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be
14 * used to endorse or promote products derived from this software without specific
15 * prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 /*
31 * Author : Faraz Hach
32 * Email : fhach AT cs DOT sfu
33 * Last Update : 2009-12-08
34 */
35
36
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <math.h>
41 #include "Common.h"
42 #include "RefGenome.h"
43 #include "HashTable.h"
44 /**********************************************/
45 FILE *_ih_fp = NULL;
46 IHashTable *_ih_hashTable = NULL;
47 int _ih_maxHashTableSize = 0;
48 char *_ih_refGen = NULL;
49 char *_ih_refGenName = NULL;
50 long long _ih_memUsage = 0;
51 int _ih_refGenOff = 0;
52 /**********************************************/
53
54 int hashVal(char *seq)
55 {
56 int i=0;
57 int val=0, numericVal=0;
58
59 while(i<WINDOW_SIZE)
60 {
61 switch (seq[i])
62 {
63 case 'A':
64 numericVal = 0;
65 break;
66 case 'C':
67 numericVal = 1;
68 break;
69 case 'G' :
70 numericVal = 2;
71 break;
72 case 'T':
73 numericVal = 3;
74 break;
75 default:
76 return -1;
77 break;
78 }
79 val = (val << 2)|numericVal;
80 i++;
81 }
82 return val;
83 }
84 /**********************************************/
85 void freeIHashTableContent(IHashTable *hashTable, unsigned int maxSize)
86 {
87 int i=0;
88 for (i=0; i<maxSize; i++)
89 {
90 if (hashTable[i].locs != NULL)
91 {
92 freeMem(hashTable[i].locs, (hashTable[i].locs[0]+1)*(sizeof(unsigned int)));
93 hashTable[i].locs = NULL;
94 }
95 }
96 }
97 /**********************************************/
98 void initSavingIHashTable(char *fileName)
99 {
100 int tmp;
101 _ih_fp = fileOpen(fileName, "w");
102 unsigned char bIndex = 0; // Bisulfite Index
103
104 // First Two bytes are indicating the type of the index & window size
105 tmp = fwrite(&bIndex, sizeof(bIndex), 1, _ih_fp);
106 tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp);
107
108 }
109 /**********************************************/
110 void finalizeSavingIHashTable()
111 {
112 fclose(_ih_fp);
113 }
114 /**********************************************/
115 void saveIHashTable(IHashTable *hashTable, unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset)
116 {
117 int tmp;
118
119 // Every Chunk starts with a byte indicating whether it has extra info;
120 unsigned char extraInfo = 0;
121 tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp);
122
123 short len = strlen(refGenName);
124 tmp = fwrite(&len, sizeof(len), 1, _ih_fp);
125 tmp = fwrite(refGenName, sizeof(char), len, _ih_fp);
126
127 tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp);
128
129 unsigned int refGenLength = strlen(refGen);
130 tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp);
131 tmp = fwrite(refGen, sizeof(char), refGenLength, _ih_fp);
132
133 tmp = fwrite(&size, sizeof(size), 1, _ih_fp);
134
135 int i=0,j=0;
136 unsigned char cnt=0;
137 for (i=0; i<maxSize; i++)
138 {
139 if (hashTable[i].locs != NULL)
140 {
141 tmp = fwrite(&i, sizeof(i), 1, _ih_fp);
142
143 if (hashTable[i].locs[0] < 250)
144 {
145 cnt = hashTable[i].locs[0];
146 tmp = fwrite(&cnt, sizeof(cnt), 1, _ih_fp);
147 }
148 else
149 {
150 cnt =0;
151 tmp = fwrite (&cnt, sizeof(cnt), 1, _ih_fp);
152 tmp = fwrite (&(hashTable[i].locs[0]), sizeof(hashTable[i].locs[0]), 1, _ih_fp);
153 }
154
155 for (j=1; j<=hashTable[i].locs[0]; j++)
156 tmp = fwrite(&(hashTable[i].locs[j]), sizeof(hashTable[i].locs[j]), 1, _ih_fp);
157 }
158 }
159 }
160 /**********************************************/
161 unsigned int addIHashTableLocation(IHashTable *hashTable, int hv, int location)
162 {
163 unsigned int sizeInc = 0;
164
165 if (hashTable[hv].locs == NULL)
166 {
167 sizeInc = 1;
168 hashTable[hv].locs = getMem (sizeof(unsigned int)*2);
169 hashTable[hv].locs[0]=1;
170 hashTable[hv].locs[1]=location;
171 }
172 else
173 {
174 int size = hashTable[hv].locs[0];
175 int i;
176 unsigned int *tmp = getMem( (size + 2) * sizeof(unsigned int) );
177
178 for (i = 0; i <= size; i++)
179 {
180 tmp[i] = hashTable[hv].locs[i];
181 }
182 size++;
183 tmp[0] = size;
184 tmp[size] = location;
185
186 freeMem(hashTable[hv].locs, (hashTable[hv].locs[0]*(sizeof(unsigned int))));
187 hashTable[hv].locs = tmp;
188 }
189 return sizeInc;
190 }
191 /**********************************************/
192 void generateIHashTable(char *fileName, char *indexName)
193 {
194 double startTime = getTime();
195 unsigned int hashTableSize = 0;
196 unsigned int hashTableMaxSize = pow(4, WINDOW_SIZE);
197 IHashTable *hashTable = getMem(sizeof(IHashTable)*hashTableMaxSize);
198 char *refGenName;
199 char *refGen;
200 int refGenOff = 0;
201 int i, hv, l, flag;
202
203
204 for ( i = 0; i < hashTableMaxSize; i++)
205 {
206 hashTable[i].locs = NULL;
207 }
208
209 //Loading Fasta File
210 if (!initLoadingRefGenome(fileName))
211 return;
212 initSavingIHashTable(indexName);
213
214 fprintf(stdout, "Generating Index from %s", fileName);
215 fflush(stdout);
216
217 char *prev = getMem (CONTIG_NAME_SIZE);
218 prev[0]='\0';
219
220 do
221 {
222 flag = loadRefGenome (&refGen, &refGenName, &refGenOff);
223
224 if ( strcmp(prev, refGenName) != 0)
225 {
226 fprintf(stdout, "\n - %s ", refGenName);
227 fflush(stdout);
228 sprintf(prev, "%s", refGenName);
229 }
230 else
231 {
232 fprintf(stdout, ".");
233 fflush(stdout);
234 }
235
236 l = strlen(refGen) - WINDOW_SIZE;
237
238 for (i=0; i < l; i++)
239 {
240 hv = hashVal(refGen + i);
241 if (hv != -1)
242 {
243 hashTableSize += addIHashTableLocation (hashTable, hv, i+1);
244 }
245 }
246
247 saveIHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff);
248 freeIHashTableContent(hashTable, hashTableMaxSize);
249 hashTableSize = 0;
250 } while (flag);
251
252 freeMem(prev, CONTIG_NAME_SIZE);
253 freeMem(hashTable, sizeof(IHashTable)*hashTableMaxSize);
254
255 finalizeLoadingRefGenome();
256 finalizeSavingIHashTable();
257
258 fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime));
259 }
260
261 /**********************************************/
262 void finalizeLoadingIHashTable()
263 {
264 freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize);
265 freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize);
266 freeMem(_ih_refGen, strlen(_ih_refGen)+1) ;
267 freeMem(_ih_refGenName, strlen(_ih_refGenName)+1);
268 fclose(_ih_fp);
269 }
270
271 /**********************************************/
272 int loadIHashTable(double *loadTime)
273 {
274 double startTime = getTime();
275 unsigned char extraInfo = 0;
276 short len;
277 unsigned int refGenLength;
278 unsigned int hashTableSize;
279 unsigned int tmpSize;
280 int tmp;
281 int i=0,j=0;
282
283 if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) )
284 return 0;
285
286 freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize);
287 freeMem(_ih_refGen, strlen(_ih_refGen)+1) ;
288 freeMem(_ih_refGenName, strlen(_ih_refGenName)+1);
289
290 // Reading Chr Name
291 tmp = fread(&len, sizeof(len), 1, _ih_fp);
292 _ih_refGenName = getMem(sizeof(char)* (len+1));
293 tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp);
294 _ih_refGenName [len] ='\0';
295
296 tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp);
297
298 // Reading Size and Content of Ref Genome
299 tmp = fread(&refGenLength, sizeof(refGenLength), 1, _ih_fp);
300 _ih_refGen = getMem(sizeof(char)*(refGenLength+1));
301 tmp = fread(_ih_refGen, sizeof(char), refGenLength, _ih_fp);
302 _ih_refGen[refGenLength]='\0';
303
304 //Reading Hashtable Size and Content
305 tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp);
306
307 unsigned int hv;
308 unsigned char cnt=0;
309 for (i=0; i<hashTableSize; i++)
310 {
311 tmp = fread(&hv, sizeof(hv), 1, _ih_fp);
312 tmp = fread(&cnt, sizeof(cnt), 1, _ih_fp);
313
314 if (cnt>0)
315 {
316 tmpSize = cnt;
317 }
318 else
319 {
320 tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp);
321 }
322
323 _ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) );
324 _ih_hashTable[hv].locs[0] = tmpSize;
325
326 for (j=1; j<=tmpSize; j++)
327 tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp);
328 }
329 *loadTime = getTime()-startTime;
330
331 return 1;
332 }
333 /**********************************************/
334 unsigned int *getIHashTableCandidates(int hv)
335 {
336 if ( hv != -1 )
337 return _ih_hashTable[hv].locs;
338 else
339 return NULL;
340 }
341 /**********************************************/
342 /**********************************************/
343 /**********************************************/
344 void configHashTable()
345 {
346 if (WINDOW_SIZE <= 14)
347 {
348 generateHashTable = &generateIHashTable;
349 loadHashTable = &loadIHashTable;
350 finalizeLoadingHashTable = &finalizeLoadingIHashTable;
351 getCandidates = &getIHashTableCandidates;
352 }
353 else
354 {
355
356
357 }
358 }
359 /**********************************************/
360 int initLoadingHashTable(char *fileName)
361 {
362 int i;
363 unsigned char bsIndex;
364 int tmp;
365
366 _ih_fp = fileOpen(fileName, "r");
367
368 if (_ih_fp == NULL)
369 return 0;
370
371 tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp);
372 if (bsIndex)
373 {
374 fprintf(stdout, "Error: Wrong Type of Index indicated");
375 return 0;
376 }
377
378 tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp);
379
380 configHashTable();
381
382 if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE))
383 {
384
385 if (_ih_hashTable != NULL)
386 {
387 freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize);
388 freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize);
389 freeMem(_ih_refGen, strlen(_ih_refGen)+1) ;
390 freeMem(_ih_refGenName, strlen(_ih_refGenName)+1);
391 }
392
393 _ih_maxHashTableSize = pow(4, WINDOW_SIZE);
394
395 _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize);
396 for (i=0; i<_ih_maxHashTableSize; i++)
397 _ih_hashTable[i].locs = NULL;
398 _ih_refGen = getMem(1);
399 _ih_refGen[0]='\0';
400 _ih_refGenName = getMem(1);
401 _ih_refGenName[0] = '\0';
402 }
403
404 return 1;
405 }
406 /**********************************************/
407 char *getRefGenome()
408 {
409 return _ih_refGen;
410
411 }
412 /**********************************************/
413 char *getRefGenomeName()
414 {
415 return _ih_refGenName;
416
417 }
418 /**********************************************/
419 int getRefGenomeOffset()
420 {
421 return _ih_refGenOff;
422
423 }
424 /**********************************************/
425 HashTable *getHashTable()
426 {
427 return NULL;
428 }