Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwt_gen/bwt_gen.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 /* | |
2 | |
3 BWTConstruct.c BWT-Index Construction | |
4 | |
5 This module constructs BWT and auxiliary data structures. | |
6 | |
7 Copyright (C) 2004, Wong Chi Kwong. | |
8 | |
9 This program is free software; you can redistribute it and/or | |
10 modify it under the terms of the GNU General Public License | |
11 as published by the Free Software Foundation; either version 2 | |
12 of the License, or (at your option) any later version. | |
13 | |
14 This program is distributed in the hope that it will be useful, | |
15 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 GNU General Public License for more details. | |
18 | |
19 You should have received a copy of the GNU General Public License | |
20 along with this program; if not, write to the Free Software | |
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
22 | |
23 */ | |
24 | |
25 #include <stdio.h> | |
26 #include <stdlib.h> | |
27 #include <string.h> | |
28 #include "bwt_gen.h" | |
29 #include "QSufSort.h" | |
30 | |
31 static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, | |
32 unsigned int lastByteLength) | |
33 { | |
34 if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { | |
35 fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); | |
36 exit(1); | |
37 } | |
38 return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; | |
39 } | |
40 | |
41 static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) | |
42 { | |
43 unsigned int i; | |
44 for (i=0; i<length; i++) startAddr[i] = initValue; | |
45 } | |
46 | |
47 static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable) | |
48 { | |
49 unsigned int i, j, c, t; | |
50 | |
51 for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) { | |
52 dnaDecodeTable[i] = 0; | |
53 c = i; | |
54 for (j=0; j<8; j++) { | |
55 t = c & 0x00000003; | |
56 dnaDecodeTable[i] += 1 << (t * 8); | |
57 c >>= 2; | |
58 } | |
59 } | |
60 | |
61 } | |
62 // for BWTIncCreate() | |
63 static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar) | |
64 { | |
65 unsigned int numOfOccValue; | |
66 unsigned int numOfOccIntervalPerMajor; | |
67 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
68 numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; | |
69 return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; | |
70 } | |
71 // for BWTIncCreate() | |
72 static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar) | |
73 { | |
74 unsigned int numOfOccValue; | |
75 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
76 return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; | |
77 } | |
78 // for BWTIncCreate() | |
79 static unsigned int BWTResidentSizeInWord(const unsigned int numChar) { | |
80 | |
81 unsigned int numCharRoundUpToOccInterval; | |
82 | |
83 // The $ in BWT at the position of inverseSa0 is not encoded | |
84 numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; | |
85 | |
86 return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; | |
87 | |
88 } | |
89 | |
90 static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) | |
91 { | |
92 unsigned int maxBuildSize; | |
93 | |
94 if (bwtInc->bwt->textLength == 0) { | |
95 // initial build | |
96 // Minus 2 because n+1 entries of seq and rank needed for n char | |
97 maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD) | |
98 / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD; | |
99 if (bwtInc->initialMaxBuildSize > 0) { | |
100 bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); | |
101 } else { | |
102 bwtInc->buildSize = maxBuildSize; | |
103 } | |
104 } else { | |
105 // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char | |
106 // Minus numberOfIterationDone because bwt slightly shift to left in each iteration | |
107 maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 | |
108 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) | |
109 / 3; | |
110 if (maxBuildSize < CHAR_PER_WORD) { | |
111 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); | |
112 exit(1); | |
113 } | |
114 if (bwtInc->incMaxBuildSize > 0) { | |
115 bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); | |
116 } else { | |
117 bwtInc->buildSize = maxBuildSize; | |
118 } | |
119 if (bwtInc->buildSize < CHAR_PER_WORD) { | |
120 bwtInc->buildSize = CHAR_PER_WORD; | |
121 } | |
122 } | |
123 | |
124 if (bwtInc->buildSize < CHAR_PER_WORD) { | |
125 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); | |
126 exit(1); | |
127 } | |
128 | |
129 bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; | |
130 | |
131 bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1); | |
132 bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1); | |
133 | |
134 } | |
135 | |
136 // for ceilLog2() | |
137 unsigned int leadingZero(const unsigned int input) | |
138 { | |
139 unsigned int l; | |
140 const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |
141 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
146 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
147 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; | |
148 | |
149 if (input & 0xFFFF0000) { | |
150 if (input & 0xFF000000) { | |
151 l = leadingZero8bit[input >> 24]; | |
152 } else { | |
153 l = 8 + leadingZero8bit[input >> 16]; | |
154 } | |
155 } else { | |
156 if (input & 0x0000FF00) { | |
157 l = 16 + leadingZero8bit[input >> 8]; | |
158 } else { | |
159 l = 24 + leadingZero8bit[input]; | |
160 } | |
161 } | |
162 return l; | |
163 | |
164 } | |
165 // for BitPerBytePackedChar() | |
166 static unsigned int ceilLog2(const unsigned int input) | |
167 { | |
168 if (input <= 1) return 0; | |
169 return BITS_IN_WORD - leadingZero(input - 1); | |
170 | |
171 } | |
172 // for ConvertBytePackedToWordPacked() | |
173 static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) | |
174 { | |
175 unsigned int bitPerChar; | |
176 bitPerChar = ceilLog2(alphabetSize); | |
177 // Return the largest number of bit that does not affect packing efficiency | |
178 if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) | |
179 bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); | |
180 return bitPerChar; | |
181 } | |
182 // for ConvertBytePackedToWordPacked() | |
183 static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) | |
184 { | |
185 return ceilLog2(alphabetSize); | |
186 } | |
187 | |
188 static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, | |
189 const unsigned int textLength) | |
190 { | |
191 unsigned int i, j, k; | |
192 unsigned int c; | |
193 unsigned int bitPerBytePackedChar; | |
194 unsigned int bitPerWordPackedChar; | |
195 unsigned int charPerWord; | |
196 unsigned int charPerByte; | |
197 unsigned int bytePerIteration; | |
198 unsigned int byteProcessed = 0; | |
199 unsigned int wordProcessed = 0; | |
200 unsigned int mask, shift; | |
201 | |
202 unsigned int buffer[BITS_IN_WORD]; | |
203 | |
204 bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); | |
205 bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); | |
206 charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; | |
207 charPerWord = BITS_IN_WORD / bitPerWordPackedChar; | |
208 | |
209 bytePerIteration = charPerWord / charPerByte; | |
210 mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); | |
211 shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; | |
212 | |
213 while ((wordProcessed + 1) * charPerWord < textLength) { | |
214 | |
215 k = 0; | |
216 for (i=0; i<bytePerIteration; i++) { | |
217 c = (unsigned int)input[byteProcessed] << shift; | |
218 for (j=0; j<charPerByte; j++) { | |
219 buffer[k] = c & mask; | |
220 c <<= bitPerBytePackedChar; | |
221 k++; | |
222 } | |
223 byteProcessed++; | |
224 } | |
225 | |
226 c = 0; | |
227 for (i=0; i<charPerWord; i++) { | |
228 c |= buffer[i] >> bitPerWordPackedChar * i; | |
229 } | |
230 output[wordProcessed] = c; | |
231 wordProcessed++; | |
232 | |
233 } | |
234 | |
235 k = 0; | |
236 for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { | |
237 c = (unsigned int)input[byteProcessed] << shift; | |
238 for (j=0; j<charPerByte; j++) { | |
239 buffer[k] = c & mask; | |
240 c <<= bitPerBytePackedChar; | |
241 k++; | |
242 } | |
243 byteProcessed++; | |
244 } | |
245 | |
246 c = 0; | |
247 for (i=0; i<textLength - wordProcessed * charPerWord; i++) { | |
248 c |= buffer[i] >> bitPerWordPackedChar * i; | |
249 } | |
250 output[wordProcessed] = c; | |
251 } | |
252 | |
253 BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) | |
254 { | |
255 BWT *bwt; | |
256 | |
257 bwt = (BWT*)calloc(1, sizeof(BWT)); | |
258 | |
259 bwt->textLength = 0; | |
260 bwt->inverseSa = 0; | |
261 | |
262 bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); | |
263 initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); | |
264 | |
265 bwt->bwtSizeInWord = 0; | |
266 bwt->saValueOnBoundary = NULL; | |
267 | |
268 // Generate decode tables | |
269 if (decodeTable == NULL) { | |
270 bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); | |
271 GenerateDNAOccCountTable(bwt->decodeTable); | |
272 } else { | |
273 bwt->decodeTable = decodeTable; | |
274 } | |
275 | |
276 bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); | |
277 bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int)); | |
278 | |
279 bwt->occSizeInWord = 0; | |
280 bwt->occValue = NULL; | |
281 | |
282 bwt->saInterval = ALL_ONE_MASK; | |
283 bwt->saValueSize = 0; | |
284 bwt->saValue = NULL; | |
285 | |
286 bwt->inverseSaInterval = ALL_ONE_MASK; | |
287 bwt->inverseSaSize = 0; | |
288 bwt->inverseSa = NULL; | |
289 | |
290 return bwt; | |
291 } | |
292 | |
293 BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, | |
294 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) | |
295 { | |
296 BWTInc *bwtInc; | |
297 unsigned int i; | |
298 | |
299 if (targetNBit == 0) { | |
300 fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n"); | |
301 exit(1); | |
302 } | |
303 | |
304 bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); | |
305 bwtInc->numberOfIterationDone = 0; | |
306 bwtInc->bwt = BWTCreate(textLength, NULL); | |
307 bwtInc->initialMaxBuildSize = initialMaxBuildSize; | |
308 bwtInc->incMaxBuildSize = incMaxBuildSize; | |
309 bwtInc->targetNBit = targetNBit; | |
310 bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int)); | |
311 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); | |
312 | |
313 // Build frequently accessed data | |
314 bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); | |
315 for (i=0; i<CHAR_PER_WORD; i++) { | |
316 bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; | |
317 } | |
318 | |
319 bwtInc->targetTextLength = textLength; | |
320 bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); | |
321 if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { | |
322 fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); | |
323 exit(1); | |
324 } | |
325 bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); | |
326 | |
327 return bwtInc; | |
328 | |
329 } | |
330 // for BWTIncConstruct() | |
331 static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank, | |
332 unsigned int* __restrict cumulativeCount, const unsigned int numChar) | |
333 { | |
334 unsigned int i, j; | |
335 unsigned int c, t; | |
336 unsigned int packedMask; | |
337 unsigned int rankIndex; | |
338 unsigned int lastWord, numCharInLastWord; | |
339 | |
340 lastWord = (numChar - 1) / CHAR_PER_WORD; | |
341 numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; | |
342 | |
343 packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); | |
344 rankIndex = numChar - 1; | |
345 | |
346 t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); | |
347 for (i=0; i<numCharInLastWord; i++) { | |
348 c = t & packedMask; | |
349 cumulativeCount[c+1]++; | |
350 rank[rankIndex] = c; | |
351 rankIndex--; | |
352 t >>= BIT_PER_CHAR; | |
353 } | |
354 | |
355 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 | |
356 t = packedText[i]; | |
357 for (j=0; j<CHAR_PER_WORD; j++) { | |
358 c = t & packedMask; | |
359 cumulativeCount[c+1]++; | |
360 rank[rankIndex] = c; | |
361 rankIndex--; | |
362 t >>= BIT_PER_CHAR; | |
363 } | |
364 } | |
365 | |
366 // Convert occurrence to cumulativeCount | |
367 cumulativeCount[2] += cumulativeCount[1]; | |
368 cumulativeCount[3] += cumulativeCount[2]; | |
369 cumulativeCount[4] += cumulativeCount[3]; | |
370 } | |
371 | |
372 | |
373 static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, | |
374 unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) | |
375 { | |
376 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, | |
377 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, | |
378 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, | |
379 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; | |
380 | |
381 unsigned int iteration, wordToCount, charToCount; | |
382 unsigned int i, j, c; | |
383 unsigned int sum; | |
384 | |
385 occCount[0] = 0; | |
386 occCount[1] = 0; | |
387 occCount[2] = 0; | |
388 occCount[3] = 0; | |
389 | |
390 iteration = index / 256; | |
391 wordToCount = (index - iteration * 256) / 16; | |
392 charToCount = index - iteration * 256 - wordToCount * 16; | |
393 | |
394 for (i=0; i<iteration; i++) { | |
395 | |
396 sum = 0; | |
397 for (j=0; j<16; j++) { | |
398 sum += dnaDecodeTable[*dna >> 16]; | |
399 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
400 dna++; | |
401 } | |
402 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
403 occCount[0] += sum & 0x000000FF; sum >>= 8; | |
404 occCount[1] += sum & 0x000000FF; sum >>= 8; | |
405 occCount[2] += sum & 0x000000FF; sum >>= 8; | |
406 occCount[3] += sum; | |
407 } else { | |
408 // only some or all of the 3 bits are on | |
409 // in reality, only one of the four cases are possible | |
410 if (sum == 0x00000100) { | |
411 occCount[0] += 256; | |
412 } else if (sum == 0x00010000) { | |
413 occCount[1] += 256; | |
414 } else if (sum == 0x01000000) { | |
415 occCount[2] += 256; | |
416 } else if (sum == 0x00000000) { | |
417 occCount[3] += 256; | |
418 } else { | |
419 fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); | |
420 exit(1); | |
421 } | |
422 } | |
423 | |
424 } | |
425 | |
426 sum = 0; | |
427 for (j=0; j<wordToCount; j++) { | |
428 sum += dnaDecodeTable[*dna >> 16]; | |
429 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
430 dna++; | |
431 } | |
432 | |
433 if (charToCount > 0) { | |
434 c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; | |
435 sum += dnaDecodeTable[c >> 16]; | |
436 sum += dnaDecodeTable[c & 0xFFFF]; | |
437 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
438 } | |
439 | |
440 occCount[0] += sum & 0x000000FF; sum >>= 8; | |
441 occCount[1] += sum & 0x000000FF; sum >>= 8; | |
442 occCount[2] += sum & 0x000000FF; sum >>= 8; | |
443 occCount[3] += sum; | |
444 } | |
445 | |
446 static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar, | |
447 const unsigned int *cumulativeCount, const unsigned int *packedShift) { | |
448 | |
449 unsigned int i, c, r; | |
450 unsigned int previousRank, currentRank; | |
451 unsigned int wordIndex, charIndex; | |
452 unsigned int inverseSa0; | |
453 | |
454 inverseSa0 = previousRank = relativeRank[0]; | |
455 | |
456 for (i=1; i<=numChar; i++) { | |
457 currentRank = relativeRank[i]; | |
458 // previousRank > cumulativeCount[c] because $ is one of the char | |
459 c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) | |
460 + (previousRank > cumulativeCount[3]); | |
461 // set bwt for currentRank | |
462 if (c > 0) { | |
463 // c <> 'a' | |
464 r = currentRank; | |
465 if (r > inverseSa0) { | |
466 // - 1 because $ at inverseSa0 is not encoded | |
467 r--; | |
468 } | |
469 wordIndex = r / CHAR_PER_WORD; | |
470 charIndex = r - wordIndex * CHAR_PER_WORD; | |
471 bwt[wordIndex] |= c << packedShift[charIndex]; | |
472 } | |
473 previousRank = currentRank; | |
474 } | |
475 } | |
476 | |
477 static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, | |
478 const unsigned int character) | |
479 { | |
480 unsigned int occIndexMajor; | |
481 | |
482 occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; | |
483 | |
484 if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { | |
485 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + | |
486 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); | |
487 | |
488 } else { | |
489 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + | |
490 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); | |
491 } | |
492 } | |
493 | |
494 | |
495 static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, | |
496 const unsigned int* dnaDecodeTable) | |
497 { | |
498 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, | |
499 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, | |
500 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, | |
501 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; | |
502 | |
503 unsigned int wordToCount, charToCount; | |
504 unsigned int i, c; | |
505 unsigned int sum = 0; | |
506 | |
507 wordToCount = index / 16; | |
508 charToCount = index - wordToCount * 16; | |
509 | |
510 for (i=0; i<wordToCount; i++) { | |
511 sum += dnaDecodeTable[dna[i] >> 16]; | |
512 sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; | |
513 } | |
514 | |
515 if (charToCount > 0) { | |
516 c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; | |
517 sum += dnaDecodeTable[c >> 16]; | |
518 sum += dnaDecodeTable[c & 0xFFFF]; | |
519 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
520 } | |
521 | |
522 return (sum >> (character * 8)) & 0x000000FF; | |
523 | |
524 } | |
525 | |
526 static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, | |
527 const unsigned int* dnaDecodeTable) | |
528 { | |
529 static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, | |
530 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, | |
531 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, | |
532 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; | |
533 | |
534 unsigned int wordToCount, charToCount; | |
535 unsigned int i, c; | |
536 unsigned int sum = 0; | |
537 | |
538 wordToCount = index / 16; | |
539 charToCount = index - wordToCount * 16; | |
540 | |
541 dna -= wordToCount + 1; | |
542 | |
543 if (charToCount > 0) { | |
544 c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; | |
545 sum += dnaDecodeTable[c >> 16]; | |
546 sum += dnaDecodeTable[c & 0xFFFF]; | |
547 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
548 } | |
549 | |
550 for (i=0; i<wordToCount; i++) { | |
551 dna++; | |
552 sum += dnaDecodeTable[*dna >> 16]; | |
553 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
554 } | |
555 | |
556 return (sum >> (character * 8)) & 0x000000FF; | |
557 | |
558 } | |
559 | |
560 unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) { | |
561 | |
562 unsigned int occValue; | |
563 unsigned int occExplicitIndex, occIndex; | |
564 | |
565 // $ is supposed to be positioned at inverseSa0 but it is not encoded | |
566 // therefore index is subtracted by 1 for adjustment | |
567 if (index > bwt->inverseSa0) { | |
568 index--; | |
569 } | |
570 | |
571 occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding | |
572 occIndex = occExplicitIndex * OCC_INTERVAL; | |
573 occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); | |
574 | |
575 if (occIndex == index) { | |
576 return occValue; | |
577 } | |
578 | |
579 if (occIndex < index) { | |
580 return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); | |
581 } else { | |
582 return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); | |
583 } | |
584 | |
585 } | |
586 | |
587 static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq, | |
588 const unsigned int *packedText, const unsigned int numChar, | |
589 const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration) | |
590 { | |
591 unsigned int saIndex; | |
592 unsigned int lastWord; | |
593 unsigned int packedMask; | |
594 unsigned int i, j; | |
595 unsigned int c, t; | |
596 unsigned int rankIndex; | |
597 unsigned int shift; | |
598 unsigned int seqIndexFromStart[ALPHABET_SIZE]; | |
599 unsigned int seqIndexFromEnd[ALPHABET_SIZE]; | |
600 | |
601 for (i=0; i<ALPHABET_SIZE; i++) { | |
602 seqIndexFromStart[i] = cumulativeCount[i]; | |
603 seqIndexFromEnd[i] = cumulativeCount[i+1] - 1; | |
604 } | |
605 | |
606 shift = BITS_IN_WORD - BIT_PER_CHAR; | |
607 packedMask = ALL_ONE_MASK >> shift; | |
608 saIndex = bwt->inverseSa0; | |
609 rankIndex = numChar - 1; | |
610 | |
611 lastWord = numChar / CHAR_PER_WORD; | |
612 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 | |
613 t = packedText[i]; | |
614 for (j=0; j<CHAR_PER_WORD; j++) { | |
615 c = t & packedMask; | |
616 saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; | |
617 // A counting sort using the first character of suffix is done here | |
618 // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 | |
619 if (saIndex > bwt->inverseSa0) { | |
620 seq[seqIndexFromEnd[c]] = rankIndex; | |
621 absoluteRank[seqIndexFromEnd[c]] = saIndex; | |
622 seqIndexFromEnd[c]--; | |
623 } else { | |
624 seq[seqIndexFromStart[c]] = rankIndex; | |
625 absoluteRank[seqIndexFromStart[c]] = saIndex; | |
626 seqIndexFromStart[c]++; | |
627 } | |
628 rankIndex--; | |
629 t >>= BIT_PER_CHAR; | |
630 } | |
631 } | |
632 | |
633 absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters | |
634 seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; | |
635 | |
636 return seqIndexFromStart[firstCharInLastIteration]; | |
637 } | |
638 | |
639 static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem) | |
640 { | |
641 #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD | |
642 | |
643 int lowIndex, highIndex, midIndex; | |
644 int lowPartitionIndex, highPartitionIndex; | |
645 int lowStack[32], highStack[32]; | |
646 int stackDepth; | |
647 int i, j; | |
648 unsigned int tempSeq, tempKey; | |
649 int numberOfEqualKey; | |
650 | |
651 if (numItem < 2) return; | |
652 | |
653 stackDepth = 0; | |
654 | |
655 lowIndex = 0; | |
656 highIndex = numItem - 1; | |
657 | |
658 for (;;) { | |
659 | |
660 for (;;) { | |
661 | |
662 // Sort small array of data | |
663 if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays | |
664 for (i=lowIndex+1; i<=highIndex; i++) { | |
665 tempSeq = seq[i]; | |
666 tempKey = key[i]; | |
667 for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { | |
668 seq[j] = seq[j-1]; | |
669 key[j] = key[j-1]; | |
670 } | |
671 if (j != i) { | |
672 seq[j] = tempSeq; | |
673 key[j] = tempKey; | |
674 } | |
675 } | |
676 break; | |
677 } | |
678 | |
679 // Choose pivot as median of the lowest, middle, and highest data; sort the three data | |
680 | |
681 midIndex = average(lowIndex, highIndex); | |
682 if (key[lowIndex] > key[midIndex]) { | |
683 tempSeq = seq[lowIndex]; | |
684 tempKey = key[lowIndex]; | |
685 seq[lowIndex] = seq[midIndex]; | |
686 key[lowIndex] = key[midIndex]; | |
687 seq[midIndex] = tempSeq; | |
688 key[midIndex] = tempKey; | |
689 } | |
690 if (key[lowIndex] > key[highIndex]) { | |
691 tempSeq = seq[lowIndex]; | |
692 tempKey = key[lowIndex]; | |
693 seq[lowIndex] = seq[highIndex]; | |
694 key[lowIndex] = key[highIndex]; | |
695 seq[highIndex] = tempSeq; | |
696 key[highIndex] = tempKey; | |
697 } | |
698 if (key[midIndex] > key[highIndex]) { | |
699 tempSeq = seq[midIndex]; | |
700 tempKey = key[midIndex]; | |
701 seq[midIndex] = seq[highIndex]; | |
702 key[midIndex] = key[highIndex]; | |
703 seq[highIndex] = tempSeq; | |
704 key[highIndex] = tempKey; | |
705 } | |
706 | |
707 // Partition data | |
708 | |
709 numberOfEqualKey = 0; | |
710 | |
711 lowPartitionIndex = lowIndex + 1; | |
712 highPartitionIndex = highIndex - 1; | |
713 | |
714 for (;;) { | |
715 while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { | |
716 numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); | |
717 lowPartitionIndex++; | |
718 } | |
719 while (lowPartitionIndex < highPartitionIndex) { | |
720 if (key[midIndex] >= key[highPartitionIndex]) { | |
721 numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); | |
722 break; | |
723 } | |
724 highPartitionIndex--; | |
725 } | |
726 if (lowPartitionIndex >= highPartitionIndex) { | |
727 break; | |
728 } | |
729 tempSeq = seq[lowPartitionIndex]; | |
730 tempKey = key[lowPartitionIndex]; | |
731 seq[lowPartitionIndex] = seq[highPartitionIndex]; | |
732 key[lowPartitionIndex] = key[highPartitionIndex]; | |
733 seq[highPartitionIndex] = tempSeq; | |
734 key[highPartitionIndex] = tempKey; | |
735 if (highPartitionIndex == midIndex) { | |
736 // partition key has been moved | |
737 midIndex = lowPartitionIndex; | |
738 } | |
739 lowPartitionIndex++; | |
740 highPartitionIndex--; | |
741 } | |
742 | |
743 // Adjust the partition index | |
744 highPartitionIndex = lowPartitionIndex; | |
745 lowPartitionIndex--; | |
746 | |
747 // move the partition key to end of low partition | |
748 tempSeq = seq[midIndex]; | |
749 tempKey = key[midIndex]; | |
750 seq[midIndex] = seq[lowPartitionIndex]; | |
751 key[midIndex] = key[lowPartitionIndex]; | |
752 seq[lowPartitionIndex] = tempSeq; | |
753 key[lowPartitionIndex] = tempKey; | |
754 | |
755 if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { | |
756 | |
757 // Many keys = partition key; separate the equal key data from the lower partition | |
758 | |
759 midIndex = lowIndex; | |
760 | |
761 for (;;) { | |
762 while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { | |
763 midIndex++; | |
764 } | |
765 while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { | |
766 lowPartitionIndex--; | |
767 } | |
768 if (midIndex >= lowPartitionIndex) { | |
769 break; | |
770 } | |
771 tempSeq = seq[midIndex]; | |
772 tempKey = key[midIndex]; | |
773 seq[midIndex] = seq[lowPartitionIndex - 1]; | |
774 key[midIndex] = key[lowPartitionIndex - 1]; | |
775 seq[lowPartitionIndex - 1] = tempSeq; | |
776 key[lowPartitionIndex - 1] = tempKey; | |
777 midIndex++; | |
778 lowPartitionIndex--; | |
779 } | |
780 | |
781 } | |
782 | |
783 if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { | |
784 // put the larger partition to stack | |
785 lowStack[stackDepth] = lowIndex; | |
786 highStack[stackDepth] = lowPartitionIndex - 1; | |
787 stackDepth++; | |
788 // sort the smaller partition first | |
789 lowIndex = highPartitionIndex; | |
790 } else { | |
791 // put the larger partition to stack | |
792 lowStack[stackDepth] = highPartitionIndex; | |
793 highStack[stackDepth] = highIndex; | |
794 stackDepth++; | |
795 // sort the smaller partition first | |
796 if (lowPartitionIndex > lowIndex) { | |
797 highIndex = lowPartitionIndex - 1; | |
798 } else { | |
799 // all keys in the partition equals to the partition key | |
800 break; | |
801 } | |
802 } | |
803 continue; | |
804 } | |
805 | |
806 // Pop a range from stack | |
807 if (stackDepth > 0) { | |
808 stackDepth--; | |
809 lowIndex = lowStack[stackDepth]; | |
810 highIndex = highStack[stackDepth]; | |
811 continue; | |
812 } else return; | |
813 } | |
814 } | |
815 | |
816 | |
817 static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq, | |
818 unsigned int* __restrict relativeRank, const unsigned int numItem, | |
819 unsigned int oldInverseSa0, const unsigned int *cumulativeCount) | |
820 { | |
821 unsigned int i, c; | |
822 unsigned int s, r; | |
823 unsigned int lastRank, lastIndex; | |
824 unsigned int oldInverseSa0RelativeRank = 0; | |
825 unsigned int freq; | |
826 | |
827 lastIndex = numItem; | |
828 lastRank = sortedRank[numItem]; | |
829 if (lastRank > oldInverseSa0) { | |
830 sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt | |
831 } | |
832 s = seq[numItem]; | |
833 relativeRank[s] = numItem; | |
834 if (lastRank == oldInverseSa0) { | |
835 oldInverseSa0RelativeRank = numItem; | |
836 oldInverseSa0++; // so that this segment of code is not run again | |
837 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item | |
838 } | |
839 | |
840 c = ALPHABET_SIZE - 1; | |
841 freq = cumulativeCount[c]; | |
842 | |
843 for (i=numItem; i--;) { // from numItem - 1 to 0 | |
844 r = sortedRank[i]; | |
845 if (r > oldInverseSa0) { | |
846 sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt | |
847 } | |
848 s = seq[i]; | |
849 if (i < freq) { | |
850 if (lastIndex >= freq) { | |
851 lastRank++; // to trigger the group across alphabet boundary to be split | |
852 } | |
853 c--; | |
854 freq = cumulativeCount[c]; | |
855 } | |
856 if (r == lastRank) { | |
857 relativeRank[s] = lastIndex; | |
858 } else { | |
859 if (i == lastIndex - 1) { | |
860 if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) { | |
861 seq[lastIndex] = seq[lastIndex + 1] - 1; | |
862 } else { | |
863 seq[lastIndex] = (unsigned int)-1; | |
864 } | |
865 } | |
866 lastIndex = i; | |
867 lastRank = r; | |
868 relativeRank[s] = i; | |
869 if (r == oldInverseSa0) { | |
870 oldInverseSa0RelativeRank = i; | |
871 oldInverseSa0++; // so that this segment of code is not run again | |
872 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item | |
873 } | |
874 } | |
875 } | |
876 | |
877 } | |
878 | |
879 static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar, | |
880 const unsigned int *cumulativeCount) | |
881 { | |
882 unsigned int i, c; | |
883 unsigned int previousRank, currentRank; | |
884 | |
885 previousRank = relativeRank[0]; | |
886 | |
887 for (i=1; i<=numChar; i++) { | |
888 currentRank = relativeRank[i]; | |
889 c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) | |
890 + (previousRank >= cumulativeCount[3]); | |
891 seq[currentRank] = c; | |
892 previousRank = currentRank; | |
893 } | |
894 } | |
895 | |
896 static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, | |
897 unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt) | |
898 { | |
899 unsigned int bitsInWordMinusBitPerChar; | |
900 unsigned int leftShift, rightShift; | |
901 unsigned int o; | |
902 unsigned int oIndex, iIndex, mIndex; | |
903 unsigned int mWord, mChar, oWord, oChar; | |
904 unsigned int numInsert; | |
905 | |
906 bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; | |
907 | |
908 oIndex = 0; | |
909 iIndex = 0; | |
910 mIndex = 0; | |
911 | |
912 mWord = 0; | |
913 mChar = 0; | |
914 | |
915 mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration | |
916 | |
917 while (oIndex < numOldBwt) { | |
918 | |
919 // copy from insertBwt | |
920 while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { | |
921 if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 | |
922 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); | |
923 mIndex++; | |
924 mChar++; | |
925 if (mChar == CHAR_PER_WORD) { | |
926 mChar = 0; | |
927 mWord++; | |
928 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary | |
929 } | |
930 } | |
931 iIndex++; | |
932 } | |
933 | |
934 // Copy from oldBwt to mergedBwt | |
935 if (iIndex <= numInsertBwt) { | |
936 o = sortedRank[iIndex]; | |
937 } else { | |
938 o = numOldBwt; | |
939 } | |
940 numInsert = o - oIndex; | |
941 | |
942 oWord = oIndex / CHAR_PER_WORD; | |
943 oChar = oIndex - oWord * CHAR_PER_WORD; | |
944 if (oChar > mChar) { | |
945 leftShift = (oChar - mChar) * BIT_PER_CHAR; | |
946 rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; | |
947 mergedBwt[mWord] = mergedBwt[mWord] | |
948 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) | |
949 | (oldBwt[oWord+1] >> rightShift); | |
950 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
951 while (o > oIndex) { | |
952 oWord++; | |
953 mWord++; | |
954 mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); | |
955 oIndex += CHAR_PER_WORD; | |
956 } | |
957 } else if (oChar < mChar) { | |
958 rightShift = (mChar - oChar) * BIT_PER_CHAR; | |
959 leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; | |
960 mergedBwt[mWord] = mergedBwt[mWord] | |
961 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); | |
962 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
963 while (o > oIndex) { | |
964 oWord++; | |
965 mWord++; | |
966 mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); | |
967 oIndex += CHAR_PER_WORD; | |
968 } | |
969 } else { // oChar == mChar | |
970 mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); | |
971 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
972 while (o > oIndex) { | |
973 oWord++; | |
974 mWord++; | |
975 mergedBwt[mWord] = oldBwt[oWord]; | |
976 oIndex += CHAR_PER_WORD; | |
977 } | |
978 } | |
979 oIndex = o; | |
980 mIndex += numInsert; | |
981 | |
982 // Clear the trailing garbage in mergedBwt | |
983 mWord = mIndex / CHAR_PER_WORD; | |
984 mChar = mIndex - mWord * CHAR_PER_WORD; | |
985 if (mChar == 0) { | |
986 mergedBwt[mWord] = 0; | |
987 } else { | |
988 mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); | |
989 } | |
990 | |
991 } | |
992 | |
993 // copy from insertBwt | |
994 while (iIndex <= numInsertBwt) { | |
995 if (sortedRank[iIndex] != 0) { | |
996 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); | |
997 mIndex++; | |
998 mChar++; | |
999 if (mChar == CHAR_PER_WORD) { | |
1000 mChar = 0; | |
1001 mWord++; | |
1002 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary | |
1003 } | |
1004 } | |
1005 iIndex++; | |
1006 } | |
1007 } | |
1008 | |
1009 void BWTClearTrailingBwtCode(BWT *bwt) | |
1010 { | |
1011 unsigned int bwtResidentSizeInWord; | |
1012 unsigned int wordIndex, offset; | |
1013 unsigned int i; | |
1014 | |
1015 bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); | |
1016 | |
1017 wordIndex = bwt->textLength / CHAR_PER_WORD; | |
1018 offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; | |
1019 if (offset > 0) { | |
1020 bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); | |
1021 } else { | |
1022 if (wordIndex < bwtResidentSizeInWord) { | |
1023 bwt->bwtCode[wordIndex] = 0; | |
1024 } | |
1025 } | |
1026 | |
1027 for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) { | |
1028 bwt->bwtCode[i] = 0; | |
1029 } | |
1030 } | |
1031 | |
1032 | |
1033 void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, | |
1034 unsigned int* __restrict occValueMajor, | |
1035 const unsigned int textLength, const unsigned int* decodeTable) | |
1036 { | |
1037 unsigned int numberOfOccValueMajor, numberOfOccValue; | |
1038 unsigned int wordBetweenOccValue; | |
1039 unsigned int numberOfOccIntervalPerMajor; | |
1040 unsigned int c; | |
1041 unsigned int i, j; | |
1042 unsigned int occMajorIndex; | |
1043 unsigned int occIndex, bwtIndex; | |
1044 unsigned int sum; | |
1045 unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; | |
1046 | |
1047 wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; | |
1048 | |
1049 // Calculate occValue | |
1050 // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256 | |
1051 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
1052 numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; | |
1053 numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; | |
1054 | |
1055 tempOccValue0[0] = 0; | |
1056 tempOccValue0[1] = 0; | |
1057 tempOccValue0[2] = 0; | |
1058 tempOccValue0[3] = 0; | |
1059 occValueMajor[0] = 0; | |
1060 occValueMajor[1] = 0; | |
1061 occValueMajor[2] = 0; | |
1062 occValueMajor[3] = 0; | |
1063 | |
1064 occIndex = 0; | |
1065 bwtIndex = 0; | |
1066 for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) { | |
1067 | |
1068 for (i=0; i<numberOfOccIntervalPerMajor/2; i++) { | |
1069 | |
1070 sum = 0; | |
1071 tempOccValue1[0] = tempOccValue0[0]; | |
1072 tempOccValue1[1] = tempOccValue0[1]; | |
1073 tempOccValue1[2] = tempOccValue0[2]; | |
1074 tempOccValue1[3] = tempOccValue0[3]; | |
1075 | |
1076 for (j=0; j<wordBetweenOccValue; j++) { | |
1077 c = bwt[bwtIndex]; | |
1078 sum += decodeTable[c >> 16]; | |
1079 sum += decodeTable[c & 0x0000FFFF]; | |
1080 bwtIndex++; | |
1081 } | |
1082 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1083 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
1084 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
1085 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
1086 tempOccValue1[3] += sum; | |
1087 } else { | |
1088 if (sum == 0x00000100) { | |
1089 tempOccValue1[0] += 256; | |
1090 } else if (sum == 0x00010000) { | |
1091 tempOccValue1[1] += 256; | |
1092 } else if (sum == 0x01000000) { | |
1093 tempOccValue1[2] += 256; | |
1094 } else { | |
1095 tempOccValue1[3] += 256; | |
1096 } | |
1097 } | |
1098 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
1099 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
1100 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
1101 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
1102 tempOccValue0[0] = tempOccValue1[0]; | |
1103 tempOccValue0[1] = tempOccValue1[1]; | |
1104 tempOccValue0[2] = tempOccValue1[2]; | |
1105 tempOccValue0[3] = tempOccValue1[3]; | |
1106 sum = 0; | |
1107 | |
1108 occIndex++; | |
1109 | |
1110 for (j=0; j<wordBetweenOccValue; j++) { | |
1111 c = bwt[bwtIndex]; | |
1112 sum += decodeTable[c >> 16]; | |
1113 sum += decodeTable[c & 0x0000FFFF]; | |
1114 bwtIndex++; | |
1115 } | |
1116 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1117 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; | |
1118 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; | |
1119 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; | |
1120 tempOccValue0[3] += sum; | |
1121 } else { | |
1122 if (sum == 0x00000100) { | |
1123 tempOccValue0[0] += 256; | |
1124 } else if (sum == 0x00010000) { | |
1125 tempOccValue0[1] += 256; | |
1126 } else if (sum == 0x01000000) { | |
1127 tempOccValue0[2] += 256; | |
1128 } else { | |
1129 tempOccValue0[3] += 256; | |
1130 } | |
1131 } | |
1132 } | |
1133 | |
1134 occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; | |
1135 occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; | |
1136 occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; | |
1137 occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; | |
1138 tempOccValue0[0] = 0; | |
1139 tempOccValue0[1] = 0; | |
1140 tempOccValue0[2] = 0; | |
1141 tempOccValue0[3] = 0; | |
1142 | |
1143 } | |
1144 | |
1145 while (occIndex < (numberOfOccValue-1)/2) { | |
1146 sum = 0; | |
1147 tempOccValue1[0] = tempOccValue0[0]; | |
1148 tempOccValue1[1] = tempOccValue0[1]; | |
1149 tempOccValue1[2] = tempOccValue0[2]; | |
1150 tempOccValue1[3] = tempOccValue0[3]; | |
1151 for (j=0; j<wordBetweenOccValue; j++) { | |
1152 c = bwt[bwtIndex]; | |
1153 sum += decodeTable[c >> 16]; | |
1154 sum += decodeTable[c & 0x0000FFFF]; | |
1155 bwtIndex++; | |
1156 } | |
1157 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1158 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
1159 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
1160 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
1161 tempOccValue1[3] += sum; | |
1162 } else { | |
1163 if (sum == 0x00000100) { | |
1164 tempOccValue1[0] += 256; | |
1165 } else if (sum == 0x00010000) { | |
1166 tempOccValue1[1] += 256; | |
1167 } else if (sum == 0x01000000) { | |
1168 tempOccValue1[2] += 256; | |
1169 } else { | |
1170 tempOccValue1[3] += 256; | |
1171 } | |
1172 } | |
1173 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
1174 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
1175 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
1176 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
1177 tempOccValue0[0] = tempOccValue1[0]; | |
1178 tempOccValue0[1] = tempOccValue1[1]; | |
1179 tempOccValue0[2] = tempOccValue1[2]; | |
1180 tempOccValue0[3] = tempOccValue1[3]; | |
1181 sum = 0; | |
1182 occIndex++; | |
1183 | |
1184 for (j=0; j<wordBetweenOccValue; j++) { | |
1185 c = bwt[bwtIndex]; | |
1186 sum += decodeTable[c >> 16]; | |
1187 sum += decodeTable[c & 0x0000FFFF]; | |
1188 bwtIndex++; | |
1189 } | |
1190 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1191 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; | |
1192 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; | |
1193 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; | |
1194 tempOccValue0[3] += sum; | |
1195 } else { | |
1196 if (sum == 0x00000100) { | |
1197 tempOccValue0[0] += 256; | |
1198 } else if (sum == 0x00010000) { | |
1199 tempOccValue0[1] += 256; | |
1200 } else if (sum == 0x01000000) { | |
1201 tempOccValue0[2] += 256; | |
1202 } else { | |
1203 tempOccValue0[3] += 256; | |
1204 } | |
1205 } | |
1206 } | |
1207 | |
1208 sum = 0; | |
1209 tempOccValue1[0] = tempOccValue0[0]; | |
1210 tempOccValue1[1] = tempOccValue0[1]; | |
1211 tempOccValue1[2] = tempOccValue0[2]; | |
1212 tempOccValue1[3] = tempOccValue0[3]; | |
1213 | |
1214 if (occIndex * 2 < numberOfOccValue - 1) { | |
1215 for (j=0; j<wordBetweenOccValue; j++) { | |
1216 c = bwt[bwtIndex]; | |
1217 sum += decodeTable[c >> 16]; | |
1218 sum += decodeTable[c & 0x0000FFFF]; | |
1219 bwtIndex++; | |
1220 } | |
1221 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1222 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
1223 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
1224 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
1225 tempOccValue1[3] += sum; | |
1226 } else { | |
1227 if (sum == 0x00000100) { | |
1228 tempOccValue1[0] += 256; | |
1229 } else if (sum == 0x00010000) { | |
1230 tempOccValue1[1] += 256; | |
1231 } else if (sum == 0x01000000) { | |
1232 tempOccValue1[2] += 256; | |
1233 } else { | |
1234 tempOccValue1[3] += 256; | |
1235 } | |
1236 } | |
1237 } | |
1238 | |
1239 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
1240 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
1241 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
1242 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
1243 | |
1244 } | |
1245 | |
1246 static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) | |
1247 { | |
1248 unsigned int i; | |
1249 unsigned int mergedBwtSizeInWord, mergedOccSizeInWord; | |
1250 unsigned int firstCharInThisIteration; | |
1251 | |
1252 unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt; | |
1253 unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; | |
1254 | |
1255 #ifdef DEBUG | |
1256 if (numChar > bwtInc->buildSize) { | |
1257 fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n"); | |
1258 exit(1); | |
1259 } | |
1260 #endif | |
1261 | |
1262 mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); | |
1263 mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); | |
1264 | |
1265 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); | |
1266 | |
1267 if (bwtInc->bwt->textLength == 0) { // Initial build | |
1268 | |
1269 // Set address | |
1270 seq = bwtInc->workingMemory; | |
1271 relativeRank = seq + bwtInc->buildSize + 1; | |
1272 mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place | |
1273 | |
1274 BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); | |
1275 | |
1276 firstCharInThisIteration = relativeRank[0]; | |
1277 relativeRank[numChar] = 0; | |
1278 | |
1279 // Sort suffix | |
1280 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE); | |
1281 newInverseSa0 = relativeRank[0]; | |
1282 | |
1283 // Clear BWT area | |
1284 initializeVAL(insertBwt, mergedBwtSizeInWord, 0); | |
1285 | |
1286 // Build BWT | |
1287 BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); | |
1288 | |
1289 // so that the cumulativeCount is not deducted | |
1290 bwtInc->firstCharInLastIteration = ALPHABET_SIZE; | |
1291 | |
1292 } else { // Incremental build | |
1293 // Set address | |
1294 sortedRank = bwtInc->workingMemory; | |
1295 seq = sortedRank + bwtInc->buildSize + 1; | |
1296 insertBwt = seq; | |
1297 relativeRank = seq + bwtInc->buildSize + 1; | |
1298 | |
1299 // Store the first character of this iteration | |
1300 firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); | |
1301 | |
1302 // Count occurrence of input text | |
1303 ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); | |
1304 // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration | |
1305 bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; | |
1306 bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; | |
1307 bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; | |
1308 bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; | |
1309 | |
1310 // Get rank of new suffix among processed suffix | |
1311 // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group | |
1312 oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, | |
1313 numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); | |
1314 | |
1315 // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) | |
1316 for (i=0; i<ALPHABET_SIZE; i++) { | |
1317 if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || | |
1318 bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { | |
1319 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); | |
1320 } else { | |
1321 if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { | |
1322 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); | |
1323 } | |
1324 if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { | |
1325 BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); | |
1326 } | |
1327 } | |
1328 } | |
1329 | |
1330 // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt | |
1331 // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) | |
1332 BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); | |
1333 #ifdef DEBUG | |
1334 if (relativeRank[numChar] != oldInverseSa0RelativeRank) { | |
1335 fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n"); | |
1336 exit(1); | |
1337 } | |
1338 #endif | |
1339 | |
1340 // Sort suffix | |
1341 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE); | |
1342 | |
1343 newInverseSa0RelativeRank = relativeRank[0]; | |
1344 newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; | |
1345 | |
1346 sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt | |
1347 | |
1348 // Build BWT | |
1349 BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); | |
1350 | |
1351 // Merge BWT | |
1352 mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord | |
1353 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; | |
1354 // minus numberOfIteration * occInterval to create a buffer for merging | |
1355 BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); | |
1356 | |
1357 } | |
1358 | |
1359 // Build auxiliary structure and update info and pointers in BWT | |
1360 bwtInc->bwt->textLength += numChar; | |
1361 bwtInc->bwt->bwtCode = mergedBwt; | |
1362 bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; | |
1363 bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; | |
1364 if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) { | |
1365 fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n"); | |
1366 exit(1); | |
1367 } | |
1368 | |
1369 bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; | |
1370 | |
1371 BWTClearTrailingBwtCode(bwtInc->bwt); | |
1372 BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, | |
1373 bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); | |
1374 | |
1375 bwtInc->bwt->inverseSa0 = newInverseSa0; | |
1376 | |
1377 bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); | |
1378 bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); | |
1379 bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); | |
1380 bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); | |
1381 | |
1382 bwtInc->firstCharInLastIteration = firstCharInThisIteration; | |
1383 | |
1384 // Set build size and text address for the next build | |
1385 BWTIncSetBuildSizeAndTextAddr(bwtInc); | |
1386 bwtInc->numberOfIterationDone++; | |
1387 | |
1388 } | |
1389 | |
1390 BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, | |
1391 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) | |
1392 { | |
1393 | |
1394 FILE *packedFile; | |
1395 unsigned int packedFileLen; | |
1396 unsigned int totalTextLength; | |
1397 unsigned int textToLoad, textSizeInByte; | |
1398 unsigned int processedTextLength; | |
1399 unsigned char lastByteLength; | |
1400 | |
1401 BWTInc *bwtInc; | |
1402 | |
1403 packedFile = (FILE*)fopen(inputFileName, "rb"); | |
1404 | |
1405 if (packedFile == NULL) { | |
1406 fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); | |
1407 exit(1); | |
1408 } | |
1409 | |
1410 fseek(packedFile, -1, SEEK_END); | |
1411 packedFileLen = ftell(packedFile); | |
1412 if ((int)packedFileLen < 0) { | |
1413 fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n"); | |
1414 exit(1); | |
1415 } | |
1416 fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); | |
1417 totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); | |
1418 | |
1419 bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize); | |
1420 | |
1421 BWTIncSetBuildSizeAndTextAddr(bwtInc); | |
1422 | |
1423 if (bwtInc->buildSize > totalTextLength) { | |
1424 textToLoad = totalTextLength; | |
1425 } else { | |
1426 textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); | |
1427 } | |
1428 textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte | |
1429 | |
1430 fseek(packedFile, -2, SEEK_CUR); | |
1431 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); | |
1432 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); | |
1433 fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR); | |
1434 | |
1435 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); | |
1436 BWTIncConstruct(bwtInc, textToLoad); | |
1437 | |
1438 processedTextLength = textToLoad; | |
1439 | |
1440 while (processedTextLength < totalTextLength) { | |
1441 textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; | |
1442 if (textToLoad > totalTextLength - processedTextLength) { | |
1443 textToLoad = totalTextLength - processedTextLength; | |
1444 } | |
1445 textSizeInByte = textToLoad / CHAR_PER_BYTE; | |
1446 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); | |
1447 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); | |
1448 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); | |
1449 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); | |
1450 BWTIncConstruct(bwtInc, textToLoad); | |
1451 processedTextLength += textToLoad; | |
1452 if (bwtInc->numberOfIterationDone % 10 == 0) { | |
1453 printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n", | |
1454 bwtInc->numberOfIterationDone, processedTextLength); | |
1455 } | |
1456 } | |
1457 return bwtInc; | |
1458 } | |
1459 | |
1460 void BWTFree(BWT *bwt) | |
1461 { | |
1462 if (bwt == 0) return; | |
1463 free(bwt->cumulativeFreq); | |
1464 free(bwt->bwtCode); | |
1465 free(bwt->occValue); | |
1466 free(bwt->occValueMajor); | |
1467 free(bwt->saValue); | |
1468 free(bwt->inverseSa); | |
1469 free(bwt->decodeTable); | |
1470 free(bwt->saIndexRange); | |
1471 free(bwt->saValueOnBoundary); | |
1472 free(bwt); | |
1473 } | |
1474 | |
1475 void BWTIncFree(BWTInc *bwtInc) | |
1476 { | |
1477 if (bwtInc == 0) return; | |
1478 free(bwtInc->bwt); | |
1479 free(bwtInc->workingMemory); | |
1480 free(bwtInc); | |
1481 } | |
1482 | |
1483 static unsigned int BWTFileSizeInWord(const unsigned int numChar) | |
1484 { | |
1485 // The $ in BWT at the position of inverseSa0 is not encoded | |
1486 return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; | |
1487 } | |
1488 | |
1489 void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) | |
1490 { | |
1491 FILE *bwtFile; | |
1492 /* FILE *occValueFile; */ | |
1493 unsigned int bwtLength; | |
1494 | |
1495 bwtFile = (FILE*)fopen(bwtFileName, "wb"); | |
1496 if (bwtFile == NULL) { | |
1497 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); | |
1498 exit(1); | |
1499 } | |
1500 | |
1501 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile); | |
1502 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile); | |
1503 bwtLength = BWTFileSizeInWord(bwt->textLength); | |
1504 fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); | |
1505 fclose(bwtFile); | |
1506 /* | |
1507 occValueFile = (FILE*)fopen(occValueFileName, "wb"); | |
1508 if (occValueFile == NULL) { | |
1509 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n"); | |
1510 exit(1); | |
1511 } | |
1512 | |
1513 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile); | |
1514 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile); | |
1515 fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile); | |
1516 fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile); | |
1517 fclose(occValueFile); | |
1518 */ | |
1519 } | |
1520 | |
1521 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) | |
1522 { | |
1523 BWTInc *bwtInc; | |
1524 bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000); | |
1525 printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); | |
1526 BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); | |
1527 BWTIncFree(bwtInc); | |
1528 } | |
1529 | |
1530 int bwt_bwtgen_main(int argc, char *argv[]) | |
1531 { | |
1532 if (argc < 3) { | |
1533 fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n"); | |
1534 return 1; | |
1535 } | |
1536 bwt_bwtgen(argv[1], argv[2]); | |
1537 return 0; | |
1538 } | |
1539 | |
1540 #ifdef MAIN_BWT_GEN | |
1541 | |
1542 int main(int argc, char *argv[]) | |
1543 { | |
1544 return bwt_bwtgen_main(argc, argv); | |
1545 } | |
1546 | |
1547 #endif |