comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwt_gen/bwt_gen.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 /*
2
3 BWTConstruct.c BWT-Index Construction
4
5 This module constructs BWT and auxiliary data structures.
6
7 Copyright (C) 2004, Wong Chi Kwong.
8
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
23 */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include "bwt_gen.h"
29 #include "QSufSort.h"
30
31 static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
32 unsigned int lastByteLength)
33 {
34 if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
35 fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
36 exit(1);
37 }
38 return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
39 }
40
41 static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
42 {
43 unsigned int i;
44 for (i=0; i<length; i++) startAddr[i] = initValue;
45 }
46
47 static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
48 {
49 unsigned int i, j, c, t;
50
51 for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
52 dnaDecodeTable[i] = 0;
53 c = i;
54 for (j=0; j<8; j++) {
55 t = c & 0x00000003;
56 dnaDecodeTable[i] += 1 << (t * 8);
57 c >>= 2;
58 }
59 }
60
61 }
62 // for BWTIncCreate()
63 static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
64 {
65 unsigned int numOfOccValue;
66 unsigned int numOfOccIntervalPerMajor;
67 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
68 numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
69 return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
70 }
71 // for BWTIncCreate()
72 static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
73 {
74 unsigned int numOfOccValue;
75 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
76 return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
77 }
78 // for BWTIncCreate()
79 static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
80
81 unsigned int numCharRoundUpToOccInterval;
82
83 // The $ in BWT at the position of inverseSa0 is not encoded
84 numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
85
86 return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
87
88 }
89
90 static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
91 {
92 unsigned int maxBuildSize;
93
94 if (bwtInc->bwt->textLength == 0) {
95 // initial build
96 // Minus 2 because n+1 entries of seq and rank needed for n char
97 maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
98 / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
99 if (bwtInc->initialMaxBuildSize > 0) {
100 bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
101 } else {
102 bwtInc->buildSize = maxBuildSize;
103 }
104 } else {
105 // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
106 // Minus numberOfIterationDone because bwt slightly shift to left in each iteration
107 maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
108 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR)
109 / 3;
110 if (maxBuildSize < CHAR_PER_WORD) {
111 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
112 exit(1);
113 }
114 if (bwtInc->incMaxBuildSize > 0) {
115 bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
116 } else {
117 bwtInc->buildSize = maxBuildSize;
118 }
119 if (bwtInc->buildSize < CHAR_PER_WORD) {
120 bwtInc->buildSize = CHAR_PER_WORD;
121 }
122 }
123
124 if (bwtInc->buildSize < CHAR_PER_WORD) {
125 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
126 exit(1);
127 }
128
129 bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
130
131 bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
132 bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);
133
134 }
135
136 // for ceilLog2()
137 unsigned int leadingZero(const unsigned int input)
138 {
139 unsigned int l;
140 const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
141 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
147 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
148
149 if (input & 0xFFFF0000) {
150 if (input & 0xFF000000) {
151 l = leadingZero8bit[input >> 24];
152 } else {
153 l = 8 + leadingZero8bit[input >> 16];
154 }
155 } else {
156 if (input & 0x0000FF00) {
157 l = 16 + leadingZero8bit[input >> 8];
158 } else {
159 l = 24 + leadingZero8bit[input];
160 }
161 }
162 return l;
163
164 }
165 // for BitPerBytePackedChar()
166 static unsigned int ceilLog2(const unsigned int input)
167 {
168 if (input <= 1) return 0;
169 return BITS_IN_WORD - leadingZero(input - 1);
170
171 }
172 // for ConvertBytePackedToWordPacked()
173 static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
174 {
175 unsigned int bitPerChar;
176 bitPerChar = ceilLog2(alphabetSize);
177 // Return the largest number of bit that does not affect packing efficiency
178 if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
179 bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
180 return bitPerChar;
181 }
182 // for ConvertBytePackedToWordPacked()
183 static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
184 {
185 return ceilLog2(alphabetSize);
186 }
187
188 static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
189 const unsigned int textLength)
190 {
191 unsigned int i, j, k;
192 unsigned int c;
193 unsigned int bitPerBytePackedChar;
194 unsigned int bitPerWordPackedChar;
195 unsigned int charPerWord;
196 unsigned int charPerByte;
197 unsigned int bytePerIteration;
198 unsigned int byteProcessed = 0;
199 unsigned int wordProcessed = 0;
200 unsigned int mask, shift;
201
202 unsigned int buffer[BITS_IN_WORD];
203
204 bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
205 bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
206 charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
207 charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
208
209 bytePerIteration = charPerWord / charPerByte;
210 mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
211 shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
212
213 while ((wordProcessed + 1) * charPerWord < textLength) {
214
215 k = 0;
216 for (i=0; i<bytePerIteration; i++) {
217 c = (unsigned int)input[byteProcessed] << shift;
218 for (j=0; j<charPerByte; j++) {
219 buffer[k] = c & mask;
220 c <<= bitPerBytePackedChar;
221 k++;
222 }
223 byteProcessed++;
224 }
225
226 c = 0;
227 for (i=0; i<charPerWord; i++) {
228 c |= buffer[i] >> bitPerWordPackedChar * i;
229 }
230 output[wordProcessed] = c;
231 wordProcessed++;
232
233 }
234
235 k = 0;
236 for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
237 c = (unsigned int)input[byteProcessed] << shift;
238 for (j=0; j<charPerByte; j++) {
239 buffer[k] = c & mask;
240 c <<= bitPerBytePackedChar;
241 k++;
242 }
243 byteProcessed++;
244 }
245
246 c = 0;
247 for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
248 c |= buffer[i] >> bitPerWordPackedChar * i;
249 }
250 output[wordProcessed] = c;
251 }
252
253 BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
254 {
255 BWT *bwt;
256
257 bwt = (BWT*)calloc(1, sizeof(BWT));
258
259 bwt->textLength = 0;
260 bwt->inverseSa = 0;
261
262 bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
263 initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
264
265 bwt->bwtSizeInWord = 0;
266 bwt->saValueOnBoundary = NULL;
267
268 // Generate decode tables
269 if (decodeTable == NULL) {
270 bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
271 GenerateDNAOccCountTable(bwt->decodeTable);
272 } else {
273 bwt->decodeTable = decodeTable;
274 }
275
276 bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
277 bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));
278
279 bwt->occSizeInWord = 0;
280 bwt->occValue = NULL;
281
282 bwt->saInterval = ALL_ONE_MASK;
283 bwt->saValueSize = 0;
284 bwt->saValue = NULL;
285
286 bwt->inverseSaInterval = ALL_ONE_MASK;
287 bwt->inverseSaSize = 0;
288 bwt->inverseSa = NULL;
289
290 return bwt;
291 }
292
293 BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
294 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
295 {
296 BWTInc *bwtInc;
297 unsigned int i;
298
299 if (targetNBit == 0) {
300 fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n");
301 exit(1);
302 }
303
304 bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
305 bwtInc->numberOfIterationDone = 0;
306 bwtInc->bwt = BWTCreate(textLength, NULL);
307 bwtInc->initialMaxBuildSize = initialMaxBuildSize;
308 bwtInc->incMaxBuildSize = incMaxBuildSize;
309 bwtInc->targetNBit = targetNBit;
310 bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
311 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
312
313 // Build frequently accessed data
314 bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
315 for (i=0; i<CHAR_PER_WORD; i++) {
316 bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
317 }
318
319 bwtInc->targetTextLength = textLength;
320 bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
321 if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
322 fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
323 exit(1);
324 }
325 bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
326
327 return bwtInc;
328
329 }
330 // for BWTIncConstruct()
331 static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
332 unsigned int* __restrict cumulativeCount, const unsigned int numChar)
333 {
334 unsigned int i, j;
335 unsigned int c, t;
336 unsigned int packedMask;
337 unsigned int rankIndex;
338 unsigned int lastWord, numCharInLastWord;
339
340 lastWord = (numChar - 1) / CHAR_PER_WORD;
341 numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
342
343 packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
344 rankIndex = numChar - 1;
345
346 t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
347 for (i=0; i<numCharInLastWord; i++) {
348 c = t & packedMask;
349 cumulativeCount[c+1]++;
350 rank[rankIndex] = c;
351 rankIndex--;
352 t >>= BIT_PER_CHAR;
353 }
354
355 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
356 t = packedText[i];
357 for (j=0; j<CHAR_PER_WORD; j++) {
358 c = t & packedMask;
359 cumulativeCount[c+1]++;
360 rank[rankIndex] = c;
361 rankIndex--;
362 t >>= BIT_PER_CHAR;
363 }
364 }
365
366 // Convert occurrence to cumulativeCount
367 cumulativeCount[2] += cumulativeCount[1];
368 cumulativeCount[3] += cumulativeCount[2];
369 cumulativeCount[4] += cumulativeCount[3];
370 }
371
372
373 static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index,
374 unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable)
375 {
376 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
377 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
378 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
379 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
380
381 unsigned int iteration, wordToCount, charToCount;
382 unsigned int i, j, c;
383 unsigned int sum;
384
385 occCount[0] = 0;
386 occCount[1] = 0;
387 occCount[2] = 0;
388 occCount[3] = 0;
389
390 iteration = index / 256;
391 wordToCount = (index - iteration * 256) / 16;
392 charToCount = index - iteration * 256 - wordToCount * 16;
393
394 for (i=0; i<iteration; i++) {
395
396 sum = 0;
397 for (j=0; j<16; j++) {
398 sum += dnaDecodeTable[*dna >> 16];
399 sum += dnaDecodeTable[*dna & 0x0000FFFF];
400 dna++;
401 }
402 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
403 occCount[0] += sum & 0x000000FF; sum >>= 8;
404 occCount[1] += sum & 0x000000FF; sum >>= 8;
405 occCount[2] += sum & 0x000000FF; sum >>= 8;
406 occCount[3] += sum;
407 } else {
408 // only some or all of the 3 bits are on
409 // in reality, only one of the four cases are possible
410 if (sum == 0x00000100) {
411 occCount[0] += 256;
412 } else if (sum == 0x00010000) {
413 occCount[1] += 256;
414 } else if (sum == 0x01000000) {
415 occCount[2] += 256;
416 } else if (sum == 0x00000000) {
417 occCount[3] += 256;
418 } else {
419 fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
420 exit(1);
421 }
422 }
423
424 }
425
426 sum = 0;
427 for (j=0; j<wordToCount; j++) {
428 sum += dnaDecodeTable[*dna >> 16];
429 sum += dnaDecodeTable[*dna & 0x0000FFFF];
430 dna++;
431 }
432
433 if (charToCount > 0) {
434 c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
435 sum += dnaDecodeTable[c >> 16];
436 sum += dnaDecodeTable[c & 0xFFFF];
437 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
438 }
439
440 occCount[0] += sum & 0x000000FF; sum >>= 8;
441 occCount[1] += sum & 0x000000FF; sum >>= 8;
442 occCount[2] += sum & 0x000000FF; sum >>= 8;
443 occCount[3] += sum;
444 }
445
446 static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
447 const unsigned int *cumulativeCount, const unsigned int *packedShift) {
448
449 unsigned int i, c, r;
450 unsigned int previousRank, currentRank;
451 unsigned int wordIndex, charIndex;
452 unsigned int inverseSa0;
453
454 inverseSa0 = previousRank = relativeRank[0];
455
456 for (i=1; i<=numChar; i++) {
457 currentRank = relativeRank[i];
458 // previousRank > cumulativeCount[c] because $ is one of the char
459 c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2])
460 + (previousRank > cumulativeCount[3]);
461 // set bwt for currentRank
462 if (c > 0) {
463 // c <> 'a'
464 r = currentRank;
465 if (r > inverseSa0) {
466 // - 1 because $ at inverseSa0 is not encoded
467 r--;
468 }
469 wordIndex = r / CHAR_PER_WORD;
470 charIndex = r - wordIndex * CHAR_PER_WORD;
471 bwt[wordIndex] |= c << packedShift[charIndex];
472 }
473 previousRank = currentRank;
474 }
475 }
476
477 static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
478 const unsigned int character)
479 {
480 unsigned int occIndexMajor;
481
482 occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
483
484 if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
485 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
486 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
487
488 } else {
489 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
490 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
491 }
492 }
493
494
495 static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
496 const unsigned int* dnaDecodeTable)
497 {
498 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
499 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
500 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
501 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
502
503 unsigned int wordToCount, charToCount;
504 unsigned int i, c;
505 unsigned int sum = 0;
506
507 wordToCount = index / 16;
508 charToCount = index - wordToCount * 16;
509
510 for (i=0; i<wordToCount; i++) {
511 sum += dnaDecodeTable[dna[i] >> 16];
512 sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
513 }
514
515 if (charToCount > 0) {
516 c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
517 sum += dnaDecodeTable[c >> 16];
518 sum += dnaDecodeTable[c & 0xFFFF];
519 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
520 }
521
522 return (sum >> (character * 8)) & 0x000000FF;
523
524 }
525
526 static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
527 const unsigned int* dnaDecodeTable)
528 {
529 static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
530 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
531 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
532 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };
533
534 unsigned int wordToCount, charToCount;
535 unsigned int i, c;
536 unsigned int sum = 0;
537
538 wordToCount = index / 16;
539 charToCount = index - wordToCount * 16;
540
541 dna -= wordToCount + 1;
542
543 if (charToCount > 0) {
544 c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c;
545 sum += dnaDecodeTable[c >> 16];
546 sum += dnaDecodeTable[c & 0xFFFF];
547 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
548 }
549
550 for (i=0; i<wordToCount; i++) {
551 dna++;
552 sum += dnaDecodeTable[*dna >> 16];
553 sum += dnaDecodeTable[*dna & 0x0000FFFF];
554 }
555
556 return (sum >> (character * 8)) & 0x000000FF;
557
558 }
559
560 unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {
561
562 unsigned int occValue;
563 unsigned int occExplicitIndex, occIndex;
564
565 // $ is supposed to be positioned at inverseSa0 but it is not encoded
566 // therefore index is subtracted by 1 for adjustment
567 if (index > bwt->inverseSa0) {
568 index--;
569 }
570
571 occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding
572 occIndex = occExplicitIndex * OCC_INTERVAL;
573 occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
574
575 if (occIndex == index) {
576 return occValue;
577 }
578
579 if (occIndex < index) {
580 return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
581 } else {
582 return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
583 }
584
585 }
586
587 static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
588 const unsigned int *packedText, const unsigned int numChar,
589 const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
590 {
591 unsigned int saIndex;
592 unsigned int lastWord;
593 unsigned int packedMask;
594 unsigned int i, j;
595 unsigned int c, t;
596 unsigned int rankIndex;
597 unsigned int shift;
598 unsigned int seqIndexFromStart[ALPHABET_SIZE];
599 unsigned int seqIndexFromEnd[ALPHABET_SIZE];
600
601 for (i=0; i<ALPHABET_SIZE; i++) {
602 seqIndexFromStart[i] = cumulativeCount[i];
603 seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
604 }
605
606 shift = BITS_IN_WORD - BIT_PER_CHAR;
607 packedMask = ALL_ONE_MASK >> shift;
608 saIndex = bwt->inverseSa0;
609 rankIndex = numChar - 1;
610
611 lastWord = numChar / CHAR_PER_WORD;
612 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
613 t = packedText[i];
614 for (j=0; j<CHAR_PER_WORD; j++) {
615 c = t & packedMask;
616 saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
617 // A counting sort using the first character of suffix is done here
618 // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
619 if (saIndex > bwt->inverseSa0) {
620 seq[seqIndexFromEnd[c]] = rankIndex;
621 absoluteRank[seqIndexFromEnd[c]] = saIndex;
622 seqIndexFromEnd[c]--;
623 } else {
624 seq[seqIndexFromStart[c]] = rankIndex;
625 absoluteRank[seqIndexFromStart[c]] = saIndex;
626 seqIndexFromStart[c]++;
627 }
628 rankIndex--;
629 t >>= BIT_PER_CHAR;
630 }
631 }
632
633 absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters
634 seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
635
636 return seqIndexFromStart[firstCharInLastIteration];
637 }
638
639 static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
640 {
641 #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
642
643 int lowIndex, highIndex, midIndex;
644 int lowPartitionIndex, highPartitionIndex;
645 int lowStack[32], highStack[32];
646 int stackDepth;
647 int i, j;
648 unsigned int tempSeq, tempKey;
649 int numberOfEqualKey;
650
651 if (numItem < 2) return;
652
653 stackDepth = 0;
654
655 lowIndex = 0;
656 highIndex = numItem - 1;
657
658 for (;;) {
659
660 for (;;) {
661
662 // Sort small array of data
663 if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays
664 for (i=lowIndex+1; i<=highIndex; i++) {
665 tempSeq = seq[i];
666 tempKey = key[i];
667 for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
668 seq[j] = seq[j-1];
669 key[j] = key[j-1];
670 }
671 if (j != i) {
672 seq[j] = tempSeq;
673 key[j] = tempKey;
674 }
675 }
676 break;
677 }
678
679 // Choose pivot as median of the lowest, middle, and highest data; sort the three data
680
681 midIndex = average(lowIndex, highIndex);
682 if (key[lowIndex] > key[midIndex]) {
683 tempSeq = seq[lowIndex];
684 tempKey = key[lowIndex];
685 seq[lowIndex] = seq[midIndex];
686 key[lowIndex] = key[midIndex];
687 seq[midIndex] = tempSeq;
688 key[midIndex] = tempKey;
689 }
690 if (key[lowIndex] > key[highIndex]) {
691 tempSeq = seq[lowIndex];
692 tempKey = key[lowIndex];
693 seq[lowIndex] = seq[highIndex];
694 key[lowIndex] = key[highIndex];
695 seq[highIndex] = tempSeq;
696 key[highIndex] = tempKey;
697 }
698 if (key[midIndex] > key[highIndex]) {
699 tempSeq = seq[midIndex];
700 tempKey = key[midIndex];
701 seq[midIndex] = seq[highIndex];
702 key[midIndex] = key[highIndex];
703 seq[highIndex] = tempSeq;
704 key[highIndex] = tempKey;
705 }
706
707 // Partition data
708
709 numberOfEqualKey = 0;
710
711 lowPartitionIndex = lowIndex + 1;
712 highPartitionIndex = highIndex - 1;
713
714 for (;;) {
715 while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
716 numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
717 lowPartitionIndex++;
718 }
719 while (lowPartitionIndex < highPartitionIndex) {
720 if (key[midIndex] >= key[highPartitionIndex]) {
721 numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
722 break;
723 }
724 highPartitionIndex--;
725 }
726 if (lowPartitionIndex >= highPartitionIndex) {
727 break;
728 }
729 tempSeq = seq[lowPartitionIndex];
730 tempKey = key[lowPartitionIndex];
731 seq[lowPartitionIndex] = seq[highPartitionIndex];
732 key[lowPartitionIndex] = key[highPartitionIndex];
733 seq[highPartitionIndex] = tempSeq;
734 key[highPartitionIndex] = tempKey;
735 if (highPartitionIndex == midIndex) {
736 // partition key has been moved
737 midIndex = lowPartitionIndex;
738 }
739 lowPartitionIndex++;
740 highPartitionIndex--;
741 }
742
743 // Adjust the partition index
744 highPartitionIndex = lowPartitionIndex;
745 lowPartitionIndex--;
746
747 // move the partition key to end of low partition
748 tempSeq = seq[midIndex];
749 tempKey = key[midIndex];
750 seq[midIndex] = seq[lowPartitionIndex];
751 key[midIndex] = key[lowPartitionIndex];
752 seq[lowPartitionIndex] = tempSeq;
753 key[lowPartitionIndex] = tempKey;
754
755 if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
756
757 // Many keys = partition key; separate the equal key data from the lower partition
758
759 midIndex = lowIndex;
760
761 for (;;) {
762 while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
763 midIndex++;
764 }
765 while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
766 lowPartitionIndex--;
767 }
768 if (midIndex >= lowPartitionIndex) {
769 break;
770 }
771 tempSeq = seq[midIndex];
772 tempKey = key[midIndex];
773 seq[midIndex] = seq[lowPartitionIndex - 1];
774 key[midIndex] = key[lowPartitionIndex - 1];
775 seq[lowPartitionIndex - 1] = tempSeq;
776 key[lowPartitionIndex - 1] = tempKey;
777 midIndex++;
778 lowPartitionIndex--;
779 }
780
781 }
782
783 if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
784 // put the larger partition to stack
785 lowStack[stackDepth] = lowIndex;
786 highStack[stackDepth] = lowPartitionIndex - 1;
787 stackDepth++;
788 // sort the smaller partition first
789 lowIndex = highPartitionIndex;
790 } else {
791 // put the larger partition to stack
792 lowStack[stackDepth] = highPartitionIndex;
793 highStack[stackDepth] = highIndex;
794 stackDepth++;
795 // sort the smaller partition first
796 if (lowPartitionIndex > lowIndex) {
797 highIndex = lowPartitionIndex - 1;
798 } else {
799 // all keys in the partition equals to the partition key
800 break;
801 }
802 }
803 continue;
804 }
805
806 // Pop a range from stack
807 if (stackDepth > 0) {
808 stackDepth--;
809 lowIndex = lowStack[stackDepth];
810 highIndex = highStack[stackDepth];
811 continue;
812 } else return;
813 }
814 }
815
816
817 static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
818 unsigned int* __restrict relativeRank, const unsigned int numItem,
819 unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
820 {
821 unsigned int i, c;
822 unsigned int s, r;
823 unsigned int lastRank, lastIndex;
824 unsigned int oldInverseSa0RelativeRank = 0;
825 unsigned int freq;
826
827 lastIndex = numItem;
828 lastRank = sortedRank[numItem];
829 if (lastRank > oldInverseSa0) {
830 sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt
831 }
832 s = seq[numItem];
833 relativeRank[s] = numItem;
834 if (lastRank == oldInverseSa0) {
835 oldInverseSa0RelativeRank = numItem;
836 oldInverseSa0++; // so that this segment of code is not run again
837 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
838 }
839
840 c = ALPHABET_SIZE - 1;
841 freq = cumulativeCount[c];
842
843 for (i=numItem; i--;) { // from numItem - 1 to 0
844 r = sortedRank[i];
845 if (r > oldInverseSa0) {
846 sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt
847 }
848 s = seq[i];
849 if (i < freq) {
850 if (lastIndex >= freq) {
851 lastRank++; // to trigger the group across alphabet boundary to be split
852 }
853 c--;
854 freq = cumulativeCount[c];
855 }
856 if (r == lastRank) {
857 relativeRank[s] = lastIndex;
858 } else {
859 if (i == lastIndex - 1) {
860 if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
861 seq[lastIndex] = seq[lastIndex + 1] - 1;
862 } else {
863 seq[lastIndex] = (unsigned int)-1;
864 }
865 }
866 lastIndex = i;
867 lastRank = r;
868 relativeRank[s] = i;
869 if (r == oldInverseSa0) {
870 oldInverseSa0RelativeRank = i;
871 oldInverseSa0++; // so that this segment of code is not run again
872 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
873 }
874 }
875 }
876
877 }
878
879 static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar,
880 const unsigned int *cumulativeCount)
881 {
882 unsigned int i, c;
883 unsigned int previousRank, currentRank;
884
885 previousRank = relativeRank[0];
886
887 for (i=1; i<=numChar; i++) {
888 currentRank = relativeRank[i];
889 c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
890 + (previousRank >= cumulativeCount[3]);
891 seq[currentRank] = c;
892 previousRank = currentRank;
893 }
894 }
895
896 static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
897 unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
898 {
899 unsigned int bitsInWordMinusBitPerChar;
900 unsigned int leftShift, rightShift;
901 unsigned int o;
902 unsigned int oIndex, iIndex, mIndex;
903 unsigned int mWord, mChar, oWord, oChar;
904 unsigned int numInsert;
905
906 bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
907
908 oIndex = 0;
909 iIndex = 0;
910 mIndex = 0;
911
912 mWord = 0;
913 mChar = 0;
914
915 mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration
916
917 while (oIndex < numOldBwt) {
918
919 // copy from insertBwt
920 while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
921 if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0
922 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
923 mIndex++;
924 mChar++;
925 if (mChar == CHAR_PER_WORD) {
926 mChar = 0;
927 mWord++;
928 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
929 }
930 }
931 iIndex++;
932 }
933
934 // Copy from oldBwt to mergedBwt
935 if (iIndex <= numInsertBwt) {
936 o = sortedRank[iIndex];
937 } else {
938 o = numOldBwt;
939 }
940 numInsert = o - oIndex;
941
942 oWord = oIndex / CHAR_PER_WORD;
943 oChar = oIndex - oWord * CHAR_PER_WORD;
944 if (oChar > mChar) {
945 leftShift = (oChar - mChar) * BIT_PER_CHAR;
946 rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
947 mergedBwt[mWord] = mergedBwt[mWord]
948 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
949 | (oldBwt[oWord+1] >> rightShift);
950 oIndex += min(numInsert, CHAR_PER_WORD - mChar);
951 while (o > oIndex) {
952 oWord++;
953 mWord++;
954 mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
955 oIndex += CHAR_PER_WORD;
956 }
957 } else if (oChar < mChar) {
958 rightShift = (mChar - oChar) * BIT_PER_CHAR;
959 leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
960 mergedBwt[mWord] = mergedBwt[mWord]
961 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
962 oIndex += min(numInsert, CHAR_PER_WORD - mChar);
963 while (o > oIndex) {
964 oWord++;
965 mWord++;
966 mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
967 oIndex += CHAR_PER_WORD;
968 }
969 } else { // oChar == mChar
970 mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
971 oIndex += min(numInsert, CHAR_PER_WORD - mChar);
972 while (o > oIndex) {
973 oWord++;
974 mWord++;
975 mergedBwt[mWord] = oldBwt[oWord];
976 oIndex += CHAR_PER_WORD;
977 }
978 }
979 oIndex = o;
980 mIndex += numInsert;
981
982 // Clear the trailing garbage in mergedBwt
983 mWord = mIndex / CHAR_PER_WORD;
984 mChar = mIndex - mWord * CHAR_PER_WORD;
985 if (mChar == 0) {
986 mergedBwt[mWord] = 0;
987 } else {
988 mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
989 }
990
991 }
992
993 // copy from insertBwt
994 while (iIndex <= numInsertBwt) {
995 if (sortedRank[iIndex] != 0) {
996 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
997 mIndex++;
998 mChar++;
999 if (mChar == CHAR_PER_WORD) {
1000 mChar = 0;
1001 mWord++;
1002 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
1003 }
1004 }
1005 iIndex++;
1006 }
1007 }
1008
1009 void BWTClearTrailingBwtCode(BWT *bwt)
1010 {
1011 unsigned int bwtResidentSizeInWord;
1012 unsigned int wordIndex, offset;
1013 unsigned int i;
1014
1015 bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
1016
1017 wordIndex = bwt->textLength / CHAR_PER_WORD;
1018 offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
1019 if (offset > 0) {
1020 bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
1021 } else {
1022 if (wordIndex < bwtResidentSizeInWord) {
1023 bwt->bwtCode[wordIndex] = 0;
1024 }
1025 }
1026
1027 for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
1028 bwt->bwtCode[i] = 0;
1029 }
1030 }
1031
1032
1033 void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue,
1034 unsigned int* __restrict occValueMajor,
1035 const unsigned int textLength, const unsigned int* decodeTable)
1036 {
1037 unsigned int numberOfOccValueMajor, numberOfOccValue;
1038 unsigned int wordBetweenOccValue;
1039 unsigned int numberOfOccIntervalPerMajor;
1040 unsigned int c;
1041 unsigned int i, j;
1042 unsigned int occMajorIndex;
1043 unsigned int occIndex, bwtIndex;
1044 unsigned int sum;
1045 unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
1046
1047 wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
1048
1049 // Calculate occValue
1050 // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256
1051 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
1052 numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
1053 numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
1054
1055 tempOccValue0[0] = 0;
1056 tempOccValue0[1] = 0;
1057 tempOccValue0[2] = 0;
1058 tempOccValue0[3] = 0;
1059 occValueMajor[0] = 0;
1060 occValueMajor[1] = 0;
1061 occValueMajor[2] = 0;
1062 occValueMajor[3] = 0;
1063
1064 occIndex = 0;
1065 bwtIndex = 0;
1066 for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
1067
1068 for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
1069
1070 sum = 0;
1071 tempOccValue1[0] = tempOccValue0[0];
1072 tempOccValue1[1] = tempOccValue0[1];
1073 tempOccValue1[2] = tempOccValue0[2];
1074 tempOccValue1[3] = tempOccValue0[3];
1075
1076 for (j=0; j<wordBetweenOccValue; j++) {
1077 c = bwt[bwtIndex];
1078 sum += decodeTable[c >> 16];
1079 sum += decodeTable[c & 0x0000FFFF];
1080 bwtIndex++;
1081 }
1082 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1083 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1084 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1085 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1086 tempOccValue1[3] += sum;
1087 } else {
1088 if (sum == 0x00000100) {
1089 tempOccValue1[0] += 256;
1090 } else if (sum == 0x00010000) {
1091 tempOccValue1[1] += 256;
1092 } else if (sum == 0x01000000) {
1093 tempOccValue1[2] += 256;
1094 } else {
1095 tempOccValue1[3] += 256;
1096 }
1097 }
1098 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1099 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1100 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1101 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1102 tempOccValue0[0] = tempOccValue1[0];
1103 tempOccValue0[1] = tempOccValue1[1];
1104 tempOccValue0[2] = tempOccValue1[2];
1105 tempOccValue0[3] = tempOccValue1[3];
1106 sum = 0;
1107
1108 occIndex++;
1109
1110 for (j=0; j<wordBetweenOccValue; j++) {
1111 c = bwt[bwtIndex];
1112 sum += decodeTable[c >> 16];
1113 sum += decodeTable[c & 0x0000FFFF];
1114 bwtIndex++;
1115 }
1116 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1117 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
1118 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
1119 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
1120 tempOccValue0[3] += sum;
1121 } else {
1122 if (sum == 0x00000100) {
1123 tempOccValue0[0] += 256;
1124 } else if (sum == 0x00010000) {
1125 tempOccValue0[1] += 256;
1126 } else if (sum == 0x01000000) {
1127 tempOccValue0[2] += 256;
1128 } else {
1129 tempOccValue0[3] += 256;
1130 }
1131 }
1132 }
1133
1134 occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
1135 occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
1136 occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
1137 occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
1138 tempOccValue0[0] = 0;
1139 tempOccValue0[1] = 0;
1140 tempOccValue0[2] = 0;
1141 tempOccValue0[3] = 0;
1142
1143 }
1144
1145 while (occIndex < (numberOfOccValue-1)/2) {
1146 sum = 0;
1147 tempOccValue1[0] = tempOccValue0[0];
1148 tempOccValue1[1] = tempOccValue0[1];
1149 tempOccValue1[2] = tempOccValue0[2];
1150 tempOccValue1[3] = tempOccValue0[3];
1151 for (j=0; j<wordBetweenOccValue; j++) {
1152 c = bwt[bwtIndex];
1153 sum += decodeTable[c >> 16];
1154 sum += decodeTable[c & 0x0000FFFF];
1155 bwtIndex++;
1156 }
1157 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1158 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1159 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1160 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1161 tempOccValue1[3] += sum;
1162 } else {
1163 if (sum == 0x00000100) {
1164 tempOccValue1[0] += 256;
1165 } else if (sum == 0x00010000) {
1166 tempOccValue1[1] += 256;
1167 } else if (sum == 0x01000000) {
1168 tempOccValue1[2] += 256;
1169 } else {
1170 tempOccValue1[3] += 256;
1171 }
1172 }
1173 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1174 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1175 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1176 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1177 tempOccValue0[0] = tempOccValue1[0];
1178 tempOccValue0[1] = tempOccValue1[1];
1179 tempOccValue0[2] = tempOccValue1[2];
1180 tempOccValue0[3] = tempOccValue1[3];
1181 sum = 0;
1182 occIndex++;
1183
1184 for (j=0; j<wordBetweenOccValue; j++) {
1185 c = bwt[bwtIndex];
1186 sum += decodeTable[c >> 16];
1187 sum += decodeTable[c & 0x0000FFFF];
1188 bwtIndex++;
1189 }
1190 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1191 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
1192 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
1193 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
1194 tempOccValue0[3] += sum;
1195 } else {
1196 if (sum == 0x00000100) {
1197 tempOccValue0[0] += 256;
1198 } else if (sum == 0x00010000) {
1199 tempOccValue0[1] += 256;
1200 } else if (sum == 0x01000000) {
1201 tempOccValue0[2] += 256;
1202 } else {
1203 tempOccValue0[3] += 256;
1204 }
1205 }
1206 }
1207
1208 sum = 0;
1209 tempOccValue1[0] = tempOccValue0[0];
1210 tempOccValue1[1] = tempOccValue0[1];
1211 tempOccValue1[2] = tempOccValue0[2];
1212 tempOccValue1[3] = tempOccValue0[3];
1213
1214 if (occIndex * 2 < numberOfOccValue - 1) {
1215 for (j=0; j<wordBetweenOccValue; j++) {
1216 c = bwt[bwtIndex];
1217 sum += decodeTable[c >> 16];
1218 sum += decodeTable[c & 0x0000FFFF];
1219 bwtIndex++;
1220 }
1221 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1222 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1223 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1224 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1225 tempOccValue1[3] += sum;
1226 } else {
1227 if (sum == 0x00000100) {
1228 tempOccValue1[0] += 256;
1229 } else if (sum == 0x00010000) {
1230 tempOccValue1[1] += 256;
1231 } else if (sum == 0x01000000) {
1232 tempOccValue1[2] += 256;
1233 } else {
1234 tempOccValue1[3] += 256;
1235 }
1236 }
1237 }
1238
1239 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1240 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1241 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1242 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1243
1244 }
1245
1246 static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
1247 {
1248 unsigned int i;
1249 unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
1250 unsigned int firstCharInThisIteration;
1251
1252 unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
1253 unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
1254
1255 #ifdef DEBUG
1256 if (numChar > bwtInc->buildSize) {
1257 fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
1258 exit(1);
1259 }
1260 #endif
1261
1262 mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
1263 mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
1264
1265 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
1266
1267 if (bwtInc->bwt->textLength == 0) { // Initial build
1268
1269 // Set address
1270 seq = bwtInc->workingMemory;
1271 relativeRank = seq + bwtInc->buildSize + 1;
1272 mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place
1273
1274 BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
1275
1276 firstCharInThisIteration = relativeRank[0];
1277 relativeRank[numChar] = 0;
1278
1279 // Sort suffix
1280 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
1281 newInverseSa0 = relativeRank[0];
1282
1283 // Clear BWT area
1284 initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
1285
1286 // Build BWT
1287 BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
1288
1289 // so that the cumulativeCount is not deducted
1290 bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
1291
1292 } else { // Incremental build
1293 // Set address
1294 sortedRank = bwtInc->workingMemory;
1295 seq = sortedRank + bwtInc->buildSize + 1;
1296 insertBwt = seq;
1297 relativeRank = seq + bwtInc->buildSize + 1;
1298
1299 // Store the first character of this iteration
1300 firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
1301
1302 // Count occurrence of input text
1303 ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
1304 // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
1305 bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
1306 bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
1307 bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
1308 bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
1309
1310 // Get rank of new suffix among processed suffix
1311 // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
1312 oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText,
1313 numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
1314
1315 // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
1316 for (i=0; i<ALPHABET_SIZE; i++) {
1317 if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
1318 bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
1319 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
1320 } else {
1321 if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
1322 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
1323 }
1324 if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
1325 BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
1326 }
1327 }
1328 }
1329
1330 // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
1331 // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
1332 BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
1333 #ifdef DEBUG
1334 if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
1335 fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
1336 exit(1);
1337 }
1338 #endif
1339
1340 // Sort suffix
1341 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);
1342
1343 newInverseSa0RelativeRank = relativeRank[0];
1344 newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
1345
1346 sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt
1347
1348 // Build BWT
1349 BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
1350
1351 // Merge BWT
1352 mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord
1353 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR;
1354 // minus numberOfIteration * occInterval to create a buffer for merging
1355 BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
1356
1357 }
1358
1359 // Build auxiliary structure and update info and pointers in BWT
1360 bwtInc->bwt->textLength += numChar;
1361 bwtInc->bwt->bwtCode = mergedBwt;
1362 bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
1363 bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
1364 if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
1365 fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
1366 exit(1);
1367 }
1368
1369 bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
1370
1371 BWTClearTrailingBwtCode(bwtInc->bwt);
1372 BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
1373 bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
1374
1375 bwtInc->bwt->inverseSa0 = newInverseSa0;
1376
1377 bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
1378 bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
1379 bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
1380 bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
1381
1382 bwtInc->firstCharInLastIteration = firstCharInThisIteration;
1383
1384 // Set build size and text address for the next build
1385 BWTIncSetBuildSizeAndTextAddr(bwtInc);
1386 bwtInc->numberOfIterationDone++;
1387
1388 }
1389
1390 BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
1391 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
1392 {
1393
1394 FILE *packedFile;
1395 unsigned int packedFileLen;
1396 unsigned int totalTextLength;
1397 unsigned int textToLoad, textSizeInByte;
1398 unsigned int processedTextLength;
1399 unsigned char lastByteLength;
1400
1401 BWTInc *bwtInc;
1402
1403 packedFile = (FILE*)fopen(inputFileName, "rb");
1404
1405 if (packedFile == NULL) {
1406 fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n");
1407 exit(1);
1408 }
1409
1410 fseek(packedFile, -1, SEEK_END);
1411 packedFileLen = ftell(packedFile);
1412 if ((int)packedFileLen < 0) {
1413 fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
1414 exit(1);
1415 }
1416 fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
1417 totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
1418
1419 bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize);
1420
1421 BWTIncSetBuildSizeAndTextAddr(bwtInc);
1422
1423 if (bwtInc->buildSize > totalTextLength) {
1424 textToLoad = totalTextLength;
1425 } else {
1426 textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
1427 }
1428 textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte
1429
1430 fseek(packedFile, -2, SEEK_CUR);
1431 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1432 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
1433 fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);
1434
1435 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
1436 BWTIncConstruct(bwtInc, textToLoad);
1437
1438 processedTextLength = textToLoad;
1439
1440 while (processedTextLength < totalTextLength) {
1441 textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
1442 if (textToLoad > totalTextLength - processedTextLength) {
1443 textToLoad = totalTextLength - processedTextLength;
1444 }
1445 textSizeInByte = textToLoad / CHAR_PER_BYTE;
1446 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1447 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
1448 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1449 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
1450 BWTIncConstruct(bwtInc, textToLoad);
1451 processedTextLength += textToLoad;
1452 if (bwtInc->numberOfIterationDone % 10 == 0) {
1453 printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
1454 bwtInc->numberOfIterationDone, processedTextLength);
1455 }
1456 }
1457 return bwtInc;
1458 }
1459
1460 void BWTFree(BWT *bwt)
1461 {
1462 if (bwt == 0) return;
1463 free(bwt->cumulativeFreq);
1464 free(bwt->bwtCode);
1465 free(bwt->occValue);
1466 free(bwt->occValueMajor);
1467 free(bwt->saValue);
1468 free(bwt->inverseSa);
1469 free(bwt->decodeTable);
1470 free(bwt->saIndexRange);
1471 free(bwt->saValueOnBoundary);
1472 free(bwt);
1473 }
1474
1475 void BWTIncFree(BWTInc *bwtInc)
1476 {
1477 if (bwtInc == 0) return;
1478 free(bwtInc->bwt);
1479 free(bwtInc->workingMemory);
1480 free(bwtInc);
1481 }
1482
1483 static unsigned int BWTFileSizeInWord(const unsigned int numChar)
1484 {
1485 // The $ in BWT at the position of inverseSa0 is not encoded
1486 return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
1487 }
1488
1489 void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
1490 {
1491 FILE *bwtFile;
1492 /* FILE *occValueFile; */
1493 unsigned int bwtLength;
1494
1495 bwtFile = (FILE*)fopen(bwtFileName, "wb");
1496 if (bwtFile == NULL) {
1497 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n");
1498 exit(1);
1499 }
1500
1501 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
1502 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
1503 bwtLength = BWTFileSizeInWord(bwt->textLength);
1504 fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
1505 fclose(bwtFile);
1506 /*
1507 occValueFile = (FILE*)fopen(occValueFileName, "wb");
1508 if (occValueFile == NULL) {
1509 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n");
1510 exit(1);
1511 }
1512
1513 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile);
1514 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile);
1515 fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile);
1516 fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile);
1517 fclose(occValueFile);
1518 */
1519 }
1520
1521 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
1522 {
1523 BWTInc *bwtInc;
1524 bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
1525 printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
1526 BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
1527 BWTIncFree(bwtInc);
1528 }
1529
1530 int bwt_bwtgen_main(int argc, char *argv[])
1531 {
1532 if (argc < 3) {
1533 fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
1534 return 1;
1535 }
1536 bwt_bwtgen(argv[1], argv[2]);
1537 return 0;
1538 }
1539
1540 #ifdef MAIN_BWT_GEN
1541
1542 int main(int argc, char *argv[])
1543 {
1544 return bwt_bwtgen_main(argc, argv);
1545 }
1546
1547 #endif