Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwt_gen/bwt_gen.c @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 /* | |
| 2 | |
| 3 BWTConstruct.c BWT-Index Construction | |
| 4 | |
| 5 This module constructs BWT and auxiliary data structures. | |
| 6 | |
| 7 Copyright (C) 2004, Wong Chi Kwong. | |
| 8 | |
| 9 This program is free software; you can redistribute it and/or | |
| 10 modify it under the terms of the GNU General Public License | |
| 11 as published by the Free Software Foundation; either version 2 | |
| 12 of the License, or (at your option) any later version. | |
| 13 | |
| 14 This program is distributed in the hope that it will be useful, | |
| 15 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 17 GNU General Public License for more details. | |
| 18 | |
| 19 You should have received a copy of the GNU General Public License | |
| 20 along with this program; if not, write to the Free Software | |
| 21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
| 22 | |
| 23 */ | |
| 24 | |
| 25 #include <stdio.h> | |
| 26 #include <stdlib.h> | |
| 27 #include <string.h> | |
| 28 #include "bwt_gen.h" | |
| 29 #include "QSufSort.h" | |
| 30 | |
| 31 static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, | |
| 32 unsigned int lastByteLength) | |
| 33 { | |
| 34 if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { | |
| 35 fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); | |
| 36 exit(1); | |
| 37 } | |
| 38 return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; | |
| 39 } | |
| 40 | |
| 41 static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) | |
| 42 { | |
| 43 unsigned int i; | |
| 44 for (i=0; i<length; i++) startAddr[i] = initValue; | |
| 45 } | |
| 46 | |
| 47 static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable) | |
| 48 { | |
| 49 unsigned int i, j, c, t; | |
| 50 | |
| 51 for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) { | |
| 52 dnaDecodeTable[i] = 0; | |
| 53 c = i; | |
| 54 for (j=0; j<8; j++) { | |
| 55 t = c & 0x00000003; | |
| 56 dnaDecodeTable[i] += 1 << (t * 8); | |
| 57 c >>= 2; | |
| 58 } | |
| 59 } | |
| 60 | |
| 61 } | |
| 62 // for BWTIncCreate() | |
| 63 static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar) | |
| 64 { | |
| 65 unsigned int numOfOccValue; | |
| 66 unsigned int numOfOccIntervalPerMajor; | |
| 67 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
| 68 numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; | |
| 69 return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; | |
| 70 } | |
| 71 // for BWTIncCreate() | |
| 72 static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar) | |
| 73 { | |
| 74 unsigned int numOfOccValue; | |
| 75 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
| 76 return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; | |
| 77 } | |
| 78 // for BWTIncCreate() | |
| 79 static unsigned int BWTResidentSizeInWord(const unsigned int numChar) { | |
| 80 | |
| 81 unsigned int numCharRoundUpToOccInterval; | |
| 82 | |
| 83 // The $ in BWT at the position of inverseSa0 is not encoded | |
| 84 numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; | |
| 85 | |
| 86 return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; | |
| 87 | |
| 88 } | |
| 89 | |
| 90 static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) | |
| 91 { | |
| 92 unsigned int maxBuildSize; | |
| 93 | |
| 94 if (bwtInc->bwt->textLength == 0) { | |
| 95 // initial build | |
| 96 // Minus 2 because n+1 entries of seq and rank needed for n char | |
| 97 maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD) | |
| 98 / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD; | |
| 99 if (bwtInc->initialMaxBuildSize > 0) { | |
| 100 bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); | |
| 101 } else { | |
| 102 bwtInc->buildSize = maxBuildSize; | |
| 103 } | |
| 104 } else { | |
| 105 // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char | |
| 106 // Minus numberOfIterationDone because bwt slightly shift to left in each iteration | |
| 107 maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 | |
| 108 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) | |
| 109 / 3; | |
| 110 if (maxBuildSize < CHAR_PER_WORD) { | |
| 111 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); | |
| 112 exit(1); | |
| 113 } | |
| 114 if (bwtInc->incMaxBuildSize > 0) { | |
| 115 bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); | |
| 116 } else { | |
| 117 bwtInc->buildSize = maxBuildSize; | |
| 118 } | |
| 119 if (bwtInc->buildSize < CHAR_PER_WORD) { | |
| 120 bwtInc->buildSize = CHAR_PER_WORD; | |
| 121 } | |
| 122 } | |
| 123 | |
| 124 if (bwtInc->buildSize < CHAR_PER_WORD) { | |
| 125 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); | |
| 126 exit(1); | |
| 127 } | |
| 128 | |
| 129 bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; | |
| 130 | |
| 131 bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1); | |
| 132 bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1); | |
| 133 | |
| 134 } | |
| 135 | |
| 136 // for ceilLog2() | |
| 137 unsigned int leadingZero(const unsigned int input) | |
| 138 { | |
| 139 unsigned int l; | |
| 140 const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |
| 141 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
| 142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
| 143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
| 144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 146 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 147 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; | |
| 148 | |
| 149 if (input & 0xFFFF0000) { | |
| 150 if (input & 0xFF000000) { | |
| 151 l = leadingZero8bit[input >> 24]; | |
| 152 } else { | |
| 153 l = 8 + leadingZero8bit[input >> 16]; | |
| 154 } | |
| 155 } else { | |
| 156 if (input & 0x0000FF00) { | |
| 157 l = 16 + leadingZero8bit[input >> 8]; | |
| 158 } else { | |
| 159 l = 24 + leadingZero8bit[input]; | |
| 160 } | |
| 161 } | |
| 162 return l; | |
| 163 | |
| 164 } | |
| 165 // for BitPerBytePackedChar() | |
| 166 static unsigned int ceilLog2(const unsigned int input) | |
| 167 { | |
| 168 if (input <= 1) return 0; | |
| 169 return BITS_IN_WORD - leadingZero(input - 1); | |
| 170 | |
| 171 } | |
| 172 // for ConvertBytePackedToWordPacked() | |
| 173 static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) | |
| 174 { | |
| 175 unsigned int bitPerChar; | |
| 176 bitPerChar = ceilLog2(alphabetSize); | |
| 177 // Return the largest number of bit that does not affect packing efficiency | |
| 178 if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) | |
| 179 bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); | |
| 180 return bitPerChar; | |
| 181 } | |
| 182 // for ConvertBytePackedToWordPacked() | |
| 183 static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) | |
| 184 { | |
| 185 return ceilLog2(alphabetSize); | |
| 186 } | |
| 187 | |
| 188 static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, | |
| 189 const unsigned int textLength) | |
| 190 { | |
| 191 unsigned int i, j, k; | |
| 192 unsigned int c; | |
| 193 unsigned int bitPerBytePackedChar; | |
| 194 unsigned int bitPerWordPackedChar; | |
| 195 unsigned int charPerWord; | |
| 196 unsigned int charPerByte; | |
| 197 unsigned int bytePerIteration; | |
| 198 unsigned int byteProcessed = 0; | |
| 199 unsigned int wordProcessed = 0; | |
| 200 unsigned int mask, shift; | |
| 201 | |
| 202 unsigned int buffer[BITS_IN_WORD]; | |
| 203 | |
| 204 bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); | |
| 205 bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); | |
| 206 charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; | |
| 207 charPerWord = BITS_IN_WORD / bitPerWordPackedChar; | |
| 208 | |
| 209 bytePerIteration = charPerWord / charPerByte; | |
| 210 mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); | |
| 211 shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; | |
| 212 | |
| 213 while ((wordProcessed + 1) * charPerWord < textLength) { | |
| 214 | |
| 215 k = 0; | |
| 216 for (i=0; i<bytePerIteration; i++) { | |
| 217 c = (unsigned int)input[byteProcessed] << shift; | |
| 218 for (j=0; j<charPerByte; j++) { | |
| 219 buffer[k] = c & mask; | |
| 220 c <<= bitPerBytePackedChar; | |
| 221 k++; | |
| 222 } | |
| 223 byteProcessed++; | |
| 224 } | |
| 225 | |
| 226 c = 0; | |
| 227 for (i=0; i<charPerWord; i++) { | |
| 228 c |= buffer[i] >> bitPerWordPackedChar * i; | |
| 229 } | |
| 230 output[wordProcessed] = c; | |
| 231 wordProcessed++; | |
| 232 | |
| 233 } | |
| 234 | |
| 235 k = 0; | |
| 236 for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { | |
| 237 c = (unsigned int)input[byteProcessed] << shift; | |
| 238 for (j=0; j<charPerByte; j++) { | |
| 239 buffer[k] = c & mask; | |
| 240 c <<= bitPerBytePackedChar; | |
| 241 k++; | |
| 242 } | |
| 243 byteProcessed++; | |
| 244 } | |
| 245 | |
| 246 c = 0; | |
| 247 for (i=0; i<textLength - wordProcessed * charPerWord; i++) { | |
| 248 c |= buffer[i] >> bitPerWordPackedChar * i; | |
| 249 } | |
| 250 output[wordProcessed] = c; | |
| 251 } | |
| 252 | |
| 253 BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) | |
| 254 { | |
| 255 BWT *bwt; | |
| 256 | |
| 257 bwt = (BWT*)calloc(1, sizeof(BWT)); | |
| 258 | |
| 259 bwt->textLength = 0; | |
| 260 bwt->inverseSa = 0; | |
| 261 | |
| 262 bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); | |
| 263 initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); | |
| 264 | |
| 265 bwt->bwtSizeInWord = 0; | |
| 266 bwt->saValueOnBoundary = NULL; | |
| 267 | |
| 268 // Generate decode tables | |
| 269 if (decodeTable == NULL) { | |
| 270 bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); | |
| 271 GenerateDNAOccCountTable(bwt->decodeTable); | |
| 272 } else { | |
| 273 bwt->decodeTable = decodeTable; | |
| 274 } | |
| 275 | |
| 276 bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); | |
| 277 bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int)); | |
| 278 | |
| 279 bwt->occSizeInWord = 0; | |
| 280 bwt->occValue = NULL; | |
| 281 | |
| 282 bwt->saInterval = ALL_ONE_MASK; | |
| 283 bwt->saValueSize = 0; | |
| 284 bwt->saValue = NULL; | |
| 285 | |
| 286 bwt->inverseSaInterval = ALL_ONE_MASK; | |
| 287 bwt->inverseSaSize = 0; | |
| 288 bwt->inverseSa = NULL; | |
| 289 | |
| 290 return bwt; | |
| 291 } | |
| 292 | |
| 293 BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, | |
| 294 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) | |
| 295 { | |
| 296 BWTInc *bwtInc; | |
| 297 unsigned int i; | |
| 298 | |
| 299 if (targetNBit == 0) { | |
| 300 fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n"); | |
| 301 exit(1); | |
| 302 } | |
| 303 | |
| 304 bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); | |
| 305 bwtInc->numberOfIterationDone = 0; | |
| 306 bwtInc->bwt = BWTCreate(textLength, NULL); | |
| 307 bwtInc->initialMaxBuildSize = initialMaxBuildSize; | |
| 308 bwtInc->incMaxBuildSize = incMaxBuildSize; | |
| 309 bwtInc->targetNBit = targetNBit; | |
| 310 bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int)); | |
| 311 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); | |
| 312 | |
| 313 // Build frequently accessed data | |
| 314 bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); | |
| 315 for (i=0; i<CHAR_PER_WORD; i++) { | |
| 316 bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; | |
| 317 } | |
| 318 | |
| 319 bwtInc->targetTextLength = textLength; | |
| 320 bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); | |
| 321 if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { | |
| 322 fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); | |
| 323 exit(1); | |
| 324 } | |
| 325 bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); | |
| 326 | |
| 327 return bwtInc; | |
| 328 | |
| 329 } | |
| 330 // for BWTIncConstruct() | |
| 331 static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank, | |
| 332 unsigned int* __restrict cumulativeCount, const unsigned int numChar) | |
| 333 { | |
| 334 unsigned int i, j; | |
| 335 unsigned int c, t; | |
| 336 unsigned int packedMask; | |
| 337 unsigned int rankIndex; | |
| 338 unsigned int lastWord, numCharInLastWord; | |
| 339 | |
| 340 lastWord = (numChar - 1) / CHAR_PER_WORD; | |
| 341 numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; | |
| 342 | |
| 343 packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); | |
| 344 rankIndex = numChar - 1; | |
| 345 | |
| 346 t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); | |
| 347 for (i=0; i<numCharInLastWord; i++) { | |
| 348 c = t & packedMask; | |
| 349 cumulativeCount[c+1]++; | |
| 350 rank[rankIndex] = c; | |
| 351 rankIndex--; | |
| 352 t >>= BIT_PER_CHAR; | |
| 353 } | |
| 354 | |
| 355 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 | |
| 356 t = packedText[i]; | |
| 357 for (j=0; j<CHAR_PER_WORD; j++) { | |
| 358 c = t & packedMask; | |
| 359 cumulativeCount[c+1]++; | |
| 360 rank[rankIndex] = c; | |
| 361 rankIndex--; | |
| 362 t >>= BIT_PER_CHAR; | |
| 363 } | |
| 364 } | |
| 365 | |
| 366 // Convert occurrence to cumulativeCount | |
| 367 cumulativeCount[2] += cumulativeCount[1]; | |
| 368 cumulativeCount[3] += cumulativeCount[2]; | |
| 369 cumulativeCount[4] += cumulativeCount[3]; | |
| 370 } | |
| 371 | |
| 372 | |
| 373 static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, | |
| 374 unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) | |
| 375 { | |
| 376 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, | |
| 377 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, | |
| 378 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, | |
| 379 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; | |
| 380 | |
| 381 unsigned int iteration, wordToCount, charToCount; | |
| 382 unsigned int i, j, c; | |
| 383 unsigned int sum; | |
| 384 | |
| 385 occCount[0] = 0; | |
| 386 occCount[1] = 0; | |
| 387 occCount[2] = 0; | |
| 388 occCount[3] = 0; | |
| 389 | |
| 390 iteration = index / 256; | |
| 391 wordToCount = (index - iteration * 256) / 16; | |
| 392 charToCount = index - iteration * 256 - wordToCount * 16; | |
| 393 | |
| 394 for (i=0; i<iteration; i++) { | |
| 395 | |
| 396 sum = 0; | |
| 397 for (j=0; j<16; j++) { | |
| 398 sum += dnaDecodeTable[*dna >> 16]; | |
| 399 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
| 400 dna++; | |
| 401 } | |
| 402 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
| 403 occCount[0] += sum & 0x000000FF; sum >>= 8; | |
| 404 occCount[1] += sum & 0x000000FF; sum >>= 8; | |
| 405 occCount[2] += sum & 0x000000FF; sum >>= 8; | |
| 406 occCount[3] += sum; | |
| 407 } else { | |
| 408 // only some or all of the 3 bits are on | |
| 409 // in reality, only one of the four cases are possible | |
| 410 if (sum == 0x00000100) { | |
| 411 occCount[0] += 256; | |
| 412 } else if (sum == 0x00010000) { | |
| 413 occCount[1] += 256; | |
| 414 } else if (sum == 0x01000000) { | |
| 415 occCount[2] += 256; | |
| 416 } else if (sum == 0x00000000) { | |
| 417 occCount[3] += 256; | |
| 418 } else { | |
| 419 fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); | |
| 420 exit(1); | |
| 421 } | |
| 422 } | |
| 423 | |
| 424 } | |
| 425 | |
| 426 sum = 0; | |
| 427 for (j=0; j<wordToCount; j++) { | |
| 428 sum += dnaDecodeTable[*dna >> 16]; | |
| 429 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
| 430 dna++; | |
| 431 } | |
| 432 | |
| 433 if (charToCount > 0) { | |
| 434 c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; | |
| 435 sum += dnaDecodeTable[c >> 16]; | |
| 436 sum += dnaDecodeTable[c & 0xFFFF]; | |
| 437 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
| 438 } | |
| 439 | |
| 440 occCount[0] += sum & 0x000000FF; sum >>= 8; | |
| 441 occCount[1] += sum & 0x000000FF; sum >>= 8; | |
| 442 occCount[2] += sum & 0x000000FF; sum >>= 8; | |
| 443 occCount[3] += sum; | |
| 444 } | |
| 445 | |
| 446 static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar, | |
| 447 const unsigned int *cumulativeCount, const unsigned int *packedShift) { | |
| 448 | |
| 449 unsigned int i, c, r; | |
| 450 unsigned int previousRank, currentRank; | |
| 451 unsigned int wordIndex, charIndex; | |
| 452 unsigned int inverseSa0; | |
| 453 | |
| 454 inverseSa0 = previousRank = relativeRank[0]; | |
| 455 | |
| 456 for (i=1; i<=numChar; i++) { | |
| 457 currentRank = relativeRank[i]; | |
| 458 // previousRank > cumulativeCount[c] because $ is one of the char | |
| 459 c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) | |
| 460 + (previousRank > cumulativeCount[3]); | |
| 461 // set bwt for currentRank | |
| 462 if (c > 0) { | |
| 463 // c <> 'a' | |
| 464 r = currentRank; | |
| 465 if (r > inverseSa0) { | |
| 466 // - 1 because $ at inverseSa0 is not encoded | |
| 467 r--; | |
| 468 } | |
| 469 wordIndex = r / CHAR_PER_WORD; | |
| 470 charIndex = r - wordIndex * CHAR_PER_WORD; | |
| 471 bwt[wordIndex] |= c << packedShift[charIndex]; | |
| 472 } | |
| 473 previousRank = currentRank; | |
| 474 } | |
| 475 } | |
| 476 | |
| 477 static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, | |
| 478 const unsigned int character) | |
| 479 { | |
| 480 unsigned int occIndexMajor; | |
| 481 | |
| 482 occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; | |
| 483 | |
| 484 if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { | |
| 485 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + | |
| 486 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); | |
| 487 | |
| 488 } else { | |
| 489 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + | |
| 490 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); | |
| 491 } | |
| 492 } | |
| 493 | |
| 494 | |
| 495 static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, | |
| 496 const unsigned int* dnaDecodeTable) | |
| 497 { | |
| 498 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, | |
| 499 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, | |
| 500 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, | |
| 501 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; | |
| 502 | |
| 503 unsigned int wordToCount, charToCount; | |
| 504 unsigned int i, c; | |
| 505 unsigned int sum = 0; | |
| 506 | |
| 507 wordToCount = index / 16; | |
| 508 charToCount = index - wordToCount * 16; | |
| 509 | |
| 510 for (i=0; i<wordToCount; i++) { | |
| 511 sum += dnaDecodeTable[dna[i] >> 16]; | |
| 512 sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; | |
| 513 } | |
| 514 | |
| 515 if (charToCount > 0) { | |
| 516 c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; | |
| 517 sum += dnaDecodeTable[c >> 16]; | |
| 518 sum += dnaDecodeTable[c & 0xFFFF]; | |
| 519 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
| 520 } | |
| 521 | |
| 522 return (sum >> (character * 8)) & 0x000000FF; | |
| 523 | |
| 524 } | |
| 525 | |
| 526 static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, | |
| 527 const unsigned int* dnaDecodeTable) | |
| 528 { | |
| 529 static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, | |
| 530 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, | |
| 531 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, | |
| 532 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; | |
| 533 | |
| 534 unsigned int wordToCount, charToCount; | |
| 535 unsigned int i, c; | |
| 536 unsigned int sum = 0; | |
| 537 | |
| 538 wordToCount = index / 16; | |
| 539 charToCount = index - wordToCount * 16; | |
| 540 | |
| 541 dna -= wordToCount + 1; | |
| 542 | |
| 543 if (charToCount > 0) { | |
| 544 c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; | |
| 545 sum += dnaDecodeTable[c >> 16]; | |
| 546 sum += dnaDecodeTable[c & 0xFFFF]; | |
| 547 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
| 548 } | |
| 549 | |
| 550 for (i=0; i<wordToCount; i++) { | |
| 551 dna++; | |
| 552 sum += dnaDecodeTable[*dna >> 16]; | |
| 553 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
| 554 } | |
| 555 | |
| 556 return (sum >> (character * 8)) & 0x000000FF; | |
| 557 | |
| 558 } | |
| 559 | |
| 560 unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) { | |
| 561 | |
| 562 unsigned int occValue; | |
| 563 unsigned int occExplicitIndex, occIndex; | |
| 564 | |
| 565 // $ is supposed to be positioned at inverseSa0 but it is not encoded | |
| 566 // therefore index is subtracted by 1 for adjustment | |
| 567 if (index > bwt->inverseSa0) { | |
| 568 index--; | |
| 569 } | |
| 570 | |
| 571 occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding | |
| 572 occIndex = occExplicitIndex * OCC_INTERVAL; | |
| 573 occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); | |
| 574 | |
| 575 if (occIndex == index) { | |
| 576 return occValue; | |
| 577 } | |
| 578 | |
| 579 if (occIndex < index) { | |
| 580 return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); | |
| 581 } else { | |
| 582 return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); | |
| 583 } | |
| 584 | |
| 585 } | |
| 586 | |
| 587 static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq, | |
| 588 const unsigned int *packedText, const unsigned int numChar, | |
| 589 const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration) | |
| 590 { | |
| 591 unsigned int saIndex; | |
| 592 unsigned int lastWord; | |
| 593 unsigned int packedMask; | |
| 594 unsigned int i, j; | |
| 595 unsigned int c, t; | |
| 596 unsigned int rankIndex; | |
| 597 unsigned int shift; | |
| 598 unsigned int seqIndexFromStart[ALPHABET_SIZE]; | |
| 599 unsigned int seqIndexFromEnd[ALPHABET_SIZE]; | |
| 600 | |
| 601 for (i=0; i<ALPHABET_SIZE; i++) { | |
| 602 seqIndexFromStart[i] = cumulativeCount[i]; | |
| 603 seqIndexFromEnd[i] = cumulativeCount[i+1] - 1; | |
| 604 } | |
| 605 | |
| 606 shift = BITS_IN_WORD - BIT_PER_CHAR; | |
| 607 packedMask = ALL_ONE_MASK >> shift; | |
| 608 saIndex = bwt->inverseSa0; | |
| 609 rankIndex = numChar - 1; | |
| 610 | |
| 611 lastWord = numChar / CHAR_PER_WORD; | |
| 612 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 | |
| 613 t = packedText[i]; | |
| 614 for (j=0; j<CHAR_PER_WORD; j++) { | |
| 615 c = t & packedMask; | |
| 616 saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; | |
| 617 // A counting sort using the first character of suffix is done here | |
| 618 // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 | |
| 619 if (saIndex > bwt->inverseSa0) { | |
| 620 seq[seqIndexFromEnd[c]] = rankIndex; | |
| 621 absoluteRank[seqIndexFromEnd[c]] = saIndex; | |
| 622 seqIndexFromEnd[c]--; | |
| 623 } else { | |
| 624 seq[seqIndexFromStart[c]] = rankIndex; | |
| 625 absoluteRank[seqIndexFromStart[c]] = saIndex; | |
| 626 seqIndexFromStart[c]++; | |
| 627 } | |
| 628 rankIndex--; | |
| 629 t >>= BIT_PER_CHAR; | |
| 630 } | |
| 631 } | |
| 632 | |
| 633 absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters | |
| 634 seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; | |
| 635 | |
| 636 return seqIndexFromStart[firstCharInLastIteration]; | |
| 637 } | |
| 638 | |
| 639 static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem) | |
| 640 { | |
| 641 #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD | |
| 642 | |
| 643 int lowIndex, highIndex, midIndex; | |
| 644 int lowPartitionIndex, highPartitionIndex; | |
| 645 int lowStack[32], highStack[32]; | |
| 646 int stackDepth; | |
| 647 int i, j; | |
| 648 unsigned int tempSeq, tempKey; | |
| 649 int numberOfEqualKey; | |
| 650 | |
| 651 if (numItem < 2) return; | |
| 652 | |
| 653 stackDepth = 0; | |
| 654 | |
| 655 lowIndex = 0; | |
| 656 highIndex = numItem - 1; | |
| 657 | |
| 658 for (;;) { | |
| 659 | |
| 660 for (;;) { | |
| 661 | |
| 662 // Sort small array of data | |
| 663 if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays | |
| 664 for (i=lowIndex+1; i<=highIndex; i++) { | |
| 665 tempSeq = seq[i]; | |
| 666 tempKey = key[i]; | |
| 667 for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { | |
| 668 seq[j] = seq[j-1]; | |
| 669 key[j] = key[j-1]; | |
| 670 } | |
| 671 if (j != i) { | |
| 672 seq[j] = tempSeq; | |
| 673 key[j] = tempKey; | |
| 674 } | |
| 675 } | |
| 676 break; | |
| 677 } | |
| 678 | |
| 679 // Choose pivot as median of the lowest, middle, and highest data; sort the three data | |
| 680 | |
| 681 midIndex = average(lowIndex, highIndex); | |
| 682 if (key[lowIndex] > key[midIndex]) { | |
| 683 tempSeq = seq[lowIndex]; | |
| 684 tempKey = key[lowIndex]; | |
| 685 seq[lowIndex] = seq[midIndex]; | |
| 686 key[lowIndex] = key[midIndex]; | |
| 687 seq[midIndex] = tempSeq; | |
| 688 key[midIndex] = tempKey; | |
| 689 } | |
| 690 if (key[lowIndex] > key[highIndex]) { | |
| 691 tempSeq = seq[lowIndex]; | |
| 692 tempKey = key[lowIndex]; | |
| 693 seq[lowIndex] = seq[highIndex]; | |
| 694 key[lowIndex] = key[highIndex]; | |
| 695 seq[highIndex] = tempSeq; | |
| 696 key[highIndex] = tempKey; | |
| 697 } | |
| 698 if (key[midIndex] > key[highIndex]) { | |
| 699 tempSeq = seq[midIndex]; | |
| 700 tempKey = key[midIndex]; | |
| 701 seq[midIndex] = seq[highIndex]; | |
| 702 key[midIndex] = key[highIndex]; | |
| 703 seq[highIndex] = tempSeq; | |
| 704 key[highIndex] = tempKey; | |
| 705 } | |
| 706 | |
| 707 // Partition data | |
| 708 | |
| 709 numberOfEqualKey = 0; | |
| 710 | |
| 711 lowPartitionIndex = lowIndex + 1; | |
| 712 highPartitionIndex = highIndex - 1; | |
| 713 | |
| 714 for (;;) { | |
| 715 while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { | |
| 716 numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); | |
| 717 lowPartitionIndex++; | |
| 718 } | |
| 719 while (lowPartitionIndex < highPartitionIndex) { | |
| 720 if (key[midIndex] >= key[highPartitionIndex]) { | |
| 721 numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); | |
| 722 break; | |
| 723 } | |
| 724 highPartitionIndex--; | |
| 725 } | |
| 726 if (lowPartitionIndex >= highPartitionIndex) { | |
| 727 break; | |
| 728 } | |
| 729 tempSeq = seq[lowPartitionIndex]; | |
| 730 tempKey = key[lowPartitionIndex]; | |
| 731 seq[lowPartitionIndex] = seq[highPartitionIndex]; | |
| 732 key[lowPartitionIndex] = key[highPartitionIndex]; | |
| 733 seq[highPartitionIndex] = tempSeq; | |
| 734 key[highPartitionIndex] = tempKey; | |
| 735 if (highPartitionIndex == midIndex) { | |
| 736 // partition key has been moved | |
| 737 midIndex = lowPartitionIndex; | |
| 738 } | |
| 739 lowPartitionIndex++; | |
| 740 highPartitionIndex--; | |
| 741 } | |
| 742 | |
| 743 // Adjust the partition index | |
| 744 highPartitionIndex = lowPartitionIndex; | |
| 745 lowPartitionIndex--; | |
| 746 | |
| 747 // move the partition key to end of low partition | |
| 748 tempSeq = seq[midIndex]; | |
| 749 tempKey = key[midIndex]; | |
| 750 seq[midIndex] = seq[lowPartitionIndex]; | |
| 751 key[midIndex] = key[lowPartitionIndex]; | |
| 752 seq[lowPartitionIndex] = tempSeq; | |
| 753 key[lowPartitionIndex] = tempKey; | |
| 754 | |
| 755 if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { | |
| 756 | |
| 757 // Many keys = partition key; separate the equal key data from the lower partition | |
| 758 | |
| 759 midIndex = lowIndex; | |
| 760 | |
| 761 for (;;) { | |
| 762 while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { | |
| 763 midIndex++; | |
| 764 } | |
| 765 while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { | |
| 766 lowPartitionIndex--; | |
| 767 } | |
| 768 if (midIndex >= lowPartitionIndex) { | |
| 769 break; | |
| 770 } | |
| 771 tempSeq = seq[midIndex]; | |
| 772 tempKey = key[midIndex]; | |
| 773 seq[midIndex] = seq[lowPartitionIndex - 1]; | |
| 774 key[midIndex] = key[lowPartitionIndex - 1]; | |
| 775 seq[lowPartitionIndex - 1] = tempSeq; | |
| 776 key[lowPartitionIndex - 1] = tempKey; | |
| 777 midIndex++; | |
| 778 lowPartitionIndex--; | |
| 779 } | |
| 780 | |
| 781 } | |
| 782 | |
| 783 if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { | |
| 784 // put the larger partition to stack | |
| 785 lowStack[stackDepth] = lowIndex; | |
| 786 highStack[stackDepth] = lowPartitionIndex - 1; | |
| 787 stackDepth++; | |
| 788 // sort the smaller partition first | |
| 789 lowIndex = highPartitionIndex; | |
| 790 } else { | |
| 791 // put the larger partition to stack | |
| 792 lowStack[stackDepth] = highPartitionIndex; | |
| 793 highStack[stackDepth] = highIndex; | |
| 794 stackDepth++; | |
| 795 // sort the smaller partition first | |
| 796 if (lowPartitionIndex > lowIndex) { | |
| 797 highIndex = lowPartitionIndex - 1; | |
| 798 } else { | |
| 799 // all keys in the partition equals to the partition key | |
| 800 break; | |
| 801 } | |
| 802 } | |
| 803 continue; | |
| 804 } | |
| 805 | |
| 806 // Pop a range from stack | |
| 807 if (stackDepth > 0) { | |
| 808 stackDepth--; | |
| 809 lowIndex = lowStack[stackDepth]; | |
| 810 highIndex = highStack[stackDepth]; | |
| 811 continue; | |
| 812 } else return; | |
| 813 } | |
| 814 } | |
| 815 | |
| 816 | |
| 817 static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq, | |
| 818 unsigned int* __restrict relativeRank, const unsigned int numItem, | |
| 819 unsigned int oldInverseSa0, const unsigned int *cumulativeCount) | |
| 820 { | |
| 821 unsigned int i, c; | |
| 822 unsigned int s, r; | |
| 823 unsigned int lastRank, lastIndex; | |
| 824 unsigned int oldInverseSa0RelativeRank = 0; | |
| 825 unsigned int freq; | |
| 826 | |
| 827 lastIndex = numItem; | |
| 828 lastRank = sortedRank[numItem]; | |
| 829 if (lastRank > oldInverseSa0) { | |
| 830 sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt | |
| 831 } | |
| 832 s = seq[numItem]; | |
| 833 relativeRank[s] = numItem; | |
| 834 if (lastRank == oldInverseSa0) { | |
| 835 oldInverseSa0RelativeRank = numItem; | |
| 836 oldInverseSa0++; // so that this segment of code is not run again | |
| 837 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item | |
| 838 } | |
| 839 | |
| 840 c = ALPHABET_SIZE - 1; | |
| 841 freq = cumulativeCount[c]; | |
| 842 | |
| 843 for (i=numItem; i--;) { // from numItem - 1 to 0 | |
| 844 r = sortedRank[i]; | |
| 845 if (r > oldInverseSa0) { | |
| 846 sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt | |
| 847 } | |
| 848 s = seq[i]; | |
| 849 if (i < freq) { | |
| 850 if (lastIndex >= freq) { | |
| 851 lastRank++; // to trigger the group across alphabet boundary to be split | |
| 852 } | |
| 853 c--; | |
| 854 freq = cumulativeCount[c]; | |
| 855 } | |
| 856 if (r == lastRank) { | |
| 857 relativeRank[s] = lastIndex; | |
| 858 } else { | |
| 859 if (i == lastIndex - 1) { | |
| 860 if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) { | |
| 861 seq[lastIndex] = seq[lastIndex + 1] - 1; | |
| 862 } else { | |
| 863 seq[lastIndex] = (unsigned int)-1; | |
| 864 } | |
| 865 } | |
| 866 lastIndex = i; | |
| 867 lastRank = r; | |
| 868 relativeRank[s] = i; | |
| 869 if (r == oldInverseSa0) { | |
| 870 oldInverseSa0RelativeRank = i; | |
| 871 oldInverseSa0++; // so that this segment of code is not run again | |
| 872 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item | |
| 873 } | |
| 874 } | |
| 875 } | |
| 876 | |
| 877 } | |
| 878 | |
| 879 static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar, | |
| 880 const unsigned int *cumulativeCount) | |
| 881 { | |
| 882 unsigned int i, c; | |
| 883 unsigned int previousRank, currentRank; | |
| 884 | |
| 885 previousRank = relativeRank[0]; | |
| 886 | |
| 887 for (i=1; i<=numChar; i++) { | |
| 888 currentRank = relativeRank[i]; | |
| 889 c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) | |
| 890 + (previousRank >= cumulativeCount[3]); | |
| 891 seq[currentRank] = c; | |
| 892 previousRank = currentRank; | |
| 893 } | |
| 894 } | |
| 895 | |
| 896 static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, | |
| 897 unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt) | |
| 898 { | |
| 899 unsigned int bitsInWordMinusBitPerChar; | |
| 900 unsigned int leftShift, rightShift; | |
| 901 unsigned int o; | |
| 902 unsigned int oIndex, iIndex, mIndex; | |
| 903 unsigned int mWord, mChar, oWord, oChar; | |
| 904 unsigned int numInsert; | |
| 905 | |
| 906 bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; | |
| 907 | |
| 908 oIndex = 0; | |
| 909 iIndex = 0; | |
| 910 mIndex = 0; | |
| 911 | |
| 912 mWord = 0; | |
| 913 mChar = 0; | |
| 914 | |
| 915 mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration | |
| 916 | |
| 917 while (oIndex < numOldBwt) { | |
| 918 | |
| 919 // copy from insertBwt | |
| 920 while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { | |
| 921 if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 | |
| 922 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); | |
| 923 mIndex++; | |
| 924 mChar++; | |
| 925 if (mChar == CHAR_PER_WORD) { | |
| 926 mChar = 0; | |
| 927 mWord++; | |
| 928 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary | |
| 929 } | |
| 930 } | |
| 931 iIndex++; | |
| 932 } | |
| 933 | |
| 934 // Copy from oldBwt to mergedBwt | |
| 935 if (iIndex <= numInsertBwt) { | |
| 936 o = sortedRank[iIndex]; | |
| 937 } else { | |
| 938 o = numOldBwt; | |
| 939 } | |
| 940 numInsert = o - oIndex; | |
| 941 | |
| 942 oWord = oIndex / CHAR_PER_WORD; | |
| 943 oChar = oIndex - oWord * CHAR_PER_WORD; | |
| 944 if (oChar > mChar) { | |
| 945 leftShift = (oChar - mChar) * BIT_PER_CHAR; | |
| 946 rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; | |
| 947 mergedBwt[mWord] = mergedBwt[mWord] | |
| 948 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) | |
| 949 | (oldBwt[oWord+1] >> rightShift); | |
| 950 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
| 951 while (o > oIndex) { | |
| 952 oWord++; | |
| 953 mWord++; | |
| 954 mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); | |
| 955 oIndex += CHAR_PER_WORD; | |
| 956 } | |
| 957 } else if (oChar < mChar) { | |
| 958 rightShift = (mChar - oChar) * BIT_PER_CHAR; | |
| 959 leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; | |
| 960 mergedBwt[mWord] = mergedBwt[mWord] | |
| 961 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); | |
| 962 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
| 963 while (o > oIndex) { | |
| 964 oWord++; | |
| 965 mWord++; | |
| 966 mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); | |
| 967 oIndex += CHAR_PER_WORD; | |
| 968 } | |
| 969 } else { // oChar == mChar | |
| 970 mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); | |
| 971 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
| 972 while (o > oIndex) { | |
| 973 oWord++; | |
| 974 mWord++; | |
| 975 mergedBwt[mWord] = oldBwt[oWord]; | |
| 976 oIndex += CHAR_PER_WORD; | |
| 977 } | |
| 978 } | |
| 979 oIndex = o; | |
| 980 mIndex += numInsert; | |
| 981 | |
| 982 // Clear the trailing garbage in mergedBwt | |
| 983 mWord = mIndex / CHAR_PER_WORD; | |
| 984 mChar = mIndex - mWord * CHAR_PER_WORD; | |
| 985 if (mChar == 0) { | |
| 986 mergedBwt[mWord] = 0; | |
| 987 } else { | |
| 988 mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); | |
| 989 } | |
| 990 | |
| 991 } | |
| 992 | |
| 993 // copy from insertBwt | |
| 994 while (iIndex <= numInsertBwt) { | |
| 995 if (sortedRank[iIndex] != 0) { | |
| 996 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); | |
| 997 mIndex++; | |
| 998 mChar++; | |
| 999 if (mChar == CHAR_PER_WORD) { | |
| 1000 mChar = 0; | |
| 1001 mWord++; | |
| 1002 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary | |
| 1003 } | |
| 1004 } | |
| 1005 iIndex++; | |
| 1006 } | |
| 1007 } | |
| 1008 | |
| 1009 void BWTClearTrailingBwtCode(BWT *bwt) | |
| 1010 { | |
| 1011 unsigned int bwtResidentSizeInWord; | |
| 1012 unsigned int wordIndex, offset; | |
| 1013 unsigned int i; | |
| 1014 | |
| 1015 bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); | |
| 1016 | |
| 1017 wordIndex = bwt->textLength / CHAR_PER_WORD; | |
| 1018 offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; | |
| 1019 if (offset > 0) { | |
| 1020 bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); | |
| 1021 } else { | |
| 1022 if (wordIndex < bwtResidentSizeInWord) { | |
| 1023 bwt->bwtCode[wordIndex] = 0; | |
| 1024 } | |
| 1025 } | |
| 1026 | |
| 1027 for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) { | |
| 1028 bwt->bwtCode[i] = 0; | |
| 1029 } | |
| 1030 } | |
| 1031 | |
| 1032 | |
| 1033 void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, | |
| 1034 unsigned int* __restrict occValueMajor, | |
| 1035 const unsigned int textLength, const unsigned int* decodeTable) | |
| 1036 { | |
| 1037 unsigned int numberOfOccValueMajor, numberOfOccValue; | |
| 1038 unsigned int wordBetweenOccValue; | |
| 1039 unsigned int numberOfOccIntervalPerMajor; | |
| 1040 unsigned int c; | |
| 1041 unsigned int i, j; | |
| 1042 unsigned int occMajorIndex; | |
| 1043 unsigned int occIndex, bwtIndex; | |
| 1044 unsigned int sum; | |
| 1045 unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; | |
| 1046 | |
| 1047 wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; | |
| 1048 | |
| 1049 // Calculate occValue | |
| 1050 // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256 | |
| 1051 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
| 1052 numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; | |
| 1053 numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; | |
| 1054 | |
| 1055 tempOccValue0[0] = 0; | |
| 1056 tempOccValue0[1] = 0; | |
| 1057 tempOccValue0[2] = 0; | |
| 1058 tempOccValue0[3] = 0; | |
| 1059 occValueMajor[0] = 0; | |
| 1060 occValueMajor[1] = 0; | |
| 1061 occValueMajor[2] = 0; | |
| 1062 occValueMajor[3] = 0; | |
| 1063 | |
| 1064 occIndex = 0; | |
| 1065 bwtIndex = 0; | |
| 1066 for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) { | |
| 1067 | |
| 1068 for (i=0; i<numberOfOccIntervalPerMajor/2; i++) { | |
| 1069 | |
| 1070 sum = 0; | |
| 1071 tempOccValue1[0] = tempOccValue0[0]; | |
| 1072 tempOccValue1[1] = tempOccValue0[1]; | |
| 1073 tempOccValue1[2] = tempOccValue0[2]; | |
| 1074 tempOccValue1[3] = tempOccValue0[3]; | |
| 1075 | |
| 1076 for (j=0; j<wordBetweenOccValue; j++) { | |
| 1077 c = bwt[bwtIndex]; | |
| 1078 sum += decodeTable[c >> 16]; | |
| 1079 sum += decodeTable[c & 0x0000FFFF]; | |
| 1080 bwtIndex++; | |
| 1081 } | |
| 1082 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
| 1083 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
| 1084 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
| 1085 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
| 1086 tempOccValue1[3] += sum; | |
| 1087 } else { | |
| 1088 if (sum == 0x00000100) { | |
| 1089 tempOccValue1[0] += 256; | |
| 1090 } else if (sum == 0x00010000) { | |
| 1091 tempOccValue1[1] += 256; | |
| 1092 } else if (sum == 0x01000000) { | |
| 1093 tempOccValue1[2] += 256; | |
| 1094 } else { | |
| 1095 tempOccValue1[3] += 256; | |
| 1096 } | |
| 1097 } | |
| 1098 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
| 1099 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
| 1100 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
| 1101 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
| 1102 tempOccValue0[0] = tempOccValue1[0]; | |
| 1103 tempOccValue0[1] = tempOccValue1[1]; | |
| 1104 tempOccValue0[2] = tempOccValue1[2]; | |
| 1105 tempOccValue0[3] = tempOccValue1[3]; | |
| 1106 sum = 0; | |
| 1107 | |
| 1108 occIndex++; | |
| 1109 | |
| 1110 for (j=0; j<wordBetweenOccValue; j++) { | |
| 1111 c = bwt[bwtIndex]; | |
| 1112 sum += decodeTable[c >> 16]; | |
| 1113 sum += decodeTable[c & 0x0000FFFF]; | |
| 1114 bwtIndex++; | |
| 1115 } | |
| 1116 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
| 1117 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; | |
| 1118 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; | |
| 1119 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; | |
| 1120 tempOccValue0[3] += sum; | |
| 1121 } else { | |
| 1122 if (sum == 0x00000100) { | |
| 1123 tempOccValue0[0] += 256; | |
| 1124 } else if (sum == 0x00010000) { | |
| 1125 tempOccValue0[1] += 256; | |
| 1126 } else if (sum == 0x01000000) { | |
| 1127 tempOccValue0[2] += 256; | |
| 1128 } else { | |
| 1129 tempOccValue0[3] += 256; | |
| 1130 } | |
| 1131 } | |
| 1132 } | |
| 1133 | |
| 1134 occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; | |
| 1135 occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; | |
| 1136 occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; | |
| 1137 occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; | |
| 1138 tempOccValue0[0] = 0; | |
| 1139 tempOccValue0[1] = 0; | |
| 1140 tempOccValue0[2] = 0; | |
| 1141 tempOccValue0[3] = 0; | |
| 1142 | |
| 1143 } | |
| 1144 | |
| 1145 while (occIndex < (numberOfOccValue-1)/2) { | |
| 1146 sum = 0; | |
| 1147 tempOccValue1[0] = tempOccValue0[0]; | |
| 1148 tempOccValue1[1] = tempOccValue0[1]; | |
| 1149 tempOccValue1[2] = tempOccValue0[2]; | |
| 1150 tempOccValue1[3] = tempOccValue0[3]; | |
| 1151 for (j=0; j<wordBetweenOccValue; j++) { | |
| 1152 c = bwt[bwtIndex]; | |
| 1153 sum += decodeTable[c >> 16]; | |
| 1154 sum += decodeTable[c & 0x0000FFFF]; | |
| 1155 bwtIndex++; | |
| 1156 } | |
| 1157 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
| 1158 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
| 1159 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
| 1160 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
| 1161 tempOccValue1[3] += sum; | |
| 1162 } else { | |
| 1163 if (sum == 0x00000100) { | |
| 1164 tempOccValue1[0] += 256; | |
| 1165 } else if (sum == 0x00010000) { | |
| 1166 tempOccValue1[1] += 256; | |
| 1167 } else if (sum == 0x01000000) { | |
| 1168 tempOccValue1[2] += 256; | |
| 1169 } else { | |
| 1170 tempOccValue1[3] += 256; | |
| 1171 } | |
| 1172 } | |
| 1173 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
| 1174 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
| 1175 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
| 1176 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
| 1177 tempOccValue0[0] = tempOccValue1[0]; | |
| 1178 tempOccValue0[1] = tempOccValue1[1]; | |
| 1179 tempOccValue0[2] = tempOccValue1[2]; | |
| 1180 tempOccValue0[3] = tempOccValue1[3]; | |
| 1181 sum = 0; | |
| 1182 occIndex++; | |
| 1183 | |
| 1184 for (j=0; j<wordBetweenOccValue; j++) { | |
| 1185 c = bwt[bwtIndex]; | |
| 1186 sum += decodeTable[c >> 16]; | |
| 1187 sum += decodeTable[c & 0x0000FFFF]; | |
| 1188 bwtIndex++; | |
| 1189 } | |
| 1190 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
| 1191 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; | |
| 1192 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; | |
| 1193 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; | |
| 1194 tempOccValue0[3] += sum; | |
| 1195 } else { | |
| 1196 if (sum == 0x00000100) { | |
| 1197 tempOccValue0[0] += 256; | |
| 1198 } else if (sum == 0x00010000) { | |
| 1199 tempOccValue0[1] += 256; | |
| 1200 } else if (sum == 0x01000000) { | |
| 1201 tempOccValue0[2] += 256; | |
| 1202 } else { | |
| 1203 tempOccValue0[3] += 256; | |
| 1204 } | |
| 1205 } | |
| 1206 } | |
| 1207 | |
| 1208 sum = 0; | |
| 1209 tempOccValue1[0] = tempOccValue0[0]; | |
| 1210 tempOccValue1[1] = tempOccValue0[1]; | |
| 1211 tempOccValue1[2] = tempOccValue0[2]; | |
| 1212 tempOccValue1[3] = tempOccValue0[3]; | |
| 1213 | |
| 1214 if (occIndex * 2 < numberOfOccValue - 1) { | |
| 1215 for (j=0; j<wordBetweenOccValue; j++) { | |
| 1216 c = bwt[bwtIndex]; | |
| 1217 sum += decodeTable[c >> 16]; | |
| 1218 sum += decodeTable[c & 0x0000FFFF]; | |
| 1219 bwtIndex++; | |
| 1220 } | |
| 1221 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
| 1222 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
| 1223 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
| 1224 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
| 1225 tempOccValue1[3] += sum; | |
| 1226 } else { | |
| 1227 if (sum == 0x00000100) { | |
| 1228 tempOccValue1[0] += 256; | |
| 1229 } else if (sum == 0x00010000) { | |
| 1230 tempOccValue1[1] += 256; | |
| 1231 } else if (sum == 0x01000000) { | |
| 1232 tempOccValue1[2] += 256; | |
| 1233 } else { | |
| 1234 tempOccValue1[3] += 256; | |
| 1235 } | |
| 1236 } | |
| 1237 } | |
| 1238 | |
| 1239 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
| 1240 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
| 1241 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
| 1242 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
| 1243 | |
| 1244 } | |
| 1245 | |
| 1246 static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) | |
| 1247 { | |
| 1248 unsigned int i; | |
| 1249 unsigned int mergedBwtSizeInWord, mergedOccSizeInWord; | |
| 1250 unsigned int firstCharInThisIteration; | |
| 1251 | |
| 1252 unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt; | |
| 1253 unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; | |
| 1254 | |
| 1255 #ifdef DEBUG | |
| 1256 if (numChar > bwtInc->buildSize) { | |
| 1257 fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n"); | |
| 1258 exit(1); | |
| 1259 } | |
| 1260 #endif | |
| 1261 | |
| 1262 mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); | |
| 1263 mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); | |
| 1264 | |
| 1265 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); | |
| 1266 | |
| 1267 if (bwtInc->bwt->textLength == 0) { // Initial build | |
| 1268 | |
| 1269 // Set address | |
| 1270 seq = bwtInc->workingMemory; | |
| 1271 relativeRank = seq + bwtInc->buildSize + 1; | |
| 1272 mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place | |
| 1273 | |
| 1274 BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); | |
| 1275 | |
| 1276 firstCharInThisIteration = relativeRank[0]; | |
| 1277 relativeRank[numChar] = 0; | |
| 1278 | |
| 1279 // Sort suffix | |
| 1280 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE); | |
| 1281 newInverseSa0 = relativeRank[0]; | |
| 1282 | |
| 1283 // Clear BWT area | |
| 1284 initializeVAL(insertBwt, mergedBwtSizeInWord, 0); | |
| 1285 | |
| 1286 // Build BWT | |
| 1287 BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); | |
| 1288 | |
| 1289 // so that the cumulativeCount is not deducted | |
| 1290 bwtInc->firstCharInLastIteration = ALPHABET_SIZE; | |
| 1291 | |
| 1292 } else { // Incremental build | |
| 1293 // Set address | |
| 1294 sortedRank = bwtInc->workingMemory; | |
| 1295 seq = sortedRank + bwtInc->buildSize + 1; | |
| 1296 insertBwt = seq; | |
| 1297 relativeRank = seq + bwtInc->buildSize + 1; | |
| 1298 | |
| 1299 // Store the first character of this iteration | |
| 1300 firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); | |
| 1301 | |
| 1302 // Count occurrence of input text | |
| 1303 ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); | |
| 1304 // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration | |
| 1305 bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; | |
| 1306 bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; | |
| 1307 bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; | |
| 1308 bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; | |
| 1309 | |
| 1310 // Get rank of new suffix among processed suffix | |
| 1311 // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group | |
| 1312 oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, | |
| 1313 numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); | |
| 1314 | |
| 1315 // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) | |
| 1316 for (i=0; i<ALPHABET_SIZE; i++) { | |
| 1317 if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || | |
| 1318 bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { | |
| 1319 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); | |
| 1320 } else { | |
| 1321 if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { | |
| 1322 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); | |
| 1323 } | |
| 1324 if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { | |
| 1325 BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); | |
| 1326 } | |
| 1327 } | |
| 1328 } | |
| 1329 | |
| 1330 // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt | |
| 1331 // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) | |
| 1332 BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); | |
| 1333 #ifdef DEBUG | |
| 1334 if (relativeRank[numChar] != oldInverseSa0RelativeRank) { | |
| 1335 fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n"); | |
| 1336 exit(1); | |
| 1337 } | |
| 1338 #endif | |
| 1339 | |
| 1340 // Sort suffix | |
| 1341 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE); | |
| 1342 | |
| 1343 newInverseSa0RelativeRank = relativeRank[0]; | |
| 1344 newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; | |
| 1345 | |
| 1346 sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt | |
| 1347 | |
| 1348 // Build BWT | |
| 1349 BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); | |
| 1350 | |
| 1351 // Merge BWT | |
| 1352 mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord | |
| 1353 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; | |
| 1354 // minus numberOfIteration * occInterval to create a buffer for merging | |
| 1355 BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); | |
| 1356 | |
| 1357 } | |
| 1358 | |
| 1359 // Build auxiliary structure and update info and pointers in BWT | |
| 1360 bwtInc->bwt->textLength += numChar; | |
| 1361 bwtInc->bwt->bwtCode = mergedBwt; | |
| 1362 bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; | |
| 1363 bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; | |
| 1364 if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) { | |
| 1365 fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n"); | |
| 1366 exit(1); | |
| 1367 } | |
| 1368 | |
| 1369 bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; | |
| 1370 | |
| 1371 BWTClearTrailingBwtCode(bwtInc->bwt); | |
| 1372 BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, | |
| 1373 bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); | |
| 1374 | |
| 1375 bwtInc->bwt->inverseSa0 = newInverseSa0; | |
| 1376 | |
| 1377 bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); | |
| 1378 bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); | |
| 1379 bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); | |
| 1380 bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); | |
| 1381 | |
| 1382 bwtInc->firstCharInLastIteration = firstCharInThisIteration; | |
| 1383 | |
| 1384 // Set build size and text address for the next build | |
| 1385 BWTIncSetBuildSizeAndTextAddr(bwtInc); | |
| 1386 bwtInc->numberOfIterationDone++; | |
| 1387 | |
| 1388 } | |
| 1389 | |
| 1390 BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, | |
| 1391 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) | |
| 1392 { | |
| 1393 | |
| 1394 FILE *packedFile; | |
| 1395 unsigned int packedFileLen; | |
| 1396 unsigned int totalTextLength; | |
| 1397 unsigned int textToLoad, textSizeInByte; | |
| 1398 unsigned int processedTextLength; | |
| 1399 unsigned char lastByteLength; | |
| 1400 | |
| 1401 BWTInc *bwtInc; | |
| 1402 | |
| 1403 packedFile = (FILE*)fopen(inputFileName, "rb"); | |
| 1404 | |
| 1405 if (packedFile == NULL) { | |
| 1406 fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); | |
| 1407 exit(1); | |
| 1408 } | |
| 1409 | |
| 1410 fseek(packedFile, -1, SEEK_END); | |
| 1411 packedFileLen = ftell(packedFile); | |
| 1412 if ((int)packedFileLen < 0) { | |
| 1413 fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n"); | |
| 1414 exit(1); | |
| 1415 } | |
| 1416 fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); | |
| 1417 totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); | |
| 1418 | |
| 1419 bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize); | |
| 1420 | |
| 1421 BWTIncSetBuildSizeAndTextAddr(bwtInc); | |
| 1422 | |
| 1423 if (bwtInc->buildSize > totalTextLength) { | |
| 1424 textToLoad = totalTextLength; | |
| 1425 } else { | |
| 1426 textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); | |
| 1427 } | |
| 1428 textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte | |
| 1429 | |
| 1430 fseek(packedFile, -2, SEEK_CUR); | |
| 1431 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); | |
| 1432 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); | |
| 1433 fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR); | |
| 1434 | |
| 1435 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); | |
| 1436 BWTIncConstruct(bwtInc, textToLoad); | |
| 1437 | |
| 1438 processedTextLength = textToLoad; | |
| 1439 | |
| 1440 while (processedTextLength < totalTextLength) { | |
| 1441 textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; | |
| 1442 if (textToLoad > totalTextLength - processedTextLength) { | |
| 1443 textToLoad = totalTextLength - processedTextLength; | |
| 1444 } | |
| 1445 textSizeInByte = textToLoad / CHAR_PER_BYTE; | |
| 1446 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); | |
| 1447 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); | |
| 1448 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); | |
| 1449 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); | |
| 1450 BWTIncConstruct(bwtInc, textToLoad); | |
| 1451 processedTextLength += textToLoad; | |
| 1452 if (bwtInc->numberOfIterationDone % 10 == 0) { | |
| 1453 printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n", | |
| 1454 bwtInc->numberOfIterationDone, processedTextLength); | |
| 1455 } | |
| 1456 } | |
| 1457 return bwtInc; | |
| 1458 } | |
| 1459 | |
| 1460 void BWTFree(BWT *bwt) | |
| 1461 { | |
| 1462 if (bwt == 0) return; | |
| 1463 free(bwt->cumulativeFreq); | |
| 1464 free(bwt->bwtCode); | |
| 1465 free(bwt->occValue); | |
| 1466 free(bwt->occValueMajor); | |
| 1467 free(bwt->saValue); | |
| 1468 free(bwt->inverseSa); | |
| 1469 free(bwt->decodeTable); | |
| 1470 free(bwt->saIndexRange); | |
| 1471 free(bwt->saValueOnBoundary); | |
| 1472 free(bwt); | |
| 1473 } | |
| 1474 | |
| 1475 void BWTIncFree(BWTInc *bwtInc) | |
| 1476 { | |
| 1477 if (bwtInc == 0) return; | |
| 1478 free(bwtInc->bwt); | |
| 1479 free(bwtInc->workingMemory); | |
| 1480 free(bwtInc); | |
| 1481 } | |
| 1482 | |
| 1483 static unsigned int BWTFileSizeInWord(const unsigned int numChar) | |
| 1484 { | |
| 1485 // The $ in BWT at the position of inverseSa0 is not encoded | |
| 1486 return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; | |
| 1487 } | |
| 1488 | |
| 1489 void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) | |
| 1490 { | |
| 1491 FILE *bwtFile; | |
| 1492 /* FILE *occValueFile; */ | |
| 1493 unsigned int bwtLength; | |
| 1494 | |
| 1495 bwtFile = (FILE*)fopen(bwtFileName, "wb"); | |
| 1496 if (bwtFile == NULL) { | |
| 1497 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); | |
| 1498 exit(1); | |
| 1499 } | |
| 1500 | |
| 1501 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile); | |
| 1502 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile); | |
| 1503 bwtLength = BWTFileSizeInWord(bwt->textLength); | |
| 1504 fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); | |
| 1505 fclose(bwtFile); | |
| 1506 /* | |
| 1507 occValueFile = (FILE*)fopen(occValueFileName, "wb"); | |
| 1508 if (occValueFile == NULL) { | |
| 1509 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n"); | |
| 1510 exit(1); | |
| 1511 } | |
| 1512 | |
| 1513 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile); | |
| 1514 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile); | |
| 1515 fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile); | |
| 1516 fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile); | |
| 1517 fclose(occValueFile); | |
| 1518 */ | |
| 1519 } | |
| 1520 | |
| 1521 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) | |
| 1522 { | |
| 1523 BWTInc *bwtInc; | |
| 1524 bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000); | |
| 1525 printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); | |
| 1526 BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); | |
| 1527 BWTIncFree(bwtInc); | |
| 1528 } | |
| 1529 | |
| 1530 int bwt_bwtgen_main(int argc, char *argv[]) | |
| 1531 { | |
| 1532 if (argc < 3) { | |
| 1533 fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n"); | |
| 1534 return 1; | |
| 1535 } | |
| 1536 bwt_bwtgen(argv[1], argv[2]); | |
| 1537 return 0; | |
| 1538 } | |
| 1539 | |
| 1540 #ifdef MAIN_BWT_GEN | |
| 1541 | |
| 1542 int main(int argc, char *argv[]) | |
| 1543 { | |
| 1544 return bwt_bwtgen_main(argc, argv); | |
| 1545 } | |
| 1546 | |
| 1547 #endif |
