Mercurial > repos > calkan > mrfast
comparison mrfast-2.1.0.4/Reads.c @ 1:d4054b05b015 default tip
Version update to 2.1.0.5
| author | calkan |
|---|---|
| date | Fri, 09 Mar 2012 07:35:51 -0500 |
| parents | 7b3dc85dc7fd |
| children |
comparison
equal
deleted
inserted
replaced
| 0:7b3dc85dc7fd | 1:d4054b05b015 |
|---|---|
| 1 /* | |
| 2 * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University | |
| 3 * All rights reserved. | |
| 4 * | |
| 5 * Redistribution and use in source and binary forms, with or without modification, | |
| 6 * are permitted provided that the following conditions are met: | |
| 7 * | |
| 8 * Redistributions of source code must retain the above copyright notice, this list | |
| 9 * of conditions and the following disclaimer. | |
| 10 * - Redistributions in binary form must reproduce the above copyright notice, this | |
| 11 * list of conditions and the following disclaimer in the documentation and/or other | |
| 12 * materials provided with the distribution. | |
| 13 * - Neither the names of the University of Washington, Simon Fraser University, | |
| 14 * nor the names of its contributors may be | |
| 15 * used to endorse or promote products derived from this software without specific | |
| 16 * prior written permission. | |
| 17 * | |
| 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |
| 22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
| 26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 /* | |
| 32 Authors: | |
| 33 Farhad Hormozdiari | |
| 34 Faraz Hach | |
| 35 Can Alkan | |
| 36 Emails: | |
| 37 farhadh AT uw DOT edu | |
| 38 fhach AT cs DOT sfu DOT ca | |
| 39 calkan AT uw DOT edu | |
| 40 */ | |
| 41 | |
| 42 | |
| 43 | |
| 44 #include <stdio.h> | |
| 45 #include <stdlib.h> | |
| 46 #include <string.h> | |
| 47 #include <ctype.h> | |
| 48 #include <zlib.h> | |
| 49 #include "Common.h" | |
| 50 #include "Reads.h" | |
| 51 | |
| 52 #define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) | |
| 53 | |
| 54 FILE *_r_fp1; | |
| 55 FILE *_r_fp2; | |
| 56 gzFile _r_gzfp1; | |
| 57 gzFile _r_gzfp2; | |
| 58 Read *_r_seq; | |
| 59 int _r_seqCnt; | |
| 60 int *_r_samplingLocs; | |
| 61 | |
| 62 /**********************************************/ | |
| 63 char *(*readFirstSeq)(char *); | |
| 64 char *(*readSecondSeq)(char *); | |
| 65 /**********************************************/ | |
| 66 char *readFirstSeqTXT( char *seq ) | |
| 67 { | |
| 68 return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); | |
| 69 } | |
| 70 | |
| 71 /**********************************************/ | |
| 72 char *readSecondSeqTXT( char *seq ) | |
| 73 { | |
| 74 return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); | |
| 75 } | |
| 76 /**********************************************/ | |
| 77 char *readFirstSeqGZ( char *seq ) | |
| 78 { | |
| 79 return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); | |
| 80 } | |
| 81 | |
| 82 /**********************************************/ | |
| 83 char *readSecondSeqGZ( char *seq ) | |
| 84 { | |
| 85 return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); | |
| 86 } | |
| 87 /**********************************************/ | |
| 88 int toCompareRead(const void * elem1, const void * elem2) | |
| 89 { | |
| 90 return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq); | |
| 91 } | |
| 92 /**********************************************/ | |
| 93 int readAllReads(char *fileName1, | |
| 94 char *fileName2, | |
| 95 int compressed, | |
| 96 unsigned char *fastq, | |
| 97 unsigned char pairedEnd, | |
| 98 Read **seqList, | |
| 99 unsigned int *seqListSize) | |
| 100 { | |
| 101 double startTime=getTime(); | |
| 102 | |
| 103 char seq1[SEQ_MAX_LENGTH]; | |
| 104 char rseq1[SEQ_MAX_LENGTH]; | |
| 105 char name1[SEQ_MAX_LENGTH]; | |
| 106 char qual1[SEQ_MAX_LENGTH]; | |
| 107 char seq2[SEQ_MAX_LENGTH]; | |
| 108 char rseq2[SEQ_MAX_LENGTH]; | |
| 109 char name2[SEQ_MAX_LENGTH]; | |
| 110 char qual2[SEQ_MAX_LENGTH]; | |
| 111 | |
| 112 char dummy[SEQ_MAX_LENGTH]; | |
| 113 char ch; | |
| 114 int err1, err2; | |
| 115 int nCnt; | |
| 116 int discarded = 0; | |
| 117 int seqCnt = 0; | |
| 118 int maxCnt = 0; | |
| 119 int i; | |
| 120 Read *list = NULL; | |
| 121 | |
| 122 int clipped = 0; | |
| 123 | |
| 124 if (!compressed) | |
| 125 { | |
| 126 _r_fp1 = fileOpen( fileName1, "r"); | |
| 127 | |
| 128 if (_r_fp1 == NULL) | |
| 129 { | |
| 130 return 0; | |
| 131 } | |
| 132 | |
| 133 ch = fgetc(_r_fp1); | |
| 134 | |
| 135 if ( pairedEnd && fileName2 != NULL ) | |
| 136 { | |
| 137 _r_fp2 = fileOpen ( fileName2, "r" ); | |
| 138 if (_r_fp2 == NULL) | |
| 139 { | |
| 140 return 0; | |
| 141 } | |
| 142 } | |
| 143 else | |
| 144 { | |
| 145 _r_fp2 = _r_fp1; | |
| 146 } | |
| 147 | |
| 148 readFirstSeq = &readFirstSeqTXT; | |
| 149 readSecondSeq = &readSecondSeqTXT; | |
| 150 } | |
| 151 else | |
| 152 { | |
| 153 | |
| 154 _r_gzfp1 = fileOpenGZ (fileName1, "r"); | |
| 155 | |
| 156 if (_r_gzfp1 == NULL) | |
| 157 { | |
| 158 return 0; | |
| 159 } | |
| 160 | |
| 161 ch = gzgetc(_r_gzfp1); | |
| 162 | |
| 163 if ( pairedEnd && fileName2 != NULL ) | |
| 164 { | |
| 165 _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); | |
| 166 if (_r_gzfp2 == NULL) | |
| 167 { | |
| 168 return 0; | |
| 169 } | |
| 170 } | |
| 171 else | |
| 172 { | |
| 173 _r_gzfp2 = _r_gzfp1; | |
| 174 } | |
| 175 | |
| 176 readFirstSeq = &readFirstSeqGZ; | |
| 177 readSecondSeq = &readSecondSeqGZ; | |
| 178 } | |
| 179 | |
| 180 if (ch == '>') | |
| 181 *fastq = 0; | |
| 182 else | |
| 183 *fastq = 1; | |
| 184 | |
| 185 // Counting the number of lines in the file | |
| 186 while (readFirstSeq(dummy)) maxCnt++; | |
| 187 | |
| 188 if (!compressed) | |
| 189 { | |
| 190 rewind(_r_fp1); | |
| 191 } | |
| 192 else | |
| 193 { | |
| 194 gzrewind(_r_gzfp1); | |
| 195 } | |
| 196 | |
| 197 // Calculating the Maximum # of sequences | |
| 198 if (*fastq) | |
| 199 { | |
| 200 maxCnt /= 4; | |
| 201 } | |
| 202 else | |
| 203 { | |
| 204 maxCnt /= 2; | |
| 205 } | |
| 206 | |
| 207 if (pairedEnd && fileName2 != NULL ) | |
| 208 maxCnt *= 2; | |
| 209 | |
| 210 list = getMem(sizeof(Read)*maxCnt); | |
| 211 | |
| 212 while( readFirstSeq(name1) ) | |
| 213 { | |
| 214 err1 = 0; | |
| 215 err2 = 0; | |
| 216 readFirstSeq(seq1); | |
| 217 name1[strlen(name1)-1] = '\0'; | |
| 218 | |
| 219 if ( *fastq ) | |
| 220 { | |
| 221 readFirstSeq(dummy); | |
| 222 readFirstSeq(qual1); | |
| 223 qual1[strlen(qual1)-1] = '\0'; | |
| 224 } | |
| 225 else | |
| 226 { | |
| 227 sprintf(qual1, "*"); | |
| 228 } | |
| 229 | |
| 230 // Cropping | |
| 231 if (cropSize > 0) | |
| 232 { | |
| 233 seq1[cropSize] = '\0'; | |
| 234 if ( *fastq ) | |
| 235 qual1[cropSize] = '\0'; | |
| 236 } | |
| 237 | |
| 238 | |
| 239 nCnt = 0; | |
| 240 for (i=0; i<strlen(seq1); i++) | |
| 241 { | |
| 242 seq1[i] = toupper (seq1[i]); | |
| 243 if (seq1[i] == 'N') | |
| 244 { | |
| 245 nCnt++; | |
| 246 } | |
| 247 else if (isspace(seq1[i])) | |
| 248 { | |
| 249 | |
| 250 seq1[i] = '\0'; | |
| 251 break; | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 if (nCnt > errThreshold) | |
| 256 { | |
| 257 err1 = 1; | |
| 258 } | |
| 259 | |
| 260 // Reading the second seq of pair-ends | |
| 261 if (pairedEnd) | |
| 262 { | |
| 263 readSecondSeq(name2); | |
| 264 readSecondSeq(seq2); | |
| 265 name2[strlen(name2)-1] = '\0'; | |
| 266 | |
| 267 | |
| 268 if ( *fastq ) | |
| 269 { | |
| 270 readSecondSeq(dummy); | |
| 271 readSecondSeq(qual2); | |
| 272 | |
| 273 qual2[strlen(qual2)-1] = '\0'; | |
| 274 } | |
| 275 else | |
| 276 { | |
| 277 sprintf(qual2, "*"); | |
| 278 } | |
| 279 | |
| 280 | |
| 281 // Cropping | |
| 282 if (cropSize > 0) | |
| 283 { | |
| 284 seq2[cropSize] = '\0'; | |
| 285 if ( *fastq ) | |
| 286 qual2[cropSize] = '\0'; | |
| 287 } | |
| 288 | |
| 289 | |
| 290 nCnt = 0; | |
| 291 for (i=0; i<strlen(seq2); i++) | |
| 292 { | |
| 293 seq2[i] = toupper (seq2[i]); | |
| 294 if (seq2[i] == 'N') | |
| 295 { | |
| 296 nCnt++; | |
| 297 | |
| 298 } | |
| 299 else if (isspace(seq2[i])) | |
| 300 { | |
| 301 seq2[i] = '\0'; | |
| 302 } | |
| 303 } | |
| 304 if (nCnt > errThreshold) | |
| 305 { | |
| 306 err2 = 1; | |
| 307 } | |
| 308 | |
| 309 | |
| 310 if (strlen(seq1) < strlen(seq2)) { | |
| 311 seq2[strlen(seq1)] = '\0'; | |
| 312 if ( *fastq ) | |
| 313 qual2[strlen(seq1)] = '\0'; | |
| 314 if (!clipped) clipped = 2; | |
| 315 } | |
| 316 else if (strlen(seq1) > strlen(seq2)){ | |
| 317 seq1[strlen(seq2)] = '\0'; | |
| 318 if ( *fastq ) | |
| 319 qual1[strlen(seq2)] = '\0'; | |
| 320 if (!clipped) clipped = 1; | |
| 321 } | |
| 322 | |
| 323 if (clipped == 1 || clipped == 2){ | |
| 324 fprintf(stdout, "[PE mode warning] Sequence lengths are different, read #%d is clipped to match.\n", clipped); | |
| 325 clipped = 3; | |
| 326 } | |
| 327 | |
| 328 | |
| 329 | |
| 330 } | |
| 331 | |
| 332 if (!pairedEnd && !err1) | |
| 333 { | |
| 334 int _mtmp = strlen(seq1); | |
| 335 list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); | |
| 336 list[seqCnt].seq = list[seqCnt].hits + 1; | |
| 337 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | |
| 338 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | |
| 339 list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | |
| 340 | |
| 341 list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); | |
| 342 list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); | |
| 343 | |
| 344 list[seqCnt].readNumber = seqCnt; | |
| 345 | |
| 346 reverseComplete(seq1, rseq1, _mtmp); | |
| 347 rseq1[_mtmp] = '\0'; | |
| 348 int i; | |
| 349 | |
| 350 list[seqCnt].hits[0] = 0; | |
| 351 | |
| 352 for (i=0; i<=_mtmp; i++) | |
| 353 { | |
| 354 list[seqCnt].seq[i] = seq1[i]; | |
| 355 list[seqCnt].rseq[i] = rseq1[i] ; | |
| 356 list[seqCnt].qual[i] = qual1[i]; | |
| 357 } | |
| 358 | |
| 359 | |
| 360 //MAKE HASH VALUE | |
| 361 short code = 0; | |
| 362 | |
| 363 for(i=0; i < 4; i++) | |
| 364 code = code * 5 + CHARCODE(list[seqCnt].seq[i]); | |
| 365 list[seqCnt].hashValue[0] = code; | |
| 366 | |
| 367 | |
| 368 for(i = 1; i < _mtmp-3; i++) | |
| 369 { | |
| 370 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); | |
| 371 } | |
| 372 | |
| 373 | |
| 374 code = 0; | |
| 375 for(i=0; i < 4; i++) | |
| 376 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); | |
| 377 list[seqCnt].rhashValue[0] = code; | |
| 378 | |
| 379 | |
| 380 for(i = 1; i < _mtmp-3; i++) | |
| 381 { | |
| 382 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); | |
| 383 } | |
| 384 | |
| 385 int j = 0; | |
| 386 int tmpSize = _mtmp / (errThreshold+1); | |
| 387 | |
| 388 list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); | |
| 389 for(i=0; i < errThreshold+1; i++) | |
| 390 { | |
| 391 code = 0; | |
| 392 for(j = 0; j < tmpSize; j++) | |
| 393 { | |
| 394 code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); | |
| 395 } | |
| 396 list[seqCnt].hashValSampleSize[i] = code; | |
| 397 } | |
| 398 | |
| 399 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); | |
| 400 | |
| 401 seqCnt++; | |
| 402 | |
| 403 } | |
| 404 else if (pairedEnd && !err1 && !err2) | |
| 405 { | |
| 406 | |
| 407 // Naming Conventions X/1, X/2 OR X | |
| 408 int tmplen = strlen(name1); | |
| 409 if (strcmp(name1, name2) != 0) | |
| 410 { | |
| 411 tmplen = strlen(name1)-2; | |
| 412 } | |
| 413 | |
| 414 //first seq | |
| 415 int _mtmp = strlen(seq1); | |
| 416 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); | |
| 417 list[seqCnt].seq = list[seqCnt].hits + 1; | |
| 418 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | |
| 419 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | |
| 420 list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | |
| 421 | |
| 422 list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); | |
| 423 list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); | |
| 424 list[seqCnt].readNumber = seqCnt; | |
| 425 | |
| 426 reverseComplete(seq1, rseq1, _mtmp); | |
| 427 rseq1[_mtmp] = '\0'; | |
| 428 int i; | |
| 429 | |
| 430 list[seqCnt].hits[0] = 0; | |
| 431 | |
| 432 for (i=0; i<=_mtmp; i++) | |
| 433 { | |
| 434 list[seqCnt].seq[i] = seq1[i]; | |
| 435 list[seqCnt].rseq[i] = rseq1[i] ; | |
| 436 list[seqCnt].qual[i] = qual1[i]; | |
| 437 } | |
| 438 | |
| 439 | |
| 440 name1[tmplen]='\0'; | |
| 441 | |
| 442 //MAKE HASH VALUE | |
| 443 short code = 0; | |
| 444 | |
| 445 for(i=0; i < 4; i++) | |
| 446 code = code * 5 + CHARCODE(list[seqCnt].seq[i]); | |
| 447 list[seqCnt].hashValue[0] = code; | |
| 448 | |
| 449 | |
| 450 for(i = 1; i < _mtmp-3; i++) | |
| 451 { | |
| 452 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); | |
| 453 } | |
| 454 | |
| 455 | |
| 456 code = 0; | |
| 457 for(i=0; i < 4; i++) | |
| 458 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); | |
| 459 list[seqCnt].rhashValue[0] = code; | |
| 460 | |
| 461 | |
| 462 for(i = 1; i < _mtmp-3; i++) | |
| 463 { | |
| 464 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); | |
| 465 } | |
| 466 | |
| 467 int j = 0; | |
| 468 int tmpSize = _mtmp / (errThreshold+1); | |
| 469 | |
| 470 list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); | |
| 471 for(i=0; i < errThreshold+1; i++) | |
| 472 { | |
| 473 code = 0; | |
| 474 for(j = 0; j < tmpSize; j++) | |
| 475 { | |
| 476 code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); | |
| 477 } | |
| 478 list[seqCnt].hashValSampleSize[i] = code; | |
| 479 } | |
| 480 | |
| 481 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); | |
| 482 | |
| 483 | |
| 484 seqCnt++; | |
| 485 | |
| 486 //second seq | |
| 487 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); | |
| 488 list[seqCnt].seq = list[seqCnt].hits + 1; | |
| 489 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | |
| 490 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | |
| 491 list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | |
| 492 | |
| 493 list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); | |
| 494 list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); | |
| 495 list[seqCnt].readNumber = seqCnt; | |
| 496 | |
| 497 reverseComplete(seq2, rseq2, _mtmp); | |
| 498 rseq2[_mtmp] = '\0'; | |
| 499 | |
| 500 list[seqCnt].hits[0] = 0; | |
| 501 | |
| 502 for (i=0; i<=_mtmp; i++) | |
| 503 { | |
| 504 list[seqCnt].seq[i] = seq2[i]; | |
| 505 list[seqCnt].rseq[i] = rseq2[i] ; | |
| 506 list[seqCnt].qual[i] = qual2[i]; | |
| 507 } | |
| 508 | |
| 509 | |
| 510 name2[tmplen]='\0'; | |
| 511 | |
| 512 //MAKE HASH VALUE | |
| 513 code = 0; | |
| 514 | |
| 515 for(i=0; i < 4; i++) | |
| 516 code = code * 5 + CHARCODE(list[seqCnt].seq[i]); | |
| 517 list[seqCnt].hashValue[0] = code; | |
| 518 | |
| 519 | |
| 520 for(i = 1; i < _mtmp-3; i++) | |
| 521 { | |
| 522 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); | |
| 523 } | |
| 524 | |
| 525 | |
| 526 code = 0; | |
| 527 for(i=0; i < 4; i++) | |
| 528 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); | |
| 529 list[seqCnt].rhashValue[0] = code; | |
| 530 | |
| 531 | |
| 532 for(i = 1; i < _mtmp-3; i++) | |
| 533 { | |
| 534 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); | |
| 535 } | |
| 536 | |
| 537 j = 0; | |
| 538 tmpSize = _mtmp / (errThreshold+1); | |
| 539 | |
| 540 list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); | |
| 541 for(i=0; i < errThreshold+1; i++) | |
| 542 { | |
| 543 code = 0; | |
| 544 for(j = 0; j < tmpSize; j++) | |
| 545 { | |
| 546 code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); | |
| 547 } | |
| 548 list[seqCnt].hashValSampleSize[i] = code; | |
| 549 } | |
| 550 | |
| 551 sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); | |
| 552 | |
| 553 seqCnt++; | |
| 554 | |
| 555 } | |
| 556 else | |
| 557 { | |
| 558 discarded++; | |
| 559 } | |
| 560 } | |
| 561 | |
| 562 if (seqCnt > 0) | |
| 563 { | |
| 564 SEQ_LENGTH = strlen(list[0].seq); | |
| 565 } | |
| 566 else | |
| 567 { | |
| 568 fprintf(stdout, "ERROR: No reads can be found for mapping\n"); | |
| 569 return 0; | |
| 570 } | |
| 571 | |
| 572 | |
| 573 // Closing Files | |
| 574 if (!compressed) | |
| 575 { | |
| 576 fclose(_r_fp1); | |
| 577 if ( pairedEnd && fileName2 != NULL ) | |
| 578 { | |
| 579 fclose(_r_fp2); | |
| 580 } | |
| 581 } | |
| 582 else | |
| 583 { | |
| 584 gzclose(_r_gzfp1); | |
| 585 if ( pairedEnd && fileName2 != NULL) | |
| 586 { | |
| 587 gzclose(_r_fp2); | |
| 588 } | |
| 589 } | |
| 590 | |
| 591 //qsort(list, seqCnt, sizeof(Read), toCompareRead); | |
| 592 | |
| 593 adjustQual(list, seqCnt); | |
| 594 | |
| 595 *seqList = list; | |
| 596 *seqListSize = seqCnt; | |
| 597 | |
| 598 | |
| 599 _r_seq = list; | |
| 600 _r_seqCnt = seqCnt; | |
| 601 | |
| 602 if ( pairedEnd ) discarded *= 2; | |
| 603 | |
| 604 if (seqCnt>1) | |
| 605 fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); | |
| 606 else | |
| 607 fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); | |
| 608 | |
| 609 | |
| 610 | |
| 611 return 1; | |
| 612 } | |
| 613 /**********************************************/ | |
| 614 void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) | |
| 615 { | |
| 616 int i; | |
| 617 int samLocsSize = errThreshold + 1; | |
| 618 int *samLocs = getMem(sizeof(int)*samLocsSize); | |
| 619 | |
| 620 for (i=0; i<samLocsSize; i++) | |
| 621 { | |
| 622 samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; | |
| 623 if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) | |
| 624 samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; | |
| 625 } | |
| 626 | |
| 627 // Outputing the sampling locations | |
| 628 | |
| 629 /* | |
| 630 | |
| 631 int j; | |
| 632 for (i=0; i<SEQ_LENGTH; i++) | |
| 633 { | |
| 634 fprintf(stdout, "-"); | |
| 635 } | |
| 636 fprintf(stdout, "\n"); | |
| 637 | |
| 638 for ( i=0; i<samLocsSize; i++ ) | |
| 639 { | |
| 640 for ( j=0; j<samLocs[i]; j++ ) | |
| 641 { | |
| 642 fprintf(stdout," "); | |
| 643 } | |
| 644 for (j=0; j<WINDOW_SIZE; j++) | |
| 645 { | |
| 646 fprintf(stdout,"+"); | |
| 647 } | |
| 648 fprintf(stdout, "\n"); | |
| 649 fflush(stdout); | |
| 650 } | |
| 651 | |
| 652 | |
| 653 for ( i=0; i<SEQ_LENGTH; i++ ) | |
| 654 { | |
| 655 fprintf(stdout, "-"); | |
| 656 } | |
| 657 fprintf(stdout, "\n"); | |
| 658 | |
| 659 */ | |
| 660 | |
| 661 *samplingLocs = samLocs; | |
| 662 *samplingLocsSize = samLocsSize; | |
| 663 _r_samplingLocs = samLocs; | |
| 664 } | |
| 665 | |
| 666 void finalizeReads(char *fileName) | |
| 667 { | |
| 668 FILE *fp1=NULL; | |
| 669 | |
| 670 if (fileName != NULL) | |
| 671 { | |
| 672 fp1 = fileOpen(fileName, "w"); | |
| 673 } | |
| 674 if (pairedEndMode) | |
| 675 _r_seqCnt /=2; | |
| 676 | |
| 677 int i=0; | |
| 678 for (i = 0; i < _r_seqCnt; i++) | |
| 679 { | |
| 680 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0) | |
| 681 { | |
| 682 fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); | |
| 683 } | |
| 684 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0) | |
| 685 { | |
| 686 fprintf(fp1, ">%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]); | |
| 687 } | |
| 688 else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) | |
| 689 { | |
| 690 fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); | |
| 691 } | |
| 692 else if (!pairedEndMode && _r_seq[i].hits[0] == 0) | |
| 693 { | |
| 694 fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); | |
| 695 } | |
| 696 } | |
| 697 | |
| 698 fclose(fp1); | |
| 699 if (pairedEndMode) | |
| 700 _r_seqCnt *= 2; | |
| 701 | |
| 702 for (i = 0; i < _r_seqCnt; i++) | |
| 703 { | |
| 704 freeMem(_r_seq[i].hits,0); | |
| 705 } | |
| 706 | |
| 707 | |
| 708 freeMem(_r_seq,0); | |
| 709 freeMem(_r_samplingLocs,0); | |
| 710 } | |
| 711 | |
| 712 void adjustQual(Read *list, int seqCnt){ | |
| 713 /* This function will automatically determine the phred_offset and readjust quality values if needed */ | |
| 714 } | |
| 715 | |
| 716 | |
| 717 | |
| 718 /*void finalizeOEAReads(char *fileName) | |
| 719 { | |
| 720 FILE *fp1=NULL; | |
| 721 FILE *fp2=NULL; | |
| 722 | |
| 723 //printf("OEA%s\n", fileName); | |
| 724 char fileNameOEA1[200]; | |
| 725 char fileNameOEA2[200]; | |
| 726 sprintf(fileNameOEA1, "%s_OEA1", fileName); | |
| 727 sprintf(fileNameOEA2, "%s_OEA2", fileName); | |
| 728 | |
| 729 | |
| 730 if (fileName != NULL) | |
| 731 { | |
| 732 fp1 = fileOpen(fileNameOEA1, "w"); | |
| 733 fp2 = fileOpen(fileNameOEA2, "w"); | |
| 734 } | |
| 735 if (pairedEndMode) | |
| 736 _r_seqCnt /=2; | |
| 737 | |
| 738 int i=0; | |
| 739 printf("%d\n", _r_seqCnt); | |
| 740 for (i = 0; i < _r_seqCnt; i++) | |
| 741 { | |
| 742 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0) | |
| 743 { | |
| 744 fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); | |
| 745 } | |
| 746 else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0) | |
| 747 { | |
| 748 fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); | |
| 749 } | |
| 750 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0) | |
| 751 { | |
| 752 fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, | |
| 753 _r_seq[2*i].seq, | |
| 754 _r_seq[2*i].qual, | |
| 755 _r_seq[2*i+1].name, | |
| 756 _r_seq[2*i+1].seq, | |
| 757 _r_seq[2*i+1].qual); | |
| 758 } | |
| 759 else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0 ) | |
| 760 { | |
| 761 fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, | |
| 762 _r_seq[2*i].seq, | |
| 763 _r_seq[2*i].qual, | |
| 764 _r_seq[2*i+1].name, | |
| 765 _r_seq[2*i+1].seq, | |
| 766 _r_seq[2*i+1].qual); | |
| 767 } | |
| 768 } | |
| 769 | |
| 770 fclose(fp1); | |
| 771 fclose(fp2); | |
| 772 | |
| 773 }*/ |
