| 1 | 1 /* | 
|  | 2  * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University | 
|  | 3  * All rights reserved. | 
|  | 4  * | 
|  | 5  * Redistribution and use in source and binary forms, with or without modification, | 
|  | 6  * are permitted provided that the following conditions are met: | 
|  | 7  * | 
|  | 8  * Redistributions of source code must retain the above copyright notice, this list | 
|  | 9  * of conditions and the following disclaimer. | 
|  | 10  * - Redistributions in binary form must reproduce the above copyright notice, this | 
|  | 11  *   list of conditions and the following disclaimer in the documentation and/or other | 
|  | 12  *   materials provided with the distribution. | 
|  | 13  * - Neither the names of the University of Washington, Simon Fraser University, | 
|  | 14  *   nor the names of its contributors may be | 
|  | 15  *   used to endorse or promote products derived from this software without specific | 
|  | 16  *   prior written permission. | 
|  | 17  * | 
|  | 18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | 19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | 20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 
|  | 21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | 
|  | 22  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
|  | 23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
|  | 24  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
|  | 25  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | 
|  | 26  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | 
|  | 27  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 
|  | 28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
|  | 29  */ | 
|  | 30 | 
|  | 31 /* | 
|  | 32   Authors: | 
|  | 33 	Farhad Hormozdiari | 
|  | 34         Faraz Hach | 
|  | 35 	Can Alkan | 
|  | 36   Emails: | 
|  | 37 	farhadh AT uw DOT edu | 
|  | 38 	fhach AT cs DOT sfu DOT ca | 
|  | 39         calkan AT  uw DOT edu | 
|  | 40  */ | 
|  | 41 | 
|  | 42 | 
|  | 43 | 
|  | 44 #include <stdio.h> | 
|  | 45 #include <stdlib.h> | 
|  | 46 #include <string.h> | 
|  | 47 #include <ctype.h> | 
|  | 48 #include <zlib.h> | 
|  | 49 #include "Common.h" | 
|  | 50 #include "Reads.h" | 
|  | 51 | 
|  | 52 #define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) | 
|  | 53 | 
|  | 54 FILE *_r_fp1; | 
|  | 55 FILE *_r_fp2; | 
|  | 56 gzFile _r_gzfp1; | 
|  | 57 gzFile _r_gzfp2; | 
|  | 58 Read *_r_seq; | 
|  | 59 int _r_seqCnt; | 
|  | 60 int *_r_samplingLocs; | 
|  | 61 | 
|  | 62 /**********************************************/ | 
|  | 63 char *(*readFirstSeq)(char *); | 
|  | 64 char *(*readSecondSeq)(char *); | 
|  | 65 /**********************************************/ | 
|  | 66 char *readFirstSeqTXT( char *seq ) | 
|  | 67 { | 
|  | 68 	return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); | 
|  | 69 } | 
|  | 70 | 
|  | 71 /**********************************************/ | 
|  | 72 char *readSecondSeqTXT( char *seq ) | 
|  | 73 { | 
|  | 74 	return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); | 
|  | 75 } | 
|  | 76 /**********************************************/ | 
|  | 77 char *readFirstSeqGZ( char *seq ) | 
|  | 78 { | 
|  | 79 	return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); | 
|  | 80 } | 
|  | 81 | 
|  | 82 /**********************************************/ | 
|  | 83 char *readSecondSeqGZ( char *seq ) | 
|  | 84 { | 
|  | 85 	return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); | 
|  | 86 } | 
|  | 87 /**********************************************/ | 
|  | 88 int toCompareRead(const void * elem1, const void * elem2) | 
|  | 89 { | 
|  | 90 	return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq); | 
|  | 91 } | 
|  | 92 /**********************************************/ | 
|  | 93 int readAllReads(char *fileName1, | 
|  | 94 						char *fileName2, | 
|  | 95 						int compressed, | 
|  | 96 						unsigned char *fastq, | 
|  | 97 						unsigned char pairedEnd, | 
|  | 98 						Read **seqList, | 
|  | 99 						unsigned int *seqListSize) | 
|  | 100 { | 
|  | 101 	double startTime=getTime(); | 
|  | 102 | 
|  | 103 	char seq1[SEQ_MAX_LENGTH]; | 
|  | 104 	char rseq1[SEQ_MAX_LENGTH]; | 
|  | 105 	char name1[SEQ_MAX_LENGTH]; | 
|  | 106 	char qual1[SEQ_MAX_LENGTH]; | 
|  | 107 	char seq2[SEQ_MAX_LENGTH]; | 
|  | 108 	char rseq2[SEQ_MAX_LENGTH]; | 
|  | 109 	char name2[SEQ_MAX_LENGTH]; | 
|  | 110 	char qual2[SEQ_MAX_LENGTH]; | 
|  | 111 | 
|  | 112 	char dummy[SEQ_MAX_LENGTH]; | 
|  | 113 	char ch; | 
|  | 114 	int err1, err2; | 
|  | 115 	int nCnt; | 
|  | 116 	int discarded = 0; | 
|  | 117 	int seqCnt = 0; | 
|  | 118 	int maxCnt = 0; | 
|  | 119 	int i; | 
|  | 120 	Read *list = NULL; | 
|  | 121 | 
|  | 122 	int clipped = 0; | 
|  | 123 | 
|  | 124 | 
|  | 125 	if (!compressed) | 
|  | 126 	{ | 
|  | 127 		_r_fp1 = fileOpen( fileName1, "r"); | 
|  | 128 | 
|  | 129 		if (_r_fp1 == NULL) | 
|  | 130 		{ | 
|  | 131 			return 0; | 
|  | 132 		} | 
|  | 133 | 
|  | 134 		ch = fgetc(_r_fp1); | 
|  | 135 | 
|  | 136 		if ( pairedEnd && fileName2 != NULL ) | 
|  | 137 		{ | 
|  | 138 			_r_fp2 = fileOpen ( fileName2, "r" ); | 
|  | 139 			if (_r_fp2 == NULL) | 
|  | 140 			{ | 
|  | 141 				return 0; | 
|  | 142 			} | 
|  | 143 		} | 
|  | 144 		else | 
|  | 145 		{ | 
|  | 146 			_r_fp2 = _r_fp1; | 
|  | 147 		} | 
|  | 148 | 
|  | 149 		readFirstSeq = &readFirstSeqTXT; | 
|  | 150 		readSecondSeq = &readSecondSeqTXT; | 
|  | 151 	} | 
|  | 152 	else | 
|  | 153 	{ | 
|  | 154 | 
|  | 155 		_r_gzfp1 = fileOpenGZ (fileName1, "r"); | 
|  | 156 | 
|  | 157 		if (_r_gzfp1 == NULL) | 
|  | 158 		{ | 
|  | 159 			return 0; | 
|  | 160 		} | 
|  | 161 | 
|  | 162 		ch = gzgetc(_r_gzfp1); | 
|  | 163 | 
|  | 164 		if ( pairedEnd && fileName2 != NULL ) | 
|  | 165 		{ | 
|  | 166 			_r_gzfp2 = fileOpenGZ ( fileName2, "r" ); | 
|  | 167 			if (_r_gzfp2 == NULL) | 
|  | 168 			{ | 
|  | 169 				return 0; | 
|  | 170 			} | 
|  | 171 		} | 
|  | 172 		else | 
|  | 173 		{ | 
|  | 174 			_r_gzfp2 = _r_gzfp1; | 
|  | 175 		} | 
|  | 176 | 
|  | 177 		readFirstSeq = &readFirstSeqGZ; | 
|  | 178 		readSecondSeq = &readSecondSeqGZ; | 
|  | 179 	} | 
|  | 180 | 
|  | 181 	if (ch == '>') | 
|  | 182 		*fastq = 0; | 
|  | 183 	else | 
|  | 184 		*fastq = 1; | 
|  | 185 | 
|  | 186 	// Counting the number of lines in the file | 
|  | 187 	while (readFirstSeq(dummy)) maxCnt++; | 
|  | 188 | 
|  | 189 	if (!compressed) | 
|  | 190 	{ | 
|  | 191 		rewind(_r_fp1); | 
|  | 192 	} | 
|  | 193 	else | 
|  | 194 	{ | 
|  | 195 		gzrewind(_r_gzfp1); | 
|  | 196 	} | 
|  | 197 | 
|  | 198 	// Calculating the Maximum # of sequences | 
|  | 199 	if (*fastq) | 
|  | 200 	{ | 
|  | 201 		maxCnt /= 4; | 
|  | 202 	} | 
|  | 203 	else | 
|  | 204 	{ | 
|  | 205 		maxCnt /= 2; | 
|  | 206 	} | 
|  | 207 | 
|  | 208 	if (pairedEnd && fileName2 != NULL ) | 
|  | 209 		maxCnt *= 2; | 
|  | 210 | 
|  | 211 	list = getMem(sizeof(Read)*maxCnt); | 
|  | 212 | 
|  | 213 	while( readFirstSeq(name1) ) | 
|  | 214 	{ | 
|  | 215 		err1 = 0; | 
|  | 216 		err2 = 0; | 
|  | 217 		readFirstSeq(seq1); | 
|  | 218 		name1[strlen(name1)-1] = '\0'; | 
|  | 219 | 
|  | 220 		if ( *fastq ) | 
|  | 221 		{ | 
|  | 222 			readFirstSeq(dummy); | 
|  | 223 			readFirstSeq(qual1); | 
|  | 224 			qual1[strlen(qual1)-1] = '\0'; | 
|  | 225 		} | 
|  | 226 		else | 
|  | 227 		{ | 
|  | 228 			sprintf(qual1, "*"); | 
|  | 229 		} | 
|  | 230 | 
|  | 231 		// Cropping | 
|  | 232 		if (cropSize > 0) | 
|  | 233 		{ | 
|  | 234 			seq1[cropSize] = '\0'; | 
|  | 235 			if ( *fastq ) | 
|  | 236 				qual1[cropSize] = '\0'; | 
|  | 237 		} | 
|  | 238 | 
|  | 239 | 
|  | 240 		nCnt = 0; | 
|  | 241 		for (i=0; i<strlen(seq1); i++) | 
|  | 242 		{ | 
|  | 243 			seq1[i] = toupper (seq1[i]); | 
|  | 244 			if (seq1[i] == 'N') | 
|  | 245 			{ | 
|  | 246 				nCnt++; | 
|  | 247 			} | 
|  | 248 			else if (isspace(seq1[i])) | 
|  | 249 			{ | 
|  | 250 | 
|  | 251 				seq1[i] = '\0'; | 
|  | 252 				break; | 
|  | 253 			} | 
|  | 254 		} | 
|  | 255 | 
|  | 256 		if (nCnt > errThreshold) | 
|  | 257 		{ | 
|  | 258 			err1 = 1; | 
|  | 259 		} | 
|  | 260 | 
|  | 261 		// Reading the second seq of pair-ends | 
|  | 262 		if (pairedEnd) | 
|  | 263 		{ | 
|  | 264 			readSecondSeq(name2); | 
|  | 265 			readSecondSeq(seq2); | 
|  | 266 			name2[strlen(name2)-1] = '\0'; | 
|  | 267 | 
|  | 268 | 
|  | 269 			if ( *fastq ) | 
|  | 270 			{ | 
|  | 271 				readSecondSeq(dummy); | 
|  | 272 				readSecondSeq(qual2); | 
|  | 273 | 
|  | 274 				qual2[strlen(qual2)-1] = '\0'; | 
|  | 275 			} | 
|  | 276 			else | 
|  | 277 			{ | 
|  | 278 				sprintf(qual2, "*"); | 
|  | 279 			} | 
|  | 280 | 
|  | 281 | 
|  | 282 			// Cropping | 
|  | 283 			if (cropSize > 0) | 
|  | 284 			{ | 
|  | 285 				seq2[cropSize] = '\0'; | 
|  | 286 				if ( *fastq ) | 
|  | 287 					qual2[cropSize] = '\0'; | 
|  | 288 			} | 
|  | 289 | 
|  | 290 | 
|  | 291 			nCnt = 0; | 
|  | 292 			for (i=0; i<strlen(seq2); i++) | 
|  | 293 			{ | 
|  | 294 				seq2[i] = toupper (seq2[i]); | 
|  | 295 				if (seq2[i] == 'N') | 
|  | 296 				{ | 
|  | 297 					nCnt++; | 
|  | 298 | 
|  | 299 				} | 
|  | 300 				else if (isspace(seq2[i])) | 
|  | 301 				{ | 
|  | 302 					seq2[i] = '\0'; | 
|  | 303 				} | 
|  | 304 			} | 
|  | 305 			if (nCnt > errThreshold) | 
|  | 306 			{ | 
|  | 307 				err2 = 1; | 
|  | 308 			} | 
|  | 309 | 
|  | 310 | 
|  | 311 			if (strlen(seq1) < strlen(seq2)) { | 
|  | 312 			  seq2[strlen(seq1)] = '\0'; | 
|  | 313 			  if ( *fastq ) | 
|  | 314 			    qual2[strlen(seq1)] = '\0'; | 
|  | 315 			  if (!clipped) clipped = 2; | 
|  | 316 			} | 
|  | 317 			else if (strlen(seq1) > strlen(seq2)){ | 
|  | 318 			  seq1[strlen(seq2)] = '\0'; | 
|  | 319 			  if ( *fastq ) | 
|  | 320 			    qual1[strlen(seq2)] = '\0'; | 
|  | 321 			  if (!clipped) clipped = 1; | 
|  | 322 			} | 
|  | 323 | 
|  | 324 			if (clipped == 1 || clipped == 2){ | 
|  | 325 			  fprintf(stdout, "[PE mode Warning] Sequence lengths are different,  read #%d is clipped to match.\n", clipped); | 
|  | 326 			  clipped = 3; | 
|  | 327 			} | 
|  | 328 | 
|  | 329 | 
|  | 330 | 
|  | 331 		} | 
|  | 332 | 
|  | 333 		if (!pairedEnd && !err1) | 
|  | 334 		{ | 
|  | 335 			int _mtmp = strlen(seq1); | 
|  | 336                         list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); | 
|  | 337                         list[seqCnt].seq  = list[seqCnt].hits + 1; | 
|  | 338                         list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | 
|  | 339                         list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | 
|  | 340                         list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | 
|  | 341 | 
|  | 342 			list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); | 
|  | 343                         list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); | 
|  | 344 | 
|  | 345 			list[seqCnt].readNumber = seqCnt; | 
|  | 346 | 
|  | 347                         reverseComplete(seq1, rseq1, _mtmp); | 
|  | 348                         rseq1[_mtmp] =  '\0'; | 
|  | 349                         int i; | 
|  | 350 | 
|  | 351                         list[seqCnt].hits[0] = 0; | 
|  | 352 | 
|  | 353                         for (i=0; i<=_mtmp; i++) | 
|  | 354                         { | 
|  | 355                                 list[seqCnt].seq[i] = seq1[i]; | 
|  | 356                                 list[seqCnt].rseq[i] = rseq1[i] ; | 
|  | 357                                 list[seqCnt].qual[i] = qual1[i]; | 
|  | 358                         } | 
|  | 359 | 
|  | 360 | 
|  | 361                         //MAKE HASH VALUE | 
|  | 362                         short code = 0; | 
|  | 363 | 
|  | 364                         for(i=0; i < 4; i++) | 
|  | 365                                 code = code * 5 + CHARCODE(list[seqCnt].seq[i]); | 
|  | 366                         list[seqCnt].hashValue[0] = code; | 
|  | 367 | 
|  | 368 | 
|  | 369                         for(i = 1; i < _mtmp-3; i++) | 
|  | 370                         { | 
|  | 371                                 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); | 
|  | 372                         } | 
|  | 373 | 
|  | 374 | 
|  | 375                         code = 0; | 
|  | 376                         for(i=0; i < 4; i++) | 
|  | 377                                 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); | 
|  | 378                         list[seqCnt].rhashValue[0] = code; | 
|  | 379 | 
|  | 380 | 
|  | 381                         for(i = 1; i < _mtmp-3; i++) | 
|  | 382                         { | 
|  | 383                                 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); | 
|  | 384                         } | 
|  | 385 | 
|  | 386 			int j = 0; | 
|  | 387 			int tmpSize = _mtmp / (errThreshold+1); | 
|  | 388 | 
|  | 389 	                list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); | 
|  | 390 			for(i=0; i < errThreshold+1; i++) | 
|  | 391 			{ | 
|  | 392 				code = 0; | 
|  | 393 				for(j = 0; j < tmpSize; j++) | 
|  | 394 				{ | 
|  | 395 					code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); | 
|  | 396 				} | 
|  | 397 				list[seqCnt].hashValSampleSize[i] = code; | 
|  | 398 			} | 
|  | 399 | 
|  | 400 			sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); | 
|  | 401 | 
|  | 402 			seqCnt++; | 
|  | 403 | 
|  | 404 		} | 
|  | 405 		else if (pairedEnd && !err1 && !err2) | 
|  | 406 		{ | 
|  | 407 | 
|  | 408 			// Naming Conventions X/1, X/2 OR X | 
|  | 409 			int tmplen = strlen(name1); | 
|  | 410 			if (strcmp(name1, name2) != 0) | 
|  | 411 			{ | 
|  | 412 				tmplen = strlen(name1)-2; | 
|  | 413 			} | 
|  | 414 | 
|  | 415 			//first seq | 
|  | 416 			int _mtmp = strlen(seq1); | 
|  | 417 			list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); | 
|  | 418 			list[seqCnt].seq = list[seqCnt].hits + 1; | 
|  | 419 			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | 
|  | 420 			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | 
|  | 421 			list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | 
|  | 422 | 
|  | 423 			list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); | 
|  | 424                         list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); | 
|  | 425                   	list[seqCnt].readNumber = seqCnt; | 
|  | 426 | 
|  | 427 			reverseComplete(seq1, rseq1, _mtmp); | 
|  | 428 			rseq1[_mtmp] =  '\0'; | 
|  | 429 			int i; | 
|  | 430 | 
|  | 431 			list[seqCnt].hits[0] = 0; | 
|  | 432 | 
|  | 433 			for (i=0; i<=_mtmp; i++) | 
|  | 434 			{ | 
|  | 435 				list[seqCnt].seq[i] = seq1[i]; | 
|  | 436 				list[seqCnt].rseq[i] = rseq1[i] ; | 
|  | 437 				list[seqCnt].qual[i] = qual1[i]; | 
|  | 438 			} | 
|  | 439 | 
|  | 440 | 
|  | 441 			name1[tmplen]='\0'; | 
|  | 442 | 
|  | 443 			//MAKE HASH VALUE | 
|  | 444                         short code = 0; | 
|  | 445 | 
|  | 446                         for(i=0; i < 4; i++) | 
|  | 447                                 code = code * 5 + CHARCODE(list[seqCnt].seq[i]); | 
|  | 448                         list[seqCnt].hashValue[0] = code; | 
|  | 449 | 
|  | 450 | 
|  | 451                         for(i = 1; i < _mtmp-3; i++) | 
|  | 452                         { | 
|  | 453                                 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); | 
|  | 454                         } | 
|  | 455 | 
|  | 456 | 
|  | 457                         code = 0; | 
|  | 458                         for(i=0; i < 4; i++) | 
|  | 459                                 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); | 
|  | 460                         list[seqCnt].rhashValue[0] = code; | 
|  | 461 | 
|  | 462 | 
|  | 463                         for(i = 1; i < _mtmp-3; i++) | 
|  | 464                         { | 
|  | 465                                 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); | 
|  | 466                         } | 
|  | 467 | 
|  | 468                         int j = 0; | 
|  | 469                         int tmpSize = _mtmp / (errThreshold+1); | 
|  | 470 | 
|  | 471                         list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); | 
|  | 472                         for(i=0; i < errThreshold+1; i++) | 
|  | 473                         { | 
|  | 474                                 code = 0; | 
|  | 475                                 for(j = 0; j < tmpSize; j++) | 
|  | 476                                 { | 
|  | 477                                         code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); | 
|  | 478                                 } | 
|  | 479                                 list[seqCnt].hashValSampleSize[i] = code; | 
|  | 480                         } | 
|  | 481 | 
|  | 482 			sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); | 
|  | 483 | 
|  | 484 | 
|  | 485 			seqCnt++; | 
|  | 486 | 
|  | 487 			//second seq | 
|  | 488 			list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); | 
|  | 489 			list[seqCnt].seq = list[seqCnt].hits + 1; | 
|  | 490 			list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | 
|  | 491 			list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | 
|  | 492 			list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | 
|  | 493 | 
|  | 494 			list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); | 
|  | 495                         list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); | 
|  | 496  		 	list[seqCnt].readNumber = seqCnt; | 
|  | 497 | 
|  | 498 			reverseComplete(seq2, rseq2, _mtmp); | 
|  | 499 			rseq2[_mtmp] =  '\0'; | 
|  | 500 | 
|  | 501 			list[seqCnt].hits[0] = 0; | 
|  | 502 | 
|  | 503 			for (i=0; i<=_mtmp; i++) | 
|  | 504 			{ | 
|  | 505 				list[seqCnt].seq[i] = seq2[i]; | 
|  | 506 				list[seqCnt].rseq[i] = rseq2[i] ; | 
|  | 507 				list[seqCnt].qual[i] = qual2[i]; | 
|  | 508 			} | 
|  | 509 | 
|  | 510 | 
|  | 511 			name2[tmplen]='\0'; | 
|  | 512 | 
|  | 513     		        //MAKE HASH VALUE | 
|  | 514                         code = 0; | 
|  | 515 | 
|  | 516                         for(i=0; i < 4; i++) | 
|  | 517                                 code = code * 5 + CHARCODE(list[seqCnt].seq[i]); | 
|  | 518                         list[seqCnt].hashValue[0] = code; | 
|  | 519 | 
|  | 520 | 
|  | 521                         for(i = 1; i < _mtmp-3; i++) | 
|  | 522                         { | 
|  | 523                                 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); | 
|  | 524                         } | 
|  | 525 | 
|  | 526 | 
|  | 527                         code = 0; | 
|  | 528                         for(i=0; i < 4; i++) | 
|  | 529                                 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); | 
|  | 530                         list[seqCnt].rhashValue[0] = code; | 
|  | 531 | 
|  | 532 | 
|  | 533                         for(i = 1; i < _mtmp-3; i++) | 
|  | 534                         { | 
|  | 535                                 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); | 
|  | 536                         } | 
|  | 537 | 
|  | 538                         j = 0; | 
|  | 539                         tmpSize = _mtmp / (errThreshold+1); | 
|  | 540 | 
|  | 541                         list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); | 
|  | 542                         for(i=0; i < errThreshold+1; i++) | 
|  | 543                         { | 
|  | 544                                 code = 0; | 
|  | 545                                 for(j = 0; j < tmpSize; j++) | 
|  | 546                                 { | 
|  | 547                                         code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); | 
|  | 548                                 } | 
|  | 549                                 list[seqCnt].hashValSampleSize[i] = code; | 
|  | 550                         } | 
|  | 551 | 
|  | 552 			sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); | 
|  | 553 | 
|  | 554 			seqCnt++; | 
|  | 555 | 
|  | 556 		} | 
|  | 557 		else | 
|  | 558 		{ | 
|  | 559 			discarded++; | 
|  | 560 		} | 
|  | 561 	} | 
|  | 562 | 
|  | 563 	if (seqCnt > 0) | 
|  | 564 	{ | 
|  | 565 		SEQ_LENGTH = strlen(list[0].seq); | 
|  | 566 	} | 
|  | 567 	else | 
|  | 568 	{ | 
|  | 569 		fprintf(stdout, "ERROR: No reads can be found for mapping\n"); | 
|  | 570 		return 0; | 
|  | 571 	} | 
|  | 572 | 
|  | 573 | 
|  | 574 	// Closing Files | 
|  | 575 	if (!compressed) | 
|  | 576 	{ | 
|  | 577 		fclose(_r_fp1); | 
|  | 578 		if ( pairedEnd && fileName2 != NULL ) | 
|  | 579 		{ | 
|  | 580 			fclose(_r_fp2); | 
|  | 581 		} | 
|  | 582 	} | 
|  | 583 	else | 
|  | 584 	{ | 
|  | 585 		gzclose(_r_gzfp1); | 
|  | 586 		if ( pairedEnd && fileName2 != NULL) | 
|  | 587 		{ | 
|  | 588 			gzclose(_r_fp2); | 
|  | 589 		} | 
|  | 590 	} | 
|  | 591 | 
|  | 592 	//qsort(list, seqCnt, sizeof(Read), toCompareRead); | 
|  | 593 | 
|  | 594 	adjustQual(list, seqCnt); | 
|  | 595 | 
|  | 596 	*seqList = list; | 
|  | 597 	*seqListSize = seqCnt; | 
|  | 598 | 
|  | 599 | 
|  | 600 	_r_seq = list; | 
|  | 601 	_r_seqCnt = seqCnt; | 
|  | 602 | 
|  | 603 	if ( pairedEnd ) discarded *= 2; | 
|  | 604 | 
|  | 605 	if (seqCnt>1) | 
|  | 606 	  fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); | 
|  | 607 	else | 
|  | 608 	  fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); | 
|  | 609 | 
|  | 610 | 
|  | 611 | 
|  | 612 	return 1; | 
|  | 613 } | 
|  | 614 /**********************************************/ | 
|  | 615 void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) | 
|  | 616 { | 
|  | 617 	int i; | 
|  | 618 	int samLocsSize = errThreshold + 1; | 
|  | 619 	int *samLocs = getMem(sizeof(int)*samLocsSize); | 
|  | 620 | 
|  | 621 	for (i=0; i<samLocsSize; i++) | 
|  | 622 	{ | 
|  | 623 		samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; | 
|  | 624 		if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) | 
|  | 625 			samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; | 
|  | 626 	} | 
|  | 627 | 
|  | 628 	// Outputing the sampling locations | 
|  | 629 | 
|  | 630 	/* | 
|  | 631 | 
|  | 632 	int j; | 
|  | 633  	for (i=0; i<SEQ_LENGTH; i++) | 
|  | 634 	{ | 
|  | 635 		fprintf(stdout, "-"); | 
|  | 636 	} | 
|  | 637 	fprintf(stdout, "\n"); | 
|  | 638 | 
|  | 639 	for ( i=0; i<samLocsSize; i++ ) | 
|  | 640 	{ | 
|  | 641 		for ( j=0; j<samLocs[i]; j++ ) | 
|  | 642 		{ | 
|  | 643 			fprintf(stdout," "); | 
|  | 644 		} | 
|  | 645 		for (j=0; j<WINDOW_SIZE; j++) | 
|  | 646 		{ | 
|  | 647 			fprintf(stdout,"+"); | 
|  | 648 		} | 
|  | 649 		fprintf(stdout, "\n"); | 
|  | 650 		fflush(stdout); | 
|  | 651 	} | 
|  | 652 | 
|  | 653 | 
|  | 654 	for ( i=0; i<SEQ_LENGTH; i++ ) | 
|  | 655 	{ | 
|  | 656 		fprintf(stdout, "-"); | 
|  | 657 	} | 
|  | 658 	fprintf(stdout, "\n"); | 
|  | 659 | 
|  | 660 	*/ | 
|  | 661 | 
|  | 662 	*samplingLocs = samLocs; | 
|  | 663 	*samplingLocsSize = samLocsSize; | 
|  | 664 	_r_samplingLocs = samLocs; | 
|  | 665 } | 
|  | 666 | 
|  | 667 void finalizeReads(char *fileName) | 
|  | 668 { | 
|  | 669 	FILE *fp1=NULL; | 
|  | 670 | 
|  | 671 	if (fileName != NULL) | 
|  | 672 	{ | 
|  | 673 		fp1 = fileOpen(fileName, "w"); | 
|  | 674 	} | 
|  | 675 	if (pairedEndMode) | 
|  | 676 		_r_seqCnt /=2; | 
|  | 677 | 
|  | 678 	int i=0; | 
|  | 679 	for (i = 0; i < _r_seqCnt; i++) | 
|  | 680 	{ | 
|  | 681 		if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0  &&  strcmp(_r_seq[2*i].qual,"*")!=0) | 
|  | 682 		{ | 
|  | 683 			fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); | 
|  | 684 		} | 
|  | 685 		else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0) | 
|  | 686 		{ | 
|  | 687 			fprintf(fp1, ">%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]); | 
|  | 688 		} | 
|  | 689 		else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) | 
|  | 690 		{ | 
|  | 691 			fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); | 
|  | 692 		} | 
|  | 693 		else if (!pairedEndMode && _r_seq[i].hits[0] == 0) | 
|  | 694 		{ | 
|  | 695 			fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); | 
|  | 696 		} | 
|  | 697 	} | 
|  | 698 | 
|  | 699 	fclose(fp1); | 
|  | 700 	if (pairedEndMode) | 
|  | 701 		_r_seqCnt *= 2; | 
|  | 702 | 
|  | 703 	for (i = 0; i < _r_seqCnt; i++) | 
|  | 704 	{ | 
|  | 705 		freeMem(_r_seq[i].hits,0); | 
|  | 706 	} | 
|  | 707 | 
|  | 708 | 
|  | 709 	freeMem(_r_seq,0); | 
|  | 710 	freeMem(_r_samplingLocs,0); | 
|  | 711 } | 
|  | 712 | 
|  | 713 void adjustQual(Read *list, int seqCnt){ | 
|  | 714   /* This function will automatically determine the phred_offset and readjust quality values if needed */ | 
|  | 715   int i,j,q, offset=64; | 
|  | 716   int len = strlen(list[0].qual); | 
|  | 717 | 
|  | 718   for (i=0; i<10000 && i<seqCnt; i++){ | 
|  | 719     for (j=0;j<len;j++){ | 
|  | 720       q = (int) list[i].qual[j] - offset; | 
|  | 721       if (q < 0){ | 
|  | 722 	offset = 33; | 
|  | 723 	break; | 
|  | 724       } | 
|  | 725     } | 
|  | 726     if (offset == 33) | 
|  | 727       break; | 
|  | 728   } | 
|  | 729 | 
|  | 730   if (offset == 64){ | 
|  | 731     fprintf(stdout, "[Quality Warning] Phred offset is 64. Readjusting to 33.\n"); | 
|  | 732     fflush(stdout); | 
|  | 733     for (i=0;i<seqCnt;i++){ | 
|  | 734       for (j=0;j<len;j++){ | 
|  | 735 	list[i].qual[j] -= 31; | 
|  | 736       } | 
|  | 737     } | 
|  | 738   } | 
|  | 739 } | 
|  | 740 | 
|  | 741 | 
|  | 742 | 
|  | 743 /*void finalizeOEAReads(char *fileName) | 
|  | 744 { | 
|  | 745 	FILE *fp1=NULL; | 
|  | 746 	FILE *fp2=NULL; | 
|  | 747 | 
|  | 748 	//printf("OEA%s\n", fileName); | 
|  | 749 	char fileNameOEA1[200]; | 
|  | 750 	char fileNameOEA2[200]; | 
|  | 751 	sprintf(fileNameOEA1, "%s_OEA1", fileName); | 
|  | 752 	sprintf(fileNameOEA2, "%s_OEA2", fileName); | 
|  | 753 | 
|  | 754 | 
|  | 755 	if (fileName != NULL) | 
|  | 756 	{ | 
|  | 757 		fp1 = fileOpen(fileNameOEA1, "w"); | 
|  | 758 		fp2 = fileOpen(fileNameOEA2, "w"); | 
|  | 759 	} | 
|  | 760 	if (pairedEndMode) | 
|  | 761 		_r_seqCnt /=2; | 
|  | 762 | 
|  | 763 	int i=0; | 
|  | 764 	printf("%d\n", _r_seqCnt); | 
|  | 765 	for (i = 0; i < _r_seqCnt; i++) | 
|  | 766 	{ | 
|  | 767 		if (pairedEndMode && _r_seq[2*i].hits[0] == 0 &&  _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0) | 
|  | 768 		{ | 
|  | 769 			fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); | 
|  | 770 		} | 
|  | 771 		else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0) | 
|  | 772 		{ | 
|  | 773 			fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); | 
|  | 774 		} | 
|  | 775 		else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 &&  _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0) | 
|  | 776 		{ | 
|  | 777  		fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, | 
|  | 778 									_r_seq[2*i].seq, | 
|  | 779 									_r_seq[2*i].qual, | 
|  | 780 									_r_seq[2*i+1].name, | 
|  | 781 									_r_seq[2*i+1].seq, | 
|  | 782 									_r_seq[2*i+1].qual); | 
|  | 783 		} | 
|  | 784 		else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 &&  _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0  ) | 
|  | 785 		{ | 
|  | 786 			 fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, | 
|  | 787                                                                         _r_seq[2*i].seq, | 
|  | 788                                                                         _r_seq[2*i].qual, | 
|  | 789                                                                         _r_seq[2*i+1].name, | 
|  | 790                                                                         _r_seq[2*i+1].seq, | 
|  | 791                                                                         _r_seq[2*i+1].qual); | 
|  | 792 		} | 
|  | 793 	} | 
|  | 794 | 
|  | 795 	fclose(fp1); | 
|  | 796 	fclose(fp2); | 
|  | 797 | 
|  | 798 }*/ |