Mercurial > repos > calkan > mrsfast
comparison mrsfast-2.3.0.2/Reads.c @ 0:ec628ba33878 default tip
Uploaded source code for mrsFAST
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:39:28 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ec628ba33878 |
---|---|
1 /* | |
2 * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University | |
3 * All rights reserved. | |
4 * | |
5 * Redistribution and use in source and binary forms, with or without modification, | |
6 * are permitted provided that the following conditions are met: | |
7 * | |
8 * Redistributions of source code must retain the above copyright notice, this list | |
9 * of conditions and the following disclaimer. | |
10 * - Redistributions in binary form must reproduce the above copyright notice, this | |
11 * list of conditions and the following disclaimer in the documentation and/or other | |
12 * materials provided with the distribution. | |
13 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be | |
14 * used to endorse or promote products derived from this software without specific | |
15 * prior written permission. | |
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
27 */ | |
28 | |
29 /* | |
30 * Author : Faraz Hach | |
31 * Email : fhach AT cs DOT sfu | |
32 * Last Update : 2009-12-08 | |
33 */ | |
34 | |
35 | |
36 #include <stdio.h> | |
37 #include <stdlib.h> | |
38 #include <string.h> | |
39 #include <ctype.h> | |
40 #include <zlib.h> | |
41 #include "Common.h" | |
42 #include "Reads.h" | |
43 | |
44 | |
45 | |
46 FILE *_r_fp1; | |
47 FILE *_r_fp2; | |
48 gzFile _r_gzfp1; | |
49 gzFile _r_gzfp2; | |
50 Read *_r_seq; | |
51 int _r_seqCnt; | |
52 int *_r_samplingLocs; | |
53 | |
54 /**********************************************/ | |
55 char *(*readFirstSeq)(char *); | |
56 char *(*readSecondSeq)(char *); | |
57 /**********************************************/ | |
58 char *readFirstSeqTXT( char *seq ) | |
59 { | |
60 return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); | |
61 } | |
62 | |
63 /**********************************************/ | |
64 char *readSecondSeqTXT( char *seq ) | |
65 { | |
66 return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); | |
67 } | |
68 /**********************************************/ | |
69 char *readFirstSeqGZ( char *seq ) | |
70 { | |
71 return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); | |
72 } | |
73 | |
74 /**********************************************/ | |
75 char *readSecondSeqGZ( char *seq ) | |
76 { | |
77 return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); | |
78 } | |
79 /**********************************************/ | |
80 int readAllReads(char *fileName1, | |
81 char *fileName2, | |
82 int compressed, | |
83 unsigned char *fastq, | |
84 unsigned char pairedEnd, | |
85 Read **seqList, | |
86 unsigned int *seqListSize) | |
87 { | |
88 double startTime=getTime(); | |
89 | |
90 char seq1[SEQ_MAX_LENGTH]; | |
91 char rseq1[SEQ_MAX_LENGTH]; | |
92 char name1[SEQ_MAX_LENGTH]; | |
93 char qual1[SEQ_MAX_LENGTH]; | |
94 char seq2[SEQ_MAX_LENGTH]; | |
95 char rseq2[SEQ_MAX_LENGTH]; | |
96 char name2[SEQ_MAX_LENGTH]; | |
97 char qual2[SEQ_MAX_LENGTH]; | |
98 | |
99 char dummy[SEQ_MAX_LENGTH]; | |
100 char ch; | |
101 int err1, err2; | |
102 int nCnt; | |
103 int discarded = 0; | |
104 int seqCnt = 0; | |
105 int maxCnt = 0; | |
106 int i; | |
107 Read *list = NULL; | |
108 | |
109 | |
110 if (!compressed) | |
111 { | |
112 _r_fp1 = fileOpen( fileName1, "r"); | |
113 | |
114 if (_r_fp1 == NULL) | |
115 { | |
116 return 0; | |
117 } | |
118 | |
119 ch = fgetc(_r_fp1); | |
120 | |
121 if ( pairedEnd && fileName2 != NULL ) | |
122 { | |
123 _r_fp2 = fileOpen ( fileName2, "r" ); | |
124 if (_r_fp2 == NULL) | |
125 { | |
126 return 0; | |
127 } | |
128 } | |
129 else | |
130 { | |
131 _r_fp2 = _r_fp1; | |
132 } | |
133 | |
134 readFirstSeq = &readFirstSeqTXT; | |
135 readSecondSeq = &readSecondSeqTXT; | |
136 } | |
137 else | |
138 { | |
139 | |
140 _r_gzfp1 = fileOpenGZ (fileName1, "r"); | |
141 | |
142 if (_r_gzfp1 == NULL) | |
143 { | |
144 return 0; | |
145 } | |
146 | |
147 ch = gzgetc(_r_gzfp1); | |
148 | |
149 if ( pairedEnd && fileName2 != NULL ) | |
150 { | |
151 _r_fp2 = fileOpenGZ ( fileName2, "r" ); | |
152 if (_r_fp2 == NULL) | |
153 { | |
154 return 0; | |
155 } | |
156 } | |
157 else | |
158 { | |
159 _r_fp2 = _r_fp1; | |
160 } | |
161 | |
162 readFirstSeq = &readFirstSeqGZ; | |
163 readSecondSeq = &readSecondSeqGZ; | |
164 } | |
165 | |
166 if (ch == '>') | |
167 *fastq = 0; | |
168 else | |
169 *fastq = 1; | |
170 | |
171 // Counting the number of lines in the file | |
172 while (readFirstSeq(dummy)) maxCnt++; | |
173 | |
174 if (!compressed) | |
175 { | |
176 rewind(_r_fp1); | |
177 } | |
178 else | |
179 { | |
180 gzrewind(_r_gzfp1); | |
181 } | |
182 | |
183 // Calculating the Maximum # of sequences | |
184 if (*fastq) | |
185 { | |
186 maxCnt /= 4; | |
187 } | |
188 else | |
189 { | |
190 maxCnt /= 2; | |
191 } | |
192 | |
193 | |
194 | |
195 if (pairedEnd && fileName2 != NULL ) | |
196 maxCnt *= 2; | |
197 | |
198 list = getMem(sizeof(Read)*maxCnt); | |
199 | |
200 while( readFirstSeq(name1) ) | |
201 { | |
202 err1 = 0; | |
203 err2 = 0; | |
204 readFirstSeq(seq1); | |
205 name1[strlen(name1)-1] = '\0'; | |
206 for (i=0; i<strlen(name1);i++) | |
207 { | |
208 if (name1[i] == ' ') | |
209 { | |
210 name1[i] = '\0'; | |
211 break; | |
212 } | |
213 | |
214 } | |
215 | |
216 if ( *fastq ) | |
217 { | |
218 readFirstSeq(dummy); | |
219 readFirstSeq(qual1); | |
220 qual1[strlen(qual1)-1] = '\0'; | |
221 } | |
222 else | |
223 { | |
224 sprintf(qual1, "*"); | |
225 } | |
226 | |
227 | |
228 // Cropping | |
229 if (cropSize > 0) | |
230 { | |
231 seq1[cropSize] = '\0'; | |
232 if ( *fastq ) | |
233 qual1[cropSize] = '\0'; | |
234 } | |
235 | |
236 | |
237 nCnt = 0; | |
238 for (i=0; i<strlen(seq1); i++) | |
239 { | |
240 seq1[i] = toupper (seq1[i]); | |
241 if (seq1[i] == 'N') | |
242 { | |
243 nCnt++; | |
244 } | |
245 else if (isspace(seq1[i])) | |
246 { | |
247 | |
248 seq1[i] = '\0'; | |
249 break; | |
250 } | |
251 } | |
252 | |
253 if (nCnt > errThreshold) | |
254 { | |
255 err1 = 1; | |
256 } | |
257 | |
258 // Reading the second seq of pair-ends | |
259 if (pairedEnd) | |
260 { | |
261 readSecondSeq(name2); | |
262 readSecondSeq(seq2); | |
263 name2[strlen(name2)-1] = '\0'; | |
264 for (i=0; i<strlen(name2);i++) | |
265 { | |
266 if (name2[i] == ' ') | |
267 { | |
268 name2[i] = '\0'; | |
269 break; | |
270 } | |
271 | |
272 } | |
273 | |
274 if ( *fastq ) | |
275 { | |
276 readSecondSeq(dummy); | |
277 readSecondSeq(qual2); | |
278 | |
279 qual2[strlen(qual2)-1] = '\0'; | |
280 } | |
281 else | |
282 { | |
283 sprintf(qual2, "*"); | |
284 } | |
285 | |
286 | |
287 // Cropping | |
288 if (cropSize > 0) | |
289 { | |
290 seq2[cropSize] = '\0'; | |
291 if ( *fastq ) | |
292 qual2[cropSize] = '\0'; | |
293 } | |
294 | |
295 | |
296 nCnt = 0; | |
297 for (i=0; i<strlen(seq2); i++) | |
298 { | |
299 seq2[i] = toupper (seq2[i]); | |
300 if (seq2[i] == 'N') | |
301 { | |
302 nCnt++; | |
303 | |
304 } | |
305 else if (isspace(seq2[i])) | |
306 { | |
307 seq2[i] = '\0'; | |
308 } | |
309 } | |
310 if (nCnt > errThreshold) | |
311 { | |
312 err2 = 1; | |
313 } | |
314 } | |
315 | |
316 if (!pairedEnd && !err1) | |
317 { | |
318 | |
319 int _mtmp = strlen(seq1); | |
320 list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); | |
321 list[seqCnt].seq = list[seqCnt].hits + 1; | |
322 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | |
323 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | |
324 list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | |
325 | |
326 | |
327 reverseComplete(seq1, rseq1, _mtmp); | |
328 rseq1[_mtmp] = '\0'; | |
329 int i; | |
330 | |
331 list[seqCnt].hits[0] = 0; | |
332 | |
333 for (i=0; i<=_mtmp; i++) | |
334 { | |
335 list[seqCnt].seq[i] = seq1[i]; | |
336 list[seqCnt].rseq[i] = rseq1[i] ; | |
337 list[seqCnt].qual[i] = qual1[i]; | |
338 } | |
339 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); | |
340 | |
341 seqCnt++; | |
342 | |
343 } | |
344 else if (pairedEnd && !err1 && !err2) | |
345 { | |
346 // Naming Conventions X/1, X/2 OR X | |
347 int tmplen = strlen(name1); | |
348 if (strcmp(name1, name2) != 0) | |
349 { | |
350 tmplen = strlen(name1)-2; | |
351 } | |
352 | |
353 if (strcmp(name1, "@IL11_266:2:1:922:509/1") == 0) | |
354 { | |
355 fprintf(stdout, "%d\n", seqCnt); | |
356 } | |
357 //first seq | |
358 int _mtmp = strlen(seq1); | |
359 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); | |
360 list[seqCnt].seq = list[seqCnt].hits + 1; | |
361 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | |
362 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | |
363 list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | |
364 | |
365 reverseComplete(seq1, rseq1, _mtmp); | |
366 rseq1[_mtmp] = '\0'; | |
367 int i; | |
368 | |
369 list[seqCnt].hits[0] = 0; | |
370 | |
371 for (i=0; i<=_mtmp; i++) | |
372 { | |
373 list[seqCnt].seq[i] = seq1[i]; | |
374 list[seqCnt].rseq[i] = rseq1[i] ; | |
375 list[seqCnt].qual[i] = qual1[i]; | |
376 } | |
377 | |
378 | |
379 name1[tmplen]='\0'; | |
380 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); | |
381 | |
382 | |
383 seqCnt++; | |
384 | |
385 //second seq | |
386 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); | |
387 list[seqCnt].seq = list[seqCnt].hits + 1; | |
388 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; | |
389 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; | |
390 list[seqCnt].name = list[seqCnt].qual + _mtmp+1; | |
391 | |
392 reverseComplete(seq2, rseq2, _mtmp); | |
393 rseq2[_mtmp] = '\0'; | |
394 | |
395 list[seqCnt].hits[0] = 0; | |
396 | |
397 for (i=0; i<=_mtmp; i++) | |
398 { | |
399 list[seqCnt].seq[i] = seq2[i]; | |
400 list[seqCnt].rseq[i] = rseq2[i] ; | |
401 list[seqCnt].qual[i] = qual2[i]; | |
402 } | |
403 | |
404 | |
405 name2[tmplen]='\0'; | |
406 sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); | |
407 | |
408 | |
409 seqCnt++; | |
410 | |
411 } | |
412 else | |
413 { | |
414 discarded++; | |
415 } | |
416 } | |
417 | |
418 if (seqCnt > 0) | |
419 { | |
420 QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq); | |
421 if (! *fastq) | |
422 { | |
423 QUAL_LENGTH = 1; | |
424 } | |
425 //fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH); | |
426 } | |
427 else | |
428 { | |
429 fprintf(stdout, "ERR: No reads can be found for mapping\n"); | |
430 return 0; | |
431 } | |
432 | |
433 | |
434 if (pairedEnd) | |
435 { | |
436 // seqCnt /= 2; | |
437 } | |
438 | |
439 | |
440 // Closing Files | |
441 if (!compressed) | |
442 { | |
443 fclose(_r_fp1); | |
444 if ( pairedEnd && fileName2 != NULL ) | |
445 { | |
446 fclose(_r_fp2); | |
447 } | |
448 } | |
449 else | |
450 { | |
451 gzclose(_r_gzfp1); | |
452 if ( pairedEnd && fileName2 != NULL) | |
453 { | |
454 gzclose(_r_fp2); | |
455 } | |
456 } | |
457 | |
458 *seqList = list; | |
459 *seqListSize = seqCnt; | |
460 | |
461 _r_seq = list; | |
462 _r_seqCnt = seqCnt; | |
463 | |
464 fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); | |
465 //totalLoadingTime+=getTime()-startTime; | |
466 | |
467 return 1; | |
468 } | |
469 /**********************************************/ | |
470 void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) | |
471 { | |
472 int i; | |
473 int samLocsSize = errThreshold + 1; | |
474 int *samLocs = getMem(sizeof(int)*samLocsSize); | |
475 | |
476 for (i=0; i<samLocsSize; i++) | |
477 { | |
478 samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; | |
479 if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) | |
480 samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; | |
481 } | |
482 | |
483 // Outputing the sampling locations | |
484 | |
485 /* int j; | |
486 for (i=0; i<SEQ_LENGTH; i++) | |
487 { | |
488 fprintf(stdout, "-"); | |
489 } | |
490 fprintf(stdout, "\n"); | |
491 | |
492 for ( i=0; i<samLocsSize; i++ ) | |
493 { | |
494 for ( j=0; j<samLocs[i]; j++ ) | |
495 { | |
496 fprintf(stdout," "); | |
497 } | |
498 for (j=0; j<WINDOW_SIZE; j++) | |
499 { | |
500 fprintf(stdout,"+"); | |
501 } | |
502 fprintf(stdout, "\n"); | |
503 fflush(stdout); | |
504 } | |
505 for ( i=0; i<SEQ_LENGTH; i++ ) | |
506 { | |
507 fprintf(stdout, "-"); | |
508 } | |
509 fprintf(stdout, "\n");*/ | |
510 *samplingLocs = samLocs; | |
511 *samplingLocsSize = samLocsSize; | |
512 _r_samplingLocs = samLocs; | |
513 } | |
514 | |
515 void finalizeReads(char *fileName) | |
516 { | |
517 FILE *fp1=NULL; | |
518 | |
519 if (fileName != NULL) | |
520 { | |
521 fp1 = fileOpen(fileName, "w"); | |
522 } | |
523 if (pairedEndMode) | |
524 _r_seqCnt /=2; | |
525 | |
526 int i=0; | |
527 for (i = 0; i < _r_seqCnt; i++) | |
528 { | |
529 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0) | |
530 { | |
531 fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); | |
532 } | |
533 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0) | |
534 { | |
535 fprintf(fp1, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); | |
536 } | |
537 else if (_r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) | |
538 { | |
539 fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); | |
540 } | |
541 else if (_r_seq[i].hits[0] == 0) | |
542 { | |
543 fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); | |
544 } | |
545 } | |
546 | |
547 fclose(fp1); | |
548 if (pairedEndMode) | |
549 _r_seqCnt *= 2; | |
550 | |
551 for (i = 0; i < _r_seqCnt; i++) | |
552 { | |
553 freeMem(_r_seq[i].hits,0); | |
554 } | |
555 | |
556 | |
557 freeMem(_r_seq,0); | |
558 freeMem(_r_samplingLocs,0); | |
559 } |