comparison mrsfast-2.3.0.2/Reads.c @ 0:ec628ba33878 default tip

Uploaded source code for mrsFAST
author calkan
date Tue, 21 Feb 2012 10:39:28 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ec628ba33878
1 /*
2 * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without modification,
6 * are permitted provided that the following conditions are met:
7 *
8 * Redistributions of source code must retain the above copyright notice, this list
9 * of conditions and the following disclaimer.
10 * - Redistributions in binary form must reproduce the above copyright notice, this
11 * list of conditions and the following disclaimer in the documentation and/or other
12 * materials provided with the distribution.
13 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be
14 * used to endorse or promote products derived from this software without specific
15 * prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Author : Faraz Hach
31 * Email : fhach AT cs DOT sfu
32 * Last Update : 2009-12-08
33 */
34
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <ctype.h>
40 #include <zlib.h>
41 #include "Common.h"
42 #include "Reads.h"
43
44
45
46 FILE *_r_fp1;
47 FILE *_r_fp2;
48 gzFile _r_gzfp1;
49 gzFile _r_gzfp2;
50 Read *_r_seq;
51 int _r_seqCnt;
52 int *_r_samplingLocs;
53
54 /**********************************************/
55 char *(*readFirstSeq)(char *);
56 char *(*readSecondSeq)(char *);
57 /**********************************************/
58 char *readFirstSeqTXT( char *seq )
59 {
60 return fgets(seq, SEQ_MAX_LENGTH, _r_fp1);
61 }
62
63 /**********************************************/
64 char *readSecondSeqTXT( char *seq )
65 {
66 return fgets(seq, SEQ_MAX_LENGTH, _r_fp2);
67 }
68 /**********************************************/
69 char *readFirstSeqGZ( char *seq )
70 {
71 return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH);
72 }
73
74 /**********************************************/
75 char *readSecondSeqGZ( char *seq )
76 {
77 return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH);
78 }
79 /**********************************************/
80 int readAllReads(char *fileName1,
81 char *fileName2,
82 int compressed,
83 unsigned char *fastq,
84 unsigned char pairedEnd,
85 Read **seqList,
86 unsigned int *seqListSize)
87 {
88 double startTime=getTime();
89
90 char seq1[SEQ_MAX_LENGTH];
91 char rseq1[SEQ_MAX_LENGTH];
92 char name1[SEQ_MAX_LENGTH];
93 char qual1[SEQ_MAX_LENGTH];
94 char seq2[SEQ_MAX_LENGTH];
95 char rseq2[SEQ_MAX_LENGTH];
96 char name2[SEQ_MAX_LENGTH];
97 char qual2[SEQ_MAX_LENGTH];
98
99 char dummy[SEQ_MAX_LENGTH];
100 char ch;
101 int err1, err2;
102 int nCnt;
103 int discarded = 0;
104 int seqCnt = 0;
105 int maxCnt = 0;
106 int i;
107 Read *list = NULL;
108
109
110 if (!compressed)
111 {
112 _r_fp1 = fileOpen( fileName1, "r");
113
114 if (_r_fp1 == NULL)
115 {
116 return 0;
117 }
118
119 ch = fgetc(_r_fp1);
120
121 if ( pairedEnd && fileName2 != NULL )
122 {
123 _r_fp2 = fileOpen ( fileName2, "r" );
124 if (_r_fp2 == NULL)
125 {
126 return 0;
127 }
128 }
129 else
130 {
131 _r_fp2 = _r_fp1;
132 }
133
134 readFirstSeq = &readFirstSeqTXT;
135 readSecondSeq = &readSecondSeqTXT;
136 }
137 else
138 {
139
140 _r_gzfp1 = fileOpenGZ (fileName1, "r");
141
142 if (_r_gzfp1 == NULL)
143 {
144 return 0;
145 }
146
147 ch = gzgetc(_r_gzfp1);
148
149 if ( pairedEnd && fileName2 != NULL )
150 {
151 _r_fp2 = fileOpenGZ ( fileName2, "r" );
152 if (_r_fp2 == NULL)
153 {
154 return 0;
155 }
156 }
157 else
158 {
159 _r_fp2 = _r_fp1;
160 }
161
162 readFirstSeq = &readFirstSeqGZ;
163 readSecondSeq = &readSecondSeqGZ;
164 }
165
166 if (ch == '>')
167 *fastq = 0;
168 else
169 *fastq = 1;
170
171 // Counting the number of lines in the file
172 while (readFirstSeq(dummy)) maxCnt++;
173
174 if (!compressed)
175 {
176 rewind(_r_fp1);
177 }
178 else
179 {
180 gzrewind(_r_gzfp1);
181 }
182
183 // Calculating the Maximum # of sequences
184 if (*fastq)
185 {
186 maxCnt /= 4;
187 }
188 else
189 {
190 maxCnt /= 2;
191 }
192
193
194
195 if (pairedEnd && fileName2 != NULL )
196 maxCnt *= 2;
197
198 list = getMem(sizeof(Read)*maxCnt);
199
200 while( readFirstSeq(name1) )
201 {
202 err1 = 0;
203 err2 = 0;
204 readFirstSeq(seq1);
205 name1[strlen(name1)-1] = '\0';
206 for (i=0; i<strlen(name1);i++)
207 {
208 if (name1[i] == ' ')
209 {
210 name1[i] = '\0';
211 break;
212 }
213
214 }
215
216 if ( *fastq )
217 {
218 readFirstSeq(dummy);
219 readFirstSeq(qual1);
220 qual1[strlen(qual1)-1] = '\0';
221 }
222 else
223 {
224 sprintf(qual1, "*");
225 }
226
227
228 // Cropping
229 if (cropSize > 0)
230 {
231 seq1[cropSize] = '\0';
232 if ( *fastq )
233 qual1[cropSize] = '\0';
234 }
235
236
237 nCnt = 0;
238 for (i=0; i<strlen(seq1); i++)
239 {
240 seq1[i] = toupper (seq1[i]);
241 if (seq1[i] == 'N')
242 {
243 nCnt++;
244 }
245 else if (isspace(seq1[i]))
246 {
247
248 seq1[i] = '\0';
249 break;
250 }
251 }
252
253 if (nCnt > errThreshold)
254 {
255 err1 = 1;
256 }
257
258 // Reading the second seq of pair-ends
259 if (pairedEnd)
260 {
261 readSecondSeq(name2);
262 readSecondSeq(seq2);
263 name2[strlen(name2)-1] = '\0';
264 for (i=0; i<strlen(name2);i++)
265 {
266 if (name2[i] == ' ')
267 {
268 name2[i] = '\0';
269 break;
270 }
271
272 }
273
274 if ( *fastq )
275 {
276 readSecondSeq(dummy);
277 readSecondSeq(qual2);
278
279 qual2[strlen(qual2)-1] = '\0';
280 }
281 else
282 {
283 sprintf(qual2, "*");
284 }
285
286
287 // Cropping
288 if (cropSize > 0)
289 {
290 seq2[cropSize] = '\0';
291 if ( *fastq )
292 qual2[cropSize] = '\0';
293 }
294
295
296 nCnt = 0;
297 for (i=0; i<strlen(seq2); i++)
298 {
299 seq2[i] = toupper (seq2[i]);
300 if (seq2[i] == 'N')
301 {
302 nCnt++;
303
304 }
305 else if (isspace(seq2[i]))
306 {
307 seq2[i] = '\0';
308 }
309 }
310 if (nCnt > errThreshold)
311 {
312 err2 = 1;
313 }
314 }
315
316 if (!pairedEnd && !err1)
317 {
318
319 int _mtmp = strlen(seq1);
320 list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1);
321 list[seqCnt].seq = list[seqCnt].hits + 1;
322 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
323 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
324 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
325
326
327 reverseComplete(seq1, rseq1, _mtmp);
328 rseq1[_mtmp] = '\0';
329 int i;
330
331 list[seqCnt].hits[0] = 0;
332
333 for (i=0; i<=_mtmp; i++)
334 {
335 list[seqCnt].seq[i] = seq1[i];
336 list[seqCnt].rseq[i] = rseq1[i] ;
337 list[seqCnt].qual[i] = qual1[i];
338 }
339 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
340
341 seqCnt++;
342
343 }
344 else if (pairedEnd && !err1 && !err2)
345 {
346 // Naming Conventions X/1, X/2 OR X
347 int tmplen = strlen(name1);
348 if (strcmp(name1, name2) != 0)
349 {
350 tmplen = strlen(name1)-2;
351 }
352
353 if (strcmp(name1, "@IL11_266:2:1:922:509/1") == 0)
354 {
355 fprintf(stdout, "%d\n", seqCnt);
356 }
357 //first seq
358 int _mtmp = strlen(seq1);
359 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
360 list[seqCnt].seq = list[seqCnt].hits + 1;
361 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
362 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
363 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
364
365 reverseComplete(seq1, rseq1, _mtmp);
366 rseq1[_mtmp] = '\0';
367 int i;
368
369 list[seqCnt].hits[0] = 0;
370
371 for (i=0; i<=_mtmp; i++)
372 {
373 list[seqCnt].seq[i] = seq1[i];
374 list[seqCnt].rseq[i] = rseq1[i] ;
375 list[seqCnt].qual[i] = qual1[i];
376 }
377
378
379 name1[tmplen]='\0';
380 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
381
382
383 seqCnt++;
384
385 //second seq
386 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
387 list[seqCnt].seq = list[seqCnt].hits + 1;
388 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
389 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
390 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
391
392 reverseComplete(seq2, rseq2, _mtmp);
393 rseq2[_mtmp] = '\0';
394
395 list[seqCnt].hits[0] = 0;
396
397 for (i=0; i<=_mtmp; i++)
398 {
399 list[seqCnt].seq[i] = seq2[i];
400 list[seqCnt].rseq[i] = rseq2[i] ;
401 list[seqCnt].qual[i] = qual2[i];
402 }
403
404
405 name2[tmplen]='\0';
406 sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0');
407
408
409 seqCnt++;
410
411 }
412 else
413 {
414 discarded++;
415 }
416 }
417
418 if (seqCnt > 0)
419 {
420 QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
421 if (! *fastq)
422 {
423 QUAL_LENGTH = 1;
424 }
425 //fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH);
426 }
427 else
428 {
429 fprintf(stdout, "ERR: No reads can be found for mapping\n");
430 return 0;
431 }
432
433
434 if (pairedEnd)
435 {
436 // seqCnt /= 2;
437 }
438
439
440 // Closing Files
441 if (!compressed)
442 {
443 fclose(_r_fp1);
444 if ( pairedEnd && fileName2 != NULL )
445 {
446 fclose(_r_fp2);
447 }
448 }
449 else
450 {
451 gzclose(_r_gzfp1);
452 if ( pairedEnd && fileName2 != NULL)
453 {
454 gzclose(_r_fp2);
455 }
456 }
457
458 *seqList = list;
459 *seqListSize = seqCnt;
460
461 _r_seq = list;
462 _r_seqCnt = seqCnt;
463
464 fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
465 //totalLoadingTime+=getTime()-startTime;
466
467 return 1;
468 }
469 /**********************************************/
470 void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize)
471 {
472 int i;
473 int samLocsSize = errThreshold + 1;
474 int *samLocs = getMem(sizeof(int)*samLocsSize);
475
476 for (i=0; i<samLocsSize; i++)
477 {
478 samLocs[i] = (SEQ_LENGTH / samLocsSize) *i;
479 if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH)
480 samLocs[i] = SEQ_LENGTH - WINDOW_SIZE;
481 }
482
483 // Outputing the sampling locations
484
485 /* int j;
486 for (i=0; i<SEQ_LENGTH; i++)
487 {
488 fprintf(stdout, "-");
489 }
490 fprintf(stdout, "\n");
491
492 for ( i=0; i<samLocsSize; i++ )
493 {
494 for ( j=0; j<samLocs[i]; j++ )
495 {
496 fprintf(stdout," ");
497 }
498 for (j=0; j<WINDOW_SIZE; j++)
499 {
500 fprintf(stdout,"+");
501 }
502 fprintf(stdout, "\n");
503 fflush(stdout);
504 }
505 for ( i=0; i<SEQ_LENGTH; i++ )
506 {
507 fprintf(stdout, "-");
508 }
509 fprintf(stdout, "\n");*/
510 *samplingLocs = samLocs;
511 *samplingLocsSize = samLocsSize;
512 _r_samplingLocs = samLocs;
513 }
514
515 void finalizeReads(char *fileName)
516 {
517 FILE *fp1=NULL;
518
519 if (fileName != NULL)
520 {
521 fp1 = fileOpen(fileName, "w");
522 }
523 if (pairedEndMode)
524 _r_seqCnt /=2;
525
526 int i=0;
527 for (i = 0; i < _r_seqCnt; i++)
528 {
529 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0)
530 {
531 fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual);
532 }
533 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0)
534 {
535 fprintf(fp1, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq);
536 }
537 else if (_r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0)
538 {
539 fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual);
540 }
541 else if (_r_seq[i].hits[0] == 0)
542 {
543 fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq);
544 }
545 }
546
547 fclose(fp1);
548 if (pairedEndMode)
549 _r_seqCnt *= 2;
550
551 for (i = 0; i < _r_seqCnt; i++)
552 {
553 freeMem(_r_seq[i].hits,0);
554 }
555
556
557 freeMem(_r_seq,0);
558 freeMem(_r_samplingLocs,0);
559 }