0
|
1 /*
|
|
2 * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University
|
|
3 * All rights reserved.
|
|
4 *
|
|
5 * Redistribution and use in source and binary forms, with or without modification,
|
|
6 * are permitted provided that the following conditions are met:
|
|
7 *
|
|
8 * Redistributions of source code must retain the above copyright notice, this list
|
|
9 * of conditions and the following disclaimer.
|
|
10 * - Redistributions in binary form must reproduce the above copyright notice, this
|
|
11 * list of conditions and the following disclaimer in the documentation and/or other
|
|
12 * materials provided with the distribution.
|
|
13 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be
|
|
14 * used to endorse or promote products derived from this software without specific
|
|
15 * prior written permission.
|
|
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
27 */
|
|
28
|
|
29 /*
|
|
30 * Author : Faraz Hach
|
|
31 * Email : fhach AT cs DOT sfu
|
|
32 * Last Update : 2009-12-08
|
|
33 */
|
|
34
|
|
35
|
|
36 #include <stdio.h>
|
|
37 #include <stdlib.h>
|
|
38 #include <string.h>
|
|
39 #include <ctype.h>
|
|
40 #include <zlib.h>
|
|
41 #include "Common.h"
|
|
42 #include "Reads.h"
|
|
43
|
|
44
|
|
45
|
|
46 FILE *_r_fp1;
|
|
47 FILE *_r_fp2;
|
|
48 gzFile _r_gzfp1;
|
|
49 gzFile _r_gzfp2;
|
|
50 Read *_r_seq;
|
|
51 int _r_seqCnt;
|
|
52 int *_r_samplingLocs;
|
|
53
|
|
54 /**********************************************/
|
|
55 char *(*readFirstSeq)(char *);
|
|
56 char *(*readSecondSeq)(char *);
|
|
57 /**********************************************/
|
|
58 char *readFirstSeqTXT( char *seq )
|
|
59 {
|
|
60 return fgets(seq, SEQ_MAX_LENGTH, _r_fp1);
|
|
61 }
|
|
62
|
|
63 /**********************************************/
|
|
64 char *readSecondSeqTXT( char *seq )
|
|
65 {
|
|
66 return fgets(seq, SEQ_MAX_LENGTH, _r_fp2);
|
|
67 }
|
|
68 /**********************************************/
|
|
69 char *readFirstSeqGZ( char *seq )
|
|
70 {
|
|
71 return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH);
|
|
72 }
|
|
73
|
|
74 /**********************************************/
|
|
75 char *readSecondSeqGZ( char *seq )
|
|
76 {
|
|
77 return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH);
|
|
78 }
|
|
79 /**********************************************/
|
|
80 int readAllReads(char *fileName1,
|
|
81 char *fileName2,
|
|
82 int compressed,
|
|
83 unsigned char *fastq,
|
|
84 unsigned char pairedEnd,
|
|
85 Read **seqList,
|
|
86 unsigned int *seqListSize)
|
|
87 {
|
|
88 double startTime=getTime();
|
|
89
|
|
90 char seq1[SEQ_MAX_LENGTH];
|
|
91 char rseq1[SEQ_MAX_LENGTH];
|
|
92 char name1[SEQ_MAX_LENGTH];
|
|
93 char qual1[SEQ_MAX_LENGTH];
|
|
94 char seq2[SEQ_MAX_LENGTH];
|
|
95 char rseq2[SEQ_MAX_LENGTH];
|
|
96 char name2[SEQ_MAX_LENGTH];
|
|
97 char qual2[SEQ_MAX_LENGTH];
|
|
98
|
|
99 char dummy[SEQ_MAX_LENGTH];
|
|
100 char ch;
|
|
101 int err1, err2;
|
|
102 int nCnt;
|
|
103 int discarded = 0;
|
|
104 int seqCnt = 0;
|
|
105 int maxCnt = 0;
|
|
106 int i;
|
|
107 Read *list = NULL;
|
|
108
|
|
109
|
|
110 if (!compressed)
|
|
111 {
|
|
112 _r_fp1 = fileOpen( fileName1, "r");
|
|
113
|
|
114 if (_r_fp1 == NULL)
|
|
115 {
|
|
116 return 0;
|
|
117 }
|
|
118
|
|
119 ch = fgetc(_r_fp1);
|
|
120
|
|
121 if ( pairedEnd && fileName2 != NULL )
|
|
122 {
|
|
123 _r_fp2 = fileOpen ( fileName2, "r" );
|
|
124 if (_r_fp2 == NULL)
|
|
125 {
|
|
126 return 0;
|
|
127 }
|
|
128 }
|
|
129 else
|
|
130 {
|
|
131 _r_fp2 = _r_fp1;
|
|
132 }
|
|
133
|
|
134 readFirstSeq = &readFirstSeqTXT;
|
|
135 readSecondSeq = &readSecondSeqTXT;
|
|
136 }
|
|
137 else
|
|
138 {
|
|
139
|
|
140 _r_gzfp1 = fileOpenGZ (fileName1, "r");
|
|
141
|
|
142 if (_r_gzfp1 == NULL)
|
|
143 {
|
|
144 return 0;
|
|
145 }
|
|
146
|
|
147 ch = gzgetc(_r_gzfp1);
|
|
148
|
|
149 if ( pairedEnd && fileName2 != NULL )
|
|
150 {
|
|
151 _r_fp2 = fileOpenGZ ( fileName2, "r" );
|
|
152 if (_r_fp2 == NULL)
|
|
153 {
|
|
154 return 0;
|
|
155 }
|
|
156 }
|
|
157 else
|
|
158 {
|
|
159 _r_fp2 = _r_fp1;
|
|
160 }
|
|
161
|
|
162 readFirstSeq = &readFirstSeqGZ;
|
|
163 readSecondSeq = &readSecondSeqGZ;
|
|
164 }
|
|
165
|
|
166 if (ch == '>')
|
|
167 *fastq = 0;
|
|
168 else
|
|
169 *fastq = 1;
|
|
170
|
|
171 // Counting the number of lines in the file
|
|
172 while (readFirstSeq(dummy)) maxCnt++;
|
|
173
|
|
174 if (!compressed)
|
|
175 {
|
|
176 rewind(_r_fp1);
|
|
177 }
|
|
178 else
|
|
179 {
|
|
180 gzrewind(_r_gzfp1);
|
|
181 }
|
|
182
|
|
183 // Calculating the Maximum # of sequences
|
|
184 if (*fastq)
|
|
185 {
|
|
186 maxCnt /= 4;
|
|
187 }
|
|
188 else
|
|
189 {
|
|
190 maxCnt /= 2;
|
|
191 }
|
|
192
|
|
193
|
|
194
|
|
195 if (pairedEnd && fileName2 != NULL )
|
|
196 maxCnt *= 2;
|
|
197
|
|
198 list = getMem(sizeof(Read)*maxCnt);
|
|
199
|
|
200 while( readFirstSeq(name1) )
|
|
201 {
|
|
202 err1 = 0;
|
|
203 err2 = 0;
|
|
204 readFirstSeq(seq1);
|
|
205 name1[strlen(name1)-1] = '\0';
|
|
206 for (i=0; i<strlen(name1);i++)
|
|
207 {
|
|
208 if (name1[i] == ' ')
|
|
209 {
|
|
210 name1[i] = '\0';
|
|
211 break;
|
|
212 }
|
|
213
|
|
214 }
|
|
215
|
|
216 if ( *fastq )
|
|
217 {
|
|
218 readFirstSeq(dummy);
|
|
219 readFirstSeq(qual1);
|
|
220 qual1[strlen(qual1)-1] = '\0';
|
|
221 }
|
|
222 else
|
|
223 {
|
|
224 sprintf(qual1, "*");
|
|
225 }
|
|
226
|
|
227
|
|
228 // Cropping
|
|
229 if (cropSize > 0)
|
|
230 {
|
|
231 seq1[cropSize] = '\0';
|
|
232 if ( *fastq )
|
|
233 qual1[cropSize] = '\0';
|
|
234 }
|
|
235
|
|
236
|
|
237 nCnt = 0;
|
|
238 for (i=0; i<strlen(seq1); i++)
|
|
239 {
|
|
240 seq1[i] = toupper (seq1[i]);
|
|
241 if (seq1[i] == 'N')
|
|
242 {
|
|
243 nCnt++;
|
|
244 }
|
|
245 else if (isspace(seq1[i]))
|
|
246 {
|
|
247
|
|
248 seq1[i] = '\0';
|
|
249 break;
|
|
250 }
|
|
251 }
|
|
252
|
|
253 if (nCnt > errThreshold)
|
|
254 {
|
|
255 err1 = 1;
|
|
256 }
|
|
257
|
|
258 // Reading the second seq of pair-ends
|
|
259 if (pairedEnd)
|
|
260 {
|
|
261 readSecondSeq(name2);
|
|
262 readSecondSeq(seq2);
|
|
263 name2[strlen(name2)-1] = '\0';
|
|
264 for (i=0; i<strlen(name2);i++)
|
|
265 {
|
|
266 if (name2[i] == ' ')
|
|
267 {
|
|
268 name2[i] = '\0';
|
|
269 break;
|
|
270 }
|
|
271
|
|
272 }
|
|
273
|
|
274 if ( *fastq )
|
|
275 {
|
|
276 readSecondSeq(dummy);
|
|
277 readSecondSeq(qual2);
|
|
278
|
|
279 qual2[strlen(qual2)-1] = '\0';
|
|
280 }
|
|
281 else
|
|
282 {
|
|
283 sprintf(qual2, "*");
|
|
284 }
|
|
285
|
|
286
|
|
287 // Cropping
|
|
288 if (cropSize > 0)
|
|
289 {
|
|
290 seq2[cropSize] = '\0';
|
|
291 if ( *fastq )
|
|
292 qual2[cropSize] = '\0';
|
|
293 }
|
|
294
|
|
295
|
|
296 nCnt = 0;
|
|
297 for (i=0; i<strlen(seq2); i++)
|
|
298 {
|
|
299 seq2[i] = toupper (seq2[i]);
|
|
300 if (seq2[i] == 'N')
|
|
301 {
|
|
302 nCnt++;
|
|
303
|
|
304 }
|
|
305 else if (isspace(seq2[i]))
|
|
306 {
|
|
307 seq2[i] = '\0';
|
|
308 }
|
|
309 }
|
|
310 if (nCnt > errThreshold)
|
|
311 {
|
|
312 err2 = 1;
|
|
313 }
|
|
314 }
|
|
315
|
|
316 if (!pairedEnd && !err1)
|
|
317 {
|
|
318
|
|
319 int _mtmp = strlen(seq1);
|
|
320 list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1);
|
|
321 list[seqCnt].seq = list[seqCnt].hits + 1;
|
|
322 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
|
|
323 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
|
|
324 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
|
|
325
|
|
326
|
|
327 reverseComplete(seq1, rseq1, _mtmp);
|
|
328 rseq1[_mtmp] = '\0';
|
|
329 int i;
|
|
330
|
|
331 list[seqCnt].hits[0] = 0;
|
|
332
|
|
333 for (i=0; i<=_mtmp; i++)
|
|
334 {
|
|
335 list[seqCnt].seq[i] = seq1[i];
|
|
336 list[seqCnt].rseq[i] = rseq1[i] ;
|
|
337 list[seqCnt].qual[i] = qual1[i];
|
|
338 }
|
|
339 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
|
|
340
|
|
341 seqCnt++;
|
|
342
|
|
343 }
|
|
344 else if (pairedEnd && !err1 && !err2)
|
|
345 {
|
|
346 // Naming Conventions X/1, X/2 OR X
|
|
347 int tmplen = strlen(name1);
|
|
348 if (strcmp(name1, name2) != 0)
|
|
349 {
|
|
350 tmplen = strlen(name1)-2;
|
|
351 }
|
|
352
|
|
353 if (strcmp(name1, "@IL11_266:2:1:922:509/1") == 0)
|
|
354 {
|
|
355 fprintf(stdout, "%d\n", seqCnt);
|
|
356 }
|
|
357 //first seq
|
|
358 int _mtmp = strlen(seq1);
|
|
359 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
|
|
360 list[seqCnt].seq = list[seqCnt].hits + 1;
|
|
361 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
|
|
362 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
|
|
363 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
|
|
364
|
|
365 reverseComplete(seq1, rseq1, _mtmp);
|
|
366 rseq1[_mtmp] = '\0';
|
|
367 int i;
|
|
368
|
|
369 list[seqCnt].hits[0] = 0;
|
|
370
|
|
371 for (i=0; i<=_mtmp; i++)
|
|
372 {
|
|
373 list[seqCnt].seq[i] = seq1[i];
|
|
374 list[seqCnt].rseq[i] = rseq1[i] ;
|
|
375 list[seqCnt].qual[i] = qual1[i];
|
|
376 }
|
|
377
|
|
378
|
|
379 name1[tmplen]='\0';
|
|
380 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
|
|
381
|
|
382
|
|
383 seqCnt++;
|
|
384
|
|
385 //second seq
|
|
386 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
|
|
387 list[seqCnt].seq = list[seqCnt].hits + 1;
|
|
388 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
|
|
389 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
|
|
390 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
|
|
391
|
|
392 reverseComplete(seq2, rseq2, _mtmp);
|
|
393 rseq2[_mtmp] = '\0';
|
|
394
|
|
395 list[seqCnt].hits[0] = 0;
|
|
396
|
|
397 for (i=0; i<=_mtmp; i++)
|
|
398 {
|
|
399 list[seqCnt].seq[i] = seq2[i];
|
|
400 list[seqCnt].rseq[i] = rseq2[i] ;
|
|
401 list[seqCnt].qual[i] = qual2[i];
|
|
402 }
|
|
403
|
|
404
|
|
405 name2[tmplen]='\0';
|
|
406 sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0');
|
|
407
|
|
408
|
|
409 seqCnt++;
|
|
410
|
|
411 }
|
|
412 else
|
|
413 {
|
|
414 discarded++;
|
|
415 }
|
|
416 }
|
|
417
|
|
418 if (seqCnt > 0)
|
|
419 {
|
|
420 QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq);
|
|
421 if (! *fastq)
|
|
422 {
|
|
423 QUAL_LENGTH = 1;
|
|
424 }
|
|
425 //fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH);
|
|
426 }
|
|
427 else
|
|
428 {
|
|
429 fprintf(stdout, "ERR: No reads can be found for mapping\n");
|
|
430 return 0;
|
|
431 }
|
|
432
|
|
433
|
|
434 if (pairedEnd)
|
|
435 {
|
|
436 // seqCnt /= 2;
|
|
437 }
|
|
438
|
|
439
|
|
440 // Closing Files
|
|
441 if (!compressed)
|
|
442 {
|
|
443 fclose(_r_fp1);
|
|
444 if ( pairedEnd && fileName2 != NULL )
|
|
445 {
|
|
446 fclose(_r_fp2);
|
|
447 }
|
|
448 }
|
|
449 else
|
|
450 {
|
|
451 gzclose(_r_gzfp1);
|
|
452 if ( pairedEnd && fileName2 != NULL)
|
|
453 {
|
|
454 gzclose(_r_fp2);
|
|
455 }
|
|
456 }
|
|
457
|
|
458 *seqList = list;
|
|
459 *seqListSize = seqCnt;
|
|
460
|
|
461 _r_seq = list;
|
|
462 _r_seqCnt = seqCnt;
|
|
463
|
|
464 fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
|
|
465 //totalLoadingTime+=getTime()-startTime;
|
|
466
|
|
467 return 1;
|
|
468 }
|
|
469 /**********************************************/
|
|
470 void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize)
|
|
471 {
|
|
472 int i;
|
|
473 int samLocsSize = errThreshold + 1;
|
|
474 int *samLocs = getMem(sizeof(int)*samLocsSize);
|
|
475
|
|
476 for (i=0; i<samLocsSize; i++)
|
|
477 {
|
|
478 samLocs[i] = (SEQ_LENGTH / samLocsSize) *i;
|
|
479 if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH)
|
|
480 samLocs[i] = SEQ_LENGTH - WINDOW_SIZE;
|
|
481 }
|
|
482
|
|
483 // Outputing the sampling locations
|
|
484
|
|
485 /* int j;
|
|
486 for (i=0; i<SEQ_LENGTH; i++)
|
|
487 {
|
|
488 fprintf(stdout, "-");
|
|
489 }
|
|
490 fprintf(stdout, "\n");
|
|
491
|
|
492 for ( i=0; i<samLocsSize; i++ )
|
|
493 {
|
|
494 for ( j=0; j<samLocs[i]; j++ )
|
|
495 {
|
|
496 fprintf(stdout," ");
|
|
497 }
|
|
498 for (j=0; j<WINDOW_SIZE; j++)
|
|
499 {
|
|
500 fprintf(stdout,"+");
|
|
501 }
|
|
502 fprintf(stdout, "\n");
|
|
503 fflush(stdout);
|
|
504 }
|
|
505 for ( i=0; i<SEQ_LENGTH; i++ )
|
|
506 {
|
|
507 fprintf(stdout, "-");
|
|
508 }
|
|
509 fprintf(stdout, "\n");*/
|
|
510 *samplingLocs = samLocs;
|
|
511 *samplingLocsSize = samLocsSize;
|
|
512 _r_samplingLocs = samLocs;
|
|
513 }
|
|
514
|
|
515 void finalizeReads(char *fileName)
|
|
516 {
|
|
517 FILE *fp1=NULL;
|
|
518
|
|
519 if (fileName != NULL)
|
|
520 {
|
|
521 fp1 = fileOpen(fileName, "w");
|
|
522 }
|
|
523 if (pairedEndMode)
|
|
524 _r_seqCnt /=2;
|
|
525
|
|
526 int i=0;
|
|
527 for (i = 0; i < _r_seqCnt; i++)
|
|
528 {
|
|
529 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0)
|
|
530 {
|
|
531 fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual);
|
|
532 }
|
|
533 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0)
|
|
534 {
|
|
535 fprintf(fp1, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq);
|
|
536 }
|
|
537 else if (_r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0)
|
|
538 {
|
|
539 fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual);
|
|
540 }
|
|
541 else if (_r_seq[i].hits[0] == 0)
|
|
542 {
|
|
543 fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq);
|
|
544 }
|
|
545 }
|
|
546
|
|
547 fclose(fp1);
|
|
548 if (pairedEndMode)
|
|
549 _r_seqCnt *= 2;
|
|
550
|
|
551 for (i = 0; i < _r_seqCnt; i++)
|
|
552 {
|
|
553 freeMem(_r_seq[i].hits,0);
|
|
554 }
|
|
555
|
|
556
|
|
557 freeMem(_r_seq,0);
|
|
558 freeMem(_r_samplingLocs,0);
|
|
559 }
|