1
|
1 /*
|
|
2 * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University
|
|
3 * All rights reserved.
|
|
4 *
|
|
5 * Redistribution and use in source and binary forms, with or without modification,
|
|
6 * are permitted provided that the following conditions are met:
|
|
7 *
|
|
8 * Redistributions of source code must retain the above copyright notice, this list
|
|
9 * of conditions and the following disclaimer.
|
|
10 * - Redistributions in binary form must reproduce the above copyright notice, this
|
|
11 * list of conditions and the following disclaimer in the documentation and/or other
|
|
12 * materials provided with the distribution.
|
|
13 * - Neither the names of the University of Washington, Simon Fraser University,
|
|
14 * nor the names of its contributors may be
|
|
15 * used to endorse or promote products derived from this software without specific
|
|
16 * prior written permission.
|
|
17 *
|
|
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29 */
|
|
30
|
|
31 /*
|
|
32 Authors:
|
|
33 Farhad Hormozdiari
|
|
34 Faraz Hach
|
|
35 Can Alkan
|
|
36 Emails:
|
|
37 farhadh AT uw DOT edu
|
|
38 fhach AT cs DOT sfu DOT ca
|
|
39 calkan AT uw DOT edu
|
|
40 */
|
|
41
|
|
42
|
|
43
|
|
44 #include <stdio.h>
|
|
45 #include <stdlib.h>
|
|
46 #include <string.h>
|
|
47 #include <ctype.h>
|
|
48 #include <zlib.h>
|
|
49 #include "Common.h"
|
|
50 #include "Reads.h"
|
|
51
|
|
52 #define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4))))
|
|
53
|
|
54 FILE *_r_fp1;
|
|
55 FILE *_r_fp2;
|
|
56 gzFile _r_gzfp1;
|
|
57 gzFile _r_gzfp2;
|
|
58 Read *_r_seq;
|
|
59 int _r_seqCnt;
|
|
60 int *_r_samplingLocs;
|
|
61
|
|
62 /**********************************************/
|
|
63 char *(*readFirstSeq)(char *);
|
|
64 char *(*readSecondSeq)(char *);
|
|
65 /**********************************************/
|
|
66 char *readFirstSeqTXT( char *seq )
|
|
67 {
|
|
68 return fgets(seq, SEQ_MAX_LENGTH, _r_fp1);
|
|
69 }
|
|
70
|
|
71 /**********************************************/
|
|
72 char *readSecondSeqTXT( char *seq )
|
|
73 {
|
|
74 return fgets(seq, SEQ_MAX_LENGTH, _r_fp2);
|
|
75 }
|
|
76 /**********************************************/
|
|
77 char *readFirstSeqGZ( char *seq )
|
|
78 {
|
|
79 return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH);
|
|
80 }
|
|
81
|
|
82 /**********************************************/
|
|
83 char *readSecondSeqGZ( char *seq )
|
|
84 {
|
|
85 return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH);
|
|
86 }
|
|
87 /**********************************************/
|
|
88 int toCompareRead(const void * elem1, const void * elem2)
|
|
89 {
|
|
90 return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq);
|
|
91 }
|
|
92 /**********************************************/
|
|
93 int readAllReads(char *fileName1,
|
|
94 char *fileName2,
|
|
95 int compressed,
|
|
96 unsigned char *fastq,
|
|
97 unsigned char pairedEnd,
|
|
98 Read **seqList,
|
|
99 unsigned int *seqListSize)
|
|
100 {
|
|
101 double startTime=getTime();
|
|
102
|
|
103 char seq1[SEQ_MAX_LENGTH];
|
|
104 char rseq1[SEQ_MAX_LENGTH];
|
|
105 char name1[SEQ_MAX_LENGTH];
|
|
106 char qual1[SEQ_MAX_LENGTH];
|
|
107 char seq2[SEQ_MAX_LENGTH];
|
|
108 char rseq2[SEQ_MAX_LENGTH];
|
|
109 char name2[SEQ_MAX_LENGTH];
|
|
110 char qual2[SEQ_MAX_LENGTH];
|
|
111
|
|
112 char dummy[SEQ_MAX_LENGTH];
|
|
113 char ch;
|
|
114 int err1, err2;
|
|
115 int nCnt;
|
|
116 int discarded = 0;
|
|
117 int seqCnt = 0;
|
|
118 int maxCnt = 0;
|
|
119 int i;
|
|
120 Read *list = NULL;
|
|
121
|
|
122 int clipped = 0;
|
|
123
|
|
124
|
|
125 if (!compressed)
|
|
126 {
|
|
127 _r_fp1 = fileOpen( fileName1, "r");
|
|
128
|
|
129 if (_r_fp1 == NULL)
|
|
130 {
|
|
131 return 0;
|
|
132 }
|
|
133
|
|
134 ch = fgetc(_r_fp1);
|
|
135
|
|
136 if ( pairedEnd && fileName2 != NULL )
|
|
137 {
|
|
138 _r_fp2 = fileOpen ( fileName2, "r" );
|
|
139 if (_r_fp2 == NULL)
|
|
140 {
|
|
141 return 0;
|
|
142 }
|
|
143 }
|
|
144 else
|
|
145 {
|
|
146 _r_fp2 = _r_fp1;
|
|
147 }
|
|
148
|
|
149 readFirstSeq = &readFirstSeqTXT;
|
|
150 readSecondSeq = &readSecondSeqTXT;
|
|
151 }
|
|
152 else
|
|
153 {
|
|
154
|
|
155 _r_gzfp1 = fileOpenGZ (fileName1, "r");
|
|
156
|
|
157 if (_r_gzfp1 == NULL)
|
|
158 {
|
|
159 return 0;
|
|
160 }
|
|
161
|
|
162 ch = gzgetc(_r_gzfp1);
|
|
163
|
|
164 if ( pairedEnd && fileName2 != NULL )
|
|
165 {
|
|
166 _r_gzfp2 = fileOpenGZ ( fileName2, "r" );
|
|
167 if (_r_gzfp2 == NULL)
|
|
168 {
|
|
169 return 0;
|
|
170 }
|
|
171 }
|
|
172 else
|
|
173 {
|
|
174 _r_gzfp2 = _r_gzfp1;
|
|
175 }
|
|
176
|
|
177 readFirstSeq = &readFirstSeqGZ;
|
|
178 readSecondSeq = &readSecondSeqGZ;
|
|
179 }
|
|
180
|
|
181 if (ch == '>')
|
|
182 *fastq = 0;
|
|
183 else
|
|
184 *fastq = 1;
|
|
185
|
|
186 // Counting the number of lines in the file
|
|
187 while (readFirstSeq(dummy)) maxCnt++;
|
|
188
|
|
189 if (!compressed)
|
|
190 {
|
|
191 rewind(_r_fp1);
|
|
192 }
|
|
193 else
|
|
194 {
|
|
195 gzrewind(_r_gzfp1);
|
|
196 }
|
|
197
|
|
198 // Calculating the Maximum # of sequences
|
|
199 if (*fastq)
|
|
200 {
|
|
201 maxCnt /= 4;
|
|
202 }
|
|
203 else
|
|
204 {
|
|
205 maxCnt /= 2;
|
|
206 }
|
|
207
|
|
208 if (pairedEnd && fileName2 != NULL )
|
|
209 maxCnt *= 2;
|
|
210
|
|
211 list = getMem(sizeof(Read)*maxCnt);
|
|
212
|
|
213 while( readFirstSeq(name1) )
|
|
214 {
|
|
215 err1 = 0;
|
|
216 err2 = 0;
|
|
217 readFirstSeq(seq1);
|
|
218 name1[strlen(name1)-1] = '\0';
|
|
219
|
|
220 if ( *fastq )
|
|
221 {
|
|
222 readFirstSeq(dummy);
|
|
223 readFirstSeq(qual1);
|
|
224 qual1[strlen(qual1)-1] = '\0';
|
|
225 }
|
|
226 else
|
|
227 {
|
|
228 sprintf(qual1, "*");
|
|
229 }
|
|
230
|
|
231 // Cropping
|
|
232 if (cropSize > 0)
|
|
233 {
|
|
234 seq1[cropSize] = '\0';
|
|
235 if ( *fastq )
|
|
236 qual1[cropSize] = '\0';
|
|
237 }
|
|
238
|
|
239
|
|
240 nCnt = 0;
|
|
241 for (i=0; i<strlen(seq1); i++)
|
|
242 {
|
|
243 seq1[i] = toupper (seq1[i]);
|
|
244 if (seq1[i] == 'N')
|
|
245 {
|
|
246 nCnt++;
|
|
247 }
|
|
248 else if (isspace(seq1[i]))
|
|
249 {
|
|
250
|
|
251 seq1[i] = '\0';
|
|
252 break;
|
|
253 }
|
|
254 }
|
|
255
|
|
256 if (nCnt > errThreshold)
|
|
257 {
|
|
258 err1 = 1;
|
|
259 }
|
|
260
|
|
261 // Reading the second seq of pair-ends
|
|
262 if (pairedEnd)
|
|
263 {
|
|
264 readSecondSeq(name2);
|
|
265 readSecondSeq(seq2);
|
|
266 name2[strlen(name2)-1] = '\0';
|
|
267
|
|
268
|
|
269 if ( *fastq )
|
|
270 {
|
|
271 readSecondSeq(dummy);
|
|
272 readSecondSeq(qual2);
|
|
273
|
|
274 qual2[strlen(qual2)-1] = '\0';
|
|
275 }
|
|
276 else
|
|
277 {
|
|
278 sprintf(qual2, "*");
|
|
279 }
|
|
280
|
|
281
|
|
282 // Cropping
|
|
283 if (cropSize > 0)
|
|
284 {
|
|
285 seq2[cropSize] = '\0';
|
|
286 if ( *fastq )
|
|
287 qual2[cropSize] = '\0';
|
|
288 }
|
|
289
|
|
290
|
|
291 nCnt = 0;
|
|
292 for (i=0; i<strlen(seq2); i++)
|
|
293 {
|
|
294 seq2[i] = toupper (seq2[i]);
|
|
295 if (seq2[i] == 'N')
|
|
296 {
|
|
297 nCnt++;
|
|
298
|
|
299 }
|
|
300 else if (isspace(seq2[i]))
|
|
301 {
|
|
302 seq2[i] = '\0';
|
|
303 }
|
|
304 }
|
|
305 if (nCnt > errThreshold)
|
|
306 {
|
|
307 err2 = 1;
|
|
308 }
|
|
309
|
|
310
|
|
311 if (strlen(seq1) < strlen(seq2)) {
|
|
312 seq2[strlen(seq1)] = '\0';
|
|
313 if ( *fastq )
|
|
314 qual2[strlen(seq1)] = '\0';
|
|
315 if (!clipped) clipped = 2;
|
|
316 }
|
|
317 else if (strlen(seq1) > strlen(seq2)){
|
|
318 seq1[strlen(seq2)] = '\0';
|
|
319 if ( *fastq )
|
|
320 qual1[strlen(seq2)] = '\0';
|
|
321 if (!clipped) clipped = 1;
|
|
322 }
|
|
323
|
|
324 if (clipped == 1 || clipped == 2){
|
|
325 fprintf(stdout, "[PE mode Warning] Sequence lengths are different, read #%d is clipped to match.\n", clipped);
|
|
326 clipped = 3;
|
|
327 }
|
|
328
|
|
329
|
|
330
|
|
331 }
|
|
332
|
|
333 if (!pairedEnd && !err1)
|
|
334 {
|
|
335 int _mtmp = strlen(seq1);
|
|
336 list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1);
|
|
337 list[seqCnt].seq = list[seqCnt].hits + 1;
|
|
338 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
|
|
339 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
|
|
340 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
|
|
341
|
|
342 list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp);
|
|
343 list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp);
|
|
344
|
|
345 list[seqCnt].readNumber = seqCnt;
|
|
346
|
|
347 reverseComplete(seq1, rseq1, _mtmp);
|
|
348 rseq1[_mtmp] = '\0';
|
|
349 int i;
|
|
350
|
|
351 list[seqCnt].hits[0] = 0;
|
|
352
|
|
353 for (i=0; i<=_mtmp; i++)
|
|
354 {
|
|
355 list[seqCnt].seq[i] = seq1[i];
|
|
356 list[seqCnt].rseq[i] = rseq1[i] ;
|
|
357 list[seqCnt].qual[i] = qual1[i];
|
|
358 }
|
|
359
|
|
360
|
|
361 //MAKE HASH VALUE
|
|
362 short code = 0;
|
|
363
|
|
364 for(i=0; i < 4; i++)
|
|
365 code = code * 5 + CHARCODE(list[seqCnt].seq[i]);
|
|
366 list[seqCnt].hashValue[0] = code;
|
|
367
|
|
368
|
|
369 for(i = 1; i < _mtmp-3; i++)
|
|
370 {
|
|
371 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]);
|
|
372 }
|
|
373
|
|
374
|
|
375 code = 0;
|
|
376 for(i=0; i < 4; i++)
|
|
377 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]);
|
|
378 list[seqCnt].rhashValue[0] = code;
|
|
379
|
|
380
|
|
381 for(i = 1; i < _mtmp-3; i++)
|
|
382 {
|
|
383 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]);
|
|
384 }
|
|
385
|
|
386 int j = 0;
|
|
387 int tmpSize = _mtmp / (errThreshold+1);
|
|
388
|
|
389 list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1));
|
|
390 for(i=0; i < errThreshold+1; i++)
|
|
391 {
|
|
392 code = 0;
|
|
393 for(j = 0; j < tmpSize; j++)
|
|
394 {
|
|
395 code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]);
|
|
396 }
|
|
397 list[seqCnt].hashValSampleSize[i] = code;
|
|
398 }
|
|
399
|
|
400 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
|
|
401
|
|
402 seqCnt++;
|
|
403
|
|
404 }
|
|
405 else if (pairedEnd && !err1 && !err2)
|
|
406 {
|
|
407
|
|
408 // Naming Conventions X/1, X/2 OR X
|
|
409 int tmplen = strlen(name1);
|
|
410 if (strcmp(name1, name2) != 0)
|
|
411 {
|
|
412 tmplen = strlen(name1)-2;
|
|
413 }
|
|
414
|
|
415 //first seq
|
|
416 int _mtmp = strlen(seq1);
|
|
417 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
|
|
418 list[seqCnt].seq = list[seqCnt].hits + 1;
|
|
419 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
|
|
420 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
|
|
421 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
|
|
422
|
|
423 list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp);
|
|
424 list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp);
|
|
425 list[seqCnt].readNumber = seqCnt;
|
|
426
|
|
427 reverseComplete(seq1, rseq1, _mtmp);
|
|
428 rseq1[_mtmp] = '\0';
|
|
429 int i;
|
|
430
|
|
431 list[seqCnt].hits[0] = 0;
|
|
432
|
|
433 for (i=0; i<=_mtmp; i++)
|
|
434 {
|
|
435 list[seqCnt].seq[i] = seq1[i];
|
|
436 list[seqCnt].rseq[i] = rseq1[i] ;
|
|
437 list[seqCnt].qual[i] = qual1[i];
|
|
438 }
|
|
439
|
|
440
|
|
441 name1[tmplen]='\0';
|
|
442
|
|
443 //MAKE HASH VALUE
|
|
444 short code = 0;
|
|
445
|
|
446 for(i=0; i < 4; i++)
|
|
447 code = code * 5 + CHARCODE(list[seqCnt].seq[i]);
|
|
448 list[seqCnt].hashValue[0] = code;
|
|
449
|
|
450
|
|
451 for(i = 1; i < _mtmp-3; i++)
|
|
452 {
|
|
453 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]);
|
|
454 }
|
|
455
|
|
456
|
|
457 code = 0;
|
|
458 for(i=0; i < 4; i++)
|
|
459 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]);
|
|
460 list[seqCnt].rhashValue[0] = code;
|
|
461
|
|
462
|
|
463 for(i = 1; i < _mtmp-3; i++)
|
|
464 {
|
|
465 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]);
|
|
466 }
|
|
467
|
|
468 int j = 0;
|
|
469 int tmpSize = _mtmp / (errThreshold+1);
|
|
470
|
|
471 list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1));
|
|
472 for(i=0; i < errThreshold+1; i++)
|
|
473 {
|
|
474 code = 0;
|
|
475 for(j = 0; j < tmpSize; j++)
|
|
476 {
|
|
477 code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]);
|
|
478 }
|
|
479 list[seqCnt].hashValSampleSize[i] = code;
|
|
480 }
|
|
481
|
|
482 sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0');
|
|
483
|
|
484
|
|
485 seqCnt++;
|
|
486
|
|
487 //second seq
|
|
488 list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1);
|
|
489 list[seqCnt].seq = list[seqCnt].hits + 1;
|
|
490 list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1;
|
|
491 list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1;
|
|
492 list[seqCnt].name = list[seqCnt].qual + _mtmp+1;
|
|
493
|
|
494 list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp);
|
|
495 list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp);
|
|
496 list[seqCnt].readNumber = seqCnt;
|
|
497
|
|
498 reverseComplete(seq2, rseq2, _mtmp);
|
|
499 rseq2[_mtmp] = '\0';
|
|
500
|
|
501 list[seqCnt].hits[0] = 0;
|
|
502
|
|
503 for (i=0; i<=_mtmp; i++)
|
|
504 {
|
|
505 list[seqCnt].seq[i] = seq2[i];
|
|
506 list[seqCnt].rseq[i] = rseq2[i] ;
|
|
507 list[seqCnt].qual[i] = qual2[i];
|
|
508 }
|
|
509
|
|
510
|
|
511 name2[tmplen]='\0';
|
|
512
|
|
513 //MAKE HASH VALUE
|
|
514 code = 0;
|
|
515
|
|
516 for(i=0; i < 4; i++)
|
|
517 code = code * 5 + CHARCODE(list[seqCnt].seq[i]);
|
|
518 list[seqCnt].hashValue[0] = code;
|
|
519
|
|
520
|
|
521 for(i = 1; i < _mtmp-3; i++)
|
|
522 {
|
|
523 list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]);
|
|
524 }
|
|
525
|
|
526
|
|
527 code = 0;
|
|
528 for(i=0; i < 4; i++)
|
|
529 code = code * 5 + CHARCODE(list[seqCnt].rseq[i]);
|
|
530 list[seqCnt].rhashValue[0] = code;
|
|
531
|
|
532
|
|
533 for(i = 1; i < _mtmp-3; i++)
|
|
534 {
|
|
535 list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]);
|
|
536 }
|
|
537
|
|
538 j = 0;
|
|
539 tmpSize = _mtmp / (errThreshold+1);
|
|
540
|
|
541 list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1));
|
|
542 for(i=0; i < errThreshold+1; i++)
|
|
543 {
|
|
544 code = 0;
|
|
545 for(j = 0; j < tmpSize; j++)
|
|
546 {
|
|
547 code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]);
|
|
548 }
|
|
549 list[seqCnt].hashValSampleSize[i] = code;
|
|
550 }
|
|
551
|
|
552 sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0');
|
|
553
|
|
554 seqCnt++;
|
|
555
|
|
556 }
|
|
557 else
|
|
558 {
|
|
559 discarded++;
|
|
560 }
|
|
561 }
|
|
562
|
|
563 if (seqCnt > 0)
|
|
564 {
|
|
565 SEQ_LENGTH = strlen(list[0].seq);
|
|
566 }
|
|
567 else
|
|
568 {
|
|
569 fprintf(stdout, "ERROR: No reads can be found for mapping\n");
|
|
570 return 0;
|
|
571 }
|
|
572
|
|
573
|
|
574 // Closing Files
|
|
575 if (!compressed)
|
|
576 {
|
|
577 fclose(_r_fp1);
|
|
578 if ( pairedEnd && fileName2 != NULL )
|
|
579 {
|
|
580 fclose(_r_fp2);
|
|
581 }
|
|
582 }
|
|
583 else
|
|
584 {
|
|
585 gzclose(_r_gzfp1);
|
|
586 if ( pairedEnd && fileName2 != NULL)
|
|
587 {
|
|
588 gzclose(_r_fp2);
|
|
589 }
|
|
590 }
|
|
591
|
|
592 //qsort(list, seqCnt, sizeof(Read), toCompareRead);
|
|
593
|
|
594 adjustQual(list, seqCnt);
|
|
595
|
|
596 *seqList = list;
|
|
597 *seqListSize = seqCnt;
|
|
598
|
|
599
|
|
600 _r_seq = list;
|
|
601 _r_seqCnt = seqCnt;
|
|
602
|
|
603 if ( pairedEnd ) discarded *= 2;
|
|
604
|
|
605 if (seqCnt>1)
|
|
606 fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
|
|
607 else
|
|
608 fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage());
|
|
609
|
|
610
|
|
611
|
|
612 return 1;
|
|
613 }
|
|
614 /**********************************************/
|
|
615 void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize)
|
|
616 {
|
|
617 int i;
|
|
618 int samLocsSize = errThreshold + 1;
|
|
619 int *samLocs = getMem(sizeof(int)*samLocsSize);
|
|
620
|
|
621 for (i=0; i<samLocsSize; i++)
|
|
622 {
|
|
623 samLocs[i] = (SEQ_LENGTH / samLocsSize) *i;
|
|
624 if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH)
|
|
625 samLocs[i] = SEQ_LENGTH - WINDOW_SIZE;
|
|
626 }
|
|
627
|
|
628 // Outputing the sampling locations
|
|
629
|
|
630 /*
|
|
631
|
|
632 int j;
|
|
633 for (i=0; i<SEQ_LENGTH; i++)
|
|
634 {
|
|
635 fprintf(stdout, "-");
|
|
636 }
|
|
637 fprintf(stdout, "\n");
|
|
638
|
|
639 for ( i=0; i<samLocsSize; i++ )
|
|
640 {
|
|
641 for ( j=0; j<samLocs[i]; j++ )
|
|
642 {
|
|
643 fprintf(stdout," ");
|
|
644 }
|
|
645 for (j=0; j<WINDOW_SIZE; j++)
|
|
646 {
|
|
647 fprintf(stdout,"+");
|
|
648 }
|
|
649 fprintf(stdout, "\n");
|
|
650 fflush(stdout);
|
|
651 }
|
|
652
|
|
653
|
|
654 for ( i=0; i<SEQ_LENGTH; i++ )
|
|
655 {
|
|
656 fprintf(stdout, "-");
|
|
657 }
|
|
658 fprintf(stdout, "\n");
|
|
659
|
|
660 */
|
|
661
|
|
662 *samplingLocs = samLocs;
|
|
663 *samplingLocsSize = samLocsSize;
|
|
664 _r_samplingLocs = samLocs;
|
|
665 }
|
|
666
|
|
667 void finalizeReads(char *fileName)
|
|
668 {
|
|
669 FILE *fp1=NULL;
|
|
670
|
|
671 if (fileName != NULL)
|
|
672 {
|
|
673 fp1 = fileOpen(fileName, "w");
|
|
674 }
|
|
675 if (pairedEndMode)
|
|
676 _r_seqCnt /=2;
|
|
677
|
|
678 int i=0;
|
|
679 for (i = 0; i < _r_seqCnt; i++)
|
|
680 {
|
|
681 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0)
|
|
682 {
|
|
683 fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual);
|
|
684 }
|
|
685 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0)
|
|
686 {
|
|
687 fprintf(fp1, ">%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]);
|
|
688 }
|
|
689 else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0)
|
|
690 {
|
|
691 fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual);
|
|
692 }
|
|
693 else if (!pairedEndMode && _r_seq[i].hits[0] == 0)
|
|
694 {
|
|
695 fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq);
|
|
696 }
|
|
697 }
|
|
698
|
|
699 fclose(fp1);
|
|
700 if (pairedEndMode)
|
|
701 _r_seqCnt *= 2;
|
|
702
|
|
703 for (i = 0; i < _r_seqCnt; i++)
|
|
704 {
|
|
705 freeMem(_r_seq[i].hits,0);
|
|
706 }
|
|
707
|
|
708
|
|
709 freeMem(_r_seq,0);
|
|
710 freeMem(_r_samplingLocs,0);
|
|
711 }
|
|
712
|
|
713 void adjustQual(Read *list, int seqCnt){
|
|
714 /* This function will automatically determine the phred_offset and readjust quality values if needed */
|
|
715 int i,j,q, offset=64;
|
|
716 int len = strlen(list[0].qual);
|
|
717
|
|
718 for (i=0; i<10000 && i<seqCnt; i++){
|
|
719 for (j=0;j<len;j++){
|
|
720 q = (int) list[i].qual[j] - offset;
|
|
721 if (q < 0){
|
|
722 offset = 33;
|
|
723 break;
|
|
724 }
|
|
725 }
|
|
726 if (offset == 33)
|
|
727 break;
|
|
728 }
|
|
729
|
|
730 if (offset == 64){
|
|
731 fprintf(stdout, "[Quality Warning] Phred offset is 64. Readjusting to 33.\n");
|
|
732 fflush(stdout);
|
|
733 for (i=0;i<seqCnt;i++){
|
|
734 for (j=0;j<len;j++){
|
|
735 list[i].qual[j] -= 31;
|
|
736 }
|
|
737 }
|
|
738 }
|
|
739 }
|
|
740
|
|
741
|
|
742
|
|
743 /*void finalizeOEAReads(char *fileName)
|
|
744 {
|
|
745 FILE *fp1=NULL;
|
|
746 FILE *fp2=NULL;
|
|
747
|
|
748 //printf("OEA%s\n", fileName);
|
|
749 char fileNameOEA1[200];
|
|
750 char fileNameOEA2[200];
|
|
751 sprintf(fileNameOEA1, "%s_OEA1", fileName);
|
|
752 sprintf(fileNameOEA2, "%s_OEA2", fileName);
|
|
753
|
|
754
|
|
755 if (fileName != NULL)
|
|
756 {
|
|
757 fp1 = fileOpen(fileNameOEA1, "w");
|
|
758 fp2 = fileOpen(fileNameOEA2, "w");
|
|
759 }
|
|
760 if (pairedEndMode)
|
|
761 _r_seqCnt /=2;
|
|
762
|
|
763 int i=0;
|
|
764 printf("%d\n", _r_seqCnt);
|
|
765 for (i = 0; i < _r_seqCnt; i++)
|
|
766 {
|
|
767 if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0)
|
|
768 {
|
|
769 fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq);
|
|
770 }
|
|
771 else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0)
|
|
772 {
|
|
773 fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq);
|
|
774 }
|
|
775 else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0)
|
|
776 {
|
|
777 fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name,
|
|
778 _r_seq[2*i].seq,
|
|
779 _r_seq[2*i].qual,
|
|
780 _r_seq[2*i+1].name,
|
|
781 _r_seq[2*i+1].seq,
|
|
782 _r_seq[2*i+1].qual);
|
|
783 }
|
|
784 else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0 )
|
|
785 {
|
|
786 fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name,
|
|
787 _r_seq[2*i].seq,
|
|
788 _r_seq[2*i].qual,
|
|
789 _r_seq[2*i+1].name,
|
|
790 _r_seq[2*i+1].seq,
|
|
791 _r_seq[2*i+1].qual);
|
|
792 }
|
|
793 }
|
|
794
|
|
795 fclose(fp1);
|
|
796 fclose(fp2);
|
|
797
|
|
798 }*/
|