Mercurial > repos > geert-vandeweyer > fastq_qc_trimmer
comparison Paired_fastQ_trimmer.pl @ 1:fdbcc1aa4f01 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:21:15 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:548887c4227c | 1:fdbcc1aa4f01 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # load modules | |
4 use Getopt::Std; | |
5 use threads; | |
6 use Thread::Queue; | |
7 use threads::shared; | |
8 use File::Basename; | |
9 use List::Util qw(max); | |
10 | |
11 $now = time ; | |
12 | |
13 $|++; | |
14 # variables | |
15 my $paired :shared; | |
16 $paired = 0; | |
17 my $fq :shared; | |
18 my $rq :shared; | |
19 my $ofq :shared; | |
20 my $orq :shared; | |
21 my $trimq :shared; | |
22 my $failedfile :shared; | |
23 my $done :shared; | |
24 $done = 0; | |
25 my $nrin :shared; | |
26 $nrin = 0; | |
27 | |
28 my $rqz :shared; | |
29 my $fqz :shared; | |
30 my $rand :shared; | |
31 my $discarded :shared; | |
32 $discarded = 0; | |
33 my $verbose :shared; | |
34 $verbose = 1; | |
35 my $trim3 :shared; | |
36 $trim3 = 1; | |
37 my $trim5 :shared; | |
38 $trim5 = 1; | |
39 # create file lock in the tmp dir | |
40 $rand = int(rand(10000)); | |
41 while (-e "/tmp/$rand.lock") { | |
42 $rand = int(rand(10000)); | |
43 } | |
44 system("touch '/tmp/$rand.lock'"); | |
45 if (!-e "/tmp/sg.write.lock") { | |
46 system("touch '/tmp/sg.write.lock'"); | |
47 system("chmod 777 '/tmp/sg.write.lock'"); | |
48 } | |
49 | |
50 # opts | |
51 # i : path to forward read FASTQ file | |
52 # I : path to reverse read FASTQ file | |
53 # q : quality threshold in phred. | |
54 # n : string representing common read name parts | |
55 # s : trimming style : simple/bwa | |
56 # o : output file for forward reads | |
57 # O : output file for reverse reads | |
58 # F : output file with fully failed reads pairs. | |
59 # v : verbose: defaults to on (0/1) | |
60 # S : Trim on side: 5/3/b(oth) | |
61 # m : minimal length. | |
62 getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ; | |
63 | |
64 | |
65 if (defined($opts{'v'})) { | |
66 $verbose = $opts{'v'}; | |
67 } | |
68 | |
69 if (defined($opts{'s'})) { | |
70 if ($opts{'s'} eq 'bwa') { | |
71 if ($verbose == 1) { | |
72 print "Using BWA style trimming\n"; | |
73 } | |
74 $style = 'bwa'; | |
75 } | |
76 else { | |
77 if ($verbose == 1) { | |
78 print "Using simple 1bp Quality trimming\n"; | |
79 } | |
80 $style = 'simple'; | |
81 } | |
82 } | |
83 else { | |
84 if ($verbose == 1){ | |
85 print "Using simple 1bp Quality trimming\n"; | |
86 } | |
87 $style = 'simple'; | |
88 } | |
89 | |
90 if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');} | |
91 $fq = $opts{'i'}; | |
92 $ofq = $opts{'o'}; | |
93 | |
94 ## gzipped infile? | |
95 my $out = `file $fq`; | |
96 if ($out =~ m/gzip compressed/) { | |
97 if ($verbose == 1) { | |
98 print "Forward reads in gzipped format.\n"; | |
99 } | |
100 $fqz = 1; | |
101 } | |
102 else { | |
103 if ($verbose == 1) { | |
104 print "Forward reads in plain fastq format.\n"; | |
105 } | |
106 $fqz = 0; | |
107 } | |
108 | |
109 | |
110 if (defined($opts{'I'})) { | |
111 if ($verbose == 1) { | |
112 print "Reverse reads provided. Performing paired-end trimming\n"; | |
113 } | |
114 $paired = 1; | |
115 $rq = $opts{'I'}; | |
116 $orq = $opts{'O'}; | |
117 # gzipped ? | |
118 my $out = `file $rq`; | |
119 if ($out =~ m/gzip compressed/) { | |
120 if ($verbose == 1) { | |
121 print "Reverse reads in gzipped format.\n"; | |
122 } | |
123 $rqz = 1; | |
124 } | |
125 else { | |
126 if ($verbose == 1) { | |
127 print "Reverse reads in plain fastq format.\n"; | |
128 } | |
129 $rqz = 0; | |
130 } | |
131 } | |
132 ## FAILED FILENAME | |
133 $failedfile = $opts{'F'}; | |
134 | |
135 ## set the sides to trim | |
136 my $trimside = ''; | |
137 if (defined($opts{'S'})) { | |
138 if ($opts{'S'} eq '3') { | |
139 $trimside = "Trimming reads from the 3' end only\n"; | |
140 $trim3 = 1; | |
141 $trim5 = 0; | |
142 } | |
143 elsif ($opts{'S'} eq '5') { | |
144 $trimside = "Trimming reads from the 5' end only\n"; | |
145 $trim3 = 0; | |
146 $trim5 = 1; | |
147 } | |
148 elsif ($opts{'S'} eq 'b') { | |
149 $trimside = "Trimming reads both from 5' and 3'\n"; | |
150 $trim3 = 1; | |
151 $trim5 = 1; | |
152 } | |
153 else { | |
154 $trimside = "Invalid trimming side specification. Setting to both sides trimming\n"; | |
155 $trim3 = 1; | |
156 $trim5 = 1; | |
157 } | |
158 } | |
159 else { | |
160 $trimside = "Invalid trimming side specification. Setting to both sides trimming\n"; | |
161 $trim3 = 1; | |
162 $trim5 = 1; | |
163 } | |
164 if ($verbose == 1) { | |
165 print $trimside; | |
166 } | |
167 | |
168 # SET MINIMAL LENGTH | |
169 my $minlength :shared; | |
170 if (exists($opts{'m'})) { | |
171 $minlength = $opts{'m'}; | |
172 } | |
173 else { | |
174 $minlength = 0; | |
175 } | |
176 | |
177 ## CHECK IF OUTPUT FILESNAMES ARE PRESENT | |
178 if ($ofq eq '') { | |
179 $ofq = "$fq.$style.trimmed"; | |
180 if ($verbose == 1) { | |
181 print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n"; | |
182 } | |
183 } | |
184 if ($paired == 1 && $orq eq '') { | |
185 $orq = "$rq.$style.trimmed"; | |
186 if ($verbose == 1) { | |
187 print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n"; | |
188 } | |
189 } | |
190 if ($failedfile eq '') { | |
191 $failedfile = "$fq.Failed_Reads"; | |
192 if ($verbose == 1) { | |
193 print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n"; | |
194 } | |
195 } | |
196 | |
197 ## set trimming threshold | |
198 if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) { | |
199 $trimq = 20; | |
200 } | |
201 else { | |
202 $trimq = $opts{'q'}; | |
203 } | |
204 | |
205 ## set read delimiter | |
206 if (defined($opts{'n'})) { | |
207 $indelim = $opts{'n'}; | |
208 # galaxy replaced @ by __at__ | |
209 if (substr($indelim,0,6) eq '__at__') { | |
210 $indelim = '@'.substr($indelim,6); | |
211 } | |
212 } | |
213 else { | |
214 print "No indelim defined\n"; | |
215 $indelim = "@"; # has risk, fails if @ is first qual item | |
216 } | |
217 | |
218 ## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq. | |
219 if ($indelim eq '@') { | |
220 if ($fqz == 0) { | |
221 open IN, $fq; | |
222 } | |
223 else { | |
224 open(IN, "gunzip -c $fq | "); | |
225 } | |
226 $head = <IN>; | |
227 $indelim = substr($head,0,5); | |
228 print "\n"; | |
229 print "WARNING:\n"; | |
230 print "########\n"; | |
231 print " Setting Delimiter as '\@' is prone to errors\n"; | |
232 print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n"; | |
233 print ' It is recommended to use eg -n \'\@SRR301\''."\n"; | |
234 print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n"; | |
235 print " => Delimiter : $indelim\n"; | |
236 print "\n\n"; | |
237 sleep 3; | |
238 } | |
239 ## create queues | |
240 my $totrim = Thread::Queue->new(); | |
241 my $toprint = Thread::Queue->new(); | |
242 my $dummy = Thread::Queue->new(); | |
243 my $failed = Thread::Queue->new(); | |
244 | |
245 # monitor thread | |
246 my $finish :shared; | |
247 $finish = 0; | |
248 $mon = threads->create('monitor'); | |
249 | |
250 # start FASTQ reading thread | |
251 if ($verbose == 1) { | |
252 print "start reading file\n"; | |
253 } | |
254 $reading = threads->create('readfile'); | |
255 | |
256 ## give time to build reading buffer | |
257 sleep 10; | |
258 | |
259 # start trimming threads | |
260 if ($verbose == 1) { | |
261 print "start trimming\n"; | |
262 } | |
263 if ($paired == 1) { | |
264 if ($style eq 'bwa') { | |
265 $trimming1 = threads->create('bwapaired'); | |
266 $trimming2 = threads->create('bwapaired'); | |
267 | |
268 } | |
269 else { | |
270 $trimming1 = threads->create('simplepaired'); | |
271 $trimming2 = threads->create('simplepaired'); | |
272 } | |
273 | |
274 } | |
275 else { | |
276 if ($style eq 'bwa') { | |
277 #print "not available yet, switching to simple trimming\n"; | |
278 #$style = 'simple'; | |
279 $trimming1 = threads->create('bwasingle'); | |
280 $trimming2 = threads->create('bwasingle'); | |
281 | |
282 } | |
283 if ($style eq 'simple') { | |
284 $trimming1 = threads->create('simplesingle'); | |
285 $trimming2 = threads->create('simplesingle'); | |
286 } | |
287 } | |
288 # start printing threads | |
289 $printout = threads->create('printout'); | |
290 $printfail = threads->create('printfailed'); | |
291 # end threads | |
292 $reading->join(); | |
293 ## all reads are queued for trimming, add undef to trimming queues to end threads. | |
294 $totrim->enqueue(undef); | |
295 $totrim->enqueue(undef); | |
296 $trimming1->join(); | |
297 $trimming2->join(); | |
298 ## all reads are trimmed and queued from printing, send undef to end printing threads. | |
299 $toprint->enqueue(undef); | |
300 $failed->enqueue(undef); | |
301 $printout->join(); | |
302 $printfail->join(); | |
303 if ($verbose == 1) { | |
304 print "Waiting for status monitor to shut down\n"; | |
305 } | |
306 $mon->join(); | |
307 ## copy files from /tmp to destination folder | |
308 if ($verbose == 1) { | |
309 print "Copying files to destination: Forward.."; | |
310 } | |
311 if ($fqz == 0) { | |
312 system("cp -f /tmp/$rand.fq $ofq"); | |
313 unlink("/tmp/$rand.fq"); | |
314 } | |
315 else { | |
316 system("gzip -c /tmp/$rand.fq > $ofq"); | |
317 unlink("/tmp/$rand.fq"); | |
318 } | |
319 if ($paired == 1) { | |
320 if ($verbose == 1) { | |
321 print " Reverse.."; | |
322 } | |
323 if ($rqz == 0) { | |
324 system("cp -f /tmp/$rand.rq $orq"); | |
325 unlink("/tmp/$rand.rq"); | |
326 } | |
327 else { | |
328 system("gzip -c /tmp/$rand.rq > $orq"); | |
329 unlink("/tmp/$rand.rq"); | |
330 } | |
331 } | |
332 | |
333 if ($verbose == 1) { | |
334 print " Failed items.."; | |
335 } | |
336 system("cp -f /tmp/$rand.failed $failedfile"); | |
337 unlink("/tmp/$rand.failed"); | |
338 if ($verbose == 1) { | |
339 print " Done\n"; | |
340 } | |
341 | |
342 # remove lock file | |
343 unlink("/tmp/$rand.lock"); | |
344 | |
345 ######################## | |
346 ## print run details: ## | |
347 ######################## | |
348 my $string = sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n"; | |
349 if ($paired == 1) { | |
350 $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n"; | |
351 $string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n"; | |
352 } | |
353 else { | |
354 $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n"; | |
355 $string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n"; | |
356 } | |
357 print $string; | |
358 ################## | |
359 # PRINT RUN-TIME # | |
360 ################## | |
361 $now = time - $now; | |
362 printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60)); | |
363 | |
364 | |
365 exit(); | |
366 | |
367 | |
368 | |
369 ## SUBROUTINE : READ FASTQ FILES | |
370 ################################ | |
371 sub readfile { | |
372 if ($paired == 1) { | |
373 if ($fqz == 0) { | |
374 open IN1, $fq; | |
375 } | |
376 else { | |
377 open(IN1, "gunzip -c $fq | "); | |
378 } | |
379 if ($rqz == 0) { | |
380 open IN2, $rq; | |
381 } | |
382 else { | |
383 open(IN2, "gunzip -c $rq | "); | |
384 } | |
385 $/ = "\n$indelim"; | |
386 $deliml = length($indelim); | |
387 # compose pack ignore string | |
388 my $dx = ''; | |
389 for (my $idx = 0; $idx<length($indelim); $idx++) { | |
390 $dx .= 'X'; | |
391 } | |
392 my $f = <IN1>; | |
393 my $r = <IN2>; | |
394 # the reads contain the delimiter on the end (cf \n on regular file read line endings.) | |
395 # so we strip teh delimiter using pack, leaving just the \n. | |
396 # this means that in subsequent reads, we need to add the delimiter to the front of the read. | |
397 my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|'; | |
398 my $counter = 1; | |
399 my $fr = ''; | |
400 my $rr = ''; | |
401 my $reads = ''; | |
402 my $lastf = ''; | |
403 while ($fr = <IN1>) { | |
404 $rr = <IN2>; | |
405 $lastf = $fr; | |
406 ## add delim to front, trim from back. add our delim. | |
407 ## done in next iteration, because last is different. | |
408 # the delimiter from the forward read is used to complete the reverse read. | |
409 $reads .= $indelim.pack("A*$dx",$read) .'|'; | |
410 # concat reads | |
411 $read = $fr.$rr; | |
412 ## add string to queue if 20,000 reads in array | |
413 if ($counter == 20000) { | |
414 $nrin = $nrin + 20000; | |
415 $totrim->enqueue(pack("A*X",$reads)); # pack("A*X") strips last | | |
416 $reads = ''; | |
417 $counter = 0; | |
418 } | |
419 $counter++; | |
420 | |
421 ## dummy (see monitor). | |
422 my $d = $dummy->dequeue_nb(); | |
423 } | |
424 # last reads in files need special care. | |
425 $reads .= $indelim.$lastf.$indelim.$rr; | |
426 close IN1; | |
427 close IN2; | |
428 # submit last reads to queue | |
429 if ($reads ne '') { | |
430 #$totrim->enqueue(pack("A*X",$reads)); | |
431 $totrim->enqueue($reads); | |
432 } | |
433 if ($verbose == 1) { | |
434 print "All reads queued for trimming\n"; | |
435 } | |
436 } | |
437 else { | |
438 if ($fqz == 0) { | |
439 open IN1, $fq; | |
440 } | |
441 else { | |
442 open(IN1, "gunzip -c $fq | "); | |
443 } | |
444 $/ = "\n$indelim"; | |
445 $deliml = length($indelim); | |
446 # compose pack ignore string | |
447 my $dx = ''; | |
448 for (my $idx = 0; $idx<length($indelim); $idx++) { | |
449 $dx .= 'X'; | |
450 } | |
451 my $f = <IN1>; | |
452 # the reads contain the delimiter on the end (cf \n on regular file read line endings.) | |
453 # so we strip teh delimiter using pack, leaving just the \n. | |
454 # this means that in subsequent reads, we need to add the delimiter to the front of the read. | |
455 my $read = substr($f,$deliml); | |
456 my $counter = 1; | |
457 my $fr = ''; | |
458 my $reads = ''; | |
459 while ($fr = <IN1>) { | |
460 ## add delim to front, trim from back. add our delim. | |
461 ## done in next iteration, because last is different. | |
462 $reads .= $indelim.pack("A*$dx",$read) .'|'; | |
463 # concat reads | |
464 $read = $fr; | |
465 ## add string to queue if 10,000 reads in array | |
466 if ($counter == 40000) { | |
467 $nrin = $nrin + 40000; | |
468 $totrim->enqueue(pack("A*X",$reads)); | |
469 $reads = ''; | |
470 $counter = 0; | |
471 } | |
472 $counter++; | |
473 | |
474 ## dummy (see monitor). | |
475 my $d = $dummy->dequeue_nb(); | |
476 } | |
477 $reads .= $indelim.$read; | |
478 close IN1; | |
479 # submit last reads to queue | |
480 if ($reads ne '') { | |
481 #$totrim->enqueue(pack("A*X",$reads)); | |
482 $totrim->enqueue($reads); | |
483 } | |
484 if ($verbose == 1) { | |
485 print "All reads queued for trimming\n"; | |
486 } | |
487 | |
488 } | |
489 } | |
490 | |
491 sub bwapaired { | |
492 my $tq = $trimq; | |
493 while ( defined(my $rstring = $totrim->dequeue())) { | |
494 my @reads = split(/\|/,$rstring); # each item has 20.000 reads | |
495 my $printstring = ''; | |
496 my $failedstring = ''; | |
497 foreach (@reads) { | |
498 my $read = $_; | |
499 my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read); | |
500 ## skip too short reads | |
501 if (length($q1) < $minlength || length($q2) < $minlength) { | |
502 $discarded++; | |
503 $failedstring .= $read ;#. '|'; | |
504 next; | |
505 } | |
506 # initial check for maximal quality value. | |
507 ## quality arrays | |
508 my @q1array = unpack("C*",$q1); | |
509 if (max(@q1array) < $tq + 33) { | |
510 #print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n"; | |
511 $discarded++; | |
512 $failedstring .= $read ;#. '|'; | |
513 next; | |
514 } | |
515 my @q2array = unpack("C*",$q2); | |
516 if (max(@q2array) < $trimq + 33) { | |
517 #print "Discarding on failed reverse qualities\n$read\n"; | |
518 $discarded++; | |
519 $failedstring .= $read ;#. '|'; | |
520 next; | |
521 } | |
522 | |
523 ############# | |
524 ## Trim 3' ## | |
525 ############# | |
526 if ($trim3 == 1) { | |
527 #trim first | |
528 my $pos = length($q1) - 1; | |
529 my $maxPos = $pos; | |
530 ## skip empty read. | |
531 #if ($pos <= 0) { | |
532 # $discarded++; | |
533 # $failedstring .= $read . '|'; | |
534 # #print "skip empty : \n$read\n"; | |
535 # next; | |
536 #} | |
537 my $area = 0; | |
538 my $maxArea = 0; | |
539 ## unpack qual-string to array of phred scores | |
540 #my @qarray = unpack("C*",$q1); # use from initial check. | |
541 while ($pos>0 && $area>=0) { | |
542 $area += $tq - ($q1array[($pos)] - 33); | |
543 ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read. | |
544 if ($area < $maxArea) { | |
545 $pos--; | |
546 next; | |
547 } | |
548 else { | |
549 $maxArea = $area; | |
550 $maxPos = $pos; | |
551 $pos--; | |
552 next; | |
553 } | |
554 } | |
555 if ($maxPos==0) { # scanned whole read and didn't integrate to maximum? skip (no output for this read) | |
556 $discarded++; | |
557 $failedstring .= $read ;#. '|'; | |
558 next; | |
559 } | |
560 # otherwise trim before position where area reached a maximum | |
561 # $maxPos as length of substr gives positions zero to just before the maxpos. | |
562 #$s1 = substr($s1,0,$maxPos); | |
563 #$q1 = substr($q1,0,$maxPos); | |
564 $s1 = unpack("A$maxPos",$s1); | |
565 $q1 = unpack("A$maxPos",$q1); | |
566 | |
567 #trim second | |
568 $pos = length($q2); | |
569 $maxPos = $pos; | |
570 ## skip empty read. | |
571 #if ($pos <= 0) { | |
572 # $discarded++; | |
573 # $failedstring .= $read . '|'; | |
574 # next; | |
575 #} | |
576 $area = 0; | |
577 $maxArea = 0; | |
578 #@qarray = unpack("C*",$q2); | |
579 while ($pos>0 && $area>=0) { | |
580 $area += $tq - ($q2array[($pos)] - 33); | |
581 if ($area < $maxArea) { | |
582 $pos--; | |
583 next; | |
584 } | |
585 else { | |
586 $maxArea = $area; | |
587 $maxPos = $pos; | |
588 $pos--; | |
589 next; | |
590 } | |
591 } | |
592 if ($maxPos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) | |
593 $discarded++; | |
594 $failedstring .= $read ;#. '|'; | |
595 next; | |
596 } | |
597 # Otherwise trim before position where area reached a maximum | |
598 $s2 = unpack("A$maxPos",$s2); | |
599 $q2 = unpack("A$maxPos",$q2); | |
600 #$s2 = substr($s2,0,$maxPos); | |
601 #$q2 = substr($q2,0,$maxPos); | |
602 | |
603 } | |
604 ############# | |
605 ## TRIM 5' ## | |
606 ############# | |
607 if ($trim5 == 1) { | |
608 #trim first | |
609 my $skip = 0; #length($q1); | |
610 my $maxPos = $skip; | |
611 my $area = 0; | |
612 my $maxArea = 0; | |
613 my @qarray = unpack("C*",$q1); | |
614 while ($skip<length($q1) && $area>=0) { | |
615 $area += $tq - ($qarray[($skip)] - 33); | |
616 if ($area < $maxArea) { | |
617 $skip++; | |
618 next; | |
619 } | |
620 else { | |
621 $maxArea = $area; | |
622 $maxPos = $skip; | |
623 $skip++; | |
624 next; | |
625 } | |
626 } | |
627 if ($maxPos==length($q1)- 1) { # => maximal value at end of string => no point of no return found. | |
628 $discarded++; | |
629 $failedstring .= $read ;#. '|'; | |
630 next; | |
631 } | |
632 # trim after position where area reached a maximum | |
633 #$s1 = substr($s1,($maxPos+1)); | |
634 #$q1 = substr($q1,($maxPos+1)); | |
635 $s1 = unpack("x$maxPos A*",$s1); | |
636 $q1 = unpack("x$maxPos A*",$q1); | |
637 | |
638 | |
639 #trim second | |
640 $skip = 0 ;#length($q2); | |
641 $maxPos = $skip; | |
642 $area = 0; | |
643 $maxArea = 0; | |
644 @qarray = unpack("C*",$q2); | |
645 | |
646 while ($skip < length($q2) && $area>=0) { | |
647 $area += $tq - ($qarray[($skip)] - 33); | |
648 | |
649 if ($area < $maxArea) { | |
650 $skip++; | |
651 next; | |
652 } | |
653 else { | |
654 $maxArea = $area; | |
655 $maxPos = $skip; | |
656 $skip++; | |
657 next; | |
658 } | |
659 } | |
660 if ($maxPos==(length($q2)-1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read) | |
661 $discarded++; | |
662 $failedstring .= $read ;#. '|'; | |
663 next; | |
664 } | |
665 # trim after position where area reached a maximum | |
666 #$s2 = substr($s2,($maxPos+1)); | |
667 #$q2 = substr($q2,($maxPos+1)); | |
668 $s2 = unpack("x$maxPos A*",$s2); | |
669 $q2 = unpack("x$maxPos A*",$q2); | |
670 } | |
671 | |
672 ## DISCARD IF TOO SHORT ## | |
673 if (length($s1) < $minlength || length($s2) < $minlength) { | |
674 $discarded++; | |
675 $failedstring .= $read ;#. '|'; | |
676 next; | |
677 | |
678 } | |
679 # queue for printing | |
680 my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2); | |
681 $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator | |
682 # | seperates pairs of reads; ~ seperates lines of single read entry. | |
683 #print "$printstring\n"; | |
684 } | |
685 | |
686 if ($printstring ne '') { | |
687 $toprint->enqueue(pack("A*X",$printstring)); # removes trailing | | |
688 #$toprint->enqueue($printstring); | |
689 } | |
690 if ($failedstring ne '') { | |
691 #$failed->enqueue(pack("A*X",$failedstring)); | |
692 $failed->enqueue($failedstring); | |
693 } | |
694 | |
695 | |
696 } | |
697 } | |
698 sub simplepaired { | |
699 # local variables speed up threading | |
700 my $tq = $trimq; | |
701 while ( defined(my $rstring = $totrim->dequeue())){ | |
702 my @reads = split(/\|/,$rstring); | |
703 my $printstring = ''; | |
704 my $failedstring = ''; | |
705 foreach (@reads) { | |
706 my $read = $_; | |
707 my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read); | |
708 ## skip too short reads | |
709 if (length($q1) < $minlength || length($q2) < $minlength) { | |
710 $discarded++; | |
711 $failedstring .= $read; # . '|'; | |
712 next; | |
713 } | |
714 | |
715 # initial check for maximal quality value. | |
716 ## quality arrays | |
717 my @q1array = unpack("C*",$q1); | |
718 if (max(@q1array) < $trimq + 33) { | |
719 #print "Discarding on failed forward qualities\n$read\n"; | |
720 $discarded++; | |
721 $failedstring .= $read;# . '|'; | |
722 next; | |
723 } | |
724 my @q2array = unpack("C*",$q2); | |
725 if (max(@q2array) < $trimq + 33) { | |
726 #print "Discarding on failed reverse qualities\n$read\n"; | |
727 $discarded++; | |
728 $failedstring .= $read;# . '|'; | |
729 next; | |
730 } | |
731 | |
732 ############# | |
733 ## trim 3' ## | |
734 ############# | |
735 if ($trim3 == 1) { | |
736 #trim first | |
737 my $pos = length($q1) - 1; | |
738 my $maxPos = $pos; | |
739 #if ($pos <= 0) { | |
740 # $discarded++; | |
741 # $failedstring .= $read . '|'; | |
742 # next; | |
743 #} | |
744 #my @qarray = unpack("C*",$q1); | |
745 while ( $pos>0 ) { | |
746 if ($tq > $q1array[$pos] - 33) { | |
747 # score at position is below threshold | |
748 $pos--; | |
749 next; | |
750 } | |
751 else { | |
752 # score is at or above threshold | |
753 $maxPos = $pos; | |
754 last; | |
755 } | |
756 } | |
757 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) | |
758 $discarded++; | |
759 $failedstring .= $read ;#. '|'; | |
760 next; | |
761 } | |
762 # integrated to zero? trim before position where quality was below threshold | |
763 # unpack is one based | |
764 #$maxPos++; | |
765 #$s1 = substr($s1,0,$maxPos); | |
766 $s1 = unpack("A$maxPos",$s1); | |
767 #$q1 = substr($q1,0,$maxPos); | |
768 $q1 = unpack("A$maxPos",$q1); | |
769 | |
770 #trim second | |
771 $pos = length($q2) -1; | |
772 $maxPos = $pos; | |
773 if ($pos <= 0) { | |
774 $discarded++; | |
775 $failedstring .= $read ;#. '|'; | |
776 next; | |
777 } | |
778 #my @qarray = unpack("C*",$q2); | |
779 | |
780 while ($pos>0) { | |
781 if ($tq > $q2array[$pos] - 33) { | |
782 # score at position is below threshold | |
783 $pos--; | |
784 next; | |
785 } | |
786 else { | |
787 # score is at or above threshold | |
788 $maxPos = $pos; | |
789 last; | |
790 } | |
791 } | |
792 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) | |
793 $discarded++; | |
794 $failedstring .= $read ;#. '|'; | |
795 next; | |
796 } | |
797 # integrated to zero? trim before position where score below threshold | |
798 #$maxPos++; | |
799 $s2 = unpack("A$maxPos",$s2); | |
800 $q2 = unpack("A$maxPos",$q2); | |
801 #$s2 = substr($s2,0,$maxPos); | |
802 #$q2 = substr($q2,0,$maxPos); | |
803 | |
804 } | |
805 ############# | |
806 ## trim 5' ## | |
807 ############# | |
808 if ($trim5 == 1) { | |
809 # first | |
810 my $skip = 0; #length($q1); | |
811 #my $maxPos = $skip; | |
812 my @qarray = unpack("C*",$q1); | |
813 while ( $skip < length($q1) ) { | |
814 if ($tq > $qarray[$skip] - 33) { | |
815 # score at position is below threshold | |
816 $skip++; | |
817 next; | |
818 } | |
819 else { | |
820 # score is at or above threshold | |
821 #$maxPos = $skip; | |
822 last; | |
823 } | |
824 } | |
825 if ($skip == length($q1)) { # should not fail, as that would have happened on 3' trim | |
826 $discarded++; | |
827 $failedstring .= $read ;#. '|'; | |
828 next; | |
829 } | |
830 # integrated to zero? trim before position where quality was below threshold | |
831 # unpack is one based | |
832 $s1 = unpack("x$skip A*",$s1); | |
833 $q1 = unpack("x$skip A*",$q1); | |
834 #$s1 = substr($s1,$skip); | |
835 #$q1 = substr($q1,$skip); | |
836 | |
837 # second | |
838 $skip = 0; #length($q1); | |
839 my @qarray = unpack("C*",$q2); | |
840 #$maxPos = $skip; | |
841 while ( $skip < length($q2) ) { | |
842 if ($tq > $qarray[$skip] - 33) { | |
843 # score at position is below threshold | |
844 $skip++; | |
845 next; | |
846 } | |
847 else { | |
848 # score is at or above threshold | |
849 #$maxPos = $skip; | |
850 last; | |
851 } | |
852 } | |
853 if ($skip == length($q2)) { # should not fail, as that would have happened on 3' trim | |
854 $discarded++; | |
855 $failedstring .= $read ;#. '|'; | |
856 next; | |
857 } | |
858 # integrated to zero? trim before position where quality was below threshold | |
859 # unpack is one based | |
860 $s2 = unpack("x$skip A*",$s2); | |
861 $q2 = unpack("x$skip A*",$q2); | |
862 #$s2 = substr($s2,$skip); | |
863 #$q2 = substr($q2,$skip); | |
864 | |
865 } | |
866 ## DISCARD IF TOO SHORT ## | |
867 if (length($s1) < $minlength || length($s2) < $minlength) { | |
868 $discarded++; | |
869 $failedstring .= $read .#. '|'; | |
870 next; | |
871 | |
872 } | |
873 | |
874 # queue for printing | |
875 my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2); | |
876 $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator | |
877 # | seperates pairs of reads; ~ seperates lines of single read entry. | |
878 } | |
879 if ($printstring ne '') { | |
880 $toprint->enqueue(pack("A*X",$printstring)); # removes trailing | | |
881 #$toprint->enqueue($printstring); | |
882 } | |
883 if ($failedstring ne '') { | |
884 #$failed->enqueue(pack("A*X",$failedstring)); | |
885 $failed->enqueue($failedstring); | |
886 } | |
887 } | |
888 } | |
889 | |
890 sub bwasingle { | |
891 # local variables speed up threading | |
892 my $tq = $trimq; | |
893 while ( defined(my $rstring = $totrim->dequeue())){ | |
894 my @reads = split(/\|/,$rstring); | |
895 my $printstring = ''; | |
896 my $failedstring = ''; | |
897 foreach (@reads) { | |
898 my $read = $_; | |
899 my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read); | |
900 ## skip too short reads | |
901 if (length($q1) < $minlength ) { | |
902 | |
903 $discarded++; | |
904 $failedstring .= $read ;#. '|'; | |
905 next; | |
906 } | |
907 # initial check for maximal quality value. | |
908 ## quality arrays | |
909 my @q1array = unpack("C*",$q1); | |
910 if (max(@q1array) < $tq + 33) { | |
911 print "Discarding on failed qualities\n$read\n"; | |
912 $discarded++; | |
913 $failedstring .= $read;# . '|'; | |
914 next; | |
915 } | |
916 | |
917 ############# | |
918 ## trim 3' ## | |
919 ############# | |
920 if ($trim3 == 1) { | |
921 my $pos = length($q1) - 1; | |
922 my $maxPos = $pos; | |
923 ## skip empty read. | |
924 #if ($pos <= 0) { | |
925 # $discarded++; | |
926 # $failedstring .= $read . '|'; | |
927 # next; | |
928 #} | |
929 my $area = 0; | |
930 my $maxArea = 0; | |
931 #my @qarray = unpack("C*",$q1); | |
932 while ($pos>0 && $area>=0) { | |
933 $area += $tq - ($q1array[($pos)] - 33); | |
934 ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read. | |
935 if ($area < $maxArea) { | |
936 $pos--; | |
937 next; | |
938 } | |
939 else { | |
940 $maxArea = $area; | |
941 $maxPos = $pos; | |
942 $pos--; | |
943 next; | |
944 } | |
945 } | |
946 ## The online perl approach is wrong if the high-quality part is not long enough to reach 0. | |
947 if ($maxPos == 0) { # maximal badness on position 0 => no point of no return reached. | |
948 $discarded++; | |
949 $failedstring .= $read ;#. '|'; | |
950 next; | |
951 } | |
952 # otherwise trim before position where area reached a maximum | |
953 # $maxPos as length of substr gives positions zero to just before the maxpos. | |
954 $s1 = unpack("A$maxPos",$s1); | |
955 $q1 = unpack("A$maxPos",$q1); | |
956 | |
957 } | |
958 ############# | |
959 ## trim 5' ## | |
960 ############# | |
961 if ($trim5 == 1) { | |
962 my $skip = 0; #length($q1); | |
963 my $maxPos = $skip; | |
964 my $area = 0; | |
965 my $maxArea = 0; | |
966 my @qarray = unpack("C*",$q1); | |
967 while ($skip<length($q1) && $area>=0) { | |
968 #$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33); | |
969 $area += $tq - ($qarray[($skip)] - 33); | |
970 if ($area < $maxArea) { | |
971 $skip++; | |
972 next; | |
973 } | |
974 else { | |
975 $maxArea = $area; | |
976 $maxPos = $skip; | |
977 $skip++; | |
978 next; | |
979 } | |
980 } | |
981 #if ($skip==length($q1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read | |
982 if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found. | |
983 $discarded++; | |
984 $failedstring .= $read ;#. '|'; | |
985 next; | |
986 } | |
987 # trim after position where area reached a maximum | |
988 $s1 = unpack("x$maxPos A*",$s1); | |
989 $q1 = unpack("x$maxPos A*",$q1); | |
990 #$s1 = substr($s1,($maxPos+1)); | |
991 #$q1 = substr($q1,($maxPos+1)); | |
992 } | |
993 ## DISCARD IF TOO SHORT ## | |
994 if (length($s1) < $minlength) { | |
995 $discarded++; | |
996 $failedstring .= $read ;#. '|'; | |
997 next; | |
998 | |
999 } | |
1000 | |
1001 # queue for printing | |
1002 my @arr = ($h1a,$s1,$h1b,$q1); | |
1003 $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing | |
1004 } | |
1005 if ($printstring ne '') { | |
1006 #$toprint->enqueue(pack("A*X",$printstring)); # removes trailing | | |
1007 $toprint->enqueue($printstring); | |
1008 } | |
1009 if ($failedstring ne '') { | |
1010 #$failed->enqueue(pack("A*X",$failedstring)); | |
1011 $failed->enqueue($failedstring); | |
1012 } | |
1013 | |
1014 } | |
1015 } | |
1016 | |
1017 | |
1018 sub simplesingle { | |
1019 # local variables speed up threading | |
1020 my $tq = $trimq; | |
1021 while ( defined(my $rstring = $totrim->dequeue())){ | |
1022 my @reads = split(/\|/,$rstring); | |
1023 my $printstring = ''; | |
1024 my $failedstring = ''; | |
1025 foreach (@reads) { | |
1026 my $read = $_; | |
1027 my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read); | |
1028 ## skip too short reads | |
1029 if (length($q1) < $minlength ) { | |
1030 $discarded++; | |
1031 $failedstring .= $read ;#. '|'; | |
1032 next; | |
1033 } | |
1034 # initial check for maximal quality value. | |
1035 ## quality arrays | |
1036 my @q1array = unpack("C*",$q1); | |
1037 if (max(@q1array) < $trimq + 33) { | |
1038 #print "Discarding on failed qualities\n$read\n"; | |
1039 $discarded++; | |
1040 $failedstring .= $read ;#. '|'; | |
1041 next; | |
1042 } | |
1043 | |
1044 ############# | |
1045 ## trim 3' ## | |
1046 ############# | |
1047 if ($trim3 == 1) { | |
1048 my $pos = length($q1); | |
1049 my $maxPos = $pos; | |
1050 #if ($pos <= 0) { | |
1051 # $discarded++; | |
1052 # $failedstring .= $read . '|'; | |
1053 # next; | |
1054 #} | |
1055 #my @qarray = unpack("C*",$q1); | |
1056 | |
1057 while ( $pos>0 ) { | |
1058 if ($tq > $q1array[$pos] - 33) { | |
1059 # score at position is below threshold | |
1060 $pos--; | |
1061 next; | |
1062 } | |
1063 else { | |
1064 # score is at or above threshold | |
1065 $maxPos = $pos; | |
1066 last; | |
1067 } | |
1068 } | |
1069 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) | |
1070 $discarded++; | |
1071 $failedstring .= $read ;#. '|'; | |
1072 next; | |
1073 } | |
1074 # integrated to zero? trim before position where quality was below threshold | |
1075 # unpack is one based | |
1076 $s1 = unpack("A$maxPos",$s1); | |
1077 $q1 = unpack("A$maxPos",$q1); | |
1078 #$s1 = substr($s1,0,$maxPos); | |
1079 #$q1 = substr($q1,0,$maxPos); | |
1080 | |
1081 } | |
1082 ############# | |
1083 ## trim 5' ## | |
1084 ############# | |
1085 if ($trim5 == 1) { | |
1086 my $skip = 0; #length($q1); | |
1087 #my $maxPos = $skip; | |
1088 my @qarray = unpack("C*",$q1); | |
1089 | |
1090 while ( $skip < length($q1) ) { | |
1091 if ($tq > $qarray[$skip] - 33) { | |
1092 # score at position is below threshold | |
1093 $skip++; | |
1094 next; | |
1095 } | |
1096 else { | |
1097 # score is at or above threshold | |
1098 #$maxPos = $skip; | |
1099 last; | |
1100 } | |
1101 } | |
1102 if ($skip == length($q1)) { | |
1103 $discarded++; | |
1104 $failedstring .= $read ;#. '|'; | |
1105 next; | |
1106 } | |
1107 # integrated to zero? trim before position where quality was below threshold | |
1108 # unpack is one based | |
1109 #$s1 = substr($s1,$skip); | |
1110 #$q1 = substr($q1,$skip); | |
1111 $s1 = unpack("x$skip A*",$s1); | |
1112 $q1 = unpack("x$skip A*",$q1); | |
1113 | |
1114 | |
1115 } | |
1116 ## DISCARD IF TOO SHORT ## | |
1117 if (length($s1) < $minlength) { | |
1118 $discarded++; | |
1119 $failedstring .= $read ;#. '|'; | |
1120 next; | |
1121 | |
1122 } | |
1123 | |
1124 | |
1125 # queue for printing | |
1126 my @arr = ($h1a,$s1,$h1b,$q1); | |
1127 $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing | |
1128 } | |
1129 if ($printstring ne '') { | |
1130 $toprint->enqueue($printstring); # single reads => no | present, just \n | |
1131 } | |
1132 if ($failedstring ne '') { | |
1133 #$failed->enqueue(pack("A*X",$failedstring)); | |
1134 $failed->enqueue($failedstring); | |
1135 } | |
1136 | |
1137 } | |
1138 } | |
1139 | |
1140 sub printout { | |
1141 if ($rand eq '' || !defined($rand)) { | |
1142 die('Random value not passed on!'); | |
1143 } | |
1144 if ($paired == 0) { | |
1145 my $counter = 0; | |
1146 my $string1 = ''; | |
1147 if (-e "/tmp/$rand.fq") { | |
1148 unlink("/tmp/$rand.fq"); | |
1149 } | |
1150 while (defined(my $string = $toprint->dequeue())) { # each item represents 10000 input reads (might be less by trimming) | |
1151 #my @array = split(/\|/,$printstring); | |
1152 #$done = $done + scalar(@array); | |
1153 $done = $done + 10000; | |
1154 $string1 .= $string; | |
1155 $counter++; | |
1156 #print batches of 500,000 reads | |
1157 if ($counter == 50) { | |
1158 open OUT1, ">>/tmp/$rand.fq"; | |
1159 print OUT1 $string1; | |
1160 close OUT1; | |
1161 $string1 = ''; | |
1162 #$string2 = ''; | |
1163 $counter = 0; | |
1164 } | |
1165 } | |
1166 ## print last | |
1167 open OUT1, ">>/tmp/$rand.fq"; | |
1168 print OUT1 $string1; | |
1169 close OUT1; | |
1170 $finish = 1; | |
1171 | |
1172 } | |
1173 else { | |
1174 my $counter = 0; | |
1175 my $string1 = ''; | |
1176 my $string2 = ''; | |
1177 if (-e "/tmp/$rand.fq") { | |
1178 unlink("/tmp/$rand.fq"); | |
1179 } | |
1180 if (-e "/tmp/$rand.fq") { | |
1181 unlink("/tmp/$rand.rq"); | |
1182 } | |
1183 while (defined(my $printstring = $toprint->dequeue())) { # each item represents 10000 input reads | |
1184 my @array = split(/\|/,$printstring); | |
1185 $done = $done + scalar(@array); | |
1186 foreach (@array) { | |
1187 my @subarr = split(/\~/,$_); | |
1188 $string1 .= join("\n",@subarr[0..3])."\n"; | |
1189 $string2 .= join("\n",@subarr[4..7])."\n"; | |
1190 } | |
1191 $counter++; | |
1192 if ($counter == 50) { | |
1193 #print "Printing!\n"; | |
1194 flock("/tmp/sg.write.lock",2); | |
1195 open OUT1, ">>/tmp/$rand.fq"; | |
1196 print OUT1 $string1; | |
1197 close OUT1; | |
1198 open OUT2, ">>/tmp/$rand.rq"; | |
1199 print OUT2 $string2; | |
1200 close OUT2; | |
1201 flock("/tmp/sg.write.lock",8); | |
1202 $string1 = ''; | |
1203 $string2 = ''; | |
1204 $counter = 0; | |
1205 } | |
1206 } | |
1207 ## print last | |
1208 flock("/tmp/sg.write.lock",2); | |
1209 open OUT1, ">>/tmp/$rand.fq"; | |
1210 print OUT1 $string1; | |
1211 close OUT1; | |
1212 open OUT2, ">>/tmp/$rand.rq"; | |
1213 print OUT2 $string2; | |
1214 close OUT2; | |
1215 flock("/tmp/sg.write.lock",8); | |
1216 $finish = 1; | |
1217 } | |
1218 | |
1219 } | |
1220 | |
1221 sub printfailed { | |
1222 if (-e "/tmp/$rand.failed") { | |
1223 unlink("/tmp/$rand.failed"); | |
1224 } | |
1225 #open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'"); | |
1226 #FAIL->autoflush(1); | |
1227 my $nr = 0; | |
1228 my $string = ''; | |
1229 while ( defined(my $rstring = $failed->dequeue())) { | |
1230 #my @array = split(/\|/,$rstring); | |
1231 #foreach (@array) { | |
1232 $string .= $rstring; | |
1233 #} | |
1234 $nr++; | |
1235 if ($nr == 50) { | |
1236 open FAIL, ">>/tmp/$rand.failed"; | |
1237 print FAIL $string; | |
1238 close FAIL; | |
1239 $nr = 0; | |
1240 $string = ''; | |
1241 } | |
1242 #close FAIL; | |
1243 } | |
1244 open FAIL, ">>/tmp/$rand.failed"; | |
1245 print FAIL $string; | |
1246 close FAIL; | |
1247 } | |
1248 sub monitor { | |
1249 # checks the file read monitor | |
1250 my $counter = 0; | |
1251 while ($finish == 0) { | |
1252 $counter++; | |
1253 if ($totrim->pending() > 200) { | |
1254 lock($dummy); | |
1255 #print "Putting the file reading to sleep for 10 seconds.\n"; | |
1256 sleep 5; | |
1257 } | |
1258 else { | |
1259 sleep 5; | |
1260 } | |
1261 if ($counter == 6) { | |
1262 if ($verbose == 1) { | |
1263 my $string = ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n"; | |
1264 print $string ; | |
1265 } | |
1266 $counter = 0; | |
1267 } | |
1268 | |
1269 } | |
1270 } |