Mercurial > repos > geert-vandeweyer > fastq_qc_trimmer
view Paired_fastQ_trimmer.pl @ 2:16a85df293d2 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:21:22 -0500 |
parents | fdbcc1aa4f01 |
children |
line wrap: on
line source
#!/usr/bin/perl # load modules use Getopt::Std; use threads; use Thread::Queue; use threads::shared; use File::Basename; use List::Util qw(max); $now = time ; $|++; # variables my $paired :shared; $paired = 0; my $fq :shared; my $rq :shared; my $ofq :shared; my $orq :shared; my $trimq :shared; my $failedfile :shared; my $done :shared; $done = 0; my $nrin :shared; $nrin = 0; my $rqz :shared; my $fqz :shared; my $rand :shared; my $discarded :shared; $discarded = 0; my $verbose :shared; $verbose = 1; my $trim3 :shared; $trim3 = 1; my $trim5 :shared; $trim5 = 1; # create file lock in the tmp dir $rand = int(rand(10000)); while (-e "/tmp/$rand.lock") { $rand = int(rand(10000)); } system("touch '/tmp/$rand.lock'"); if (!-e "/tmp/sg.write.lock") { system("touch '/tmp/sg.write.lock'"); system("chmod 777 '/tmp/sg.write.lock'"); } # opts # i : path to forward read FASTQ file # I : path to reverse read FASTQ file # q : quality threshold in phred. # n : string representing common read name parts # s : trimming style : simple/bwa # o : output file for forward reads # O : output file for reverse reads # F : output file with fully failed reads pairs. # v : verbose: defaults to on (0/1) # S : Trim on side: 5/3/b(oth) # m : minimal length. getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ; if (defined($opts{'v'})) { $verbose = $opts{'v'}; } if (defined($opts{'s'})) { if ($opts{'s'} eq 'bwa') { if ($verbose == 1) { print "Using BWA style trimming\n"; } $style = 'bwa'; } else { if ($verbose == 1) { print "Using simple 1bp Quality trimming\n"; } $style = 'simple'; } } else { if ($verbose == 1){ print "Using simple 1bp Quality trimming\n"; } $style = 'simple'; } if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');} $fq = $opts{'i'}; $ofq = $opts{'o'}; ## gzipped infile? my $out = `file $fq`; if ($out =~ m/gzip compressed/) { if ($verbose == 1) { print "Forward reads in gzipped format.\n"; } $fqz = 1; } else { if ($verbose == 1) { print "Forward reads in plain fastq format.\n"; } $fqz = 0; } if (defined($opts{'I'})) { if ($verbose == 1) { print "Reverse reads provided. Performing paired-end trimming\n"; } $paired = 1; $rq = $opts{'I'}; $orq = $opts{'O'}; # gzipped ? my $out = `file $rq`; if ($out =~ m/gzip compressed/) { if ($verbose == 1) { print "Reverse reads in gzipped format.\n"; } $rqz = 1; } else { if ($verbose == 1) { print "Reverse reads in plain fastq format.\n"; } $rqz = 0; } } ## FAILED FILENAME $failedfile = $opts{'F'}; ## set the sides to trim my $trimside = ''; if (defined($opts{'S'})) { if ($opts{'S'} eq '3') { $trimside = "Trimming reads from the 3' end only\n"; $trim3 = 1; $trim5 = 0; } elsif ($opts{'S'} eq '5') { $trimside = "Trimming reads from the 5' end only\n"; $trim3 = 0; $trim5 = 1; } elsif ($opts{'S'} eq 'b') { $trimside = "Trimming reads both from 5' and 3'\n"; $trim3 = 1; $trim5 = 1; } else { $trimside = "Invalid trimming side specification. Setting to both sides trimming\n"; $trim3 = 1; $trim5 = 1; } } else { $trimside = "Invalid trimming side specification. Setting to both sides trimming\n"; $trim3 = 1; $trim5 = 1; } if ($verbose == 1) { print $trimside; } # SET MINIMAL LENGTH my $minlength :shared; if (exists($opts{'m'})) { $minlength = $opts{'m'}; } else { $minlength = 0; } ## CHECK IF OUTPUT FILESNAMES ARE PRESENT if ($ofq eq '') { $ofq = "$fq.$style.trimmed"; if ($verbose == 1) { print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n"; } } if ($paired == 1 && $orq eq '') { $orq = "$rq.$style.trimmed"; if ($verbose == 1) { print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n"; } } if ($failedfile eq '') { $failedfile = "$fq.Failed_Reads"; if ($verbose == 1) { print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n"; } } ## set trimming threshold if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) { $trimq = 20; } else { $trimq = $opts{'q'}; } ## set read delimiter if (defined($opts{'n'})) { $indelim = $opts{'n'}; # galaxy replaced @ by __at__ if (substr($indelim,0,6) eq '__at__') { $indelim = '@'.substr($indelim,6); } } else { print "No indelim defined\n"; $indelim = "@"; # has risk, fails if @ is first qual item } ## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq. if ($indelim eq '@') { if ($fqz == 0) { open IN, $fq; } else { open(IN, "gunzip -c $fq | "); } $head = <IN>; $indelim = substr($head,0,5); print "\n"; print "WARNING:\n"; print "########\n"; print " Setting Delimiter as '\@' is prone to errors\n"; print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n"; print ' It is recommended to use eg -n \'\@SRR301\''."\n"; print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n"; print " => Delimiter : $indelim\n"; print "\n\n"; sleep 3; } ## create queues my $totrim = Thread::Queue->new(); my $toprint = Thread::Queue->new(); my $dummy = Thread::Queue->new(); my $failed = Thread::Queue->new(); # monitor thread my $finish :shared; $finish = 0; $mon = threads->create('monitor'); # start FASTQ reading thread if ($verbose == 1) { print "start reading file\n"; } $reading = threads->create('readfile'); ## give time to build reading buffer sleep 10; # start trimming threads if ($verbose == 1) { print "start trimming\n"; } if ($paired == 1) { if ($style eq 'bwa') { $trimming1 = threads->create('bwapaired'); $trimming2 = threads->create('bwapaired'); } else { $trimming1 = threads->create('simplepaired'); $trimming2 = threads->create('simplepaired'); } } else { if ($style eq 'bwa') { #print "not available yet, switching to simple trimming\n"; #$style = 'simple'; $trimming1 = threads->create('bwasingle'); $trimming2 = threads->create('bwasingle'); } if ($style eq 'simple') { $trimming1 = threads->create('simplesingle'); $trimming2 = threads->create('simplesingle'); } } # start printing threads $printout = threads->create('printout'); $printfail = threads->create('printfailed'); # end threads $reading->join(); ## all reads are queued for trimming, add undef to trimming queues to end threads. $totrim->enqueue(undef); $totrim->enqueue(undef); $trimming1->join(); $trimming2->join(); ## all reads are trimmed and queued from printing, send undef to end printing threads. $toprint->enqueue(undef); $failed->enqueue(undef); $printout->join(); $printfail->join(); if ($verbose == 1) { print "Waiting for status monitor to shut down\n"; } $mon->join(); ## copy files from /tmp to destination folder if ($verbose == 1) { print "Copying files to destination: Forward.."; } if ($fqz == 0) { system("cp -f /tmp/$rand.fq $ofq"); unlink("/tmp/$rand.fq"); } else { system("gzip -c /tmp/$rand.fq > $ofq"); unlink("/tmp/$rand.fq"); } if ($paired == 1) { if ($verbose == 1) { print " Reverse.."; } if ($rqz == 0) { system("cp -f /tmp/$rand.rq $orq"); unlink("/tmp/$rand.rq"); } else { system("gzip -c /tmp/$rand.rq > $orq"); unlink("/tmp/$rand.rq"); } } if ($verbose == 1) { print " Failed items.."; } system("cp -f /tmp/$rand.failed $failedfile"); unlink("/tmp/$rand.failed"); if ($verbose == 1) { print " Done\n"; } # remove lock file unlink("/tmp/$rand.lock"); ######################## ## print run details: ## ######################## my $string = sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n"; if ($paired == 1) { $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n"; $string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n"; } else { $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n"; $string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n"; } print $string; ################## # PRINT RUN-TIME # ################## $now = time - $now; printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60)); exit(); ## SUBROUTINE : READ FASTQ FILES ################################ sub readfile { if ($paired == 1) { if ($fqz == 0) { open IN1, $fq; } else { open(IN1, "gunzip -c $fq | "); } if ($rqz == 0) { open IN2, $rq; } else { open(IN2, "gunzip -c $rq | "); } $/ = "\n$indelim"; $deliml = length($indelim); # compose pack ignore string my $dx = ''; for (my $idx = 0; $idx<length($indelim); $idx++) { $dx .= 'X'; } my $f = <IN1>; my $r = <IN2>; # the reads contain the delimiter on the end (cf \n on regular file read line endings.) # so we strip teh delimiter using pack, leaving just the \n. # this means that in subsequent reads, we need to add the delimiter to the front of the read. my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|'; my $counter = 1; my $fr = ''; my $rr = ''; my $reads = ''; my $lastf = ''; while ($fr = <IN1>) { $rr = <IN2>; $lastf = $fr; ## add delim to front, trim from back. add our delim. ## done in next iteration, because last is different. # the delimiter from the forward read is used to complete the reverse read. $reads .= $indelim.pack("A*$dx",$read) .'|'; # concat reads $read = $fr.$rr; ## add string to queue if 20,000 reads in array if ($counter == 20000) { $nrin = $nrin + 20000; $totrim->enqueue(pack("A*X",$reads)); # pack("A*X") strips last | $reads = ''; $counter = 0; } $counter++; ## dummy (see monitor). my $d = $dummy->dequeue_nb(); } # last reads in files need special care. $reads .= $indelim.$lastf.$indelim.$rr; close IN1; close IN2; # submit last reads to queue if ($reads ne '') { #$totrim->enqueue(pack("A*X",$reads)); $totrim->enqueue($reads); } if ($verbose == 1) { print "All reads queued for trimming\n"; } } else { if ($fqz == 0) { open IN1, $fq; } else { open(IN1, "gunzip -c $fq | "); } $/ = "\n$indelim"; $deliml = length($indelim); # compose pack ignore string my $dx = ''; for (my $idx = 0; $idx<length($indelim); $idx++) { $dx .= 'X'; } my $f = <IN1>; # the reads contain the delimiter on the end (cf \n on regular file read line endings.) # so we strip teh delimiter using pack, leaving just the \n. # this means that in subsequent reads, we need to add the delimiter to the front of the read. my $read = substr($f,$deliml); my $counter = 1; my $fr = ''; my $reads = ''; while ($fr = <IN1>) { ## add delim to front, trim from back. add our delim. ## done in next iteration, because last is different. $reads .= $indelim.pack("A*$dx",$read) .'|'; # concat reads $read = $fr; ## add string to queue if 10,000 reads in array if ($counter == 40000) { $nrin = $nrin + 40000; $totrim->enqueue(pack("A*X",$reads)); $reads = ''; $counter = 0; } $counter++; ## dummy (see monitor). my $d = $dummy->dequeue_nb(); } $reads .= $indelim.$read; close IN1; # submit last reads to queue if ($reads ne '') { #$totrim->enqueue(pack("A*X",$reads)); $totrim->enqueue($reads); } if ($verbose == 1) { print "All reads queued for trimming\n"; } } } sub bwapaired { my $tq = $trimq; while ( defined(my $rstring = $totrim->dequeue())) { my @reads = split(/\|/,$rstring); # each item has 20.000 reads my $printstring = ''; my $failedstring = ''; foreach (@reads) { my $read = $_; my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read); ## skip too short reads if (length($q1) < $minlength || length($q2) < $minlength) { $discarded++; $failedstring .= $read ;#. '|'; next; } # initial check for maximal quality value. ## quality arrays my @q1array = unpack("C*",$q1); if (max(@q1array) < $tq + 33) { #print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n"; $discarded++; $failedstring .= $read ;#. '|'; next; } my @q2array = unpack("C*",$q2); if (max(@q2array) < $trimq + 33) { #print "Discarding on failed reverse qualities\n$read\n"; $discarded++; $failedstring .= $read ;#. '|'; next; } ############# ## Trim 3' ## ############# if ($trim3 == 1) { #trim first my $pos = length($q1) - 1; my $maxPos = $pos; ## skip empty read. #if ($pos <= 0) { # $discarded++; # $failedstring .= $read . '|'; # #print "skip empty : \n$read\n"; # next; #} my $area = 0; my $maxArea = 0; ## unpack qual-string to array of phred scores #my @qarray = unpack("C*",$q1); # use from initial check. while ($pos>0 && $area>=0) { $area += $tq - ($q1array[($pos)] - 33); ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read. if ($area < $maxArea) { $pos--; next; } else { $maxArea = $area; $maxPos = $pos; $pos--; next; } } if ($maxPos==0) { # scanned whole read and didn't integrate to maximum? skip (no output for this read) $discarded++; $failedstring .= $read ;#. '|'; next; } # otherwise trim before position where area reached a maximum # $maxPos as length of substr gives positions zero to just before the maxpos. #$s1 = substr($s1,0,$maxPos); #$q1 = substr($q1,0,$maxPos); $s1 = unpack("A$maxPos",$s1); $q1 = unpack("A$maxPos",$q1); #trim second $pos = length($q2); $maxPos = $pos; ## skip empty read. #if ($pos <= 0) { # $discarded++; # $failedstring .= $read . '|'; # next; #} $area = 0; $maxArea = 0; #@qarray = unpack("C*",$q2); while ($pos>0 && $area>=0) { $area += $tq - ($q2array[($pos)] - 33); if ($area < $maxArea) { $pos--; next; } else { $maxArea = $area; $maxPos = $pos; $pos--; next; } } if ($maxPos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) $discarded++; $failedstring .= $read ;#. '|'; next; } # Otherwise trim before position where area reached a maximum $s2 = unpack("A$maxPos",$s2); $q2 = unpack("A$maxPos",$q2); #$s2 = substr($s2,0,$maxPos); #$q2 = substr($q2,0,$maxPos); } ############# ## TRIM 5' ## ############# if ($trim5 == 1) { #trim first my $skip = 0; #length($q1); my $maxPos = $skip; my $area = 0; my $maxArea = 0; my @qarray = unpack("C*",$q1); while ($skip<length($q1) && $area>=0) { $area += $tq - ($qarray[($skip)] - 33); if ($area < $maxArea) { $skip++; next; } else { $maxArea = $area; $maxPos = $skip; $skip++; next; } } if ($maxPos==length($q1)- 1) { # => maximal value at end of string => no point of no return found. $discarded++; $failedstring .= $read ;#. '|'; next; } # trim after position where area reached a maximum #$s1 = substr($s1,($maxPos+1)); #$q1 = substr($q1,($maxPos+1)); $s1 = unpack("x$maxPos A*",$s1); $q1 = unpack("x$maxPos A*",$q1); #trim second $skip = 0 ;#length($q2); $maxPos = $skip; $area = 0; $maxArea = 0; @qarray = unpack("C*",$q2); while ($skip < length($q2) && $area>=0) { $area += $tq - ($qarray[($skip)] - 33); if ($area < $maxArea) { $skip++; next; } else { $maxArea = $area; $maxPos = $skip; $skip++; next; } } if ($maxPos==(length($q2)-1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read) $discarded++; $failedstring .= $read ;#. '|'; next; } # trim after position where area reached a maximum #$s2 = substr($s2,($maxPos+1)); #$q2 = substr($q2,($maxPos+1)); $s2 = unpack("x$maxPos A*",$s2); $q2 = unpack("x$maxPos A*",$q2); } ## DISCARD IF TOO SHORT ## if (length($s1) < $minlength || length($s2) < $minlength) { $discarded++; $failedstring .= $read ;#. '|'; next; } # queue for printing my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2); $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator # | seperates pairs of reads; ~ seperates lines of single read entry. #print "$printstring\n"; } if ($printstring ne '') { $toprint->enqueue(pack("A*X",$printstring)); # removes trailing | #$toprint->enqueue($printstring); } if ($failedstring ne '') { #$failed->enqueue(pack("A*X",$failedstring)); $failed->enqueue($failedstring); } } } sub simplepaired { # local variables speed up threading my $tq = $trimq; while ( defined(my $rstring = $totrim->dequeue())){ my @reads = split(/\|/,$rstring); my $printstring = ''; my $failedstring = ''; foreach (@reads) { my $read = $_; my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read); ## skip too short reads if (length($q1) < $minlength || length($q2) < $minlength) { $discarded++; $failedstring .= $read; # . '|'; next; } # initial check for maximal quality value. ## quality arrays my @q1array = unpack("C*",$q1); if (max(@q1array) < $trimq + 33) { #print "Discarding on failed forward qualities\n$read\n"; $discarded++; $failedstring .= $read;# . '|'; next; } my @q2array = unpack("C*",$q2); if (max(@q2array) < $trimq + 33) { #print "Discarding on failed reverse qualities\n$read\n"; $discarded++; $failedstring .= $read;# . '|'; next; } ############# ## trim 3' ## ############# if ($trim3 == 1) { #trim first my $pos = length($q1) - 1; my $maxPos = $pos; #if ($pos <= 0) { # $discarded++; # $failedstring .= $read . '|'; # next; #} #my @qarray = unpack("C*",$q1); while ( $pos>0 ) { if ($tq > $q1array[$pos] - 33) { # score at position is below threshold $pos--; next; } else { # score is at or above threshold $maxPos = $pos; last; } } if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) $discarded++; $failedstring .= $read ;#. '|'; next; } # integrated to zero? trim before position where quality was below threshold # unpack is one based #$maxPos++; #$s1 = substr($s1,0,$maxPos); $s1 = unpack("A$maxPos",$s1); #$q1 = substr($q1,0,$maxPos); $q1 = unpack("A$maxPos",$q1); #trim second $pos = length($q2) -1; $maxPos = $pos; if ($pos <= 0) { $discarded++; $failedstring .= $read ;#. '|'; next; } #my @qarray = unpack("C*",$q2); while ($pos>0) { if ($tq > $q2array[$pos] - 33) { # score at position is below threshold $pos--; next; } else { # score is at or above threshold $maxPos = $pos; last; } } if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) $discarded++; $failedstring .= $read ;#. '|'; next; } # integrated to zero? trim before position where score below threshold #$maxPos++; $s2 = unpack("A$maxPos",$s2); $q2 = unpack("A$maxPos",$q2); #$s2 = substr($s2,0,$maxPos); #$q2 = substr($q2,0,$maxPos); } ############# ## trim 5' ## ############# if ($trim5 == 1) { # first my $skip = 0; #length($q1); #my $maxPos = $skip; my @qarray = unpack("C*",$q1); while ( $skip < length($q1) ) { if ($tq > $qarray[$skip] - 33) { # score at position is below threshold $skip++; next; } else { # score is at or above threshold #$maxPos = $skip; last; } } if ($skip == length($q1)) { # should not fail, as that would have happened on 3' trim $discarded++; $failedstring .= $read ;#. '|'; next; } # integrated to zero? trim before position where quality was below threshold # unpack is one based $s1 = unpack("x$skip A*",$s1); $q1 = unpack("x$skip A*",$q1); #$s1 = substr($s1,$skip); #$q1 = substr($q1,$skip); # second $skip = 0; #length($q1); my @qarray = unpack("C*",$q2); #$maxPos = $skip; while ( $skip < length($q2) ) { if ($tq > $qarray[$skip] - 33) { # score at position is below threshold $skip++; next; } else { # score is at or above threshold #$maxPos = $skip; last; } } if ($skip == length($q2)) { # should not fail, as that would have happened on 3' trim $discarded++; $failedstring .= $read ;#. '|'; next; } # integrated to zero? trim before position where quality was below threshold # unpack is one based $s2 = unpack("x$skip A*",$s2); $q2 = unpack("x$skip A*",$q2); #$s2 = substr($s2,$skip); #$q2 = substr($q2,$skip); } ## DISCARD IF TOO SHORT ## if (length($s1) < $minlength || length($s2) < $minlength) { $discarded++; $failedstring .= $read .#. '|'; next; } # queue for printing my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2); $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator # | seperates pairs of reads; ~ seperates lines of single read entry. } if ($printstring ne '') { $toprint->enqueue(pack("A*X",$printstring)); # removes trailing | #$toprint->enqueue($printstring); } if ($failedstring ne '') { #$failed->enqueue(pack("A*X",$failedstring)); $failed->enqueue($failedstring); } } } sub bwasingle { # local variables speed up threading my $tq = $trimq; while ( defined(my $rstring = $totrim->dequeue())){ my @reads = split(/\|/,$rstring); my $printstring = ''; my $failedstring = ''; foreach (@reads) { my $read = $_; my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read); ## skip too short reads if (length($q1) < $minlength ) { $discarded++; $failedstring .= $read ;#. '|'; next; } # initial check for maximal quality value. ## quality arrays my @q1array = unpack("C*",$q1); if (max(@q1array) < $tq + 33) { print "Discarding on failed qualities\n$read\n"; $discarded++; $failedstring .= $read;# . '|'; next; } ############# ## trim 3' ## ############# if ($trim3 == 1) { my $pos = length($q1) - 1; my $maxPos = $pos; ## skip empty read. #if ($pos <= 0) { # $discarded++; # $failedstring .= $read . '|'; # next; #} my $area = 0; my $maxArea = 0; #my @qarray = unpack("C*",$q1); while ($pos>0 && $area>=0) { $area += $tq - ($q1array[($pos)] - 33); ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read. if ($area < $maxArea) { $pos--; next; } else { $maxArea = $area; $maxPos = $pos; $pos--; next; } } ## The online perl approach is wrong if the high-quality part is not long enough to reach 0. if ($maxPos == 0) { # maximal badness on position 0 => no point of no return reached. $discarded++; $failedstring .= $read ;#. '|'; next; } # otherwise trim before position where area reached a maximum # $maxPos as length of substr gives positions zero to just before the maxpos. $s1 = unpack("A$maxPos",$s1); $q1 = unpack("A$maxPos",$q1); } ############# ## trim 5' ## ############# if ($trim5 == 1) { my $skip = 0; #length($q1); my $maxPos = $skip; my $area = 0; my $maxArea = 0; my @qarray = unpack("C*",$q1); while ($skip<length($q1) && $area>=0) { #$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33); $area += $tq - ($qarray[($skip)] - 33); if ($area < $maxArea) { $skip++; next; } else { $maxArea = $area; $maxPos = $skip; $skip++; next; } } #if ($skip==length($q1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found. $discarded++; $failedstring .= $read ;#. '|'; next; } # trim after position where area reached a maximum $s1 = unpack("x$maxPos A*",$s1); $q1 = unpack("x$maxPos A*",$q1); #$s1 = substr($s1,($maxPos+1)); #$q1 = substr($q1,($maxPos+1)); } ## DISCARD IF TOO SHORT ## if (length($s1) < $minlength) { $discarded++; $failedstring .= $read ;#. '|'; next; } # queue for printing my @arr = ($h1a,$s1,$h1b,$q1); $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing } if ($printstring ne '') { #$toprint->enqueue(pack("A*X",$printstring)); # removes trailing | $toprint->enqueue($printstring); } if ($failedstring ne '') { #$failed->enqueue(pack("A*X",$failedstring)); $failed->enqueue($failedstring); } } } sub simplesingle { # local variables speed up threading my $tq = $trimq; while ( defined(my $rstring = $totrim->dequeue())){ my @reads = split(/\|/,$rstring); my $printstring = ''; my $failedstring = ''; foreach (@reads) { my $read = $_; my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read); ## skip too short reads if (length($q1) < $minlength ) { $discarded++; $failedstring .= $read ;#. '|'; next; } # initial check for maximal quality value. ## quality arrays my @q1array = unpack("C*",$q1); if (max(@q1array) < $trimq + 33) { #print "Discarding on failed qualities\n$read\n"; $discarded++; $failedstring .= $read ;#. '|'; next; } ############# ## trim 3' ## ############# if ($trim3 == 1) { my $pos = length($q1); my $maxPos = $pos; #if ($pos <= 0) { # $discarded++; # $failedstring .= $read . '|'; # next; #} #my @qarray = unpack("C*",$q1); while ( $pos>0 ) { if ($tq > $q1array[$pos] - 33) { # score at position is below threshold $pos--; next; } else { # score is at or above threshold $maxPos = $pos; last; } } if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read) $discarded++; $failedstring .= $read ;#. '|'; next; } # integrated to zero? trim before position where quality was below threshold # unpack is one based $s1 = unpack("A$maxPos",$s1); $q1 = unpack("A$maxPos",$q1); #$s1 = substr($s1,0,$maxPos); #$q1 = substr($q1,0,$maxPos); } ############# ## trim 5' ## ############# if ($trim5 == 1) { my $skip = 0; #length($q1); #my $maxPos = $skip; my @qarray = unpack("C*",$q1); while ( $skip < length($q1) ) { if ($tq > $qarray[$skip] - 33) { # score at position is below threshold $skip++; next; } else { # score is at or above threshold #$maxPos = $skip; last; } } if ($skip == length($q1)) { $discarded++; $failedstring .= $read ;#. '|'; next; } # integrated to zero? trim before position where quality was below threshold # unpack is one based #$s1 = substr($s1,$skip); #$q1 = substr($q1,$skip); $s1 = unpack("x$skip A*",$s1); $q1 = unpack("x$skip A*",$q1); } ## DISCARD IF TOO SHORT ## if (length($s1) < $minlength) { $discarded++; $failedstring .= $read ;#. '|'; next; } # queue for printing my @arr = ($h1a,$s1,$h1b,$q1); $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing } if ($printstring ne '') { $toprint->enqueue($printstring); # single reads => no | present, just \n } if ($failedstring ne '') { #$failed->enqueue(pack("A*X",$failedstring)); $failed->enqueue($failedstring); } } } sub printout { if ($rand eq '' || !defined($rand)) { die('Random value not passed on!'); } if ($paired == 0) { my $counter = 0; my $string1 = ''; if (-e "/tmp/$rand.fq") { unlink("/tmp/$rand.fq"); } while (defined(my $string = $toprint->dequeue())) { # each item represents 10000 input reads (might be less by trimming) #my @array = split(/\|/,$printstring); #$done = $done + scalar(@array); $done = $done + 10000; $string1 .= $string; $counter++; #print batches of 500,000 reads if ($counter == 50) { open OUT1, ">>/tmp/$rand.fq"; print OUT1 $string1; close OUT1; $string1 = ''; #$string2 = ''; $counter = 0; } } ## print last open OUT1, ">>/tmp/$rand.fq"; print OUT1 $string1; close OUT1; $finish = 1; } else { my $counter = 0; my $string1 = ''; my $string2 = ''; if (-e "/tmp/$rand.fq") { unlink("/tmp/$rand.fq"); } if (-e "/tmp/$rand.fq") { unlink("/tmp/$rand.rq"); } while (defined(my $printstring = $toprint->dequeue())) { # each item represents 10000 input reads my @array = split(/\|/,$printstring); $done = $done + scalar(@array); foreach (@array) { my @subarr = split(/\~/,$_); $string1 .= join("\n",@subarr[0..3])."\n"; $string2 .= join("\n",@subarr[4..7])."\n"; } $counter++; if ($counter == 50) { #print "Printing!\n"; flock("/tmp/sg.write.lock",2); open OUT1, ">>/tmp/$rand.fq"; print OUT1 $string1; close OUT1; open OUT2, ">>/tmp/$rand.rq"; print OUT2 $string2; close OUT2; flock("/tmp/sg.write.lock",8); $string1 = ''; $string2 = ''; $counter = 0; } } ## print last flock("/tmp/sg.write.lock",2); open OUT1, ">>/tmp/$rand.fq"; print OUT1 $string1; close OUT1; open OUT2, ">>/tmp/$rand.rq"; print OUT2 $string2; close OUT2; flock("/tmp/sg.write.lock",8); $finish = 1; } } sub printfailed { if (-e "/tmp/$rand.failed") { unlink("/tmp/$rand.failed"); } #open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'"); #FAIL->autoflush(1); my $nr = 0; my $string = ''; while ( defined(my $rstring = $failed->dequeue())) { #my @array = split(/\|/,$rstring); #foreach (@array) { $string .= $rstring; #} $nr++; if ($nr == 50) { open FAIL, ">>/tmp/$rand.failed"; print FAIL $string; close FAIL; $nr = 0; $string = ''; } #close FAIL; } open FAIL, ">>/tmp/$rand.failed"; print FAIL $string; close FAIL; } sub monitor { # checks the file read monitor my $counter = 0; while ($finish == 0) { $counter++; if ($totrim->pending() > 200) { lock($dummy); #print "Putting the file reading to sleep for 10 seconds.\n"; sleep 5; } else { sleep 5; } if ($counter == 6) { if ($verbose == 1) { my $string = ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n"; print $string ; } $counter = 0; } } }