Mercurial > repos > mcharles > rapsosnp
view rapsodyn/PrepareFastqLight.pl @ 4:9074a5104cdd draft
Uploaded
author | mcharles |
---|---|
date | Wed, 17 Sep 2014 04:20:08 -0400 |
parents | 442a7c88b886 |
children | b0cbb9d21aa9 |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; my $read1 = $ARGV[0]; my $read2 = $ARGV[1]; my $output1 = $ARGV[2]; my $output2 = $ARGV[3]; my $TYPE = $ARGV[4]; my $MIN_LENGTH = $ARGV[5]; my $MIN_QUALITY = $ARGV[6]; my $VERBOSE = $ARGV[7]; if (!$VERBOSE){ $VERBOSE ="OFF"; } open(READ1, $read1) or die ("Can't open $read1\n"); open(READ2, $read2) or die ("Can't open $read2\n"); open(OUT1, ">$output1") or die ("Can't open $output1\n"); open(OUT2, ">$output2") or die ("Can't open $output2\n"); my $error1=0; my $error2=0; my $error3=0; my $error4=0; my $error5=0; my $error6=0; my $error7=0; my $error8=0; my $error9=0; my $error10=0; while (my $ligne1_r1 =<READ1>){ my $ligne2_r1 =<READ1>; my $ligne3_r1 =<READ1>; my $ligne4_r1 =<READ1>; my $ligne1_r2 =<READ2>; my $ligne2_r2 =<READ2>; my $ligne3_r2 =<READ2>; my $ligne4_r2 =<READ2>; #@ 1 sec if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ if ($VERBOSE eq "ON"){ print "Error in file format"; if ($ligne1_r1){print $ligne1_r1;} if ($ligne2_r1){print $ligne2_r1;} if ($ligne3_r1){print $ligne3_r1;} if ($ligne4_r1){print $ligne4_r1;} if ($ligne1_r2){print $ligne1_r2;} if ($ligne2_r2){print $ligne2_r2;} if ($ligne3_r2){print $ligne3_r2;} if ($ligne4_r2){print $ligne4_r2;} print "\n"; } $error1++; } elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ if ($VERBOSE eq "ON"){ print "Error in header : format\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error2++; } #@ 1 - 2 sec else { my $length_seq1 = length($ligne2_r1); my $length_qual1 =length($ligne4_r1); my $seq1; my $qual1; my $length_seq2 = length($ligne2_r2); my $length_qual2 =length($ligne4_r2); my $seq2; my $qual2; my $header1=""; my $header2=""; my $repheader1=""; my $repheader2=""; if ($ligne1_r1 =~/^\@(.*?)\#/){ $header1 = $1; } if ($ligne3_r1 =~/^\+(.*?)\#/){ $repheader1 = $1; } if ($ligne1_r2 =~/^\@(.*?)\#/){ $header2 = $1; } if ($ligne3_r2 =~/^\+(.*?)\#/){ $repheader2 = $1; } #@ 2 sec ### Verification de la coherence sequence /qualité @ 1 sec if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ if ($VERBOSE eq "ON"){ print "Error in header : empty\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error3++; } elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ if ($VERBOSE eq "ON"){ print "Error in header refgsd : empty\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error3++; } elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ if ($VERBOSE eq "ON"){ print "Error in header : different\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error4++; } elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ if ($VERBOSE eq "ON"){ print "Error in header : different\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error4++; } elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ if ($VERBOSE eq "ON"){ print "Error in seq/qual length\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error5++; } #@ 1 - 2 sec else { ### Parsing sequence & qualité if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ $seq1 = $1; } if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ $seq2 = $1; } if ($ligne4_r1 =~ /^(.*)\s*$/i){ $qual1 = $1; } if ($ligne4_r2 =~ /^(.*)\s*$/i){ $qual2 = $1; } #@ 2 sec ### Verification du parsing et de la coherence sequence /qualité (n°2) if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ if ($VERBOSE eq "ON"){ print "Error parsing seq / quality \n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error6++; } elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ if ($VERBOSE eq "ON"){ print "Error in seq/qual length after parsing\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error7++; } #@ <1 sec else { my $fastq_lines_r1=""; my $fastq_lines_r2=""; $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); if ($fastq_lines_r1){ $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); } if ($fastq_lines_r2){ print OUT1 $fastq_lines_r1; print OUT2 $fastq_lines_r2; } } } # print OUT1 $ligne1_r1; # print OUT1 $ligne2_r1; # print OUT1 $ligne3_r1; # print OUT1 $ligne4_r1; # print OUT2 $ligne1_r2; # print OUT2 $ligne2_r2; # print OUT2 $ligne3_r2; # print OUT2 $ligne4_r2; #@ 7 sec } } close (READ1); close (READ2); close (OUT1); close (OUT2); sub grooming_and_trimming{ my $header = shift; my $seq = shift; my $quality = shift; my $quality_converted=""; my $startnoN = 0; my $stopnoN = length($quality)-1; #print "HEAD:\t$header"; #print "SEQ:\t$seq\n"; my $chercheN = $seq; my @bad_position; my $current_index = index($chercheN,"N"); my $abs_index = $current_index; while ($current_index >=0){ push (@bad_position,$abs_index); if ($current_index<length($seq)){ $chercheN = substr($chercheN,$current_index+1); $current_index = index($chercheN,"N"); $abs_index = $current_index + $bad_position[$#bad_position]+1; } else { last; } } if ($#bad_position>=0){ my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; $startnoN = $coord{"start"}; $stopnoN = $coord{"stop"}; } my $lengthnoN = $stopnoN - $startnoN + 1; my $seqnoN = substr($seq,$startnoN,$lengthnoN); # print "SEQnoN\t:$seqnoN\n"; # for (my $i=0;$i<=$#bad_position;$i++){ # print $bad_position[$i]."\t"; # } # print "\n"; if ($lengthnoN >= $MIN_LENGTH){ my $startTrim = $startnoN; my $stopTrim = $stopnoN; my $quality_converted=""; #my @bad_position; my @q = split(//,$quality); #print "QUALITY\n"; #print "$quality\n"; for (my $i=0;$i<=$stopnoN;$i++){ my $chr = $q[$i]; my $num = ord($q[$i]); if ($TYPE eq "illumina"){ $num = $num -64+33; $quality_converted .= chr($num); } if ($num <$MIN_QUALITY + 64 - 33 ){ push(@bad_position,$i+$startnoN); } } if ($quality_converted){$quality = $quality_converted;} #print "$quality\n"; if ($#bad_position>=0){ @bad_position = sort {$a <=> $b} @bad_position; # for (my $i=0;$i<=$#bad_position;$i++){ # print $bad_position[$i]."\t"; # } # print "\n"; my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; $startTrim = $coord{"start"}; $stopTrim = $coord{"stop"}; #print "$startTrim .. $stopTrim\n"; } my $lengthTrim = $stopTrim - $startTrim +1; my $fastq_lines=""; if ($lengthTrim >= $MIN_LENGTH){ $fastq_lines .= $header; $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n"; $fastq_lines .= "+\n"; $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n"; # print $fastq_lines; return $fastq_lines; } else { return ""; } } else { return ""; } # my @s = split(//,$seq); # my $sanger_quality=""; # return $sanger_quality; } sub extract_longer_string_coordinates_from_bad_position{ my $start=shift; my $stop =shift; my $refbad = shift; my @bad_position = @$refbad; my %coord; my $current_start = $start; my $current_stop = $bad_position[0]-1; if ($current_stop < $start){$current_stop = $start;} #debut -> premier N my $current_length = $current_stop - $current_start +1; my $test_length; #entre les N for (my $i=1;$i<=$#bad_position;$i++){ $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; if ( $test_length > $current_length){ $current_start = $bad_position[$i-1]+1; $current_stop = $bad_position[$i]-1; $current_length = $current_stop - $current_start +1; } } #dernier N -> fin $test_length = $stop-$bad_position[$#bad_position]+1; if ( $test_length > $current_length){ $current_start = $bad_position[$#bad_position]+1; if ($current_start > $stop){$current_start=$stop;} $current_stop = $stop; } $coord{"start"}=$current_start; $coord{"stop"}= $current_stop; $coord{"lenght"}=$current_stop-$current_start+1; return \%coord; }