Mercurial > repos > mcharles > rapsosnp
view rapsodyn/PrepareFastqLight.pl @ 5:b0cbb9d21aa9 draft
Uploaded
author | mcharles |
---|---|
date | Mon, 22 Sep 2014 10:19:53 -0400 |
parents | 9074a5104cdd |
children | 3f7b0788a1c4 |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; my $read1 = $ARGV[0]; my $read2 = $ARGV[1]; my $output1 = $ARGV[2]; my $output2 = $ARGV[3]; my $TYPE = $ARGV[4]; my $MIN_LENGTH = $ARGV[5]; my $MIN_QUALITY = $ARGV[6]; my $VERBOSE = $ARGV[7]; if (!$VERBOSE){ $VERBOSE ="OFF"; } open(READ1, $read1) or die ("Can't open $read1\n"); open(READ2, $read2) or die ("Can't open $read2\n"); open(OUT1, ">$output1") or die ("Can't open $output1\n"); open(OUT2, ">$output2") or die ("Can't open $output2\n"); my $error1=0; my $error2=0; my $error3=0; my $error4=0; my $error5=0; my $error6=0; my $error7=0; my $error8=0; my $error9=0; my $error10=0; while (my $ligne1_r1 =<READ1>){ my $ligne2_r1 =<READ1>; my $ligne3_r1 =<READ1>; my $ligne4_r1 =<READ1>; my $ligne1_r2 =<READ2>; my $ligne2_r2 =<READ2>; my $ligne3_r2 =<READ2>; my $ligne4_r2 =<READ2>; #@ 1 sec if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ if ($VERBOSE eq "ON"){ print "Error in file format"; if ($ligne1_r1){print $ligne1_r1;} if ($ligne2_r1){print $ligne2_r1;} if ($ligne3_r1){print $ligne3_r1;} if ($ligne4_r1){print $ligne4_r1;} if ($ligne1_r2){print $ligne1_r2;} if ($ligne2_r2){print $ligne2_r2;} if ($ligne3_r2){print $ligne3_r2;} if ($ligne4_r2){print $ligne4_r2;} print "\n"; } $error1++; } elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ if ($VERBOSE eq "ON"){ print "Error in header : format\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error2++; } #@ 1 - 2 sec else { my $length_seq1 = length($ligne2_r1); my $length_qual1 =length($ligne4_r1); my $seq1; my $qual1; my $length_seq2 = length($ligne2_r2); my $length_qual2 =length($ligne4_r2); my $seq2; my $qual2; my $header1=""; my $header2=""; my $repheader1=""; my $repheader2=""; if ($ligne1_r1 =~/^\@(.*?)\#/){ $header1 = $1; } if ($ligne3_r1 =~/^\+(.*?)\#/){ $repheader1 = $1; } if ($ligne1_r2 =~/^\@(.*?)\#/){ $header2 = $1; } if ($ligne3_r2 =~/^\+(.*?)\#/){ $repheader2 = $1; } #@ 2 sec ### Verification de la coherence sequence /qualité @ 1 sec if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ if ($VERBOSE eq "ON"){ print "Error in header : empty\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error3++; } elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ if ($VERBOSE eq "ON"){ print "Error in header refgsd : empty\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error3++; } elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ if ($VERBOSE eq "ON"){ print "Error in header : different\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error4++; } elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ if ($VERBOSE eq "ON"){ print "Error in header : different\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error4++; } elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ if ($VERBOSE eq "ON"){ print "Error in seq/qual length\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error5++; } #@ 1 - 2 sec else { ### Parsing sequence & qualité if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ $seq1 = $1; } if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ $seq2 = $1; } if ($ligne4_r1 =~ /^(.*)\s*$/i){ $qual1 = $1; } if ($ligne4_r2 =~ /^(.*)\s*$/i){ $qual2 = $1; } #@ 2 sec ### Verification du parsing et de la coherence sequence /qualité (n°2) if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ if ($VERBOSE eq "ON"){ print "Error parsing seq / quality \n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error6++; } elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ if ($VERBOSE eq "ON"){ print "Error in seq/qual length after parsing\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error7++; } #@ <1 sec else { my $fastq_lines_r1=""; my $fastq_lines_r2=""; $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); if ($fastq_lines_r1){ $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); } if ($fastq_lines_r2){ print OUT1 $fastq_lines_r1; print OUT2 $fastq_lines_r2; } } } # print OUT1 $ligne1_r1; # print OUT1 $ligne2_r1; # print OUT1 $ligne3_r1; # print OUT1 $ligne4_r1; # print OUT2 $ligne1_r2; # print OUT2 $ligne2_r2; # print OUT2 $ligne3_r2; # print OUT2 $ligne4_r2; #@ 7 sec } } close (READ1); close (READ2); close (OUT1); close (OUT2); sub grooming_and_trimming{ my $header = shift; my $seq = shift; my $quality = shift; my $quality_converted=""; my $quality_ori=$quality; my $lengthseq = length($seq); my $startTrim = 0; my $stopTrim = length($quality)-1; my $startnoN = $startTrim; my $stopnoN = $stopTrim; my $chercheN = $seq; my @bad_position_N; my @bad_position_Q; my $current_index = index($chercheN,"N"); my $abs_index = $current_index; while ($current_index >=0){ push (@bad_position_N,$abs_index); if ($current_index<length($seq)){ $chercheN = substr($chercheN,$current_index+1); $current_index = index($chercheN,"N"); $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1; } else { last; } } my @q = split(//,$quality); for (my $i=0;$i<=$#q;$i++){ my $chr = $q[$i]; my $num = ord($q[$i]); if ($TYPE eq "illumina"){ $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40) $quality_converted .= chr($num); } if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger push(@bad_position_Q,$i); } } if ($quality_converted){$quality = $quality_converted;} my @bad_position = (@bad_position_N, @bad_position_Q); if ($#bad_position>=0){ @bad_position = sort {$a <=> $b} @bad_position; my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)}; $startTrim = $coord{"start"}; $stopTrim = $coord{"stop"}; #print "$startTrim .. $stopTrim\n"; } my $lengthTrim = $stopTrim - $startTrim +1; my $fastq_lines=""; # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ # print "HEAD:\t$header"; # print "SEQ:\n$seq\n"; # print "$quality_ori\n"; # print "$quality\n"; # for (my $i=0;$i<=$#bad_position;$i++){ # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t"; # } # print "\n"; # print "$startTrim .. $stopTrim / $lengthTrim \n"; # print $fastq_lines; # print "\n"; # } if ($lengthTrim >= $MIN_LENGTH){ $fastq_lines .= $header; $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n"; $fastq_lines .= "+\n"; $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n"; return $fastq_lines; } else { #print "Insufficient length after trimming\n"; return ""; } } sub extract_longer_string_coordinates_from_bad_position{ my $start=shift; my $stop =shift; my $refbad = shift; my @bad_position = @$refbad; my %coord; my $current_start = $start; my $current_stop = $bad_position[0]-1; if ($current_stop < $start){$current_stop = $start;} #debut -> premier N my $current_length = $current_stop - $current_start +1; my $test_length; #entre les N for (my $i=1;$i<=$#bad_position;$i++){ $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; if ( $test_length > $current_length){ $current_start = $bad_position[$i-1]+1; $current_stop = $bad_position[$i]-1; $current_length = $current_stop - $current_start +1; } } #dernier N -> fin $test_length = $stop-$bad_position[$#bad_position]+1; if ( $test_length > $current_length){ $current_start = $bad_position[$#bad_position]+1; if ($current_start > $stop){$current_start=$stop;} $current_stop = $stop; } $coord{"start"}=$current_start; $coord{"stop"}= $current_stop; $coord{"lenght"}=$current_stop-$current_start+1; return \%coord; }