Mercurial > repos > mcharles > rapsosnp
view rapsodyn/PrepareFastqLight.pl @ 12:c794dafd1ae5 draft
Uploaded
author | mcharles |
---|---|
date | Mon, 19 Jan 2015 04:46:31 -0500 |
parents | 0a6c1cfe4dc8 |
children | 56d328bce3a7 |
line wrap: on
line source
#!/usr/bin/perl #v1.1.0 manage empty files #v1.0.4 bug correction, last read not considered #v1.0.3 support rapsodyn header (.... 1:... / .... 2:...) #V1.0.2 added auto type detection #V1.0.1 added log, option parameters use strict; use warnings; use Getopt::Long; my $read1_file; my $read2_file; my $log_file; my $output1_file; my $output2_file; my $TYPE="sanger"; my $MIN_LENGTH=30; my $MIN_QUALITY=30; my $VERBOSE = "OFF"; GetOptions ( "read1_file=s" => \$read1_file, "read2_file=s" => \$read2_file, "log_file=s" => \$log_file, "output1_file=s" => \$output1_file, "output2_file=s" => \$output2_file, "type=s" => \$TYPE, "min_length=i" => \$MIN_LENGTH, "min_quality=i" => \$MIN_QUALITY, "verbose=s" => \$VERBOSE ) or die("Error in command line arguments\n"); my $nb_read1=0; my $nb_base_read1=0; my $nb_read2=0; my $nb_base_read2=0; my $nb_read1_t=0; my $nb_base_read1_t=0; my $nb_read2_t=0; my $nb_base_read2_t=0; my $nb_base_current_t=0; open(READ1, $read1_file) or die ("Can't open $read1_file\n"); open(READ2, $read2_file) or die ("Can't open $read2_file\n"); open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n"); open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n"); open (LF,">$log_file") or die("Can't open $log_file\n"); if (( -z READ1)&&( -z READ2)){ exit(0); } elsif (( -z READ1)||( -z READ2)){ print STDERR "One empty File\n"; exit(0); } my $error1=0; my $error2=0; my $error3=0; my $error4=0; my $error5=0; my $error6=0; my $error7=0; my $error8=0; my $error9=0; my $error10=0; my $auto_type=""; my %qual; if ($TYPE eq "auto"){ my $compt=0; open(DETECT, $read1_file) or die ("Can't open $read1_file\n"); while (my $ligne1_r1 =<DETECT>){ my $ligne2_r1 =<DETECT>; my $ligne3_r1 =<DETECT>; my $ligne4_r1 =<DETECT>; $compt++; if ($ligne4_r1 =~ /^(.*)\s*$/i){ my $qual = $1; my @q = split(//,$qual); for (my $i=0;$i<=$#q;$i++){ my $num = ord($q[$i]); if ($qual{$num}){ $qual{$num}++; } else { $qual{$num} = 1; } #range sanger / illumina 1.8+ : 33->94 #range illumina 1.3->1.7 : 64->105 if ($num > 94){$auto_type = "illumina";last;} if ($num < 64){$auto_type = "sanger";last;} } } else { print STDERR "Error in format detection : quality not recognized\n$ligne4_r1"; exit(0); } if ($auto_type ne ""){ last; } } close (DETECT); if ($auto_type eq ""){ print STDERR "Error in format detection : type not recognized parsing read1\n"; foreach my $key (sort {$a <=> $b} keys %qual){ print "$key\t:\t",$qual{$key},"\n"; } exit(0); } else { $TYPE = $auto_type; } } my $compt=0; while (my $ligne1_r1 =<READ1>){ my $ligne2_r1 =<READ1>; my $ligne3_r1 =<READ1>; my $ligne4_r1 =<READ1>; my $ligne1_r2 =<READ2>; my $ligne2_r2 =<READ2>; my $ligne3_r2 =<READ2>; my $ligne4_r2 =<READ2>; # chomp($ligne1_r1); # chomp($ligne2_r1); # chomp($ligne3_r1); # chomp($ligne4_r1); # chomp($ligne2_r1); $compt++; $nb_read1++; $nb_read2++; #@ 1 sec if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ if ($VERBOSE eq "ON"){ print "Error in file format"; if ($ligne1_r1){print $ligne1_r1;} if ($ligne2_r1){print $ligne2_r1;} if ($ligne3_r1){print $ligne3_r1;} if ($ligne4_r1){print $ligne4_r1;} if ($ligne1_r2){print $ligne1_r2;} if ($ligne2_r2){print $ligne2_r2;} if ($ligne3_r2){print $ligne3_r2;} if ($ligne4_r2){print $ligne4_r2;} print "\n"; } $error1++; } elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ if ($VERBOSE eq "ON"){ print "Error in header : format\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error2++; } #@ 1 - 2 sec else { my $length_seq1 = length(chomp($ligne2_r1)); my $length_qual1 =length(chomp($ligne4_r1)); my $seq1; my $qual1; my $length_seq2 = length(chomp($ligne2_r2)); my $length_qual2 =length(chomp($ligne4_r2)); my $seq2; my $qual2; my $header1=""; my $header2=""; my $repheader1=""; my $repheader2=""; if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){ $header1 = $1; } if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){ $repheader1 = $1; } if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){ $header2 = $1; } if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){ $repheader2 = $1; } #@ 2 sec ### Verification de la coherence sequence /qualité @ 1 sec if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ if ($VERBOSE eq "ON"){ print "Error in header : empty\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error3++; } elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ if ($VERBOSE eq "ON"){ print "Error in header ref : empty\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error3++; } elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ if ($VERBOSE eq "ON"){ print "Error in header : different\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error4++; } elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ if ($VERBOSE eq "ON"){ print "Error in header : different\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error4++; } elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ if ($VERBOSE eq "ON"){ print "Error in seq/qual length\n"; print "$length_seq1 / $length_qual1 \t $length_seq2 / $length_qual2\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print "\n"; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error5++; } #@ 1 - 2 sec else { #print "TEST : $compt\n"; ### Parsing sequence & qualité if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ $seq1 = $1; $nb_base_read1 += length($seq1); } if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ $seq2 = $1; $nb_base_read2 += length($seq2); } if ($ligne4_r1 =~ /^(.*)\s*$/i){ $qual1 = $1; } if ($ligne4_r2 =~ /^(.*)\s*$/i){ $qual2 = $1; } #@ 2 sec ### Verification du parsing et de la coherence sequence /qualité (n°2) if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ if ($VERBOSE eq "ON"){ print "Error parsing seq / quality \n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error6++; } elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ if ($VERBOSE eq "ON"){ print "Error in seq/qual length after parsing\n"; print $ligne1_r1; print $ligne2_r1; print $ligne3_r1; print $ligne4_r1; print $ligne1_r2; print $ligne2_r2; print $ligne3_r2; print $ligne4_r2; print "\n"; } $error7++; } #@ <1 sec else { my $fastq_lines_r1=""; my $fastq_lines_r2=""; my $nb_base_current_read1_t = 0; my $nb_base_current_read2_t = 0; $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); $nb_base_current_read1_t = $nb_base_current_t; if ($fastq_lines_r1){ $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); $nb_base_current_read2_t = $nb_base_current_t; } if ($fastq_lines_r2){ print OUT1 $fastq_lines_r1; print OUT2 $fastq_lines_r2; $nb_read1_t++; $nb_read2_t++; $nb_base_read1_t += $nb_base_current_read1_t; $nb_base_read2_t += $nb_base_current_read2_t; } } } #@ 7 sec } } close (READ1); close (READ2); close (OUT1); close (OUT2); print LF "\n####\t Fastq preparation \n"; print LF "Fastq format : $TYPE\n"; print LF "## Before preparation\n"; print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n"; print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n"; print LF "## After preparation\n"; print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n"; print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n"; close (LF); sub grooming_and_trimming{ my $header = shift; my $seq = shift; my $quality = shift; my $quality_converted=""; my $quality_ori=$quality; my $lengthseq = length($seq); my $startTrim = 0; my $stopTrim = length($quality)-1; my $startnoN = $startTrim; my $stopnoN = $stopTrim; my $chercheN = $seq; my @bad_position_N; my @bad_position_Q; my $current_index = index($chercheN,"N"); my $abs_index = $current_index; while ($current_index >=0){ push (@bad_position_N,$abs_index); if ($current_index<length($seq)){ $chercheN = substr($chercheN,$current_index+1); $current_index = index($chercheN,"N"); $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1; } else { last; } } my @q = split(//,$quality); for (my $i=0;$i<=$#q;$i++){ my $chr = $q[$i]; my $num = ord($q[$i]); if ($TYPE eq "illumina"){ $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40) $quality_converted .= chr($num); } if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger push(@bad_position_Q,$i); } } if ($quality_converted){$quality = $quality_converted;} my @bad_position = (@bad_position_N, @bad_position_Q); if ($#bad_position>=0){ @bad_position = sort {$a <=> $b} @bad_position; my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)}; $startTrim = $coord{"start"}; $stopTrim = $coord{"stop"}; #print "$startTrim .. $stopTrim\n"; } my $lengthTrim = $stopTrim - $startTrim +1; #if ($stats_length{$lengthTrim}){ # $stats_length{$lengthTrim} = 1; #} #else { # $stats_length{$lengthTrim}++; #} my $fastq_lines=""; # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ # print "HEAD:\t$header"; # print "SEQ:\n$seq\n"; # print "$quality_ori\n"; # print "$quality\n"; # for (my $i=0;$i<=$#bad_position;$i++){ # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t"; # } # print "\n"; # print "$startTrim .. $stopTrim / $lengthTrim \n"; # print $fastq_lines; # print "\n"; # } #for (my $i=$startTrim;$i<=$stopTrim;$i++){ # if ($stats_quality{ord($q{$i])}){ # $stats_quality{ord($q{$i])}=1; # } # else { # $stats_quality{ord($q{$i])}++; # } #} if ($lengthTrim >= $MIN_LENGTH){ $fastq_lines .= $header; my $new_seq = substr($seq,$startTrim,$lengthTrim); $nb_base_current_t = length($new_seq); $fastq_lines .= $new_seq."\n"; $fastq_lines .= "+\n"; my $new_q = substr($quality,$startTrim,$lengthTrim); $fastq_lines .= $new_q."\n"; return $fastq_lines; } else { #print "Insufficient length after trimming\n"; return ""; } } sub extract_longer_string_coordinates_from_bad_position{ my $start=shift; my $stop =shift; my $refbad = shift; my @bad_position = @$refbad; my %coord; my $current_start = $start; my $current_stop = $bad_position[0]-1; if ($current_stop < $start){$current_stop = $start;} #debut -> premier N my $current_length = $current_stop - $current_start +1; my $test_length; #entre les N for (my $i=1;$i<=$#bad_position;$i++){ $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; if ( $test_length > $current_length){ $current_start = $bad_position[$i-1]+1; $current_stop = $bad_position[$i]-1; $current_length = $current_stop - $current_start +1; } } #dernier N -> fin $test_length = $stop-$bad_position[$#bad_position]+1; if ( $test_length > $current_length){ $current_start = $bad_position[$#bad_position]+1; if ($current_start > $stop){$current_start=$stop;} $current_stop = $stop; } $coord{"start"}=$current_start; $coord{"stop"}= $current_stop; $coord{"lenght"}=$current_stop-$current_start+1; return \%coord; }