Mercurial > repos > mcharles > rapsosnp
diff rapsodyn/PrepareFastqLight.pl~ @ 15:56d328bce3a7 draft default tip
Uploaded
author | mcharles |
---|---|
date | Thu, 29 Jan 2015 08:54:06 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/PrepareFastqLight.pl~ Thu Jan 29 08:54:06 2015 -0500 @@ -0,0 +1,512 @@ +#!/usr/bin/perl +#V1.0.2 added auto type detection +#V1.0.1 added log, option parameters +use strict; +use warnings; +use Getopt::Long; + +my $read1_file; +my $read2_file; +my $log_file; +my $output1_file; +my $output2_file; + +my $TYPE="sanger"; +my $MIN_LENGTH=30; +my $MIN_QUALITY=30; + +my $VERBOSE = "OFF"; + +GetOptions ( +"read1_file=s" => \$read1_file, +"read2_file=s" => \$read2_file, +"log_file=s" => \$log_file, +"output1_file=s" => \$output1_file, +"output2_file=s" => \$output2_file, +"type=s" => \$TYPE, +"min_length=i" => \$MIN_LENGTH, +"min_quality=i" => \$MIN_QUALITY, +"verbose=s" => \$VERBOSE +) or die("Error in command line arguments\n"); + + +my $nb_read1=0; +my $nb_base_read1=0; +my $nb_read2=0; +my $nb_base_read2=0; + +my $nb_read1_t=0; +my $nb_base_read1_t=0; +my $nb_read2_t=0; +my $nb_base_read2_t=0; + +my $nb_base_current_t=0; + +open(READ1, $read1_file) or die ("Can't open $read1_file\n"); +open(READ2, $read2_file) or die ("Can't open $read2_file\n"); +open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n"); +open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n"); +open (LF,">$log_file") or die("Can't open $log_file\n"); + + +my $error1=0; +my $error2=0; +my $error3=0; +my $error4=0; +my $error5=0; +my $error6=0; +my $error7=0; +my $error8=0; +my $error9=0; +my $error10=0; + +my $auto_type=""; +my %qual; +if ($TYPE eq "auto"){ + my $compt=0; + open(DETECT, $read1_file) or die ("Can't open $read1_file\n"); + while (my $ligne1_r1 =<DETECT>){ + my $ligne2_r1 =<DETECT>; + my $ligne3_r1 =<DETECT>; + my $ligne4_r1 =<DETECT>; + $compt++; + if ($ligne4_r1 =~ /^(.*)\s*$/i){ + my $qual = $1; + my @q = split(//,$qual); + for (my $i=0;$i<=$#q;$i++){ + my $num = ord($q[$i]); + if ($qual{$num}){ + $qual{$num}++; + } + else { + $qual{$num} = 1; + } + #range sanger / illumina 1.8+ : 33->94 + #range illumina 1.3->1.7 : 64->105 + if ($num > 94){$auto_type = "illumina";last;} + if ($num < 64){$auto_type = "sanger";last;} + } + } + else { + print STDERR "Error in format detection : quality not recognized\n$ligne4_r1"; + exit(0); + } + + if ($auto_type ne ""){ + last; + } + + } + close (DETECT); + if ($auto_type eq ""){ + print STDERR "Error in format detection : type not recognized parsing read1\n"; + foreach my $key (sort {$a <=> $b} keys %qual){ + print "$key\t:\t",$qual{$key},"\n"; + } + exit(0); + } + else { + $TYPE = $auto_type; + } +} + + + + +while (my $ligne1_r1 =<READ1>){ + my $ligne2_r1 =<READ1>; + my $ligne3_r1 =<READ1>; + my $ligne4_r1 =<READ1>; + my $ligne1_r2 =<READ2>; + my $ligne2_r2 =<READ2>; + my $ligne3_r2 =<READ2>; + my $ligne4_r2 =<READ2>; + + $nb_read1++; + $nb_read2++; + +#@ 1 sec + if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ + if ($VERBOSE eq "ON"){ + print "Error in file format"; + if ($ligne1_r1){print $ligne1_r1;} + if ($ligne2_r1){print $ligne2_r1;} + if ($ligne3_r1){print $ligne3_r1;} + if ($ligne4_r1){print $ligne4_r1;} + if ($ligne1_r2){print $ligne1_r2;} + if ($ligne2_r2){print $ligne2_r2;} + if ($ligne3_r2){print $ligne3_r2;} + if ($ligne4_r2){print $ligne4_r2;} + print "\n"; + } + $error1++; + } + elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ + if ($VERBOSE eq "ON"){ + print "Error in header : format\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error2++; + } +#@ 1 - 2 sec + else { + + my $length_seq1 = length($ligne2_r1); + my $length_qual1 =length($ligne4_r1); + my $seq1; + my $qual1; + + my $length_seq2 = length($ligne2_r2); + my $length_qual2 =length($ligne4_r2); + my $seq2; + my $qual2; + my $header1=""; + my $header2=""; + my $repheader1=""; + my $repheader2=""; + + + if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){ + $header1 = $1; + } + + if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){ + $repheader1 = $1; + } + + if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){ + $header2 = $1; + } + + if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){ + $repheader2 = $1; + } +#@ 2 sec + + ### Verification de la coherence sequence /qualité @ 1 sec + if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ + if ($VERBOSE eq "ON"){ + print "Error in header : empty\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error3++; + } + elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ + if ($VERBOSE eq "ON"){ + print "Error in header refgsd : empty\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error3++; + } + elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ + if ($VERBOSE eq "ON"){ + print "Error in header : different\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error4++; + } + elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ + if ($VERBOSE eq "ON"){ + print "Error in header : different\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error4++; + } + elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ + if ($VERBOSE eq "ON"){ + print "Error in seq/qual length\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error5++; + } +#@ 1 - 2 sec + else { + ### Parsing sequence & qualité + if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ + $seq1 = $1; + $nb_base_read1 += length($seq1); + } + if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ + $seq2 = $1; + $nb_base_read2 += length($seq2); + } + if ($ligne4_r1 =~ /^(.*)\s*$/i){ + $qual1 = $1; + } + if ($ligne4_r2 =~ /^(.*)\s*$/i){ + $qual2 = $1; + } +#@ 2 sec + ### Verification du parsing et de la coherence sequence /qualité (n°2) + if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ + if ($VERBOSE eq "ON"){ + print "Error parsing seq / quality \n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error6++; + } + elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ + if ($VERBOSE eq "ON"){ + print "Error in seq/qual length after parsing\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error7++; + } +#@ <1 sec + else { + my $fastq_lines_r1=""; + my $fastq_lines_r2=""; + my $nb_base_current_read1_t = 0; + my $nb_base_current_read2_t = 0; + + $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); + $nb_base_current_read1_t = $nb_base_current_t; + if ($fastq_lines_r1){ + $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); + $nb_base_current_read2_t = $nb_base_current_t; + } + if ($fastq_lines_r2){ + print OUT1 $fastq_lines_r1; + print OUT2 $fastq_lines_r2; + + $nb_read1_t++; + $nb_read2_t++; + $nb_base_read1_t += $nb_base_current_read1_t; + $nb_base_read2_t += $nb_base_current_read2_t; + + + } + } + } + + +#@ 7 sec + } +} + +close (READ1); +close (READ2); +close (OUT1); +close (OUT2); + +print LF "\n####\t Fastq preparation \n"; +print LF "Fastq format : $TYPE\n"; +print LF "## Before preparation\n"; +print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n"; +print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n"; +print LF "## After preparation\n"; +print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n"; +print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n"; +close (LF); + + +sub grooming_and_trimming{ + my $header = shift; + my $seq = shift; + my $quality = shift; + my $quality_converted=""; + my $quality_ori=$quality; + + my $lengthseq = length($seq); + my $startTrim = 0; + my $stopTrim = length($quality)-1; + my $startnoN = $startTrim; + my $stopnoN = $stopTrim; + + + my $chercheN = $seq; + my @bad_position_N; + my @bad_position_Q; + my $current_index = index($chercheN,"N"); + my $abs_index = $current_index; + while ($current_index >=0){ + push (@bad_position_N,$abs_index); + + if ($current_index<length($seq)){ + $chercheN = substr($chercheN,$current_index+1); + $current_index = index($chercheN,"N"); + $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1; + } + else { + last; + } + } + + my @q = split(//,$quality); + for (my $i=0;$i<=$#q;$i++){ + my $chr = $q[$i]; + my $num = ord($q[$i]); + if ($TYPE eq "illumina"){ + $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40) + $quality_converted .= chr($num); + } + + if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger + push(@bad_position_Q,$i); + } + } + if ($quality_converted){$quality = $quality_converted;} + + my @bad_position = (@bad_position_N, @bad_position_Q); + + if ($#bad_position>=0){ + @bad_position = sort {$a <=> $b} @bad_position; + my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)}; + $startTrim = $coord{"start"}; + $stopTrim = $coord{"stop"}; +#print "$startTrim .. $stopTrim\n"; + + } + my $lengthTrim = $stopTrim - $startTrim +1; + + #if ($stats_length{$lengthTrim}){ + # $stats_length{$lengthTrim} = 1; + #} + #else { + # $stats_length{$lengthTrim}++; + #} + my $fastq_lines=""; + +# if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ +# print "HEAD:\t$header"; +# print "SEQ:\n$seq\n"; +# print "$quality_ori\n"; +# print "$quality\n"; +# for (my $i=0;$i<=$#bad_position;$i++){ +# print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t"; +# } +# print "\n"; +# print "$startTrim .. $stopTrim / $lengthTrim \n"; +# print $fastq_lines; +# print "\n"; +# } + + #for (my $i=$startTrim;$i<=$stopTrim;$i++){ + # if ($stats_quality{ord($q{$i])}){ + # $stats_quality{ord($q{$i])}=1; + # } + # else { + # $stats_quality{ord($q{$i])}++; + # } + #} + + if ($lengthTrim >= $MIN_LENGTH){ + $fastq_lines .= $header; + my $new_seq = substr($seq,$startTrim,$lengthTrim); + $nb_base_current_t = length($new_seq); + $fastq_lines .= $new_seq."\n"; + $fastq_lines .= "+\n"; + my $new_q = substr($quality,$startTrim,$lengthTrim); + $fastq_lines .= $new_q."\n"; + return $fastq_lines; + + } + else { + #print "Insufficient length after trimming\n"; + return ""; + } +} + +sub extract_longer_string_coordinates_from_bad_position{ + my $start=shift; + my $stop =shift; + my $refbad = shift; + my @bad_position = @$refbad; + my %coord; + + my $current_start = $start; + my $current_stop = $bad_position[0]-1; + if ($current_stop < $start){$current_stop = $start;} + + + #debut -> premier N + my $current_length = $current_stop - $current_start +1; + my $test_length; + + #entre les N + for (my $i=1;$i<=$#bad_position;$i++){ + $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; + if ( $test_length > $current_length){ + $current_start = $bad_position[$i-1]+1; + $current_stop = $bad_position[$i]-1; + $current_length = $current_stop - $current_start +1; + } + } + + #dernier N -> fin + $test_length = $stop-$bad_position[$#bad_position]+1; + if ( $test_length > $current_length){ + $current_start = $bad_position[$#bad_position]+1; + if ($current_start > $stop){$current_start=$stop;} + $current_stop = $stop; + } + $coord{"start"}=$current_start; + $coord{"stop"}= $current_stop; + $coord{"lenght"}=$current_stop-$current_start+1; + + return \%coord; +}