diff rapsodyn/PrepareFastqLight.pl~ @ 15:56d328bce3a7 draft default tip

Uploaded
author mcharles
date Thu, 29 Jan 2015 08:54:06 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/PrepareFastqLight.pl~	Thu Jan 29 08:54:06 2015 -0500
@@ -0,0 +1,512 @@
+#!/usr/bin/perl
+#V1.0.2 added auto type detection
+#V1.0.1 added log, option parameters
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $read1_file;
+my $read2_file;
+my $log_file;
+my $output1_file;
+my $output2_file;
+
+my $TYPE="sanger";
+my $MIN_LENGTH=30;
+my $MIN_QUALITY=30;
+
+my $VERBOSE = "OFF";
+
+GetOptions (
+"read1_file=s" => \$read1_file,
+"read2_file=s" => \$read2_file,
+"log_file=s" => \$log_file,
+"output1_file=s" => \$output1_file,
+"output2_file=s" => \$output2_file,
+"type=s" => \$TYPE,
+"min_length=i" => \$MIN_LENGTH,
+"min_quality=i" => \$MIN_QUALITY,
+"verbose=s" => \$VERBOSE
+) or die("Error in command line arguments\n");
+
+
+my $nb_read1=0;
+my $nb_base_read1=0;
+my $nb_read2=0;
+my $nb_base_read2=0;
+
+my $nb_read1_t=0;
+my $nb_base_read1_t=0;
+my $nb_read2_t=0;
+my $nb_base_read2_t=0;
+
+my $nb_base_current_t=0;
+
+open(READ1, $read1_file) or die ("Can't open $read1_file\n");
+open(READ2, $read2_file) or die ("Can't open $read2_file\n");
+open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
+open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
+open (LF,">$log_file") or die("Can't open $log_file\n");
+
+
+my $error1=0;
+my $error2=0;
+my $error3=0;
+my $error4=0;
+my $error5=0;
+my $error6=0;
+my $error7=0;
+my $error8=0;
+my $error9=0;
+my $error10=0;
+
+my $auto_type="";
+my %qual;
+if ($TYPE eq "auto"){
+	my $compt=0;
+	open(DETECT, $read1_file) or die ("Can't open $read1_file\n"); 
+	while (my $ligne1_r1 =<DETECT>){
+		my $ligne2_r1 =<DETECT>;
+		my $ligne3_r1 =<DETECT>;
+		my $ligne4_r1 =<DETECT>;
+		$compt++;
+		if ($ligne4_r1 =~ /^(.*)\s*$/i){
+			my $qual = $1;
+			my @q = split(//,$qual);
+			for (my $i=0;$i<=$#q;$i++){
+				my $num = ord($q[$i]);
+				if ($qual{$num}){
+					$qual{$num}++;
+				}
+				else {
+					$qual{$num} = 1;
+				}
+				#range sanger / illumina 1.8+ 	: 33->94
+				#range illumina 1.3->1.7 	: 64->105
+				if ($num > 94){$auto_type = "illumina";last;}
+				if ($num < 64){$auto_type = "sanger";last;}
+			}
+		}
+		else {
+			print STDERR "Error in format detection : quality not recognized\n$ligne4_r1";
+			exit(0);
+		}
+	
+		if ($auto_type ne ""){
+			last;
+		}
+
+	}
+	close (DETECT);
+	if ($auto_type eq ""){
+		print STDERR "Error in format detection : type not recognized parsing read1\n";
+		foreach my $key (sort {$a <=> $b} keys %qual){
+			print "$key\t:\t",$qual{$key},"\n";
+		}
+		exit(0);
+	}
+	else {
+		$TYPE = $auto_type;
+	}
+}
+
+
+
+
+while (my $ligne1_r1 =<READ1>){
+	my $ligne2_r1 =<READ1>;
+	my $ligne3_r1 =<READ1>;
+	my $ligne4_r1 =<READ1>;
+	my $ligne1_r2 =<READ2>;
+	my $ligne2_r2 =<READ2>;
+	my $ligne3_r2 =<READ2>;
+	my $ligne4_r2 =<READ2>;
+
+	$nb_read1++;
+	$nb_read2++;
+	
+#@ 1 sec
+	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
+		if ($VERBOSE eq "ON"){
+			print "Error in file format";
+			if ($ligne1_r1){print $ligne1_r1;}
+			if ($ligne2_r1){print $ligne2_r1;}
+			if ($ligne3_r1){print $ligne3_r1;}
+			if ($ligne4_r1){print $ligne4_r1;}
+			if ($ligne1_r2){print $ligne1_r2;}
+			if ($ligne2_r2){print $ligne2_r2;}
+			if ($ligne3_r2){print $ligne3_r2;}
+			if ($ligne4_r2){print $ligne4_r2;}
+			print "\n";
+		}
+		$error1++;
+	}
+	elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
+		if ($VERBOSE eq "ON"){
+			print "Error in header : format\n";
+			print $ligne1_r1;
+			print $ligne2_r1;
+			print $ligne3_r1;
+			print $ligne4_r1;
+			print $ligne1_r2;
+			print $ligne2_r2;
+			print $ligne3_r2;
+			print $ligne4_r2;
+			print "\n";
+		}
+		$error2++;
+	}
+#@ 1 - 2 sec
+	else {
+		
+		my $length_seq1 = length($ligne2_r1);
+		my $length_qual1 =length($ligne4_r1);
+		my $seq1;
+		my $qual1;
+	
+		my $length_seq2 = length($ligne2_r2);
+		my $length_qual2 =length($ligne4_r2);
+		my $seq2;
+		my $qual2;
+		my $header1="";
+		my $header2="";
+		my $repheader1="";
+		my $repheader2="";
+
+		
+		if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){
+			$header1 = $1;
+		}
+		
+		if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){
+                        $repheader1 = $1;
+               	}
+
+		if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){
+                        $header2 = $1;
+               	}
+
+		if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){
+                        $repheader2 = $1;
+               	}
+#@ 2 sec
+		
+		### Verification de la coherence sequence /qualité   @ 1 sec
+		if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : empty\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error3++;
+		}
+		elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header refgsd : empty\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error3++;
+		}
+		elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : different\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error4++;
+		}
+		elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : different\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error4++;
+		}
+		elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
+			if ($VERBOSE eq "ON"){
+				print "Error in seq/qual length\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error5++;
+		}
+#@ 1 - 2 sec	
+		else {
+			### Parsing sequence & qualité
+			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
+				$seq1 = $1;
+				$nb_base_read1 += length($seq1);
+			}
+			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
+				$seq2 = $1;
+				$nb_base_read2 += length($seq2);
+			}
+			if ($ligne4_r1 =~ /^(.*)\s*$/i){
+				$qual1 = $1;
+			}
+			if ($ligne4_r2 =~ /^(.*)\s*$/i){
+				$qual2 = $1;
+			}
+#@ 2 sec			
+			### Verification du parsing et de la coherence sequence /qualité (n°2)
+			if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
+				if ($VERBOSE eq "ON"){
+					print "Error parsing seq / quality \n";
+					print $ligne1_r1;
+					print $ligne2_r1;
+					print $ligne3_r1;
+					print $ligne4_r1;
+					print $ligne1_r2;
+					print $ligne2_r2;
+					print $ligne3_r2;
+					print $ligne4_r2;
+					print "\n";
+				}
+				$error6++;
+			}
+			elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
+				if ($VERBOSE eq "ON"){
+					print "Error in seq/qual length after parsing\n";
+					print $ligne1_r1;
+					print $ligne2_r1;
+					print $ligne3_r1;
+					print $ligne4_r1;
+					print $ligne1_r2;
+					print $ligne2_r2;
+					print $ligne3_r2;
+					print $ligne4_r2;
+					print "\n";
+				}
+				$error7++;
+			}
+#@ <1 sec
+			else {
+				my $fastq_lines_r1="";
+				my $fastq_lines_r2="";
+				my $nb_base_current_read1_t = 0;
+				my $nb_base_current_read2_t = 0;
+				
+				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
+				$nb_base_current_read1_t = $nb_base_current_t;
+				if ($fastq_lines_r1){
+					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
+					$nb_base_current_read2_t = $nb_base_current_t;
+				}
+				if ($fastq_lines_r2){
+					print OUT1 $fastq_lines_r1;
+					print OUT2 $fastq_lines_r2;
+
+					$nb_read1_t++;
+					$nb_read2_t++;
+					$nb_base_read1_t += $nb_base_current_read1_t;
+					$nb_base_read2_t += $nb_base_current_read2_t;
+
+
+				}
+			}
+		}
+	
+
+#@ 7 sec
+	}
+}
+
+close (READ1);
+close (READ2);
+close (OUT1);
+close (OUT2);
+
+print LF "\n####\t Fastq preparation \n";
+print LF "Fastq format : $TYPE\n";
+print LF "## Before preparation\n";
+print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
+print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
+print LF "## After preparation\n";
+print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
+print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
+close (LF);
+		
+
+sub grooming_and_trimming{
+	my $header = shift;
+	my $seq = shift;
+	my $quality = shift;
+	my $quality_converted="";
+	my $quality_ori=$quality;
+	
+	my $lengthseq = length($seq);
+	my $startTrim = 0;
+	my $stopTrim = length($quality)-1;
+	my $startnoN = $startTrim;
+	my $stopnoN = $stopTrim;
+	
+	
+	my $chercheN = $seq;
+	my @bad_position_N;
+	my @bad_position_Q;
+	my $current_index = index($chercheN,"N");
+	my $abs_index = $current_index;
+	while ($current_index >=0){
+		push (@bad_position_N,$abs_index);
+
+		if ($current_index<length($seq)){
+			$chercheN = substr($chercheN,$current_index+1);
+			$current_index = index($chercheN,"N");
+			$abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
+		}
+		else {
+			last;
+		}
+	}
+		
+	my @q = split(//,$quality);
+	for (my $i=0;$i<=$#q;$i++){
+		my $chr = $q[$i];
+		my $num = ord($q[$i]);
+		if ($TYPE eq "illumina"){
+			$num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
+			$quality_converted .= chr($num);
+		}
+		
+		if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
+			push(@bad_position_Q,$i);
+		}
+	}
+	if ($quality_converted){$quality = $quality_converted;}
+	
+	my @bad_position = (@bad_position_N, @bad_position_Q);
+	
+	if ($#bad_position>=0){
+		@bad_position = sort {$a <=> $b} @bad_position;
+		my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
+		$startTrim = $coord{"start"};
+		$stopTrim = $coord{"stop"};
+#print "$startTrim .. $stopTrim\n";
+		
+	}
+	my $lengthTrim = $stopTrim - $startTrim +1;
+
+	#if ($stats_length{$lengthTrim}){
+	#	$stats_length{$lengthTrim} = 1;		
+	#}
+	#else {
+	#	$stats_length{$lengthTrim}++;
+	#}
+	my $fastq_lines="";
+	
+#	if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
+#		print "HEAD:\t$header";	
+#		print "SEQ:\n$seq\n";
+#		print "$quality_ori\n";
+#		print "$quality\n";
+#		for (my $i=0;$i<=$#bad_position;$i++){
+#			print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
+#		}
+#		print "\n";
+#		print "$startTrim .. $stopTrim / $lengthTrim \n";
+#		print $fastq_lines;
+#		print "\n";
+#	}
+
+	#for (my $i=$startTrim;$i<=$stopTrim;$i++){
+	#	if ($stats_quality{ord($q{$i])}){
+	#		$stats_quality{ord($q{$i])}=1;
+	#	}
+	#	else {
+	#		$stats_quality{ord($q{$i])}++;
+	#	}
+	#}
+
+	if ($lengthTrim >= $MIN_LENGTH){
+		$fastq_lines .= $header;
+		my $new_seq = substr($seq,$startTrim,$lengthTrim);
+		$nb_base_current_t = length($new_seq);
+		$fastq_lines .= $new_seq."\n";
+		$fastq_lines .= "+\n";
+		my $new_q = substr($quality,$startTrim,$lengthTrim);
+		$fastq_lines .= $new_q."\n";
+		return $fastq_lines;
+		
+	}
+	else {
+		#print "Insufficient length after trimming\n";
+		return "";
+	}
+}
+
+sub extract_longer_string_coordinates_from_bad_position{
+	my $start=shift;
+	my $stop =shift;
+	my $refbad = shift;
+	my @bad_position = @$refbad;
+	my %coord;
+	
+	my $current_start = $start;
+	my $current_stop = $bad_position[0]-1;
+	if ($current_stop < $start){$current_stop = $start;}
+	
+	
+	#debut -> premier N
+	my $current_length = $current_stop - $current_start +1;
+	my $test_length;
+	
+	#entre les N
+	for (my $i=1;$i<=$#bad_position;$i++){
+		$test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
+		if ( $test_length >  $current_length){
+			$current_start = $bad_position[$i-1]+1;
+			$current_stop = $bad_position[$i]-1;
+			$current_length = $current_stop - $current_start +1;
+		}
+	}
+	
+	#dernier N -> fin
+	$test_length = $stop-$bad_position[$#bad_position]+1;
+	if ( $test_length >  $current_length){
+		$current_start = $bad_position[$#bad_position]+1;
+		if ($current_start > $stop){$current_start=$stop;}
+		$current_stop = $stop;
+	}	
+	$coord{"start"}=$current_start;
+	$coord{"stop"}= $current_stop;
+	$coord{"lenght"}=$current_stop-$current_start+1;
+	
+	return \%coord;
+}