diff rapsodyn/PrepareFastqLight.pl @ 5:b0cbb9d21aa9 draft

Uploaded
author mcharles
date Mon, 22 Sep 2014 10:19:53 -0400
parents 9074a5104cdd
children 3f7b0788a1c4
line wrap: on
line diff
--- a/rapsodyn/PrepareFastqLight.pl	Wed Sep 17 04:20:08 2014 -0400
+++ b/rapsodyn/PrepareFastqLight.pl	Mon Sep 22 10:19:53 2014 -0400
@@ -268,123 +268,93 @@
 close (OUT2);
 
 
-
-
 sub grooming_and_trimming{
 	my $header = shift;
 	my $seq = shift;
 	my $quality = shift;
 	my $quality_converted="";
+	my $quality_ori=$quality;
 	
-	my $startnoN = 0;
-	my $stopnoN = length($quality)-1;
+	my $lengthseq = length($seq);
+	my $startTrim = 0;
+	my $stopTrim = length($quality)-1;
+	my $startnoN = $startTrim;
+	my $stopnoN = $stopTrim;
 	
-#print "HEAD:\t$header";	
-#print "SEQ:\t$seq\n";
 	
 	my $chercheN = $seq;
-	my @bad_position;
+	my @bad_position_N;
+	my @bad_position_Q;
 	my $current_index = index($chercheN,"N");
 	my $abs_index = $current_index;
 	while ($current_index >=0){
-		push (@bad_position,$abs_index);
+		push (@bad_position_N,$abs_index);
 
 		if ($current_index<length($seq)){
 			$chercheN = substr($chercheN,$current_index+1);
 			$current_index = index($chercheN,"N");
-			$abs_index = $current_index + $bad_position[$#bad_position]+1;
+			$abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
 		}
 		else {
 			last;
 		}
 	}
+		
+	my @q = split(//,$quality);
+	for (my $i=0;$i<=$#q;$i++){
+		my $chr = $q[$i];
+		my $num = ord($q[$i]);
+		if ($TYPE eq "illumina"){
+			$num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
+			$quality_converted .= chr($num);
+		}
+		
+		if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
+			push(@bad_position_Q,$i);
+		}
+	}
+	if ($quality_converted){$quality = $quality_converted;}
 	
+	my @bad_position = (@bad_position_N, @bad_position_Q);
 	
 	if ($#bad_position>=0){
-		my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
-		$startnoN = $coord{"start"};
-		$stopnoN = $coord{"stop"};	
-	}
-	my $lengthnoN = $stopnoN - $startnoN + 1;
-	my $seqnoN = substr($seq,$startnoN,$lengthnoN);
-#	print "SEQnoN\t:$seqnoN\n";
-#	for (my $i=0;$i<=$#bad_position;$i++){
-#		print $bad_position[$i]."\t";
-#	}
-#	print "\n";
-	
-	if ($lengthnoN >= $MIN_LENGTH){
-		my $startTrim = $startnoN;
-		my $stopTrim = $stopnoN;
-		
-		my $quality_converted="";
-		#my @bad_position;
+		@bad_position = sort {$a <=> $b} @bad_position;
+		my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
+		$startTrim = $coord{"start"};
+		$stopTrim = $coord{"stop"};
+#print "$startTrim .. $stopTrim\n";
 		
-		my @q = split(//,$quality);
-		#print "QUALITY\n";
-		#print "$quality\n";
-		for (my $i=0;$i<=$stopnoN;$i++){
-			my $chr = $q[$i];
-			my $num = ord($q[$i]);
-			if ($TYPE eq "illumina"){
-				$num = $num -64+33;
-				$quality_converted .= chr($num);
-			}
-			
-			if ($num <$MIN_QUALITY + 64 - 33 ){
-				push(@bad_position,$i+$startnoN);
-			}
-		}
-		if ($quality_converted){$quality = $quality_converted;}
-		#print "$quality\n";
-		
-		
-		
-		if ($#bad_position>=0){
-			@bad_position = sort {$a <=> $b} @bad_position;
-#			for (my $i=0;$i<=$#bad_position;$i++){
-#				print $bad_position[$i]."\t";
-#			}
-#			print "\n";
-			my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
-			$startTrim = $coord{"start"};
-			$stopTrim = $coord{"stop"};
-			#print "$startTrim .. $stopTrim\n";
-			
-		}
-		my $lengthTrim = $stopTrim - $startTrim +1;
-		
-		
-		my $fastq_lines="";
-		
-		if ($lengthTrim >= $MIN_LENGTH){
-			$fastq_lines .= $header;
-			$fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
-			$fastq_lines .= "+\n";
-			$fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
-#			print $fastq_lines;
-			return $fastq_lines;
-			
-		}
-		else {
-			return "";
-		}
-		
-		
+	}
+	my $lengthTrim = $stopTrim - $startTrim +1;
+	
+	my $fastq_lines="";
+	
+#	if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
+#		print "HEAD:\t$header";	
+#		print "SEQ:\n$seq\n";
+#		print "$quality_ori\n";
+#		print "$quality\n";
+#		for (my $i=0;$i<=$#bad_position;$i++){
+#			print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
+#		}
+#		print "\n";
+#		print "$startTrim .. $stopTrim / $lengthTrim \n";
+#		print $fastq_lines;
+#		print "\n";
+#	}
+
+	if ($lengthTrim >= $MIN_LENGTH){
+		$fastq_lines .= $header;
+		$fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
+		$fastq_lines .= "+\n";
+		$fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
+		return $fastq_lines;
 		
 	}
 	else {
+		#print "Insufficient length after trimming\n";
 		return "";
 	}
-	
-	
-	# my @s = split(//,$seq);
-	# my $sanger_quality="";
-
-	
-	
-
-	# return $sanger_quality;
 }
 
 sub extract_longer_string_coordinates_from_bad_position{