changeset 32:a86903ab5266 draft

Uploaded
author mcharles
date Thu, 09 Oct 2014 08:28:22 -0400
parents 7631b341e0cf
children 193b8462b80c
files rapsodyn/PrepareFastqLight.pl rapsodyn/PrepareFastqLight.pl~ rapsodyn/PrepareFastqLight.xml rapsodyn/PrepareFastqLight.xml~
diffstat 4 files changed, 546 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/rapsodyn/PrepareFastqLight.pl	Thu Oct 09 06:45:58 2014 -0400
+++ b/rapsodyn/PrepareFastqLight.pl	Thu Oct 09 08:28:22 2014 -0400
@@ -1,4 +1,5 @@
 #!/usr/bin/perl
+#v1.0.3 support rapsodyn header (.... 1:...  / .... 2:...)
 #V1.0.2 added auto type detection
 #V1.0.1 added log, option parameters
 use strict;
@@ -174,19 +175,19 @@
 		my $repheader2="";
 
 		
-		if ($ligne1_r1 =~/^\@(.*?)\#/){
+		if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){
 			$header1 = $1;
 		}
 		
-		if ($ligne3_r1 =~/^\+(.*?)\#/){
+		if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){
                         $repheader1 = $1;
                	}
 
-		if ($ligne1_r2 =~/^\@(.*?)\#/){
+		if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){
                         $header2 = $1;
                	}
 
-		if ($ligne3_r2 =~/^\+(.*?)\#/){
+		if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){
                         $repheader2 = $1;
                	}
 #@ 2 sec
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/PrepareFastqLight.pl~	Thu Oct 09 08:28:22 2014 -0400
@@ -0,0 +1,512 @@
+#!/usr/bin/perl
+#V1.0.2 added auto type detection
+#V1.0.1 added log, option parameters
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $read1_file;
+my $read2_file;
+my $log_file;
+my $output1_file;
+my $output2_file;
+
+my $TYPE="sanger";
+my $MIN_LENGTH=30;
+my $MIN_QUALITY=30;
+
+my $VERBOSE = "OFF";
+
+GetOptions (
+"read1_file=s" => \$read1_file,
+"read2_file=s" => \$read2_file,
+"log_file=s" => \$log_file,
+"output1_file=s" => \$output1_file,
+"output2_file=s" => \$output2_file,
+"type=s" => \$TYPE,
+"min_length=i" => \$MIN_LENGTH,
+"min_quality=i" => \$MIN_QUALITY,
+"verbose=s" => \$VERBOSE
+) or die("Error in command line arguments\n");
+
+
+my $nb_read1=0;
+my $nb_base_read1=0;
+my $nb_read2=0;
+my $nb_base_read2=0;
+
+my $nb_read1_t=0;
+my $nb_base_read1_t=0;
+my $nb_read2_t=0;
+my $nb_base_read2_t=0;
+
+my $nb_base_current_t=0;
+
+open(READ1, $read1_file) or die ("Can't open $read1_file\n");
+open(READ2, $read2_file) or die ("Can't open $read2_file\n");
+open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
+open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
+open (LF,">$log_file") or die("Can't open $log_file\n");
+
+
+my $error1=0;
+my $error2=0;
+my $error3=0;
+my $error4=0;
+my $error5=0;
+my $error6=0;
+my $error7=0;
+my $error8=0;
+my $error9=0;
+my $error10=0;
+
+my $auto_type="";
+my %qual;
+if ($TYPE eq "auto"){
+	my $compt=0;
+	open(DETECT, $read1_file) or die ("Can't open $read1_file\n"); 
+	while (my $ligne1_r1 =<DETECT>){
+		my $ligne2_r1 =<DETECT>;
+		my $ligne3_r1 =<DETECT>;
+		my $ligne4_r1 =<DETECT>;
+		$compt++;
+		if ($ligne4_r1 =~ /^(.*)\s*$/i){
+			my $qual = $1;
+			my @q = split(//,$qual);
+			for (my $i=0;$i<=$#q;$i++){
+				my $num = ord($q[$i]);
+				if ($qual{$num}){
+					$qual{$num}++;
+				}
+				else {
+					$qual{$num} = 1;
+				}
+				#range sanger / illumina 1.8+ 	: 33->94
+				#range illumina 1.3->1.7 	: 64->105
+				if ($num > 94){$auto_type = "illumina";last;}
+				if ($num < 64){$auto_type = "sanger";last;}
+			}
+		}
+		else {
+			print STDERR "Error in format detection : quality not recognized\n$ligne4_r1";
+			exit(0);
+		}
+	
+		if ($auto_type ne ""){
+			last;
+		}
+
+	}
+	close (DETECT);
+	if ($auto_type eq ""){
+		print STDERR "Error in format detection : type not recognized parsing read1\n";
+		foreach my $key (sort {$a <=> $b} keys %qual){
+			print "$key\t:\t",$qual{$key},"\n";
+		}
+		exit(0);
+	}
+	else {
+		$TYPE = $auto_type;
+	}
+}
+
+
+
+
+while (my $ligne1_r1 =<READ1>){
+	my $ligne2_r1 =<READ1>;
+	my $ligne3_r1 =<READ1>;
+	my $ligne4_r1 =<READ1>;
+	my $ligne1_r2 =<READ2>;
+	my $ligne2_r2 =<READ2>;
+	my $ligne3_r2 =<READ2>;
+	my $ligne4_r2 =<READ2>;
+
+	$nb_read1++;
+	$nb_read2++;
+	
+#@ 1 sec
+	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
+		if ($VERBOSE eq "ON"){
+			print "Error in file format";
+			if ($ligne1_r1){print $ligne1_r1;}
+			if ($ligne2_r1){print $ligne2_r1;}
+			if ($ligne3_r1){print $ligne3_r1;}
+			if ($ligne4_r1){print $ligne4_r1;}
+			if ($ligne1_r2){print $ligne1_r2;}
+			if ($ligne2_r2){print $ligne2_r2;}
+			if ($ligne3_r2){print $ligne3_r2;}
+			if ($ligne4_r2){print $ligne4_r2;}
+			print "\n";
+		}
+		$error1++;
+	}
+	elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
+		if ($VERBOSE eq "ON"){
+			print "Error in header : format\n";
+			print $ligne1_r1;
+			print $ligne2_r1;
+			print $ligne3_r1;
+			print $ligne4_r1;
+			print $ligne1_r2;
+			print $ligne2_r2;
+			print $ligne3_r2;
+			print $ligne4_r2;
+			print "\n";
+		}
+		$error2++;
+	}
+#@ 1 - 2 sec
+	else {
+		
+		my $length_seq1 = length($ligne2_r1);
+		my $length_qual1 =length($ligne4_r1);
+		my $seq1;
+		my $qual1;
+	
+		my $length_seq2 = length($ligne2_r2);
+		my $length_qual2 =length($ligne4_r2);
+		my $seq2;
+		my $qual2;
+		my $header1="";
+		my $header2="";
+		my $repheader1="";
+		my $repheader2="";
+
+		
+		if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){
+			$header1 = $1;
+		}
+		
+		if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){
+                        $repheader1 = $1;
+               	}
+
+		if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){
+                        $header2 = $1;
+               	}
+
+		if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){
+                        $repheader2 = $1;
+               	}
+#@ 2 sec
+		
+		### Verification de la coherence sequence /qualité   @ 1 sec
+		if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : empty\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error3++;
+		}
+		elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header refgsd : empty\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error3++;
+		}
+		elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : different\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error4++;
+		}
+		elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : different\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error4++;
+		}
+		elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
+			if ($VERBOSE eq "ON"){
+				print "Error in seq/qual length\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error5++;
+		}
+#@ 1 - 2 sec	
+		else {
+			### Parsing sequence & qualité
+			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
+				$seq1 = $1;
+				$nb_base_read1 += length($seq1);
+			}
+			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
+				$seq2 = $1;
+				$nb_base_read2 += length($seq2);
+			}
+			if ($ligne4_r1 =~ /^(.*)\s*$/i){
+				$qual1 = $1;
+			}
+			if ($ligne4_r2 =~ /^(.*)\s*$/i){
+				$qual2 = $1;
+			}
+#@ 2 sec			
+			### Verification du parsing et de la coherence sequence /qualité (n°2)
+			if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
+				if ($VERBOSE eq "ON"){
+					print "Error parsing seq / quality \n";
+					print $ligne1_r1;
+					print $ligne2_r1;
+					print $ligne3_r1;
+					print $ligne4_r1;
+					print $ligne1_r2;
+					print $ligne2_r2;
+					print $ligne3_r2;
+					print $ligne4_r2;
+					print "\n";
+				}
+				$error6++;
+			}
+			elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
+				if ($VERBOSE eq "ON"){
+					print "Error in seq/qual length after parsing\n";
+					print $ligne1_r1;
+					print $ligne2_r1;
+					print $ligne3_r1;
+					print $ligne4_r1;
+					print $ligne1_r2;
+					print $ligne2_r2;
+					print $ligne3_r2;
+					print $ligne4_r2;
+					print "\n";
+				}
+				$error7++;
+			}
+#@ <1 sec
+			else {
+				my $fastq_lines_r1="";
+				my $fastq_lines_r2="";
+				my $nb_base_current_read1_t = 0;
+				my $nb_base_current_read2_t = 0;
+				
+				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
+				$nb_base_current_read1_t = $nb_base_current_t;
+				if ($fastq_lines_r1){
+					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
+					$nb_base_current_read2_t = $nb_base_current_t;
+				}
+				if ($fastq_lines_r2){
+					print OUT1 $fastq_lines_r1;
+					print OUT2 $fastq_lines_r2;
+
+					$nb_read1_t++;
+					$nb_read2_t++;
+					$nb_base_read1_t += $nb_base_current_read1_t;
+					$nb_base_read2_t += $nb_base_current_read2_t;
+
+
+				}
+			}
+		}
+	
+
+#@ 7 sec
+	}
+}
+
+close (READ1);
+close (READ2);
+close (OUT1);
+close (OUT2);
+
+print LF "\n####\t Fastq preparation \n";
+print LF "Fastq format : $TYPE\n";
+print LF "## Before preparation\n";
+print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
+print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
+print LF "## After preparation\n";
+print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
+print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
+close (LF);
+		
+
+sub grooming_and_trimming{
+	my $header = shift;
+	my $seq = shift;
+	my $quality = shift;
+	my $quality_converted="";
+	my $quality_ori=$quality;
+	
+	my $lengthseq = length($seq);
+	my $startTrim = 0;
+	my $stopTrim = length($quality)-1;
+	my $startnoN = $startTrim;
+	my $stopnoN = $stopTrim;
+	
+	
+	my $chercheN = $seq;
+	my @bad_position_N;
+	my @bad_position_Q;
+	my $current_index = index($chercheN,"N");
+	my $abs_index = $current_index;
+	while ($current_index >=0){
+		push (@bad_position_N,$abs_index);
+
+		if ($current_index<length($seq)){
+			$chercheN = substr($chercheN,$current_index+1);
+			$current_index = index($chercheN,"N");
+			$abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
+		}
+		else {
+			last;
+		}
+	}
+		
+	my @q = split(//,$quality);
+	for (my $i=0;$i<=$#q;$i++){
+		my $chr = $q[$i];
+		my $num = ord($q[$i]);
+		if ($TYPE eq "illumina"){
+			$num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
+			$quality_converted .= chr($num);
+		}
+		
+		if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
+			push(@bad_position_Q,$i);
+		}
+	}
+	if ($quality_converted){$quality = $quality_converted;}
+	
+	my @bad_position = (@bad_position_N, @bad_position_Q);
+	
+	if ($#bad_position>=0){
+		@bad_position = sort {$a <=> $b} @bad_position;
+		my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
+		$startTrim = $coord{"start"};
+		$stopTrim = $coord{"stop"};
+#print "$startTrim .. $stopTrim\n";
+		
+	}
+	my $lengthTrim = $stopTrim - $startTrim +1;
+
+	#if ($stats_length{$lengthTrim}){
+	#	$stats_length{$lengthTrim} = 1;		
+	#}
+	#else {
+	#	$stats_length{$lengthTrim}++;
+	#}
+	my $fastq_lines="";
+	
+#	if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
+#		print "HEAD:\t$header";	
+#		print "SEQ:\n$seq\n";
+#		print "$quality_ori\n";
+#		print "$quality\n";
+#		for (my $i=0;$i<=$#bad_position;$i++){
+#			print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
+#		}
+#		print "\n";
+#		print "$startTrim .. $stopTrim / $lengthTrim \n";
+#		print $fastq_lines;
+#		print "\n";
+#	}
+
+	#for (my $i=$startTrim;$i<=$stopTrim;$i++){
+	#	if ($stats_quality{ord($q{$i])}){
+	#		$stats_quality{ord($q{$i])}=1;
+	#	}
+	#	else {
+	#		$stats_quality{ord($q{$i])}++;
+	#	}
+	#}
+
+	if ($lengthTrim >= $MIN_LENGTH){
+		$fastq_lines .= $header;
+		my $new_seq = substr($seq,$startTrim,$lengthTrim);
+		$nb_base_current_t = length($new_seq);
+		$fastq_lines .= $new_seq."\n";
+		$fastq_lines .= "+\n";
+		my $new_q = substr($quality,$startTrim,$lengthTrim);
+		$fastq_lines .= $new_q."\n";
+		return $fastq_lines;
+		
+	}
+	else {
+		#print "Insufficient length after trimming\n";
+		return "";
+	}
+}
+
+sub extract_longer_string_coordinates_from_bad_position{
+	my $start=shift;
+	my $stop =shift;
+	my $refbad = shift;
+	my @bad_position = @$refbad;
+	my %coord;
+	
+	my $current_start = $start;
+	my $current_stop = $bad_position[0]-1;
+	if ($current_stop < $start){$current_stop = $start;}
+	
+	
+	#debut -> premier N
+	my $current_length = $current_stop - $current_start +1;
+	my $test_length;
+	
+	#entre les N
+	for (my $i=1;$i<=$#bad_position;$i++){
+		$test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
+		if ( $test_length >  $current_length){
+			$current_start = $bad_position[$i-1]+1;
+			$current_stop = $bad_position[$i]-1;
+			$current_length = $current_stop - $current_start +1;
+		}
+	}
+	
+	#dernier N -> fin
+	$test_length = $stop-$bad_position[$#bad_position]+1;
+	if ( $test_length >  $current_length){
+		$current_start = $bad_position[$#bad_position]+1;
+		if ($current_start > $stop){$current_start=$stop;}
+		$current_stop = $stop;
+	}	
+	$coord{"start"}=$current_start;
+	$coord{"stop"}= $current_stop;
+	$coord{"lenght"}=$current_stop-$current_start+1;
+	
+	return \%coord;
+}
--- a/rapsodyn/PrepareFastqLight.xml	Thu Oct 09 06:45:58 2014 -0400
+++ b/rapsodyn/PrepareFastqLight.xml	Thu Oct 09 08:28:22 2014 -0400
@@ -1,4 +1,4 @@
-<tool id="PrepareFastqLight" name="PrepareFastqLight" version="1.02">
+<tool id="PrepareFastqLight" name="PrepareFastqLight" version="1.03">
 <description>Fastq preparation</description>
 <command interpreter="perl">
     PrepareFastqLight.pl -read1_file $input_read1_file -read2_file $input_read2_file -output1 $output_read1_file -output2 $output_read2_file -log_file $log_file -type $quality_type -min_quality $min_quality -min_length $min_length
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/PrepareFastqLight.xml~	Thu Oct 09 08:28:22 2014 -0400
@@ -0,0 +1,28 @@
+<tool id="PrepareFastqLight" name="PrepareFastqLight" version="1.02">
+<description>Fastq preparation</description>
+<command interpreter="perl">
+    PrepareFastqLight.pl -read1_file $input_read1_file -read2_file $input_read2_file -output1 $output_read1_file -output2 $output_read2_file -log_file $log_file -type $quality_type -min_quality $min_quality -min_length $min_length
+</command>
+<inputs>
+	<param name="input_read1_file"  type="data" format="txt,fastq" label="Select a suitable FASTQ READ 1 file from your history"/>
+	<param name="input_read2_file"  type="data" format="txt,fastq" label="Select a suitable FASTQ READ 2 file from your history"/>
+	<param name="quality_type" type="select" label="Select input quality format">
+		<option value="auto" selected="true">Auto-detect</option>
+		<option value="sanger">Sanger</option>
+		<option value="illumina">Illumina 1.3-1.7</option>
+	</param>
+	<param name="min_quality" type="integer" value="30" label="Minimum quality for 5' and 3' trimming "/>
+	<param name="min_length" type="integer" value="30" label="Minimum sequence length after trimming"/>
+</inputs>
+<outputs>
+	<data name="output_read1_file" format="fastqsanger" label="${tool.name} on ${on_string}"/>
+	<data name="output_read2_file" format="fastqsanger" label="${tool.name} on ${on_string}"/>
+	<data name="log_file" format="txt" label="${tool.name} LOG on ${on_string}"/>
+</outputs>
+
+<help>
+
+
+
+</help>
+</tool>