view rapsodyn/PrepareFastqLight.pl @ 4:9074a5104cdd draft

Uploaded
author mcharles
date Wed, 17 Sep 2014 04:20:08 -0400
parents 442a7c88b886
children b0cbb9d21aa9
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;

my $read1 = $ARGV[0];
my $read2 = $ARGV[1];

my $output1 = $ARGV[2];
my $output2 = $ARGV[3];

my $TYPE = $ARGV[4];
my $MIN_LENGTH = $ARGV[5];
my $MIN_QUALITY = $ARGV[6];

my $VERBOSE = $ARGV[7];

if (!$VERBOSE){
	$VERBOSE ="OFF";
}

open(READ1, $read1) or die ("Can't open $read1\n");
open(READ2, $read2) or die ("Can't open $read2\n");
open(OUT1, ">$output1") or die ("Can't open $output1\n");
open(OUT2, ">$output2") or die ("Can't open $output2\n");


my $error1=0;
my $error2=0;
my $error3=0;
my $error4=0;
my $error5=0;
my $error6=0;
my $error7=0;
my $error8=0;
my $error9=0;
my $error10=0;

while (my $ligne1_r1 =<READ1>){
	my $ligne2_r1 =<READ1>;
	my $ligne3_r1 =<READ1>;
	my $ligne4_r1 =<READ1>;
	my $ligne1_r2 =<READ2>;
	my $ligne2_r2 =<READ2>;
	my $ligne3_r2 =<READ2>;
	my $ligne4_r2 =<READ2>;
	
#@ 1 sec
	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
		if ($VERBOSE eq "ON"){
			print "Error in file format";
			if ($ligne1_r1){print $ligne1_r1;}
			if ($ligne2_r1){print $ligne2_r1;}
			if ($ligne3_r1){print $ligne3_r1;}
			if ($ligne4_r1){print $ligne4_r1;}
			if ($ligne1_r2){print $ligne1_r2;}
			if ($ligne2_r2){print $ligne2_r2;}
			if ($ligne3_r2){print $ligne3_r2;}
			if ($ligne4_r2){print $ligne4_r2;}
			print "\n";
		}
		$error1++;
	}
	elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
		if ($VERBOSE eq "ON"){
			print "Error in header : format\n";
			print $ligne1_r1;
			print $ligne2_r1;
			print $ligne3_r1;
			print $ligne4_r1;
			print $ligne1_r2;
			print $ligne2_r2;
			print $ligne3_r2;
			print $ligne4_r2;
			print "\n";
		}
		$error2++;
	}
#@ 1 - 2 sec
	else {
		
		my $length_seq1 = length($ligne2_r1);
		my $length_qual1 =length($ligne4_r1);
		my $seq1;
		my $qual1;
	
		my $length_seq2 = length($ligne2_r2);
		my $length_qual2 =length($ligne4_r2);
		my $seq2;
		my $qual2;
		my $header1="";
		my $header2="";
		my $repheader1="";
		my $repheader2="";
		
		if ($ligne1_r1 =~/^\@(.*?)\#/){
			$header1 = $1;
		}
		
		if ($ligne3_r1 =~/^\+(.*?)\#/){
                        $repheader1 = $1;
               	}

		if ($ligne1_r2 =~/^\@(.*?)\#/){
                        $header2 = $1;
               	}

		if ($ligne3_r2 =~/^\+(.*?)\#/){
                        $repheader2 = $1;
               	}
#@ 2 sec
		
		### Verification de la coherence sequence /qualité   @ 1 sec
		if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
			if ($VERBOSE eq "ON"){
				print "Error in header : empty\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error3++;
		}
		elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
			if ($VERBOSE eq "ON"){
				print "Error in header refgsd : empty\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error3++;
		}
		elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
			if ($VERBOSE eq "ON"){
				print "Error in header : different\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error4++;
		}
		elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
			if ($VERBOSE eq "ON"){
				print "Error in header : different\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error4++;
		}
		elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
			if ($VERBOSE eq "ON"){
				print "Error in seq/qual length\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error5++;
		}
#@ 1 - 2 sec	
		else {
			### Parsing sequence & qualité
			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
				$seq1 = $1;
			}
			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
				$seq2 = $1;
			}
			if ($ligne4_r1 =~ /^(.*)\s*$/i){
				$qual1 = $1;
			}
			if ($ligne4_r2 =~ /^(.*)\s*$/i){
				$qual2 = $1;
			}
#@ 2 sec			
			### Verification du parsing et de la coherence sequence /qualité (n°2)
			if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
				if ($VERBOSE eq "ON"){
					print "Error parsing seq / quality \n";
					print $ligne1_r1;
					print $ligne2_r1;
					print $ligne3_r1;
					print $ligne4_r1;
					print $ligne1_r2;
					print $ligne2_r2;
					print $ligne3_r2;
					print $ligne4_r2;
					print "\n";
				}
				$error6++;
			}
			elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
				if ($VERBOSE eq "ON"){
					print "Error in seq/qual length after parsing\n";
					print $ligne1_r1;
					print $ligne2_r1;
					print $ligne3_r1;
					print $ligne4_r1;
					print $ligne1_r2;
					print $ligne2_r2;
					print $ligne3_r2;
					print $ligne4_r2;
					print "\n";
				}
				$error7++;
			}
#@ <1 sec
			else {
				my $fastq_lines_r1="";
				my $fastq_lines_r2="";
				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
				if ($fastq_lines_r1){
					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
				}
				if ($fastq_lines_r2){
					print OUT1 $fastq_lines_r1;
					print OUT2 $fastq_lines_r2;
				}
			}
		}
	
		# print OUT1 $ligne1_r1;
		# print OUT1 $ligne2_r1;
		# print OUT1 $ligne3_r1;
		# print OUT1 $ligne4_r1;
		# print OUT2 $ligne1_r2;
		# print OUT2 $ligne2_r2;
		# print OUT2 $ligne3_r2;
		# print OUT2 $ligne4_r2;
		
#@ 7 sec
	}
}



close (READ1);
close (READ2);
close (OUT1);
close (OUT2);




sub grooming_and_trimming{
	my $header = shift;
	my $seq = shift;
	my $quality = shift;
	my $quality_converted="";
	
	my $startnoN = 0;
	my $stopnoN = length($quality)-1;
	
#print "HEAD:\t$header";	
#print "SEQ:\t$seq\n";
	
	my $chercheN = $seq;
	my @bad_position;
	my $current_index = index($chercheN,"N");
	my $abs_index = $current_index;
	while ($current_index >=0){
		push (@bad_position,$abs_index);

		if ($current_index<length($seq)){
			$chercheN = substr($chercheN,$current_index+1);
			$current_index = index($chercheN,"N");
			$abs_index = $current_index + $bad_position[$#bad_position]+1;
		}
		else {
			last;
		}
	}
	
	
	if ($#bad_position>=0){
		my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
		$startnoN = $coord{"start"};
		$stopnoN = $coord{"stop"};	
	}
	my $lengthnoN = $stopnoN - $startnoN + 1;
	my $seqnoN = substr($seq,$startnoN,$lengthnoN);
#	print "SEQnoN\t:$seqnoN\n";
#	for (my $i=0;$i<=$#bad_position;$i++){
#		print $bad_position[$i]."\t";
#	}
#	print "\n";
	
	if ($lengthnoN >= $MIN_LENGTH){
		my $startTrim = $startnoN;
		my $stopTrim = $stopnoN;
		
		my $quality_converted="";
		#my @bad_position;
		
		my @q = split(//,$quality);
		#print "QUALITY\n";
		#print "$quality\n";
		for (my $i=0;$i<=$stopnoN;$i++){
			my $chr = $q[$i];
			my $num = ord($q[$i]);
			if ($TYPE eq "illumina"){
				$num = $num -64+33;
				$quality_converted .= chr($num);
			}
			
			if ($num <$MIN_QUALITY + 64 - 33 ){
				push(@bad_position,$i+$startnoN);
			}
		}
		if ($quality_converted){$quality = $quality_converted;}
		#print "$quality\n";
		
		
		
		if ($#bad_position>=0){
			@bad_position = sort {$a <=> $b} @bad_position;
#			for (my $i=0;$i<=$#bad_position;$i++){
#				print $bad_position[$i]."\t";
#			}
#			print "\n";
			my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
			$startTrim = $coord{"start"};
			$stopTrim = $coord{"stop"};
			#print "$startTrim .. $stopTrim\n";
			
		}
		my $lengthTrim = $stopTrim - $startTrim +1;
		
		
		my $fastq_lines="";
		
		if ($lengthTrim >= $MIN_LENGTH){
			$fastq_lines .= $header;
			$fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
			$fastq_lines .= "+\n";
			$fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
#			print $fastq_lines;
			return $fastq_lines;
			
		}
		else {
			return "";
		}
		
		
		
	}
	else {
		return "";
	}
	
	
	# my @s = split(//,$seq);
	# my $sanger_quality="";

	
	

	# return $sanger_quality;
}

sub extract_longer_string_coordinates_from_bad_position{
	my $start=shift;
	my $stop =shift;
	my $refbad = shift;
	my @bad_position = @$refbad;
	my %coord;
	
	my $current_start = $start;
	my $current_stop = $bad_position[0]-1;
	if ($current_stop < $start){$current_stop = $start;}
	
	
	#debut -> premier N
	my $current_length = $current_stop - $current_start +1;
	my $test_length;
	
	#entre les N
	for (my $i=1;$i<=$#bad_position;$i++){
		$test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
		if ( $test_length >  $current_length){
			$current_start = $bad_position[$i-1]+1;
			$current_stop = $bad_position[$i]-1;
			$current_length = $current_stop - $current_start +1;
		}
	}
	
	#dernier N -> fin
	$test_length = $stop-$bad_position[$#bad_position]+1;
	if ( $test_length >  $current_length){
		$current_start = $bad_position[$#bad_position]+1;
		if ($current_start > $stop){$current_start=$stop;}
		$current_stop = $stop;
	}	
	$coord{"start"}=$current_start;
	$coord{"stop"}= $current_stop;
	$coord{"lenght"}=$current_stop-$current_start+1;
	
	return \%coord;
}