diff rapsodyn/mpileupfilterandstat.pl @ 24:e8e6b962c1f2 draft

Uploaded
author mcharles
date Fri, 05 Sep 2014 06:12:10 -0400
parents
children 39376c7204be
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/mpileupfilterandstat.pl	Fri Sep 05 06:12:10 2014 -0400
@@ -0,0 +1,469 @@
+#!/usr/bin/perl
+use strict;
+use Getopt::Long;
+
+#
+# Filter a pileup file on forward/reverse presence and %read having the variant
+# The error code 
+#  1 : multiple variant type detected insertion/deletion/mutation
+#  1i : inconsistency in insertion
+#  1d : inconsistency in deletion
+#  1m : inconsistency in mutation
+#  2 : insufficient depth	
+#  3 : insufficient variant frequency 
+#  4 : variant position not covered by forward and reverse reads
+#  5 : variant with other variant in neighbourhood
+#  6 : too much depth
+#  8 : parsing error (couldn't parse the mpileup line correctly)
+#  9 : parsing error (couldn't parse the readbase string correctly)
+
+
+my $inputfile;
+my $logfile;
+my $MIN_DISTANCE=0;
+my $MIN_VARIANTFREQUENCY=0;
+my $MIN_FORWARDREVERSE=0;
+my $MIN_DEPTH=0;
+my $MAX_DEPTH=500;
+my $VERBOSE=0;
+my $ONLY_UNFILTERED_VARIANT="OFF";
+my $DO_STAT="NO";
+
+if ($#ARGV<0){
+	print "\n";
+	print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n";
+	print "-input_file \tinputfile in mpileup format\n";
+	print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n";
+	print "-min_depth \tminimum depth required [1]\n";
+	print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n";
+	print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n";
+	print "-min_distance \tminimum distance between variant [0]\n";
+	print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n";
+	print "\n";
+	exit(0);
+}
+
+GetOptions (
+"input_file=s" => \$inputfile,
+"log_file=s" => \$logfile,
+"min_depth=i" => \$MIN_DEPTH,
+"max_depth=i" => \$MAX_DEPTH,
+"min_frequency=f" => \$MIN_VARIANTFREQUENCY,
+"min_distance=i" => \$MIN_DISTANCE,
+"min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE,
+"variant_only=s" => \$ONLY_UNFILTERED_VARIANT,
+"v=i" => \$VERBOSE,
+"do_stat=s" => \$DO_STAT
+) or die("Error in command line arguments\n");
+
+
+open(IF, $inputfile)  or die("Can't open $inputfile\n");
+
+my @tbl_line;
+my %USR_PARAM;
+$USR_PARAM{"min_depth"} = $MIN_DEPTH;
+$USR_PARAM{"max_depth"} = $MAX_DEPTH;
+$USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY;
+$USR_PARAM{"min_dist"} = $MIN_DISTANCE;
+$USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE;
+
+
+
+#Extraction des variants
+my $nb_line=0;
+while (my $line=<IF>){
+	$nb_line++;
+	if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){
+		print "$nb_line\n";
+	}
+	my $error_code=0;
+	if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){
+		my $current_chromosome = $1;
+		my $current_position = $2;
+		my $current_refbase = $3;
+		my $current_coverage = $4;
+		my $current_readbase_string = $5;
+		my $current_quality_string = $6;
+		
+		#Suppression of mPileUp special character
+		$current_readbase_string =~ s/\$//g; #the read start at this position
+		$current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char
+		
+		if ($current_readbase_string =~ /[ATGCNatgcn\d]/){
+			my %variant;
+			$variant{"line"} = $line;
+			$variant{"chr"} = $current_chromosome;
+			$variant{"pos"} = $current_position;
+			$variant{"refbase"} = $current_refbase;
+			$variant{"coverage"} = $current_coverage;
+			$variant{"readbase"} = $current_readbase_string;
+			$variant{"quality"} = $current_quality_string;
+			push(@tbl_line,\%variant);
+
+			if ($ONLY_UNFILTERED_VARIANT eq "ON"){
+				print $line;
+			}
+			
+		}
+		else {
+			#Position with no variant
+		}
+		
+	}
+	else {
+		#Error Parsing
+		print STDERR "$line #8";
+	}
+}
+close(IF);
+
+if ($ONLY_UNFILTERED_VARIANT eq "ON"){
+	exit(0);
+}
+
+####Checking the distance between variant and other filter
+
+
+my @error;
+for (my $i=0;$i<=$#tbl_line;$i++){
+	# print "ligne : $tbl_line[$i]\n";
+	my $before="";
+	my $after="";
+	my %line = %{$tbl_line[$i]};
+
+	if ($tbl_line[$i-1]){
+		$before = $tbl_line[$i-1];
+	}
+	if ($tbl_line[$i+1]){
+		$after = $tbl_line[$i+1];
+	}		
+	my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM);
+	if ($error_code == 0){
+		print $line{"line"};
+	}
+	else {
+		push(@error,$error_code,"\t",$line{"line"});
+	}
+}
+
+### LOG
+open(LF,">$logfile") or die ("Can't open $logfile\n");
+
+if ($DO_STAT eq "YES"){
+	for (my $idx_min_depth=2;$idx_min_depth<=32;$idx_min_depth = $idx_min_depth*2){
+		for (my $idx_freq = 0.5;$idx_freq<=1;$idx_freq= $idx_freq+0.1){ 
+			for (my $idx_dist=0;$idx_dist<=50;$idx_dist = $idx_dist + 50){
+				for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){
+					my %stat_param;
+					$stat_param{"min_depth"}=$idx_min_depth;
+					$stat_param{"max_depth"}=250;
+					$stat_param{"min_freq"}=$idx_freq;
+					$stat_param{"min_fr"}=$idx_fr;
+					$stat_param{"min_dist"}=$idx_dist;
+
+					print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n";
+				}
+			}	
+		}
+		print "\n";
+	}
+}
+
+
+for (my $i=0;$i<=$#error;$i++){
+	print LF $error[$i];
+}
+close (LF);
+
+
+
+
+sub test_check{
+	my $ref_tbl_line = shift;
+	my $ref_param = shift;
+	my @tbl_line = @$ref_tbl_line;
+	my %param = %$ref_param;
+	my $nb=0;
+
+	for (my $i=0;$i<=$#tbl_line;$i++){
+		my $before="";
+		my $after="";
+		my %line = %{$tbl_line[$i]};
+
+		if ($tbl_line[$i-1]){
+			$before = $tbl_line[$i-1];
+		}
+		if ($tbl_line[$i+1]){
+			$after = $tbl_line[$i+1];
+		}		
+		my $error_code = check_error($tbl_line[$i],$before,$after,\%param);
+		if ($error_code == 0){
+			$nb++;
+		}
+	}
+
+	return $nb;
+}
+
+sub check_error{
+	my $refline = shift;
+	my %line = %$refline;
+	my $refbefore = shift;
+	my $refafter = shift;
+	my $refparam = shift;
+	my %param = %$refparam;
+	
+	
+	my $current_chromosome = $line{"chr"};
+	my $current_position = $line{"pos"};
+	my $current_refbase = $line{"refbase"};
+	my $current_coverage = $line{"coverage"};
+	my $current_readbase_string = $line{"readbase"};
+	
+
+	my $min_depth = $param{"min_depth"};
+	my $max_depth = $param{"max_depth"};
+	my $min_variant_frequency = $param{"min_freq"};
+	my $min_forward_reverse = $param{"min_fr"};
+	my $min_dist = $param{"min_dist"};
+	
+	#Verification of neightbourhood
+	if ($refbefore){
+		my %compareline = %$refbefore;
+		my $compare_chromosome = $compareline{"chr"};
+		my $compare_position = $compareline{"pos"};
+		my $compare_refbase = $compareline{"refbase"};
+		my $compare_coverage = $compareline{"coverage"};
+		my $compare_readbase_string = $compareline{"readbase"};
+		
+		if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){
+			return 5;
+		}
+	}
+	
+	if ($refafter){
+		my %compareline = %$refafter;
+		my $compare_chromosome = $compareline{"chr"};
+		my $compare_position = $compareline{"pos"};
+		my $compare_refbase = $compareline{"refbase"};
+		my $compare_coverage = $compareline{"coverage"};
+		my $compare_readbase_string = $compareline{"readbase"};
+		
+		if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){
+			return 5;
+		}
+	}
+	
+	
+
+	
+	#Extraction of insertions
+	
+	##################################################################
+	# my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g;
+	# my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g;
+	# print "IN : @IN\n";
+	# print "DEL :@DEL\n";
+	#$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; 
+	##################################################################
+	#!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct
+	##################################################################
+	
+	# => parcours de boucle
+	my @readbase = split(//,$current_readbase_string);
+	my $cleaned_readbase_string="";
+	my @IN;
+	my @DEL;
+	my $current_IN="";
+	my $current_DEL="";
+	my $current_size=0;
+	
+	for (my $i=0;$i<=$#readbase;$i++){
+		if ($readbase[$i] eq "+"){
+			#Ouverture de IN
+			$current_IN="+";
+			
+			#Recuperation de la taille
+			my $sub = substr $current_readbase_string,$i;
+			if ($sub=~/^\+(\d+)/){
+				$current_size = $1;
+			}
+			my $remaining_size = $current_size;
+			while (($remaining_size>0)&&($i<=$#readbase)){
+				$i++;
+				$current_IN.=$readbase[$i];
+				if ($readbase[$i]=~ /[ATGCNatgcn]/){
+					$remaining_size--;
+				}
+			}
+			push(@IN,$current_IN);
+		}
+		elsif ($readbase[$i] eq "-"){
+			#Ouverture de DEL
+			$current_DEL="-";
+			
+			#Recuperation de la taille
+			my $sub = substr $current_readbase_string,$i;
+			if ($sub=~/^\-(\d+)/){
+				$current_size = $1;
+			}
+			my $remaining_size = $current_size;
+			while (($remaining_size>0)&&($i<=$#readbase)){
+				$i++;
+				$current_DEL.=$readbase[$i];
+				if ($readbase[$i]=~ /[ATGCNatgcn]/){
+					$remaining_size--;
+				}
+			}
+			push(@DEL,$current_DEL);
+			
+		}
+		else {
+			#Ajout a la string
+			$cleaned_readbase_string .= $readbase[$i];
+		}
+	}
+
+	
+	# print "IN : @IN\n";
+	# print "DEL :@DEL\n";
+	# print "$cleaned_readbase_string\n";	
+	
+	my @current_readbase_array = split(//,$cleaned_readbase_string);
+
+	#Filtering : error detection
+
+	if ($#current_readbase_array+1 != $current_coverage){
+		return 9;
+		#parsing error (couldn't parse the readbase string correctly)
+	}
+	elsif ($current_coverage<$min_depth){
+		return 2;
+		#  2 : insufficient depth
+	}
+	elsif ($current_coverage>$max_depth){
+		return 6;
+		#  6 : too much depth
+	}
+	else {
+		if ($#IN>=0){
+			if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
+				return 1;
+				#  1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
+			}
+			else {
+				########## TEST de coherence des insertions ################
+				# for (my $i=0;$i<=$#IN;$i++){
+					# if (uc($IN[0]) ne uc($IN[$i])){
+						# print uc($IN[0]),"\n";
+						# print uc($IN[$i]),"\n";
+						# return "1i";
+					# }
+				# }		
+				###########################################################
+
+				if($#IN+1 < $current_coverage*$min_variant_frequency ){
+					return 3;
+					#  3 : insufficient variant frequency 
+				}
+			}
+		}
+		elsif ($#DEL>=0){
+			if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
+				return 1;
+				#  1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
+			}
+			else {
+				########## TEST de coherence des deletions ################
+				# for (my $i=0;$i<=$#DEL;$i++){
+					# if (uc($DEL[0]) ne uc($DEL[$i])){
+						# print uc($DEL[0]),"\n";
+						# print uc($DEL[$i]),"\n";
+						# return "1d";
+					# }
+				# }
+				###########################################################
+
+				if($#DEL+1 < $current_coverage*$min_variant_frequency){
+					return 3;
+					#  3 : insufficient variant frequency 
+				}
+			}
+		}
+		else {
+			my $nbA=0;
+			$nbA++ while ($current_readbase_string =~ m/A/g);
+			my $nbC=0;
+			$nbC++ while ($current_readbase_string =~ m/C/g);
+			my $nbT=0;
+			$nbT++ while ($current_readbase_string =~ m/T/g);
+			my $nbG=0;
+			$nbG++ while ($current_readbase_string =~ m/G/g);
+			my $nbN=0;
+			$nbN++ while ($current_readbase_string =~ m/N/g);
+			my $nba=0;
+			$nba++ while ($current_readbase_string =~ m/a/g);
+			my $nbc=0;
+			$nbc++ while ($current_readbase_string =~ m/c/g);
+			my $nbt=0;
+			$nbt++ while ($current_readbase_string =~ m/t/g);
+			my $nbg=0;
+			$nbg++ while ($current_readbase_string =~ m/g/g);
+			my $nbn=0;
+			$nbn++ while ($current_readbase_string =~ m/n/g);
+
+			if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
+				return "1m";
+			}
+			if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
+				return "1m";
+			}
+			if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){
+				return "1m";
+			}
+			if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){
+				return "1m";
+			}
+			if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){
+				return "1m";
+			}
+
+			if ($nbA+$nba >= $current_coverage*$min_variant_frequency){
+				if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){
+					return 4;
+					#  4 : variant position not covered by forward and reverse reads
+				}
+			}
+			elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){
+				if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){
+					return 4;
+					#  4 : variant position not covered by forward and reverse reads
+				}
+			}	 
+			elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){
+				if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){
+					return 4;
+					#  4 : variant position not covered by forward and reverse reads
+				}
+			}
+			elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){
+				if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){
+					return 4;
+					#  4 : variant position not covered by forward and reverse reads
+				}
+			}
+			elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){
+				if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){
+					return 4;
+					#  4 : variant position not covered by forward and reverse reads
+				}
+			}
+			else {
+				return 3;
+				#  3 : insufficient variant frequency 
+			}	
+		}
+	}
+	
+	return 0;
+}