Mercurial > repos > mcharles > rapsosnp
diff rapsodyn/mpileupfilterandstat.pl @ 0:442a7c88b886 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 10 Sep 2014 09:18:15 -0400 |
parents | |
children | 3f7b0788a1c4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/mpileupfilterandstat.pl Wed Sep 10 09:18:15 2014 -0400 @@ -0,0 +1,482 @@ +#!/usr/bin/perl +use strict; +use Getopt::Long; + +# +# Filter a pileup file on forward/reverse presence and %read having the variant +# The error code +# 1 : multiple variant type detected insertion/deletion/mutation +# 1i : inconsistency in insertion +# 1d : inconsistency in deletion +# 1m : inconsistency in mutation +# 2 : insufficient depth +# 3 : insufficient variant frequency +# 4 : variant position not covered by forward and reverse reads +# 5 : variant with other variant in neighbourhood +# 6 : too much depth +# 8 : parsing error (couldn't parse the mpileup line correctly) +# 9 : parsing error (couldn't parse the readbase string correctly) + + +my $inputfile; +my $logfile; +my $MIN_DISTANCE=0; +my $MIN_VARIANTFREQUENCY=0; +my $MIN_FORWARDREVERSE=0; +my $MIN_DEPTH=0; +my $MAX_DEPTH=500; +my $VERBOSE=0; +my $ONLY_UNFILTERED_VARIANT="OFF"; +my $DO_STAT="NO"; + + +my $STAT_MIN_DEPTH_MIN = 2; +my $STAT_MIN_DEPTH_MAX = 10; +my $STAT_MIN_DEPTH_STEP = 2; +my $STAT_MAX_DEPTH_MIN = 100; +my $STAT_MAX_DEPTH_MAX = 200; +my $STAT_MAX_DEPTH_STEP = 100; +my $STAT_FREQ_MIN = 0.8; +my $STAT_FREQ_MAX = 1; +my $STAT_FREQ_STEP = 0.1; +my $STAT_DIST_MIN = 0; +my $STAT_DIST_MAX = 50; +my $STAT_DIST_STEP = 50; + +GetOptions ( +"input_file=s" => \$inputfile, +"log_file=s" => \$logfile, +"min_depth=i" => \$MIN_DEPTH, +"max_depth=i" => \$MAX_DEPTH, +"min_frequency=f" => \$MIN_VARIANTFREQUENCY, +"min_distance=i" => \$MIN_DISTANCE, +"min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, +"variant_only=s" => \$ONLY_UNFILTERED_VARIANT, +"v=i" => \$VERBOSE, +"do_stat=s" => \$DO_STAT, +"stat_min_depth_min=i" => \$STAT_MIN_DEPTH_MIN, +"stat_min_depth_max=i" => \$STAT_MIN_DEPTH_MAX, +"stat_min_depth_step=i" => \$STAT_MIN_DEPTH_STEP, +"stat_max_depth_min=i" => \$STAT_MAX_DEPTH_MIN, +"stat_max_depth_max=i" => \$STAT_MAX_DEPTH_MAX, +"stat_max_depth_step=i" => \$STAT_MAX_DEPTH_STEP, +"stat_freq_min=f" => \$STAT_FREQ_MIN, +"stat_freq_max=f" => \$STAT_FREQ_MAX, +"stat_freq_step=f" => \$STAT_FREQ_STEP, +"stat_dist_min=i" => \$STAT_DIST_MIN, +"stat_dist_max=i" => \$STAT_DIST_MAX, +"stat_dist_step=i" => \$STAT_DIST_STEP +) or die("Error in command line arguments\n"); + +open(IF, $inputfile) or die("Can't open $inputfile\n"); + +my @tbl_line; +my %USR_PARAM; +$USR_PARAM{"min_depth"} = $MIN_DEPTH; +$USR_PARAM{"max_depth"} = $MAX_DEPTH; +$USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY; +$USR_PARAM{"min_dist"} = $MIN_DISTANCE; +$USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE; + + + +#Extraction des variants +my $nb_line=0; +while (my $line=<IF>){ + $nb_line++; + if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ + print "$nb_line\n"; + } + my $error_code=0; + if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ + my $current_chromosome = $1; + my $current_position = $2; + my $current_refbase = $3; + my $current_coverage = $4; + my $current_readbase_string = $5; + my $current_quality_string = $6; + + #Suppression of mPileUp special character + $current_readbase_string =~ s/\$//g; #the read start at this position + $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char + + if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ + my %variant; + $variant{"line"} = $line; + $variant{"chr"} = $current_chromosome; + $variant{"pos"} = $current_position; + $variant{"refbase"} = $current_refbase; + $variant{"coverage"} = $current_coverage; + $variant{"readbase"} = $current_readbase_string; + $variant{"quality"} = $current_quality_string; + push(@tbl_line,\%variant); + + if ($ONLY_UNFILTERED_VARIANT eq "ON"){ + print $line; + } + + } + else { + #Position with no variant + } + + } + else { + #Error Parsing + print STDERR "$line #8"; + } +} +close(IF); + +if ($ONLY_UNFILTERED_VARIANT eq "ON"){ + exit(0); +} + +####Checking the distance between variant and other filter + + +my @error; +for (my $i=0;$i<=$#tbl_line;$i++){ + # print "ligne : $tbl_line[$i]\n"; + my $before=""; + my $after=""; + my %line = %{$tbl_line[$i]}; + + if ($tbl_line[$i-1]){ + $before = $tbl_line[$i-1]; + } + if ($tbl_line[$i+1]){ + $after = $tbl_line[$i+1]; + } + my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM); + if ($error_code == 0){ + print $line{"line"}; + } + else { + push(@error,$error_code,"\t",$line{"line"}); + } +} + +### LOG +open(LF,">$logfile") or die ("Can't open $logfile\n"); + +if ($DO_STAT eq "YES"){ + for (my $idx_min_depth=$STAT_MIN_DEPTH_MIN;$idx_min_depth<=$STAT_MIN_DEPTH_MAX;$idx_min_depth = $idx_min_depth + $STAT_MIN_DEPTH_STEP ){ + for (my $idx_max_depth=$STAT_MAX_DEPTH_MIN;$idx_max_depth<=$STAT_MAX_DEPTH_MAX;$idx_max_depth = $idx_max_depth + $STAT_MAX_DEPTH_STEP ){ + for (my $idx_freq = $STAT_FREQ_MIN;$idx_freq<=$STAT_FREQ_MAX;$idx_freq= $idx_freq+$STAT_FREQ_STEP){ + for (my $idx_dist=$STAT_DIST_MIN;$idx_dist<=$STAT_DIST_MAX;$idx_dist = $idx_dist + $STAT_DIST_STEP){ + for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){ + my %stat_param; + $stat_param{"min_depth"}=$idx_min_depth; + $stat_param{"max_depth"}=$idx_max_depth; + $stat_param{"min_freq"}=$idx_freq; + $stat_param{"min_fr"}=$idx_fr; + $stat_param{"min_dist"}=$idx_dist; + + print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n"; + } + } + } + print "\n"; + } + } +} + + +for (my $i=0;$i<=$#error;$i++){ + print LF $error[$i]; +} +close (LF); + + + + +sub test_check{ + my $ref_tbl_line = shift; + my $ref_param = shift; + my @tbl_line = @$ref_tbl_line; + my %param = %$ref_param; + my $nb=0; + + for (my $i=0;$i<=$#tbl_line;$i++){ + my $before=""; + my $after=""; + my %line = %{$tbl_line[$i]}; + + if ($tbl_line[$i-1]){ + $before = $tbl_line[$i-1]; + } + if ($tbl_line[$i+1]){ + $after = $tbl_line[$i+1]; + } + my $error_code = check_error($tbl_line[$i],$before,$after,\%param); + if ($error_code == 0){ + $nb++; + } + } + + return $nb; +} + +sub check_error{ + my $refline = shift; + my %line = %$refline; + my $refbefore = shift; + my $refafter = shift; + my $refparam = shift; + my %param = %$refparam; + + + my $current_chromosome = $line{"chr"}; + my $current_position = $line{"pos"}; + my $current_refbase = $line{"refbase"}; + my $current_coverage = $line{"coverage"}; + my $current_readbase_string = $line{"readbase"}; + + + my $min_depth = $param{"min_depth"}; + my $max_depth = $param{"max_depth"}; + my $min_variant_frequency = $param{"min_freq"}; + my $min_forward_reverse = $param{"min_fr"}; + my $min_dist = $param{"min_dist"}; + + #Verification of neightbourhood + if ($refbefore){ + my %compareline = %$refbefore; + my $compare_chromosome = $compareline{"chr"}; + my $compare_position = $compareline{"pos"}; + my $compare_refbase = $compareline{"refbase"}; + my $compare_coverage = $compareline{"coverage"}; + my $compare_readbase_string = $compareline{"readbase"}; + + if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){ + return 5; + } + } + + if ($refafter){ + my %compareline = %$refafter; + my $compare_chromosome = $compareline{"chr"}; + my $compare_position = $compareline{"pos"}; + my $compare_refbase = $compareline{"refbase"}; + my $compare_coverage = $compareline{"coverage"}; + my $compare_readbase_string = $compareline{"readbase"}; + + if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){ + return 5; + } + } + + + + + #Extraction of insertions + + ################################################################## + # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; + # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; + # print "IN : @IN\n"; + # print "DEL :@DEL\n"; + #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; + ################################################################## + #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct + ################################################################## + + # => parcours de boucle + my @readbase = split(//,$current_readbase_string); + my $cleaned_readbase_string=""; + my @IN; + my @DEL; + my $current_IN=""; + my $current_DEL=""; + my $current_size=0; + + for (my $i=0;$i<=$#readbase;$i++){ + if ($readbase[$i] eq "+"){ + #Ouverture de IN + $current_IN="+"; + + #Recuperation de la taille + my $sub = substr $current_readbase_string,$i; + if ($sub=~/^\+(\d+)/){ + $current_size = $1; + } + my $remaining_size = $current_size; + while (($remaining_size>0)&&($i<=$#readbase)){ + $i++; + $current_IN.=$readbase[$i]; + if ($readbase[$i]=~ /[ATGCNatgcn]/){ + $remaining_size--; + } + } + push(@IN,$current_IN); + } + elsif ($readbase[$i] eq "-"){ + #Ouverture de DEL + $current_DEL="-"; + + #Recuperation de la taille + my $sub = substr $current_readbase_string,$i; + if ($sub=~/^\-(\d+)/){ + $current_size = $1; + } + my $remaining_size = $current_size; + while (($remaining_size>0)&&($i<=$#readbase)){ + $i++; + $current_DEL.=$readbase[$i]; + if ($readbase[$i]=~ /[ATGCNatgcn]/){ + $remaining_size--; + } + } + push(@DEL,$current_DEL); + + } + else { + #Ajout a la string + $cleaned_readbase_string .= $readbase[$i]; + } + } + + + # print "IN : @IN\n"; + # print "DEL :@DEL\n"; + # print "$cleaned_readbase_string\n"; + + my @current_readbase_array = split(//,$cleaned_readbase_string); + + #Filtering : error detection + + if ($#current_readbase_array+1 != $current_coverage){ + return 9; + #parsing error (couldn't parse the readbase string correctly) + } + elsif ($current_coverage<$min_depth){ + return 2; + # 2 : insufficient depth + } + elsif ($current_coverage>$max_depth){ + return 6; + # 6 : too much depth + } + else { + if ($#IN>=0){ + if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ + return 1; + # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) + } + else { + ########## TEST de coherence des insertions ################ + # for (my $i=0;$i<=$#IN;$i++){ + # if (uc($IN[0]) ne uc($IN[$i])){ + # print uc($IN[0]),"\n"; + # print uc($IN[$i]),"\n"; + # return "1i"; + # } + # } + ########################################################### + + if($#IN+1 < $current_coverage*$min_variant_frequency ){ + return 3; + # 3 : insufficient variant frequency + } + } + } + elsif ($#DEL>=0){ + if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ + return 1; + # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) + } + else { + ########## TEST de coherence des deletions ################ + # for (my $i=0;$i<=$#DEL;$i++){ + # if (uc($DEL[0]) ne uc($DEL[$i])){ + # print uc($DEL[0]),"\n"; + # print uc($DEL[$i]),"\n"; + # return "1d"; + # } + # } + ########################################################### + + if($#DEL+1 < $current_coverage*$min_variant_frequency){ + return 3; + # 3 : insufficient variant frequency + } + } + } + else { + my $nbA=0; + $nbA++ while ($current_readbase_string =~ m/A/g); + my $nbC=0; + $nbC++ while ($current_readbase_string =~ m/C/g); + my $nbT=0; + $nbT++ while ($current_readbase_string =~ m/T/g); + my $nbG=0; + $nbG++ while ($current_readbase_string =~ m/G/g); + my $nbN=0; + $nbN++ while ($current_readbase_string =~ m/N/g); + my $nba=0; + $nba++ while ($current_readbase_string =~ m/a/g); + my $nbc=0; + $nbc++ while ($current_readbase_string =~ m/c/g); + my $nbt=0; + $nbt++ while ($current_readbase_string =~ m/t/g); + my $nbg=0; + $nbg++ while ($current_readbase_string =~ m/g/g); + my $nbn=0; + $nbn++ while ($current_readbase_string =~ m/n/g); + + if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ + return "1m"; + } + + if ($nbA+$nba >= $current_coverage*$min_variant_frequency){ + if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){ + if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){ + if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){ + if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){ + if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + else { + return 3; + # 3 : insufficient variant frequency + } + } + } + + return 0; +}