Mercurial > repos > mcharles > rapsosnp
view rapsodyn/mpileupfilterandstat.pl @ 3:9332b9da7491 draft
Uploaded
author | mcharles |
---|---|
date | Thu, 11 Sep 2014 07:31:20 -0400 |
parents | 442a7c88b886 |
children | 3f7b0788a1c4 |
line wrap: on
line source
#!/usr/bin/perl use strict; use Getopt::Long; # # Filter a pileup file on forward/reverse presence and %read having the variant # The error code # 1 : multiple variant type detected insertion/deletion/mutation # 1i : inconsistency in insertion # 1d : inconsistency in deletion # 1m : inconsistency in mutation # 2 : insufficient depth # 3 : insufficient variant frequency # 4 : variant position not covered by forward and reverse reads # 5 : variant with other variant in neighbourhood # 6 : too much depth # 8 : parsing error (couldn't parse the mpileup line correctly) # 9 : parsing error (couldn't parse the readbase string correctly) my $inputfile; my $logfile; my $MIN_DISTANCE=0; my $MIN_VARIANTFREQUENCY=0; my $MIN_FORWARDREVERSE=0; my $MIN_DEPTH=0; my $MAX_DEPTH=500; my $VERBOSE=0; my $ONLY_UNFILTERED_VARIANT="OFF"; my $DO_STAT="NO"; my $STAT_MIN_DEPTH_MIN = 2; my $STAT_MIN_DEPTH_MAX = 10; my $STAT_MIN_DEPTH_STEP = 2; my $STAT_MAX_DEPTH_MIN = 100; my $STAT_MAX_DEPTH_MAX = 200; my $STAT_MAX_DEPTH_STEP = 100; my $STAT_FREQ_MIN = 0.8; my $STAT_FREQ_MAX = 1; my $STAT_FREQ_STEP = 0.1; my $STAT_DIST_MIN = 0; my $STAT_DIST_MAX = 50; my $STAT_DIST_STEP = 50; GetOptions ( "input_file=s" => \$inputfile, "log_file=s" => \$logfile, "min_depth=i" => \$MIN_DEPTH, "max_depth=i" => \$MAX_DEPTH, "min_frequency=f" => \$MIN_VARIANTFREQUENCY, "min_distance=i" => \$MIN_DISTANCE, "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, "variant_only=s" => \$ONLY_UNFILTERED_VARIANT, "v=i" => \$VERBOSE, "do_stat=s" => \$DO_STAT, "stat_min_depth_min=i" => \$STAT_MIN_DEPTH_MIN, "stat_min_depth_max=i" => \$STAT_MIN_DEPTH_MAX, "stat_min_depth_step=i" => \$STAT_MIN_DEPTH_STEP, "stat_max_depth_min=i" => \$STAT_MAX_DEPTH_MIN, "stat_max_depth_max=i" => \$STAT_MAX_DEPTH_MAX, "stat_max_depth_step=i" => \$STAT_MAX_DEPTH_STEP, "stat_freq_min=f" => \$STAT_FREQ_MIN, "stat_freq_max=f" => \$STAT_FREQ_MAX, "stat_freq_step=f" => \$STAT_FREQ_STEP, "stat_dist_min=i" => \$STAT_DIST_MIN, "stat_dist_max=i" => \$STAT_DIST_MAX, "stat_dist_step=i" => \$STAT_DIST_STEP ) or die("Error in command line arguments\n"); open(IF, $inputfile) or die("Can't open $inputfile\n"); my @tbl_line; my %USR_PARAM; $USR_PARAM{"min_depth"} = $MIN_DEPTH; $USR_PARAM{"max_depth"} = $MAX_DEPTH; $USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY; $USR_PARAM{"min_dist"} = $MIN_DISTANCE; $USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE; #Extraction des variants my $nb_line=0; while (my $line=<IF>){ $nb_line++; if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ print "$nb_line\n"; } my $error_code=0; if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ my $current_chromosome = $1; my $current_position = $2; my $current_refbase = $3; my $current_coverage = $4; my $current_readbase_string = $5; my $current_quality_string = $6; #Suppression of mPileUp special character $current_readbase_string =~ s/\$//g; #the read start at this position $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ my %variant; $variant{"line"} = $line; $variant{"chr"} = $current_chromosome; $variant{"pos"} = $current_position; $variant{"refbase"} = $current_refbase; $variant{"coverage"} = $current_coverage; $variant{"readbase"} = $current_readbase_string; $variant{"quality"} = $current_quality_string; push(@tbl_line,\%variant); if ($ONLY_UNFILTERED_VARIANT eq "ON"){ print $line; } } else { #Position with no variant } } else { #Error Parsing print STDERR "$line #8"; } } close(IF); if ($ONLY_UNFILTERED_VARIANT eq "ON"){ exit(0); } ####Checking the distance between variant and other filter my @error; for (my $i=0;$i<=$#tbl_line;$i++){ # print "ligne : $tbl_line[$i]\n"; my $before=""; my $after=""; my %line = %{$tbl_line[$i]}; if ($tbl_line[$i-1]){ $before = $tbl_line[$i-1]; } if ($tbl_line[$i+1]){ $after = $tbl_line[$i+1]; } my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM); if ($error_code == 0){ print $line{"line"}; } else { push(@error,$error_code,"\t",$line{"line"}); } } ### LOG open(LF,">$logfile") or die ("Can't open $logfile\n"); if ($DO_STAT eq "YES"){ for (my $idx_min_depth=$STAT_MIN_DEPTH_MIN;$idx_min_depth<=$STAT_MIN_DEPTH_MAX;$idx_min_depth = $idx_min_depth + $STAT_MIN_DEPTH_STEP ){ for (my $idx_max_depth=$STAT_MAX_DEPTH_MIN;$idx_max_depth<=$STAT_MAX_DEPTH_MAX;$idx_max_depth = $idx_max_depth + $STAT_MAX_DEPTH_STEP ){ for (my $idx_freq = $STAT_FREQ_MIN;$idx_freq<=$STAT_FREQ_MAX;$idx_freq= $idx_freq+$STAT_FREQ_STEP){ for (my $idx_dist=$STAT_DIST_MIN;$idx_dist<=$STAT_DIST_MAX;$idx_dist = $idx_dist + $STAT_DIST_STEP){ for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){ my %stat_param; $stat_param{"min_depth"}=$idx_min_depth; $stat_param{"max_depth"}=$idx_max_depth; $stat_param{"min_freq"}=$idx_freq; $stat_param{"min_fr"}=$idx_fr; $stat_param{"min_dist"}=$idx_dist; print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n"; } } } print "\n"; } } } for (my $i=0;$i<=$#error;$i++){ print LF $error[$i]; } close (LF); sub test_check{ my $ref_tbl_line = shift; my $ref_param = shift; my @tbl_line = @$ref_tbl_line; my %param = %$ref_param; my $nb=0; for (my $i=0;$i<=$#tbl_line;$i++){ my $before=""; my $after=""; my %line = %{$tbl_line[$i]}; if ($tbl_line[$i-1]){ $before = $tbl_line[$i-1]; } if ($tbl_line[$i+1]){ $after = $tbl_line[$i+1]; } my $error_code = check_error($tbl_line[$i],$before,$after,\%param); if ($error_code == 0){ $nb++; } } return $nb; } sub check_error{ my $refline = shift; my %line = %$refline; my $refbefore = shift; my $refafter = shift; my $refparam = shift; my %param = %$refparam; my $current_chromosome = $line{"chr"}; my $current_position = $line{"pos"}; my $current_refbase = $line{"refbase"}; my $current_coverage = $line{"coverage"}; my $current_readbase_string = $line{"readbase"}; my $min_depth = $param{"min_depth"}; my $max_depth = $param{"max_depth"}; my $min_variant_frequency = $param{"min_freq"}; my $min_forward_reverse = $param{"min_fr"}; my $min_dist = $param{"min_dist"}; #Verification of neightbourhood if ($refbefore){ my %compareline = %$refbefore; my $compare_chromosome = $compareline{"chr"}; my $compare_position = $compareline{"pos"}; my $compare_refbase = $compareline{"refbase"}; my $compare_coverage = $compareline{"coverage"}; my $compare_readbase_string = $compareline{"readbase"}; if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){ return 5; } } if ($refafter){ my %compareline = %$refafter; my $compare_chromosome = $compareline{"chr"}; my $compare_position = $compareline{"pos"}; my $compare_refbase = $compareline{"refbase"}; my $compare_coverage = $compareline{"coverage"}; my $compare_readbase_string = $compareline{"readbase"}; if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){ return 5; } } #Extraction of insertions ################################################################## # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; # print "IN : @IN\n"; # print "DEL :@DEL\n"; #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; ################################################################## #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct ################################################################## # => parcours de boucle my @readbase = split(//,$current_readbase_string); my $cleaned_readbase_string=""; my @IN; my @DEL; my $current_IN=""; my $current_DEL=""; my $current_size=0; for (my $i=0;$i<=$#readbase;$i++){ if ($readbase[$i] eq "+"){ #Ouverture de IN $current_IN="+"; #Recuperation de la taille my $sub = substr $current_readbase_string,$i; if ($sub=~/^\+(\d+)/){ $current_size = $1; } my $remaining_size = $current_size; while (($remaining_size>0)&&($i<=$#readbase)){ $i++; $current_IN.=$readbase[$i]; if ($readbase[$i]=~ /[ATGCNatgcn]/){ $remaining_size--; } } push(@IN,$current_IN); } elsif ($readbase[$i] eq "-"){ #Ouverture de DEL $current_DEL="-"; #Recuperation de la taille my $sub = substr $current_readbase_string,$i; if ($sub=~/^\-(\d+)/){ $current_size = $1; } my $remaining_size = $current_size; while (($remaining_size>0)&&($i<=$#readbase)){ $i++; $current_DEL.=$readbase[$i]; if ($readbase[$i]=~ /[ATGCNatgcn]/){ $remaining_size--; } } push(@DEL,$current_DEL); } else { #Ajout a la string $cleaned_readbase_string .= $readbase[$i]; } } # print "IN : @IN\n"; # print "DEL :@DEL\n"; # print "$cleaned_readbase_string\n"; my @current_readbase_array = split(//,$cleaned_readbase_string); #Filtering : error detection if ($#current_readbase_array+1 != $current_coverage){ return 9; #parsing error (couldn't parse the readbase string correctly) } elsif ($current_coverage<$min_depth){ return 2; # 2 : insufficient depth } elsif ($current_coverage>$max_depth){ return 6; # 6 : too much depth } else { if ($#IN>=0){ if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ return 1; # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) } else { ########## TEST de coherence des insertions ################ # for (my $i=0;$i<=$#IN;$i++){ # if (uc($IN[0]) ne uc($IN[$i])){ # print uc($IN[0]),"\n"; # print uc($IN[$i]),"\n"; # return "1i"; # } # } ########################################################### if($#IN+1 < $current_coverage*$min_variant_frequency ){ return 3; # 3 : insufficient variant frequency } } } elsif ($#DEL>=0){ if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ return 1; # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) } else { ########## TEST de coherence des deletions ################ # for (my $i=0;$i<=$#DEL;$i++){ # if (uc($DEL[0]) ne uc($DEL[$i])){ # print uc($DEL[0]),"\n"; # print uc($DEL[$i]),"\n"; # return "1d"; # } # } ########################################################### if($#DEL+1 < $current_coverage*$min_variant_frequency){ return 3; # 3 : insufficient variant frequency } } } else { my $nbA=0; $nbA++ while ($current_readbase_string =~ m/A/g); my $nbC=0; $nbC++ while ($current_readbase_string =~ m/C/g); my $nbT=0; $nbT++ while ($current_readbase_string =~ m/T/g); my $nbG=0; $nbG++ while ($current_readbase_string =~ m/G/g); my $nbN=0; $nbN++ while ($current_readbase_string =~ m/N/g); my $nba=0; $nba++ while ($current_readbase_string =~ m/a/g); my $nbc=0; $nbc++ while ($current_readbase_string =~ m/c/g); my $nbt=0; $nbt++ while ($current_readbase_string =~ m/t/g); my $nbg=0; $nbg++ while ($current_readbase_string =~ m/g/g); my $nbn=0; $nbn++ while ($current_readbase_string =~ m/n/g); if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ return "1m"; } if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ return "1m"; } if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ return "1m"; } if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ return "1m"; } if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ return "1m"; } if ($nbA+$nba >= $current_coverage*$min_variant_frequency){ if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){ if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){ if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){ if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){ if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){ return 4; # 4 : variant position not covered by forward and reverse reads } } else { return 3; # 3 : insufficient variant frequency } } } return 0; }