Mercurial > repos > mcharles > rapsosnp
view rapsodyn/ParseBlastForUniqueMatch.pl @ 15:56d328bce3a7 draft default tip
Uploaded
author | mcharles |
---|---|
date | Thu, 29 Jan 2015 08:54:06 -0500 |
parents | 0a6c1cfe4dc8 |
children |
line wrap: on
line source
#!/usr/bin/perl #V1.1.0 manage empty files #V1.0.1 added log, option parameters use strict; use warnings; use Getopt::Long; my $input_variant_file; my $input_blast_file; my $log_file; my $WINDOW_LENGTH= 50; my $NB_MISMATCH_MAX = 3; GetOptions ( "input_variant_file=s" => \$input_variant_file, "input_blast_file=s" => \$input_blast_file, "window_length=s" => \$WINDOW_LENGTH, "log_file=s" => \$log_file, "nb_mismatch_max=s" => \$NB_MISMATCH_MAX ) or die("Error in command line arguments\n"); my $nb_variant_checked=0; my $nb_variant_selected=0; my %hash_name; open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n"); if ( -z INB){ exit(0); } while (my $line =<INB>){ my @fields = split (/\s+/,$line); # print $#fields,"\n"; # print $line; # exit(0); my $query_name; my $query_start; my $query_stop; my $query_aln; my $subject_aln; my $compt_mismatch_5p=0; my $compt_mismatch_3p=0; if ($#fields == 24){ $query_name = $fields[0]; $query_start = $fields[6]; $query_stop = $fields[7]; $query_aln = $fields[20]; $subject_aln = $fields[21]; } elsif ($#fields == 4){ $query_name = $fields[0]; $query_start = $fields[1]; $query_stop = $fields[2]; $query_aln = $fields[3]; $subject_aln = $fields[4]; } else { print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n"; exit(0); } my @field_name = split (/\_/,$query_name); my $name = $field_name[0]."_".$field_name[1]; if ($query_name =~ /random/){ $name .= "_".$field_name[2]; # print $name."\n"; } if (!$hash_name{$name}){ $hash_name{$name}=0; } elsif ($hash_name{$name}>1){ next; } # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){ # print "READ : $query_name\n$name\n"; # print "HASH : ",$hash_name{$name},"\n"; # # exit(0); # } chomp($query_aln); chomp($subject_aln); my $nb_gap_query=0; if (length($query_aln) == length($subject_aln)){ if (length($query_aln)<$WINDOW_LENGTH-$NB_MISMATCH_MAX){ } else { my @q = split(//,$query_aln); my @s = split(//,$subject_aln); for (my $i=0;$i<=$#q;$i++){ my $global_idx = $query_start-1+$i-$nb_gap_query; if ($q[$i] eq "-"){ if ($global_idx < $WINDOW_LENGTH){ $compt_mismatch_5p++; } elsif ($global_idx > $WINDOW_LENGTH){ $compt_mismatch_3p++; } $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global } else { if ($q[$i] ne $s[$i]){ if ($global_idx < $WINDOW_LENGTH){ $compt_mismatch_5p++; } elsif ($global_idx > $WINDOW_LENGTH){ $compt_mismatch_3p++; } } } } $compt_mismatch_5p += $query_start-1; $compt_mismatch_3p += $WINDOW_LENGTH *2 + 1 - $query_stop; # for (my $i=0;$i<$window_length;$i++){ # if ($tbl_q_aln[$i] eq "#"){ # $compt_mismatch_5p++; # } # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ # $compt_mismatch_5p++; # } # else { # } # } # for (my $i=$window_length+1;$i<=$window_length*2;$i++){ # if ($tbl_q_aln[$i] eq "#"){ # $compt_mismatch_3p++; # } # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ # $compt_mismatch_3p++; # } # else { # } # } if (($compt_mismatch_5p <= $NB_MISMATCH_MAX)||($compt_mismatch_3p <= $NB_MISMATCH_MAX)){ $hash_name{$name}++; } } } else { print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n"; } # if ($line=~/chrCnn_random_49828229/){ # print $line; # print $query_aln,"\n"; # print $subject_aln,"\n"; # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n"; # print $hash_name{"chrCnn_random_49828229"},"\n"; # print "\n"; # } } # exit(0); close (INB); open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); while (my $ligne = <INV>) { $nb_variant_checked++; my @champs = split (/\s+/,$ligne); my $header = $champs[0]."_".$champs[1]; if ($hash_name{$header}){ if ($hash_name{$header}==1){ print $ligne; $nb_variant_selected++; } } else { #print STDERR "No blast result for ",$header,"\n"; } } close(INV); open (LF,">$log_file") or die("Can't open $log_file\n"); print LF "\n####\t Blast filtering \n"; print LF "Variant checked :\t$nb_variant_checked\n"; print LF "Variant selected :\t$nb_variant_selected\n"; close (LF); # foreach my $key (sort keys %hash_name){ # print $key,"\t",$hash_name{$key},"\n"; # }