Mercurial > repos > mcharles > rapsosnp
view rapsodyn/ParseBlastForUniqueMatch.pl @ 4:9074a5104cdd draft
Uploaded
author | mcharles |
---|---|
date | Wed, 17 Sep 2014 04:20:08 -0400 |
parents | 442a7c88b886 |
children | 3f7b0788a1c4 |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; my $input_variant_file = $ARGV[0]; my $input_blast_file = $ARGV[1]; my $window_length = $ARGV[2]; my $nb_mismatch_max = $ARGV[3]; my %hash_name; open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n"); while (my $line =<INB>){ my @fields = split (/\s+/,$line); # print $#fields,"\n"; # print $line; # exit(0); my $query_name; my $query_start; my $query_stop; my $query_aln; my $subject_aln; my $compt_mismatch_5p=0; my $compt_mismatch_3p=0; if ($#fields == 24){ $query_name = $fields[0]; $query_start = $fields[6]; $query_stop = $fields[7]; $query_aln = $fields[20]; $subject_aln = $fields[21]; } elsif ($#fields == 4){ $query_name = $fields[0]; $query_start = $fields[1]; $query_stop = $fields[2]; $query_aln = $fields[3]; $subject_aln = $fields[4]; } else { print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n"; exit(0); } my @field_name = split (/\_/,$query_name); my $name = $field_name[0]."_".$field_name[1]; if ($query_name =~ /random/){ $name .= "_".$field_name[2]; # print $name."\n"; } if (!$hash_name{$name}){ $hash_name{$name}=0; } elsif ($hash_name{$name}>1){ next; } # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){ # print "READ : $query_name\n$name\n"; # print "HASH : ",$hash_name{$name},"\n"; # # exit(0); # } chomp($query_aln); chomp($subject_aln); my $nb_gap_query=0; if (length($query_aln) == length($subject_aln)){ if (length($query_aln)<$window_length-$nb_mismatch_max){ } else { my @q = split(//,$query_aln); my @s = split(//,$subject_aln); for (my $i=0;$i<=$#q;$i++){ my $global_idx = $query_start-1+$i-$nb_gap_query; if ($q[$i] eq "-"){ if ($global_idx < $window_length){ $compt_mismatch_5p++; } elsif ($global_idx > $window_length){ $compt_mismatch_3p++; } $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global } else { if ($q[$i] ne $s[$i]){ if ($global_idx < $window_length){ $compt_mismatch_5p++; } elsif ($global_idx > $window_length){ $compt_mismatch_3p++; } } } } $compt_mismatch_5p += $query_start-1; $compt_mismatch_3p += $window_length *2 + 1 - $query_stop; # for (my $i=0;$i<$window_length;$i++){ # if ($tbl_q_aln[$i] eq "#"){ # $compt_mismatch_5p++; # } # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ # $compt_mismatch_5p++; # } # else { # } # } # for (my $i=$window_length+1;$i<=$window_length*2;$i++){ # if ($tbl_q_aln[$i] eq "#"){ # $compt_mismatch_3p++; # } # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ # $compt_mismatch_3p++; # } # else { # } # } if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){ $hash_name{$name}++; } } } else { print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n"; } # if ($line=~/chrCnn_random_49828229/){ # print $line; # print $query_aln,"\n"; # print $subject_aln,"\n"; # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n"; # print $hash_name{"chrCnn_random_49828229"},"\n"; # print "\n"; # } } # exit(0); close (INB); open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); while (my $ligne = <INV>) { my @champs = split (/\s+/,$ligne); my $header = $champs[0]."_".$champs[1]; if ($hash_name{$header}){ if ($hash_name{$header}==1){ print $ligne; } } else { #print STDERR "No blast result for ",$header,"\n"; } } close(INV); # foreach my $key (sort keys %hash_name){ # print $key,"\t",$hash_name{$key},"\n"; # }