Mercurial > repos > mcharles > rapsosnp
diff rapsodyn/ParseBlastForUniqueMatch.pl @ 0:442a7c88b886 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 10 Sep 2014 09:18:15 -0400 |
parents | |
children | 3f7b0788a1c4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/ParseBlastForUniqueMatch.pl Wed Sep 10 09:18:15 2014 -0400 @@ -0,0 +1,176 @@ +#!/usr/bin/perl +use strict; +use warnings; + + +my $input_variant_file = $ARGV[0]; +my $input_blast_file = $ARGV[1]; +my $window_length = $ARGV[2]; +my $nb_mismatch_max = $ARGV[3]; + +my %hash_name; + +open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n"); +while (my $line =<INB>){ + my @fields = split (/\s+/,$line); + # print $#fields,"\n"; + # print $line; + # exit(0); + my $query_name; + my $query_start; + my $query_stop; + my $query_aln; + my $subject_aln; + my $compt_mismatch_5p=0; + my $compt_mismatch_3p=0; + + if ($#fields == 24){ + $query_name = $fields[0]; + $query_start = $fields[6]; + $query_stop = $fields[7]; + $query_aln = $fields[20]; + $subject_aln = $fields[21]; + } + elsif ($#fields == 4){ + $query_name = $fields[0]; + $query_start = $fields[1]; + $query_stop = $fields[2]; + $query_aln = $fields[3]; + $subject_aln = $fields[4]; + } + else { + print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n"; + exit(0); + } + + + my @field_name = split (/\_/,$query_name); + my $name = $field_name[0]."_".$field_name[1]; + if ($query_name =~ /random/){ + $name .= "_".$field_name[2]; + # print $name."\n"; + + } + if (!$hash_name{$name}){ + $hash_name{$name}=0; + } + elsif ($hash_name{$name}>1){ + next; + } + # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){ + # print "READ : $query_name\n$name\n"; + # print "HASH : ",$hash_name{$name},"\n"; + # # exit(0); + # } + + + + chomp($query_aln); + chomp($subject_aln); + + my $nb_gap_query=0; + + if (length($query_aln) == length($subject_aln)){ + if (length($query_aln)<$window_length-$nb_mismatch_max){ + } + else { + my @q = split(//,$query_aln); + my @s = split(//,$subject_aln); + for (my $i=0;$i<=$#q;$i++){ + my $global_idx = $query_start-1+$i-$nb_gap_query; + if ($q[$i] eq "-"){ + if ($global_idx < $window_length){ + $compt_mismatch_5p++; + } + elsif ($global_idx > $window_length){ + $compt_mismatch_3p++; + } + $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global + } + else { + if ($q[$i] ne $s[$i]){ + if ($global_idx < $window_length){ + $compt_mismatch_5p++; + } + elsif ($global_idx > $window_length){ + $compt_mismatch_3p++; + } + } + } + } + $compt_mismatch_5p += $query_start-1; + $compt_mismatch_3p += $window_length *2 + 1 - $query_stop; + + # for (my $i=0;$i<$window_length;$i++){ + # if ($tbl_q_aln[$i] eq "#"){ + # $compt_mismatch_5p++; + # } + # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ + # $compt_mismatch_5p++; + # } + # else { + # } + + # } + # for (my $i=$window_length+1;$i<=$window_length*2;$i++){ + # if ($tbl_q_aln[$i] eq "#"){ + # $compt_mismatch_3p++; + # } + # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ + # $compt_mismatch_3p++; + # } + # else { + # } + # } + if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){ + $hash_name{$name}++; + } + + } + } + else { + print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n"; + } + + # if ($line=~/chrCnn_random_49828229/){ + # print $line; + # print $query_aln,"\n"; + # print $subject_aln,"\n"; + # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n"; + # print $hash_name{"chrCnn_random_49828229"},"\n"; + # print "\n"; + # } + + + +} +# exit(0); + +close (INB); + +open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); + +while (my $ligne = <INV>) { + + my @champs = split (/\s+/,$ligne); + my $header = $champs[0]."_".$champs[1]; + + if ($hash_name{$header}){ + if ($hash_name{$header}==1){ + print $ligne; + } + } + else { + #print STDERR "No blast result for ",$header,"\n"; + } + + +} + +close(INV); + + +# foreach my $key (sort keys %hash_name){ + # print $key,"\t",$hash_name{$key},"\n"; + +# }