0
|
1 #!/usr/bin/perl
|
10
|
2 #V1.1.0 manage empty files
|
7
|
3 #V1.0.1 added log, option parameters
|
0
|
4 use strict;
|
|
5 use warnings;
|
7
|
6 use Getopt::Long;
|
0
|
7
|
|
8
|
7
|
9 my $input_variant_file;
|
|
10 my $input_blast_file;
|
|
11 my $log_file;
|
|
12 my $WINDOW_LENGTH= 50;
|
|
13 my $NB_MISMATCH_MAX = 3;
|
0
|
14
|
7
|
15 GetOptions (
|
|
16 "input_variant_file=s" => \$input_variant_file,
|
|
17 "input_blast_file=s" => \$input_blast_file,
|
|
18 "window_length=s" => \$WINDOW_LENGTH,
|
|
19 "log_file=s" => \$log_file,
|
|
20 "nb_mismatch_max=s" => \$NB_MISMATCH_MAX
|
|
21 ) or die("Error in command line arguments\n");
|
|
22
|
|
23 my $nb_variant_checked=0;
|
|
24 my $nb_variant_selected=0;
|
0
|
25 my %hash_name;
|
|
26
|
|
27 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
|
10
|
28 if ( -z INB){
|
|
29 exit(0);
|
|
30 }
|
|
31
|
0
|
32 while (my $line =<INB>){
|
|
33 my @fields = split (/\s+/,$line);
|
|
34 # print $#fields,"\n";
|
|
35 # print $line;
|
|
36 # exit(0);
|
|
37 my $query_name;
|
|
38 my $query_start;
|
|
39 my $query_stop;
|
|
40 my $query_aln;
|
|
41 my $subject_aln;
|
|
42 my $compt_mismatch_5p=0;
|
|
43 my $compt_mismatch_3p=0;
|
|
44
|
|
45 if ($#fields == 24){
|
|
46 $query_name = $fields[0];
|
|
47 $query_start = $fields[6];
|
|
48 $query_stop = $fields[7];
|
|
49 $query_aln = $fields[20];
|
|
50 $subject_aln = $fields[21];
|
|
51 }
|
|
52 elsif ($#fields == 4){
|
|
53 $query_name = $fields[0];
|
|
54 $query_start = $fields[1];
|
|
55 $query_stop = $fields[2];
|
|
56 $query_aln = $fields[3];
|
|
57 $subject_aln = $fields[4];
|
|
58 }
|
|
59 else {
|
|
60 print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n";
|
|
61 exit(0);
|
|
62 }
|
|
63
|
|
64
|
|
65 my @field_name = split (/\_/,$query_name);
|
|
66 my $name = $field_name[0]."_".$field_name[1];
|
|
67 if ($query_name =~ /random/){
|
|
68 $name .= "_".$field_name[2];
|
|
69 # print $name."\n";
|
|
70
|
|
71 }
|
|
72 if (!$hash_name{$name}){
|
|
73 $hash_name{$name}=0;
|
|
74 }
|
|
75 elsif ($hash_name{$name}>1){
|
|
76 next;
|
|
77 }
|
|
78 # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){
|
|
79 # print "READ : $query_name\n$name\n";
|
|
80 # print "HASH : ",$hash_name{$name},"\n";
|
|
81 # # exit(0);
|
|
82 # }
|
|
83
|
|
84
|
|
85
|
|
86 chomp($query_aln);
|
|
87 chomp($subject_aln);
|
|
88
|
|
89 my $nb_gap_query=0;
|
|
90
|
|
91 if (length($query_aln) == length($subject_aln)){
|
7
|
92 if (length($query_aln)<$WINDOW_LENGTH-$NB_MISMATCH_MAX){
|
0
|
93 }
|
|
94 else {
|
|
95 my @q = split(//,$query_aln);
|
|
96 my @s = split(//,$subject_aln);
|
|
97 for (my $i=0;$i<=$#q;$i++){
|
|
98 my $global_idx = $query_start-1+$i-$nb_gap_query;
|
|
99 if ($q[$i] eq "-"){
|
7
|
100 if ($global_idx < $WINDOW_LENGTH){
|
0
|
101 $compt_mismatch_5p++;
|
|
102 }
|
7
|
103 elsif ($global_idx > $WINDOW_LENGTH){
|
0
|
104 $compt_mismatch_3p++;
|
|
105 }
|
|
106 $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
|
|
107 }
|
|
108 else {
|
|
109 if ($q[$i] ne $s[$i]){
|
7
|
110 if ($global_idx < $WINDOW_LENGTH){
|
0
|
111 $compt_mismatch_5p++;
|
|
112 }
|
7
|
113 elsif ($global_idx > $WINDOW_LENGTH){
|
0
|
114 $compt_mismatch_3p++;
|
|
115 }
|
|
116 }
|
|
117 }
|
|
118 }
|
|
119 $compt_mismatch_5p += $query_start-1;
|
7
|
120 $compt_mismatch_3p += $WINDOW_LENGTH *2 + 1 - $query_stop;
|
0
|
121
|
|
122 # for (my $i=0;$i<$window_length;$i++){
|
|
123 # if ($tbl_q_aln[$i] eq "#"){
|
|
124 # $compt_mismatch_5p++;
|
|
125 # }
|
|
126 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
|
|
127 # $compt_mismatch_5p++;
|
|
128 # }
|
|
129 # else {
|
|
130 # }
|
|
131
|
|
132 # }
|
|
133 # for (my $i=$window_length+1;$i<=$window_length*2;$i++){
|
|
134 # if ($tbl_q_aln[$i] eq "#"){
|
|
135 # $compt_mismatch_3p++;
|
|
136 # }
|
|
137 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
|
|
138 # $compt_mismatch_3p++;
|
|
139 # }
|
|
140 # else {
|
|
141 # }
|
|
142 # }
|
7
|
143 if (($compt_mismatch_5p <= $NB_MISMATCH_MAX)||($compt_mismatch_3p <= $NB_MISMATCH_MAX)){
|
0
|
144 $hash_name{$name}++;
|
|
145 }
|
|
146
|
|
147 }
|
|
148 }
|
|
149 else {
|
|
150 print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n";
|
|
151 }
|
|
152
|
|
153 # if ($line=~/chrCnn_random_49828229/){
|
|
154 # print $line;
|
|
155 # print $query_aln,"\n";
|
|
156 # print $subject_aln,"\n";
|
|
157 # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n";
|
|
158 # print $hash_name{"chrCnn_random_49828229"},"\n";
|
|
159 # print "\n";
|
|
160 # }
|
|
161
|
|
162
|
|
163
|
|
164 }
|
|
165 # exit(0);
|
|
166
|
|
167 close (INB);
|
|
168
|
|
169 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
|
|
170
|
|
171 while (my $ligne = <INV>) {
|
7
|
172 $nb_variant_checked++;
|
0
|
173
|
|
174 my @champs = split (/\s+/,$ligne);
|
|
175 my $header = $champs[0]."_".$champs[1];
|
|
176
|
|
177 if ($hash_name{$header}){
|
|
178 if ($hash_name{$header}==1){
|
|
179 print $ligne;
|
7
|
180 $nb_variant_selected++;
|
0
|
181 }
|
|
182 }
|
|
183 else {
|
|
184 #print STDERR "No blast result for ",$header,"\n";
|
|
185 }
|
|
186
|
|
187
|
|
188 }
|
|
189
|
|
190 close(INV);
|
|
191
|
7
|
192 open (LF,">$log_file") or die("Can't open $log_file\n");
|
|
193 print LF "\n####\t Blast filtering \n";
|
|
194 print LF "Variant checked :\t$nb_variant_checked\n";
|
|
195 print LF "Variant selected :\t$nb_variant_selected\n";
|
|
196 close (LF);
|
|
197
|
0
|
198
|
|
199 # foreach my $key (sort keys %hash_name){
|
|
200 # print $key,"\t",$hash_name{$key},"\n";
|
|
201
|
|
202 # }
|