3
|
1 #!/usr/bin/perl
|
|
2 #V1.0.3 header added
|
|
3 #V1.0.2 suppressed the final sort (very heavyload) and replaced it by another level of hash
|
|
4 #V1.0.1 added log, option parameters
|
|
5 use strict;
|
|
6 use warnings;
|
|
7 use Getopt::Long;
|
|
8
|
|
9 my $inputblast;
|
|
10 my $outputjoin;
|
|
11 my $log_file;
|
|
12 my $MAX_OVERLAP_FRACTION = 0.5;
|
|
13 my $MAX_OVERLAP_LENGTH_IGNORED = 3;
|
|
14 my $VERBOSE = "OFF";
|
|
15 my $ALLOWED_GAP_FRACTION_FOR_MERGING = 0.3;
|
|
16 my $HEADER ="";
|
|
17
|
|
18 GetOptions (
|
|
19 "input_blasttab_file=s" => \$inputblast,
|
|
20 "output_joinmatch_file=s" => \$outputjoin,
|
|
21 "log_file=s" => \$log_file,
|
|
22 "header=s" => \$HEADER,
|
|
23 "max_overlap_fraction=f" => \$MAX_OVERLAP_FRACTION,
|
|
24 "max_overlap_length_ignored=i" =>\$MAX_OVERLAP_LENGTH_IGNORED
|
|
25 ) or die("Error in command line arguments\n");
|
|
26
|
|
27 open(IB, $inputblast) or die ("Can't open $inputblast \n");
|
|
28 open (LF,">$log_file") or die("Can't open $log_file\n");
|
|
29
|
|
30
|
|
31 my %match_by_query;
|
|
32
|
|
33 my @query_keys;
|
|
34 my %querys;
|
|
35
|
|
36
|
|
37 my $stats_nb_match=0;
|
|
38 my $stats_included=0;
|
|
39 my $stats_large_overlapping=0;
|
|
40 my %stats_query_coverage;
|
|
41 $stats_query_coverage{"0-10%"}=0;
|
|
42 $stats_query_coverage{"10-20%"}=0;
|
|
43 $stats_query_coverage{"20-30%"}=0;
|
|
44 $stats_query_coverage{"30-40%"}=0;
|
|
45 $stats_query_coverage{"40-50%"}=0;
|
|
46 $stats_query_coverage{"50-60%"}=0;
|
|
47 $stats_query_coverage{"60-70%"}=0;
|
|
48 $stats_query_coverage{"70-80%"}=0;
|
|
49 $stats_query_coverage{"80-90%"}=0;
|
|
50 $stats_query_coverage{"90-100%"}=0;
|
|
51
|
|
52 my $current_query="";
|
|
53 while (my $ligne = <IB>){
|
|
54 my @fields = split (/\t/,$ligne);
|
|
55 if ($#fields != 9){
|
|
56 print STDERR "Invalid blasttab format, must have 10 columns\n";
|
|
57 exit(0);
|
|
58 }
|
|
59 $stats_nb_match++;
|
|
60 my %match;
|
|
61 $match{"Query"}=$fields[0];
|
|
62 if (!$querys{$match{"Query"}}){
|
|
63 push(@query_keys,$match{"Query"});
|
|
64 $querys{$match{"Query"}} = 1;
|
|
65 }
|
|
66 $match{"Subject_id"}=$fields[1];
|
|
67 $match{"Orientation"}="+";
|
|
68 $match{"Query_start"}=$fields[2];
|
|
69 $match{"Query_end"}=$fields[3];
|
|
70 $match{"Subject_start"}=$fields[4];
|
|
71 $match{"Subject_end"}=$fields[5];
|
|
72
|
|
73
|
|
74 if ($fields[2]>$fields[3]){
|
|
75 $match{"Query_start"}=$fields[3];
|
|
76 $match{"Query_end"}=$fields[2];
|
|
77 $match{"Orientation"}="-";
|
|
78 #print "- $ligne";
|
|
79 }
|
|
80
|
|
81 if ($fields[4]>$fields[5]){
|
|
82 $match{"Subject_start"}=$fields[5];
|
|
83 $match{"Subject_end"}=$fields[4];
|
|
84 $match{"Orientation"}="-";
|
|
85 #print "- $ligne";
|
|
86 }
|
|
87 $match{"Similarity"}=$fields[6];
|
|
88 $match{"Query_length"}=$fields[7];
|
|
89 $match{"Subject_length"}=$fields[8];
|
|
90 chomp($fields[9]);
|
|
91 $match{"Subject"}=$fields[9];
|
|
92
|
|
93 $match{"Ligne"}=$ligne;
|
|
94
|
|
95 my $querykey = $match{"Query"};
|
|
96 my $key = $match{"Query"}."##".$match{"Subject"}."##".$match{"Orientation"};
|
|
97 if ($match{"Subject_length"}==0){
|
|
98 print LF "Match 0",$ligne,"\n",$match{"Subject_length"},"\n";
|
|
99 }
|
|
100 my %match_by_query_and_subject;
|
|
101 my @match_table;
|
|
102
|
|
103 if ($match_by_query{$querykey}){
|
|
104 %match_by_query_and_subject = %{$match_by_query{$querykey}};
|
|
105 }
|
|
106 if ($match_by_query_and_subject{$key}){
|
|
107 @match_table=@{$match_by_query_and_subject{$key}};
|
|
108 }
|
|
109
|
|
110 push (@match_table,\%match);
|
|
111 $match_by_query_and_subject{$key} = \@match_table;
|
|
112 $match_by_query{$querykey}= \%match_by_query_and_subject;
|
|
113 }
|
|
114
|
|
115 close (IB);
|
|
116
|
|
117 #print LF "NB query : $#query_keys\n";
|
|
118 #foreach my $querykey (sort @query_keys){
|
|
119 # my %current_match_by_query_and_subject = %{$match_by_query{$querykey}};
|
|
120 # foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){
|
|
121 # my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}};
|
|
122 # for (my $i=0;$i<=$#current_match_table;$i++){
|
|
123 # my %current_match = %{$current_match_table[$i]};
|
|
124 # print LF $current_match{"Ligne"}."\n";
|
|
125 # }
|
|
126 # }
|
|
127 #}
|
|
128 #exit(0);
|
|
129
|
|
130 open (OJ, ">$outputjoin") or die ("Can't open $outputjoin \n");
|
|
131 print OJ "##",$HEADER,"\n";
|
|
132 print OJ "#Query\tSubject_Id\torientation\tQuery_coverage\tSubject_coverage\tIdentity\tmin_query\tmax_query\tmin_subject\tmax_subject\tNBmatch\tq_length\tsub_length\tsubject\n";
|
|
133
|
|
134 foreach my $querykey (sort @query_keys){
|
|
135 my %current_match_by_query_and_subject = %{$match_by_query{$querykey}};
|
|
136 my @match_joined; #la table qui contient les matchs (hash) projeté de chaque subject pour une query
|
|
137 foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){
|
|
138 my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}};
|
|
139 my @duplicate;
|
|
140 my @overlap;
|
|
141 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
142 push (@duplicate,0);
|
|
143 }
|
|
144 if ($VERBOSE eq "ON"){
|
|
145 print LF "\nTable Match ($#current_match_table)\t$key\n";
|
|
146 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
147 my %match=%{$current_match_table[$i]};
|
|
148 print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
|
|
149 print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
|
|
150 }
|
|
151 }
|
|
152
|
|
153 #Scan d'inclusion strict
|
|
154 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
155 my %match1=%{$current_match_table[$i]};
|
|
156 for (my $j=0;$j<=$#current_match_table;$j++){
|
|
157 if (($j != $i)&&($duplicate[$j]==0)){ # On scan dans les deux sens, pas seulement $j = $i+1 a cause du last;
|
|
158 my %match2=%{$current_match_table[$j]};
|
|
159 # Inclus Subject
|
|
160 if (($match1{"Subject_start"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"}))
|
|
161 {
|
|
162 $duplicate[$i]=1;
|
|
163 last;
|
|
164 }
|
|
165 # Inclus Query
|
|
166 elsif (($match1{"Query_start"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"}))
|
|
167 {
|
|
168 $duplicate[$i]=2;
|
|
169 last;
|
|
170 }
|
|
171
|
|
172 }
|
|
173 }
|
|
174 }
|
|
175
|
|
176 my @current_match_table_filtered;
|
|
177 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
178 if ($duplicate[$i] == 0){
|
|
179 push (@current_match_table_filtered,$current_match_table[$i]);
|
|
180 }
|
|
181 else{
|
|
182 $stats_included++;
|
|
183 }
|
|
184 }
|
|
185
|
|
186 if ($#current_match_table > $#current_match_table_filtered){
|
|
187 @current_match_table = @current_match_table_filtered;
|
|
188 if ($VERBOSE eq "ON"){
|
|
189 print LF "Table Match filtered 1 ($#current_match_table)\n";
|
|
190 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
191 my %match=%{$current_match_table[$i]};
|
|
192 print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
|
|
193 print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
|
|
194 }
|
|
195 }
|
|
196 }
|
|
197
|
|
198 undef @duplicate;
|
|
199 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
200 push (@duplicate,0);
|
|
201 }
|
|
202 ########
|
|
203
|
|
204 ###Scan d'overlap trop important
|
|
205 # D'abord subject
|
|
206 for (my $i=0;$i<=$#current_match_table-1;$i++){
|
|
207 my %match1=%{$current_match_table[$i]};
|
|
208 for (my $j=$i+1;$j<=$#current_match_table;$j++){
|
|
209 my %match2=%{$current_match_table[$j]};
|
|
210 if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"}))
|
|
211 {
|
|
212 my $overlap_length = $match1{"Subject_end"} - $match2{"Subject_start"}+1;
|
|
213 my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1;
|
|
214 my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1;
|
|
215
|
|
216 if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
|
|
217 $duplicate[$j]=1;
|
|
218 }
|
|
219 elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
|
|
220 $duplicate[$i]=1;
|
|
221 }
|
|
222 else {
|
|
223 }
|
|
224 }
|
|
225 elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) #Recherche d'inclusion dans les deux sens car els match sont classé par query coord, par par subject coord
|
|
226 {
|
|
227 my $overlap_length = $match2{"Subject_end"} - $match1{"Subject_start"}+1;
|
|
228 my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1;
|
|
229 my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1;
|
|
230
|
|
231 if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
|
|
232 $duplicate[$j]=1;
|
|
233 }
|
|
234 elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
|
|
235 $duplicate[$i]=1;
|
|
236 }
|
|
237 else {
|
|
238 }
|
|
239 }
|
|
240 }
|
|
241 }
|
|
242
|
|
243 undef @current_match_table_filtered;
|
|
244 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
245 if ($duplicate[$i] == 0){
|
|
246 push (@current_match_table_filtered,$current_match_table[$i]);
|
|
247 }
|
|
248 else {
|
|
249 $stats_large_overlapping++;
|
|
250 }
|
|
251 }
|
|
252
|
|
253 if ($#current_match_table > $#current_match_table_filtered){
|
|
254 @current_match_table = @current_match_table_filtered;
|
|
255 if ($VERBOSE eq "ON"){
|
|
256 print LF "Table Match filtered 2 ($#current_match_table)\n";
|
|
257 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
258 my %match=%{$current_match_table[$i]};
|
|
259 print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
|
|
260 print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
|
|
261 }
|
|
262 }
|
|
263
|
|
264 }
|
|
265
|
|
266 undef @duplicate;
|
|
267 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
268 push (@duplicate,0);
|
|
269 }
|
|
270
|
|
271 # Ensuite Query (Subject puis Query pour evitez des deletions complementaires query update qui enleverait toutes les entrées)
|
|
272
|
|
273 for (my $i=0;$i<=$#current_match_table-1;$i++){
|
|
274 my %match1=%{$current_match_table[$i]};
|
|
275 for (my $j=$i+1;$j<=$#current_match_table;$j++){
|
|
276 my %match2=%{$current_match_table[$j]};
|
|
277 if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"}))
|
|
278 {
|
|
279 my $overlap_length = $match1{"Query_end"} - $match2{"Query_start"}+1;
|
|
280 my $length1 = $match1{"Query_end"} - $match1{"Query_start"} +1;
|
|
281 my $length2 = $match2{"Query_end"} - $match2{"Query_start"} +1;
|
|
282
|
|
283 if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
|
|
284 $duplicate[$j]=1;
|
|
285 }
|
|
286 elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
|
|
287 $duplicate[$i]=1;
|
|
288 }
|
|
289 else {
|
|
290 }
|
|
291 }
|
|
292 #un seul sans les query sont classés
|
|
293 }
|
|
294 }
|
|
295
|
|
296 undef @current_match_table_filtered;
|
|
297 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
298 if ($duplicate[$i] == 0){
|
|
299 push (@current_match_table_filtered,$current_match_table[$i]);
|
|
300 }
|
|
301 else {
|
|
302 $stats_large_overlapping++;
|
|
303 }
|
|
304 }
|
|
305
|
|
306 if ($#current_match_table > $#current_match_table_filtered){
|
|
307 @current_match_table = @current_match_table_filtered;
|
|
308 if ($VERBOSE eq "ON"){
|
|
309 print LF "Table Match filtered 3 ($#current_match_table)\n";
|
|
310 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
311 my %match=%{$current_match_table[$i]};
|
|
312 print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
|
|
313 print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
|
|
314 }
|
|
315 }
|
|
316 }
|
|
317
|
|
318
|
|
319
|
|
320 #Fusion des Hsp et Calcul des nouvelles metriques
|
|
321
|
|
322 my $overlap_length = 0;
|
|
323 my $Subject_coverage;
|
|
324 my $Query_coverage;
|
|
325 my $Identity;
|
|
326 my $nb_match = 0;
|
|
327
|
|
328
|
|
329 my $nb_covered_subject=0;
|
|
330 my $nb_covered_query=0;
|
|
331 my %match=%{$current_match_table[0]};
|
|
332 my $q_length = $match{"Query_length"};
|
|
333 my $sub_length = $match{"Subject_length"};
|
|
334 my $subject = $match{"Subject"};
|
|
335 my $min_query = $match{"Query_start"};
|
|
336 my $max_query = $match{"Query_end"};
|
|
337 my $min_subject = $match{"Subject_start"};
|
|
338 my $max_subject = $match{"Subject_end"};
|
|
339 my $orientation = $match{"Orientation"};
|
|
340 my $Query = $match{"Query"};
|
|
341 my $Subject_Id = $match{"Subject_id"};
|
|
342 my $Subject = $match{"Subject"};
|
|
343
|
|
344
|
|
345 for (my $i=0;$i<=$#current_match_table;$i++){
|
|
346 my %match1=%{$current_match_table[$i]};
|
|
347 $nb_covered_subject += $match1{"Subject_end"} - $match1{"Subject_start"} +1;
|
|
348 $nb_covered_query += $match1{"Query_end"} - $match1{"Query_start"} +1;
|
|
349
|
|
350 if ($match1{"Query_start"}<$min_query){
|
|
351 $min_query = $match1{"Query_start"};
|
|
352 }
|
|
353 if ($match1{"Query_end"}>$max_query){
|
|
354 $max_query = $match1{"Query_end"};
|
|
355 }
|
|
356 if ($match1{"Subject_start"}<$min_subject){
|
|
357 $min_subject = $match1{"Subject_start"};
|
|
358 }
|
|
359 if ($match1{"Subject_end"}>$max_subject){
|
|
360 $max_subject = $match1{"Subject_end"};
|
|
361 }
|
|
362
|
|
363 $nb_match += $match1{"Similarity"};
|
|
364
|
|
365 for (my $j=$i+1;$j<=$#current_match_table;$j++){
|
|
366 my $overlap_query=0;
|
|
367 my $overlap_subject=0;
|
|
368 my %match2=%{$current_match_table[$j]};
|
|
369
|
|
370 if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"}))
|
|
371 {
|
|
372 $overlap_subject = $match1{"Subject_end"} - $match2{"Subject_start"}+1;
|
|
373 }
|
|
374 elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"}))
|
|
375 {
|
|
376 $overlap_subject = $match2{"Subject_end"} - $match1{"Subject_start"}+1;
|
|
377 }
|
|
378
|
|
379 if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"}))
|
|
380 {
|
|
381 $overlap_query = $match1{"Query_end"} - $match2{"Query_start"} +1;
|
|
382 }
|
|
383 elsif (($match2{"Query_start"}<=$match1{"Query_start"})&&($match2{"Query_end"}>=$match1{"Query_start"})&&($match2{"Query_end"}<=$match1{"Query_end"}))
|
|
384 {
|
|
385 $overlap_query = $match2{"Query_end"} - $match1{"Query_start"}+1;
|
|
386 }
|
|
387
|
|
388 if ($overlap_query > $overlap_subject){
|
|
389 $overlap_length += $overlap_query;
|
|
390 }
|
|
391 else {
|
|
392 $overlap_length += $overlap_subject;
|
|
393 }
|
|
394
|
|
395 $nb_covered_subject -= $overlap_subject;
|
|
396 $nb_covered_query-= $overlap_query;
|
|
397 }
|
|
398
|
|
399 }
|
|
400 ### Cas des tblastx ou le nb match est en aa, et les nb_covered en base
|
|
401 if ($nb_match/$nb_covered_subject<0.34){
|
|
402 $nb_match = $nb_match *3;
|
|
403 }
|
|
404 ####
|
|
405
|
|
406 $Identity = sprintf("%.2f",($nb_match-$overlap_length)*100/$nb_covered_subject);
|
|
407
|
|
408 $Subject_coverage = sprintf("%.2f",$nb_covered_subject*100/$sub_length);
|
|
409 $Query_coverage = sprintf("%.2f",$nb_covered_query*100/$q_length);
|
|
410
|
|
411 if ($Subject_coverage<0.1){$stats_query_coverage{"0-10%"}++;}
|
|
412 elsif($Subject_coverage<0.2){$stats_query_coverage{"10-20%"}++;}
|
|
413 elsif($Subject_coverage<0.3){$stats_query_coverage{"20-30%"}++;}
|
|
414 elsif($Subject_coverage<0.4){$stats_query_coverage{"30-40%"}++;}
|
|
415 elsif($Subject_coverage<0.5){$stats_query_coverage{"40-50%"}++;}
|
|
416 elsif($Subject_coverage<0.6){$stats_query_coverage{"50-60%"}++;}
|
|
417 elsif($Subject_coverage<0.7){$stats_query_coverage{"60-70%"}++;}
|
|
418 elsif($Subject_coverage<0.8){$stats_query_coverage{"70-80%"}++;}
|
|
419 elsif($Subject_coverage<0.9){$stats_query_coverage{"80-90%"}++;}
|
|
420 else{$stats_query_coverage{"90-100%"}++;}
|
|
421
|
|
422 if ($VERBOSE eq "ON"){
|
|
423 print LF "Final\n";
|
|
424 print LF $Query,"\t",$Subject_Id,"\t",$orientation,"\t",$min_query,"\t",$max_query,"\t",$min_subject,"\t",$max_subject,"\t",$sub_length,"\t";
|
|
425 print LF "NB:",$nb_match,"\t","O:",$overlap_length,"\t","CQ:",$nb_covered_query,"\t","CS:",$nb_covered_subject,"\t",$Query_coverage,"\t",$Subject_coverage,"\t",$Identity,"\n";
|
|
426
|
|
427 }
|
|
428
|
|
429 if ($subject=~/^(.*?)\s*$/){
|
|
430 $subject = $1;
|
|
431 }
|
|
432 my %current_match_joined;
|
|
433 $current_match_joined{"Query"}=$Query;
|
|
434 $current_match_joined{"Query_start"}=$min_query;
|
|
435 $current_match_joined{"Query_end"}=$max_query;
|
|
436 $current_match_joined{"Query_length"}=$q_length;
|
|
437 $current_match_joined{"QCoverage"} = $Query_coverage;
|
|
438 $current_match_joined{"Subject_id"}=$Subject_Id;
|
|
439 $current_match_joined{"Subject"}=$subject;
|
|
440 $current_match_joined{"Subject_start"}=$min_subject;
|
|
441 $current_match_joined{"Subject_end"}=$max_subject;
|
|
442 $current_match_joined{"Subject_length"}=$sub_length;
|
|
443 $current_match_joined{"SCoverage"} = $Subject_coverage;
|
|
444 $current_match_joined{"Similarity"}=$Identity;
|
|
445 my $NBmatch = $nb_match-$overlap_length;
|
|
446 $current_match_joined{"Nbmatch"}=$NBmatch;
|
|
447 $current_match_joined{"Display"}="$Query\t$Subject_Id\t$orientation\t$Query_coverage%\t$Subject_coverage%\t$Identity%\t$min_query\t$max_query\t$min_subject\t$max_subject\t$NBmatch\t$q_length\t$sub_length\t$subject";
|
|
448
|
|
449 push(@match_joined,\%current_match_joined);
|
|
450 #print OJ $match_joined{"Display"},"\n";
|
|
451 }
|
|
452 my @match_joined_sorted = sort sortbyrelevanceandsubject @match_joined;
|
|
453 for (my $i=0;$i<=$#match_joined_sorted;$i++){
|
|
454 my %match = %{$match_joined_sorted[$i]};
|
|
455 print OJ $match{"Display"},"\n";
|
|
456 }
|
|
457 }
|
|
458
|
|
459
|
|
460 #my %all_match_joined_best;
|
|
461
|
|
462 #foreach my $key (sort sortkey keys %all_match_joined){
|
|
463 # my %match = %{$all_match_joined{$key}};
|
|
464 # print OJ $match{"Display"},"\n";
|
|
465 #}
|
|
466
|
|
467 #close (OB);
|
|
468 close (OJ);
|
|
469
|
|
470
|
|
471 print LF "Nb query : $#query_keys\n";
|
|
472 print LF "Nb match : $stats_nb_match\n";
|
|
473 print LF "Nb match filtered included / too large overlap : $stats_included / $stats_large_overlapping \n";
|
|
474 print LF "Query coverage\n";
|
|
475 print LF "percent:\t";
|
|
476 foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) {
|
|
477 print LF $key,"\t";
|
|
478 }
|
|
479 print LF "\n number :\t";
|
|
480 foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) {
|
|
481 print LF $stats_query_coverage{$key},"\t";
|
|
482 }
|
|
483 print LF "\n";
|
|
484
|
|
485
|
|
486 close (LF);
|
|
487
|
|
488
|
|
489 # for (my $i=0;$i<=$#all_match_joined;$i++){
|
|
490 # my $match_joined = %{$all_match_joined[$i]};
|
|
491 # print $match_joined{"Query"},"\t",$match_joined{"Subject"},"\t",$match_joined{"Subject_id"},"\t",$match_joined{"Similarity"},"\t",$match_joined{"Query_length"},"\t",$match_joined{"Subject_length"},"\n";
|
|
492 # }
|
|
493
|
|
494
|
|
495 sub mysort{
|
|
496 my %matcha=%{$a};
|
|
497 my %matchb=%{$b};
|
|
498
|
|
499 #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n";
|
|
500
|
|
501 $matcha{"Query_start"} <=> $matchb{"Query_start"}
|
|
502 ||
|
|
503 $matcha{"Query_end"} <=> $matchb{"Query_end"}
|
|
504
|
|
505 }
|
|
506
|
|
507 sub sortbyquerycoord{
|
|
508 my %matcha=%{$a};
|
|
509 my %matchb=%{$b};
|
|
510
|
|
511 #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n";
|
|
512
|
|
513 $matcha{"Query_start"} <=> $matchb{"Query_start"}
|
|
514 ||
|
|
515 $matcha{"Query_end"} <=> $matchb{"Query_end"}
|
|
516
|
|
517 }
|
|
518
|
|
519 sub sortbyrelevanceandsubject{
|
|
520 my %matcha=%{$a};
|
|
521 my %matchb=%{$b};
|
|
522
|
|
523 $matchb{"Nbmatch"} <=> $matcha{"Nbmatch"}
|
|
524 ||
|
|
525 $matchb{"QCoverage"} <=> $matcha{"QCoverage"}
|
|
526 ||
|
|
527 $matcha{"Subject"} cmp $matchb{"Subject"}
|
|
528 }
|
|
529
|
|
530
|
|
531 sub sortkey {
|
|
532 my @fieldsa = split (/\#/,$a);
|
|
533 my @fieldsb = split (/\#/,$b);
|
|
534
|
|
535 #print "$a\n$b\n";
|
|
536 #print $fieldsa[0]," cmp ",$fieldsb[0],"\n";
|
|
537 #exit(0);
|
|
538
|
|
539 $fieldsa[0] cmp $fieldsb[0]
|
|
540 ||
|
|
541 $fieldsa[1] cmp $fieldsb[1]
|
|
542 ||
|
|
543 $fieldsa[2] <=> $fieldsb[2]
|
|
544
|
|
545 }
|