0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64';
|
|
5 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi';
|
|
6 use XML::Parser;
|
|
7
|
|
8 ##############################################################################################
|
|
9 #
|
|
10 # This parser parses output files from blastall (blastp) and outputs one-line descriptions
|
|
11 # for each hit with a score above or equal to a cut-off score that is specified as a parameter
|
|
12 # to the program. The one-line description contains the following ifomation separated by tab
|
|
13 # * Query id
|
|
14 # * Hit id
|
|
15 # * Bit score
|
|
16 # * Query length
|
|
17 # * Hit length
|
|
18 # * Length of longest segment on query. This is defined as the length of the segment from the
|
|
19 # first position od the first hsp to the last position of the last hsp. I.e. if the
|
|
20 # hsps are 1-10 and 90-100, the length of the longest segment will be 100.
|
|
21 # * Length of longest segment on hit, see explanation above.
|
|
22 # * Total match length on query. This is defined as the total length of all hsps. If the hsps
|
|
23 # are 1-10 and 90-100, the length will be 20.
|
|
24 # * Total match length on hit. see explanation above
|
|
25 # * Positions for all segments on the query and on the hit. Positions on the query is written
|
|
26 # as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are
|
|
27 # specified separated nt tab. Positions for query and hut for a segment is separated by
|
|
28 # one whitespace.
|
|
29 #
|
|
30
|
|
31 # MODIFICATION: also add %identity as next output field
|
|
32 # HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins
|
|
33
|
|
34 # If alignment mode is used, the parser prints a > before all data on the one-line description.
|
|
35 # The next two lines will contain the aligned sequences, including gaps. Alignment mode is off
|
|
36 # by default but can be used if the second parameter sent to the parser is -a. If alignment
|
|
37 # mode is not used only the score cutoff and the Blast XML file should be sent to the parser.
|
|
38 #
|
|
39 # NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or
|
|
40 # blast with -VT to emulate the old behaviour the code in this script must be changed a bit.
|
|
41 # The neccesary changes are few and the first change is to un-comment all lines
|
|
42 # marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple
|
|
43 # of places in the My::TagHandler package (In the Init subroutine). The other change neccessary
|
|
44 # is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST.
|
|
45 #
|
|
46 # Written by Isabella Pekkari 2007-11-19
|
|
47 # Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling
|
|
48 #
|
|
49 ##############################################################################################
|
|
50
|
|
51 # First argument is score cutt-of in bits
|
|
52 # Second argument is beta threshold for composition filtering. Set to 0.0 to disable.
|
|
53 # Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed
|
|
54 my $score_cutoff = shift;
|
|
55
|
|
56
|
|
57 # If alignment mode shall be used, the flag -a is specified as the first parameter
|
|
58 # Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used.
|
|
59 my $alignment_mode = 0;
|
|
60 if (defined $ARGV[0] && $ARGV[0] eq '-a') {
|
|
61
|
|
62 $alignment_mode = 1;
|
|
63 shift;
|
|
64 }
|
|
65
|
|
66 # If segments must be ordered in a linear fashion on both sequences.
|
|
67 # If 1, the hsp order must be the same on the query and the hit.
|
|
68 # For example, if the n-terminal of the hit matches the c-terminal of
|
|
69 # the query another segment where the c-terminal of the hit matches
|
|
70 # the n-terminal of the hit si not allowed.
|
|
71 # If this parameter is set to 0, hsps can be ordered differently on the
|
|
72 # query and the hit. In both cases, an overlap of maximum 5% of the shortest segment
|
|
73 # is allowed.
|
|
74 my $linear_segment_mode = 1;
|
|
75
|
|
76 # Only for older versions of blast
|
|
77 ##my $done = 0;
|
|
78
|
|
79
|
|
80
|
|
81 # Create an xml parser
|
|
82 my $p = XML::Parser->new( Style => 'My::TagHandler', );
|
|
83
|
|
84 # Create an Expat::NB parser
|
|
85 my $nb_p = $p->parse_start();
|
|
86
|
|
87 # Parse the xml file
|
|
88 parse_document();
|
|
89
|
|
90 sub parse_document {
|
|
91
|
|
92 while(my $l = <>){
|
|
93 ##if ($done) {
|
|
94
|
|
95 ##$nb_p->parse_done;
|
|
96 ##$nb_p = $p->parse_start();
|
|
97 ##$done = 0;
|
|
98
|
|
99 ##} else {
|
|
100 # Remove whitespace at the end of the line
|
|
101 chomp($l);
|
|
102 ##/DEBUG
|
|
103 # print STDERR "###" .$l . "###\n";
|
|
104 # Parse the line
|
|
105 $nb_p->parse_more($l);
|
|
106 ##}
|
|
107 }
|
|
108
|
|
109 # Shut the parser down
|
|
110 $nb_p->parse_done;
|
|
111
|
|
112 }
|
|
113
|
|
114 # Only used for older versions of blast
|
|
115 ##sub next_doc {
|
|
116
|
|
117 ## $done = 1;
|
|
118
|
|
119 ##}
|
|
120
|
|
121
|
|
122
|
|
123 ############################################
|
|
124 # Package for parsing xml files obtained
|
|
125 # from blast.
|
|
126 ############################################
|
|
127 package My::TagHandler;
|
|
128
|
|
129 use strict;
|
|
130 use warnings;
|
|
131
|
|
132 #Subroutines
|
|
133 my $create_query;
|
|
134 my $store_query_length;
|
|
135 my $new_hit;
|
|
136 my $save_hit_length;
|
|
137 my $new_hsp;
|
|
138 my $save_hsp_start_query;
|
|
139 my $save_hsp_end_query;
|
|
140 my $save_hsp_start_hit;
|
|
141 my $save_hsp_end_hit;
|
|
142 my $save_hsp_qseq;
|
|
143 my $save_hsp_hseq;
|
|
144 my $end_hit;
|
|
145 my $print_query;
|
|
146 my $check_if_overlap_non_linear_allowed;
|
|
147 my $check_if_overlap_linear;
|
|
148
|
|
149 my @current_query; # The current query
|
|
150 my $current_hit; # The current hit
|
|
151 my $current_hsp; # The current hsp
|
|
152
|
|
153 my $currentText = ''; # Text from the current tag
|
|
154
|
|
155 # Names of tags that shall be parsed as key and references to
|
|
156 # the anonymous subroutine that shall be run when parsing the tag as value.
|
|
157 my %tags_to_parse;
|
|
158
|
|
159 # Reference to the anonymous subroutine that shall be run
|
|
160 # to check whether there is an overlap between two hsps.
|
|
161 my $overlap_sub;
|
|
162
|
|
163 sub Init {
|
|
164
|
|
165 my($expat) = @_;
|
|
166
|
|
167 #Subroutine for creating new query
|
|
168 $create_query = sub {
|
|
169
|
|
170 # The the query id. Additional information for the query is ignored.
|
|
171 my @tmp = split(/\s+/, $currentText);
|
|
172 @current_query = ($tmp[0], 0, [])
|
|
173 };
|
|
174
|
|
175 #Subroutine for storing query length
|
|
176 $store_query_length = sub {$current_query[1] = $currentText};
|
|
177
|
|
178 #Subroutine for creating new hit
|
|
179 $new_hit = sub {
|
|
180
|
|
181 # The the hit id. Additional information for the hit is ignored.
|
|
182 my @tmp = split(/\s+/, $currentText);
|
|
183 $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0];
|
|
184 };
|
|
185
|
|
186 #Subroutine for saving hit length
|
|
187 $save_hit_length = sub {$current_hit->[1] = $currentText};
|
|
188
|
|
189 #Subroutine for creating new hsp
|
|
190 $new_hsp = sub {$current_hsp = [$currentText]};
|
|
191
|
|
192 # Subroutine for saving hsp start on query
|
|
193 $save_hsp_start_query = sub {$current_hsp->[1] = $currentText};
|
|
194
|
|
195 # Subroutine for saving hsp end on query
|
|
196 $save_hsp_end_query = sub {$current_hsp->[2] = $currentText};
|
|
197
|
|
198 # Subroutine for saving hsp start on hit
|
|
199 $save_hsp_start_hit = sub {$current_hsp->[3] = $currentText};
|
|
200
|
|
201 # Subroutine for saving hsp end on hit
|
|
202 $save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;};
|
|
203
|
|
204 # Subroutine for saving hsp query sequence (as in alignment)
|
|
205 $save_hsp_qseq = sub {$current_hsp->[5] = $currentText;};
|
|
206
|
|
207 # Subroutine for saving hsp hit sequence (as in alignment)
|
|
208 $save_hsp_hseq = sub {$current_hsp->[6] = $currentText;
|
|
209
|
|
210 # Check if this hsp overlaps with any of the
|
|
211 # existing hsps
|
|
212 my $overlap_found = 0;
|
|
213 foreach my $existing_hsp (@{$current_hit->[5]}) {
|
|
214
|
|
215 if ($overlap_sub->($current_hsp, $existing_hsp)) {
|
|
216 $overlap_found = 1;
|
|
217 last;
|
|
218 }
|
|
219
|
|
220 }
|
|
221
|
|
222 # If this hsp does not overlap with any hsp it is added
|
|
223
|
|
224
|
|
225 unless ($overlap_found) {
|
|
226
|
|
227 # Add the hsp to the hit
|
|
228 push( @{$current_hit->[5]}, $current_hsp);
|
|
229
|
|
230 # Increase number of hsps for the hit with one
|
|
231 $current_hit->[6]++;
|
|
232
|
|
233 # Add the hsp score to the total score
|
|
234 $current_hit->[2] += $current_hsp->[0];
|
|
235
|
|
236 # Add the hsp length on the query to the total hsp length on the query
|
|
237 $current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1);
|
|
238
|
|
239 # Add the hsp length on the hit to the total hsp length on the hit
|
|
240 $current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1);
|
|
241 }
|
|
242
|
|
243 };
|
|
244
|
|
245 # Subroutine for saving hit
|
|
246 $end_hit = sub {
|
|
247
|
|
248 #Add the hit to the qurrent query
|
|
249 unless ($current_hit->[2] < $score_cutoff ) {
|
|
250 push( @{$current_query[2]}, $current_hit );
|
|
251 }
|
|
252 };
|
|
253
|
|
254 # Subroutine for printing all hits for a query
|
|
255 $print_query = sub {
|
|
256
|
|
257 # Sort the hits after score with hit with highest score first
|
|
258 my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]};
|
|
259
|
|
260 # For all hits
|
|
261 foreach my $hit (@sorted_hits) {
|
|
262
|
|
263 if ($alignment_mode) {
|
|
264
|
|
265 # When alignment mode is used, self hits are not printed
|
|
266 # Therefore, the hit is ignored if it has the same id as the query
|
|
267 next if ($current_query[0] eq $hit->[0]);
|
|
268
|
|
269 # When alignment mode is used, a > is printed at the start of
|
|
270 # lines containing data, i.e. lines that do not contain sequences
|
|
271 print ">";
|
|
272 }
|
|
273
|
|
274 print "$current_query[0]\t"; # Print query id
|
|
275 print "$hit->[0]\t"; # Print hit id
|
|
276 printf ("%.1f\t", $hit->[2]); # Print bit score
|
|
277 print "$current_query[1]\t"; # Print query length
|
|
278 print "$hit->[1]\t"; # Print hit length
|
|
279
|
|
280 if ($hit->[6] > 1) { # if more than one segment
|
|
281
|
|
282 # Sort hsps on query
|
|
283 my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]};
|
|
284
|
|
285 # Sort hsps on hit
|
|
286 my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]};
|
|
287
|
|
288 # Get total segment length on query
|
|
289 my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1;
|
|
290
|
|
291 # Get total segment length on hit
|
|
292 my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1;
|
|
293
|
|
294 print "$segm_length_query\t"; # Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp)
|
|
295 print "$segm_length_hit\t"; # Print segment length on query
|
|
296 print "$hit->[3]\t$hit->[4]\t"; # Print total length of all hsps on the query and on the match, i.e length of actually matching region
|
|
297
|
|
298 # In alignment mode, the aligned sequences shall be printed on the lines following the data line
|
|
299 my $hsp_qseq = '';
|
|
300 my $hsp_hseq = '';
|
|
301
|
|
302 # Print query and hit segment positions
|
|
303 foreach my $segment (@hsps_sorted_on_query) {
|
|
304
|
|
305 print "q:$segment->[1]-$segment->[2] ";
|
|
306 print "h:$segment->[3]-$segment->[4]\t";
|
|
307
|
|
308 if ($alignment_mode) {
|
|
309
|
|
310 # Add the hsp sequences to the total sequence
|
|
311 $hsp_qseq .= $segment->[5];
|
|
312 $hsp_hseq .= $segment->[6];
|
|
313 }
|
|
314
|
|
315 }
|
|
316
|
|
317 print "\n";
|
|
318
|
|
319 if ($alignment_mode) {
|
|
320
|
|
321 # Print sequences
|
|
322 print "$hsp_qseq\n";
|
|
323 print "$hsp_hseq\n";
|
|
324 }
|
|
325
|
|
326 } else {
|
|
327 # Get the only hsp that was found for this hit
|
|
328 my $segment = $hit->[5]->[0];
|
|
329
|
|
330 # Get total segment length on query
|
|
331 my $segm_length_query = $segment->[2] - $segment->[1] + 1;
|
|
332
|
|
333 # Get total segment length on hit
|
|
334 my $segm_length_hit = $segment->[4] - $segment->[3] + 1;
|
|
335
|
|
336 print "$segm_length_query\t"; # Print segment length on query, i.e. length of this hsp on query
|
|
337 print "$segm_length_hit\t"; # Print segment length on hit, i.e. length of this hsp on hit
|
|
338
|
|
339 # Print total length of all hsps on the query and on the match, i.e length of actually matching region
|
|
340 # Sice there is only one segment, these lengths will be the same as the segment lengths printed above.
|
|
341 print "$hit->[3]\t$hit->[4]\t";
|
|
342
|
|
343 # Print segment positions
|
|
344
|
|
345 print "q:$segment->[1]-$segment->[2] ";
|
|
346 print "h:$segment->[3]-$segment->[4]\t";
|
|
347 print "\n";
|
|
348
|
|
349 if ($alignment_mode) {
|
|
350
|
|
351 # Print sequences
|
|
352 print "$segment->[5]\n";
|
|
353 print "$segment->[6]\n";
|
|
354 }
|
|
355
|
|
356 }
|
|
357 }
|
|
358 ##main::next_doc(); #NB! Un-comment for older blast versions
|
|
359
|
|
360 };
|
|
361
|
|
362 # Subroutine for checking if two hsps overlap.
|
|
363 # When this subroutine is used, non-linear arrangements of the hsps are allowed.
|
|
364 $check_if_overlap_non_linear_allowed = sub {
|
|
365
|
|
366 my $hsp1 = shift; # One hsp
|
|
367 my $hsp2 = shift; # Another hsp
|
|
368
|
|
369 # Check if there is an overlap.
|
|
370 return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2])
|
|
371 || _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4]));
|
|
372
|
|
373 };
|
|
374
|
|
375 # Subroutine for checking if two hsps overlap.
|
|
376 # When this subroutine is used, non-linear arrangements of the hsps are not allowed.
|
|
377 $check_if_overlap_linear = sub {
|
|
378
|
|
379 my $hsp1 = shift; # One hsp
|
|
380 my $hsp2 = shift; # Another hsp
|
|
381
|
|
382 # Get start point for hsp1 on query
|
|
383 my $start1_hsp1 = $hsp1->[1];
|
|
384
|
|
385 # Get start point for hsp2 on query
|
|
386 my $start1_hsp2 = $hsp2->[1];
|
|
387
|
|
388 # The hsp that comes first oon the query (N-terminal hsp)
|
|
389 my $first_hsp;
|
|
390
|
|
391 # The hsp that comes last on the query
|
|
392 my $last_hsp;
|
|
393
|
|
394 # Check which hsp is N-teminal.
|
|
395 if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence)
|
|
396 return 1;
|
|
397 } elsif ($start1_hsp1 < $start1_hsp2) {
|
|
398
|
|
399 $first_hsp = $hsp1;
|
|
400 $last_hsp = $hsp2;
|
|
401
|
|
402 } else {
|
|
403
|
|
404 $first_hsp = $hsp2;
|
|
405 $last_hsp = $hsp1;
|
|
406
|
|
407 }
|
|
408
|
|
409 # Return whether there is an overlap or not.
|
|
410 return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2])
|
|
411 || _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4]));
|
|
412 };
|
|
413
|
|
414 %tags_to_parse = (
|
|
415
|
|
416 ##'BlastOutput_query-def' => $create_query,
|
|
417 ##'BlastOutput_query-def' => $create_query,
|
|
418 ## 'BlastOutput_query-def' => $create_query,
|
|
419 ## 'BlastOutput_query-len' => $store_query_length,
|
|
420 'Iteration_query-def' => $create_query, # Query id #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
|
|
421 'Iteration_query-len' => $store_query_length, # Query length #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
|
|
422 'Hit_def' => $new_hit, # Hit id
|
|
423 'Hit_len' => $save_hit_length, # Hit length
|
|
424 'Hsp_bit-score' => $new_hsp, # Hsp bit score
|
|
425 'Hsp_query-from' => $save_hsp_start_query, # Start point for hsp on query
|
|
426 'Hsp_query-to' => $save_hsp_end_query, # End point for hsp on query
|
|
427 'Hsp_hit-from' => $save_hsp_start_hit, # Start position for hsp on hit
|
|
428 'Hsp_hit-to' => $save_hsp_end_hit, # End position for hsp on hit
|
|
429 'Hsp_qseq' => $save_hsp_qseq, # Hsp query sequence (gapped as in the alignment)
|
|
430 'Hsp_hseq' => $save_hsp_hseq, # Hsp hit sequence (gapped as in the alignment)
|
|
431 'Hit_hsps' => $end_hit, # End of hit
|
|
432 ##'BlastOutput' => $print_query
|
|
433 'Iteration' => $print_query # End of query #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
|
|
434 );
|
|
435
|
|
436 # Set overlap subroutine to use
|
|
437 if ($linear_segment_mode eq 1) {
|
|
438 $overlap_sub = $check_if_overlap_linear;
|
|
439 } else {
|
|
440 $overlap_sub = $check_if_overlap_non_linear_allowed;
|
|
441 }
|
|
442 }
|
|
443
|
|
444 # Nothing is done when encountering a start tag
|
|
445 #sub Start
|
|
446 #{
|
|
447
|
|
448 #}
|
|
449
|
|
450 sub End {
|
|
451
|
|
452 my($expat, $tag) = @_;
|
|
453
|
|
454 # If the name of the tag is in the table with names of tags to parse,
|
|
455 # run the corresponding subroutine
|
|
456 $tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()};
|
|
457
|
|
458 }
|
|
459
|
|
460 sub Char
|
|
461 {
|
|
462 my($expat, $text) = @_;
|
|
463 # Save the tag text
|
|
464 $currentText = $text;
|
|
465 }
|
|
466
|
|
467 sub _check_overlap_linear {
|
|
468
|
|
469 my ($start1, $end1, $start2, $end2) = @_;
|
|
470
|
|
471 # Length of segment 1
|
|
472 my $length1 = $end1 - $start1 + 1;
|
|
473
|
|
474 # Length of segment 2
|
|
475 my $length2 = $end2 - $start2 + 1;
|
|
476
|
|
477 # Get the length of the sortest of these segments
|
|
478 my $shortest_length = ($length1 < $length2)?$length1:$length2;
|
|
479
|
|
480 # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed
|
|
481 return (($start2 - $end1 - 1) / $shortest_length < - 0.05);
|
|
482 }
|
|
483
|
|
484
|
|
485
|
|
486 sub _check_overlap_non_linear_allowed {
|
|
487
|
|
488 my ($start1, $end1, $start2, $end2) = @_;
|
|
489
|
|
490 # Length of segment 1
|
|
491 my $length1 = $end1 - $start1 + 1;
|
|
492
|
|
493 # Length of segment 2
|
|
494 my $length2 = $end2 - $start2 + 1;
|
|
495
|
|
496 # Get the length of the sortest of these segments
|
|
497 my $shortest_length = ($length1 < $length2)?$length1:$length2;
|
|
498
|
|
499 if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence)
|
|
500 return 1;
|
|
501 } elsif ($start1 < $start2) {
|
|
502
|
|
503 if (($start2 - $end1 + 1) / $shortest_length < - 0.05) {
|
|
504 return 1;
|
|
505 }
|
|
506
|
|
507 } else {
|
|
508
|
|
509 if (($start1 - $end2 + 1) / $shortest_length < - 0.05) {
|
|
510 return 1;
|
|
511 }
|
|
512
|
|
513 }
|
|
514
|
|
515 # If the method has not returned yet, there is no overlap
|
|
516 return 0;
|
|
517 }
|