comparison PGAP-1.2.1/blast_parser.pl @ 0:83e62a1aeeeb draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 13:51:52 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:83e62a1aeeeb
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64';
5 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi';
6 use XML::Parser;
7
8 ##############################################################################################
9 #
10 # This parser parses output files from blastall (blastp) and outputs one-line descriptions
11 # for each hit with a score above or equal to a cut-off score that is specified as a parameter
12 # to the program. The one-line description contains the following ifomation separated by tab
13 # * Query id
14 # * Hit id
15 # * Bit score
16 # * Query length
17 # * Hit length
18 # * Length of longest segment on query. This is defined as the length of the segment from the
19 # first position od the first hsp to the last position of the last hsp. I.e. if the
20 # hsps are 1-10 and 90-100, the length of the longest segment will be 100.
21 # * Length of longest segment on hit, see explanation above.
22 # * Total match length on query. This is defined as the total length of all hsps. If the hsps
23 # are 1-10 and 90-100, the length will be 20.
24 # * Total match length on hit. see explanation above
25 # * Positions for all segments on the query and on the hit. Positions on the query is written
26 # as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are
27 # specified separated nt tab. Positions for query and hut for a segment is separated by
28 # one whitespace.
29 #
30
31 # MODIFICATION: also add %identity as next output field
32 # HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins
33
34 # If alignment mode is used, the parser prints a > before all data on the one-line description.
35 # The next two lines will contain the aligned sequences, including gaps. Alignment mode is off
36 # by default but can be used if the second parameter sent to the parser is -a. If alignment
37 # mode is not used only the score cutoff and the Blast XML file should be sent to the parser.
38 #
39 # NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or
40 # blast with -VT to emulate the old behaviour the code in this script must be changed a bit.
41 # The neccesary changes are few and the first change is to un-comment all lines
42 # marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple
43 # of places in the My::TagHandler package (In the Init subroutine). The other change neccessary
44 # is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST.
45 #
46 # Written by Isabella Pekkari 2007-11-19
47 # Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling
48 #
49 ##############################################################################################
50
51 # First argument is score cutt-of in bits
52 # Second argument is beta threshold for composition filtering. Set to 0.0 to disable.
53 # Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed
54 my $score_cutoff = shift;
55
56
57 # If alignment mode shall be used, the flag -a is specified as the first parameter
58 # Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used.
59 my $alignment_mode = 0;
60 if (defined $ARGV[0] && $ARGV[0] eq '-a') {
61
62 $alignment_mode = 1;
63 shift;
64 }
65
66 # If segments must be ordered in a linear fashion on both sequences.
67 # If 1, the hsp order must be the same on the query and the hit.
68 # For example, if the n-terminal of the hit matches the c-terminal of
69 # the query another segment where the c-terminal of the hit matches
70 # the n-terminal of the hit si not allowed.
71 # If this parameter is set to 0, hsps can be ordered differently on the
72 # query and the hit. In both cases, an overlap of maximum 5% of the shortest segment
73 # is allowed.
74 my $linear_segment_mode = 1;
75
76 # Only for older versions of blast
77 ##my $done = 0;
78
79
80
81 # Create an xml parser
82 my $p = XML::Parser->new( Style => 'My::TagHandler', );
83
84 # Create an Expat::NB parser
85 my $nb_p = $p->parse_start();
86
87 # Parse the xml file
88 parse_document();
89
90 sub parse_document {
91
92 while(my $l = <>){
93 ##if ($done) {
94
95 ##$nb_p->parse_done;
96 ##$nb_p = $p->parse_start();
97 ##$done = 0;
98
99 ##} else {
100 # Remove whitespace at the end of the line
101 chomp($l);
102 ##/DEBUG
103 # print STDERR "###" .$l . "###\n";
104 # Parse the line
105 $nb_p->parse_more($l);
106 ##}
107 }
108
109 # Shut the parser down
110 $nb_p->parse_done;
111
112 }
113
114 # Only used for older versions of blast
115 ##sub next_doc {
116
117 ## $done = 1;
118
119 ##}
120
121
122
123 ############################################
124 # Package for parsing xml files obtained
125 # from blast.
126 ############################################
127 package My::TagHandler;
128
129 use strict;
130 use warnings;
131
132 #Subroutines
133 my $create_query;
134 my $store_query_length;
135 my $new_hit;
136 my $save_hit_length;
137 my $new_hsp;
138 my $save_hsp_start_query;
139 my $save_hsp_end_query;
140 my $save_hsp_start_hit;
141 my $save_hsp_end_hit;
142 my $save_hsp_qseq;
143 my $save_hsp_hseq;
144 my $end_hit;
145 my $print_query;
146 my $check_if_overlap_non_linear_allowed;
147 my $check_if_overlap_linear;
148
149 my @current_query; # The current query
150 my $current_hit; # The current hit
151 my $current_hsp; # The current hsp
152
153 my $currentText = ''; # Text from the current tag
154
155 # Names of tags that shall be parsed as key and references to
156 # the anonymous subroutine that shall be run when parsing the tag as value.
157 my %tags_to_parse;
158
159 # Reference to the anonymous subroutine that shall be run
160 # to check whether there is an overlap between two hsps.
161 my $overlap_sub;
162
163 sub Init {
164
165 my($expat) = @_;
166
167 #Subroutine for creating new query
168 $create_query = sub {
169
170 # The the query id. Additional information for the query is ignored.
171 my @tmp = split(/\s+/, $currentText);
172 @current_query = ($tmp[0], 0, [])
173 };
174
175 #Subroutine for storing query length
176 $store_query_length = sub {$current_query[1] = $currentText};
177
178 #Subroutine for creating new hit
179 $new_hit = sub {
180
181 # The the hit id. Additional information for the hit is ignored.
182 my @tmp = split(/\s+/, $currentText);
183 $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0];
184 };
185
186 #Subroutine for saving hit length
187 $save_hit_length = sub {$current_hit->[1] = $currentText};
188
189 #Subroutine for creating new hsp
190 $new_hsp = sub {$current_hsp = [$currentText]};
191
192 # Subroutine for saving hsp start on query
193 $save_hsp_start_query = sub {$current_hsp->[1] = $currentText};
194
195 # Subroutine for saving hsp end on query
196 $save_hsp_end_query = sub {$current_hsp->[2] = $currentText};
197
198 # Subroutine for saving hsp start on hit
199 $save_hsp_start_hit = sub {$current_hsp->[3] = $currentText};
200
201 # Subroutine for saving hsp end on hit
202 $save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;};
203
204 # Subroutine for saving hsp query sequence (as in alignment)
205 $save_hsp_qseq = sub {$current_hsp->[5] = $currentText;};
206
207 # Subroutine for saving hsp hit sequence (as in alignment)
208 $save_hsp_hseq = sub {$current_hsp->[6] = $currentText;
209
210 # Check if this hsp overlaps with any of the
211 # existing hsps
212 my $overlap_found = 0;
213 foreach my $existing_hsp (@{$current_hit->[5]}) {
214
215 if ($overlap_sub->($current_hsp, $existing_hsp)) {
216 $overlap_found = 1;
217 last;
218 }
219
220 }
221
222 # If this hsp does not overlap with any hsp it is added
223
224
225 unless ($overlap_found) {
226
227 # Add the hsp to the hit
228 push( @{$current_hit->[5]}, $current_hsp);
229
230 # Increase number of hsps for the hit with one
231 $current_hit->[6]++;
232
233 # Add the hsp score to the total score
234 $current_hit->[2] += $current_hsp->[0];
235
236 # Add the hsp length on the query to the total hsp length on the query
237 $current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1);
238
239 # Add the hsp length on the hit to the total hsp length on the hit
240 $current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1);
241 }
242
243 };
244
245 # Subroutine for saving hit
246 $end_hit = sub {
247
248 #Add the hit to the qurrent query
249 unless ($current_hit->[2] < $score_cutoff ) {
250 push( @{$current_query[2]}, $current_hit );
251 }
252 };
253
254 # Subroutine for printing all hits for a query
255 $print_query = sub {
256
257 # Sort the hits after score with hit with highest score first
258 my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]};
259
260 # For all hits
261 foreach my $hit (@sorted_hits) {
262
263 if ($alignment_mode) {
264
265 # When alignment mode is used, self hits are not printed
266 # Therefore, the hit is ignored if it has the same id as the query
267 next if ($current_query[0] eq $hit->[0]);
268
269 # When alignment mode is used, a > is printed at the start of
270 # lines containing data, i.e. lines that do not contain sequences
271 print ">";
272 }
273
274 print "$current_query[0]\t"; # Print query id
275 print "$hit->[0]\t"; # Print hit id
276 printf ("%.1f\t", $hit->[2]); # Print bit score
277 print "$current_query[1]\t"; # Print query length
278 print "$hit->[1]\t"; # Print hit length
279
280 if ($hit->[6] > 1) { # if more than one segment
281
282 # Sort hsps on query
283 my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]};
284
285 # Sort hsps on hit
286 my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]};
287
288 # Get total segment length on query
289 my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1;
290
291 # Get total segment length on hit
292 my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1;
293
294 print "$segm_length_query\t"; # Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp)
295 print "$segm_length_hit\t"; # Print segment length on query
296 print "$hit->[3]\t$hit->[4]\t"; # Print total length of all hsps on the query and on the match, i.e length of actually matching region
297
298 # In alignment mode, the aligned sequences shall be printed on the lines following the data line
299 my $hsp_qseq = '';
300 my $hsp_hseq = '';
301
302 # Print query and hit segment positions
303 foreach my $segment (@hsps_sorted_on_query) {
304
305 print "q:$segment->[1]-$segment->[2] ";
306 print "h:$segment->[3]-$segment->[4]\t";
307
308 if ($alignment_mode) {
309
310 # Add the hsp sequences to the total sequence
311 $hsp_qseq .= $segment->[5];
312 $hsp_hseq .= $segment->[6];
313 }
314
315 }
316
317 print "\n";
318
319 if ($alignment_mode) {
320
321 # Print sequences
322 print "$hsp_qseq\n";
323 print "$hsp_hseq\n";
324 }
325
326 } else {
327 # Get the only hsp that was found for this hit
328 my $segment = $hit->[5]->[0];
329
330 # Get total segment length on query
331 my $segm_length_query = $segment->[2] - $segment->[1] + 1;
332
333 # Get total segment length on hit
334 my $segm_length_hit = $segment->[4] - $segment->[3] + 1;
335
336 print "$segm_length_query\t"; # Print segment length on query, i.e. length of this hsp on query
337 print "$segm_length_hit\t"; # Print segment length on hit, i.e. length of this hsp on hit
338
339 # Print total length of all hsps on the query and on the match, i.e length of actually matching region
340 # Sice there is only one segment, these lengths will be the same as the segment lengths printed above.
341 print "$hit->[3]\t$hit->[4]\t";
342
343 # Print segment positions
344
345 print "q:$segment->[1]-$segment->[2] ";
346 print "h:$segment->[3]-$segment->[4]\t";
347 print "\n";
348
349 if ($alignment_mode) {
350
351 # Print sequences
352 print "$segment->[5]\n";
353 print "$segment->[6]\n";
354 }
355
356 }
357 }
358 ##main::next_doc(); #NB! Un-comment for older blast versions
359
360 };
361
362 # Subroutine for checking if two hsps overlap.
363 # When this subroutine is used, non-linear arrangements of the hsps are allowed.
364 $check_if_overlap_non_linear_allowed = sub {
365
366 my $hsp1 = shift; # One hsp
367 my $hsp2 = shift; # Another hsp
368
369 # Check if there is an overlap.
370 return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2])
371 || _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4]));
372
373 };
374
375 # Subroutine for checking if two hsps overlap.
376 # When this subroutine is used, non-linear arrangements of the hsps are not allowed.
377 $check_if_overlap_linear = sub {
378
379 my $hsp1 = shift; # One hsp
380 my $hsp2 = shift; # Another hsp
381
382 # Get start point for hsp1 on query
383 my $start1_hsp1 = $hsp1->[1];
384
385 # Get start point for hsp2 on query
386 my $start1_hsp2 = $hsp2->[1];
387
388 # The hsp that comes first oon the query (N-terminal hsp)
389 my $first_hsp;
390
391 # The hsp that comes last on the query
392 my $last_hsp;
393
394 # Check which hsp is N-teminal.
395 if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence)
396 return 1;
397 } elsif ($start1_hsp1 < $start1_hsp2) {
398
399 $first_hsp = $hsp1;
400 $last_hsp = $hsp2;
401
402 } else {
403
404 $first_hsp = $hsp2;
405 $last_hsp = $hsp1;
406
407 }
408
409 # Return whether there is an overlap or not.
410 return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2])
411 || _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4]));
412 };
413
414 %tags_to_parse = (
415
416 ##'BlastOutput_query-def' => $create_query,
417 ##'BlastOutput_query-def' => $create_query,
418 ## 'BlastOutput_query-def' => $create_query,
419 ## 'BlastOutput_query-len' => $store_query_length,
420 'Iteration_query-def' => $create_query, # Query id #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
421 'Iteration_query-len' => $store_query_length, # Query length #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
422 'Hit_def' => $new_hit, # Hit id
423 'Hit_len' => $save_hit_length, # Hit length
424 'Hsp_bit-score' => $new_hsp, # Hsp bit score
425 'Hsp_query-from' => $save_hsp_start_query, # Start point for hsp on query
426 'Hsp_query-to' => $save_hsp_end_query, # End point for hsp on query
427 'Hsp_hit-from' => $save_hsp_start_hit, # Start position for hsp on hit
428 'Hsp_hit-to' => $save_hsp_end_hit, # End position for hsp on hit
429 'Hsp_qseq' => $save_hsp_qseq, # Hsp query sequence (gapped as in the alignment)
430 'Hsp_hseq' => $save_hsp_hseq, # Hsp hit sequence (gapped as in the alignment)
431 'Hit_hsps' => $end_hit, # End of hit
432 ##'BlastOutput' => $print_query
433 'Iteration' => $print_query # End of query #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
434 );
435
436 # Set overlap subroutine to use
437 if ($linear_segment_mode eq 1) {
438 $overlap_sub = $check_if_overlap_linear;
439 } else {
440 $overlap_sub = $check_if_overlap_non_linear_allowed;
441 }
442 }
443
444 # Nothing is done when encountering a start tag
445 #sub Start
446 #{
447
448 #}
449
450 sub End {
451
452 my($expat, $tag) = @_;
453
454 # If the name of the tag is in the table with names of tags to parse,
455 # run the corresponding subroutine
456 $tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()};
457
458 }
459
460 sub Char
461 {
462 my($expat, $text) = @_;
463 # Save the tag text
464 $currentText = $text;
465 }
466
467 sub _check_overlap_linear {
468
469 my ($start1, $end1, $start2, $end2) = @_;
470
471 # Length of segment 1
472 my $length1 = $end1 - $start1 + 1;
473
474 # Length of segment 2
475 my $length2 = $end2 - $start2 + 1;
476
477 # Get the length of the sortest of these segments
478 my $shortest_length = ($length1 < $length2)?$length1:$length2;
479
480 # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed
481 return (($start2 - $end1 - 1) / $shortest_length < - 0.05);
482 }
483
484
485
486 sub _check_overlap_non_linear_allowed {
487
488 my ($start1, $end1, $start2, $end2) = @_;
489
490 # Length of segment 1
491 my $length1 = $end1 - $start1 + 1;
492
493 # Length of segment 2
494 my $length2 = $end2 - $start2 + 1;
495
496 # Get the length of the sortest of these segments
497 my $shortest_length = ($length1 < $length2)?$length1:$length2;
498
499 if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence)
500 return 1;
501 } elsif ($start1 < $start2) {
502
503 if (($start2 - $end1 + 1) / $shortest_length < - 0.05) {
504 return 1;
505 }
506
507 } else {
508
509 if (($start1 - $end2 + 1) / $shortest_length < - 0.05) {
510 return 1;
511 }
512
513 }
514
515 # If the method has not returned yet, there is no overlap
516 return 0;
517 }