Mercurial > repos > dereeper > pgap
comparison PGAP-1.2.1/blast_parser.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83e62a1aeeeb |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64'; | |
5 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi'; | |
6 use XML::Parser; | |
7 | |
8 ############################################################################################## | |
9 # | |
10 # This parser parses output files from blastall (blastp) and outputs one-line descriptions | |
11 # for each hit with a score above or equal to a cut-off score that is specified as a parameter | |
12 # to the program. The one-line description contains the following ifomation separated by tab | |
13 # * Query id | |
14 # * Hit id | |
15 # * Bit score | |
16 # * Query length | |
17 # * Hit length | |
18 # * Length of longest segment on query. This is defined as the length of the segment from the | |
19 # first position od the first hsp to the last position of the last hsp. I.e. if the | |
20 # hsps are 1-10 and 90-100, the length of the longest segment will be 100. | |
21 # * Length of longest segment on hit, see explanation above. | |
22 # * Total match length on query. This is defined as the total length of all hsps. If the hsps | |
23 # are 1-10 and 90-100, the length will be 20. | |
24 # * Total match length on hit. see explanation above | |
25 # * Positions for all segments on the query and on the hit. Positions on the query is written | |
26 # as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are | |
27 # specified separated nt tab. Positions for query and hut for a segment is separated by | |
28 # one whitespace. | |
29 # | |
30 | |
31 # MODIFICATION: also add %identity as next output field | |
32 # HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins | |
33 | |
34 # If alignment mode is used, the parser prints a > before all data on the one-line description. | |
35 # The next two lines will contain the aligned sequences, including gaps. Alignment mode is off | |
36 # by default but can be used if the second parameter sent to the parser is -a. If alignment | |
37 # mode is not used only the score cutoff and the Blast XML file should be sent to the parser. | |
38 # | |
39 # NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or | |
40 # blast with -VT to emulate the old behaviour the code in this script must be changed a bit. | |
41 # The neccesary changes are few and the first change is to un-comment all lines | |
42 # marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple | |
43 # of places in the My::TagHandler package (In the Init subroutine). The other change neccessary | |
44 # is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST. | |
45 # | |
46 # Written by Isabella Pekkari 2007-11-19 | |
47 # Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling | |
48 # | |
49 ############################################################################################## | |
50 | |
51 # First argument is score cutt-of in bits | |
52 # Second argument is beta threshold for composition filtering. Set to 0.0 to disable. | |
53 # Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed | |
54 my $score_cutoff = shift; | |
55 | |
56 | |
57 # If alignment mode shall be used, the flag -a is specified as the first parameter | |
58 # Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used. | |
59 my $alignment_mode = 0; | |
60 if (defined $ARGV[0] && $ARGV[0] eq '-a') { | |
61 | |
62 $alignment_mode = 1; | |
63 shift; | |
64 } | |
65 | |
66 # If segments must be ordered in a linear fashion on both sequences. | |
67 # If 1, the hsp order must be the same on the query and the hit. | |
68 # For example, if the n-terminal of the hit matches the c-terminal of | |
69 # the query another segment where the c-terminal of the hit matches | |
70 # the n-terminal of the hit si not allowed. | |
71 # If this parameter is set to 0, hsps can be ordered differently on the | |
72 # query and the hit. In both cases, an overlap of maximum 5% of the shortest segment | |
73 # is allowed. | |
74 my $linear_segment_mode = 1; | |
75 | |
76 # Only for older versions of blast | |
77 ##my $done = 0; | |
78 | |
79 | |
80 | |
81 # Create an xml parser | |
82 my $p = XML::Parser->new( Style => 'My::TagHandler', ); | |
83 | |
84 # Create an Expat::NB parser | |
85 my $nb_p = $p->parse_start(); | |
86 | |
87 # Parse the xml file | |
88 parse_document(); | |
89 | |
90 sub parse_document { | |
91 | |
92 while(my $l = <>){ | |
93 ##if ($done) { | |
94 | |
95 ##$nb_p->parse_done; | |
96 ##$nb_p = $p->parse_start(); | |
97 ##$done = 0; | |
98 | |
99 ##} else { | |
100 # Remove whitespace at the end of the line | |
101 chomp($l); | |
102 ##/DEBUG | |
103 # print STDERR "###" .$l . "###\n"; | |
104 # Parse the line | |
105 $nb_p->parse_more($l); | |
106 ##} | |
107 } | |
108 | |
109 # Shut the parser down | |
110 $nb_p->parse_done; | |
111 | |
112 } | |
113 | |
114 # Only used for older versions of blast | |
115 ##sub next_doc { | |
116 | |
117 ## $done = 1; | |
118 | |
119 ##} | |
120 | |
121 | |
122 | |
123 ############################################ | |
124 # Package for parsing xml files obtained | |
125 # from blast. | |
126 ############################################ | |
127 package My::TagHandler; | |
128 | |
129 use strict; | |
130 use warnings; | |
131 | |
132 #Subroutines | |
133 my $create_query; | |
134 my $store_query_length; | |
135 my $new_hit; | |
136 my $save_hit_length; | |
137 my $new_hsp; | |
138 my $save_hsp_start_query; | |
139 my $save_hsp_end_query; | |
140 my $save_hsp_start_hit; | |
141 my $save_hsp_end_hit; | |
142 my $save_hsp_qseq; | |
143 my $save_hsp_hseq; | |
144 my $end_hit; | |
145 my $print_query; | |
146 my $check_if_overlap_non_linear_allowed; | |
147 my $check_if_overlap_linear; | |
148 | |
149 my @current_query; # The current query | |
150 my $current_hit; # The current hit | |
151 my $current_hsp; # The current hsp | |
152 | |
153 my $currentText = ''; # Text from the current tag | |
154 | |
155 # Names of tags that shall be parsed as key and references to | |
156 # the anonymous subroutine that shall be run when parsing the tag as value. | |
157 my %tags_to_parse; | |
158 | |
159 # Reference to the anonymous subroutine that shall be run | |
160 # to check whether there is an overlap between two hsps. | |
161 my $overlap_sub; | |
162 | |
163 sub Init { | |
164 | |
165 my($expat) = @_; | |
166 | |
167 #Subroutine for creating new query | |
168 $create_query = sub { | |
169 | |
170 # The the query id. Additional information for the query is ignored. | |
171 my @tmp = split(/\s+/, $currentText); | |
172 @current_query = ($tmp[0], 0, []) | |
173 }; | |
174 | |
175 #Subroutine for storing query length | |
176 $store_query_length = sub {$current_query[1] = $currentText}; | |
177 | |
178 #Subroutine for creating new hit | |
179 $new_hit = sub { | |
180 | |
181 # The the hit id. Additional information for the hit is ignored. | |
182 my @tmp = split(/\s+/, $currentText); | |
183 $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0]; | |
184 }; | |
185 | |
186 #Subroutine for saving hit length | |
187 $save_hit_length = sub {$current_hit->[1] = $currentText}; | |
188 | |
189 #Subroutine for creating new hsp | |
190 $new_hsp = sub {$current_hsp = [$currentText]}; | |
191 | |
192 # Subroutine for saving hsp start on query | |
193 $save_hsp_start_query = sub {$current_hsp->[1] = $currentText}; | |
194 | |
195 # Subroutine for saving hsp end on query | |
196 $save_hsp_end_query = sub {$current_hsp->[2] = $currentText}; | |
197 | |
198 # Subroutine for saving hsp start on hit | |
199 $save_hsp_start_hit = sub {$current_hsp->[3] = $currentText}; | |
200 | |
201 # Subroutine for saving hsp end on hit | |
202 $save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;}; | |
203 | |
204 # Subroutine for saving hsp query sequence (as in alignment) | |
205 $save_hsp_qseq = sub {$current_hsp->[5] = $currentText;}; | |
206 | |
207 # Subroutine for saving hsp hit sequence (as in alignment) | |
208 $save_hsp_hseq = sub {$current_hsp->[6] = $currentText; | |
209 | |
210 # Check if this hsp overlaps with any of the | |
211 # existing hsps | |
212 my $overlap_found = 0; | |
213 foreach my $existing_hsp (@{$current_hit->[5]}) { | |
214 | |
215 if ($overlap_sub->($current_hsp, $existing_hsp)) { | |
216 $overlap_found = 1; | |
217 last; | |
218 } | |
219 | |
220 } | |
221 | |
222 # If this hsp does not overlap with any hsp it is added | |
223 | |
224 | |
225 unless ($overlap_found) { | |
226 | |
227 # Add the hsp to the hit | |
228 push( @{$current_hit->[5]}, $current_hsp); | |
229 | |
230 # Increase number of hsps for the hit with one | |
231 $current_hit->[6]++; | |
232 | |
233 # Add the hsp score to the total score | |
234 $current_hit->[2] += $current_hsp->[0]; | |
235 | |
236 # Add the hsp length on the query to the total hsp length on the query | |
237 $current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1); | |
238 | |
239 # Add the hsp length on the hit to the total hsp length on the hit | |
240 $current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1); | |
241 } | |
242 | |
243 }; | |
244 | |
245 # Subroutine for saving hit | |
246 $end_hit = sub { | |
247 | |
248 #Add the hit to the qurrent query | |
249 unless ($current_hit->[2] < $score_cutoff ) { | |
250 push( @{$current_query[2]}, $current_hit ); | |
251 } | |
252 }; | |
253 | |
254 # Subroutine for printing all hits for a query | |
255 $print_query = sub { | |
256 | |
257 # Sort the hits after score with hit with highest score first | |
258 my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]}; | |
259 | |
260 # For all hits | |
261 foreach my $hit (@sorted_hits) { | |
262 | |
263 if ($alignment_mode) { | |
264 | |
265 # When alignment mode is used, self hits are not printed | |
266 # Therefore, the hit is ignored if it has the same id as the query | |
267 next if ($current_query[0] eq $hit->[0]); | |
268 | |
269 # When alignment mode is used, a > is printed at the start of | |
270 # lines containing data, i.e. lines that do not contain sequences | |
271 print ">"; | |
272 } | |
273 | |
274 print "$current_query[0]\t"; # Print query id | |
275 print "$hit->[0]\t"; # Print hit id | |
276 printf ("%.1f\t", $hit->[2]); # Print bit score | |
277 print "$current_query[1]\t"; # Print query length | |
278 print "$hit->[1]\t"; # Print hit length | |
279 | |
280 if ($hit->[6] > 1) { # if more than one segment | |
281 | |
282 # Sort hsps on query | |
283 my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]}; | |
284 | |
285 # Sort hsps on hit | |
286 my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]}; | |
287 | |
288 # Get total segment length on query | |
289 my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1; | |
290 | |
291 # Get total segment length on hit | |
292 my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1; | |
293 | |
294 print "$segm_length_query\t"; # Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp) | |
295 print "$segm_length_hit\t"; # Print segment length on query | |
296 print "$hit->[3]\t$hit->[4]\t"; # Print total length of all hsps on the query and on the match, i.e length of actually matching region | |
297 | |
298 # In alignment mode, the aligned sequences shall be printed on the lines following the data line | |
299 my $hsp_qseq = ''; | |
300 my $hsp_hseq = ''; | |
301 | |
302 # Print query and hit segment positions | |
303 foreach my $segment (@hsps_sorted_on_query) { | |
304 | |
305 print "q:$segment->[1]-$segment->[2] "; | |
306 print "h:$segment->[3]-$segment->[4]\t"; | |
307 | |
308 if ($alignment_mode) { | |
309 | |
310 # Add the hsp sequences to the total sequence | |
311 $hsp_qseq .= $segment->[5]; | |
312 $hsp_hseq .= $segment->[6]; | |
313 } | |
314 | |
315 } | |
316 | |
317 print "\n"; | |
318 | |
319 if ($alignment_mode) { | |
320 | |
321 # Print sequences | |
322 print "$hsp_qseq\n"; | |
323 print "$hsp_hseq\n"; | |
324 } | |
325 | |
326 } else { | |
327 # Get the only hsp that was found for this hit | |
328 my $segment = $hit->[5]->[0]; | |
329 | |
330 # Get total segment length on query | |
331 my $segm_length_query = $segment->[2] - $segment->[1] + 1; | |
332 | |
333 # Get total segment length on hit | |
334 my $segm_length_hit = $segment->[4] - $segment->[3] + 1; | |
335 | |
336 print "$segm_length_query\t"; # Print segment length on query, i.e. length of this hsp on query | |
337 print "$segm_length_hit\t"; # Print segment length on hit, i.e. length of this hsp on hit | |
338 | |
339 # Print total length of all hsps on the query and on the match, i.e length of actually matching region | |
340 # Sice there is only one segment, these lengths will be the same as the segment lengths printed above. | |
341 print "$hit->[3]\t$hit->[4]\t"; | |
342 | |
343 # Print segment positions | |
344 | |
345 print "q:$segment->[1]-$segment->[2] "; | |
346 print "h:$segment->[3]-$segment->[4]\t"; | |
347 print "\n"; | |
348 | |
349 if ($alignment_mode) { | |
350 | |
351 # Print sequences | |
352 print "$segment->[5]\n"; | |
353 print "$segment->[6]\n"; | |
354 } | |
355 | |
356 } | |
357 } | |
358 ##main::next_doc(); #NB! Un-comment for older blast versions | |
359 | |
360 }; | |
361 | |
362 # Subroutine for checking if two hsps overlap. | |
363 # When this subroutine is used, non-linear arrangements of the hsps are allowed. | |
364 $check_if_overlap_non_linear_allowed = sub { | |
365 | |
366 my $hsp1 = shift; # One hsp | |
367 my $hsp2 = shift; # Another hsp | |
368 | |
369 # Check if there is an overlap. | |
370 return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2]) | |
371 || _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4])); | |
372 | |
373 }; | |
374 | |
375 # Subroutine for checking if two hsps overlap. | |
376 # When this subroutine is used, non-linear arrangements of the hsps are not allowed. | |
377 $check_if_overlap_linear = sub { | |
378 | |
379 my $hsp1 = shift; # One hsp | |
380 my $hsp2 = shift; # Another hsp | |
381 | |
382 # Get start point for hsp1 on query | |
383 my $start1_hsp1 = $hsp1->[1]; | |
384 | |
385 # Get start point for hsp2 on query | |
386 my $start1_hsp2 = $hsp2->[1]; | |
387 | |
388 # The hsp that comes first oon the query (N-terminal hsp) | |
389 my $first_hsp; | |
390 | |
391 # The hsp that comes last on the query | |
392 my $last_hsp; | |
393 | |
394 # Check which hsp is N-teminal. | |
395 if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence) | |
396 return 1; | |
397 } elsif ($start1_hsp1 < $start1_hsp2) { | |
398 | |
399 $first_hsp = $hsp1; | |
400 $last_hsp = $hsp2; | |
401 | |
402 } else { | |
403 | |
404 $first_hsp = $hsp2; | |
405 $last_hsp = $hsp1; | |
406 | |
407 } | |
408 | |
409 # Return whether there is an overlap or not. | |
410 return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2]) | |
411 || _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4])); | |
412 }; | |
413 | |
414 %tags_to_parse = ( | |
415 | |
416 ##'BlastOutput_query-def' => $create_query, | |
417 ##'BlastOutput_query-def' => $create_query, | |
418 ## 'BlastOutput_query-def' => $create_query, | |
419 ## 'BlastOutput_query-len' => $store_query_length, | |
420 'Iteration_query-def' => $create_query, # Query id #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST | |
421 'Iteration_query-len' => $store_query_length, # Query length #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST | |
422 'Hit_def' => $new_hit, # Hit id | |
423 'Hit_len' => $save_hit_length, # Hit length | |
424 'Hsp_bit-score' => $new_hsp, # Hsp bit score | |
425 'Hsp_query-from' => $save_hsp_start_query, # Start point for hsp on query | |
426 'Hsp_query-to' => $save_hsp_end_query, # End point for hsp on query | |
427 'Hsp_hit-from' => $save_hsp_start_hit, # Start position for hsp on hit | |
428 'Hsp_hit-to' => $save_hsp_end_hit, # End position for hsp on hit | |
429 'Hsp_qseq' => $save_hsp_qseq, # Hsp query sequence (gapped as in the alignment) | |
430 'Hsp_hseq' => $save_hsp_hseq, # Hsp hit sequence (gapped as in the alignment) | |
431 'Hit_hsps' => $end_hit, # End of hit | |
432 ##'BlastOutput' => $print_query | |
433 'Iteration' => $print_query # End of query #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST | |
434 ); | |
435 | |
436 # Set overlap subroutine to use | |
437 if ($linear_segment_mode eq 1) { | |
438 $overlap_sub = $check_if_overlap_linear; | |
439 } else { | |
440 $overlap_sub = $check_if_overlap_non_linear_allowed; | |
441 } | |
442 } | |
443 | |
444 # Nothing is done when encountering a start tag | |
445 #sub Start | |
446 #{ | |
447 | |
448 #} | |
449 | |
450 sub End { | |
451 | |
452 my($expat, $tag) = @_; | |
453 | |
454 # If the name of the tag is in the table with names of tags to parse, | |
455 # run the corresponding subroutine | |
456 $tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()}; | |
457 | |
458 } | |
459 | |
460 sub Char | |
461 { | |
462 my($expat, $text) = @_; | |
463 # Save the tag text | |
464 $currentText = $text; | |
465 } | |
466 | |
467 sub _check_overlap_linear { | |
468 | |
469 my ($start1, $end1, $start2, $end2) = @_; | |
470 | |
471 # Length of segment 1 | |
472 my $length1 = $end1 - $start1 + 1; | |
473 | |
474 # Length of segment 2 | |
475 my $length2 = $end2 - $start2 + 1; | |
476 | |
477 # Get the length of the sortest of these segments | |
478 my $shortest_length = ($length1 < $length2)?$length1:$length2; | |
479 | |
480 # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed | |
481 return (($start2 - $end1 - 1) / $shortest_length < - 0.05); | |
482 } | |
483 | |
484 | |
485 | |
486 sub _check_overlap_non_linear_allowed { | |
487 | |
488 my ($start1, $end1, $start2, $end2) = @_; | |
489 | |
490 # Length of segment 1 | |
491 my $length1 = $end1 - $start1 + 1; | |
492 | |
493 # Length of segment 2 | |
494 my $length2 = $end2 - $start2 + 1; | |
495 | |
496 # Get the length of the sortest of these segments | |
497 my $shortest_length = ($length1 < $length2)?$length1:$length2; | |
498 | |
499 if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence) | |
500 return 1; | |
501 } elsif ($start1 < $start2) { | |
502 | |
503 if (($start2 - $end1 + 1) / $shortest_length < - 0.05) { | |
504 return 1; | |
505 } | |
506 | |
507 } else { | |
508 | |
509 if (($start1 - $end2 + 1) / $shortest_length < - 0.05) { | |
510 return 1; | |
511 } | |
512 | |
513 } | |
514 | |
515 # If the method has not returned yet, there is no overlap | |
516 return 0; | |
517 } |