diff PGAP-1.2.1/blast_parser.pl @ 0:83e62a1aeeeb draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 13:51:52 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PGAP-1.2.1/blast_parser.pl	Thu Jun 24 13:51:52 2021 +0000
@@ -0,0 +1,517 @@
+#!/usr/bin/perl
+use strict; 
+use warnings; 
+#use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64';
+#use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi';
+use XML::Parser; 
+
+##############################################################################################
+#
+# This parser parses output files from blastall (blastp) and outputs one-line descriptions
+# for each hit with a score above or equal to a cut-off score that is specified as a parameter
+# to the program. The one-line description contains the following ifomation separated by tab
+# * Query id
+# * Hit id
+# * Bit score
+# * Query length
+# * Hit length
+# * Length of longest segment on query. This is defined as the length of the segment from the
+#  	first position od the first hsp to the last position of the last hsp. I.e. if the
+# 	hsps are 1-10 and 90-100, the length of the longest segment will be 100.
+# * Length of longest segment on hit, see explanation above.
+# * Total match length on query. This is defined as the total length of all hsps. If the hsps
+# 	are 1-10 and 90-100, the length will be 20.
+# * Total match length on hit. see explanation above
+# * Positions for all segments on the query and on the hit. Positions on the query is written
+# 	as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are
+#	specified separated nt tab. Positions for query and hut for a segment is separated by 
+# 	one whitespace. 
+# 
+
+# MODIFICATION: also add %identity as next output field
+# HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins
+
+# If alignment mode is used, the parser prints a > before all data on the one-line description.
+# The next two lines will contain the aligned sequences, including gaps. Alignment mode is off
+# by default but can be used if the second parameter sent to the parser is -a. If alignment
+# mode is not used only the score cutoff and the Blast XML file should be sent to the parser.
+#
+# NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or 
+# blast with -VT to emulate the old behaviour the code in this script must be changed a bit.
+# The neccesary changes are few and the first change is to un-comment all lines
+# marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple
+# of places in the My::TagHandler package (In the Init subroutine). The other change neccessary 
+# is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST.
+#
+# Written by Isabella Pekkari 2007-11-19
+# Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling
+#
+##############################################################################################
+
+# First argument is score cutt-of in bits
+# Second argument is beta threshold for composition filtering. Set to 0.0 to disable.
+# Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed
+my $score_cutoff = shift;
+
+
+# If alignment mode shall be used, the flag -a is specified as the first parameter
+# Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used.
+my $alignment_mode = 0;
+if (defined $ARGV[0] && $ARGV[0] eq '-a') {
+
+	$alignment_mode = 1;
+	shift;
+} 
+
+# If segments must be ordered in a linear fashion on both sequences.
+# If 1, the hsp order must be the same on the query and the hit. 
+# For example, if the n-terminal of the hit matches the c-terminal of
+# the query another segment where the c-terminal of the hit matches
+# the n-terminal of the hit si not allowed.
+# If this parameter is set to 0, hsps can be ordered differently on the
+# query and the hit. In both cases, an overlap of maximum 5% of the shortest segment
+# is allowed.
+my $linear_segment_mode = 1;
+
+# Only for older versions of blast
+##my $done = 0;
+
+
+
+# Create an xml parser
+my $p = XML::Parser->new( Style => 'My::TagHandler', ); 
+
+# Create an Expat::NB parser
+my $nb_p = $p->parse_start();
+
+# Parse the xml file
+parse_document();
+
+sub parse_document {
+
+	while(my $l = <>){ 
+	 ##if ($done) {
+
+			##$nb_p->parse_done;
+			##$nb_p = $p->parse_start(); 
+			##$done = 0;
+
+		##} else {
+			# Remove whitespace at the end of the line
+			chomp($l); 
+##/DEBUG
+#	    print STDERR "###" .$l . "###\n";
+			# Parse the line
+			$nb_p->parse_more($l); 
+		##}
+	} 
+	
+	# Shut the parser down
+	$nb_p->parse_done;
+
+}
+
+# Only used for older versions of blast
+##sub next_doc {
+
+##	$done = 1;
+
+##}
+
+
+
+############################################
+# Package for parsing xml files obtained
+# from blast.
+############################################
+package My::TagHandler;
+
+use strict;
+use warnings;
+
+#Subroutines
+my $create_query;
+my $store_query_length;
+my $new_hit;
+my $save_hit_length;
+my $new_hsp;
+my $save_hsp_start_query;
+my $save_hsp_end_query;
+my $save_hsp_start_hit;
+my $save_hsp_end_hit;
+my $save_hsp_qseq;
+my $save_hsp_hseq;
+my $end_hit;
+my $print_query;
+my $check_if_overlap_non_linear_allowed;
+my $check_if_overlap_linear;
+
+my @current_query;	# The current query
+my $current_hit;	# The current hit
+my $current_hsp;	# The current hsp
+
+my $currentText = '';	# Text from the current tag
+
+# Names of tags that shall be parsed as key and references to
+# the anonymous subroutine that shall be run when parsing the tag as value.
+my %tags_to_parse;	
+
+# Reference to the anonymous subroutine that shall be run
+# to check whether there is an overlap between two hsps.
+my $overlap_sub;
+
+sub Init { 
+
+	my($expat) = @_; 
+
+	#Subroutine for creating new query
+	$create_query = sub {
+
+	    # The the query id. Additional information for the query is ignored.
+	    my @tmp = split(/\s+/, $currentText);
+	    @current_query = ($tmp[0], 0, [])
+	};
+
+	#Subroutine for storing query length
+	$store_query_length = sub {$current_query[1] = $currentText};
+
+	#Subroutine for creating new hit
+	$new_hit = sub {
+
+	    # The the hit id. Additional information for the hit is ignored.
+	    my @tmp = split(/\s+/, $currentText);
+	    $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0];
+	};
+
+	#Subroutine for saving hit length
+	$save_hit_length = sub {$current_hit->[1] = $currentText};
+
+	#Subroutine for creating new hsp
+	$new_hsp = sub {$current_hsp = [$currentText]};
+
+	# Subroutine for saving hsp start on query
+	$save_hsp_start_query = sub {$current_hsp->[1] = $currentText};
+
+	# Subroutine for saving hsp end on query
+	$save_hsp_end_query = sub {$current_hsp->[2] = $currentText};
+
+	# Subroutine for saving hsp start on hit
+	$save_hsp_start_hit = sub {$current_hsp->[3] = $currentText};
+
+	# Subroutine for saving hsp end on hit
+	$save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;};
+
+	# Subroutine for saving hsp query sequence (as in alignment)
+	$save_hsp_qseq = sub {$current_hsp->[5] = $currentText;};
+
+	# Subroutine for saving hsp hit sequence (as in alignment)
+	$save_hsp_hseq = sub {$current_hsp->[6] = $currentText;
+
+   	    	# Check if this hsp overlaps with any of the
+    	    	# existing hsps
+		my $overlap_found = 0;
+   	    	foreach my $existing_hsp (@{$current_hit->[5]}) {
+
+	    		if ($overlap_sub->($current_hsp, $existing_hsp)) {
+				$overlap_found = 1;
+				last;
+	    		}
+
+   	    	}
+
+    	   	 # If this hsp does not overlap with any hsp it is added
+
+
+    	    	unless ($overlap_found) {
+
+			# Add the hsp to the hit
+			push( @{$current_hit->[5]}, $current_hsp);
+
+			# Increase number of hsps for the hit with one
+			$current_hit->[6]++;
+
+			# Add the hsp score to the total score
+			$current_hit->[2] += $current_hsp->[0];
+
+			# Add the hsp length on the query to the total hsp length on the query
+			$current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1);
+
+			# Add the hsp length on the hit to the total hsp length on the hit
+			$current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1);
+   	    	}
+
+	};
+
+	# Subroutine for saving hit
+	$end_hit = sub {
+		
+	    #Add the hit to the qurrent query
+	    unless ($current_hit->[2] < $score_cutoff ) {
+	    	push( @{$current_query[2]}, $current_hit );
+	    }
+	};
+
+	# Subroutine for printing all hits for a query
+	$print_query = sub {
+
+		# Sort the hits after score with hit with highest score first
+   	   	my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]};
+
+    	    	# For all hits
+    	    	foreach my $hit (@sorted_hits) {
+
+			if ($alignment_mode) {
+		
+				# When alignment mode is used, self hits are not printed
+				# Therefore, the hit is ignored if it has the same id as the query
+				next if ($current_query[0] eq $hit->[0]);
+
+				# When alignment mode is used, a > is printed at the start of 
+				# lines containing data, i.e. lines that do not contain sequences
+				print ">";
+			}
+
+			print "$current_query[0]\t"; 	# Print query id
+			print "$hit->[0]\t";		# Print hit id
+			printf  ("%.1f\t", $hit->[2]);  # Print bit score
+			print "$current_query[1]\t";	# Print query length
+			print "$hit->[1]\t";		# Print hit length
+
+			if ($hit->[6] > 1) { # if more than one segment
+
+				# Sort hsps on query
+    				my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]};
+    
+    				# Sort hsps on hit
+   				my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]};
+
+    				# Get total segment length on query
+    				my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1;
+    
+   		       		# Get total segment length on hit
+    				my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1;
+
+				print "$segm_length_query\t";	# Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp)
+				print "$segm_length_hit\t";	# Print segment length on query
+				print "$hit->[3]\t$hit->[4]\t";	# Print total length of all hsps on the query and on the match, i.e length of actually matching region
+
+				# In alignment mode, the aligned sequences shall be printed on the lines following the data line
+				my $hsp_qseq = '';
+				my $hsp_hseq = '';
+
+    				# Print query and hit segment positions
+    				foreach my $segment (@hsps_sorted_on_query) {
+
+					print "q:$segment->[1]-$segment->[2] ";
+					print "h:$segment->[3]-$segment->[4]\t";
+
+					if ($alignment_mode) {
+						
+						# Add the hsp sequences to the total sequence
+						$hsp_qseq .= $segment->[5];
+						$hsp_hseq .= $segment->[6];
+					}
+
+    				}
+
+   				print "\n";
+
+				if ($alignment_mode) {
+			
+					# Print sequences
+					print "$hsp_qseq\n";
+					print "$hsp_hseq\n";
+				}
+
+			} else {
+				# Get the only hsp that was found for this hit
+				my $segment = $hit->[5]->[0];
+
+				# Get total segment length on query
+    				my $segm_length_query = $segment->[2] - $segment->[1] + 1;
+    
+   		       		# Get total segment length on hit
+    				my $segm_length_hit = $segment->[4] - $segment->[3] + 1;
+
+				print "$segm_length_query\t"; 	# Print segment length on query, i.e. length of this hsp on query
+				print "$segm_length_hit\t";	# Print segment length on hit, i.e. length of this hsp on hit
+
+				# Print total length of all hsps on the query and on the match, i.e length of actually matching region
+				# Sice there is only one segment, these lengths will be the same as the segment lengths printed above.
+				print "$hit->[3]\t$hit->[4]\t"; 
+
+				# Print segment positions
+
+				print "q:$segment->[1]-$segment->[2] ";
+				print "h:$segment->[3]-$segment->[4]\t";
+				print "\n";
+
+				if ($alignment_mode) {
+			
+					# Print sequences
+					print "$segment->[5]\n";
+					print "$segment->[6]\n";
+				}
+
+			}
+   	    	}
+	    	##main::next_doc(); #NB! Un-comment for older blast versions 
+
+	};
+
+	# Subroutine for checking if two hsps overlap.
+	# When this subroutine is used, non-linear arrangements of the hsps are allowed.
+	$check_if_overlap_non_linear_allowed = sub {
+
+    		my $hsp1  = shift;	# One hsp
+   		my $hsp2 = shift;	# Another hsp 
+
+		# Check if there is an overlap.
+    		return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2]) 
+			|| _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4]));
+
+	};
+
+	# Subroutine for checking if two hsps overlap.
+	# When this subroutine is used, non-linear arrangements of the hsps are not allowed.
+	$check_if_overlap_linear = sub {
+
+    		my $hsp1  = shift;	# One hsp
+   		my $hsp2 = shift;	# Another hsp 
+
+    		# Get start point for hsp1 on query
+   		my $start1_hsp1 = $hsp1->[1];
+
+    		# Get start point for hsp2 on query
+    		my $start1_hsp2 = $hsp2->[1];
+
+    		# The hsp that comes first oon the query (N-terminal hsp)
+    		my $first_hsp;
+
+    		# The hsp that comes last on the query
+    		my $last_hsp;
+
+    		# Check which hsp is N-teminal.
+    		if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence)
+			return 1;
+    		} elsif ($start1_hsp1 < $start1_hsp2) { 
+
+			$first_hsp = $hsp1;
+			$last_hsp = $hsp2;
+
+   		} else {
+	
+			$first_hsp = $hsp2;
+			$last_hsp = $hsp1;
+
+    		}
+		
+   		# Return whether there is an overlap or not.
+		return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2]) 
+			|| _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4]));
+	};
+
+	%tags_to_parse = (
+
+	         ##'BlastOutput_query-def' => $create_query, 
+		 ##'BlastOutput_query-def' => $create_query,		
+##	         'BlastOutput_query-def' => $create_query, 
+##		 'BlastOutput_query-len' => $store_query_length,		
+		 'Iteration_query-def' => $create_query,		# Query id		#REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
+		 'Iteration_query-len' => $store_query_length, 		# Query length		#REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
+		 'Hit_def' => $new_hit,               			# Hit id
+		 'Hit_len' => $save_hit_length,              		# Hit length
+		 'Hsp_bit-score' => $new_hsp,         			# Hsp bit score
+		 'Hsp_query-from' => $save_hsp_start_query,        	# Start point for hsp on query
+		 'Hsp_query-to' => $save_hsp_end_query,         	# End point for hsp on query
+		 'Hsp_hit-from' => $save_hsp_start_hit,          	# Start position for hsp on hit
+		 'Hsp_hit-to' => $save_hsp_end_hit,            		# End position for hsp on hit
+		 'Hsp_qseq' => $save_hsp_qseq,				# Hsp query sequence (gapped as in the alignment)	
+		 'Hsp_hseq' => $save_hsp_hseq,				# Hsp hit sequence (gapped as in the alignment)		
+		 'Hit_hsps' => $end_hit,				# End of hit
+		 ##'BlastOutput' => $print_query
+		 'Iteration' => $print_query				# End of query		#REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST	
+	);
+
+	# Set overlap subroutine to use
+	if ($linear_segment_mode eq 1) {
+		$overlap_sub = $check_if_overlap_linear;
+	} else {
+		$overlap_sub = $check_if_overlap_non_linear_allowed;
+	}
+} 
+
+# Nothing is done when encountering a start tag
+#sub Start
+#{
+
+#}
+
+sub End {
+
+	my($expat, $tag) = @_; 
+
+	# If the name of the tag is in the table with names of tags to parse,
+	# run the corresponding subroutine
+	$tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()};
+
+}
+
+sub Char
+{
+	my($expat, $text) = @_;
+	# Save the tag text
+	$currentText = $text;
+}
+
+sub _check_overlap_linear {
+
+    my ($start1, $end1, $start2, $end2) = @_;
+
+    # Length of segment 1
+    my $length1 = $end1 - $start1 + 1;
+
+    # Length of segment 2
+    my $length2 = $end2 - $start2 + 1;
+
+    # Get the length of the sortest of these segments
+    my $shortest_length = ($length1 < $length2)?$length1:$length2;
+
+    # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed
+    return (($start2 - $end1 - 1) / $shortest_length < - 0.05);  
+}
+
+
+
+sub _check_overlap_non_linear_allowed {
+
+    my ($start1, $end1, $start2, $end2) = @_;
+
+    # Length of segment 1
+    my $length1 = $end1 - $start1 + 1;
+
+    # Length of segment 2
+    my $length2 = $end2 - $start2 + 1;
+
+    # Get the length of the sortest of these segments
+    my $shortest_length = ($length1 < $length2)?$length1:$length2;
+
+    if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence)
+	return 1;
+    } elsif ($start1 < $start2) { 
+
+	if (($start2 - $end1 + 1) / $shortest_length < - 0.05) {
+		return 1;
+	}
+
+    } else {
+	
+	if (($start1 - $end2 + 1) / $shortest_length < - 0.05) {
+		return 1;
+	}
+
+    }
+
+    # If the method has not returned yet, there is no overlap
+    return 0;
+}