view genephys/MergeBlastResults.pl @ 3:8dfa09868059 draft

Uploaded
author mcharles
date Fri, 24 Oct 2014 05:54:20 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
#V1.0.3 header added
#V1.0.2 suppressed the final sort (very heavyload) and replaced it by another level of hash
#V1.0.1 added log, option parameters
use strict;
use warnings;
use Getopt::Long;

my $inputblast;
my $outputjoin;
my $log_file;
my $MAX_OVERLAP_FRACTION = 0.5;
my $MAX_OVERLAP_LENGTH_IGNORED = 3;
my $VERBOSE = "OFF";
my $ALLOWED_GAP_FRACTION_FOR_MERGING = 0.3;
my $HEADER ="";

GetOptions (
"input_blasttab_file=s" => \$inputblast,
"output_joinmatch_file=s" => \$outputjoin,
"log_file=s" => \$log_file,
"header=s" => \$HEADER,
"max_overlap_fraction=f" => \$MAX_OVERLAP_FRACTION,
"max_overlap_length_ignored=i" =>\$MAX_OVERLAP_LENGTH_IGNORED
) or die("Error in command line arguments\n");

open(IB, $inputblast) or die ("Can't open $inputblast \n");
open (LF,">$log_file") or die("Can't open $log_file\n");


my %match_by_query;

my @query_keys;
my %querys;


my $stats_nb_match=0;
my $stats_included=0;
my $stats_large_overlapping=0;
my %stats_query_coverage;
$stats_query_coverage{"0-10%"}=0;
$stats_query_coverage{"10-20%"}=0;
$stats_query_coverage{"20-30%"}=0;
$stats_query_coverage{"30-40%"}=0;
$stats_query_coverage{"40-50%"}=0;
$stats_query_coverage{"50-60%"}=0;
$stats_query_coverage{"60-70%"}=0;
$stats_query_coverage{"70-80%"}=0;
$stats_query_coverage{"80-90%"}=0;
$stats_query_coverage{"90-100%"}=0;

my $current_query="";
while (my $ligne = <IB>){
	my @fields = split (/\t/,$ligne);
	if ($#fields != 9){
		print STDERR "Invalid blasttab format, must have 10 columns\n";
		exit(0);
	}
	$stats_nb_match++;
	my %match;
	$match{"Query"}=$fields[0];
	if (!$querys{$match{"Query"}}){
		push(@query_keys,$match{"Query"});
		$querys{$match{"Query"}} = 1;
	}
	$match{"Subject_id"}=$fields[1];
	$match{"Orientation"}="+";
	$match{"Query_start"}=$fields[2];
	$match{"Query_end"}=$fields[3];
	$match{"Subject_start"}=$fields[4];
	$match{"Subject_end"}=$fields[5];
			

	if ($fields[2]>$fields[3]){
		$match{"Query_start"}=$fields[3];
		$match{"Query_end"}=$fields[2];
		$match{"Orientation"}="-";
		#print "- $ligne";
	}
	
	if ($fields[4]>$fields[5]){
		$match{"Subject_start"}=$fields[5];
		$match{"Subject_end"}=$fields[4];
		$match{"Orientation"}="-";
		#print "- $ligne";
	}
	$match{"Similarity"}=$fields[6];
	$match{"Query_length"}=$fields[7];
	$match{"Subject_length"}=$fields[8];
	chomp($fields[9]);
	$match{"Subject"}=$fields[9];
	
	$match{"Ligne"}=$ligne;
	
	my $querykey = $match{"Query"};
	my $key = $match{"Query"}."##".$match{"Subject"}."##".$match{"Orientation"};
	if ($match{"Subject_length"}==0){
		print LF "Match 0",$ligne,"\n",$match{"Subject_length"},"\n";
	}	
	my %match_by_query_and_subject;
	my @match_table;

	if ($match_by_query{$querykey}){
		%match_by_query_and_subject = %{$match_by_query{$querykey}};
	}
	if ($match_by_query_and_subject{$key}){
		@match_table=@{$match_by_query_and_subject{$key}};
	}

	push (@match_table,\%match);
	$match_by_query_and_subject{$key} = \@match_table;
	$match_by_query{$querykey}= \%match_by_query_and_subject;
}

close (IB);

#print LF "NB query : $#query_keys\n";
#foreach my $querykey (sort @query_keys){
#	my %current_match_by_query_and_subject = %{$match_by_query{$querykey}};
#	foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){
#		my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}};
#		for (my $i=0;$i<=$#current_match_table;$i++){
#			my %current_match = %{$current_match_table[$i]};
#			print LF $current_match{"Ligne"}."\n";
#		}
#	}
#}
#exit(0);

open (OJ, ">$outputjoin") or die ("Can't open $outputjoin \n");
print OJ "##",$HEADER,"\n";
print OJ "#Query\tSubject_Id\torientation\tQuery_coverage\tSubject_coverage\tIdentity\tmin_query\tmax_query\tmin_subject\tmax_subject\tNBmatch\tq_length\tsub_length\tsubject\n";
	
foreach my $querykey (sort @query_keys){
	my %current_match_by_query_and_subject = %{$match_by_query{$querykey}};
	my @match_joined; #la table qui contient les matchs (hash) projeté de chaque subject pour une query
	foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){
		my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}};
		my @duplicate;
		my @overlap;
		for (my $i=0;$i<=$#current_match_table;$i++){
			push (@duplicate,0);
		}
		if ($VERBOSE eq "ON"){
			print LF "\nTable Match ($#current_match_table)\t$key\n";
			for (my $i=0;$i<=$#current_match_table;$i++){
				my %match=%{$current_match_table[$i]};
				print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
				print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
			}
		}
	
		#Scan d'inclusion strict
		for (my $i=0;$i<=$#current_match_table;$i++){
			my %match1=%{$current_match_table[$i]};
			for (my $j=0;$j<=$#current_match_table;$j++){
				if (($j != $i)&&($duplicate[$j]==0)){ # On scan dans les deux sens, pas seulement $j = $i+1 a cause du last;
					my %match2=%{$current_match_table[$j]};
					# Inclus Subject
					if (($match1{"Subject_start"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"}))
					{
						$duplicate[$i]=1;
						last;
					}
					# Inclus Query
					elsif (($match1{"Query_start"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"}))
					{
						$duplicate[$i]=2;
						last;
					}

				}
			}
		}
	
		my @current_match_table_filtered;
		for (my $i=0;$i<=$#current_match_table;$i++){
			if ($duplicate[$i] == 0){
				push (@current_match_table_filtered,$current_match_table[$i]);
			}
			else{
				$stats_included++;
			}
		}
	
		if ($#current_match_table > $#current_match_table_filtered){
			@current_match_table = @current_match_table_filtered;
			if ($VERBOSE eq "ON"){
				print LF "Table Match filtered 1 ($#current_match_table)\n";
				for (my $i=0;$i<=$#current_match_table;$i++){
					my %match=%{$current_match_table[$i]};
					print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
					print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
				}
			}
		}
	
		undef @duplicate;
		for (my $i=0;$i<=$#current_match_table;$i++){
			push (@duplicate,0);
		}
		########
	
		###Scan d'overlap trop important
		# D'abord subject
		for (my $i=0;$i<=$#current_match_table-1;$i++){
			my %match1=%{$current_match_table[$i]};
			for (my $j=$i+1;$j<=$#current_match_table;$j++){
				my %match2=%{$current_match_table[$j]};
				if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"}))
				 {
					my $overlap_length = $match1{"Subject_end"} - $match2{"Subject_start"}+1;
					my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1;
					my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1;
				
					if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
						$duplicate[$j]=1;
					}
					elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
						$duplicate[$i]=1;
					}
					else {
					}
				}
				elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) #Recherche d'inclusion dans les deux sens car els match sont classé par query coord, par par subject coord
				{
					my $overlap_length = $match2{"Subject_end"} - $match1{"Subject_start"}+1;
					my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1;
					my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1;
				
					if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
						$duplicate[$j]=1;
					}
					elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
						$duplicate[$i]=1;
					}
					else {
					}	
				}
			}
		}
		
		undef @current_match_table_filtered;
		for (my $i=0;$i<=$#current_match_table;$i++){
			if ($duplicate[$i] == 0){
				push (@current_match_table_filtered,$current_match_table[$i]);
			}
			else {
				$stats_large_overlapping++;
			}
		}
	
		if ($#current_match_table > $#current_match_table_filtered){
			@current_match_table = @current_match_table_filtered;
			if ($VERBOSE eq "ON"){
				print LF "Table Match filtered 2 ($#current_match_table)\n";
				for (my $i=0;$i<=$#current_match_table;$i++){
					my %match=%{$current_match_table[$i]};
					print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
					print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
				}
			}	

		}
	
		undef @duplicate;
		for (my $i=0;$i<=$#current_match_table;$i++){
			push (@duplicate,0);
		}
	
		# Ensuite Query (Subject puis Query pour evitez des deletions complementaires query update qui enleverait toutes les entrées)
			
		for (my $i=0;$i<=$#current_match_table-1;$i++){
			my %match1=%{$current_match_table[$i]};
			for (my $j=$i+1;$j<=$#current_match_table;$j++){
				my %match2=%{$current_match_table[$j]};			
				if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"}))
				{
					my $overlap_length = $match1{"Query_end"} - $match2{"Query_start"}+1;
					my $length1 = $match1{"Query_end"} - $match1{"Query_start"} +1;
					my $length2 = $match2{"Query_end"} - $match2{"Query_start"} +1;
				
					if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
						$duplicate[$j]=1;
					}
					elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){
						$duplicate[$i]=1;
					}
					else {
					}
				}
				#un seul sans les query sont classés
			}
		}
			
		undef @current_match_table_filtered;
		for (my $i=0;$i<=$#current_match_table;$i++){
			if ($duplicate[$i] == 0){
				push (@current_match_table_filtered,$current_match_table[$i]);
			}
			else {
				$stats_large_overlapping++;
			}
		}
	
		if ($#current_match_table > $#current_match_table_filtered){
			@current_match_table = @current_match_table_filtered;
			if ($VERBOSE eq "ON"){
				print LF "Table Match filtered 3 ($#current_match_table)\n";
				for (my $i=0;$i<=$#current_match_table;$i++){
					my %match=%{$current_match_table[$i]};
					print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t";
					print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n";
				}
			}
		}	
	

	
		#Fusion des Hsp et Calcul des nouvelles metriques
	
		my $overlap_length = 0;
		my $Subject_coverage;
		my $Query_coverage;
		my $Identity;
		my $nb_match = 0;
	

		my $nb_covered_subject=0;
		my $nb_covered_query=0;
		my %match=%{$current_match_table[0]};
		my $q_length = $match{"Query_length"};
		my $sub_length = $match{"Subject_length"};
		my $subject = $match{"Subject"};
		my $min_query = $match{"Query_start"};
		my $max_query = $match{"Query_end"};
		my $min_subject = $match{"Subject_start"};
		my $max_subject = $match{"Subject_end"};
		my $orientation = $match{"Orientation"};
		my $Query = $match{"Query"};
		my $Subject_Id = $match{"Subject_id"};
		my $Subject = $match{"Subject"};
	
	
		for (my $i=0;$i<=$#current_match_table;$i++){
			my %match1=%{$current_match_table[$i]};
			$nb_covered_subject += $match1{"Subject_end"} - $match1{"Subject_start"} +1;
			$nb_covered_query += $match1{"Query_end"} - $match1{"Query_start"} +1;
			
			if ($match1{"Query_start"}<$min_query){
				$min_query = $match1{"Query_start"};
			}
			if ($match1{"Query_end"}>$max_query){
				$max_query = $match1{"Query_end"};
			}
			if ($match1{"Subject_start"}<$min_subject){
				$min_subject = $match1{"Subject_start"};
			}
			if ($match1{"Subject_end"}>$max_subject){
				$max_subject = $match1{"Subject_end"};
			}
		
			$nb_match += $match1{"Similarity"};
		
			for (my $j=$i+1;$j<=$#current_match_table;$j++){
				my $overlap_query=0;
				my $overlap_subject=0;
				my %match2=%{$current_match_table[$j]};

				if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"}))
				{
					$overlap_subject = $match1{"Subject_end"} - $match2{"Subject_start"}+1;
				}
				elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"}))
				{
					$overlap_subject = $match2{"Subject_end"} - $match1{"Subject_start"}+1;
				}
			
				if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"}))
				{
					$overlap_query = $match1{"Query_end"} - $match2{"Query_start"} +1;
				}
				elsif (($match2{"Query_start"}<=$match1{"Query_start"})&&($match2{"Query_end"}>=$match1{"Query_start"})&&($match2{"Query_end"}<=$match1{"Query_end"}))
				{
					$overlap_query = $match2{"Query_end"} - $match1{"Query_start"}+1;
				}
			
				if ($overlap_query > $overlap_subject){
					$overlap_length += $overlap_query;
				}
				else {
					$overlap_length += $overlap_subject;
				}
			
				$nb_covered_subject -= $overlap_subject;
				$nb_covered_query-= $overlap_query;
			}
		
		}
		### Cas des tblastx ou le nb match est en aa, et les nb_covered en base
		if ($nb_match/$nb_covered_subject<0.34){
			$nb_match = $nb_match *3;
		}
		####
	
		$Identity = sprintf("%.2f",($nb_match-$overlap_length)*100/$nb_covered_subject);

		$Subject_coverage = sprintf("%.2f",$nb_covered_subject*100/$sub_length);
		$Query_coverage = sprintf("%.2f",$nb_covered_query*100/$q_length);		

		if ($Subject_coverage<0.1){$stats_query_coverage{"0-10%"}++;}
		elsif($Subject_coverage<0.2){$stats_query_coverage{"10-20%"}++;}
		elsif($Subject_coverage<0.3){$stats_query_coverage{"20-30%"}++;}
		elsif($Subject_coverage<0.4){$stats_query_coverage{"30-40%"}++;}
		elsif($Subject_coverage<0.5){$stats_query_coverage{"40-50%"}++;}
		elsif($Subject_coverage<0.6){$stats_query_coverage{"50-60%"}++;}
		elsif($Subject_coverage<0.7){$stats_query_coverage{"60-70%"}++;}
		elsif($Subject_coverage<0.8){$stats_query_coverage{"70-80%"}++;}
		elsif($Subject_coverage<0.9){$stats_query_coverage{"80-90%"}++;}
		else{$stats_query_coverage{"90-100%"}++;}

		if ($VERBOSE eq "ON"){
			print LF "Final\n";
			print LF $Query,"\t",$Subject_Id,"\t",$orientation,"\t",$min_query,"\t",$max_query,"\t",$min_subject,"\t",$max_subject,"\t",$sub_length,"\t";
			print LF "NB:",$nb_match,"\t","O:",$overlap_length,"\t","CQ:",$nb_covered_query,"\t","CS:",$nb_covered_subject,"\t",$Query_coverage,"\t",$Subject_coverage,"\t",$Identity,"\n";

		}

		if ($subject=~/^(.*?)\s*$/){
			$subject = $1;
		}
		my %current_match_joined;
		$current_match_joined{"Query"}=$Query;
		$current_match_joined{"Query_start"}=$min_query;
		$current_match_joined{"Query_end"}=$max_query;
		$current_match_joined{"Query_length"}=$q_length;
		$current_match_joined{"QCoverage"} = $Query_coverage;
		$current_match_joined{"Subject_id"}=$Subject_Id;
		$current_match_joined{"Subject"}=$subject;
		$current_match_joined{"Subject_start"}=$min_subject;
		$current_match_joined{"Subject_end"}=$max_subject;
		$current_match_joined{"Subject_length"}=$sub_length;
		$current_match_joined{"SCoverage"} = $Subject_coverage;
		$current_match_joined{"Similarity"}=$Identity;
		my $NBmatch = $nb_match-$overlap_length;
		$current_match_joined{"Nbmatch"}=$NBmatch;
		$current_match_joined{"Display"}="$Query\t$Subject_Id\t$orientation\t$Query_coverage%\t$Subject_coverage%\t$Identity%\t$min_query\t$max_query\t$min_subject\t$max_subject\t$NBmatch\t$q_length\t$sub_length\t$subject";

		push(@match_joined,\%current_match_joined);
		#print OJ $match_joined{"Display"},"\n";
	}
	my @match_joined_sorted = sort sortbyrelevanceandsubject @match_joined;
	for (my $i=0;$i<=$#match_joined_sorted;$i++){
		my %match = %{$match_joined_sorted[$i]};
		print OJ $match{"Display"},"\n";
	}
}


#my %all_match_joined_best;

#foreach my $key (sort sortkey keys %all_match_joined){
#	my %match = %{$all_match_joined{$key}};
#	print OJ $match{"Display"},"\n";
#}

#close (OB);
close (OJ);


print LF "Nb query : $#query_keys\n";
print LF "Nb match : $stats_nb_match\n";
print LF "Nb match filtered included / too large overlap : $stats_included / $stats_large_overlapping \n";
print LF "Query coverage\n";
print LF "percent:\t";
foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) {
	print LF $key,"\t";
}
print LF "\n number :\t";
foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) {
	print LF $stats_query_coverage{$key},"\t";
}
print LF "\n";


close (LF);


# for (my $i=0;$i<=$#all_match_joined;$i++){
	# my $match_joined = %{$all_match_joined[$i]};
	# print $match_joined{"Query"},"\t",$match_joined{"Subject"},"\t",$match_joined{"Subject_id"},"\t",$match_joined{"Similarity"},"\t",$match_joined{"Query_length"},"\t",$match_joined{"Subject_length"},"\n";
# }


sub mysort{
	my %matcha=%{$a};
	my %matchb=%{$b};
	
	#print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n";
	
	$matcha{"Query_start"} <=> $matchb{"Query_start"}
	||
	$matcha{"Query_end"} <=> $matchb{"Query_end"}
	
}

sub sortbyquerycoord{
	my %matcha=%{$a};
	my %matchb=%{$b};
	
	#print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n";
	
	$matcha{"Query_start"} <=> $matchb{"Query_start"}
	||
	$matcha{"Query_end"} <=> $matchb{"Query_end"}
	
}

sub sortbyrelevanceandsubject{
	my %matcha=%{$a};
	my %matchb=%{$b};
	
	$matchb{"Nbmatch"} <=> $matcha{"Nbmatch"}
	||
	$matchb{"QCoverage"} <=> $matcha{"QCoverage"}
	||
	$matcha{"Subject"} cmp $matchb{"Subject"}
}


sub sortkey {
	my @fieldsa = split (/\#/,$a);
	my @fieldsb = split (/\#/,$b);
	
	#print "$a\n$b\n";
	#print $fieldsa[0]," cmp ",$fieldsb[0],"\n";
	#exit(0);

	$fieldsa[0] cmp $fieldsb[0]
	||
	$fieldsa[1] cmp $fieldsb[1]
	||
	$fieldsa[2] <=> $fieldsb[2]

}