Mercurial > repos > dereeper > pgap

diff PGAP-1.2.1/inparanoid.pl @ 0:83e62a1aeeeb draft
Uploaded
author: dereeper
date: Thu, 24 Jun 2021 13:51:52 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PGAP-1.2.1/inparanoid.pl	Thu Jun 24 13:51:52 2021 +0000
@@ -0,0 +1,1868 @@
+#! /usr/bin/perl
+###############################################################################
+# InParanoid version 4.1
+# Copyright (C) Erik Sonnhammer, Kristoffer Forslund, Isabella Pekkari, 
+# Ann-Charlotte Berglund, Maido Remm, 2007
+#
+# This program is provided under the terms of a personal license to the recipient and may only 
+# be used for the recipient's own research at an academic insititution.
+#
+# Distribution of the results of this program must be discussed with the authors.
+# For using this program in a company or for commercial purposes, a commercial license is required. 
+# Contact Erik.Sonnhammer@sbc.su.se in both cases
+#
+# Make sure that Perl XML libraries are installed!
+#
+# NOTE: This script requires blastall (NCBI BLAST) version 2.2.16 or higher, that supports 
+# compositional score matrix adjustment (-C2 flag).
+
+my $usage =" Usage: inparanoid.pl <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> [FASTAFILE with sequences of species C]
+
+";
+
+###############################################################################
+# The program calculates orthologs between 2 datasets of proteins 
+# called A and B. Both datasets should be in multi-fasta file
+# - Additionally, it tries to assign paralogous sequences (in-paralogs) to each 
+#   thus forming paralogous clusters. 
+# - Confidence of in-paralogs is calculated in relative scale between 0-100%.
+#   This confidence value is dependent on how far is given sequence from the
+#   seed ortholog of given group
+# - Confidence of groups can be calculated with bootstrapping. This is related 
+#   to score difference between best hit and second best hit.
+# - Optionally it can use a species C as outgroup.
+
+###############################################################################
+# You may need to run the following command manually to increase your 
+# default datasize limit: 'limit datasize 500000 kb'
+
+###############################################################################
+# Set following variables:                                                    #
+###############################################################################
+
+# What do you want the program to do?                                         #
+$run_blast = 1;  # Set to 1 if you don't have the 4 BLAST output files        #
+                 # Requires 'blastall', 'formatdb' (NCBI BLAST2)              #
+                 # and parser 'blast_parser.pl'                               #
+$blast_two_passes = 1;  # Set to 1 to run 2-pass strategy                     #
+                 # (strongly recommended, but slower)                         #
+$run_inparanoid = 1;
+$use_bootstrap = 1;# Use bootstrapping to estimate the confidence of orthologs#
+                   # Needs additional programs 'seqstat.jar' and 'blast2faa.pl'
+$use_outgroup = 0; # Use proteins from the third genome as an outgroup        #
+                   # Reject best-best hit if outgroup sequence is MORE        #
+                   # similar to one of the sequences                          #
+                   # (by more than $outgroup_cutoff bits)                     #
+
+# Define location of files and programs:
+#$blastall = "blastall -VT"; #Remove -VT for blast version 2.2.12 or earlier
+$blastall = "$ARGV[0] -a $ARGV[1]";  #Add -aN to use N processors
+$formatdb = "$ARGV[2]"; 
+$seqstat = "seqstat.jar";
+$blastParser = "blast_parser.pl";
+
+#$matrix = "BLOSUM62"; # Reasonable default for comparison of eukaryotes.
+$matrix = "BLOSUM45"; #(for prokaryotes),
+#$matrix = "BLOSUM80"; #(orthologs within metazoa),
+#$matrix = "PAM70";
+#$matrix = "PAM30";
+
+# Output options:                                                              #
+$output = 0;      # table_stats-format output                                  #
+$table = 0;       # Print tab-delimited table of orthologs to file "table.txt" #
+                  # Each orthologous group with all inparalogs is on one line  #
+$mysql_table = 1; # Print out sql tables for the web server                    #
+                  # Each inparalog is on separate line                         #
+$html = 0;        # HTML-format output                                         #
+
+# Algorithm parameters:                                                        
+# Default values should work without problems.
+# MAKE SURE, however, that the score cutoff here matches what you used for BLAST!
+$score_cutoff = $ARGV[3];    # In bits. Any match below this is ignored             #
+$outgroup_cutoff = 50; # In bits. Outgroup sequence hit must be this many bits#
+                       # stronger to reject best-best hit between A and B     #
+$conf_cutoff = 0.05;   # Include in-paralogs with this confidence or better   #
+$group_overlap_cutoff = 0.5; # Merge groups if ortholog in one group has more #
+                             # than this confidence in other group            #
+$grey_zone = 0;  # This many bits signifies the difference between 2 scores   #
+$show_times = 0; # Show times spent for execution of each part of the program #
+                 # (This does not work properly)                              #
+$debug = 0;      # Print debugging messages or not. Levels 0,1,2 and 4 exist  #
+
+my $seq_overlap_cutoff = $ARGV[4]; 		# Match area should cover at least this much of longer sequence. Match area is defined as area from start of
+					# first segment to end of last segment, i.e segments 1-10 and 90-100 gives a match length of 100.
+my $segment_coverage_cutoff = $ARGV[5]; 	# Actually matching segments must cover this much of longer sequence. 
+					# For example, segments 1-10 and 90-100 gives a total length of 20.
+
+splice(@ARGV,0,6);
+
+###############################################################################
+# No changes should be required below this line                               #
+###############################################################################
+$ENV{CLASSPATH} = "./$seqstat" if ($use_bootstrap);
+
+if (!@ARGV){
+    print STDERR $usage;
+    exit 1;
+}
+
+if ((@ARGV < 2) and ($run_inparanoid)){
+    print STDERR "\n When \$run_inparanoid=1, at least two distinct FASTA files have to be specified.\n";
+
+    print STDERR $usage;
+    exit 1;
+}
+
+if ((!$run_blast) and (!$run_inparanoid)){
+    print STDERR "run_blast or run_inparanoid has to be set!\n";
+    exit 1;
+}
+
+# Input files:                                                            
+$fasta_seq_fileA = "$ARGV[0]";                                            
+$fasta_seq_fileB = "$ARGV[1]";                                            
+$fasta_seq_fileC = "$ARGV[2]" if ($use_outgroup); # This is outgroup file  
+
+my $blast_outputAB = $fasta_seq_fileA . "-" . $fasta_seq_fileB;
+my $blast_outputBA = $fasta_seq_fileB . "-" . $fasta_seq_fileA;
+my $blast_outputAA = $fasta_seq_fileA . "-" . $fasta_seq_fileA;
+my $blast_outputBB = $fasta_seq_fileB . "-" . $fasta_seq_fileB;
+
+if ($use_outgroup){
+    $blast_outputAC = $fasta_seq_fileA . "-" . $fasta_seq_fileC;
+    $blast_outputBC = $fasta_seq_fileB . "-" . $fasta_seq_fileC;
+}
+my %idA;        # Name -> ID combinations for species 1
+my %idB;        # Name -> ID combinations for species 2
+my @nameA;      # ID -> Name combinations for species 1
+my @nameB;      # ID -> Name combinations for species 2
+my @nameC;
+my %scoreAB;    # Hashes with pairwise BLAST scores (in bits)
+my %scoreBA;
+my %scoreAA;
+my %scoreBB;
+my @hitnAB;     # 1-D arrays that keep the number of pairwise hits
+my @hitnBA;
+my @hitnAA;
+my @hitnBB;
+my @hitAB;      # 2-D arrays that keep the actual matching IDs 
+my @hitBA;
+my @hitAA;
+my @hitBB;
+my @besthitAB;  # IDs of best hits in other species (may contain more than one ID)
+my @besthitBA;  # IDs of best hits in other species (may contain more than one ID)
+my @bestscoreAB; # best match A -> B 
+my @bestscoreBA; # best match B -> A 
+my @ortoA;      # IDs of ortholog candidates from species A 
+my @ortoB;      # IDs of ortholog candidates from species B
+my @ortoS;      # Scores between ortoA and ortoB pairs
+my @paralogsA;  # List of paralog IDs in given cluster
+my @paralogsB;  # List of paralog IDs in given cluster
+my @confPA;     # Confidence values for A paralogs
+my @confPB;     # Confidence values for B paralogs
+my @confA;      # Confidence values for orthologous groups
+my @confB;      # Confidence values for orthologous groups 
+my $prev_time = 0;
+
+$outputfile = "Output." . $ARGV[0] . "-" . $ARGV[1];
+if ($output){
+    open OUTPUT, ">$outputfile" or warn "Could not write to OUTPUT file $filename\n";
+}
+
+#################################################
+# Assign ID numbers for species A
+#################################################
+open A, "$fasta_seq_fileA" or die "File A with sequences in FASTA format is missing
+Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n";
+$id = 0;
+while (<A>){
+    if(/^\>/){
+	++$id;
+	chomp;
+	s/\>//;
+	@tmp = split(/\s+/);
+	#$name = substr($tmp[0],0,25);
+	$name = $tmp[0];
+	$idA{$name} = int($id);
+	$nameA[$id] = $name;
+    }
+}
+close A;
+$A = $id;
+print "$A sequences in file $fasta_seq_fileA\n";
+
+if ($output){
+    print OUTPUT "$A sequences in file $fasta_seq_fileA\n";
+}
+
+if (@ARGV >= 2) {
+#################################################
+# Assign ID numbers for species B
+#################################################
+    open B, "$fasta_seq_fileB" or die "File B with sequences in FASTA format is missing
+Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n";
+    $id = 0;
+    while (<B>){
+	if(/^\>/){
+	    ++$id;
+	    chomp;
+	    s/\>//;
+	    @tmp = split(/\s+/);
+	    #$name = substr($tmp[0],0,25);
+	    $name = $tmp[0];
+	    $idB{$name} = int($id);
+	    $nameB[$id] = $name;
+	}
+    }
+    $B = $id;
+    print "$B sequences in file $fasta_seq_fileB\n";
+    close B;
+
+    if ($output){
+	print OUTPUT "$B sequences in file $fasta_seq_fileB\n";
+    }
+}
+#################################################
+# Assign ID numbers for species C (outgroup)
+#################################################
+if ($use_outgroup){
+    open C, "$fasta_seq_fileC" or die "File C with sequences in FASTA format is missing
+   Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n";
+    $id = 0;
+    while (<C>){
+	if(/^\>/){
+	    ++$id;
+	    chomp;
+	    s/\>//;
+	    @tmp = split(/\s+/);
+	    #$name = substr($tmp[0],0,25);
+	    $name = $tmp[0];
+	    $idC{$name} = int($id);
+	    $nameC[$id] = $name;
+	}
+    }
+    $C = $id;
+    print "$C sequences in file $fasta_seq_fileC\n";
+    close C;
+    if ($output){
+	print OUTPUT "$C sequences in file $fasta_seq_fileC\n";
+    }
+}
+if ($show_times){
+    ($user_time,,,) = times;
+    printf ("Indexing sequences took %.2f seconds\n", ($user_time - $prev_time));
+    $prev_time = $user_time;
+}
+
+#################################################
+# Run BLAST if not done already
+#################################################
+if ($run_blast){
+    print "Trying to run BLAST now - this may take several hours ... or days in worst case!\n";
+    print STDERR "Formatting BLAST databases\n";
+    system ("$formatdb -i $fasta_seq_fileA");
+    system ("$formatdb -i $fasta_seq_fileB") if (@ARGV >= 2);
+    system ("$formatdb -i $fasta_seq_fileC") if ($use_outgroup);   
+    print STDERR "Done formatting\nStarting BLAST searches...\n";
+
+# Run blast only if the files do not already exist is not default. 
+# NOTE: you should have done this beforehand, because you probably
+# want two-pass blasting anyway which is not implemented here
+# this is also not adapted to use specific compositional adjustment settings
+# and might not use the proper blast parser...
+
+    do_blast ($fasta_seq_fileA, $fasta_seq_fileA, $A, $A, $blast_outputAA);
+
+    if (@ARGV >= 2) {
+      do_blast ($fasta_seq_fileA, $fasta_seq_fileB, $B, $B, $blast_outputAB);
+      do_blast ($fasta_seq_fileB, $fasta_seq_fileA, $A, $A, $blast_outputBA);
+      do_blast ($fasta_seq_fileB, $fasta_seq_fileB, $B, $B, $blast_outputBB);
+    }
+
+    if ($use_outgroup){
+
+	do_blast ($fasta_seq_fileA, $fasta_seq_fileC, $A, $C, $blast_outputAC);
+	do_blast ($fasta_seq_fileB, $fasta_seq_fileC, $B, $C, $blast_outputBC);
+    }
+
+    if ($show_times){
+	($user_time,,,) = times;
+	printf ("BLAST searches took %.2f seconds\n", ($user_time - $prev_time));
+	$prev_time = $user_time;
+    }
+    print STDERR "Done BLAST searches. ";
+
+} else {
+	print STDERR "Skipping blast! \n";
+}
+
+if ($run_inparanoid){
+    print STDERR "Starting ortholog detection...\n";   
+#################################################
+# Read in best hits from blast output file AB
+#################################################
+    $count = 0;
+    open AB, "$blast_outputAB" or die "Blast output file A->B is missing\n";
+    $old_idQ = 0;
+    while (<AB>){
+	chomp;
+	@Fld = split(/\s+/);    # Get query, match and score
+
+	if( scalar @Fld < 9 ){
+	    if($Fld[0]=~/done/){
+		print STDERR "AB ok\n";
+	    }
+	    next;
+	} 
+
+	$q = $Fld[0];
+	$m = $Fld[1];
+	$idQ = $idA{$q}; # ID of query sequence
+	$idM = $idB{$m}; # ID of match sequence
+	$score = $Fld[2];
+
+	next if (!overlap_test(@Fld));
+
+	# Score must be equal to or above cut-off
+	next if ($score < $score_cutoff);
+
+	if(!$count || $q ne $oldq){
+	    print "Match $m, score $score, ID for $q is missing\n" if ($debug == 2 and !(exists($idA{$q})));
+	    $hitnAB[$idA{$oldq}] = $hit if($count); # Record number of hits for previous query
+	    $hit = 0;
+	    ++$count;
+	    $oldq = $q;
+	}
+	++$hit;
+	$hitAB[$idQ][$hit] = int($idM);
+#	printf ("hitAB[%d][%d] = %d\n",$idQ,$hit,$idM);
+	$scoreAB{"$idQ:$idM"} = $score;
+	$scoreBA{"$idM:$idQ"} = $score_cutoff; # Initialize mutual hit score - sometimes this is below score_cutoff
+	$old_idQ = $idQ;
+#    }
+    }
+    $hitnAB[$idQ] = $hit; # For the last query
+#printf ("hitnAB[1] = %d\n",$hitnAB[1]); 
+#printf ("hitnAB[%d] = %d\n",$idQ,$hit); 
+    close AB;
+    if ($output){
+	print OUTPUT "$count sequences $fasta_seq_fileA have homologs in dataset $fasta_seq_fileB\n";
+    }
+#################################################
+# Read in best hits from blast output file BA
+#################################################
+    $count = 0;
+    open BA, "$blast_outputBA" or die "Blast output file B->A is missing\n";
+    $old_idQ = 0;
+    while (<BA>){
+	chomp;
+	@Fld = split(/\s+/);    # Get query, match and score
+
+	if( scalar @Fld < 9 ){
+	    if($Fld[0]=~/done/){
+		print STDERR "BA ok\n";
+	    }
+	    next;
+	} 
+
+	$q = $Fld[0];
+	$m = $Fld[1];
+	$idQ = $idB{$q};
+	$idM = $idA{$m};	 
+	$score = $Fld[2];
+	
+	next if (!overlap_test(@Fld));
+	
+	next if ($score < $score_cutoff);
+	
+	if(!$count || $q ne $oldq){
+	    print "ID for $q is missing\n" if ($debug == 2 and (!exists($idB{$q})));
+	    $hitnBA[$idB{$oldq}] = $hit if($count); # Record number of hits for previous query
+	    $hit = 0;
+	    ++$count;
+	    $oldq = $q;
+	}
+	++$hit;
+	$hitBA[$idQ][$hit] = int($idM);
+#	printf ("hitBA[%d][%d] = %d\n",$idQ,$hit,$idM);
+	$scoreBA{"$idQ:$idM"} = $score;   
+	$scoreAB{"$idM:$idQ"} = $score_cutoff if ($scoreAB{"$idM:$idQ"} < $score_cutoff); # Initialize missing scores
+	$old_idQ = $idQ;
+#    }
+    }
+    $hitnBA[$idQ] = $hit; # For the last query
+#printf ("hitnBA[%d] = %d\n",$idQ,$hit); 
+    close BA;
+    if ($output){
+	print OUTPUT "$count sequences $fasta_seq_fileB have homologs in dataset $fasta_seq_fileA\n";
+    }
+##################### Equalize AB scores and BA scores ##########################
+
+###################################################################################################################################### Modification by Isabella 1
+
+	# I removed the time consuming all vs all search and equalize scores for all pairs where there was a hit
+
+    	foreach my $key (keys %scoreAB) {
+
+		my ($a, $b) = split(':', $key);
+		my $key2 = $b . ':' . $a;
+
+		# If debugg mod is 5 and the scores A-B and B-A are unequal
+	   	 # the names of the two sequences and their scores are printed
+	    	if ($scoreAB{$key} != $scoreBA{$key2}){
+			printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{$key}, $scoreBA{$key2}) if ($debug == 5);
+	    	}
+
+		# Set score AB and score BA to the mean of scores AB and BA.
+	  	# The final score is saved as an integer so .5 needs to be added to avoid rounding errors
+	    	$scoreAB{$key} = $scoreBA{$key2} = int(($scoreAB{$key} + $scoreBA{$key2})/2.0 +.5);
+	}
+
+    # For all ids for sequences from organism A	
+    #for $a(1..$A){
+	#For all ids for sequences from organism B
+	#for $b(1..$B){
+
+	    # No need to equalize score if there was no match between sequence with id $a from species A
+	    # and sequence with id $b from species B
+	 #   next if (!$scoreAB{"$a:$b"});
+
+	    # If debugg mod is 5 and the scores A-B and B-A are unequal
+	    # the names of the two sequences and their scores are printed
+	  #  if ($scoreAB{"$a:$b"} != $scoreBA{"$b:$a"}){
+	#	printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{"$a:$b"}, $scoreBA{"$b:$a"}) if ($debug == 5);
+	 #   }
+
+	    # Set score AB and score BA to the mean of scores AB and BA.
+	    # The final score is saved as an integer so .5 needs to be added to avoid rounding errors
+	 #   $scoreAB{"$a:$b"} = $scoreBA{"$b:$a"} = int(($scoreAB{"$a:$b"} + $scoreBA{"$b:$a"})/2.0 +.5);
+
+#	printf ("scoreAB{%d: %d} = %d\n",	$a, $b, $scoreAB{"$a:$b"});
+#	printf ("scoreBA{%d: %d} = %d\n",	$b, $a, $scoreBA{"$a:$b"});
+	#}
+#    }
+
+####################################################################################################################################### End modification by Isabella 1
+
+##################### Re-sort hits, besthits and bestscore #######################
+    for $idA(1..$A){
+#    print "Loop index = $idA\n";
+#    printf ("hitnAB[%d] = %d\n",$idA, $hitnAB[$idA]);
+	next if (!($hitnAB[$idA]));
+	for $hit (1..($hitnAB[$idA]-1)){ # Sort hits by score
+	    while($scoreAB{"$idA:$hitAB[$idA][$hit]"} < $scoreAB{"$idA:$hitAB[$idA][$hit+1]"}){ 
+		$tmp = $hitAB[$idA][$hit];
+		$hitAB[$idA][$hit] = $hitAB[$idA][$hit+1];
+		$hitAB[$idA][$hit+1] = $tmp;
+		--$hit if ($hit > 1);      
+	    }
+	}
+	$bestscore = $bestscoreAB[$idA] = $scoreAB{"$idA:$hitAB[$idA][1]"};
+	$besthitAB[$idA] = $hitAB[$idA][1];   
+	for $hit (2..$hitnAB[$idA]){
+	    if ($bestscore - $scoreAB{"$idA:$hitAB[$idA][$hit]"} <= $grey_zone){
+		$besthitAB[$idA] .= " $hitAB[$idA][$hit]";
+	    }
+	    else {
+		last;
+	    }
+	}
+	undef $is_besthitAB[$idA]; # Create index that we can check later
+	grep (vec($is_besthitAB[$idA],$_,1) = 1, split(/ /,$besthitAB[$idA]));
+#    printf ("besthitAB[%d] = hitAB[%d][%d] = %d\n",$idA,$idA,$hit,$besthitAB[$idA]);
+	
+    }
+    
+    for $idB(1..$B){
+#    print "Loop index = $idB\n";
+	next if (!($hitnBA[$idB]));
+	for $hit (1..($hitnBA[$idB]-1)){
+# Sort hits by score
+	    while($scoreBA{"$idB:$hitBA[$idB][$hit]"} < $scoreBA{"$idB:$hitBA[$idB][$hit+1]"}){
+		$tmp = $hitBA[$idB][$hit];
+		$hitBA[$idB][$hit] = $hitBA[$idB][$hit+1];
+		$hitBA[$idB][$hit+1] = $tmp;
+		--$hit if ($hit > 1);
+	    }
+	}
+	$bestscore = $bestscoreBA[$idB] = $scoreBA{"$idB:$hitBA[$idB][1]"};
+	$besthitBA[$idB] = $hitBA[$idB][1];
+	for $hit (2..$hitnBA[$idB]){
+	    if ($bestscore - $scoreBA{"$idB:$hitBA[$idB][$hit]"} <= $grey_zone){
+		$besthitBA[$idB] .= " $hitBA[$idB][$hit]";
+	    }
+	    else {last;}
+	}
+	undef $is_besthitBA[$idB]; # Create index that we can check later
+	grep (vec($is_besthitBA[$idB],$_,1) = 1, split(/ /,$besthitBA[$idB]));
+#    printf ("besthitBA[%d] = %d\n",$idA,$besthitAB[$idA]);
+    }      
+    
+    if ($show_times){
+	($user_time,,,) = times;
+	printf ("Reading and sorting homologs took %.2f seconds\n", ($user_time - $prev_time));
+	$prev_time = $user_time;
+    }
+    
+######################################################
+# Now find orthologs:
+######################################################
+    $o = 0;
+    
+    for $i(1..$A){   # For each ID in file A
+	if (defined $besthitAB[$i]){
+	    @besthits = split(/ /,$besthitAB[$i]);
+	    for $hit (@besthits){
+		if (vec($is_besthitBA[$hit],$i,1)){
+		    ++$o;
+		    $ortoA[$o] = $i;
+		    $ortoB[$o] = $hit;
+		    $ortoS[$o] = $scoreAB{"$i:$hit"}; # Should be equal both ways
+#	    --$o if ($ortoS[$o] == $score_cutoff); # Ignore orthologs that are exactly at score_cutoff
+		    print "Accept! " if ($debug == 2);
+		}
+		else {print "        " if ($debug == 2);}
+		printf ("%-20s\t%d\t%-20s\t", $nameA[$i], $bestscoreAB[$i], $nameB[$hit]) if ($debug == 2);
+		print "$bestscoreBA[$hit]\t$besthitBA[$hit]\n" if ($debug == 2);
+	    }
+	}   
+    }
+    print "$o ortholog candidates detected\n" if ($debug);
+#####################################################
+# Sort orthologs by ID and then by score:
+#####################################################
+
+####################################################################################################### Modification by Isabella 2
+
+    # Removed time consuiming bubble sort. Created an index array and sort that according to id and score.
+    # The put all clusters on the right place.
+
+    # Create an array used to store the position each element shall have in the final array
+    # The elements are initialized with the position numbers
+    my @position_index_array = (1..$o);
+
+    # Sort the position list according to id
+    my @id_sorted_position_list = sort { ($ortoA[$a]+$ortoB[$a]) <=> ($ortoA[$b] + $ortoB[$b]) } @position_index_array;
+
+    # Sort the list according to score
+    my @score_id_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @id_sorted_position_list;
+
+    # Create new arrays for the sorted information
+    my @new_ortoA;
+    my @new_ortoB;
+    my @new_orthoS;
+
+   # Add the information to the new arrays in the orer specifeid by the index array
+   for (my $index_in_list = 0; $index_in_list < scalar @score_id_sorted_position_list; $index_in_list++) {
+	
+
+	my $old_index = $score_id_sorted_position_list[$index_in_list];
+	$new_ortoA[$index_in_list + 1] = $ortoA[$old_index];
+	$new_ortoB[$index_in_list + 1] = $ortoB[$old_index];
+	$new_ortoS[$index_in_list + 1] = $ortoS[$old_index];
+   }
+
+    @ortoA = @new_ortoA;
+    @ortoB = @new_ortoB;
+    @ortoS = @new_ortoS;
+
+    # Use bubblesort to sort ortholog pairs by id
+#    for $i(1..($o-1)){ 
+#	while(($ortoA[$i]+$ortoB[$i]) > ($ortoA[$i+1] + $ortoB[$i+1])){
+#	    $tempA =  $ortoA[$i];
+#	    $tempB =  $ortoB[$i];
+#	    $tempS =  $ortoS[$i];
+#	    
+#	    $ortoA[$i] = $ortoA[$i+1];
+#	    $ortoB[$i] = $ortoB[$i+1];
+#	    $ortoS[$i] = $ortoS[$i+1];
+#	    
+#	    $ortoA[$i+1] = $tempA;
+#	    $ortoB[$i+1] = $tempB;
+#	    $ortoS[$i+1] = $tempS;
+#	    
+#	    --$i if ($i > 1);
+#	}
+#    }
+#
+#    # Use bubblesort to sort ortholog pairs by score
+#    for $i(1..($o-1)){
+#	while($ortoS[$i] < $ortoS[$i+1]){
+#	    # Swap places:
+#	    $tempA =  $ortoA[$i];
+#	    $tempB =  $ortoB[$i];
+#	    $tempS =  $ortoS[$i];
+#	    
+#	    $ortoA[$i] = $ortoA[$i+1];
+#	    $ortoB[$i] = $ortoB[$i+1];
+#	    $ortoS[$i] = $ortoS[$i+1];
+#	    
+#	    $ortoA[$i+1] = $tempA;
+#	    $ortoB[$i+1] = $tempB;
+#	    $ortoS[$i+1] = $tempS;
+#	    
+#	    --$i if ($i > 1);
+#	}
+#    }
+
+###################################################################################################### End modification bt Isabella 2
+
+    @all_ortologsA = ();
+    @all_ortologsB = ();
+    for $i(1..$o){
+	push(@all_ortologsA,$ortoA[$i]); # List of all orthologs
+	push(@all_ortologsB,$ortoB[$i]);
+    }
+    undef $is_ortologA; # Create index that we can check later
+    undef $is_ortologB;
+    grep (vec($is_ortologA,$_,1) = 1, @all_ortologsA);
+    grep (vec($is_ortologB,$_,1) = 1, @all_ortologsB);
+    
+    if ($show_times){
+	($user_time,,,) = times;
+	printf ("Finding and sorting orthologs took %.2f seconds\n", ($user_time - $prev_time));
+	$prev_time = $user_time;
+    }
+#################################################
+# Read in best hits from blast output file AC
+#################################################
+    if ($use_outgroup){
+	$count = 0;
+	open AC, "$blast_outputAC" or die "Blast output file A->C is missing\n";
+	while (<AC>){
+	    chomp;
+	    @Fld = split(/\s+/);    # Get query, match and score
+
+            if( scalar @Fld < 9 ){
+	       if($Fld[0]=~/done/){
+		   print STDERR "AC ok\n";
+   	       }
+	       next;
+            } 
+
+	    $q = $Fld[0];
+	    $m = $Fld[1];
+	    $idQ = $idA{$q};
+	    $idM = $idC{$m};
+	    $score = $Fld[2];
+	    next unless (vec($is_ortologA,$idQ,1));
+	    
+	    next if (!overlap_test(@Fld));
+
+	    next if ($score < $score_cutoff);
+
+	    next if ($count and ($q eq $oldq));
+	    # Only comes here if this is the best hit:
+	    $besthitAC[$idQ] = $idM; 
+	    $bestscoreAC[$idQ] = $score;
+	    $oldq = $q;
+	    ++$count;
+	}
+	close AC;
+#################################################
+# Read in best hits from blast output file BC
+#################################################
+	$count = 0;
+	open BC, "$blast_outputBC" or die "Blast output file B->C is missing\n";
+	while (<BC>){
+	    chomp;
+	    @Fld = split(/\s+/);    # Get query, match and score
+
+            if( scalar @Fld < 9 ){
+	       if($Fld[0]=~/done/){
+		   print STDERR "BC ok\n";
+   	       }
+	       next;
+            } 
+
+	    $q = $Fld[0];
+	    $m = $Fld[1];
+	    $idQ = $idB{$q};
+	    $idM = $idC{$m};
+	    $score = $Fld[2];
+	    next unless (vec($is_ortologB,$idQ,1));
+	    
+	    next if (!overlap_test(@Fld));
+
+	    next if ($score < $score_cutoff);
+
+	    next if ($count and ($q eq $oldq));
+	    # Only comes here if this is the best hit:
+	    $besthitBC[$idQ] = $idM;
+	    $bestscoreBC[$idQ] = $score;
+	    $oldq = $q;
+	    ++$count;
+	}
+	close BC;
+################################
+# Detect rooting problems
+################################
+	$rejected = 0;
+	@del = ();
+	$file = "rejected_sequences." . $fasta_seq_fileC;
+	open OUTGR, ">$file";
+	for $i (1..$o){
+	    $diff1 = $diff2 = 0;
+	    $idA = $ortoA[$i];
+	    $idB = $ortoB[$i];
+	    $diff1 = $bestscoreAC[$idA] - $ortoS[$i];
+	    $diff2 = $bestscoreBC[$idB] - $ortoS[$i];
+	    if ($diff1 > $outgroup_cutoff){
+		print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]). 
+   $nameA[$idA] from $fasta_seq_fileA is closer to $nameC[$besthitAC[$idA]] than to $nameB[$idB]\n";
+		print OUTGR "   $ortoS[$i] < $bestscoreAC[$idA] by $diff1\n";
+	    }
+	    if ($diff2 > $outgroup_cutoff){
+		print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]). 
+   $nameB[$idB] from $fasta_seq_fileB is closer to $nameC[$besthitBC[$idB]] than to $nameA[$idA]\n";
+		print OUTGR "   $ortoS[$i] < $bestscoreBC[$idB] by $diff2\n";
+	    }
+	    if (($diff1 > $outgroup_cutoff) or ($diff2 > $outgroup_cutoff)){
+		++$rejected;
+		$del[$i] = 1;
+	    }
+	}
+	print "Number of rejected groups: $rejected (outgroup sequence was closer by more than $outgroup_cutoff bits)\n";
+	close OUTGR;
+    } # End of $use_outgroup
+################################
+# Read inside scores from AA
+################################
+    $count = 0;
+    $max_hit = 0;
+    open AA, "$blast_outputAA" or die "Blast output file A->A is missing\n";
+    while (<AA>) {
+	chomp;                  # strip newline
+	
+	@Fld = split(/\s+/);    # Get query and match names
+
+	if( scalar @Fld < 9 ){
+	    if($Fld[0]=~/done/){
+		print STDERR "AA ok\n";
+	    }
+	    next;
+	} 
+
+	$q = $Fld[0];
+	$m = $Fld[1];
+	$score = $Fld[2];
+	next unless (vec($is_ortologA,$idA{$q},1));
+	
+	next if (!overlap_test(@Fld));
+
+	next if ($score < $score_cutoff);
+
+	if(!$count || $q ne $oldq){ # New query 
+	    $max_hit = $hit if ($hit > $max_hit);
+	    $hit = 0;
+	    $oldq = $q;
+	}   
+	++$hit;
+	++$count;
+	$scoreAA{"$idA{$q}:$idA{$m}"}  = int($score + 0.5);
+	$hitAA[$idA{$q}][$hit] = int($idA{$m});
+	$hitnAA[$idA{$q}] = $hit;
+    }
+    close AA;
+    if ($output){
+	print OUTPUT "$count $fasta_seq_fileA-$fasta_seq_fileA matches\n";
+    }
+################################
+# Read inside scores from BB
+################################
+    $count = 0;
+    open BB, "$blast_outputBB" or die "Blast output file B->B is missing\n";
+    while (<BB>) {
+	chomp;                  # strip newline
+	
+	@Fld = split(/\s+/);    # Get query and match names
+
+	if( scalar @Fld < 9 ){
+	    if($Fld[0]=~/done/){
+		print STDERR "BB ok\n";
+	    }
+	    next;
+	} 
+
+	$q = $Fld[0];
+	$m = $Fld[1];
+	$score = $Fld[2];
+	next unless (vec($is_ortologB,$idB{$q},1));
+	
+	next if (!overlap_test(@Fld));
+
+	next if ($score < $score_cutoff);
+	
+	if(!$count || $q ne $oldq){ # New query 
+	    $max_hit = $hit if ($hit > $max_hit);
+	    $oldq = $q;
+	    $hit = 0;
+	}
+	++$count;
+	++$hit;
+	$scoreBB{"$idB{$q}:$idB{$m}"} = int($score + 0.5);
+	$hitBB[$idB{$q}][$hit] = int($idB{$m});
+	$hitnBB[$idB{$q}] = $hit;
+    }
+    close BB;
+    if ($output){
+	print OUTPUT "$count $fasta_seq_fileB-$fasta_seq_fileB matches\n";
+    }
+    if ($show_times){
+	($user_time,,,) = times;
+	printf ("Reading paralogous hits took %.2f seconds\n", ($user_time - $prev_time));
+	$prev_time = $user_time;
+    }
+    print "Maximum number of hits per sequence was $max_hit\n" if ($debug);
+#####################################################
+# Find paralogs:
+#####################################################
+    for $i(1..$o){
+	$merge[$i] = 0;
+	next if($del[$i]); # If outgroup species was closer to one of the seed orthologs
+	$idA = $ortoA[$i];
+	$idB = $ortoB[$i];
+	local @membersA = ();
+	local @membersB = ();
+	
+	undef $is_paralogA[$i];
+	undef $is_paralogB[$i];
+	
+	print "$i: Ortholog pair $nameA[$idA] and $nameB[$idB]. $hitnAA[$idA] hits for A and $hitnBB[$idB] hits for B\n"  if ($debug);
+	# Check if current ortholog is already clustered:
+	for $j(1..($i-1)){
+	    # Overlap type 1: Both orthologs already clustered here -> merge  
+	    if ((vec($is_paralogA[$j],$idA,1)) and (vec($is_paralogB[$j],$idB,1))){
+		$merge[$i] = $j;
+		print "Merge CASE 1: group $i ($nameB[$idB]-$nameA[$idA]) and $j ($nameB[$ortoB[$j]]-$nameA[$ortoA[$j]])\n" if ($debug);
+		last;
+	    }
+	    # Overlap type 2: 2 competing ortholog pairs -> merge
+	    elsif (($ortoS[$j] - $ortoS[$i] <= $grey_zone)
+		   and (($ortoA[$j] == $ortoA[$i]) or ($ortoB[$j] == $ortoB[$i]))
+#       and ($paralogsA[$j])
+		   ){ # The last condition is false if the previous cluster has been already deleted
+		$merge[$i] = $j;
+		print "Merge CASE 2: group $i ($nameA[$ortoA[$i]]-$nameB[$ortoB[$i]]) and $j ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug);
+		last;
+	    }
+	    # Overlap type 3: DELETE One of the orthologs belongs to some much stronger cluster -> delete
+	    elsif (((vec($is_paralogA[$j],$idA,1)) or (vec($is_paralogB[$j],$idB,1))) and
+		   ($ortoS[$j] - $ortoS[$i] > $score_cutoff)){
+		print "Delete CASE 3: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug);
+		$merge[$i]= -1; # Means - do not add sequences to this cluster
+		$paralogsA[$i] = "";
+		$paralogsB[$i] = "";
+		last;
+	    }
+	    # Overlap type 4: One of the orthologs is close to the center of other cluster
+	    elsif (((vec($is_paralogA[$j],$idA,1)) and ($confPA[$idA] > $group_overlap_cutoff)) or
+		   ((vec($is_paralogB[$j],$idB,1)) and ($confPB[$idB] > $group_overlap_cutoff))){
+		print "Merge CASE 4: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug);
+		$merge[$i] = $j;
+		last;
+	    }
+	    # Overlap type 5:
+	    # All clusters that were overlapping, but not catched by previous "if" statements will be DIVIDED!
+	} 
+	next if ($merge[$i] < 0); # This cluster should be deleted
+##### Check for paralogs in A
+	$N = $hitnAA[$idA];
+	for $j(1..$N){
+	    $hitID = $hitAA[$idA][$j]; # hit of idA 
+#      print "Working with $nameA[$hitID]\n" if ($debug == 2);
+	    # Decide whether this hit is inside the paralog circle:
+	    if ( ($idA == $hitID) or ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$idA]) and 
+		($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$hitID])){
+		if ($debug == 2){
+		    print "   Paralog candidates: ";
+		    printf ("%-20s: %-20s", $nameA[$idA], $nameA[$hitID]);
+		    print "\t$scoreAA{\"$idA:$hitID\"} : $bestscoreAB[$idA] : $bestscoreAB[$hitID]\n";
+		}
+		$paralogs = 1;
+		if ($scoreAA{"$idA:$idA"} == $ortoS[$i]){
+		    if ($scoreAA{"$idA:$hitID"} == $scoreAA{"$idA:$idA"}){
+			$conf_here = 1.0; # In the center
+		    }
+		    else{
+			$conf_here = 0.0; # On the border
+		    }
+		}
+		else {
+		    $conf_here = ($scoreAA{"$idA:$hitID"} - $ortoS[$i]) /
+			($scoreAA{"$idA:$idA"} - $ortoS[$i]);
+		}
+		# Check if this paralog candidate is already clustered in other clusters
+		for $k(1..($i-1)){ 
+		    if (vec($is_paralogA[$k],$hitID,1)){ # Yes, found in cluster $k
+			if($debug == 2){
+			    print "      $nameA[$hitID] is already in cluster $k, together with:";
+			    print " $nameA[$ortoA[$k]] and $nameB[$ortoB[$k]] ";
+			    print "($scoreAA{\"$ortoA[$k]:$hitID\"})";         
+			}
+			if (($confPA[$hitID] >= $conf_here) and 
+			    ($j != 1)){ # The seed ortholog CAN NOT remain there
+			    print " and remains there.\n" if ($debug == 2);
+			    $paralogs = 0; # No action
+			}
+			else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k 
+			    print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster
+			    @membersAK = split(/ /, $paralogsA[$k]); # This array contains IDs
+			    $paralogsA[$k] = "";# Remove all paralogs from cluster $k
+				@tmp = ();
+			    for $m(@membersAK){   
+				push(@tmp,$m) if ($m != $hitID); # Put other members back  
+			    }  
+			    $paralogsA[$k] = join(' ',@tmp);
+			    undef $is_paralogA[$k]; # Create index that we can check later
+			    grep (vec($is_paralogA[$k],$_,1) = 1, @tmp);
+			}
+			last;
+		    }
+		}
+		next if (! $paralogs); # Skip that paralog - it is already in cluster $k
+		push (@membersA,$hitID); # Add this hit to paralogs of A 
+	    } 
+	}
+	# Calculate confidence values now:
+	@tmp = ();
+	for $idP (@membersA){ # For each paralog calculate conf value
+	    if($scoreAA{"$idA:$idA"} == $ortoS[$i]){
+		if ($scoreAA{"$idA:$idP"} == $scoreAA{"$idA:$idA"}){
+		    $confPA[$idP] = 1.00;
+		}
+		else{ 
+		    $confPA[$idP] = 0.00;
+		}
+	    }
+	    else{ 
+		$confPA[$idP] = ($scoreAA{"$idA:$idP"} - $ortoS[$i]) / 
+		    ($scoreAA{"$idA:$idA"} - $ortoS[$i]);
+	    }
+	    push (@tmp, $idP) if ($confPA[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs
+	}
+	@membersA = @tmp;
+	########### Merge if necessary:
+	if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster
+	    @tmp = split(/ /,$paralogsA[$merge[$i]]);      
+	    for $m (@membersA){
+		push (@tmp, $m)  unless (vec($is_paralogA[$merge[$i]],$m,1));
+	    }
+	    $paralogsA[$merge[$i]] = join(' ',@tmp);
+	    undef $is_paralogA[$merge[$i]];
+	    grep (vec($is_paralogA[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array
+	}
+	######### Typical new cluster:
+	else{  # Create a new cluster
+	    $paralogsA[$i] = join(' ',@membersA);
+	    undef $is_paralogA; # Create index that we can check later
+	    grep (vec($is_paralogA[$i],$_,1) = 1, @membersA);
+	}  
+##### The same procedure for species B:   
+	$N = $hitnBB[$idB];
+	for $j(1..$N){
+	    $hitID = $hitBB[$idB][$j];
+#      print "Working with $nameB[$hitID]\n" if ($debug == 2);
+	    if ( ($idB == $hitID) or ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$idB]) and 
+		($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$hitID])){
+		if ($debug == 2){
+		    print "   Paralog candidates: ";
+		    printf ("%-20s: %-20s", $nameB[$idB], $nameB[$hitID]);
+		    print "\t$scoreBB{\"$idB:$hitID\"} : ";
+		    print "$bestscoreBA[$idB] : $bestscoreBA[$hitID]\n";
+		}         
+		$paralogs = 1;
+		if ($scoreBB{"$idB:$idB"} == $ortoS[$i]){
+		    if ($scoreBB{"$idB:$hitID"} == $scoreBB{"$idB:$idB"}){
+			$conf_here = 1.0;
+		    }
+		    else{
+			$conf_here = 0.0;
+		    }
+		}
+		else{
+		    $conf_here = ($scoreBB{"$idB:$hitID"} - $ortoS[$i]) /
+			($scoreBB{"$idB:$idB"} - $ortoS[$i]);
+		}
+		
+		# Check if this paralog candidate is already clustered in other clusters
+		for $k(1..($i-1)){ 
+		    if (vec($is_paralogB[$k],$hitID,1)){ # Yes, found in cluster $k
+			if($debug == 2){
+			    print "      $nameB[$hitID] is already in cluster $k, together with:";
+			    print " $nameB[$ortoB[$k]] and $nameA[$ortoA[$k]] ";
+			    print "($scoreBB{\"$ortoB[$k]:$hitID\"})";
+			}
+			if (($confPB[$hitID] >= $conf_here) and
+			    ($j != 1)){ # The seed ortholog CAN NOT remain there
+			    print " and remains there.\n" if ($debug == 2);
+			    $paralogs = 0; # No action
+			}
+			else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k 
+			    print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster
+			    @membersBK = split(/ /, $paralogsB[$k]); # This array contains names, not IDs
+			    $paralogsB[$k] = "";
+			    @tmp = ();
+			    for $m(@membersBK){   
+				push(@tmp,$m) if ($m != $hitID); # Put other members back  
+			    }  
+			    $paralogsB[$k] = join(' ',@tmp);
+			    undef $is_paralogB[$k]; # Create index that we can check later
+			    grep (vec($is_paralogB[$k],$_,1) = 1, @tmp);
+			}
+			last; # Don't search in other clusters
+		    }
+		}
+		next if (! $paralogs); # Skip that paralog - it is already in cluster $k
+		push (@membersB,$hitID);
+	    }
+	}
+	# Calculate confidence values now:
+	@tmp = ();
+	for $idP (@membersB){ # For each paralog calculate conf value
+	    if($scoreBB{"$idB:$idB"} == $ortoS[$i]){
+		if ($scoreBB{"$idB:$idP"} == $scoreBB{"$idB:$idB"}){
+		    $confPB[$idP] = 1.0;
+		}
+		else{
+		    $confPB[$idP] = 0.0;
+		}
+	    }   
+	    else{
+		$confPB[$idP] = ($scoreBB{"$idB:$idP"} - $ortoS[$i]) / 
+		    ($scoreBB{"$idB:$idB"} - $ortoS[$i]);
+	    }
+	    push (@tmp, $idP) if ($confPB[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs
+	}   
+	@membersB = @tmp;
+	########### Merge if necessary:
+	if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster
+	    @tmp = split(/ /,$paralogsB[$merge[$i]]);      
+	    for $m (@membersB){
+		push (@tmp, $m)  unless (vec($is_paralogB[$merge[$i]],$m,1));
+	    }
+	    $paralogsB[$merge[$i]] = join(' ',@tmp);
+	    undef $is_paralogB[$merge[$i]];
+	    grep (vec($is_paralogB[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array
+	}
+	######### Typical new cluster:
+	else{  # Create a new cluster
+	    $paralogsB[$i] = join(' ',@membersB);
+	    undef $is_paralogB; # Create index that we can check later
+	    grep (vec($is_paralogB[$i],$_,1) = 1, @membersB);
+	}
+    }
+    if ($show_times){
+	($user_time,,,) = times;
+	printf ("Finding in-paralogs took %.2f seconds\n", ($user_time - $prev_time));
+	$prev_time = $user_time;
+    }
+#####################################################
+    &clean_up(1);
+####################################################
+# Find group for orphans. If cluster contains only one member, find where it should go:
+    for $i (1..$o){ 
+	@membersA = split(/ /, $paralogsA[$i]);
+	@membersB = split(/ /, $paralogsB[$i]);
+	$na = @membersA;
+	$nb = @membersB;
+	if (($na == 0) and $nb){
+	    print "Warning: empty A cluster $i\n";
+	    for $m (@membersB){
+		$bestscore = 0;
+		$bestgroup = 0;
+		$bestmatch = 0;
+		for $j (1..$o) {
+		    next if ($i == $j); # Really need to check against all 100% members of the group.
+		    @membersBJ = split(/ /, $paralogsB[$j]);
+		    for $k (@membersBJ){
+			next if ($confPB[$k] != 1); # For all 100% in-paralogs
+			$score = $scoreBB{"$m:$k"};
+			if ($score > $bestscore){
+			    $bestscore = $score;
+			    $bestgroup = $j;
+			    $bestmatch = $k;
+			}
+		    }
+		}
+		print "Orphan $nameB[$m] goes to group $bestgroup with $nameB[$bestmatch]\n" ;
+		@members = split(/ /, $paralogsB[$bestgroup]);
+		push (@members, $m);
+		$paralogsB[$bestgroup] = join(' ',@members);
+		$paralogsB[$i] = "";
+		undef $is_paralogB[$bestgroup];
+		undef $is_paralogB[$i];
+		grep (vec($is_paralogB[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array
+#		 grep (vec($is_paralogB[$i],$_,1) = 1, ());
+	    }
+	}
+	if ($na and ($nb == 0)){
+	    print "Warning: empty B cluster $i\n";
+	    for $m (@membersA){	  
+		$bestscore = 0;
+		$bestgroup = 0;
+		$bestmatch = 0;
+		for $j (1..$o) {
+		    next if ($i == $j);
+		    @membersAJ = split(/ /, $paralogsA[$j]);
+		    for $k (@membersAJ){
+			next if ($confPA[$k] != 1); # For all 100% in-paralogs
+			$score = $scoreAA{"$m:$k"};
+			if ($score > $bestscore){
+			    $bestscore = $score;
+			    $bestgroup = $j;
+			    $bestmatch = $k;
+			}
+		    }
+		}
+		print "Orphan $nameA[$m] goes to group $bestgroup with $nameA[$bestmatch]\n";
+		@members = split(/ /, $paralogsA[$bestgroup]);
+		push (@members, $m);
+		$paralogsA[$bestgroup] = join(' ',@members);
+		$paralogsA[$i] = "";
+		undef $is_paralogA[$bestgroup];
+		undef $is_paralogA[$i];
+		grep (vec($is_paralogA[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array
+#	     grep (vec($is_paralogA[$i],$_,1) = 1, ());
+	    }
+	}
+    }
+    
+    &clean_up(1);
+###################
+    $htmlfile = "orthologs." . $ARGV[0] . "-" . $ARGV[1] . ".html";
+    if ($html){
+	open HTML, ">$htmlfile" or warn "Could not write to HTML file $filename\n";
+    }
+    
+    
+    if ($output){
+	print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
+	print OUTPUT "$o groups of orthologs\n";
+	print OUTPUT "$totalA in-paralogs from $fasta_seq_fileA\n";
+	print OUTPUT "$totalB in-paralogs from $fasta_seq_fileB\n";
+	print OUTPUT "Grey zone $grey_zone bits\n";
+	print OUTPUT "Score cutoff $score_cutoff bits\n";
+	print OUTPUT "In-paralogs with confidence less than $conf_cutoff not shown\n";
+	print OUTPUT "Sequence overlap cutoff $seq_overlap_cutoff\n";
+	print OUTPUT "Group merging cutoff $group_overlap_cutoff\n";
+	print OUTPUT "Scoring matrix $matrix\n"; 
+	print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
+    }
+    if ($html){
+	print HTML "<pre>\n";
+	print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
+	print HTML "$o groups of orthologs\n";
+	print HTML "$totalA in-paralogs from $fasta_seq_fileA\n";
+	print HTML "$totalB in-paralogs from $fasta_seq_fileB\n";
+	print HTML "Grey zone $grey_zone bits\n";
+	print HTML "Score cutoff $score_cutoff bits\n";
+	print HTML "In-paralogs with confidence less than $conf_cutoff not shown\n";
+	print HTML "Sequence overlap cutoff $seq_overlap_cutoff\n";
+	print HTML "Group merging cutoff $group_overlap_cutoff\n";
+	print HTML "Scoring matrix $matrix\n";
+	print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
+    }
+# ##############################################################################
+# Check for alternative orthologs, sort paralogs by confidence and print results
+# ##############################################################################
+    if ($use_bootstrap and $debug){
+	open FF, ">BS_vs_bits" or warn "Could not write to file BS_vs_bits\n";
+    }
+    for $i(1..$o){
+	@membersA = split(/ /, $paralogsA[$i]);
+	@membersB = split(/ /, $paralogsB[$i]);		 
+	$message = "";
+	$htmlmessage = "";
+	
+	$idB = $ortoB[$i];
+	$nB = $hitnBA[$idB];
+	for $idA(@membersA){
+	    next if ($confPA[$idA] != 1.0);
+	    $nA = $hitnAB[$idA];
+	    $confA[$i] = $ortoS[$i]; # default
+	    $bsA[$idA] = 1.0;
+	    ##############
+	    for $j(1..$nB){
+		$idH = $hitBA[$idB][$j];
+		################ Some checks for alternative orthologs:
+		# 1. Don't consider sequences that are already in this cluster
+		next if (vec($is_paralogA[$i],$idH,1));
+		next if ($confPA[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog
+		
+		# 2. Check if candidate for alternative ortholog is already clustered in stronger clusters
+		$in_other_cluster = 0;
+		for $k(1..($i-1)){ # Check if current ortholog is already clustered
+		    if (vec($is_paralogA[$k],$idH,1)){
+			$in_other_cluster = $k;
+			last;
+		    }
+		}
+#		 next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog 
+		
+		# 3. The best hit of candidate ortholog should be ortoA or at least to belong into this cluster
+		@besthits = split (/ /,$besthitAB[$idH]);
+		$this_family = 0;
+		for $bh (@besthits){
+		    $this_family = 1 if ($idB == $bh);
+		}
+#		 next unless ($this_family); # There was an alternative BA match but it's best match did not belong here
+		################# Done with checks - if sequence passed, then it could be an alternative ortholog
+		$confA[$i] = $ortoS[$i] - $scoreBA{"$idB:$idH"};
+		if ($use_bootstrap){ 
+		    if ($confA[$i] < $ortoS[$i]){ # Danger zone - check for bootstrap
+			$bsA[$idA] = &bootstrap($fasta_seq_fileB,$idB,$idA,$idH);
+		    }
+		    else { 
+			$bsA[$idA] = 1.0;
+		    }
+		}
+		last;
+	    }
+	    $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameA[$idA], 100*$bsA[$idA]);
+	    $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75);
+	    $message .= sprintf("\n");
+	    if ($html){
+		if ($bsA[$idA] < 0.75){
+		    $htmlmessage .= sprintf("<font color=\"red\">"); 			
+		}
+		elsif ($bsA[$idA] < 0.95){
+		    $htmlmessage .= sprintf("<font color=\"\#FFCC00\">");
+		}  
+		else {
+		    $htmlmessage .= sprintf("<font color=\"green\">");
+		}  
+		$htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameA[$idA], 100*$bsA[$idA]);
+		$htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75);
+		$htmlmessage .= sprintf("</font>");
+	    } 
+	    printf (FF "%s\t%d\t%d\n", $nameA[$idA], $confA[$i], 100*$bsA[$idA]) if ($use_bootstrap and $debug); 
+	}
+	########
+	$idA = $ortoA[$i];
+	$nA = $hitnAB[$idA];
+	for $idB(@membersB){
+	    next if ($confPB[$idB] != 1.0);
+	    $nB = $hitnBA[$idB];
+	    $confB[$i] = $ortoS[$i]; # default
+	    $bsB[$idB] = 1.0;
+	    
+	    for $j(1..$nA){ # For all AB hits of given ortholog
+		$idH = $hitAB[$idA][$j];
+		# ############### Some checks for alternative orthologs:
+		# 1. Don't consider sequences that are already in this cluster
+		next if (vec($is_paralogB[$i],$idH,1));
+		next if ($confPB[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog
+		
+		# 2. Check if candidate for alternative ortholog is already clustered in stronger clusters
+		$in_other_cluster = 0;
+		for $k(1..($i-1)){
+		    if (vec($is_paralogB[$k],$idH,1)){
+			$in_other_cluster = $k;
+			last; # out from this cycle
+		    }
+		}
+#		 next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog 
+		
+		# 3. The best hit of candidate ortholog should be ortoA
+		@besthits = split (/ /,$besthitBA[$idH]);
+		$this_family = 0;
+		for $bh (@besthits){
+		    $this_family = 1 if ($idA == $bh);
+		}
+#		 next unless ($this_family); # There was an alternative BA match but it's best match did not belong here
+		# ################ Done with checks - if sequence passed, then it could be an alternative ortholog
+		$confB[$i] = $ortoS[$i] - $scoreAB{"$idA:$idH"};
+		if ($use_bootstrap){
+		    if ($confB[$i] < $ortoS[$i]){
+			$bsB[$idB] = &bootstrap($fasta_seq_fileA,$idA,$idB,$idH);
+		    }
+		    else {
+			$bsB[$idB] = 1.0;
+		    }
+		}
+		last;
+	    }
+	    $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameB[$idB], 100*$bsB[$idB]);
+	    $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75);
+	    $message .= sprintf("\n");
+	    if ($html){
+		if ($bsB[$idB] < 0.75){
+		    $htmlmessage .= sprintf("<font color=\"red\">");
+		}      
+		elsif ($bsB[$idB] < 0.95){
+		    $htmlmessage .= sprintf("<font color=\"\#FFCC00\">");
+		}
+		else {
+		    $htmlmessage .= sprintf("<font color=\"green\">");
+		}
+		$htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameB[$idB], 100*$bsB[$idB]);
+		$htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n",  $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75);
+		$htmlmessage .= sprintf("</font>");
+	    }
+	    printf (FF "%s\t%d\t%d\n", $nameB[$idB], $confB[$i], 100*$bsB[$idB]) if ($use_bootstrap and $debug);
+	}		
+	close FF;
+	########### Print header ###############
+	if ($output){
+	    print OUTPUT "___________________________________________________________________________________\n";
+	    print OUTPUT "Group of orthologs #" . $i .". Best score $ortoS[$i] bits\n";
+		print OUTPUT "Score difference with first non-orthologous sequence - ";
+	    printf (OUTPUT "%s:%d   %s:%d\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]);
+	}
+	
+	if ($html){
+	    print HTML "</pre>\n";
+	    print HTML "<hr WIDTH=\"100%\">";
+	    print HTML "<h3>";
+	    print HTML "Group of orthologs #" . $i .". Best score $ortoS[$i] bits<br>\n";
+		print HTML "Score difference with first non-orthologous sequence - ";
+	    printf (HTML "%s:%d   %s:%d</h3><pre>\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]);			
+	}
+	########### Sort and print members of A ############
+	$nA = @membersA;
+	$nB = @membersB;
+	$nMAX = ($nA > $nB) ? $nA : $nB;
+	# Sort membersA inside the cluster by confidence:
+	for $m (0..($nA-1)){
+	    while($confPA[$membersA[$m]] < $confPA[$membersA[$m+1]]){
+		$temp = $membersA[$m];
+		$membersA[$m] = $membersA[$m+1];
+		$membersA[$m+1] = $temp;
+		--$m if ($m > 1);
+	    }
+	}
+	$paralogsA[$i] = join(' ',@membersA); # Put them back together
+	# Sort membersB inside the cluster by confidence:
+	for $m (0..($nB-1)){
+	    while($confPB[$membersB[$m]] < $confPB[$membersB[$m+1]]){
+		$temp = $membersB[$m];
+		$membersB[$m] = $membersB[$m+1];
+		$membersB[$m+1] = $temp;
+		--$m if ($m > 1);
+	    }
+	}   
+	$paralogsB[$i] = join(' ',@membersB); # Put them back together
+	# Print to text file and to HTML file
+	for $m (0..($nMAX-1)){
+	    if ($m < $nA){
+		if ($output){
+		    printf (OUTPUT "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]]));
+		}
+		if ($html){
+		    print HTML "<B>" if ($confPA[$membersA[$m]] == 1);
+		    printf (HTML "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]]));
+		    print HTML "</B>" if ($confPA[$membersA[$m]] == 1);
+		}
+	    }
+	    else {
+		printf (OUTPUT "%-20s\t%-7s\t\t", "                    ", "       ");
+		printf (HTML "%-20s\t%-7s\t\t", "                    ", "       ") if ($html);
+	    }
+	    if ($m < $nB){
+		if ($output){
+		    printf (OUTPUT "%-20s\t%.2f%%\n", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]]));
+		}
+		if ($html){
+		    print HTML "<B>" if ($confPB[$membersB[$m]] == 1);
+		    printf (HTML "%-20s\t%.2f%%", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]]));
+		    print HTML "</B>" if ($confPB[$membersB[$m]] == 1);
+		    print HTML "\n";
+		}
+	    }
+	    else {
+		printf (OUTPUT "%-20s\t%-7s\n", "                    ", "       ") if($output);
+		print HTML "\n" if ($html);
+	    }
+	}
+	print OUTPUT $message if ($use_bootstrap and $output);
+	print HTML "$htmlmessage" if ($use_bootstrap and $html);
+    }
+    if ($output) {
+      close OUTPUT;
+      print "Output saved to file $outputfile\n";
+    }
+    if ($html){
+      close HTML;
+      print "HTML output saved to $htmlfile\n";
+    }
+    
+    if ($table){
+	$filename = "table." . $ARGV[0] . "-" . $ARGV[1];
+	open F, ">$filename" or die;
+	print F "OrtoID\tScore\tOrtoA\tOrtoB\n";
+	for $i(1..$o){
+	    print F "$i\t$ortoS[$i]\t";
+	    @members = split(/ /, $paralogsA[$i]);
+	    for $m (@members){
+		$m =~ s/://g;
+		printf (F "%s %.3f ", $nameA[$m], $confPA[$m]);
+	    }
+	    print F "\t";
+	    @members = split(/ /, $paralogsB[$i]);
+	    for $m (@members){
+		$m =~ s/://g;
+		printf (F "%s %.3f ", $nameB[$m], $confPB[$m]);
+	    }
+	    print F "\n";
+	}  
+	close F;
+	print "Table output saved to $filename\n";
+    }
+    if ($mysql_table){
+	$filename2 = "sqltable." . $ARGV[0] . "-" . $ARGV[1];
+	open F2, ">$filename2" or die;
+	for $i(1..$o){
+	    @membersA = split(/ /, $paralogsA[$i]);
+	    for $m (@membersA){
+		# $m =~ s/://g;
+		if ($use_bootstrap && $bsA[$m]) {
+		    printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m], 100*$bsA[$m]);
+		} else {
+		    printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m]);
+		}
+	    }     
+	    @membersB = split(/ /, $paralogsB[$i]);
+	    for $m (@membersB){
+		# $m =~ s/://g;
+		if ($use_bootstrap && $bsB[$m]) {
+		    printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m], 100*$bsB[$m]);
+		}else {
+		    printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m]);
+		}
+	    }
+	}
+	close F2;
+	print "mysql output saved to $filename2\n";
+    }
+    if ($show_times){
+      ($user_time,,,) = times;
+      printf ("Finding bootstrap values and printing took %.2f seconds\n", ($user_time - $prev_time));
+      printf ("The overall execution time: %.2f seconds\n", $user_time);
+    }
+    if ($run_blast) { 
+      unlink "formatdb.log";
+      unlink "$fasta_seq_fileA.phr", "$fasta_seq_fileA.pin", "$fasta_seq_fileA.psq";
+      unlink "$fasta_seq_fileB.phr", "$fasta_seq_fileB.pin", "$fasta_seq_fileB.psq" if (@ARGV >= 2);
+      unlink "$fasta_seq_fileC.phr", "$fasta_seq_fileC.pin", "$fasta_seq_fileC.psq" if ($use_outgroup);   
+    }
+  }
+
+##############################################################
+# Functions:
+##############################################################
+sub clean_up { # Sort members within cluster and clusters by size
+############################################################################################### Modification by Isabella 3
+
+    # Sort on index arrays with perl's built in sort instead of using bubble sort.
+
+    $var = shift;
+    $totalA = $totalB = 0;
+    # First pass: count members within each cluster
+    foreach $i (1..$o) {
+	@membersA = split(/ /, $paralogsA[$i]);      
+	$clusnA[$i] = @membersA; # Number of members in this cluster
+	$totalA += $clusnA[$i];
+	$paralogsA[$i] = join(' ',@membersA);
+	
+	@membersB = split(/ /, $paralogsB[$i]);      
+	$clusnB[$i] = @membersB; # Number of members in this cluster
+	$totalB += $clusnB[$i];
+	$paralogsB[$i] = join(' ',@membersB);
+	
+	$clusn[$i] =  $clusnB[$i] + $clusnA[$i]; # Number of members in given group
+    }
+
+    # Create an array used to store the position each element shall have in the final array
+    # The elements are initialized with the position numbers
+    my @position_index_array = (1..$o);
+
+    # Sort the position list according to cluster size
+    my @cluster_sorted_position_list = sort { $clusn[$b] <=> $clusn[$a]} @position_index_array;
+
+    # Create new arrays for the sorted information
+    my @new_paralogsA;
+    my @new_paralogsB;
+    my @new_is_paralogA;
+    my @new_is_paralogB;
+    my @new_clusn;
+    my @new_ortoS;
+    my @new_ortoA;
+    my @new_ortoB;
+
+
+   # Add the information to the new arrays in the orer specifeid by the index array
+   for (my $index_in_list = 0; $index_in_list < scalar @cluster_sorted_position_list; $index_in_list++) {
+	
+	my $old_index = $cluster_sorted_position_list[$index_in_list];
+	
+	if (!$clusn[$old_index]) {
+		$o = (scalar @new_ortoS) - 1;
+		last;
+	}
+
+	$new_paralogsA[$index_in_list + 1] = $paralogsA[$old_index];
+        $new_paralogsB[$index_in_list + 1] = $paralogsB[$old_index];
+    	$new_is_paralogA[$index_in_list + 1] = $is_paralogA[$old_index];
+    	$new_is_paralogB[$index_in_list + 1] = $is_paralogB[$old_index];
+   	$new_clusn[$index_in_list + 1] = $clusn[$old_index];
+	$new_ortoA[$index_in_list + 1] = $ortoA[$old_index];
+	$new_ortoB[$index_in_list + 1] = $ortoB[$old_index];
+	$new_ortoS[$index_in_list + 1] = $ortoS[$old_index];
+   }
+
+    @paralogsA = @new_paralogsA;
+    @paralogsB = @new_paralogsB;
+    @is_paralogA = @new_is_paralogA;
+    @is_paralogB = @new_is_paralogB;
+    @clusn = @new_clusn;
+    @ortoS = @new_ortoS;
+    @ortoA = @new_ortoA;
+    @ortoB = @new_ortoB;
+
+    # Create an array used to store the position each element shall have in the final array
+    # The elements are initialized with the position numbers
+    @position_index_array = (1..$o);
+
+    # Sort the position list according to score
+    @score_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @position_index_array;
+
+    # Create new arrays for the sorted information
+    my @new_paralogsA2 = ();
+    my @new_paralogsB2 = ();
+    my @new_is_paralogA2 = ();
+    my @new_is_paralogB2 = ();
+    my @new_clusn2 = ();
+    my @new_ortoS2 = ();
+    my @new_ortoA2 = ();
+    my @new_ortoB2 = ();
+
+   # Add the information to the new arrays in the orer specifeid by the index array
+   for (my $index_in_list = 0; $index_in_list < scalar @score_sorted_position_list; $index_in_list++) {
+	
+	my $old_index = $score_sorted_position_list[$index_in_list];
+	$new_paralogsA2[$index_in_list + 1] = $paralogsA[$old_index];
+        $new_paralogsB2[$index_in_list + 1] = $paralogsB[$old_index];
+    	$new_is_paralogA2[$index_in_list + 1] = $is_paralogA[$old_index];
+    	$new_is_paralogB2[$index_in_list + 1] = $is_paralogB[$old_index];
+   	$new_clusn2[$index_in_list + 1] = $clusn[$old_index];
+	$new_ortoA2[$index_in_list + 1] = $ortoA[$old_index];
+	$new_ortoB2[$index_in_list + 1] = $ortoB[$old_index];
+	$new_ortoS2[$index_in_list + 1] = $ortoS[$old_index];
+   }
+
+    @paralogsA = @new_paralogsA2;
+    @paralogsB = @new_paralogsB2;
+    @is_paralogA = @new_is_paralogA2;
+    @is_paralogB = @new_is_paralogB2;
+    @clusn = @new_clusn2;
+    @ortoS = @new_ortoS2;
+    @ortoA = @new_ortoA2;
+    @ortoB = @new_ortoB2;
+
+#################################################################################### End modification by Isabella 3
+
+}
+sub bootstrap{
+    my $species = shift;
+    my $seq_id1 = shift; # Query ID from $species
+    my $seq_id2 = shift; # Best hit ID from other species
+    my $seq_id3 = shift; # Second best hit
+    # Retrieve sequence 1 from $species and sequence 2 from opposite species
+    my $significance = 0.0;
+    
+    if ($species eq $fasta_seq_fileA){
+	$file1 = $fasta_seq_fileA;
+	$file2 = $fasta_seq_fileB;
+    }
+    elsif ($species eq $fasta_seq_fileB){
+	$file1 = $fasta_seq_fileB;
+	$file2 = $fasta_seq_fileA;
+    }
+    else {
+	print "Bootstrap values for ortholog groups are not calculated\n";
+	return 0;
+    }
+
+    open A, $file1 or die;
+    $id = 0;
+    $print_this_seq = 0;
+    $seq1 = "";
+    $seq2 = "";
+
+    $query_file = $seq_id1 . ".faq";                                                                              
+    open Q, ">$query_file" or die; 
+
+    while (<A>){
+	if(/^\>/){
+	    ++$id;
+	    $print_this_seq = ($id == $seq_id1)?1:0;
+	}
+	print Q if ($print_this_seq);
+    }
+    close A;
+    close Q;
+    ###
+    open B, $file2 or die;
+    $db_file = $seq_id2 . ".fas";
+    open DB, ">$db_file" or die;
+    $id = 0;
+    $print_this_seq = 0;
+
+    while (<B>){
+	if(/^\>/){
+	    ++$id;
+	    $print_this_seq = (($id == $seq_id2) or ($id == $seq_id3))?1:0;
+	}
+	print DB if ($print_this_seq);
+    }
+    close B;
+    close DB;
+    
+    system "$formatdb -i $db_file";
+    # Use soft masking in 1-pass mode for simplicity.
+    system "$blastall -F\"m S\" -i $query_file -z 5000000 -d $db_file -p blastp -M $matrix -m7 | ./$blastParser 0 -a > $seq_id2.faa";
+
+    # Note: Changed score cutoff 50 to 0 for blast2faa.pl (060402).
+    # Reason: after a cluster merger a score can be less than the cutoff (50) 
+    # which will remove the sequence in blast2faa.pl.  The bootstrapping will 
+    # then fail.
+    # AGAIN, updaye
+    
+    if (-s("$seq_id2.faa")){
+
+	system("java -jar $seqstat -m $matrix -n 1000 -i $seq_id2.faa > $seq_id2.bs"); # Can handle U, u
+
+	if (-s("$seq_id2.bs")){
+	    open BS, "$seq_id2.bs" or die "pac failed\n";
+	    $_ = <BS>;
+	    ($dummy1,$dummy2,$dummy3,$dummy4,$significance) = split(/\s+/);
+	    close BS;	
+	}
+	else{
+	    print STDERR "pac failed\n"; # if ($debug);
+	    $significance = -0.01;
+	}	
+    }
+    else{
+	print STDERR "blast2faa for $query_file / $db_file failed\n"; # if ($debug);                                            
+	$significance = 0.0; 
+    }
+    
+    unlink "$seq_id2.fas", "$seq_id2.faa", "$seq_id2.bs", "$seq_id1.faq"; 
+    unlink "formatdb.log", "$seq_id2.fas.psq", "$seq_id2.fas.pin", "$seq_id2.fas.phr";
+    
+    return $significance;
+}
+
+sub overlap_test{
+        my @Fld = @_;
+
+	# Filter out fragmentary hits by:
+	# Ignore hit if aggregate matching area covers less than $seq_overlap_cutoff of sequence.
+	# Ignore hit if local matching segments cover less than $segment_coverage_cutoff of sequence.
+	#
+	# $Fld[3] and $Fld[4] are query and subject lengths.
+	# $Fld[5] and $Fld[6] are lengths of the aggregate matching region on query and subject. (From start of first matching segment to end of last matching segment).
+	# $Fld[7] and $Fld[8] are local matching length on query and subject (Sum of all segments length's on query).
+
+	$retval = 1;
+#	if ($Fld[3] >= $Fld[4]) {
+		if ($Fld[5] < ($seq_overlap_cutoff * $Fld[3])) {$retval = 0};
+		if ($Fld[7] < ($segment_coverage_cutoff * $Fld[3])) {$retval = 0};
+#	} 
+	
+#	if ($Fld[4] >= $Fld[3]) { 
+		if ($Fld[6] < ($seq_overlap_cutoff * $Fld[4])) {$retval = 0};
+		if ($Fld[8] < ($segment_coverage_cutoff * $Fld[4])) {$retval = 0};
+#	}
+	
+	# print "$Fld[3] $Fld[5] $Fld[7]; $Fld[4] $Fld[6] $Fld[8]; retval=$retval\n";
+
+	return $retval;
+}
+
+sub do_blast 
+	{
+		my @parameter=@_;
+		my $resultfile=@parameter[@parameter-1];
+		my $go_to_blast=1;
+		my $resultfilesize;
+		if (-e $resultfile) 
+			{
+				$resultfilesize= -s "$resultfile";
+				if ($resultfilesize >10240) 
+					{
+						$go_to_blast=0;
+					}
+			}
+		if ($go_to_blast) 
+			{
+				if ($blast_two_passes) 
+					{
+						do_blast_2pass(@parameter);
+					} else 
+						{
+							do_blast_1pass(@parameter);
+						}
+			}
+	}
+
+sub do_blast_1pass {
+  my @Fld = @_;
+  
+  # $Fld [0] is query
+  # $Fld [1] is database
+  # $Fld [2] is query size
+  # $Fld [3] is database size
+  # $Fld [4] is output name
+  
+  # Use soft masking (low complexity masking by SEG in search phase, not in alignment phase). 
+  system ("$blastall -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff > $Fld[4]");
+}
+
+sub do_blast_2pass {
+
+	my @Fld = @_;
+
+	# $Fld [0] is query
+	# $Fld [1] is database
+	# $Fld [2] is query size
+	# $Fld [3] is database size
+	# $Fld [4] is output name
+
+	# assume the script has already formatted the database
+	# we will now do 2-pass approach
+
+	# load sequences
+
+	%sequencesA = ();
+	%sequencesB = ();
+
+	open (FHA, $Fld [0]);
+	while (<FHA>) {
+
+		$aLine = $_;
+		chomp ($aLine);
+
+		$seq = "";
+
+		if ($aLine =~ />/) {
+			@words = split (/\s/, $aLine);
+			$seqID = $words[0];
+			$sequencesA {$seqID} = "";
+		}
+		else {
+			$sequencesA {$seqID} = $sequencesA {$seqID}.$aLine;
+		}		
+	}
+	close (FHA);
+
+	open (FHB, $Fld [1]);
+	while (<FHB>) {
+		$aLine = $_;
+		chomp ($aLine);
+
+		$seq = "";
+
+		if ($aLine =~ />/) {
+			@words = split (/\s/, $aLine);
+			$seqID = $words[0];
+			$sequencesB {$seqID} = "";
+		}
+		else {
+			$sequencesB {$seqID} = $sequencesB {$seqID}.$aLine;
+		}		
+	}
+	close (FHB);
+
+	# Do first pass with compositional adjustment on and soft masking.  
+	# This efficiently removes low complexity matches but truncates alignments,
+	# making a second pass necessary.
+	print STDERR "\nStarting first BLAST pass for $Fld[0] - $Fld[1] on ";
+	system("date");
+	open FHR, "$blastall -C3 -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff|";
+
+	%theHits = ();
+	while (<FHR>) {
+		$aLine = $_;
+		chomp ($aLine);
+		@words = split (/\s+/, $aLine);
+
+		if (exists ($theHits {$words [0]})) {
+			$theHits {$words [0]} = $theHits {$words [0]}." ".$words [1];
+		}
+		else {
+			$theHits {$words [0]} = $words [1];
+		}
+
+	}
+	close (FHR);
+
+	$tmpdir = ".";   # May be slightly (5%) faster using the RAM disk "/dev/shm".
+	$tmpi = "$tmpdir/tmpi";
+	$tmpd = "$tmpdir/tmpd";
+
+	# Do second pass with compositional adjustment off to get full-length alignments.
+	print STDERR "\nStarting second BLAST pass for $Fld[0] - $Fld[1] on ";
+	system("date");
+	unlink "$Fld[4]";
+	foreach $aQuery (keys % theHits) {
+
+		# Create single-query file
+		open (FHT, ">$tmpi");
+		print FHT ">$aQuery\n".$sequencesA {">$aQuery"}."\n";
+		close (FHT);
+
+	        # Create mini-database of hit sequences
+		open (FHT, ">$tmpd");
+		foreach $aHit (split (/\s/, $theHits {$aQuery})) {
+			print FHT ">$aHit\n".$sequencesB {">$aHit"}."\n";
+		}
+		close (FHT);
+
+		# Run Blast and add to output
+		system ("$formatdb -i $tmpd");
+		system ("$blastall -C0 -FF -i $tmpi -d $tmpd -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff >> $Fld[4]");
+	}
+
+	unlink "$tmpi", "$tmpd", "formatdb.log", "$tmpd.phr", "$tmpd.pin", "$tmpd.psq";
+}
+
+				 
+#   Date                                 Modification
+# --------          ---------------------------------------------------
+#
+# 2006-04-02 [1.36] - Changed score cutoff 50 to 0 for blast2faa.pl.
+#                   Reason: after a cluster merger a score can be less than the cutoff (50)
+#                   which will remove the sequence in blast2faa.pl.  The bootstrapping will
+#                   then fail.
+#                   - Fixed bug with index variable in bootstrap routine.
+#
+# 2006-06-01 [2.0]  - Fixed bug in blast_parser.pl: fields 7 and 8 were swapped,
+#                   it was supposed to print match_area before HSP_length.
+#                   - Fixed bug in blastall call: -v param was wrong for the A-B
+#                   and B-A comparisons.
+#                   - 
+#                   - Changed "cluster" to "group" consistently in output.
+#                   - Changed "main ortholog" to "seed ortholog" in output.
+#                   - Replace U -> X before running seqstat.jar, otherwise it crashes.
+# 2006-08-04 [2.0]  - In bootstrap subroutine, replace U with X, otherwise seqstat
+#                       will crash as this is not in the matrix (should fix this in seqstat)
+# 2006-08-04 [2.1]  - Changed to writing default output to file.
+#                   - Added options to run blast only.
+#                   - Fixed some file closing bugs.
+# 2007-12-14 [3.0]  - Sped up sorting routines (by Isabella).
+#                   - New XML-based blast_parser.
+#                   - New seqstat.jar to handle u and U.
+#                   - Modified overlap criterion for rejecting matches.  Now it agrees with the paper.
+# 2009-04-01 [4.0]  - Further modification of overlap criteria (require that they are met for both query and subject).
+#		    - Changed bit score cutoff to 40, which is suitable for compositionally adjusted BLAST.
+#		    - Added in 2-pass algorithm.
+# 2009-06-11 [4.0]  - Moved blasting out to subroutine.
+#		    - Changed blasting in bootstrap subroutine to use unconditional score matrix adjustment and SEG hard masking,
+#		      to be the same as first step of 2-pass blast.
+# 2009-06-17 [4.0]  - Compensated a Blast "bug" that sometimes gives a self-match lower score than a non-identical match.
+#                      This can happen with score matrix adjustment and can lead to missed orthologs.
+# 2009-08-18 [4.0]  - Consolidated Blast filtering parameters for 2-pass (-C3 -F\"m S\"; -C0 -FF)
+# 2009-10-09 [4.1]  - Fixed bug that caused failure if Fasta header lines had more than one word.
author	dereeper
date	Thu, 24 Jun 2021 13:51:52 +0000
parents
children