Mercurial > repos > dereeper > pgap
comparison PGAP-1.2.1/inparanoid.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83e62a1aeeeb |
---|---|
1 #! /usr/bin/perl | |
2 ############################################################################### | |
3 # InParanoid version 4.1 | |
4 # Copyright (C) Erik Sonnhammer, Kristoffer Forslund, Isabella Pekkari, | |
5 # Ann-Charlotte Berglund, Maido Remm, 2007 | |
6 # | |
7 # This program is provided under the terms of a personal license to the recipient and may only | |
8 # be used for the recipient's own research at an academic insititution. | |
9 # | |
10 # Distribution of the results of this program must be discussed with the authors. | |
11 # For using this program in a company or for commercial purposes, a commercial license is required. | |
12 # Contact Erik.Sonnhammer@sbc.su.se in both cases | |
13 # | |
14 # Make sure that Perl XML libraries are installed! | |
15 # | |
16 # NOTE: This script requires blastall (NCBI BLAST) version 2.2.16 or higher, that supports | |
17 # compositional score matrix adjustment (-C2 flag). | |
18 | |
19 my $usage =" Usage: inparanoid.pl <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> [FASTAFILE with sequences of species C] | |
20 | |
21 "; | |
22 | |
23 ############################################################################### | |
24 # The program calculates orthologs between 2 datasets of proteins | |
25 # called A and B. Both datasets should be in multi-fasta file | |
26 # - Additionally, it tries to assign paralogous sequences (in-paralogs) to each | |
27 # thus forming paralogous clusters. | |
28 # - Confidence of in-paralogs is calculated in relative scale between 0-100%. | |
29 # This confidence value is dependent on how far is given sequence from the | |
30 # seed ortholog of given group | |
31 # - Confidence of groups can be calculated with bootstrapping. This is related | |
32 # to score difference between best hit and second best hit. | |
33 # - Optionally it can use a species C as outgroup. | |
34 | |
35 ############################################################################### | |
36 # You may need to run the following command manually to increase your | |
37 # default datasize limit: 'limit datasize 500000 kb' | |
38 | |
39 ############################################################################### | |
40 # Set following variables: # | |
41 ############################################################################### | |
42 | |
43 # What do you want the program to do? # | |
44 $run_blast = 1; # Set to 1 if you don't have the 4 BLAST output files # | |
45 # Requires 'blastall', 'formatdb' (NCBI BLAST2) # | |
46 # and parser 'blast_parser.pl' # | |
47 $blast_two_passes = 1; # Set to 1 to run 2-pass strategy # | |
48 # (strongly recommended, but slower) # | |
49 $run_inparanoid = 1; | |
50 $use_bootstrap = 1;# Use bootstrapping to estimate the confidence of orthologs# | |
51 # Needs additional programs 'seqstat.jar' and 'blast2faa.pl' | |
52 $use_outgroup = 0; # Use proteins from the third genome as an outgroup # | |
53 # Reject best-best hit if outgroup sequence is MORE # | |
54 # similar to one of the sequences # | |
55 # (by more than $outgroup_cutoff bits) # | |
56 | |
57 # Define location of files and programs: | |
58 #$blastall = "blastall -VT"; #Remove -VT for blast version 2.2.12 or earlier | |
59 $blastall = "$ARGV[0] -a $ARGV[1]"; #Add -aN to use N processors | |
60 $formatdb = "$ARGV[2]"; | |
61 $seqstat = "seqstat.jar"; | |
62 $blastParser = "blast_parser.pl"; | |
63 | |
64 #$matrix = "BLOSUM62"; # Reasonable default for comparison of eukaryotes. | |
65 $matrix = "BLOSUM45"; #(for prokaryotes), | |
66 #$matrix = "BLOSUM80"; #(orthologs within metazoa), | |
67 #$matrix = "PAM70"; | |
68 #$matrix = "PAM30"; | |
69 | |
70 # Output options: # | |
71 $output = 0; # table_stats-format output # | |
72 $table = 0; # Print tab-delimited table of orthologs to file "table.txt" # | |
73 # Each orthologous group with all inparalogs is on one line # | |
74 $mysql_table = 1; # Print out sql tables for the web server # | |
75 # Each inparalog is on separate line # | |
76 $html = 0; # HTML-format output # | |
77 | |
78 # Algorithm parameters: | |
79 # Default values should work without problems. | |
80 # MAKE SURE, however, that the score cutoff here matches what you used for BLAST! | |
81 $score_cutoff = $ARGV[3]; # In bits. Any match below this is ignored # | |
82 $outgroup_cutoff = 50; # In bits. Outgroup sequence hit must be this many bits# | |
83 # stronger to reject best-best hit between A and B # | |
84 $conf_cutoff = 0.05; # Include in-paralogs with this confidence or better # | |
85 $group_overlap_cutoff = 0.5; # Merge groups if ortholog in one group has more # | |
86 # than this confidence in other group # | |
87 $grey_zone = 0; # This many bits signifies the difference between 2 scores # | |
88 $show_times = 0; # Show times spent for execution of each part of the program # | |
89 # (This does not work properly) # | |
90 $debug = 0; # Print debugging messages or not. Levels 0,1,2 and 4 exist # | |
91 | |
92 my $seq_overlap_cutoff = $ARGV[4]; # Match area should cover at least this much of longer sequence. Match area is defined as area from start of | |
93 # first segment to end of last segment, i.e segments 1-10 and 90-100 gives a match length of 100. | |
94 my $segment_coverage_cutoff = $ARGV[5]; # Actually matching segments must cover this much of longer sequence. | |
95 # For example, segments 1-10 and 90-100 gives a total length of 20. | |
96 | |
97 splice(@ARGV,0,6); | |
98 | |
99 ############################################################################### | |
100 # No changes should be required below this line # | |
101 ############################################################################### | |
102 $ENV{CLASSPATH} = "./$seqstat" if ($use_bootstrap); | |
103 | |
104 if (!@ARGV){ | |
105 print STDERR $usage; | |
106 exit 1; | |
107 } | |
108 | |
109 if ((@ARGV < 2) and ($run_inparanoid)){ | |
110 print STDERR "\n When \$run_inparanoid=1, at least two distinct FASTA files have to be specified.\n"; | |
111 | |
112 print STDERR $usage; | |
113 exit 1; | |
114 } | |
115 | |
116 if ((!$run_blast) and (!$run_inparanoid)){ | |
117 print STDERR "run_blast or run_inparanoid has to be set!\n"; | |
118 exit 1; | |
119 } | |
120 | |
121 # Input files: | |
122 $fasta_seq_fileA = "$ARGV[0]"; | |
123 $fasta_seq_fileB = "$ARGV[1]"; | |
124 $fasta_seq_fileC = "$ARGV[2]" if ($use_outgroup); # This is outgroup file | |
125 | |
126 my $blast_outputAB = $fasta_seq_fileA . "-" . $fasta_seq_fileB; | |
127 my $blast_outputBA = $fasta_seq_fileB . "-" . $fasta_seq_fileA; | |
128 my $blast_outputAA = $fasta_seq_fileA . "-" . $fasta_seq_fileA; | |
129 my $blast_outputBB = $fasta_seq_fileB . "-" . $fasta_seq_fileB; | |
130 | |
131 if ($use_outgroup){ | |
132 $blast_outputAC = $fasta_seq_fileA . "-" . $fasta_seq_fileC; | |
133 $blast_outputBC = $fasta_seq_fileB . "-" . $fasta_seq_fileC; | |
134 } | |
135 my %idA; # Name -> ID combinations for species 1 | |
136 my %idB; # Name -> ID combinations for species 2 | |
137 my @nameA; # ID -> Name combinations for species 1 | |
138 my @nameB; # ID -> Name combinations for species 2 | |
139 my @nameC; | |
140 my %scoreAB; # Hashes with pairwise BLAST scores (in bits) | |
141 my %scoreBA; | |
142 my %scoreAA; | |
143 my %scoreBB; | |
144 my @hitnAB; # 1-D arrays that keep the number of pairwise hits | |
145 my @hitnBA; | |
146 my @hitnAA; | |
147 my @hitnBB; | |
148 my @hitAB; # 2-D arrays that keep the actual matching IDs | |
149 my @hitBA; | |
150 my @hitAA; | |
151 my @hitBB; | |
152 my @besthitAB; # IDs of best hits in other species (may contain more than one ID) | |
153 my @besthitBA; # IDs of best hits in other species (may contain more than one ID) | |
154 my @bestscoreAB; # best match A -> B | |
155 my @bestscoreBA; # best match B -> A | |
156 my @ortoA; # IDs of ortholog candidates from species A | |
157 my @ortoB; # IDs of ortholog candidates from species B | |
158 my @ortoS; # Scores between ortoA and ortoB pairs | |
159 my @paralogsA; # List of paralog IDs in given cluster | |
160 my @paralogsB; # List of paralog IDs in given cluster | |
161 my @confPA; # Confidence values for A paralogs | |
162 my @confPB; # Confidence values for B paralogs | |
163 my @confA; # Confidence values for orthologous groups | |
164 my @confB; # Confidence values for orthologous groups | |
165 my $prev_time = 0; | |
166 | |
167 $outputfile = "Output." . $ARGV[0] . "-" . $ARGV[1]; | |
168 if ($output){ | |
169 open OUTPUT, ">$outputfile" or warn "Could not write to OUTPUT file $filename\n"; | |
170 } | |
171 | |
172 ################################################# | |
173 # Assign ID numbers for species A | |
174 ################################################# | |
175 open A, "$fasta_seq_fileA" or die "File A with sequences in FASTA format is missing | |
176 Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n"; | |
177 $id = 0; | |
178 while (<A>){ | |
179 if(/^\>/){ | |
180 ++$id; | |
181 chomp; | |
182 s/\>//; | |
183 @tmp = split(/\s+/); | |
184 #$name = substr($tmp[0],0,25); | |
185 $name = $tmp[0]; | |
186 $idA{$name} = int($id); | |
187 $nameA[$id] = $name; | |
188 } | |
189 } | |
190 close A; | |
191 $A = $id; | |
192 print "$A sequences in file $fasta_seq_fileA\n"; | |
193 | |
194 if ($output){ | |
195 print OUTPUT "$A sequences in file $fasta_seq_fileA\n"; | |
196 } | |
197 | |
198 if (@ARGV >= 2) { | |
199 ################################################# | |
200 # Assign ID numbers for species B | |
201 ################################################# | |
202 open B, "$fasta_seq_fileB" or die "File B with sequences in FASTA format is missing | |
203 Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n"; | |
204 $id = 0; | |
205 while (<B>){ | |
206 if(/^\>/){ | |
207 ++$id; | |
208 chomp; | |
209 s/\>//; | |
210 @tmp = split(/\s+/); | |
211 #$name = substr($tmp[0],0,25); | |
212 $name = $tmp[0]; | |
213 $idB{$name} = int($id); | |
214 $nameB[$id] = $name; | |
215 } | |
216 } | |
217 $B = $id; | |
218 print "$B sequences in file $fasta_seq_fileB\n"; | |
219 close B; | |
220 | |
221 if ($output){ | |
222 print OUTPUT "$B sequences in file $fasta_seq_fileB\n"; | |
223 } | |
224 } | |
225 ################################################# | |
226 # Assign ID numbers for species C (outgroup) | |
227 ################################################# | |
228 if ($use_outgroup){ | |
229 open C, "$fasta_seq_fileC" or die "File C with sequences in FASTA format is missing | |
230 Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n"; | |
231 $id = 0; | |
232 while (<C>){ | |
233 if(/^\>/){ | |
234 ++$id; | |
235 chomp; | |
236 s/\>//; | |
237 @tmp = split(/\s+/); | |
238 #$name = substr($tmp[0],0,25); | |
239 $name = $tmp[0]; | |
240 $idC{$name} = int($id); | |
241 $nameC[$id] = $name; | |
242 } | |
243 } | |
244 $C = $id; | |
245 print "$C sequences in file $fasta_seq_fileC\n"; | |
246 close C; | |
247 if ($output){ | |
248 print OUTPUT "$C sequences in file $fasta_seq_fileC\n"; | |
249 } | |
250 } | |
251 if ($show_times){ | |
252 ($user_time,,,) = times; | |
253 printf ("Indexing sequences took %.2f seconds\n", ($user_time - $prev_time)); | |
254 $prev_time = $user_time; | |
255 } | |
256 | |
257 ################################################# | |
258 # Run BLAST if not done already | |
259 ################################################# | |
260 if ($run_blast){ | |
261 print "Trying to run BLAST now - this may take several hours ... or days in worst case!\n"; | |
262 print STDERR "Formatting BLAST databases\n"; | |
263 system ("$formatdb -i $fasta_seq_fileA"); | |
264 system ("$formatdb -i $fasta_seq_fileB") if (@ARGV >= 2); | |
265 system ("$formatdb -i $fasta_seq_fileC") if ($use_outgroup); | |
266 print STDERR "Done formatting\nStarting BLAST searches...\n"; | |
267 | |
268 # Run blast only if the files do not already exist is not default. | |
269 # NOTE: you should have done this beforehand, because you probably | |
270 # want two-pass blasting anyway which is not implemented here | |
271 # this is also not adapted to use specific compositional adjustment settings | |
272 # and might not use the proper blast parser... | |
273 | |
274 do_blast ($fasta_seq_fileA, $fasta_seq_fileA, $A, $A, $blast_outputAA); | |
275 | |
276 if (@ARGV >= 2) { | |
277 do_blast ($fasta_seq_fileA, $fasta_seq_fileB, $B, $B, $blast_outputAB); | |
278 do_blast ($fasta_seq_fileB, $fasta_seq_fileA, $A, $A, $blast_outputBA); | |
279 do_blast ($fasta_seq_fileB, $fasta_seq_fileB, $B, $B, $blast_outputBB); | |
280 } | |
281 | |
282 if ($use_outgroup){ | |
283 | |
284 do_blast ($fasta_seq_fileA, $fasta_seq_fileC, $A, $C, $blast_outputAC); | |
285 do_blast ($fasta_seq_fileB, $fasta_seq_fileC, $B, $C, $blast_outputBC); | |
286 } | |
287 | |
288 if ($show_times){ | |
289 ($user_time,,,) = times; | |
290 printf ("BLAST searches took %.2f seconds\n", ($user_time - $prev_time)); | |
291 $prev_time = $user_time; | |
292 } | |
293 print STDERR "Done BLAST searches. "; | |
294 | |
295 } else { | |
296 print STDERR "Skipping blast! \n"; | |
297 } | |
298 | |
299 if ($run_inparanoid){ | |
300 print STDERR "Starting ortholog detection...\n"; | |
301 ################################################# | |
302 # Read in best hits from blast output file AB | |
303 ################################################# | |
304 $count = 0; | |
305 open AB, "$blast_outputAB" or die "Blast output file A->B is missing\n"; | |
306 $old_idQ = 0; | |
307 while (<AB>){ | |
308 chomp; | |
309 @Fld = split(/\s+/); # Get query, match and score | |
310 | |
311 if( scalar @Fld < 9 ){ | |
312 if($Fld[0]=~/done/){ | |
313 print STDERR "AB ok\n"; | |
314 } | |
315 next; | |
316 } | |
317 | |
318 $q = $Fld[0]; | |
319 $m = $Fld[1]; | |
320 $idQ = $idA{$q}; # ID of query sequence | |
321 $idM = $idB{$m}; # ID of match sequence | |
322 $score = $Fld[2]; | |
323 | |
324 next if (!overlap_test(@Fld)); | |
325 | |
326 # Score must be equal to or above cut-off | |
327 next if ($score < $score_cutoff); | |
328 | |
329 if(!$count || $q ne $oldq){ | |
330 print "Match $m, score $score, ID for $q is missing\n" if ($debug == 2 and !(exists($idA{$q}))); | |
331 $hitnAB[$idA{$oldq}] = $hit if($count); # Record number of hits for previous query | |
332 $hit = 0; | |
333 ++$count; | |
334 $oldq = $q; | |
335 } | |
336 ++$hit; | |
337 $hitAB[$idQ][$hit] = int($idM); | |
338 # printf ("hitAB[%d][%d] = %d\n",$idQ,$hit,$idM); | |
339 $scoreAB{"$idQ:$idM"} = $score; | |
340 $scoreBA{"$idM:$idQ"} = $score_cutoff; # Initialize mutual hit score - sometimes this is below score_cutoff | |
341 $old_idQ = $idQ; | |
342 # } | |
343 } | |
344 $hitnAB[$idQ] = $hit; # For the last query | |
345 #printf ("hitnAB[1] = %d\n",$hitnAB[1]); | |
346 #printf ("hitnAB[%d] = %d\n",$idQ,$hit); | |
347 close AB; | |
348 if ($output){ | |
349 print OUTPUT "$count sequences $fasta_seq_fileA have homologs in dataset $fasta_seq_fileB\n"; | |
350 } | |
351 ################################################# | |
352 # Read in best hits from blast output file BA | |
353 ################################################# | |
354 $count = 0; | |
355 open BA, "$blast_outputBA" or die "Blast output file B->A is missing\n"; | |
356 $old_idQ = 0; | |
357 while (<BA>){ | |
358 chomp; | |
359 @Fld = split(/\s+/); # Get query, match and score | |
360 | |
361 if( scalar @Fld < 9 ){ | |
362 if($Fld[0]=~/done/){ | |
363 print STDERR "BA ok\n"; | |
364 } | |
365 next; | |
366 } | |
367 | |
368 $q = $Fld[0]; | |
369 $m = $Fld[1]; | |
370 $idQ = $idB{$q}; | |
371 $idM = $idA{$m}; | |
372 $score = $Fld[2]; | |
373 | |
374 next if (!overlap_test(@Fld)); | |
375 | |
376 next if ($score < $score_cutoff); | |
377 | |
378 if(!$count || $q ne $oldq){ | |
379 print "ID for $q is missing\n" if ($debug == 2 and (!exists($idB{$q}))); | |
380 $hitnBA[$idB{$oldq}] = $hit if($count); # Record number of hits for previous query | |
381 $hit = 0; | |
382 ++$count; | |
383 $oldq = $q; | |
384 } | |
385 ++$hit; | |
386 $hitBA[$idQ][$hit] = int($idM); | |
387 # printf ("hitBA[%d][%d] = %d\n",$idQ,$hit,$idM); | |
388 $scoreBA{"$idQ:$idM"} = $score; | |
389 $scoreAB{"$idM:$idQ"} = $score_cutoff if ($scoreAB{"$idM:$idQ"} < $score_cutoff); # Initialize missing scores | |
390 $old_idQ = $idQ; | |
391 # } | |
392 } | |
393 $hitnBA[$idQ] = $hit; # For the last query | |
394 #printf ("hitnBA[%d] = %d\n",$idQ,$hit); | |
395 close BA; | |
396 if ($output){ | |
397 print OUTPUT "$count sequences $fasta_seq_fileB have homologs in dataset $fasta_seq_fileA\n"; | |
398 } | |
399 ##################### Equalize AB scores and BA scores ########################## | |
400 | |
401 ###################################################################################################################################### Modification by Isabella 1 | |
402 | |
403 # I removed the time consuming all vs all search and equalize scores for all pairs where there was a hit | |
404 | |
405 foreach my $key (keys %scoreAB) { | |
406 | |
407 my ($a, $b) = split(':', $key); | |
408 my $key2 = $b . ':' . $a; | |
409 | |
410 # If debugg mod is 5 and the scores A-B and B-A are unequal | |
411 # the names of the two sequences and their scores are printed | |
412 if ($scoreAB{$key} != $scoreBA{$key2}){ | |
413 printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{$key}, $scoreBA{$key2}) if ($debug == 5); | |
414 } | |
415 | |
416 # Set score AB and score BA to the mean of scores AB and BA. | |
417 # The final score is saved as an integer so .5 needs to be added to avoid rounding errors | |
418 $scoreAB{$key} = $scoreBA{$key2} = int(($scoreAB{$key} + $scoreBA{$key2})/2.0 +.5); | |
419 } | |
420 | |
421 # For all ids for sequences from organism A | |
422 #for $a(1..$A){ | |
423 #For all ids for sequences from organism B | |
424 #for $b(1..$B){ | |
425 | |
426 # No need to equalize score if there was no match between sequence with id $a from species A | |
427 # and sequence with id $b from species B | |
428 # next if (!$scoreAB{"$a:$b"}); | |
429 | |
430 # If debugg mod is 5 and the scores A-B and B-A are unequal | |
431 # the names of the two sequences and their scores are printed | |
432 # if ($scoreAB{"$a:$b"} != $scoreBA{"$b:$a"}){ | |
433 # printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{"$a:$b"}, $scoreBA{"$b:$a"}) if ($debug == 5); | |
434 # } | |
435 | |
436 # Set score AB and score BA to the mean of scores AB and BA. | |
437 # The final score is saved as an integer so .5 needs to be added to avoid rounding errors | |
438 # $scoreAB{"$a:$b"} = $scoreBA{"$b:$a"} = int(($scoreAB{"$a:$b"} + $scoreBA{"$b:$a"})/2.0 +.5); | |
439 | |
440 # printf ("scoreAB{%d: %d} = %d\n", $a, $b, $scoreAB{"$a:$b"}); | |
441 # printf ("scoreBA{%d: %d} = %d\n", $b, $a, $scoreBA{"$a:$b"}); | |
442 #} | |
443 # } | |
444 | |
445 ####################################################################################################################################### End modification by Isabella 1 | |
446 | |
447 ##################### Re-sort hits, besthits and bestscore ####################### | |
448 for $idA(1..$A){ | |
449 # print "Loop index = $idA\n"; | |
450 # printf ("hitnAB[%d] = %d\n",$idA, $hitnAB[$idA]); | |
451 next if (!($hitnAB[$idA])); | |
452 for $hit (1..($hitnAB[$idA]-1)){ # Sort hits by score | |
453 while($scoreAB{"$idA:$hitAB[$idA][$hit]"} < $scoreAB{"$idA:$hitAB[$idA][$hit+1]"}){ | |
454 $tmp = $hitAB[$idA][$hit]; | |
455 $hitAB[$idA][$hit] = $hitAB[$idA][$hit+1]; | |
456 $hitAB[$idA][$hit+1] = $tmp; | |
457 --$hit if ($hit > 1); | |
458 } | |
459 } | |
460 $bestscore = $bestscoreAB[$idA] = $scoreAB{"$idA:$hitAB[$idA][1]"}; | |
461 $besthitAB[$idA] = $hitAB[$idA][1]; | |
462 for $hit (2..$hitnAB[$idA]){ | |
463 if ($bestscore - $scoreAB{"$idA:$hitAB[$idA][$hit]"} <= $grey_zone){ | |
464 $besthitAB[$idA] .= " $hitAB[$idA][$hit]"; | |
465 } | |
466 else { | |
467 last; | |
468 } | |
469 } | |
470 undef $is_besthitAB[$idA]; # Create index that we can check later | |
471 grep (vec($is_besthitAB[$idA],$_,1) = 1, split(/ /,$besthitAB[$idA])); | |
472 # printf ("besthitAB[%d] = hitAB[%d][%d] = %d\n",$idA,$idA,$hit,$besthitAB[$idA]); | |
473 | |
474 } | |
475 | |
476 for $idB(1..$B){ | |
477 # print "Loop index = $idB\n"; | |
478 next if (!($hitnBA[$idB])); | |
479 for $hit (1..($hitnBA[$idB]-1)){ | |
480 # Sort hits by score | |
481 while($scoreBA{"$idB:$hitBA[$idB][$hit]"} < $scoreBA{"$idB:$hitBA[$idB][$hit+1]"}){ | |
482 $tmp = $hitBA[$idB][$hit]; | |
483 $hitBA[$idB][$hit] = $hitBA[$idB][$hit+1]; | |
484 $hitBA[$idB][$hit+1] = $tmp; | |
485 --$hit if ($hit > 1); | |
486 } | |
487 } | |
488 $bestscore = $bestscoreBA[$idB] = $scoreBA{"$idB:$hitBA[$idB][1]"}; | |
489 $besthitBA[$idB] = $hitBA[$idB][1]; | |
490 for $hit (2..$hitnBA[$idB]){ | |
491 if ($bestscore - $scoreBA{"$idB:$hitBA[$idB][$hit]"} <= $grey_zone){ | |
492 $besthitBA[$idB] .= " $hitBA[$idB][$hit]"; | |
493 } | |
494 else {last;} | |
495 } | |
496 undef $is_besthitBA[$idB]; # Create index that we can check later | |
497 grep (vec($is_besthitBA[$idB],$_,1) = 1, split(/ /,$besthitBA[$idB])); | |
498 # printf ("besthitBA[%d] = %d\n",$idA,$besthitAB[$idA]); | |
499 } | |
500 | |
501 if ($show_times){ | |
502 ($user_time,,,) = times; | |
503 printf ("Reading and sorting homologs took %.2f seconds\n", ($user_time - $prev_time)); | |
504 $prev_time = $user_time; | |
505 } | |
506 | |
507 ###################################################### | |
508 # Now find orthologs: | |
509 ###################################################### | |
510 $o = 0; | |
511 | |
512 for $i(1..$A){ # For each ID in file A | |
513 if (defined $besthitAB[$i]){ | |
514 @besthits = split(/ /,$besthitAB[$i]); | |
515 for $hit (@besthits){ | |
516 if (vec($is_besthitBA[$hit],$i,1)){ | |
517 ++$o; | |
518 $ortoA[$o] = $i; | |
519 $ortoB[$o] = $hit; | |
520 $ortoS[$o] = $scoreAB{"$i:$hit"}; # Should be equal both ways | |
521 # --$o if ($ortoS[$o] == $score_cutoff); # Ignore orthologs that are exactly at score_cutoff | |
522 print "Accept! " if ($debug == 2); | |
523 } | |
524 else {print " " if ($debug == 2);} | |
525 printf ("%-20s\t%d\t%-20s\t", $nameA[$i], $bestscoreAB[$i], $nameB[$hit]) if ($debug == 2); | |
526 print "$bestscoreBA[$hit]\t$besthitBA[$hit]\n" if ($debug == 2); | |
527 } | |
528 } | |
529 } | |
530 print "$o ortholog candidates detected\n" if ($debug); | |
531 ##################################################### | |
532 # Sort orthologs by ID and then by score: | |
533 ##################################################### | |
534 | |
535 ####################################################################################################### Modification by Isabella 2 | |
536 | |
537 # Removed time consuiming bubble sort. Created an index array and sort that according to id and score. | |
538 # The put all clusters on the right place. | |
539 | |
540 # Create an array used to store the position each element shall have in the final array | |
541 # The elements are initialized with the position numbers | |
542 my @position_index_array = (1..$o); | |
543 | |
544 # Sort the position list according to id | |
545 my @id_sorted_position_list = sort { ($ortoA[$a]+$ortoB[$a]) <=> ($ortoA[$b] + $ortoB[$b]) } @position_index_array; | |
546 | |
547 # Sort the list according to score | |
548 my @score_id_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @id_sorted_position_list; | |
549 | |
550 # Create new arrays for the sorted information | |
551 my @new_ortoA; | |
552 my @new_ortoB; | |
553 my @new_orthoS; | |
554 | |
555 # Add the information to the new arrays in the orer specifeid by the index array | |
556 for (my $index_in_list = 0; $index_in_list < scalar @score_id_sorted_position_list; $index_in_list++) { | |
557 | |
558 | |
559 my $old_index = $score_id_sorted_position_list[$index_in_list]; | |
560 $new_ortoA[$index_in_list + 1] = $ortoA[$old_index]; | |
561 $new_ortoB[$index_in_list + 1] = $ortoB[$old_index]; | |
562 $new_ortoS[$index_in_list + 1] = $ortoS[$old_index]; | |
563 } | |
564 | |
565 @ortoA = @new_ortoA; | |
566 @ortoB = @new_ortoB; | |
567 @ortoS = @new_ortoS; | |
568 | |
569 # Use bubblesort to sort ortholog pairs by id | |
570 # for $i(1..($o-1)){ | |
571 # while(($ortoA[$i]+$ortoB[$i]) > ($ortoA[$i+1] + $ortoB[$i+1])){ | |
572 # $tempA = $ortoA[$i]; | |
573 # $tempB = $ortoB[$i]; | |
574 # $tempS = $ortoS[$i]; | |
575 # | |
576 # $ortoA[$i] = $ortoA[$i+1]; | |
577 # $ortoB[$i] = $ortoB[$i+1]; | |
578 # $ortoS[$i] = $ortoS[$i+1]; | |
579 # | |
580 # $ortoA[$i+1] = $tempA; | |
581 # $ortoB[$i+1] = $tempB; | |
582 # $ortoS[$i+1] = $tempS; | |
583 # | |
584 # --$i if ($i > 1); | |
585 # } | |
586 # } | |
587 # | |
588 # # Use bubblesort to sort ortholog pairs by score | |
589 # for $i(1..($o-1)){ | |
590 # while($ortoS[$i] < $ortoS[$i+1]){ | |
591 # # Swap places: | |
592 # $tempA = $ortoA[$i]; | |
593 # $tempB = $ortoB[$i]; | |
594 # $tempS = $ortoS[$i]; | |
595 # | |
596 # $ortoA[$i] = $ortoA[$i+1]; | |
597 # $ortoB[$i] = $ortoB[$i+1]; | |
598 # $ortoS[$i] = $ortoS[$i+1]; | |
599 # | |
600 # $ortoA[$i+1] = $tempA; | |
601 # $ortoB[$i+1] = $tempB; | |
602 # $ortoS[$i+1] = $tempS; | |
603 # | |
604 # --$i if ($i > 1); | |
605 # } | |
606 # } | |
607 | |
608 ###################################################################################################### End modification bt Isabella 2 | |
609 | |
610 @all_ortologsA = (); | |
611 @all_ortologsB = (); | |
612 for $i(1..$o){ | |
613 push(@all_ortologsA,$ortoA[$i]); # List of all orthologs | |
614 push(@all_ortologsB,$ortoB[$i]); | |
615 } | |
616 undef $is_ortologA; # Create index that we can check later | |
617 undef $is_ortologB; | |
618 grep (vec($is_ortologA,$_,1) = 1, @all_ortologsA); | |
619 grep (vec($is_ortologB,$_,1) = 1, @all_ortologsB); | |
620 | |
621 if ($show_times){ | |
622 ($user_time,,,) = times; | |
623 printf ("Finding and sorting orthologs took %.2f seconds\n", ($user_time - $prev_time)); | |
624 $prev_time = $user_time; | |
625 } | |
626 ################################################# | |
627 # Read in best hits from blast output file AC | |
628 ################################################# | |
629 if ($use_outgroup){ | |
630 $count = 0; | |
631 open AC, "$blast_outputAC" or die "Blast output file A->C is missing\n"; | |
632 while (<AC>){ | |
633 chomp; | |
634 @Fld = split(/\s+/); # Get query, match and score | |
635 | |
636 if( scalar @Fld < 9 ){ | |
637 if($Fld[0]=~/done/){ | |
638 print STDERR "AC ok\n"; | |
639 } | |
640 next; | |
641 } | |
642 | |
643 $q = $Fld[0]; | |
644 $m = $Fld[1]; | |
645 $idQ = $idA{$q}; | |
646 $idM = $idC{$m}; | |
647 $score = $Fld[2]; | |
648 next unless (vec($is_ortologA,$idQ,1)); | |
649 | |
650 next if (!overlap_test(@Fld)); | |
651 | |
652 next if ($score < $score_cutoff); | |
653 | |
654 next if ($count and ($q eq $oldq)); | |
655 # Only comes here if this is the best hit: | |
656 $besthitAC[$idQ] = $idM; | |
657 $bestscoreAC[$idQ] = $score; | |
658 $oldq = $q; | |
659 ++$count; | |
660 } | |
661 close AC; | |
662 ################################################# | |
663 # Read in best hits from blast output file BC | |
664 ################################################# | |
665 $count = 0; | |
666 open BC, "$blast_outputBC" or die "Blast output file B->C is missing\n"; | |
667 while (<BC>){ | |
668 chomp; | |
669 @Fld = split(/\s+/); # Get query, match and score | |
670 | |
671 if( scalar @Fld < 9 ){ | |
672 if($Fld[0]=~/done/){ | |
673 print STDERR "BC ok\n"; | |
674 } | |
675 next; | |
676 } | |
677 | |
678 $q = $Fld[0]; | |
679 $m = $Fld[1]; | |
680 $idQ = $idB{$q}; | |
681 $idM = $idC{$m}; | |
682 $score = $Fld[2]; | |
683 next unless (vec($is_ortologB,$idQ,1)); | |
684 | |
685 next if (!overlap_test(@Fld)); | |
686 | |
687 next if ($score < $score_cutoff); | |
688 | |
689 next if ($count and ($q eq $oldq)); | |
690 # Only comes here if this is the best hit: | |
691 $besthitBC[$idQ] = $idM; | |
692 $bestscoreBC[$idQ] = $score; | |
693 $oldq = $q; | |
694 ++$count; | |
695 } | |
696 close BC; | |
697 ################################ | |
698 # Detect rooting problems | |
699 ################################ | |
700 $rejected = 0; | |
701 @del = (); | |
702 $file = "rejected_sequences." . $fasta_seq_fileC; | |
703 open OUTGR, ">$file"; | |
704 for $i (1..$o){ | |
705 $diff1 = $diff2 = 0; | |
706 $idA = $ortoA[$i]; | |
707 $idB = $ortoB[$i]; | |
708 $diff1 = $bestscoreAC[$idA] - $ortoS[$i]; | |
709 $diff2 = $bestscoreBC[$idB] - $ortoS[$i]; | |
710 if ($diff1 > $outgroup_cutoff){ | |
711 print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]). | |
712 $nameA[$idA] from $fasta_seq_fileA is closer to $nameC[$besthitAC[$idA]] than to $nameB[$idB]\n"; | |
713 print OUTGR " $ortoS[$i] < $bestscoreAC[$idA] by $diff1\n"; | |
714 } | |
715 if ($diff2 > $outgroup_cutoff){ | |
716 print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]). | |
717 $nameB[$idB] from $fasta_seq_fileB is closer to $nameC[$besthitBC[$idB]] than to $nameA[$idA]\n"; | |
718 print OUTGR " $ortoS[$i] < $bestscoreBC[$idB] by $diff2\n"; | |
719 } | |
720 if (($diff1 > $outgroup_cutoff) or ($diff2 > $outgroup_cutoff)){ | |
721 ++$rejected; | |
722 $del[$i] = 1; | |
723 } | |
724 } | |
725 print "Number of rejected groups: $rejected (outgroup sequence was closer by more than $outgroup_cutoff bits)\n"; | |
726 close OUTGR; | |
727 } # End of $use_outgroup | |
728 ################################ | |
729 # Read inside scores from AA | |
730 ################################ | |
731 $count = 0; | |
732 $max_hit = 0; | |
733 open AA, "$blast_outputAA" or die "Blast output file A->A is missing\n"; | |
734 while (<AA>) { | |
735 chomp; # strip newline | |
736 | |
737 @Fld = split(/\s+/); # Get query and match names | |
738 | |
739 if( scalar @Fld < 9 ){ | |
740 if($Fld[0]=~/done/){ | |
741 print STDERR "AA ok\n"; | |
742 } | |
743 next; | |
744 } | |
745 | |
746 $q = $Fld[0]; | |
747 $m = $Fld[1]; | |
748 $score = $Fld[2]; | |
749 next unless (vec($is_ortologA,$idA{$q},1)); | |
750 | |
751 next if (!overlap_test(@Fld)); | |
752 | |
753 next if ($score < $score_cutoff); | |
754 | |
755 if(!$count || $q ne $oldq){ # New query | |
756 $max_hit = $hit if ($hit > $max_hit); | |
757 $hit = 0; | |
758 $oldq = $q; | |
759 } | |
760 ++$hit; | |
761 ++$count; | |
762 $scoreAA{"$idA{$q}:$idA{$m}"} = int($score + 0.5); | |
763 $hitAA[$idA{$q}][$hit] = int($idA{$m}); | |
764 $hitnAA[$idA{$q}] = $hit; | |
765 } | |
766 close AA; | |
767 if ($output){ | |
768 print OUTPUT "$count $fasta_seq_fileA-$fasta_seq_fileA matches\n"; | |
769 } | |
770 ################################ | |
771 # Read inside scores from BB | |
772 ################################ | |
773 $count = 0; | |
774 open BB, "$blast_outputBB" or die "Blast output file B->B is missing\n"; | |
775 while (<BB>) { | |
776 chomp; # strip newline | |
777 | |
778 @Fld = split(/\s+/); # Get query and match names | |
779 | |
780 if( scalar @Fld < 9 ){ | |
781 if($Fld[0]=~/done/){ | |
782 print STDERR "BB ok\n"; | |
783 } | |
784 next; | |
785 } | |
786 | |
787 $q = $Fld[0]; | |
788 $m = $Fld[1]; | |
789 $score = $Fld[2]; | |
790 next unless (vec($is_ortologB,$idB{$q},1)); | |
791 | |
792 next if (!overlap_test(@Fld)); | |
793 | |
794 next if ($score < $score_cutoff); | |
795 | |
796 if(!$count || $q ne $oldq){ # New query | |
797 $max_hit = $hit if ($hit > $max_hit); | |
798 $oldq = $q; | |
799 $hit = 0; | |
800 } | |
801 ++$count; | |
802 ++$hit; | |
803 $scoreBB{"$idB{$q}:$idB{$m}"} = int($score + 0.5); | |
804 $hitBB[$idB{$q}][$hit] = int($idB{$m}); | |
805 $hitnBB[$idB{$q}] = $hit; | |
806 } | |
807 close BB; | |
808 if ($output){ | |
809 print OUTPUT "$count $fasta_seq_fileB-$fasta_seq_fileB matches\n"; | |
810 } | |
811 if ($show_times){ | |
812 ($user_time,,,) = times; | |
813 printf ("Reading paralogous hits took %.2f seconds\n", ($user_time - $prev_time)); | |
814 $prev_time = $user_time; | |
815 } | |
816 print "Maximum number of hits per sequence was $max_hit\n" if ($debug); | |
817 ##################################################### | |
818 # Find paralogs: | |
819 ##################################################### | |
820 for $i(1..$o){ | |
821 $merge[$i] = 0; | |
822 next if($del[$i]); # If outgroup species was closer to one of the seed orthologs | |
823 $idA = $ortoA[$i]; | |
824 $idB = $ortoB[$i]; | |
825 local @membersA = (); | |
826 local @membersB = (); | |
827 | |
828 undef $is_paralogA[$i]; | |
829 undef $is_paralogB[$i]; | |
830 | |
831 print "$i: Ortholog pair $nameA[$idA] and $nameB[$idB]. $hitnAA[$idA] hits for A and $hitnBB[$idB] hits for B\n" if ($debug); | |
832 # Check if current ortholog is already clustered: | |
833 for $j(1..($i-1)){ | |
834 # Overlap type 1: Both orthologs already clustered here -> merge | |
835 if ((vec($is_paralogA[$j],$idA,1)) and (vec($is_paralogB[$j],$idB,1))){ | |
836 $merge[$i] = $j; | |
837 print "Merge CASE 1: group $i ($nameB[$idB]-$nameA[$idA]) and $j ($nameB[$ortoB[$j]]-$nameA[$ortoA[$j]])\n" if ($debug); | |
838 last; | |
839 } | |
840 # Overlap type 2: 2 competing ortholog pairs -> merge | |
841 elsif (($ortoS[$j] - $ortoS[$i] <= $grey_zone) | |
842 and (($ortoA[$j] == $ortoA[$i]) or ($ortoB[$j] == $ortoB[$i])) | |
843 # and ($paralogsA[$j]) | |
844 ){ # The last condition is false if the previous cluster has been already deleted | |
845 $merge[$i] = $j; | |
846 print "Merge CASE 2: group $i ($nameA[$ortoA[$i]]-$nameB[$ortoB[$i]]) and $j ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug); | |
847 last; | |
848 } | |
849 # Overlap type 3: DELETE One of the orthologs belongs to some much stronger cluster -> delete | |
850 elsif (((vec($is_paralogA[$j],$idA,1)) or (vec($is_paralogB[$j],$idB,1))) and | |
851 ($ortoS[$j] - $ortoS[$i] > $score_cutoff)){ | |
852 print "Delete CASE 3: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug); | |
853 $merge[$i]= -1; # Means - do not add sequences to this cluster | |
854 $paralogsA[$i] = ""; | |
855 $paralogsB[$i] = ""; | |
856 last; | |
857 } | |
858 # Overlap type 4: One of the orthologs is close to the center of other cluster | |
859 elsif (((vec($is_paralogA[$j],$idA,1)) and ($confPA[$idA] > $group_overlap_cutoff)) or | |
860 ((vec($is_paralogB[$j],$idB,1)) and ($confPB[$idB] > $group_overlap_cutoff))){ | |
861 print "Merge CASE 4: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug); | |
862 $merge[$i] = $j; | |
863 last; | |
864 } | |
865 # Overlap type 5: | |
866 # All clusters that were overlapping, but not catched by previous "if" statements will be DIVIDED! | |
867 } | |
868 next if ($merge[$i] < 0); # This cluster should be deleted | |
869 ##### Check for paralogs in A | |
870 $N = $hitnAA[$idA]; | |
871 for $j(1..$N){ | |
872 $hitID = $hitAA[$idA][$j]; # hit of idA | |
873 # print "Working with $nameA[$hitID]\n" if ($debug == 2); | |
874 # Decide whether this hit is inside the paralog circle: | |
875 if ( ($idA == $hitID) or ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$idA]) and | |
876 ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$hitID])){ | |
877 if ($debug == 2){ | |
878 print " Paralog candidates: "; | |
879 printf ("%-20s: %-20s", $nameA[$idA], $nameA[$hitID]); | |
880 print "\t$scoreAA{\"$idA:$hitID\"} : $bestscoreAB[$idA] : $bestscoreAB[$hitID]\n"; | |
881 } | |
882 $paralogs = 1; | |
883 if ($scoreAA{"$idA:$idA"} == $ortoS[$i]){ | |
884 if ($scoreAA{"$idA:$hitID"} == $scoreAA{"$idA:$idA"}){ | |
885 $conf_here = 1.0; # In the center | |
886 } | |
887 else{ | |
888 $conf_here = 0.0; # On the border | |
889 } | |
890 } | |
891 else { | |
892 $conf_here = ($scoreAA{"$idA:$hitID"} - $ortoS[$i]) / | |
893 ($scoreAA{"$idA:$idA"} - $ortoS[$i]); | |
894 } | |
895 # Check if this paralog candidate is already clustered in other clusters | |
896 for $k(1..($i-1)){ | |
897 if (vec($is_paralogA[$k],$hitID,1)){ # Yes, found in cluster $k | |
898 if($debug == 2){ | |
899 print " $nameA[$hitID] is already in cluster $k, together with:"; | |
900 print " $nameA[$ortoA[$k]] and $nameB[$ortoB[$k]] "; | |
901 print "($scoreAA{\"$ortoA[$k]:$hitID\"})"; | |
902 } | |
903 if (($confPA[$hitID] >= $conf_here) and | |
904 ($j != 1)){ # The seed ortholog CAN NOT remain there | |
905 print " and remains there.\n" if ($debug == 2); | |
906 $paralogs = 0; # No action | |
907 } | |
908 else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k | |
909 print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster | |
910 @membersAK = split(/ /, $paralogsA[$k]); # This array contains IDs | |
911 $paralogsA[$k] = "";# Remove all paralogs from cluster $k | |
912 @tmp = (); | |
913 for $m(@membersAK){ | |
914 push(@tmp,$m) if ($m != $hitID); # Put other members back | |
915 } | |
916 $paralogsA[$k] = join(' ',@tmp); | |
917 undef $is_paralogA[$k]; # Create index that we can check later | |
918 grep (vec($is_paralogA[$k],$_,1) = 1, @tmp); | |
919 } | |
920 last; | |
921 } | |
922 } | |
923 next if (! $paralogs); # Skip that paralog - it is already in cluster $k | |
924 push (@membersA,$hitID); # Add this hit to paralogs of A | |
925 } | |
926 } | |
927 # Calculate confidence values now: | |
928 @tmp = (); | |
929 for $idP (@membersA){ # For each paralog calculate conf value | |
930 if($scoreAA{"$idA:$idA"} == $ortoS[$i]){ | |
931 if ($scoreAA{"$idA:$idP"} == $scoreAA{"$idA:$idA"}){ | |
932 $confPA[$idP] = 1.00; | |
933 } | |
934 else{ | |
935 $confPA[$idP] = 0.00; | |
936 } | |
937 } | |
938 else{ | |
939 $confPA[$idP] = ($scoreAA{"$idA:$idP"} - $ortoS[$i]) / | |
940 ($scoreAA{"$idA:$idA"} - $ortoS[$i]); | |
941 } | |
942 push (@tmp, $idP) if ($confPA[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs | |
943 } | |
944 @membersA = @tmp; | |
945 ########### Merge if necessary: | |
946 if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster | |
947 @tmp = split(/ /,$paralogsA[$merge[$i]]); | |
948 for $m (@membersA){ | |
949 push (@tmp, $m) unless (vec($is_paralogA[$merge[$i]],$m,1)); | |
950 } | |
951 $paralogsA[$merge[$i]] = join(' ',@tmp); | |
952 undef $is_paralogA[$merge[$i]]; | |
953 grep (vec($is_paralogA[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array | |
954 } | |
955 ######### Typical new cluster: | |
956 else{ # Create a new cluster | |
957 $paralogsA[$i] = join(' ',@membersA); | |
958 undef $is_paralogA; # Create index that we can check later | |
959 grep (vec($is_paralogA[$i],$_,1) = 1, @membersA); | |
960 } | |
961 ##### The same procedure for species B: | |
962 $N = $hitnBB[$idB]; | |
963 for $j(1..$N){ | |
964 $hitID = $hitBB[$idB][$j]; | |
965 # print "Working with $nameB[$hitID]\n" if ($debug == 2); | |
966 if ( ($idB == $hitID) or ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$idB]) and | |
967 ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$hitID])){ | |
968 if ($debug == 2){ | |
969 print " Paralog candidates: "; | |
970 printf ("%-20s: %-20s", $nameB[$idB], $nameB[$hitID]); | |
971 print "\t$scoreBB{\"$idB:$hitID\"} : "; | |
972 print "$bestscoreBA[$idB] : $bestscoreBA[$hitID]\n"; | |
973 } | |
974 $paralogs = 1; | |
975 if ($scoreBB{"$idB:$idB"} == $ortoS[$i]){ | |
976 if ($scoreBB{"$idB:$hitID"} == $scoreBB{"$idB:$idB"}){ | |
977 $conf_here = 1.0; | |
978 } | |
979 else{ | |
980 $conf_here = 0.0; | |
981 } | |
982 } | |
983 else{ | |
984 $conf_here = ($scoreBB{"$idB:$hitID"} - $ortoS[$i]) / | |
985 ($scoreBB{"$idB:$idB"} - $ortoS[$i]); | |
986 } | |
987 | |
988 # Check if this paralog candidate is already clustered in other clusters | |
989 for $k(1..($i-1)){ | |
990 if (vec($is_paralogB[$k],$hitID,1)){ # Yes, found in cluster $k | |
991 if($debug == 2){ | |
992 print " $nameB[$hitID] is already in cluster $k, together with:"; | |
993 print " $nameB[$ortoB[$k]] and $nameA[$ortoA[$k]] "; | |
994 print "($scoreBB{\"$ortoB[$k]:$hitID\"})"; | |
995 } | |
996 if (($confPB[$hitID] >= $conf_here) and | |
997 ($j != 1)){ # The seed ortholog CAN NOT remain there | |
998 print " and remains there.\n" if ($debug == 2); | |
999 $paralogs = 0; # No action | |
1000 } | |
1001 else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k | |
1002 print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster | |
1003 @membersBK = split(/ /, $paralogsB[$k]); # This array contains names, not IDs | |
1004 $paralogsB[$k] = ""; | |
1005 @tmp = (); | |
1006 for $m(@membersBK){ | |
1007 push(@tmp,$m) if ($m != $hitID); # Put other members back | |
1008 } | |
1009 $paralogsB[$k] = join(' ',@tmp); | |
1010 undef $is_paralogB[$k]; # Create index that we can check later | |
1011 grep (vec($is_paralogB[$k],$_,1) = 1, @tmp); | |
1012 } | |
1013 last; # Don't search in other clusters | |
1014 } | |
1015 } | |
1016 next if (! $paralogs); # Skip that paralog - it is already in cluster $k | |
1017 push (@membersB,$hitID); | |
1018 } | |
1019 } | |
1020 # Calculate confidence values now: | |
1021 @tmp = (); | |
1022 for $idP (@membersB){ # For each paralog calculate conf value | |
1023 if($scoreBB{"$idB:$idB"} == $ortoS[$i]){ | |
1024 if ($scoreBB{"$idB:$idP"} == $scoreBB{"$idB:$idB"}){ | |
1025 $confPB[$idP] = 1.0; | |
1026 } | |
1027 else{ | |
1028 $confPB[$idP] = 0.0; | |
1029 } | |
1030 } | |
1031 else{ | |
1032 $confPB[$idP] = ($scoreBB{"$idB:$idP"} - $ortoS[$i]) / | |
1033 ($scoreBB{"$idB:$idB"} - $ortoS[$i]); | |
1034 } | |
1035 push (@tmp, $idP) if ($confPB[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs | |
1036 } | |
1037 @membersB = @tmp; | |
1038 ########### Merge if necessary: | |
1039 if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster | |
1040 @tmp = split(/ /,$paralogsB[$merge[$i]]); | |
1041 for $m (@membersB){ | |
1042 push (@tmp, $m) unless (vec($is_paralogB[$merge[$i]],$m,1)); | |
1043 } | |
1044 $paralogsB[$merge[$i]] = join(' ',@tmp); | |
1045 undef $is_paralogB[$merge[$i]]; | |
1046 grep (vec($is_paralogB[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array | |
1047 } | |
1048 ######### Typical new cluster: | |
1049 else{ # Create a new cluster | |
1050 $paralogsB[$i] = join(' ',@membersB); | |
1051 undef $is_paralogB; # Create index that we can check later | |
1052 grep (vec($is_paralogB[$i],$_,1) = 1, @membersB); | |
1053 } | |
1054 } | |
1055 if ($show_times){ | |
1056 ($user_time,,,) = times; | |
1057 printf ("Finding in-paralogs took %.2f seconds\n", ($user_time - $prev_time)); | |
1058 $prev_time = $user_time; | |
1059 } | |
1060 ##################################################### | |
1061 &clean_up(1); | |
1062 #################################################### | |
1063 # Find group for orphans. If cluster contains only one member, find where it should go: | |
1064 for $i (1..$o){ | |
1065 @membersA = split(/ /, $paralogsA[$i]); | |
1066 @membersB = split(/ /, $paralogsB[$i]); | |
1067 $na = @membersA; | |
1068 $nb = @membersB; | |
1069 if (($na == 0) and $nb){ | |
1070 print "Warning: empty A cluster $i\n"; | |
1071 for $m (@membersB){ | |
1072 $bestscore = 0; | |
1073 $bestgroup = 0; | |
1074 $bestmatch = 0; | |
1075 for $j (1..$o) { | |
1076 next if ($i == $j); # Really need to check against all 100% members of the group. | |
1077 @membersBJ = split(/ /, $paralogsB[$j]); | |
1078 for $k (@membersBJ){ | |
1079 next if ($confPB[$k] != 1); # For all 100% in-paralogs | |
1080 $score = $scoreBB{"$m:$k"}; | |
1081 if ($score > $bestscore){ | |
1082 $bestscore = $score; | |
1083 $bestgroup = $j; | |
1084 $bestmatch = $k; | |
1085 } | |
1086 } | |
1087 } | |
1088 print "Orphan $nameB[$m] goes to group $bestgroup with $nameB[$bestmatch]\n" ; | |
1089 @members = split(/ /, $paralogsB[$bestgroup]); | |
1090 push (@members, $m); | |
1091 $paralogsB[$bestgroup] = join(' ',@members); | |
1092 $paralogsB[$i] = ""; | |
1093 undef $is_paralogB[$bestgroup]; | |
1094 undef $is_paralogB[$i]; | |
1095 grep (vec($is_paralogB[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array | |
1096 # grep (vec($is_paralogB[$i],$_,1) = 1, ()); | |
1097 } | |
1098 } | |
1099 if ($na and ($nb == 0)){ | |
1100 print "Warning: empty B cluster $i\n"; | |
1101 for $m (@membersA){ | |
1102 $bestscore = 0; | |
1103 $bestgroup = 0; | |
1104 $bestmatch = 0; | |
1105 for $j (1..$o) { | |
1106 next if ($i == $j); | |
1107 @membersAJ = split(/ /, $paralogsA[$j]); | |
1108 for $k (@membersAJ){ | |
1109 next if ($confPA[$k] != 1); # For all 100% in-paralogs | |
1110 $score = $scoreAA{"$m:$k"}; | |
1111 if ($score > $bestscore){ | |
1112 $bestscore = $score; | |
1113 $bestgroup = $j; | |
1114 $bestmatch = $k; | |
1115 } | |
1116 } | |
1117 } | |
1118 print "Orphan $nameA[$m] goes to group $bestgroup with $nameA[$bestmatch]\n"; | |
1119 @members = split(/ /, $paralogsA[$bestgroup]); | |
1120 push (@members, $m); | |
1121 $paralogsA[$bestgroup] = join(' ',@members); | |
1122 $paralogsA[$i] = ""; | |
1123 undef $is_paralogA[$bestgroup]; | |
1124 undef $is_paralogA[$i]; | |
1125 grep (vec($is_paralogA[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array | |
1126 # grep (vec($is_paralogA[$i],$_,1) = 1, ()); | |
1127 } | |
1128 } | |
1129 } | |
1130 | |
1131 &clean_up(1); | |
1132 ################### | |
1133 $htmlfile = "orthologs." . $ARGV[0] . "-" . $ARGV[1] . ".html"; | |
1134 if ($html){ | |
1135 open HTML, ">$htmlfile" or warn "Could not write to HTML file $filename\n"; | |
1136 } | |
1137 | |
1138 | |
1139 if ($output){ | |
1140 print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; | |
1141 print OUTPUT "$o groups of orthologs\n"; | |
1142 print OUTPUT "$totalA in-paralogs from $fasta_seq_fileA\n"; | |
1143 print OUTPUT "$totalB in-paralogs from $fasta_seq_fileB\n"; | |
1144 print OUTPUT "Grey zone $grey_zone bits\n"; | |
1145 print OUTPUT "Score cutoff $score_cutoff bits\n"; | |
1146 print OUTPUT "In-paralogs with confidence less than $conf_cutoff not shown\n"; | |
1147 print OUTPUT "Sequence overlap cutoff $seq_overlap_cutoff\n"; | |
1148 print OUTPUT "Group merging cutoff $group_overlap_cutoff\n"; | |
1149 print OUTPUT "Scoring matrix $matrix\n"; | |
1150 print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; | |
1151 } | |
1152 if ($html){ | |
1153 print HTML "<pre>\n"; | |
1154 print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; | |
1155 print HTML "$o groups of orthologs\n"; | |
1156 print HTML "$totalA in-paralogs from $fasta_seq_fileA\n"; | |
1157 print HTML "$totalB in-paralogs from $fasta_seq_fileB\n"; | |
1158 print HTML "Grey zone $grey_zone bits\n"; | |
1159 print HTML "Score cutoff $score_cutoff bits\n"; | |
1160 print HTML "In-paralogs with confidence less than $conf_cutoff not shown\n"; | |
1161 print HTML "Sequence overlap cutoff $seq_overlap_cutoff\n"; | |
1162 print HTML "Group merging cutoff $group_overlap_cutoff\n"; | |
1163 print HTML "Scoring matrix $matrix\n"; | |
1164 print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; | |
1165 } | |
1166 # ############################################################################## | |
1167 # Check for alternative orthologs, sort paralogs by confidence and print results | |
1168 # ############################################################################## | |
1169 if ($use_bootstrap and $debug){ | |
1170 open FF, ">BS_vs_bits" or warn "Could not write to file BS_vs_bits\n"; | |
1171 } | |
1172 for $i(1..$o){ | |
1173 @membersA = split(/ /, $paralogsA[$i]); | |
1174 @membersB = split(/ /, $paralogsB[$i]); | |
1175 $message = ""; | |
1176 $htmlmessage = ""; | |
1177 | |
1178 $idB = $ortoB[$i]; | |
1179 $nB = $hitnBA[$idB]; | |
1180 for $idA(@membersA){ | |
1181 next if ($confPA[$idA] != 1.0); | |
1182 $nA = $hitnAB[$idA]; | |
1183 $confA[$i] = $ortoS[$i]; # default | |
1184 $bsA[$idA] = 1.0; | |
1185 ############## | |
1186 for $j(1..$nB){ | |
1187 $idH = $hitBA[$idB][$j]; | |
1188 ################ Some checks for alternative orthologs: | |
1189 # 1. Don't consider sequences that are already in this cluster | |
1190 next if (vec($is_paralogA[$i],$idH,1)); | |
1191 next if ($confPA[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog | |
1192 | |
1193 # 2. Check if candidate for alternative ortholog is already clustered in stronger clusters | |
1194 $in_other_cluster = 0; | |
1195 for $k(1..($i-1)){ # Check if current ortholog is already clustered | |
1196 if (vec($is_paralogA[$k],$idH,1)){ | |
1197 $in_other_cluster = $k; | |
1198 last; | |
1199 } | |
1200 } | |
1201 # next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog | |
1202 | |
1203 # 3. The best hit of candidate ortholog should be ortoA or at least to belong into this cluster | |
1204 @besthits = split (/ /,$besthitAB[$idH]); | |
1205 $this_family = 0; | |
1206 for $bh (@besthits){ | |
1207 $this_family = 1 if ($idB == $bh); | |
1208 } | |
1209 # next unless ($this_family); # There was an alternative BA match but it's best match did not belong here | |
1210 ################# Done with checks - if sequence passed, then it could be an alternative ortholog | |
1211 $confA[$i] = $ortoS[$i] - $scoreBA{"$idB:$idH"}; | |
1212 if ($use_bootstrap){ | |
1213 if ($confA[$i] < $ortoS[$i]){ # Danger zone - check for bootstrap | |
1214 $bsA[$idA] = &bootstrap($fasta_seq_fileB,$idB,$idA,$idH); | |
1215 } | |
1216 else { | |
1217 $bsA[$idA] = 1.0; | |
1218 } | |
1219 } | |
1220 last; | |
1221 } | |
1222 $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameA[$idA], 100*$bsA[$idA]); | |
1223 $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75); | |
1224 $message .= sprintf("\n"); | |
1225 if ($html){ | |
1226 if ($bsA[$idA] < 0.75){ | |
1227 $htmlmessage .= sprintf("<font color=\"red\">"); | |
1228 } | |
1229 elsif ($bsA[$idA] < 0.95){ | |
1230 $htmlmessage .= sprintf("<font color=\"\#FFCC00\">"); | |
1231 } | |
1232 else { | |
1233 $htmlmessage .= sprintf("<font color=\"green\">"); | |
1234 } | |
1235 $htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameA[$idA], 100*$bsA[$idA]); | |
1236 $htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75); | |
1237 $htmlmessage .= sprintf("</font>"); | |
1238 } | |
1239 printf (FF "%s\t%d\t%d\n", $nameA[$idA], $confA[$i], 100*$bsA[$idA]) if ($use_bootstrap and $debug); | |
1240 } | |
1241 ######## | |
1242 $idA = $ortoA[$i]; | |
1243 $nA = $hitnAB[$idA]; | |
1244 for $idB(@membersB){ | |
1245 next if ($confPB[$idB] != 1.0); | |
1246 $nB = $hitnBA[$idB]; | |
1247 $confB[$i] = $ortoS[$i]; # default | |
1248 $bsB[$idB] = 1.0; | |
1249 | |
1250 for $j(1..$nA){ # For all AB hits of given ortholog | |
1251 $idH = $hitAB[$idA][$j]; | |
1252 # ############### Some checks for alternative orthologs: | |
1253 # 1. Don't consider sequences that are already in this cluster | |
1254 next if (vec($is_paralogB[$i],$idH,1)); | |
1255 next if ($confPB[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog | |
1256 | |
1257 # 2. Check if candidate for alternative ortholog is already clustered in stronger clusters | |
1258 $in_other_cluster = 0; | |
1259 for $k(1..($i-1)){ | |
1260 if (vec($is_paralogB[$k],$idH,1)){ | |
1261 $in_other_cluster = $k; | |
1262 last; # out from this cycle | |
1263 } | |
1264 } | |
1265 # next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog | |
1266 | |
1267 # 3. The best hit of candidate ortholog should be ortoA | |
1268 @besthits = split (/ /,$besthitBA[$idH]); | |
1269 $this_family = 0; | |
1270 for $bh (@besthits){ | |
1271 $this_family = 1 if ($idA == $bh); | |
1272 } | |
1273 # next unless ($this_family); # There was an alternative BA match but it's best match did not belong here | |
1274 # ################ Done with checks - if sequence passed, then it could be an alternative ortholog | |
1275 $confB[$i] = $ortoS[$i] - $scoreAB{"$idA:$idH"}; | |
1276 if ($use_bootstrap){ | |
1277 if ($confB[$i] < $ortoS[$i]){ | |
1278 $bsB[$idB] = &bootstrap($fasta_seq_fileA,$idA,$idB,$idH); | |
1279 } | |
1280 else { | |
1281 $bsB[$idB] = 1.0; | |
1282 } | |
1283 } | |
1284 last; | |
1285 } | |
1286 $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameB[$idB], 100*$bsB[$idB]); | |
1287 $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75); | |
1288 $message .= sprintf("\n"); | |
1289 if ($html){ | |
1290 if ($bsB[$idB] < 0.75){ | |
1291 $htmlmessage .= sprintf("<font color=\"red\">"); | |
1292 } | |
1293 elsif ($bsB[$idB] < 0.95){ | |
1294 $htmlmessage .= sprintf("<font color=\"\#FFCC00\">"); | |
1295 } | |
1296 else { | |
1297 $htmlmessage .= sprintf("<font color=\"green\">"); | |
1298 } | |
1299 $htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameB[$idB], 100*$bsB[$idB]); | |
1300 $htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75); | |
1301 $htmlmessage .= sprintf("</font>"); | |
1302 } | |
1303 printf (FF "%s\t%d\t%d\n", $nameB[$idB], $confB[$i], 100*$bsB[$idB]) if ($use_bootstrap and $debug); | |
1304 } | |
1305 close FF; | |
1306 ########### Print header ############### | |
1307 if ($output){ | |
1308 print OUTPUT "___________________________________________________________________________________\n"; | |
1309 print OUTPUT "Group of orthologs #" . $i .". Best score $ortoS[$i] bits\n"; | |
1310 print OUTPUT "Score difference with first non-orthologous sequence - "; | |
1311 printf (OUTPUT "%s:%d %s:%d\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]); | |
1312 } | |
1313 | |
1314 if ($html){ | |
1315 print HTML "</pre>\n"; | |
1316 print HTML "<hr WIDTH=\"100%\">"; | |
1317 print HTML "<h3>"; | |
1318 print HTML "Group of orthologs #" . $i .". Best score $ortoS[$i] bits<br>\n"; | |
1319 print HTML "Score difference with first non-orthologous sequence - "; | |
1320 printf (HTML "%s:%d %s:%d</h3><pre>\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]); | |
1321 } | |
1322 ########### Sort and print members of A ############ | |
1323 $nA = @membersA; | |
1324 $nB = @membersB; | |
1325 $nMAX = ($nA > $nB) ? $nA : $nB; | |
1326 # Sort membersA inside the cluster by confidence: | |
1327 for $m (0..($nA-1)){ | |
1328 while($confPA[$membersA[$m]] < $confPA[$membersA[$m+1]]){ | |
1329 $temp = $membersA[$m]; | |
1330 $membersA[$m] = $membersA[$m+1]; | |
1331 $membersA[$m+1] = $temp; | |
1332 --$m if ($m > 1); | |
1333 } | |
1334 } | |
1335 $paralogsA[$i] = join(' ',@membersA); # Put them back together | |
1336 # Sort membersB inside the cluster by confidence: | |
1337 for $m (0..($nB-1)){ | |
1338 while($confPB[$membersB[$m]] < $confPB[$membersB[$m+1]]){ | |
1339 $temp = $membersB[$m]; | |
1340 $membersB[$m] = $membersB[$m+1]; | |
1341 $membersB[$m+1] = $temp; | |
1342 --$m if ($m > 1); | |
1343 } | |
1344 } | |
1345 $paralogsB[$i] = join(' ',@membersB); # Put them back together | |
1346 # Print to text file and to HTML file | |
1347 for $m (0..($nMAX-1)){ | |
1348 if ($m < $nA){ | |
1349 if ($output){ | |
1350 printf (OUTPUT "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]])); | |
1351 } | |
1352 if ($html){ | |
1353 print HTML "<B>" if ($confPA[$membersA[$m]] == 1); | |
1354 printf (HTML "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]])); | |
1355 print HTML "</B>" if ($confPA[$membersA[$m]] == 1); | |
1356 } | |
1357 } | |
1358 else { | |
1359 printf (OUTPUT "%-20s\t%-7s\t\t", " ", " "); | |
1360 printf (HTML "%-20s\t%-7s\t\t", " ", " ") if ($html); | |
1361 } | |
1362 if ($m < $nB){ | |
1363 if ($output){ | |
1364 printf (OUTPUT "%-20s\t%.2f%%\n", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]])); | |
1365 } | |
1366 if ($html){ | |
1367 print HTML "<B>" if ($confPB[$membersB[$m]] == 1); | |
1368 printf (HTML "%-20s\t%.2f%%", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]])); | |
1369 print HTML "</B>" if ($confPB[$membersB[$m]] == 1); | |
1370 print HTML "\n"; | |
1371 } | |
1372 } | |
1373 else { | |
1374 printf (OUTPUT "%-20s\t%-7s\n", " ", " ") if($output); | |
1375 print HTML "\n" if ($html); | |
1376 } | |
1377 } | |
1378 print OUTPUT $message if ($use_bootstrap and $output); | |
1379 print HTML "$htmlmessage" if ($use_bootstrap and $html); | |
1380 } | |
1381 if ($output) { | |
1382 close OUTPUT; | |
1383 print "Output saved to file $outputfile\n"; | |
1384 } | |
1385 if ($html){ | |
1386 close HTML; | |
1387 print "HTML output saved to $htmlfile\n"; | |
1388 } | |
1389 | |
1390 if ($table){ | |
1391 $filename = "table." . $ARGV[0] . "-" . $ARGV[1]; | |
1392 open F, ">$filename" or die; | |
1393 print F "OrtoID\tScore\tOrtoA\tOrtoB\n"; | |
1394 for $i(1..$o){ | |
1395 print F "$i\t$ortoS[$i]\t"; | |
1396 @members = split(/ /, $paralogsA[$i]); | |
1397 for $m (@members){ | |
1398 $m =~ s/://g; | |
1399 printf (F "%s %.3f ", $nameA[$m], $confPA[$m]); | |
1400 } | |
1401 print F "\t"; | |
1402 @members = split(/ /, $paralogsB[$i]); | |
1403 for $m (@members){ | |
1404 $m =~ s/://g; | |
1405 printf (F "%s %.3f ", $nameB[$m], $confPB[$m]); | |
1406 } | |
1407 print F "\n"; | |
1408 } | |
1409 close F; | |
1410 print "Table output saved to $filename\n"; | |
1411 } | |
1412 if ($mysql_table){ | |
1413 $filename2 = "sqltable." . $ARGV[0] . "-" . $ARGV[1]; | |
1414 open F2, ">$filename2" or die; | |
1415 for $i(1..$o){ | |
1416 @membersA = split(/ /, $paralogsA[$i]); | |
1417 for $m (@membersA){ | |
1418 # $m =~ s/://g; | |
1419 if ($use_bootstrap && $bsA[$m]) { | |
1420 printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m], 100*$bsA[$m]); | |
1421 } else { | |
1422 printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m]); | |
1423 } | |
1424 } | |
1425 @membersB = split(/ /, $paralogsB[$i]); | |
1426 for $m (@membersB){ | |
1427 # $m =~ s/://g; | |
1428 if ($use_bootstrap && $bsB[$m]) { | |
1429 printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m], 100*$bsB[$m]); | |
1430 }else { | |
1431 printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m]); | |
1432 } | |
1433 } | |
1434 } | |
1435 close F2; | |
1436 print "mysql output saved to $filename2\n"; | |
1437 } | |
1438 if ($show_times){ | |
1439 ($user_time,,,) = times; | |
1440 printf ("Finding bootstrap values and printing took %.2f seconds\n", ($user_time - $prev_time)); | |
1441 printf ("The overall execution time: %.2f seconds\n", $user_time); | |
1442 } | |
1443 if ($run_blast) { | |
1444 unlink "formatdb.log"; | |
1445 unlink "$fasta_seq_fileA.phr", "$fasta_seq_fileA.pin", "$fasta_seq_fileA.psq"; | |
1446 unlink "$fasta_seq_fileB.phr", "$fasta_seq_fileB.pin", "$fasta_seq_fileB.psq" if (@ARGV >= 2); | |
1447 unlink "$fasta_seq_fileC.phr", "$fasta_seq_fileC.pin", "$fasta_seq_fileC.psq" if ($use_outgroup); | |
1448 } | |
1449 } | |
1450 | |
1451 ############################################################## | |
1452 # Functions: | |
1453 ############################################################## | |
1454 sub clean_up { # Sort members within cluster and clusters by size | |
1455 ############################################################################################### Modification by Isabella 3 | |
1456 | |
1457 # Sort on index arrays with perl's built in sort instead of using bubble sort. | |
1458 | |
1459 $var = shift; | |
1460 $totalA = $totalB = 0; | |
1461 # First pass: count members within each cluster | |
1462 foreach $i (1..$o) { | |
1463 @membersA = split(/ /, $paralogsA[$i]); | |
1464 $clusnA[$i] = @membersA; # Number of members in this cluster | |
1465 $totalA += $clusnA[$i]; | |
1466 $paralogsA[$i] = join(' ',@membersA); | |
1467 | |
1468 @membersB = split(/ /, $paralogsB[$i]); | |
1469 $clusnB[$i] = @membersB; # Number of members in this cluster | |
1470 $totalB += $clusnB[$i]; | |
1471 $paralogsB[$i] = join(' ',@membersB); | |
1472 | |
1473 $clusn[$i] = $clusnB[$i] + $clusnA[$i]; # Number of members in given group | |
1474 } | |
1475 | |
1476 # Create an array used to store the position each element shall have in the final array | |
1477 # The elements are initialized with the position numbers | |
1478 my @position_index_array = (1..$o); | |
1479 | |
1480 # Sort the position list according to cluster size | |
1481 my @cluster_sorted_position_list = sort { $clusn[$b] <=> $clusn[$a]} @position_index_array; | |
1482 | |
1483 # Create new arrays for the sorted information | |
1484 my @new_paralogsA; | |
1485 my @new_paralogsB; | |
1486 my @new_is_paralogA; | |
1487 my @new_is_paralogB; | |
1488 my @new_clusn; | |
1489 my @new_ortoS; | |
1490 my @new_ortoA; | |
1491 my @new_ortoB; | |
1492 | |
1493 | |
1494 # Add the information to the new arrays in the orer specifeid by the index array | |
1495 for (my $index_in_list = 0; $index_in_list < scalar @cluster_sorted_position_list; $index_in_list++) { | |
1496 | |
1497 my $old_index = $cluster_sorted_position_list[$index_in_list]; | |
1498 | |
1499 if (!$clusn[$old_index]) { | |
1500 $o = (scalar @new_ortoS) - 1; | |
1501 last; | |
1502 } | |
1503 | |
1504 $new_paralogsA[$index_in_list + 1] = $paralogsA[$old_index]; | |
1505 $new_paralogsB[$index_in_list + 1] = $paralogsB[$old_index]; | |
1506 $new_is_paralogA[$index_in_list + 1] = $is_paralogA[$old_index]; | |
1507 $new_is_paralogB[$index_in_list + 1] = $is_paralogB[$old_index]; | |
1508 $new_clusn[$index_in_list + 1] = $clusn[$old_index]; | |
1509 $new_ortoA[$index_in_list + 1] = $ortoA[$old_index]; | |
1510 $new_ortoB[$index_in_list + 1] = $ortoB[$old_index]; | |
1511 $new_ortoS[$index_in_list + 1] = $ortoS[$old_index]; | |
1512 } | |
1513 | |
1514 @paralogsA = @new_paralogsA; | |
1515 @paralogsB = @new_paralogsB; | |
1516 @is_paralogA = @new_is_paralogA; | |
1517 @is_paralogB = @new_is_paralogB; | |
1518 @clusn = @new_clusn; | |
1519 @ortoS = @new_ortoS; | |
1520 @ortoA = @new_ortoA; | |
1521 @ortoB = @new_ortoB; | |
1522 | |
1523 # Create an array used to store the position each element shall have in the final array | |
1524 # The elements are initialized with the position numbers | |
1525 @position_index_array = (1..$o); | |
1526 | |
1527 # Sort the position list according to score | |
1528 @score_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @position_index_array; | |
1529 | |
1530 # Create new arrays for the sorted information | |
1531 my @new_paralogsA2 = (); | |
1532 my @new_paralogsB2 = (); | |
1533 my @new_is_paralogA2 = (); | |
1534 my @new_is_paralogB2 = (); | |
1535 my @new_clusn2 = (); | |
1536 my @new_ortoS2 = (); | |
1537 my @new_ortoA2 = (); | |
1538 my @new_ortoB2 = (); | |
1539 | |
1540 # Add the information to the new arrays in the orer specifeid by the index array | |
1541 for (my $index_in_list = 0; $index_in_list < scalar @score_sorted_position_list; $index_in_list++) { | |
1542 | |
1543 my $old_index = $score_sorted_position_list[$index_in_list]; | |
1544 $new_paralogsA2[$index_in_list + 1] = $paralogsA[$old_index]; | |
1545 $new_paralogsB2[$index_in_list + 1] = $paralogsB[$old_index]; | |
1546 $new_is_paralogA2[$index_in_list + 1] = $is_paralogA[$old_index]; | |
1547 $new_is_paralogB2[$index_in_list + 1] = $is_paralogB[$old_index]; | |
1548 $new_clusn2[$index_in_list + 1] = $clusn[$old_index]; | |
1549 $new_ortoA2[$index_in_list + 1] = $ortoA[$old_index]; | |
1550 $new_ortoB2[$index_in_list + 1] = $ortoB[$old_index]; | |
1551 $new_ortoS2[$index_in_list + 1] = $ortoS[$old_index]; | |
1552 } | |
1553 | |
1554 @paralogsA = @new_paralogsA2; | |
1555 @paralogsB = @new_paralogsB2; | |
1556 @is_paralogA = @new_is_paralogA2; | |
1557 @is_paralogB = @new_is_paralogB2; | |
1558 @clusn = @new_clusn2; | |
1559 @ortoS = @new_ortoS2; | |
1560 @ortoA = @new_ortoA2; | |
1561 @ortoB = @new_ortoB2; | |
1562 | |
1563 #################################################################################### End modification by Isabella 3 | |
1564 | |
1565 } | |
1566 sub bootstrap{ | |
1567 my $species = shift; | |
1568 my $seq_id1 = shift; # Query ID from $species | |
1569 my $seq_id2 = shift; # Best hit ID from other species | |
1570 my $seq_id3 = shift; # Second best hit | |
1571 # Retrieve sequence 1 from $species and sequence 2 from opposite species | |
1572 my $significance = 0.0; | |
1573 | |
1574 if ($species eq $fasta_seq_fileA){ | |
1575 $file1 = $fasta_seq_fileA; | |
1576 $file2 = $fasta_seq_fileB; | |
1577 } | |
1578 elsif ($species eq $fasta_seq_fileB){ | |
1579 $file1 = $fasta_seq_fileB; | |
1580 $file2 = $fasta_seq_fileA; | |
1581 } | |
1582 else { | |
1583 print "Bootstrap values for ortholog groups are not calculated\n"; | |
1584 return 0; | |
1585 } | |
1586 | |
1587 open A, $file1 or die; | |
1588 $id = 0; | |
1589 $print_this_seq = 0; | |
1590 $seq1 = ""; | |
1591 $seq2 = ""; | |
1592 | |
1593 $query_file = $seq_id1 . ".faq"; | |
1594 open Q, ">$query_file" or die; | |
1595 | |
1596 while (<A>){ | |
1597 if(/^\>/){ | |
1598 ++$id; | |
1599 $print_this_seq = ($id == $seq_id1)?1:0; | |
1600 } | |
1601 print Q if ($print_this_seq); | |
1602 } | |
1603 close A; | |
1604 close Q; | |
1605 ### | |
1606 open B, $file2 or die; | |
1607 $db_file = $seq_id2 . ".fas"; | |
1608 open DB, ">$db_file" or die; | |
1609 $id = 0; | |
1610 $print_this_seq = 0; | |
1611 | |
1612 while (<B>){ | |
1613 if(/^\>/){ | |
1614 ++$id; | |
1615 $print_this_seq = (($id == $seq_id2) or ($id == $seq_id3))?1:0; | |
1616 } | |
1617 print DB if ($print_this_seq); | |
1618 } | |
1619 close B; | |
1620 close DB; | |
1621 | |
1622 system "$formatdb -i $db_file"; | |
1623 # Use soft masking in 1-pass mode for simplicity. | |
1624 system "$blastall -F\"m S\" -i $query_file -z 5000000 -d $db_file -p blastp -M $matrix -m7 | ./$blastParser 0 -a > $seq_id2.faa"; | |
1625 | |
1626 # Note: Changed score cutoff 50 to 0 for blast2faa.pl (060402). | |
1627 # Reason: after a cluster merger a score can be less than the cutoff (50) | |
1628 # which will remove the sequence in blast2faa.pl. The bootstrapping will | |
1629 # then fail. | |
1630 # AGAIN, updaye | |
1631 | |
1632 if (-s("$seq_id2.faa")){ | |
1633 | |
1634 system("java -jar $seqstat -m $matrix -n 1000 -i $seq_id2.faa > $seq_id2.bs"); # Can handle U, u | |
1635 | |
1636 if (-s("$seq_id2.bs")){ | |
1637 open BS, "$seq_id2.bs" or die "pac failed\n"; | |
1638 $_ = <BS>; | |
1639 ($dummy1,$dummy2,$dummy3,$dummy4,$significance) = split(/\s+/); | |
1640 close BS; | |
1641 } | |
1642 else{ | |
1643 print STDERR "pac failed\n"; # if ($debug); | |
1644 $significance = -0.01; | |
1645 } | |
1646 } | |
1647 else{ | |
1648 print STDERR "blast2faa for $query_file / $db_file failed\n"; # if ($debug); | |
1649 $significance = 0.0; | |
1650 } | |
1651 | |
1652 unlink "$seq_id2.fas", "$seq_id2.faa", "$seq_id2.bs", "$seq_id1.faq"; | |
1653 unlink "formatdb.log", "$seq_id2.fas.psq", "$seq_id2.fas.pin", "$seq_id2.fas.phr"; | |
1654 | |
1655 return $significance; | |
1656 } | |
1657 | |
1658 sub overlap_test{ | |
1659 my @Fld = @_; | |
1660 | |
1661 # Filter out fragmentary hits by: | |
1662 # Ignore hit if aggregate matching area covers less than $seq_overlap_cutoff of sequence. | |
1663 # Ignore hit if local matching segments cover less than $segment_coverage_cutoff of sequence. | |
1664 # | |
1665 # $Fld[3] and $Fld[4] are query and subject lengths. | |
1666 # $Fld[5] and $Fld[6] are lengths of the aggregate matching region on query and subject. (From start of first matching segment to end of last matching segment). | |
1667 # $Fld[7] and $Fld[8] are local matching length on query and subject (Sum of all segments length's on query). | |
1668 | |
1669 $retval = 1; | |
1670 # if ($Fld[3] >= $Fld[4]) { | |
1671 if ($Fld[5] < ($seq_overlap_cutoff * $Fld[3])) {$retval = 0}; | |
1672 if ($Fld[7] < ($segment_coverage_cutoff * $Fld[3])) {$retval = 0}; | |
1673 # } | |
1674 | |
1675 # if ($Fld[4] >= $Fld[3]) { | |
1676 if ($Fld[6] < ($seq_overlap_cutoff * $Fld[4])) {$retval = 0}; | |
1677 if ($Fld[8] < ($segment_coverage_cutoff * $Fld[4])) {$retval = 0}; | |
1678 # } | |
1679 | |
1680 # print "$Fld[3] $Fld[5] $Fld[7]; $Fld[4] $Fld[6] $Fld[8]; retval=$retval\n"; | |
1681 | |
1682 return $retval; | |
1683 } | |
1684 | |
1685 sub do_blast | |
1686 { | |
1687 my @parameter=@_; | |
1688 my $resultfile=@parameter[@parameter-1]; | |
1689 my $go_to_blast=1; | |
1690 my $resultfilesize; | |
1691 if (-e $resultfile) | |
1692 { | |
1693 $resultfilesize= -s "$resultfile"; | |
1694 if ($resultfilesize >10240) | |
1695 { | |
1696 $go_to_blast=0; | |
1697 } | |
1698 } | |
1699 if ($go_to_blast) | |
1700 { | |
1701 if ($blast_two_passes) | |
1702 { | |
1703 do_blast_2pass(@parameter); | |
1704 } else | |
1705 { | |
1706 do_blast_1pass(@parameter); | |
1707 } | |
1708 } | |
1709 } | |
1710 | |
1711 sub do_blast_1pass { | |
1712 my @Fld = @_; | |
1713 | |
1714 # $Fld [0] is query | |
1715 # $Fld [1] is database | |
1716 # $Fld [2] is query size | |
1717 # $Fld [3] is database size | |
1718 # $Fld [4] is output name | |
1719 | |
1720 # Use soft masking (low complexity masking by SEG in search phase, not in alignment phase). | |
1721 system ("$blastall -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff > $Fld[4]"); | |
1722 } | |
1723 | |
1724 sub do_blast_2pass { | |
1725 | |
1726 my @Fld = @_; | |
1727 | |
1728 # $Fld [0] is query | |
1729 # $Fld [1] is database | |
1730 # $Fld [2] is query size | |
1731 # $Fld [3] is database size | |
1732 # $Fld [4] is output name | |
1733 | |
1734 # assume the script has already formatted the database | |
1735 # we will now do 2-pass approach | |
1736 | |
1737 # load sequences | |
1738 | |
1739 %sequencesA = (); | |
1740 %sequencesB = (); | |
1741 | |
1742 open (FHA, $Fld [0]); | |
1743 while (<FHA>) { | |
1744 | |
1745 $aLine = $_; | |
1746 chomp ($aLine); | |
1747 | |
1748 $seq = ""; | |
1749 | |
1750 if ($aLine =~ />/) { | |
1751 @words = split (/\s/, $aLine); | |
1752 $seqID = $words[0]; | |
1753 $sequencesA {$seqID} = ""; | |
1754 } | |
1755 else { | |
1756 $sequencesA {$seqID} = $sequencesA {$seqID}.$aLine; | |
1757 } | |
1758 } | |
1759 close (FHA); | |
1760 | |
1761 open (FHB, $Fld [1]); | |
1762 while (<FHB>) { | |
1763 $aLine = $_; | |
1764 chomp ($aLine); | |
1765 | |
1766 $seq = ""; | |
1767 | |
1768 if ($aLine =~ />/) { | |
1769 @words = split (/\s/, $aLine); | |
1770 $seqID = $words[0]; | |
1771 $sequencesB {$seqID} = ""; | |
1772 } | |
1773 else { | |
1774 $sequencesB {$seqID} = $sequencesB {$seqID}.$aLine; | |
1775 } | |
1776 } | |
1777 close (FHB); | |
1778 | |
1779 # Do first pass with compositional adjustment on and soft masking. | |
1780 # This efficiently removes low complexity matches but truncates alignments, | |
1781 # making a second pass necessary. | |
1782 print STDERR "\nStarting first BLAST pass for $Fld[0] - $Fld[1] on "; | |
1783 system("date"); | |
1784 open FHR, "$blastall -C3 -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff|"; | |
1785 | |
1786 %theHits = (); | |
1787 while (<FHR>) { | |
1788 $aLine = $_; | |
1789 chomp ($aLine); | |
1790 @words = split (/\s+/, $aLine); | |
1791 | |
1792 if (exists ($theHits {$words [0]})) { | |
1793 $theHits {$words [0]} = $theHits {$words [0]}." ".$words [1]; | |
1794 } | |
1795 else { | |
1796 $theHits {$words [0]} = $words [1]; | |
1797 } | |
1798 | |
1799 } | |
1800 close (FHR); | |
1801 | |
1802 $tmpdir = "."; # May be slightly (5%) faster using the RAM disk "/dev/shm". | |
1803 $tmpi = "$tmpdir/tmpi"; | |
1804 $tmpd = "$tmpdir/tmpd"; | |
1805 | |
1806 # Do second pass with compositional adjustment off to get full-length alignments. | |
1807 print STDERR "\nStarting second BLAST pass for $Fld[0] - $Fld[1] on "; | |
1808 system("date"); | |
1809 unlink "$Fld[4]"; | |
1810 foreach $aQuery (keys % theHits) { | |
1811 | |
1812 # Create single-query file | |
1813 open (FHT, ">$tmpi"); | |
1814 print FHT ">$aQuery\n".$sequencesA {">$aQuery"}."\n"; | |
1815 close (FHT); | |
1816 | |
1817 # Create mini-database of hit sequences | |
1818 open (FHT, ">$tmpd"); | |
1819 foreach $aHit (split (/\s/, $theHits {$aQuery})) { | |
1820 print FHT ">$aHit\n".$sequencesB {">$aHit"}."\n"; | |
1821 } | |
1822 close (FHT); | |
1823 | |
1824 # Run Blast and add to output | |
1825 system ("$formatdb -i $tmpd"); | |
1826 system ("$blastall -C0 -FF -i $tmpi -d $tmpd -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff >> $Fld[4]"); | |
1827 } | |
1828 | |
1829 unlink "$tmpi", "$tmpd", "formatdb.log", "$tmpd.phr", "$tmpd.pin", "$tmpd.psq"; | |
1830 } | |
1831 | |
1832 | |
1833 # Date Modification | |
1834 # -------- --------------------------------------------------- | |
1835 # | |
1836 # 2006-04-02 [1.36] - Changed score cutoff 50 to 0 for blast2faa.pl. | |
1837 # Reason: after a cluster merger a score can be less than the cutoff (50) | |
1838 # which will remove the sequence in blast2faa.pl. The bootstrapping will | |
1839 # then fail. | |
1840 # - Fixed bug with index variable in bootstrap routine. | |
1841 # | |
1842 # 2006-06-01 [2.0] - Fixed bug in blast_parser.pl: fields 7 and 8 were swapped, | |
1843 # it was supposed to print match_area before HSP_length. | |
1844 # - Fixed bug in blastall call: -v param was wrong for the A-B | |
1845 # and B-A comparisons. | |
1846 # - | |
1847 # - Changed "cluster" to "group" consistently in output. | |
1848 # - Changed "main ortholog" to "seed ortholog" in output. | |
1849 # - Replace U -> X before running seqstat.jar, otherwise it crashes. | |
1850 # 2006-08-04 [2.0] - In bootstrap subroutine, replace U with X, otherwise seqstat | |
1851 # will crash as this is not in the matrix (should fix this in seqstat) | |
1852 # 2006-08-04 [2.1] - Changed to writing default output to file. | |
1853 # - Added options to run blast only. | |
1854 # - Fixed some file closing bugs. | |
1855 # 2007-12-14 [3.0] - Sped up sorting routines (by Isabella). | |
1856 # - New XML-based blast_parser. | |
1857 # - New seqstat.jar to handle u and U. | |
1858 # - Modified overlap criterion for rejecting matches. Now it agrees with the paper. | |
1859 # 2009-04-01 [4.0] - Further modification of overlap criteria (require that they are met for both query and subject). | |
1860 # - Changed bit score cutoff to 40, which is suitable for compositionally adjusted BLAST. | |
1861 # - Added in 2-pass algorithm. | |
1862 # 2009-06-11 [4.0] - Moved blasting out to subroutine. | |
1863 # - Changed blasting in bootstrap subroutine to use unconditional score matrix adjustment and SEG hard masking, | |
1864 # to be the same as first step of 2-pass blast. | |
1865 # 2009-06-17 [4.0] - Compensated a Blast "bug" that sometimes gives a self-match lower score than a non-identical match. | |
1866 # This can happen with score matrix adjustment and can lead to missed orthologs. | |
1867 # 2009-08-18 [4.0] - Consolidated Blast filtering parameters for 2-pass (-C3 -F\"m S\"; -C0 -FF) | |
1868 # 2009-10-09 [4.1] - Fixed bug that caused failure if Fasta header lines had more than one word. |