comparison PGAP-1.2.1/multiparanoid.pl @ 0:83e62a1aeeeb draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 13:51:52 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:83e62a1aeeeb
1 #!/usr/bin/perl
2 #
3 # Copyright (C) Andrey Alexeyenko and Erik Sonnhammer, 2006
4 # For a license to use this program, contact Erik.Sonnhammer@cbc.su.se
5 ###############################################################################
6 # Andrey Alexeyenko, Stockholm Bioinformatics Center, Albanova, Stockholm 2006
7 # mailto:avalex99@mail.ru
8 ###############################################################################
9 # The program is a postprocessor of ortholog tables produced by InParanoid. It combines pairwise InParanoid outputs (say, A-B, B-C, A-C) and makes multi-species clusters of orthologs ABC.
10
11 use strict vars;
12 use DBI;
13 use Cwd;
14 require 5.002;
15
16 our($inputdir, $makeunique, $indirect, $Pair,
17 @fields, @cutoffs, $output, @fields_out, $out, $algo, $debug, $SQLprefix,
18 %place, $nplaces, $source_by_cluster, $source_by_gene, $source_by_taken, $seeds, $use_bootstrap, $maxINPscore, $toMainOrthologOfSelf);
19 my($npairs, $i, $j, $ii, $ee, $gg, $nn, $oo, $s2, $s1, $nspec, $as, $newmain, @distances, @species, $sp1, $sp2, $clusno, $specletters, $a_pair, $a_cluster, $pms, $input, $output, $inputfile, $new, $thepair);
20 ###############################################################################
21
22 #FIRST OF ALL, set the variables $inputdir and $output below. MultiParanoid may not work without it!!!
23
24 #Parameters to submit (may be abbreviated to the first letter):
25 # OBLIGATORY:
26 # -species: a list of several species connected with "+"; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called:
27 # sqltable.dog-cat, sqltable.dog-mouse etc.
28 #To use other file names, one should change the variable $SQLprefix below.
29
30 # OPTIONAL:
31 # -cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: " -cutoff 0.5 " or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff).
32 # -algo: how to define between-paralogs distances while clustering.
33 # Options: onlymains (default and only reasonable)
34 # toallsorted
35 # -debug: 0 or 1; default: 0
36 # -input: deprecated, now defined by -species parameter, and the input files shall have conventional names
37 # -output: deprecated, special message will tell the name of the output file
38 # -unique: 0 or 1; set to 1, if the duplicates (genes clustered twice) should be removed; default: 0
39
40 #The MultiParanoid output header:
41 ### clusterID species gene is_seed_ortholog confidence_score species_in_cluster tree_conflict###
42
43 #is (hopefully) self-explanatory. Though:
44
45 # "is_seed_ortholog" means the protein was a seed ortholog in at least one InParanoid cluster.
46 # "confidence_score" is an average InParanoid score across the input clusters
47 # "tree_conflict" indicates that, from the point of view of different species, the number of inparalogs varied in at least one other species ("diff.numbers") or the numbers were the same, but the IDs differed ("diff.names").
48
49 #NOTE: s.c. 'distances' between cluster members are, in fact, similarities. Variables are called 'distances' for historical reasons.
50 # 'main orthologs' (defined by InParanoid) are better to be called 'seed orthologs'.
51
52 # Settings:
53
54 $algo = 'onlymains';
55 $debug = 0;
56 $use_bootstrap = 0;
57 $maxINPscore = 1;
58 $toMainOrthologOfSelf = 0;
59 $indirect = 1; #this option allows using the currently default InParanoid output, were seed orthologs are recognized by the feature "IPscore = 1.00". Otherwise ($indirect = 0) a special format should be used (needs a change in the InParanoid script and is now deprecated).
60
61 if ($indirect) {
62 $SQLprefix = 'sqltable.';
63 $inputdir = './'; #change it respectively
64 $place{'clusterID'} = 0; #these should correspond to the current InParanoid output format
65 $place{'BLASTscore'} = 1;
66 $place{'species'} = 2;
67 $place{'IPscore'} = 3;
68 $place{'gene'} = 4;
69 $place{'main'} = 5;
70 $place{'pair'} = 6;
71 }
72 else {
73 $SQLprefix = 'ava_sql';
74 $inputdir = 'disk/dir/multiparanoid/'; #change it respectively
75 ###The older, ava_sql, version:
76 $place{'clusterID'} = 0;
77 $place{'BLASTscore'} = 1;
78 $place{'pair'} = 2;
79 $place{'species'} = 3;
80 $place{'gene'} = 4;
81 $place{'IPscore'} = 5;
82 $place{'main'} = 6;
83 #$indirect = 1 if !defined($place{'main'});
84 }
85
86 $pms = parseParameters(join(' ', @ARGV));
87 if (!$pms->{'s'}) {die "No '+'-delimited input species list (pointing to INPARANOID output) specified. Check parameter '-species'... ";}
88 @species = split(/\+/, $pms->{'s'}); #list of species (genomes) as they appear in the file names. Default: no default
89 print "Species list: $pms->{'s'}\n is treated as species ".join(', ', @species)."\n";
90 #print "Genomes @species are being processed...\n";
91 $makeunique = 0;#also, mind the variables $inputdir and $output below
92 $makeunique = $pms->{'u'};
93
94 $specletters = fstLetters(@species);
95 # $output = 'diskZ/MultiparanoidOuput/'.$specletters.'.MP.sql' if !$output; #change it respectively
96
97 $output = './MP.Cluster' ; #change it respectively
98
99
100 @cutoffs = split(/\+/, $pms->{'c'}); #Can restrict included in-paralogs by their distance to the main ones (see example below). Default: 0 (no cutooff)
101 #@cutoffs = (0, 0.25, 0.5, 0.75, 1);
102 @cutoffs = (0) if !exists($cutoffs[0]);
103
104 $algo = $pms->{'a'} if !$algo; #which algorithm to use. Default: onlymains
105 $debug = $pms->{'d'}; #degree of debug messages' output. Default: no
106 $output = $pms->{'o'} if !$output;
107 $makeunique = $pms->{'u'} if !$makeunique;
108
109 $nplaces = scalar(keys(%place));
110 @fields_out = (clusterID, species, gene, is_seed_ortholog, confidence_score, species_in_cluster, tree_conflict);
111 push @fields_out, 'cutoff' if scalar(@cutoffs) > 1;
112 $nspec = scalar(@species);
113 open OUT, ">$output";
114
115 print "Genome pairs found and used: \n" if $debug > 0;
116 for ($s1 = 0; $s1 < $nspec; $s1++) {
117 for ($s2 = 0; $s2 < $nspec; $s2++) {
118 next if ($species[$s1] eq $species[$s2]);
119 $Pair = $species[$s1].'-'.$species[$s2];
120 $inputfile = $inputdir.$SQLprefix.$Pair;
121 if (!system("ls $inputfile")) {
122 loadData($inputfile);
123 $npairs++;
124 }}}
125 die "Not enough (or too many) species pair files for ".join(", ", @species)."\n" if $npairs != ($nspec * ($nspec - 1)) / 2;
126 $clusno = 1;
127
128 for $sp1(@species) {
129 for $sp2(@species) {
130 $thepair = "$sp1-$sp2";
131
132 next if (!$source_by_cluster->{$thepair} or ($sp1 eq $sp2));
133 print "Analyzing $sp1 vs. $sp2: " if $debug;
134 print scalar(keys(%{$source_by_cluster->{$thepair}}))." clusters for this pair...\n" if $debug;
135 for $a_cluster(keys(%{$source_by_cluster->{$thepair}})) {
136 undef $new; undef $seeds;
137 $seeds = findOrthos($a_cluster, $thepair);
138
139 #find recordset of orthologs of the both species (for cluster a) from the pairwise subset i-j
140 next if !defined($seeds->[0]);
141 LINE1:for $gg(@{$seeds}) {
142 next if (!$gg->[$place{'main'}] or $gg->[$nplaces]);
143 $gg->[$nplaces] = 1;
144 for $a_pair(keys(%{$source_by_cluster})) {
145 if (($a_pair =~ $gg->[$place{'species'}]) and ($gg->[$place{'pair'}] ne $a_pair)) {
146 $new = findNewOrtho($gg->[$place{'gene'}], $a_pair);
147 undef $newmain;
148 if (defined($new)) {
149 for $nn(@{$new}) {
150 push(@{$seeds}, $nn) if isNew($seeds, $nn);
151 $newmain = 1 if $nn->[$place{'main'}];
152 }
153 redo LINE1 if $newmain;
154 }}}}
155
156 makeClusters($seeds, \@cutoffs, \@species, $clusno++, treeConflict($seeds), $specletters);
157 }}}
158 if ($clusno > 20) {
159 print OUT join("\t", @fields_out)."\n";
160 for $oo(@{$out->{'byRecord'}}) {
161 print OUT join("\t", @{$oo})."\n" if defined($oo);
162 }
163 print "Making ortholog clusters of ".join(', ', @species)." done. The result is in file $output\n";
164 }
165 else {
166 die "No output has been produced..."
167 }
168 ###########################################################################
169
170 sub parseParameters {
171 my($parameters) = @_;
172 my($key, $value, $pars, %pars);
173
174 #print "$parameters\n";
175 $_ = $parameters;
176 while (m/\-(\w+)\s+([a-zA-Z0-9._=+]+)/g) {
177 next if ((lc($2) eq 'f') or (lc($2) eq 'false') or (lc($2) eq 'no') or (lc($2) eq '0'));
178 $pars{substr($1, 0, 1)} = $2;
179 }
180 if (scalar(keys(%pars)) < 1 or !$pars{'s'} or m/\-h/) {
181 print "\nNOT ENOUGH PARAMETERS!\nOBLIGATORY:
182 -species: a list of several species connected with '+'; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called:
183 sqltable.dog-cat, sqltable.dog-mouse etc.
184 To use other file names, one should change the variable \$SQLprefix inside the scrtipt.
185 \nOPTIONAL:
186 \n-cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: ' -cutoff 0.5 ' or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff).
187 \n-algo: how to define between-paralogs distances while clustering.
188 Options: onlymains (default and only reasonable)
189 toallsorted
190 \n-debug: 0 or 1; default: 0
191 \n-input: deprecated, now defined by -species parameter and the variables \$inputdir and \$SQLprefix, and the input files shall have conventional names
192 \n-output: deprecated, special message will tell the name of the output file
193 \n-unique: 0 or 1; set to '0' if the duplicates (genes clustered twice) should BE REMOVED, to '1' otherwise; default: 0\n\n"; exit;
194 }
195 while (($key, $value) = each(%pars)) {
196 print "Parameter $key = $value;\n";
197 }
198
199 return(\%pars);
200 }
201
202 sub treeConflict { #tells if there is a discrepancy, for species B, between 2-species trees A-B and B-C
203 my($seeds) = @_;
204 my($i, $ii, $jj, $kk, $sets);
205
206 for $ii(@{$seeds}) {
207 push @{$sets->{$ii->[$place{'species'}]}->{$ii->[$place{'pair'}]}}, $ii->[$place{'gene'}];
208 }
209
210 for $ii(keys(%{$sets})) {
211 for $jj(keys(%{$sets->{$ii}})) {
212 for $kk(keys(%{$sets->{$ii}})) {
213 next if $jj eq $kk;
214 if (scalar(@{$sets->{$ii}->{$jj}}) != scalar(@{$sets->{$ii}->{$kk}})) {return('diff. numbers');}
215 sort {$a <=> $b} @{$jj};
216 sort {$a <=> $b} @{$kk};
217 for ($i = 0; $i < scalar(@{$sets->{$ii}->{$jj}}); $i++) {
218 if ($sets->{$ii}->{$jj}->[$i] ne $sets->{$ii}->{$kk}->[$i]) {
219 return('diff. names');
220 }}}}}
221 return(undef);
222 }
223
224 sub makeClusters {
225 my($seeds, $cutoffs, $species, $clusno, $conflict, $param) = @_;
226 my($clo, $le, $gg, @included, $distances, $specset, $cco);
227
228 array2StdOut($seeds, "Primers") if $debug;
229 if ($algo eq 'onlymains') {
230 $distances = pairWiseDist2Main($seeds);
231 }
232 elsif ($algo eq 'toallsorted') {
233 $distances = pairWiseDist2All($seeds);
234 }
235 else {die "Algorithm is not set...";}
236 array2StdOut(setUnique($seeds), "Unique") if $debug;
237 for $cco(@{$cutoffs}) {
238 undef @included;
239
240 @included = firstMains($seeds);
241 @included = @{setUnique(\@included)} if defined(@included);
242 array2StdOut(\@included, 'Mains') if $debug;
243 goto LINE3 if $cco == 1;
244
245 if ($algo eq 'onlymains') {
246 push @included, @{getOthers(
247 setUnique($seeds),
248 $distances,
249 \@included,
250 $cco)};
251 }
252 elsif ($algo eq 'toallsorted') {
253 @included = @{getClosests(
254 setUnique($seeds),
255 $distances,
256 \@included,
257 $cco)};}
258 LINE3:array2StdOut(\@included, "Cluster with cut-off $cco") if $debug;
259 LINE4: $specset = join('', fstLetters(sort(listSpecies(\@included, $species))));
260 $conflict = (($conflict) ? $conflict : 'No');
261 for $gg(@included) {
262 next if $makeunique and alreadyClustered($gg, $cco);
263 $ii = scalar(@{$out->{'byRecord'}});
264 $out->{'byName'}->{$gg->[$place{'gene'}]}->{$cco} = $ii;
265
266 for $ee($clusno,
267 $gg->[$place{'species'}],
268 $gg->[$place{'gene'}],
269 $gg->[$place{'main'}],
270 $gg->[$place{'IPscore'}],
271 $specset,
272 $conflict) {push @{$out->{'byRecord'}->[$ii]}, $ee;}
273 push @{$out->{'byRecord'}->[$ii]}, $cco if scalar(@cutoffs) > 1;
274
275 }
276 #$out .= join("\t", (100 * $cco, $clusno, $gg->[$place{'species'}], $gg->[$place{'gene'}], $gg->[$place{'main'}], $gg->[$place{'IPscore'}], $specset, $conflict, $param, $algo))."\n" if !$makeunique or !alreadyClustered($gg);
277 }
278 return;
279 }
280
281
282 sub fstLetters {
283 my(@list) = @_;
284 my($letters, $ii, $firstLetter, $lastLetter);
285
286 return(join('-', sort {$a cmp $b} @list));
287 $firstLetter = 0;
288 for $ii(@list) {
289 $lastLetter = length($ii) - $firstLetter;
290 $letters .= "_".substr($ii, $firstLetter, $lastLetter);
291 }
292
293 return $letters;
294 }
295
296 sub listSpecies {
297 my($ar, $species) = @_;
298 my($already, %already, $ii, @flags);
299 # checks for species present in the current cluster
300
301 for $ii(@{$ar}) {
302 push(@flags, $ii->[$place{'species'}]) if !$already{$ii->[$place{'species'}]};
303 $already{$ii->[$place{'species'}]} = 1;
304 }
305 return(@flags);
306 }
307
308 sub loadData {
309 my($inputfile) = @_;
310 my($line, @line, $i);
311
312 open(IN, $inputfile);
313 print "Loading $inputfile...\n";
314 while (<IN>) {
315 @line = split /\s+/;
316 #$place{'BLASTscore'}, $place{'species'}, $place{'IPscore'}, $place{'main'}
317 for $i(0..scalar(keys(%place))) {
318 if ($indirect) {
319 if ($i == $place{'main'}) {
320 $source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0;
321 $source_by_gene->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0;
322 next;
323 }
324 if ($i == $place{'pair'}) {
325 $source_by_cluster->{$Pair}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $Pair;
326 $source_by_gene->{$Pair}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $Pair;
327 next;
328 }}
329 $source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $line[$i];
330 $source_by_gene->{$indirect ? $Pair :$line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $line[$i];
331 }}
332 return;
333 }
334
335 sub findOrthos { #initially picks up orthologs in the first pair of genomes
336 my($a, $thepair) = @_;
337 my($set, $sm, $i, $newel);
338
339 return(undef) if $source_by_taken->{$thepair}->{$a};
340 for (keys(%{$source_by_cluster->{$thepair}->{$a}})) {
341 $newel = $#{$seeds} + 1;
342 for $i(0..($nplaces - 1)) {
343 $seeds->[$newel]->[$i] = $source_by_cluster->{$thepair}->{$a}->{$_}->[$i];
344 }
345 $source_by_taken->{$thepair}->{$a} = 1;
346 }
347 return $seeds;
348 }
349
350 sub findNewOrtho { #adds orthologs from the remaining genome pairs
351 my($g, $thepair) = @_;
352 my($new, $ii, $j, $jj, $sm, $newel);
353
354 for (keys(%{$source_by_gene->{$thepair}->{$g}})) {
355 next if $source_by_taken->{$thepair}->{$_};
356 $newel = $#{$new} + 1;
357 for $jj(@{$source_by_gene->{$thepair}->{$g}->{$_}}) {
358 $new->[$newel]->[$j] = $jj;
359 $j++;
360 }
361 }
362 if (scalar(@{$new}) == 1) {
363 $new = findOrthos($new->[0]->[$place{'clusterID'}], $new->[0]->[$place{'pair'}]);
364 if (scalar(@{$new}) > 1) {
365 return($new);}
366 else {print "Unpaired seed ortholog...$new->[0]->[$place{'gene'}]\n";}
367 }
368 elsif (scalar(@{$new}) > 1) {
369 print "Multiple seed orthologs...\nPair '$thepair'\tGene '$g'";
370 }
371 return(undef);
372 }
373
374 sub isNew {
375 my($seeds, $new) = @_;
376 my($ii);
377
378 for $ii(@{$seeds}) {
379 if (($ii->[$place{'gene'}] eq $new->[$place{'gene'}]) and ($ii->[$place{'pair'}] eq $new->[$place{'pair'}])) {
380 return(undef);
381 }
382 }
383 return(1);
384 }
385
386 sub pairWiseDist2All { #between-gene distances for algortihm 'ToAllSorted'; deprecated
387 my($seeds) = @_;
388 my($i, $j, $ii, $jj, $new, $old, $same, $le, $distances, $ke, $va);
389
390 $le = scalar(@{$seeds});
391
392 for ($i = 0; $i < $le; $i++) {
393 $ii = $seeds->[$i]->[$place{'gene'}];
394 #next if $distances->{$ii}->{$jj};
395 for ($j = 0; $j < $i; $j++) {
396 $jj = $seeds->[$j]->[$place{'gene'}];
397 undef $same; undef $old; undef $new;
398 $same = 1 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]);
399 if ($same) {
400 $old = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]};}
401 else {$old = $distances->{$ii}->{$jj};}
402
403 next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]);
404 #THEY DESCRIBE THE SAME PAIR OF GENOMES
405 if ($same) {
406 #THEY ARE FROM THE SAME SPECIES
407 #next if $distances-> {$ii}->{$jj}->{$seeds->[$i]->[$place{'species'}]};
408 if (!$seeds->[$i]->[$place{'main'}] and !$seeds->[$j]->[$place{'main'}]) { #BOTH ARE IN-PARALOGS
409 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}] * $seeds->[$j]->[$place{'IPscore'}];
410 }
411 elsif ($seeds->[$i]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG
412 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$j]->[$place{'IPscore'}];
413 }
414 elsif ($seeds->[$j]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG
415 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}];
416 }
417 else { #BOTH ARE SEED ORTHOLOGS
418 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $maxINPscore;
419 }
420 $new = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]};
421 }
422 else {
423 #THEY ARE FROM THE DIFFERENT SPECIES
424 if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]) #BOTH ARE SEED ORTHOLOGS
425 {
426 $distances->{$ii}->{$jj} = inpDist($seeds, $i, $j);
427 $distances->{$jj}->{$ii} = inpDist($seeds, $j, $i);
428 }
429 elsif ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) #STILL, ONE OF THEM IS IN-PARALOG (NO MATTER WHICH)
430 {
431 $distances->{$ii}->{$jj} =
432 $distances->{$jj}->{$ii} =
433 inpDist ($seeds, $i, $j) * inpDist ($seeds, $j, $i) ;
434 }
435 else
436 #BOTH ARE IN-PARALOGS
437 {
438 $distances->{$ii}->{$jj} =
439 mainDist($seeds, $i, $j) *
440 inpDist ($seeds, $i, $j) *
441 inpDist ($seeds, $j, $i) ;
442 $distances->{$jj}->{$ii} =
443 mainDist($seeds, $j, $i) *
444 inpDist ($seeds, $i, $j) *
445 inpDist ($seeds, $j, $i) ;
446 }
447 }
448 $new = $distances->{$ii}->{$jj};
449 if (!$same) {print "Distance $ii-$jj is $new\n" if $debug; print "Distance $jj-$ii is $new\n" if $debug;}
450 else {
451 if ($debug) {
452 print "Distance $ii-$jj is\n";
453 while (($ke, $va) = each(%{$distances->{$ii}->{$jj}})) {
454 print "\tfor $ke: $va\n";
455 }
456 }
457 }
458 print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new));
459 }
460 }
461 #for ($k = 0; $k < $le; $k++) {$distances->[$k]->[$k] = 1;}
462 return($distances);
463 }
464
465 sub pairWiseDist2Main ($) {#between-gene distances for algortihm 'OnlyMains'
466 my($seeds) = @_;
467 my($le, $i, $j, $ii, $jj, $new, $old, $distances);
468
469 $le = scalar(@{$seeds});
470 #$le2 = scalar(@{$species});
471 for ($i = 0; $i < $le; $i++) {
472 $ii = $seeds->[$i]->[$place{'gene'}];
473 for ($j = 0; $j < $i; $j++) {
474 $jj = $seeds->[$j]->[$place{'gene'}];
475 undef $old; undef $new;
476 $old = $distances->{$ii}->{$jj};
477 next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]);
478 next if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]); #SEED ORTHOLOGS WILL BE CLUSTERED ANYWAY
479 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]) {
480 #if (!$use_bootstrap) {$distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = $maxINPscore ;next;}
481 next if !$toMainOrthologOfSelf;
482 $distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} =
483 $seeds->[$j]->[$place{'IPscore'}] if $seeds->[$i]->[$place{'main'}];
484 $distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} =
485 $seeds->[$i]->[$place{'IPscore'}] if $seeds->[$j]->[$place{'main'}];
486 } #THEY ARE FROM THE SAME SPECIES AND THE DISTANCE IS NOT USED
487 else { #THEY ARE FROM DIFFERENT SPECIES
488 if ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) {
489 #ONE OF THEM IS SEED ORTHOLOG (NO MATTER WHICH)
490 $distances->{$ii}->{$jj} =
491 $distances->{$jj}->{$ii} =
492 inpDist ($seeds, $i, $j) *
493 inpDist ($seeds, $j, $i) ;
494 }
495 else {
496 #BOTH ARE IN-PARALOGS
497 $distances->{$ii}->{$jj} =
498 mainDist($seeds, $i, $j) *
499 inpDist ($seeds, $i, $j) *
500 inpDist ($seeds, $j, $i) ;
501 $distances->{$jj}->{$ii} =
502 mainDist($seeds, $j, $i) *
503 inpDist ($seeds, $i, $j) *
504 inpDist ($seeds, $j, $i) ;
505 }
506 }
507 $new = $distances->{$ii}->{$jj};
508 print "Distance $ii-$jj is $new\n" if $debug;
509 print "Distance $jj-$ii is $new\n" if $debug;
510 print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new));
511 }
512 }
513 return($distances);
514 }
515
516 sub mainDist ($$$) { #finds distances between two main (seed) orthologs
517 my($seeds, $from, $to) = @_;
518 my($ii, $pair1, $pair2, $sum, $nu, $nc);
519
520 return($maxINPscore) if !$use_bootstrap; #if we think bootstrap values are not relevant in calculating the distance, then every Main-Main distance should bemaxINPscore = 1 (and this is the default)
521 $pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}];
522 $pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}];
523 $sum = $nc = $nu = 0;
524
525 for $ii(@{$seeds}) {
526 if (
527 ($ii->[$place{'main'}]) and
528 (($ii->[$place{'pair'}] eq $pair1) or ($ii->[$place{'pair'}] eq $pair2)) and
529 ($ii->[$place{'species'}]) eq $seeds->[$from]->[$place{'species'}]) {
530 $nc++;
531 $sum += $ii->[$place{'IPscore'}];
532 }
533 }
534
535 if (!$nc) {
536 print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] (pair $pair1 or $pair2) not found\n";
537 return(undef);
538 }
539 else
540 {
541 return($sum / $nc);
542 }
543 }
544
545 sub inpDist ($$$) { #finds a distance from main (seed) ortholog to in-paralog
546 my($seeds, $from, $to) = @_;
547 my($le, $ii, $pair1, $pair2, $sum, $nu, $nc);
548
549 return($maxINPscore) if ($seeds->[$from]->[$place{'main'}] and !$use_bootstrap);
550 $le = scalar(@{$seeds});
551 $pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}];
552 $pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}];
553 $sum = $nc = $nu = 0;
554 for $ii(@{$seeds}) {
555 if (
556 ($ii->[$place{'gene'}] eq $seeds->[$from]->[$place{'gene'}]) and
557 (($ii->[$place{'pair'}] eq $pair1) or
558 ($ii->[$place{'pair'}] eq $pair2))) {
559 return($ii->[$place{'IPscore'}]);
560 }
561 }
562 print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] not found\n";
563 return undef;
564 }
565
566 sub sortBy ($$) {
567 my($num, $ar) = @_;
568 my($i, $max, $maxnum, @sorted);
569
570 #array2StdOut($ar, 'Before');
571
572 LINE001: while (scalar(@{$ar}) > 0) {
573 if (scalar(@{$ar}) == 1 and !$ar->[$i]->[$num]) {
574 last;
575 }
576 $max = 0;
577 undef $maxnum;
578 for ($i = 0; $i < scalar(@{$ar}); $i++) {
579 last if !$ar->[$i]->[$num];
580 if ($ar->[$i]->[$num] > $max) {
581 $max = $ar->[$i]->[$num];
582 $maxnum = $i;
583 }
584 }
585 if (defined($maxnum)) {
586 push @sorted, splice(@{$ar}, $maxnum, 1);
587 next LINE001;
588 }
589 }
590 return(\@sorted);
591 }
592
593 sub setUnique ($) { #removes duplicated entries from the (pre-)cluster
594 my($seeds) = @_;
595 my($i, $j, $unique, $sds);
596
597 $sds->[$place{'clusterID'}] = $seeds->[$place{'clusterID'}];
598 for ($i = 1; $i < scalar(@{$seeds}); $i++) {
599 $unique = 1;
600 for ($j = 0; $j < scalar(@{$sds}); $j++) {
601 if (($seeds->[$i]->[$place{'species'}] eq $sds->[$j]->[$place{'species'}]) and ($seeds->[$i]->[$place{'gene'}] eq $sds->[$j]->[$place{'gene'}]))
602 {$unique = 0;
603 last;}
604 }
605 if ($unique) {push(@{$sds}, $seeds->[$i]);}
606 }
607 return($sds);
608 }
609
610 sub isAlready ($$) {
611 my($as, $included) = @_;
612 my($ai);
613
614 for $ai(@{$included}) {
615 return(1) if ($ai->[4] eq $as);
616 }
617 return(undef);
618 }
619
620 sub alreadyClustered { #helps to avoid clustering same gene twice. Of two alternatives, the one is chosen where the gene is a main (seed) ortholog or was included first.
621 my($g, $cco) = @_;
622 my($ai);
623
624 return(undef) if !defined($out->{'byName'}->{$g->[$place{'gene'}]}->{$cco});
625
626 if ($out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}]->[3]) {
627 return 1;
628 }
629 else {
630 undef $out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}];
631 return(undef);
632 }}
633
634 sub firstMains { #seed orthologs are included unconditionally
635 my($seeds) = @_;
636 my($i, $le, @included);
637
638 $le = scalar(@{$seeds});
639 for ($i = 0; $i < $le; $i++) {
640 # and ($seeds->[$i]->[$place{'IPscore'}] == 1)
641 if ($seeds->[$i]->[$place{'main'}]) {
642 push @included, $seeds->[$i];
643 }
644 }
645 return(@included);
646 }
647
648 sub getOthers ($$$$) { #retains only genes with distance (well, it is in fact a SIMILARITY) above the cut-off $cco. If it is not set, or equals to 0, the procedure just returns IPscore as a reference - to be saved in the output file.
649 my($seeds, $distances, $included, $cco) = @_;
650 my($i, $le, $les2, $nc, $ii, $jj, $dist, $ad, $nai, @others);
651
652 $le = scalar(@{$seeds});
653 for $ii(@{$seeds}) {
654 next if isAlready($ii->[$place{'gene'}], $included);
655 undef $dist; undef $nc;
656 for $jj(@{$included}) {
657 next if (($jj->[$place{'species'}] eq $ii->[$place{'species'}]) and !$toMainOrthologOfSelf);
658 if (!$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}) {
659 print "The distance between $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if $debug;
660 next;
661 }
662 if ($jj->[$place{'main'}]) {
663 print "( $ii->[$place{'gene'}] - $jj->[$place{'gene'}] ) + " if $debug;
664 $dist += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]};
665 $nc++;
666 }
667 }
668 print " / $nc = ".$dist/$nc."\n" if $debug;
669 return(undef) if !$nc;
670 if ($dist / $nc >= $cco) {
671 push @others, $ii;
672 $others[$#others]->[$place{'IPscore'}] = $dist / $nc;
673 }
674 }
675 return(\@others);
676 }
677
678 sub getClosests ($$$$) { #used only in the deprecated 'ToAllSorted' algorithm
679 my($seeds, $distances, $included, $cco) = @_;
680 my($ni, $ii, $jj, $kk, $dist, $gotnew, $toadd, $nt, $max);
681
682 do {
683 undef $gotnew;
684 for $ii(@{$seeds}) {
685 next if isAlready($ii->[$place{'gene'}], $included);
686 undef $ni; undef $dist; undef $max;
687 for $jj(@{$included}) {
688 undef $nt; undef $toadd;
689 print "$ii->[$place{'gene'}] vs $jj->[$place{'gene'}] \n" if $debug;
690 if ($ii->[$place{'species'}] ne $jj->[$place{'species'}]) {
691 $toadd = $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]};
692 }
693 else {
694 for $kk(keys(%{$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}})) {
695 $toadd += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}->{$kk}; $nt++;
696 }
697 $toadd = $toadd / $nt if $nt;
698 }
699 $dist += $toadd; $ni++ if $toadd;
700 print "A distance for $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if (!$toadd and $debug);
701 }
702 if ($ni and ($dist / $ni > $max->['IPscore'])) {
703 $max = $ii; $max->['IPscore'] = $dist / $ni;
704 }
705 if ($max->['IPscore'] >= $cco) {
706 print "Included $max->[4] as $dist / $ni > $cco\n" if $debug;
707 push(@{$included}, $max);
708 $gotnew = 1;
709 }
710 }
711 } while ($gotnew);
712 return($included);
713 }
714
715 sub array2StdOut { #for debug purpose only
716 my($ar, $name) = @_;
717 my($ii, $jj);
718
719 print "$name: \n";
720 return(0) if !scalar(@{$ar});
721 for $ii(@{$ar}) {
722 for $jj(@{$ii}) {
723 print "$jj\t" if defined($jj);
724 }
725 print "\n";
726 }
727 return undef;
728 }