Mercurial > repos > dereeper > pgap
comparison PGAP-1.2.1/multiparanoid.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83e62a1aeeeb |
---|---|
1 #!/usr/bin/perl | |
2 # | |
3 # Copyright (C) Andrey Alexeyenko and Erik Sonnhammer, 2006 | |
4 # For a license to use this program, contact Erik.Sonnhammer@cbc.su.se | |
5 ############################################################################### | |
6 # Andrey Alexeyenko, Stockholm Bioinformatics Center, Albanova, Stockholm 2006 | |
7 # mailto:avalex99@mail.ru | |
8 ############################################################################### | |
9 # The program is a postprocessor of ortholog tables produced by InParanoid. It combines pairwise InParanoid outputs (say, A-B, B-C, A-C) and makes multi-species clusters of orthologs ABC. | |
10 | |
11 use strict vars; | |
12 use DBI; | |
13 use Cwd; | |
14 require 5.002; | |
15 | |
16 our($inputdir, $makeunique, $indirect, $Pair, | |
17 @fields, @cutoffs, $output, @fields_out, $out, $algo, $debug, $SQLprefix, | |
18 %place, $nplaces, $source_by_cluster, $source_by_gene, $source_by_taken, $seeds, $use_bootstrap, $maxINPscore, $toMainOrthologOfSelf); | |
19 my($npairs, $i, $j, $ii, $ee, $gg, $nn, $oo, $s2, $s1, $nspec, $as, $newmain, @distances, @species, $sp1, $sp2, $clusno, $specletters, $a_pair, $a_cluster, $pms, $input, $output, $inputfile, $new, $thepair); | |
20 ############################################################################### | |
21 | |
22 #FIRST OF ALL, set the variables $inputdir and $output below. MultiParanoid may not work without it!!! | |
23 | |
24 #Parameters to submit (may be abbreviated to the first letter): | |
25 # OBLIGATORY: | |
26 # -species: a list of several species connected with "+"; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called: | |
27 # sqltable.dog-cat, sqltable.dog-mouse etc. | |
28 #To use other file names, one should change the variable $SQLprefix below. | |
29 | |
30 # OPTIONAL: | |
31 # -cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: " -cutoff 0.5 " or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff). | |
32 # -algo: how to define between-paralogs distances while clustering. | |
33 # Options: onlymains (default and only reasonable) | |
34 # toallsorted | |
35 # -debug: 0 or 1; default: 0 | |
36 # -input: deprecated, now defined by -species parameter, and the input files shall have conventional names | |
37 # -output: deprecated, special message will tell the name of the output file | |
38 # -unique: 0 or 1; set to 1, if the duplicates (genes clustered twice) should be removed; default: 0 | |
39 | |
40 #The MultiParanoid output header: | |
41 ### clusterID species gene is_seed_ortholog confidence_score species_in_cluster tree_conflict### | |
42 | |
43 #is (hopefully) self-explanatory. Though: | |
44 | |
45 # "is_seed_ortholog" means the protein was a seed ortholog in at least one InParanoid cluster. | |
46 # "confidence_score" is an average InParanoid score across the input clusters | |
47 # "tree_conflict" indicates that, from the point of view of different species, the number of inparalogs varied in at least one other species ("diff.numbers") or the numbers were the same, but the IDs differed ("diff.names"). | |
48 | |
49 #NOTE: s.c. 'distances' between cluster members are, in fact, similarities. Variables are called 'distances' for historical reasons. | |
50 # 'main orthologs' (defined by InParanoid) are better to be called 'seed orthologs'. | |
51 | |
52 # Settings: | |
53 | |
54 $algo = 'onlymains'; | |
55 $debug = 0; | |
56 $use_bootstrap = 0; | |
57 $maxINPscore = 1; | |
58 $toMainOrthologOfSelf = 0; | |
59 $indirect = 1; #this option allows using the currently default InParanoid output, were seed orthologs are recognized by the feature "IPscore = 1.00". Otherwise ($indirect = 0) a special format should be used (needs a change in the InParanoid script and is now deprecated). | |
60 | |
61 if ($indirect) { | |
62 $SQLprefix = 'sqltable.'; | |
63 $inputdir = './'; #change it respectively | |
64 $place{'clusterID'} = 0; #these should correspond to the current InParanoid output format | |
65 $place{'BLASTscore'} = 1; | |
66 $place{'species'} = 2; | |
67 $place{'IPscore'} = 3; | |
68 $place{'gene'} = 4; | |
69 $place{'main'} = 5; | |
70 $place{'pair'} = 6; | |
71 } | |
72 else { | |
73 $SQLprefix = 'ava_sql'; | |
74 $inputdir = 'disk/dir/multiparanoid/'; #change it respectively | |
75 ###The older, ava_sql, version: | |
76 $place{'clusterID'} = 0; | |
77 $place{'BLASTscore'} = 1; | |
78 $place{'pair'} = 2; | |
79 $place{'species'} = 3; | |
80 $place{'gene'} = 4; | |
81 $place{'IPscore'} = 5; | |
82 $place{'main'} = 6; | |
83 #$indirect = 1 if !defined($place{'main'}); | |
84 } | |
85 | |
86 $pms = parseParameters(join(' ', @ARGV)); | |
87 if (!$pms->{'s'}) {die "No '+'-delimited input species list (pointing to INPARANOID output) specified. Check parameter '-species'... ";} | |
88 @species = split(/\+/, $pms->{'s'}); #list of species (genomes) as they appear in the file names. Default: no default | |
89 print "Species list: $pms->{'s'}\n is treated as species ".join(', ', @species)."\n"; | |
90 #print "Genomes @species are being processed...\n"; | |
91 $makeunique = 0;#also, mind the variables $inputdir and $output below | |
92 $makeunique = $pms->{'u'}; | |
93 | |
94 $specletters = fstLetters(@species); | |
95 # $output = 'diskZ/MultiparanoidOuput/'.$specletters.'.MP.sql' if !$output; #change it respectively | |
96 | |
97 $output = './MP.Cluster' ; #change it respectively | |
98 | |
99 | |
100 @cutoffs = split(/\+/, $pms->{'c'}); #Can restrict included in-paralogs by their distance to the main ones (see example below). Default: 0 (no cutooff) | |
101 #@cutoffs = (0, 0.25, 0.5, 0.75, 1); | |
102 @cutoffs = (0) if !exists($cutoffs[0]); | |
103 | |
104 $algo = $pms->{'a'} if !$algo; #which algorithm to use. Default: onlymains | |
105 $debug = $pms->{'d'}; #degree of debug messages' output. Default: no | |
106 $output = $pms->{'o'} if !$output; | |
107 $makeunique = $pms->{'u'} if !$makeunique; | |
108 | |
109 $nplaces = scalar(keys(%place)); | |
110 @fields_out = (clusterID, species, gene, is_seed_ortholog, confidence_score, species_in_cluster, tree_conflict); | |
111 push @fields_out, 'cutoff' if scalar(@cutoffs) > 1; | |
112 $nspec = scalar(@species); | |
113 open OUT, ">$output"; | |
114 | |
115 print "Genome pairs found and used: \n" if $debug > 0; | |
116 for ($s1 = 0; $s1 < $nspec; $s1++) { | |
117 for ($s2 = 0; $s2 < $nspec; $s2++) { | |
118 next if ($species[$s1] eq $species[$s2]); | |
119 $Pair = $species[$s1].'-'.$species[$s2]; | |
120 $inputfile = $inputdir.$SQLprefix.$Pair; | |
121 if (!system("ls $inputfile")) { | |
122 loadData($inputfile); | |
123 $npairs++; | |
124 }}} | |
125 die "Not enough (or too many) species pair files for ".join(", ", @species)."\n" if $npairs != ($nspec * ($nspec - 1)) / 2; | |
126 $clusno = 1; | |
127 | |
128 for $sp1(@species) { | |
129 for $sp2(@species) { | |
130 $thepair = "$sp1-$sp2"; | |
131 | |
132 next if (!$source_by_cluster->{$thepair} or ($sp1 eq $sp2)); | |
133 print "Analyzing $sp1 vs. $sp2: " if $debug; | |
134 print scalar(keys(%{$source_by_cluster->{$thepair}}))." clusters for this pair...\n" if $debug; | |
135 for $a_cluster(keys(%{$source_by_cluster->{$thepair}})) { | |
136 undef $new; undef $seeds; | |
137 $seeds = findOrthos($a_cluster, $thepair); | |
138 | |
139 #find recordset of orthologs of the both species (for cluster a) from the pairwise subset i-j | |
140 next if !defined($seeds->[0]); | |
141 LINE1:for $gg(@{$seeds}) { | |
142 next if (!$gg->[$place{'main'}] or $gg->[$nplaces]); | |
143 $gg->[$nplaces] = 1; | |
144 for $a_pair(keys(%{$source_by_cluster})) { | |
145 if (($a_pair =~ $gg->[$place{'species'}]) and ($gg->[$place{'pair'}] ne $a_pair)) { | |
146 $new = findNewOrtho($gg->[$place{'gene'}], $a_pair); | |
147 undef $newmain; | |
148 if (defined($new)) { | |
149 for $nn(@{$new}) { | |
150 push(@{$seeds}, $nn) if isNew($seeds, $nn); | |
151 $newmain = 1 if $nn->[$place{'main'}]; | |
152 } | |
153 redo LINE1 if $newmain; | |
154 }}}} | |
155 | |
156 makeClusters($seeds, \@cutoffs, \@species, $clusno++, treeConflict($seeds), $specletters); | |
157 }}} | |
158 if ($clusno > 20) { | |
159 print OUT join("\t", @fields_out)."\n"; | |
160 for $oo(@{$out->{'byRecord'}}) { | |
161 print OUT join("\t", @{$oo})."\n" if defined($oo); | |
162 } | |
163 print "Making ortholog clusters of ".join(', ', @species)." done. The result is in file $output\n"; | |
164 } | |
165 else { | |
166 die "No output has been produced..." | |
167 } | |
168 ########################################################################### | |
169 | |
170 sub parseParameters { | |
171 my($parameters) = @_; | |
172 my($key, $value, $pars, %pars); | |
173 | |
174 #print "$parameters\n"; | |
175 $_ = $parameters; | |
176 while (m/\-(\w+)\s+([a-zA-Z0-9._=+]+)/g) { | |
177 next if ((lc($2) eq 'f') or (lc($2) eq 'false') or (lc($2) eq 'no') or (lc($2) eq '0')); | |
178 $pars{substr($1, 0, 1)} = $2; | |
179 } | |
180 if (scalar(keys(%pars)) < 1 or !$pars{'s'} or m/\-h/) { | |
181 print "\nNOT ENOUGH PARAMETERS!\nOBLIGATORY: | |
182 -species: a list of several species connected with '+'; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called: | |
183 sqltable.dog-cat, sqltable.dog-mouse etc. | |
184 To use other file names, one should change the variable \$SQLprefix inside the scrtipt. | |
185 \nOPTIONAL: | |
186 \n-cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: ' -cutoff 0.5 ' or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff). | |
187 \n-algo: how to define between-paralogs distances while clustering. | |
188 Options: onlymains (default and only reasonable) | |
189 toallsorted | |
190 \n-debug: 0 or 1; default: 0 | |
191 \n-input: deprecated, now defined by -species parameter and the variables \$inputdir and \$SQLprefix, and the input files shall have conventional names | |
192 \n-output: deprecated, special message will tell the name of the output file | |
193 \n-unique: 0 or 1; set to '0' if the duplicates (genes clustered twice) should BE REMOVED, to '1' otherwise; default: 0\n\n"; exit; | |
194 } | |
195 while (($key, $value) = each(%pars)) { | |
196 print "Parameter $key = $value;\n"; | |
197 } | |
198 | |
199 return(\%pars); | |
200 } | |
201 | |
202 sub treeConflict { #tells if there is a discrepancy, for species B, between 2-species trees A-B and B-C | |
203 my($seeds) = @_; | |
204 my($i, $ii, $jj, $kk, $sets); | |
205 | |
206 for $ii(@{$seeds}) { | |
207 push @{$sets->{$ii->[$place{'species'}]}->{$ii->[$place{'pair'}]}}, $ii->[$place{'gene'}]; | |
208 } | |
209 | |
210 for $ii(keys(%{$sets})) { | |
211 for $jj(keys(%{$sets->{$ii}})) { | |
212 for $kk(keys(%{$sets->{$ii}})) { | |
213 next if $jj eq $kk; | |
214 if (scalar(@{$sets->{$ii}->{$jj}}) != scalar(@{$sets->{$ii}->{$kk}})) {return('diff. numbers');} | |
215 sort {$a <=> $b} @{$jj}; | |
216 sort {$a <=> $b} @{$kk}; | |
217 for ($i = 0; $i < scalar(@{$sets->{$ii}->{$jj}}); $i++) { | |
218 if ($sets->{$ii}->{$jj}->[$i] ne $sets->{$ii}->{$kk}->[$i]) { | |
219 return('diff. names'); | |
220 }}}}} | |
221 return(undef); | |
222 } | |
223 | |
224 sub makeClusters { | |
225 my($seeds, $cutoffs, $species, $clusno, $conflict, $param) = @_; | |
226 my($clo, $le, $gg, @included, $distances, $specset, $cco); | |
227 | |
228 array2StdOut($seeds, "Primers") if $debug; | |
229 if ($algo eq 'onlymains') { | |
230 $distances = pairWiseDist2Main($seeds); | |
231 } | |
232 elsif ($algo eq 'toallsorted') { | |
233 $distances = pairWiseDist2All($seeds); | |
234 } | |
235 else {die "Algorithm is not set...";} | |
236 array2StdOut(setUnique($seeds), "Unique") if $debug; | |
237 for $cco(@{$cutoffs}) { | |
238 undef @included; | |
239 | |
240 @included = firstMains($seeds); | |
241 @included = @{setUnique(\@included)} if defined(@included); | |
242 array2StdOut(\@included, 'Mains') if $debug; | |
243 goto LINE3 if $cco == 1; | |
244 | |
245 if ($algo eq 'onlymains') { | |
246 push @included, @{getOthers( | |
247 setUnique($seeds), | |
248 $distances, | |
249 \@included, | |
250 $cco)}; | |
251 } | |
252 elsif ($algo eq 'toallsorted') { | |
253 @included = @{getClosests( | |
254 setUnique($seeds), | |
255 $distances, | |
256 \@included, | |
257 $cco)};} | |
258 LINE3:array2StdOut(\@included, "Cluster with cut-off $cco") if $debug; | |
259 LINE4: $specset = join('', fstLetters(sort(listSpecies(\@included, $species)))); | |
260 $conflict = (($conflict) ? $conflict : 'No'); | |
261 for $gg(@included) { | |
262 next if $makeunique and alreadyClustered($gg, $cco); | |
263 $ii = scalar(@{$out->{'byRecord'}}); | |
264 $out->{'byName'}->{$gg->[$place{'gene'}]}->{$cco} = $ii; | |
265 | |
266 for $ee($clusno, | |
267 $gg->[$place{'species'}], | |
268 $gg->[$place{'gene'}], | |
269 $gg->[$place{'main'}], | |
270 $gg->[$place{'IPscore'}], | |
271 $specset, | |
272 $conflict) {push @{$out->{'byRecord'}->[$ii]}, $ee;} | |
273 push @{$out->{'byRecord'}->[$ii]}, $cco if scalar(@cutoffs) > 1; | |
274 | |
275 } | |
276 #$out .= join("\t", (100 * $cco, $clusno, $gg->[$place{'species'}], $gg->[$place{'gene'}], $gg->[$place{'main'}], $gg->[$place{'IPscore'}], $specset, $conflict, $param, $algo))."\n" if !$makeunique or !alreadyClustered($gg); | |
277 } | |
278 return; | |
279 } | |
280 | |
281 | |
282 sub fstLetters { | |
283 my(@list) = @_; | |
284 my($letters, $ii, $firstLetter, $lastLetter); | |
285 | |
286 return(join('-', sort {$a cmp $b} @list)); | |
287 $firstLetter = 0; | |
288 for $ii(@list) { | |
289 $lastLetter = length($ii) - $firstLetter; | |
290 $letters .= "_".substr($ii, $firstLetter, $lastLetter); | |
291 } | |
292 | |
293 return $letters; | |
294 } | |
295 | |
296 sub listSpecies { | |
297 my($ar, $species) = @_; | |
298 my($already, %already, $ii, @flags); | |
299 # checks for species present in the current cluster | |
300 | |
301 for $ii(@{$ar}) { | |
302 push(@flags, $ii->[$place{'species'}]) if !$already{$ii->[$place{'species'}]}; | |
303 $already{$ii->[$place{'species'}]} = 1; | |
304 } | |
305 return(@flags); | |
306 } | |
307 | |
308 sub loadData { | |
309 my($inputfile) = @_; | |
310 my($line, @line, $i); | |
311 | |
312 open(IN, $inputfile); | |
313 print "Loading $inputfile...\n"; | |
314 while (<IN>) { | |
315 @line = split /\s+/; | |
316 #$place{'BLASTscore'}, $place{'species'}, $place{'IPscore'}, $place{'main'} | |
317 for $i(0..scalar(keys(%place))) { | |
318 if ($indirect) { | |
319 if ($i == $place{'main'}) { | |
320 $source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0; | |
321 $source_by_gene->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0; | |
322 next; | |
323 } | |
324 if ($i == $place{'pair'}) { | |
325 $source_by_cluster->{$Pair}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $Pair; | |
326 $source_by_gene->{$Pair}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $Pair; | |
327 next; | |
328 }} | |
329 $source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $line[$i]; | |
330 $source_by_gene->{$indirect ? $Pair :$line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $line[$i]; | |
331 }} | |
332 return; | |
333 } | |
334 | |
335 sub findOrthos { #initially picks up orthologs in the first pair of genomes | |
336 my($a, $thepair) = @_; | |
337 my($set, $sm, $i, $newel); | |
338 | |
339 return(undef) if $source_by_taken->{$thepair}->{$a}; | |
340 for (keys(%{$source_by_cluster->{$thepair}->{$a}})) { | |
341 $newel = $#{$seeds} + 1; | |
342 for $i(0..($nplaces - 1)) { | |
343 $seeds->[$newel]->[$i] = $source_by_cluster->{$thepair}->{$a}->{$_}->[$i]; | |
344 } | |
345 $source_by_taken->{$thepair}->{$a} = 1; | |
346 } | |
347 return $seeds; | |
348 } | |
349 | |
350 sub findNewOrtho { #adds orthologs from the remaining genome pairs | |
351 my($g, $thepair) = @_; | |
352 my($new, $ii, $j, $jj, $sm, $newel); | |
353 | |
354 for (keys(%{$source_by_gene->{$thepair}->{$g}})) { | |
355 next if $source_by_taken->{$thepair}->{$_}; | |
356 $newel = $#{$new} + 1; | |
357 for $jj(@{$source_by_gene->{$thepair}->{$g}->{$_}}) { | |
358 $new->[$newel]->[$j] = $jj; | |
359 $j++; | |
360 } | |
361 } | |
362 if (scalar(@{$new}) == 1) { | |
363 $new = findOrthos($new->[0]->[$place{'clusterID'}], $new->[0]->[$place{'pair'}]); | |
364 if (scalar(@{$new}) > 1) { | |
365 return($new);} | |
366 else {print "Unpaired seed ortholog...$new->[0]->[$place{'gene'}]\n";} | |
367 } | |
368 elsif (scalar(@{$new}) > 1) { | |
369 print "Multiple seed orthologs...\nPair '$thepair'\tGene '$g'"; | |
370 } | |
371 return(undef); | |
372 } | |
373 | |
374 sub isNew { | |
375 my($seeds, $new) = @_; | |
376 my($ii); | |
377 | |
378 for $ii(@{$seeds}) { | |
379 if (($ii->[$place{'gene'}] eq $new->[$place{'gene'}]) and ($ii->[$place{'pair'}] eq $new->[$place{'pair'}])) { | |
380 return(undef); | |
381 } | |
382 } | |
383 return(1); | |
384 } | |
385 | |
386 sub pairWiseDist2All { #between-gene distances for algortihm 'ToAllSorted'; deprecated | |
387 my($seeds) = @_; | |
388 my($i, $j, $ii, $jj, $new, $old, $same, $le, $distances, $ke, $va); | |
389 | |
390 $le = scalar(@{$seeds}); | |
391 | |
392 for ($i = 0; $i < $le; $i++) { | |
393 $ii = $seeds->[$i]->[$place{'gene'}]; | |
394 #next if $distances->{$ii}->{$jj}; | |
395 for ($j = 0; $j < $i; $j++) { | |
396 $jj = $seeds->[$j]->[$place{'gene'}]; | |
397 undef $same; undef $old; undef $new; | |
398 $same = 1 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]); | |
399 if ($same) { | |
400 $old = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]};} | |
401 else {$old = $distances->{$ii}->{$jj};} | |
402 | |
403 next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]); | |
404 #THEY DESCRIBE THE SAME PAIR OF GENOMES | |
405 if ($same) { | |
406 #THEY ARE FROM THE SAME SPECIES | |
407 #next if $distances-> {$ii}->{$jj}->{$seeds->[$i]->[$place{'species'}]}; | |
408 if (!$seeds->[$i]->[$place{'main'}] and !$seeds->[$j]->[$place{'main'}]) { #BOTH ARE IN-PARALOGS | |
409 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}] * $seeds->[$j]->[$place{'IPscore'}]; | |
410 } | |
411 elsif ($seeds->[$i]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG | |
412 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$j]->[$place{'IPscore'}]; | |
413 } | |
414 elsif ($seeds->[$j]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG | |
415 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}]; | |
416 } | |
417 else { #BOTH ARE SEED ORTHOLOGS | |
418 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $maxINPscore; | |
419 } | |
420 $new = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]}; | |
421 } | |
422 else { | |
423 #THEY ARE FROM THE DIFFERENT SPECIES | |
424 if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]) #BOTH ARE SEED ORTHOLOGS | |
425 { | |
426 $distances->{$ii}->{$jj} = inpDist($seeds, $i, $j); | |
427 $distances->{$jj}->{$ii} = inpDist($seeds, $j, $i); | |
428 } | |
429 elsif ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) #STILL, ONE OF THEM IS IN-PARALOG (NO MATTER WHICH) | |
430 { | |
431 $distances->{$ii}->{$jj} = | |
432 $distances->{$jj}->{$ii} = | |
433 inpDist ($seeds, $i, $j) * inpDist ($seeds, $j, $i) ; | |
434 } | |
435 else | |
436 #BOTH ARE IN-PARALOGS | |
437 { | |
438 $distances->{$ii}->{$jj} = | |
439 mainDist($seeds, $i, $j) * | |
440 inpDist ($seeds, $i, $j) * | |
441 inpDist ($seeds, $j, $i) ; | |
442 $distances->{$jj}->{$ii} = | |
443 mainDist($seeds, $j, $i) * | |
444 inpDist ($seeds, $i, $j) * | |
445 inpDist ($seeds, $j, $i) ; | |
446 } | |
447 } | |
448 $new = $distances->{$ii}->{$jj}; | |
449 if (!$same) {print "Distance $ii-$jj is $new\n" if $debug; print "Distance $jj-$ii is $new\n" if $debug;} | |
450 else { | |
451 if ($debug) { | |
452 print "Distance $ii-$jj is\n"; | |
453 while (($ke, $va) = each(%{$distances->{$ii}->{$jj}})) { | |
454 print "\tfor $ke: $va\n"; | |
455 } | |
456 } | |
457 } | |
458 print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new)); | |
459 } | |
460 } | |
461 #for ($k = 0; $k < $le; $k++) {$distances->[$k]->[$k] = 1;} | |
462 return($distances); | |
463 } | |
464 | |
465 sub pairWiseDist2Main ($) {#between-gene distances for algortihm 'OnlyMains' | |
466 my($seeds) = @_; | |
467 my($le, $i, $j, $ii, $jj, $new, $old, $distances); | |
468 | |
469 $le = scalar(@{$seeds}); | |
470 #$le2 = scalar(@{$species}); | |
471 for ($i = 0; $i < $le; $i++) { | |
472 $ii = $seeds->[$i]->[$place{'gene'}]; | |
473 for ($j = 0; $j < $i; $j++) { | |
474 $jj = $seeds->[$j]->[$place{'gene'}]; | |
475 undef $old; undef $new; | |
476 $old = $distances->{$ii}->{$jj}; | |
477 next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]); | |
478 next if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]); #SEED ORTHOLOGS WILL BE CLUSTERED ANYWAY | |
479 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]) { | |
480 #if (!$use_bootstrap) {$distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = $maxINPscore ;next;} | |
481 next if !$toMainOrthologOfSelf; | |
482 $distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = | |
483 $seeds->[$j]->[$place{'IPscore'}] if $seeds->[$i]->[$place{'main'}]; | |
484 $distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = | |
485 $seeds->[$i]->[$place{'IPscore'}] if $seeds->[$j]->[$place{'main'}]; | |
486 } #THEY ARE FROM THE SAME SPECIES AND THE DISTANCE IS NOT USED | |
487 else { #THEY ARE FROM DIFFERENT SPECIES | |
488 if ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) { | |
489 #ONE OF THEM IS SEED ORTHOLOG (NO MATTER WHICH) | |
490 $distances->{$ii}->{$jj} = | |
491 $distances->{$jj}->{$ii} = | |
492 inpDist ($seeds, $i, $j) * | |
493 inpDist ($seeds, $j, $i) ; | |
494 } | |
495 else { | |
496 #BOTH ARE IN-PARALOGS | |
497 $distances->{$ii}->{$jj} = | |
498 mainDist($seeds, $i, $j) * | |
499 inpDist ($seeds, $i, $j) * | |
500 inpDist ($seeds, $j, $i) ; | |
501 $distances->{$jj}->{$ii} = | |
502 mainDist($seeds, $j, $i) * | |
503 inpDist ($seeds, $i, $j) * | |
504 inpDist ($seeds, $j, $i) ; | |
505 } | |
506 } | |
507 $new = $distances->{$ii}->{$jj}; | |
508 print "Distance $ii-$jj is $new\n" if $debug; | |
509 print "Distance $jj-$ii is $new\n" if $debug; | |
510 print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new)); | |
511 } | |
512 } | |
513 return($distances); | |
514 } | |
515 | |
516 sub mainDist ($$$) { #finds distances between two main (seed) orthologs | |
517 my($seeds, $from, $to) = @_; | |
518 my($ii, $pair1, $pair2, $sum, $nu, $nc); | |
519 | |
520 return($maxINPscore) if !$use_bootstrap; #if we think bootstrap values are not relevant in calculating the distance, then every Main-Main distance should bemaxINPscore = 1 (and this is the default) | |
521 $pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}]; | |
522 $pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}]; | |
523 $sum = $nc = $nu = 0; | |
524 | |
525 for $ii(@{$seeds}) { | |
526 if ( | |
527 ($ii->[$place{'main'}]) and | |
528 (($ii->[$place{'pair'}] eq $pair1) or ($ii->[$place{'pair'}] eq $pair2)) and | |
529 ($ii->[$place{'species'}]) eq $seeds->[$from]->[$place{'species'}]) { | |
530 $nc++; | |
531 $sum += $ii->[$place{'IPscore'}]; | |
532 } | |
533 } | |
534 | |
535 if (!$nc) { | |
536 print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] (pair $pair1 or $pair2) not found\n"; | |
537 return(undef); | |
538 } | |
539 else | |
540 { | |
541 return($sum / $nc); | |
542 } | |
543 } | |
544 | |
545 sub inpDist ($$$) { #finds a distance from main (seed) ortholog to in-paralog | |
546 my($seeds, $from, $to) = @_; | |
547 my($le, $ii, $pair1, $pair2, $sum, $nu, $nc); | |
548 | |
549 return($maxINPscore) if ($seeds->[$from]->[$place{'main'}] and !$use_bootstrap); | |
550 $le = scalar(@{$seeds}); | |
551 $pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}]; | |
552 $pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}]; | |
553 $sum = $nc = $nu = 0; | |
554 for $ii(@{$seeds}) { | |
555 if ( | |
556 ($ii->[$place{'gene'}] eq $seeds->[$from]->[$place{'gene'}]) and | |
557 (($ii->[$place{'pair'}] eq $pair1) or | |
558 ($ii->[$place{'pair'}] eq $pair2))) { | |
559 return($ii->[$place{'IPscore'}]); | |
560 } | |
561 } | |
562 print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] not found\n"; | |
563 return undef; | |
564 } | |
565 | |
566 sub sortBy ($$) { | |
567 my($num, $ar) = @_; | |
568 my($i, $max, $maxnum, @sorted); | |
569 | |
570 #array2StdOut($ar, 'Before'); | |
571 | |
572 LINE001: while (scalar(@{$ar}) > 0) { | |
573 if (scalar(@{$ar}) == 1 and !$ar->[$i]->[$num]) { | |
574 last; | |
575 } | |
576 $max = 0; | |
577 undef $maxnum; | |
578 for ($i = 0; $i < scalar(@{$ar}); $i++) { | |
579 last if !$ar->[$i]->[$num]; | |
580 if ($ar->[$i]->[$num] > $max) { | |
581 $max = $ar->[$i]->[$num]; | |
582 $maxnum = $i; | |
583 } | |
584 } | |
585 if (defined($maxnum)) { | |
586 push @sorted, splice(@{$ar}, $maxnum, 1); | |
587 next LINE001; | |
588 } | |
589 } | |
590 return(\@sorted); | |
591 } | |
592 | |
593 sub setUnique ($) { #removes duplicated entries from the (pre-)cluster | |
594 my($seeds) = @_; | |
595 my($i, $j, $unique, $sds); | |
596 | |
597 $sds->[$place{'clusterID'}] = $seeds->[$place{'clusterID'}]; | |
598 for ($i = 1; $i < scalar(@{$seeds}); $i++) { | |
599 $unique = 1; | |
600 for ($j = 0; $j < scalar(@{$sds}); $j++) { | |
601 if (($seeds->[$i]->[$place{'species'}] eq $sds->[$j]->[$place{'species'}]) and ($seeds->[$i]->[$place{'gene'}] eq $sds->[$j]->[$place{'gene'}])) | |
602 {$unique = 0; | |
603 last;} | |
604 } | |
605 if ($unique) {push(@{$sds}, $seeds->[$i]);} | |
606 } | |
607 return($sds); | |
608 } | |
609 | |
610 sub isAlready ($$) { | |
611 my($as, $included) = @_; | |
612 my($ai); | |
613 | |
614 for $ai(@{$included}) { | |
615 return(1) if ($ai->[4] eq $as); | |
616 } | |
617 return(undef); | |
618 } | |
619 | |
620 sub alreadyClustered { #helps to avoid clustering same gene twice. Of two alternatives, the one is chosen where the gene is a main (seed) ortholog or was included first. | |
621 my($g, $cco) = @_; | |
622 my($ai); | |
623 | |
624 return(undef) if !defined($out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}); | |
625 | |
626 if ($out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}]->[3]) { | |
627 return 1; | |
628 } | |
629 else { | |
630 undef $out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}]; | |
631 return(undef); | |
632 }} | |
633 | |
634 sub firstMains { #seed orthologs are included unconditionally | |
635 my($seeds) = @_; | |
636 my($i, $le, @included); | |
637 | |
638 $le = scalar(@{$seeds}); | |
639 for ($i = 0; $i < $le; $i++) { | |
640 # and ($seeds->[$i]->[$place{'IPscore'}] == 1) | |
641 if ($seeds->[$i]->[$place{'main'}]) { | |
642 push @included, $seeds->[$i]; | |
643 } | |
644 } | |
645 return(@included); | |
646 } | |
647 | |
648 sub getOthers ($$$$) { #retains only genes with distance (well, it is in fact a SIMILARITY) above the cut-off $cco. If it is not set, or equals to 0, the procedure just returns IPscore as a reference - to be saved in the output file. | |
649 my($seeds, $distances, $included, $cco) = @_; | |
650 my($i, $le, $les2, $nc, $ii, $jj, $dist, $ad, $nai, @others); | |
651 | |
652 $le = scalar(@{$seeds}); | |
653 for $ii(@{$seeds}) { | |
654 next if isAlready($ii->[$place{'gene'}], $included); | |
655 undef $dist; undef $nc; | |
656 for $jj(@{$included}) { | |
657 next if (($jj->[$place{'species'}] eq $ii->[$place{'species'}]) and !$toMainOrthologOfSelf); | |
658 if (!$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}) { | |
659 print "The distance between $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if $debug; | |
660 next; | |
661 } | |
662 if ($jj->[$place{'main'}]) { | |
663 print "( $ii->[$place{'gene'}] - $jj->[$place{'gene'}] ) + " if $debug; | |
664 $dist += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}; | |
665 $nc++; | |
666 } | |
667 } | |
668 print " / $nc = ".$dist/$nc."\n" if $debug; | |
669 return(undef) if !$nc; | |
670 if ($dist / $nc >= $cco) { | |
671 push @others, $ii; | |
672 $others[$#others]->[$place{'IPscore'}] = $dist / $nc; | |
673 } | |
674 } | |
675 return(\@others); | |
676 } | |
677 | |
678 sub getClosests ($$$$) { #used only in the deprecated 'ToAllSorted' algorithm | |
679 my($seeds, $distances, $included, $cco) = @_; | |
680 my($ni, $ii, $jj, $kk, $dist, $gotnew, $toadd, $nt, $max); | |
681 | |
682 do { | |
683 undef $gotnew; | |
684 for $ii(@{$seeds}) { | |
685 next if isAlready($ii->[$place{'gene'}], $included); | |
686 undef $ni; undef $dist; undef $max; | |
687 for $jj(@{$included}) { | |
688 undef $nt; undef $toadd; | |
689 print "$ii->[$place{'gene'}] vs $jj->[$place{'gene'}] \n" if $debug; | |
690 if ($ii->[$place{'species'}] ne $jj->[$place{'species'}]) { | |
691 $toadd = $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}; | |
692 } | |
693 else { | |
694 for $kk(keys(%{$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}})) { | |
695 $toadd += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}->{$kk}; $nt++; | |
696 } | |
697 $toadd = $toadd / $nt if $nt; | |
698 } | |
699 $dist += $toadd; $ni++ if $toadd; | |
700 print "A distance for $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if (!$toadd and $debug); | |
701 } | |
702 if ($ni and ($dist / $ni > $max->['IPscore'])) { | |
703 $max = $ii; $max->['IPscore'] = $dist / $ni; | |
704 } | |
705 if ($max->['IPscore'] >= $cco) { | |
706 print "Included $max->[4] as $dist / $ni > $cco\n" if $debug; | |
707 push(@{$included}, $max); | |
708 $gotnew = 1; | |
709 } | |
710 } | |
711 } while ($gotnew); | |
712 return($included); | |
713 } | |
714 | |
715 sub array2StdOut { #for debug purpose only | |
716 my($ar, $name) = @_; | |
717 my($ii, $jj); | |
718 | |
719 print "$name: \n"; | |
720 return(0) if !scalar(@{$ar}); | |
721 for $ii(@{$ar}) { | |
722 for $jj(@{$ii}) { | |
723 print "$jj\t" if defined($jj); | |
724 } | |
725 print "\n"; | |
726 } | |
727 return undef; | |
728 } |