annotate PGAP-1.2.1/multiparanoid.pl @ 5:d8c5bea1cce2 draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 17:56:22 +0000
parents 83e62a1aeeeb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
2 #
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
3 # Copyright (C) Andrey Alexeyenko and Erik Sonnhammer, 2006
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
4 # For a license to use this program, contact Erik.Sonnhammer@cbc.su.se
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
5 ###############################################################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
6 # Andrey Alexeyenko, Stockholm Bioinformatics Center, Albanova, Stockholm 2006
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
7 # mailto:avalex99@mail.ru
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
8 ###############################################################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
9 # The program is a postprocessor of ortholog tables produced by InParanoid. It combines pairwise InParanoid outputs (say, A-B, B-C, A-C) and makes multi-species clusters of orthologs ABC.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
10
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
11 use strict vars;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
12 use DBI;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
13 use Cwd;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
14 require 5.002;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
15
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
16 our($inputdir, $makeunique, $indirect, $Pair,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
17 @fields, @cutoffs, $output, @fields_out, $out, $algo, $debug, $SQLprefix,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
18 %place, $nplaces, $source_by_cluster, $source_by_gene, $source_by_taken, $seeds, $use_bootstrap, $maxINPscore, $toMainOrthologOfSelf);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
19 my($npairs, $i, $j, $ii, $ee, $gg, $nn, $oo, $s2, $s1, $nspec, $as, $newmain, @distances, @species, $sp1, $sp2, $clusno, $specletters, $a_pair, $a_cluster, $pms, $input, $output, $inputfile, $new, $thepair);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
20 ###############################################################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
21
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
22 #FIRST OF ALL, set the variables $inputdir and $output below. MultiParanoid may not work without it!!!
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
23
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
24 #Parameters to submit (may be abbreviated to the first letter):
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
25 # OBLIGATORY:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
26 # -species: a list of several species connected with "+"; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
27 # sqltable.dog-cat, sqltable.dog-mouse etc.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
28 #To use other file names, one should change the variable $SQLprefix below.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
29
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
30 # OPTIONAL:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
31 # -cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: " -cutoff 0.5 " or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff).
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
32 # -algo: how to define between-paralogs distances while clustering.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
33 # Options: onlymains (default and only reasonable)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
34 # toallsorted
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
35 # -debug: 0 or 1; default: 0
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
36 # -input: deprecated, now defined by -species parameter, and the input files shall have conventional names
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
37 # -output: deprecated, special message will tell the name of the output file
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
38 # -unique: 0 or 1; set to 1, if the duplicates (genes clustered twice) should be removed; default: 0
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
39
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
40 #The MultiParanoid output header:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
41 ### clusterID species gene is_seed_ortholog confidence_score species_in_cluster tree_conflict###
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
42
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
43 #is (hopefully) self-explanatory. Though:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
44
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
45 # "is_seed_ortholog" means the protein was a seed ortholog in at least one InParanoid cluster.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
46 # "confidence_score" is an average InParanoid score across the input clusters
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
47 # "tree_conflict" indicates that, from the point of view of different species, the number of inparalogs varied in at least one other species ("diff.numbers") or the numbers were the same, but the IDs differed ("diff.names").
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
48
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
49 #NOTE: s.c. 'distances' between cluster members are, in fact, similarities. Variables are called 'distances' for historical reasons.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
50 # 'main orthologs' (defined by InParanoid) are better to be called 'seed orthologs'.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
51
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
52 # Settings:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
53
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
54 $algo = 'onlymains';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
55 $debug = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
56 $use_bootstrap = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
57 $maxINPscore = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
58 $toMainOrthologOfSelf = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
59 $indirect = 1; #this option allows using the currently default InParanoid output, were seed orthologs are recognized by the feature "IPscore = 1.00". Otherwise ($indirect = 0) a special format should be used (needs a change in the InParanoid script and is now deprecated).
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
60
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
61 if ($indirect) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
62 $SQLprefix = 'sqltable.';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
63 $inputdir = './'; #change it respectively
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
64 $place{'clusterID'} = 0; #these should correspond to the current InParanoid output format
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
65 $place{'BLASTscore'} = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
66 $place{'species'} = 2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
67 $place{'IPscore'} = 3;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
68 $place{'gene'} = 4;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
69 $place{'main'} = 5;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
70 $place{'pair'} = 6;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
71 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
72 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
73 $SQLprefix = 'ava_sql';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
74 $inputdir = 'disk/dir/multiparanoid/'; #change it respectively
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
75 ###The older, ava_sql, version:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
76 $place{'clusterID'} = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
77 $place{'BLASTscore'} = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
78 $place{'pair'} = 2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
79 $place{'species'} = 3;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
80 $place{'gene'} = 4;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
81 $place{'IPscore'} = 5;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
82 $place{'main'} = 6;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
83 #$indirect = 1 if !defined($place{'main'});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
84 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
85
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
86 $pms = parseParameters(join(' ', @ARGV));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
87 if (!$pms->{'s'}) {die "No '+'-delimited input species list (pointing to INPARANOID output) specified. Check parameter '-species'... ";}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
88 @species = split(/\+/, $pms->{'s'}); #list of species (genomes) as they appear in the file names. Default: no default
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
89 print "Species list: $pms->{'s'}\n is treated as species ".join(', ', @species)."\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
90 #print "Genomes @species are being processed...\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
91 $makeunique = 0;#also, mind the variables $inputdir and $output below
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
92 $makeunique = $pms->{'u'};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
93
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
94 $specletters = fstLetters(@species);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
95 # $output = 'diskZ/MultiparanoidOuput/'.$specletters.'.MP.sql' if !$output; #change it respectively
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
96
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
97 $output = './MP.Cluster' ; #change it respectively
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
98
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
99
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
100 @cutoffs = split(/\+/, $pms->{'c'}); #Can restrict included in-paralogs by their distance to the main ones (see example below). Default: 0 (no cutooff)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
101 #@cutoffs = (0, 0.25, 0.5, 0.75, 1);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
102 @cutoffs = (0) if !exists($cutoffs[0]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
103
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
104 $algo = $pms->{'a'} if !$algo; #which algorithm to use. Default: onlymains
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
105 $debug = $pms->{'d'}; #degree of debug messages' output. Default: no
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
106 $output = $pms->{'o'} if !$output;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
107 $makeunique = $pms->{'u'} if !$makeunique;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
108
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
109 $nplaces = scalar(keys(%place));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
110 @fields_out = (clusterID, species, gene, is_seed_ortholog, confidence_score, species_in_cluster, tree_conflict);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
111 push @fields_out, 'cutoff' if scalar(@cutoffs) > 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
112 $nspec = scalar(@species);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
113 open OUT, ">$output";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
114
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
115 print "Genome pairs found and used: \n" if $debug > 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
116 for ($s1 = 0; $s1 < $nspec; $s1++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
117 for ($s2 = 0; $s2 < $nspec; $s2++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
118 next if ($species[$s1] eq $species[$s2]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
119 $Pair = $species[$s1].'-'.$species[$s2];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
120 $inputfile = $inputdir.$SQLprefix.$Pair;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
121 if (!system("ls $inputfile")) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
122 loadData($inputfile);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
123 $npairs++;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
124 }}}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
125 die "Not enough (or too many) species pair files for ".join(", ", @species)."\n" if $npairs != ($nspec * ($nspec - 1)) / 2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
126 $clusno = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
127
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
128 for $sp1(@species) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
129 for $sp2(@species) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
130 $thepair = "$sp1-$sp2";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
131
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
132 next if (!$source_by_cluster->{$thepair} or ($sp1 eq $sp2));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
133 print "Analyzing $sp1 vs. $sp2: " if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
134 print scalar(keys(%{$source_by_cluster->{$thepair}}))." clusters for this pair...\n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
135 for $a_cluster(keys(%{$source_by_cluster->{$thepair}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
136 undef $new; undef $seeds;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
137 $seeds = findOrthos($a_cluster, $thepair);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
138
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
139 #find recordset of orthologs of the both species (for cluster a) from the pairwise subset i-j
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
140 next if !defined($seeds->[0]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
141 LINE1:for $gg(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
142 next if (!$gg->[$place{'main'}] or $gg->[$nplaces]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
143 $gg->[$nplaces] = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
144 for $a_pair(keys(%{$source_by_cluster})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
145 if (($a_pair =~ $gg->[$place{'species'}]) and ($gg->[$place{'pair'}] ne $a_pair)) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
146 $new = findNewOrtho($gg->[$place{'gene'}], $a_pair);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
147 undef $newmain;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
148 if (defined($new)) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
149 for $nn(@{$new}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
150 push(@{$seeds}, $nn) if isNew($seeds, $nn);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
151 $newmain = 1 if $nn->[$place{'main'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
152 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
153 redo LINE1 if $newmain;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
154 }}}}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
155
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
156 makeClusters($seeds, \@cutoffs, \@species, $clusno++, treeConflict($seeds), $specletters);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
157 }}}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
158 if ($clusno > 20) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
159 print OUT join("\t", @fields_out)."\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
160 for $oo(@{$out->{'byRecord'}}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
161 print OUT join("\t", @{$oo})."\n" if defined($oo);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
162 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
163 print "Making ortholog clusters of ".join(', ', @species)." done. The result is in file $output\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
164 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
165 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
166 die "No output has been produced..."
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
167 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
168 ###########################################################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
169
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
170 sub parseParameters {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
171 my($parameters) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
172 my($key, $value, $pars, %pars);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
173
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
174 #print "$parameters\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
175 $_ = $parameters;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
176 while (m/\-(\w+)\s+([a-zA-Z0-9._=+]+)/g) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
177 next if ((lc($2) eq 'f') or (lc($2) eq 'false') or (lc($2) eq 'no') or (lc($2) eq '0'));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
178 $pars{substr($1, 0, 1)} = $2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
179 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
180 if (scalar(keys(%pars)) < 1 or !$pars{'s'} or m/\-h/) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
181 print "\nNOT ENOUGH PARAMETERS!\nOBLIGATORY:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
182 -species: a list of several species connected with '+'; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
183 sqltable.dog-cat, sqltable.dog-mouse etc.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
184 To use other file names, one should change the variable \$SQLprefix inside the scrtipt.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
185 \nOPTIONAL:
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
186 \n-cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: ' -cutoff 0.5 ' or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff).
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
187 \n-algo: how to define between-paralogs distances while clustering.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
188 Options: onlymains (default and only reasonable)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
189 toallsorted
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
190 \n-debug: 0 or 1; default: 0
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
191 \n-input: deprecated, now defined by -species parameter and the variables \$inputdir and \$SQLprefix, and the input files shall have conventional names
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
192 \n-output: deprecated, special message will tell the name of the output file
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
193 \n-unique: 0 or 1; set to '0' if the duplicates (genes clustered twice) should BE REMOVED, to '1' otherwise; default: 0\n\n"; exit;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
194 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
195 while (($key, $value) = each(%pars)) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
196 print "Parameter $key = $value;\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
197 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
198
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
199 return(\%pars);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
200 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
201
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
202 sub treeConflict { #tells if there is a discrepancy, for species B, between 2-species trees A-B and B-C
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
203 my($seeds) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
204 my($i, $ii, $jj, $kk, $sets);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
205
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
206 for $ii(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
207 push @{$sets->{$ii->[$place{'species'}]}->{$ii->[$place{'pair'}]}}, $ii->[$place{'gene'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
208 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
209
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
210 for $ii(keys(%{$sets})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
211 for $jj(keys(%{$sets->{$ii}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
212 for $kk(keys(%{$sets->{$ii}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
213 next if $jj eq $kk;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
214 if (scalar(@{$sets->{$ii}->{$jj}}) != scalar(@{$sets->{$ii}->{$kk}})) {return('diff. numbers');}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
215 sort {$a <=> $b} @{$jj};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
216 sort {$a <=> $b} @{$kk};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
217 for ($i = 0; $i < scalar(@{$sets->{$ii}->{$jj}}); $i++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
218 if ($sets->{$ii}->{$jj}->[$i] ne $sets->{$ii}->{$kk}->[$i]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
219 return('diff. names');
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
220 }}}}}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
221 return(undef);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
222 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
223
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
224 sub makeClusters {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
225 my($seeds, $cutoffs, $species, $clusno, $conflict, $param) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
226 my($clo, $le, $gg, @included, $distances, $specset, $cco);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
227
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
228 array2StdOut($seeds, "Primers") if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
229 if ($algo eq 'onlymains') {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
230 $distances = pairWiseDist2Main($seeds);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
231 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
232 elsif ($algo eq 'toallsorted') {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
233 $distances = pairWiseDist2All($seeds);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
234 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
235 else {die "Algorithm is not set...";}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
236 array2StdOut(setUnique($seeds), "Unique") if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
237 for $cco(@{$cutoffs}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
238 undef @included;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
239
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
240 @included = firstMains($seeds);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
241 @included = @{setUnique(\@included)} if defined(@included);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
242 array2StdOut(\@included, 'Mains') if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
243 goto LINE3 if $cco == 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
244
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
245 if ($algo eq 'onlymains') {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
246 push @included, @{getOthers(
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
247 setUnique($seeds),
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
248 $distances,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
249 \@included,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
250 $cco)};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
251 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
252 elsif ($algo eq 'toallsorted') {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
253 @included = @{getClosests(
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
254 setUnique($seeds),
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
255 $distances,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
256 \@included,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
257 $cco)};}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
258 LINE3:array2StdOut(\@included, "Cluster with cut-off $cco") if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
259 LINE4: $specset = join('', fstLetters(sort(listSpecies(\@included, $species))));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
260 $conflict = (($conflict) ? $conflict : 'No');
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
261 for $gg(@included) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
262 next if $makeunique and alreadyClustered($gg, $cco);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
263 $ii = scalar(@{$out->{'byRecord'}});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
264 $out->{'byName'}->{$gg->[$place{'gene'}]}->{$cco} = $ii;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
265
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
266 for $ee($clusno,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
267 $gg->[$place{'species'}],
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
268 $gg->[$place{'gene'}],
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
269 $gg->[$place{'main'}],
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
270 $gg->[$place{'IPscore'}],
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
271 $specset,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
272 $conflict) {push @{$out->{'byRecord'}->[$ii]}, $ee;}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
273 push @{$out->{'byRecord'}->[$ii]}, $cco if scalar(@cutoffs) > 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
274
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
275 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
276 #$out .= join("\t", (100 * $cco, $clusno, $gg->[$place{'species'}], $gg->[$place{'gene'}], $gg->[$place{'main'}], $gg->[$place{'IPscore'}], $specset, $conflict, $param, $algo))."\n" if !$makeunique or !alreadyClustered($gg);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
277 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
278 return;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
279 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
280
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
281
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
282 sub fstLetters {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
283 my(@list) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
284 my($letters, $ii, $firstLetter, $lastLetter);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
285
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
286 return(join('-', sort {$a cmp $b} @list));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
287 $firstLetter = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
288 for $ii(@list) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
289 $lastLetter = length($ii) - $firstLetter;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
290 $letters .= "_".substr($ii, $firstLetter, $lastLetter);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
291 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
292
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
293 return $letters;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
294 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
295
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
296 sub listSpecies {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
297 my($ar, $species) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
298 my($already, %already, $ii, @flags);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
299 # checks for species present in the current cluster
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
300
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
301 for $ii(@{$ar}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
302 push(@flags, $ii->[$place{'species'}]) if !$already{$ii->[$place{'species'}]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
303 $already{$ii->[$place{'species'}]} = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
304 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
305 return(@flags);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
306 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
307
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
308 sub loadData {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
309 my($inputfile) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
310 my($line, @line, $i);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
311
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
312 open(IN, $inputfile);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
313 print "Loading $inputfile...\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
314 while (<IN>) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
315 @line = split /\s+/;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
316 #$place{'BLASTscore'}, $place{'species'}, $place{'IPscore'}, $place{'main'}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
317 for $i(0..scalar(keys(%place))) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
318 if ($indirect) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
319 if ($i == $place{'main'}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
320 $source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
321 $source_by_gene->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
322 next;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
323 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
324 if ($i == $place{'pair'}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
325 $source_by_cluster->{$Pair}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $Pair;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
326 $source_by_gene->{$Pair}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $Pair;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
327 next;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
328 }}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
329 $source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $line[$i];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
330 $source_by_gene->{$indirect ? $Pair :$line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $line[$i];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
331 }}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
332 return;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
333 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
334
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
335 sub findOrthos { #initially picks up orthologs in the first pair of genomes
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
336 my($a, $thepair) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
337 my($set, $sm, $i, $newel);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
338
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
339 return(undef) if $source_by_taken->{$thepair}->{$a};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
340 for (keys(%{$source_by_cluster->{$thepair}->{$a}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
341 $newel = $#{$seeds} + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
342 for $i(0..($nplaces - 1)) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
343 $seeds->[$newel]->[$i] = $source_by_cluster->{$thepair}->{$a}->{$_}->[$i];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
344 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
345 $source_by_taken->{$thepair}->{$a} = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
346 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
347 return $seeds;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
348 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
349
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
350 sub findNewOrtho { #adds orthologs from the remaining genome pairs
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
351 my($g, $thepair) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
352 my($new, $ii, $j, $jj, $sm, $newel);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
353
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
354 for (keys(%{$source_by_gene->{$thepair}->{$g}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
355 next if $source_by_taken->{$thepair}->{$_};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
356 $newel = $#{$new} + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
357 for $jj(@{$source_by_gene->{$thepair}->{$g}->{$_}}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
358 $new->[$newel]->[$j] = $jj;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
359 $j++;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
360 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
361 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
362 if (scalar(@{$new}) == 1) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
363 $new = findOrthos($new->[0]->[$place{'clusterID'}], $new->[0]->[$place{'pair'}]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
364 if (scalar(@{$new}) > 1) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
365 return($new);}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
366 else {print "Unpaired seed ortholog...$new->[0]->[$place{'gene'}]\n";}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
367 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
368 elsif (scalar(@{$new}) > 1) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
369 print "Multiple seed orthologs...\nPair '$thepair'\tGene '$g'";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
370 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
371 return(undef);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
372 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
373
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
374 sub isNew {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
375 my($seeds, $new) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
376 my($ii);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
377
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
378 for $ii(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
379 if (($ii->[$place{'gene'}] eq $new->[$place{'gene'}]) and ($ii->[$place{'pair'}] eq $new->[$place{'pair'}])) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
380 return(undef);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
381 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
382 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
383 return(1);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
384 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
385
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
386 sub pairWiseDist2All { #between-gene distances for algortihm 'ToAllSorted'; deprecated
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
387 my($seeds) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
388 my($i, $j, $ii, $jj, $new, $old, $same, $le, $distances, $ke, $va);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
389
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
390 $le = scalar(@{$seeds});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
391
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
392 for ($i = 0; $i < $le; $i++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
393 $ii = $seeds->[$i]->[$place{'gene'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
394 #next if $distances->{$ii}->{$jj};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
395 for ($j = 0; $j < $i; $j++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
396 $jj = $seeds->[$j]->[$place{'gene'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
397 undef $same; undef $old; undef $new;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
398 $same = 1 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
399 if ($same) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
400 $old = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]};}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
401 else {$old = $distances->{$ii}->{$jj};}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
402
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
403 next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
404 #THEY DESCRIBE THE SAME PAIR OF GENOMES
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
405 if ($same) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
406 #THEY ARE FROM THE SAME SPECIES
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
407 #next if $distances-> {$ii}->{$jj}->{$seeds->[$i]->[$place{'species'}]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
408 if (!$seeds->[$i]->[$place{'main'}] and !$seeds->[$j]->[$place{'main'}]) { #BOTH ARE IN-PARALOGS
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
409 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}] * $seeds->[$j]->[$place{'IPscore'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
410 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
411 elsif ($seeds->[$i]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
412 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$j]->[$place{'IPscore'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
413 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
414 elsif ($seeds->[$j]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
415 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
416 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
417 else { #BOTH ARE SEED ORTHOLOGS
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
418 $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $maxINPscore;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
419 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
420 $new = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
421 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
422 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
423 #THEY ARE FROM THE DIFFERENT SPECIES
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
424 if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]) #BOTH ARE SEED ORTHOLOGS
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
425 {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
426 $distances->{$ii}->{$jj} = inpDist($seeds, $i, $j);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
427 $distances->{$jj}->{$ii} = inpDist($seeds, $j, $i);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
428 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
429 elsif ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) #STILL, ONE OF THEM IS IN-PARALOG (NO MATTER WHICH)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
430 {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
431 $distances->{$ii}->{$jj} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
432 $distances->{$jj}->{$ii} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
433 inpDist ($seeds, $i, $j) * inpDist ($seeds, $j, $i) ;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
434 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
435 else
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
436 #BOTH ARE IN-PARALOGS
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
437 {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
438 $distances->{$ii}->{$jj} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
439 mainDist($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
440 inpDist ($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
441 inpDist ($seeds, $j, $i) ;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
442 $distances->{$jj}->{$ii} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
443 mainDist($seeds, $j, $i) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
444 inpDist ($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
445 inpDist ($seeds, $j, $i) ;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
446 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
447 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
448 $new = $distances->{$ii}->{$jj};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
449 if (!$same) {print "Distance $ii-$jj is $new\n" if $debug; print "Distance $jj-$ii is $new\n" if $debug;}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
450 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
451 if ($debug) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
452 print "Distance $ii-$jj is\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
453 while (($ke, $va) = each(%{$distances->{$ii}->{$jj}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
454 print "\tfor $ke: $va\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
455 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
456 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
457 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
458 print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
459 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
460 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
461 #for ($k = 0; $k < $le; $k++) {$distances->[$k]->[$k] = 1;}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
462 return($distances);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
463 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
464
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
465 sub pairWiseDist2Main ($) {#between-gene distances for algortihm 'OnlyMains'
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
466 my($seeds) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
467 my($le, $i, $j, $ii, $jj, $new, $old, $distances);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
468
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
469 $le = scalar(@{$seeds});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
470 #$le2 = scalar(@{$species});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
471 for ($i = 0; $i < $le; $i++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
472 $ii = $seeds->[$i]->[$place{'gene'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
473 for ($j = 0; $j < $i; $j++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
474 $jj = $seeds->[$j]->[$place{'gene'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
475 undef $old; undef $new;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
476 $old = $distances->{$ii}->{$jj};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
477 next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
478 next if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]); #SEED ORTHOLOGS WILL BE CLUSTERED ANYWAY
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
479 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
480 #if (!$use_bootstrap) {$distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = $maxINPscore ;next;}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
481 next if !$toMainOrthologOfSelf;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
482 $distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
483 $seeds->[$j]->[$place{'IPscore'}] if $seeds->[$i]->[$place{'main'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
484 $distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
485 $seeds->[$i]->[$place{'IPscore'}] if $seeds->[$j]->[$place{'main'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
486 } #THEY ARE FROM THE SAME SPECIES AND THE DISTANCE IS NOT USED
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
487 else { #THEY ARE FROM DIFFERENT SPECIES
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
488 if ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
489 #ONE OF THEM IS SEED ORTHOLOG (NO MATTER WHICH)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
490 $distances->{$ii}->{$jj} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
491 $distances->{$jj}->{$ii} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
492 inpDist ($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
493 inpDist ($seeds, $j, $i) ;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
494 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
495 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
496 #BOTH ARE IN-PARALOGS
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
497 $distances->{$ii}->{$jj} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
498 mainDist($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
499 inpDist ($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
500 inpDist ($seeds, $j, $i) ;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
501 $distances->{$jj}->{$ii} =
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
502 mainDist($seeds, $j, $i) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
503 inpDist ($seeds, $i, $j) *
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
504 inpDist ($seeds, $j, $i) ;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
505 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
506 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
507 $new = $distances->{$ii}->{$jj};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
508 print "Distance $ii-$jj is $new\n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
509 print "Distance $jj-$ii is $new\n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
510 print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
511 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
512 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
513 return($distances);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
514 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
515
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
516 sub mainDist ($$$) { #finds distances between two main (seed) orthologs
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
517 my($seeds, $from, $to) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
518 my($ii, $pair1, $pair2, $sum, $nu, $nc);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
519
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
520 return($maxINPscore) if !$use_bootstrap; #if we think bootstrap values are not relevant in calculating the distance, then every Main-Main distance should bemaxINPscore = 1 (and this is the default)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
521 $pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
522 $pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
523 $sum = $nc = $nu = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
524
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
525 for $ii(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
526 if (
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
527 ($ii->[$place{'main'}]) and
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
528 (($ii->[$place{'pair'}] eq $pair1) or ($ii->[$place{'pair'}] eq $pair2)) and
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
529 ($ii->[$place{'species'}]) eq $seeds->[$from]->[$place{'species'}]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
530 $nc++;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
531 $sum += $ii->[$place{'IPscore'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
532 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
533 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
534
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
535 if (!$nc) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
536 print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] (pair $pair1 or $pair2) not found\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
537 return(undef);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
538 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
539 else
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
540 {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
541 return($sum / $nc);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
542 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
543 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
544
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
545 sub inpDist ($$$) { #finds a distance from main (seed) ortholog to in-paralog
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
546 my($seeds, $from, $to) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
547 my($le, $ii, $pair1, $pair2, $sum, $nu, $nc);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
548
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
549 return($maxINPscore) if ($seeds->[$from]->[$place{'main'}] and !$use_bootstrap);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
550 $le = scalar(@{$seeds});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
551 $pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
552 $pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
553 $sum = $nc = $nu = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
554 for $ii(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
555 if (
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
556 ($ii->[$place{'gene'}] eq $seeds->[$from]->[$place{'gene'}]) and
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
557 (($ii->[$place{'pair'}] eq $pair1) or
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
558 ($ii->[$place{'pair'}] eq $pair2))) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
559 return($ii->[$place{'IPscore'}]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
560 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
561 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
562 print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] not found\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
563 return undef;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
564 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
565
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
566 sub sortBy ($$) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
567 my($num, $ar) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
568 my($i, $max, $maxnum, @sorted);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
569
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
570 #array2StdOut($ar, 'Before');
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
571
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
572 LINE001: while (scalar(@{$ar}) > 0) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
573 if (scalar(@{$ar}) == 1 and !$ar->[$i]->[$num]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
574 last;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
575 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
576 $max = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
577 undef $maxnum;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
578 for ($i = 0; $i < scalar(@{$ar}); $i++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
579 last if !$ar->[$i]->[$num];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
580 if ($ar->[$i]->[$num] > $max) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
581 $max = $ar->[$i]->[$num];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
582 $maxnum = $i;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
583 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
584 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
585 if (defined($maxnum)) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
586 push @sorted, splice(@{$ar}, $maxnum, 1);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
587 next LINE001;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
588 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
589 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
590 return(\@sorted);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
591 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
592
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
593 sub setUnique ($) { #removes duplicated entries from the (pre-)cluster
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
594 my($seeds) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
595 my($i, $j, $unique, $sds);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
596
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
597 $sds->[$place{'clusterID'}] = $seeds->[$place{'clusterID'}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
598 for ($i = 1; $i < scalar(@{$seeds}); $i++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
599 $unique = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
600 for ($j = 0; $j < scalar(@{$sds}); $j++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
601 if (($seeds->[$i]->[$place{'species'}] eq $sds->[$j]->[$place{'species'}]) and ($seeds->[$i]->[$place{'gene'}] eq $sds->[$j]->[$place{'gene'}]))
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
602 {$unique = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
603 last;}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
604 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
605 if ($unique) {push(@{$sds}, $seeds->[$i]);}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
606 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
607 return($sds);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
608 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
609
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
610 sub isAlready ($$) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
611 my($as, $included) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
612 my($ai);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
613
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
614 for $ai(@{$included}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
615 return(1) if ($ai->[4] eq $as);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
616 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
617 return(undef);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
618 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
619
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
620 sub alreadyClustered { #helps to avoid clustering same gene twice. Of two alternatives, the one is chosen where the gene is a main (seed) ortholog or was included first.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
621 my($g, $cco) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
622 my($ai);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
623
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
624 return(undef) if !defined($out->{'byName'}->{$g->[$place{'gene'}]}->{$cco});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
625
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
626 if ($out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}]->[3]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
627 return 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
628 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
629 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
630 undef $out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
631 return(undef);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
632 }}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
633
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
634 sub firstMains { #seed orthologs are included unconditionally
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
635 my($seeds) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
636 my($i, $le, @included);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
637
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
638 $le = scalar(@{$seeds});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
639 for ($i = 0; $i < $le; $i++) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
640 # and ($seeds->[$i]->[$place{'IPscore'}] == 1)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
641 if ($seeds->[$i]->[$place{'main'}]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
642 push @included, $seeds->[$i];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
643 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
644 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
645 return(@included);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
646 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
647
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
648 sub getOthers ($$$$) { #retains only genes with distance (well, it is in fact a SIMILARITY) above the cut-off $cco. If it is not set, or equals to 0, the procedure just returns IPscore as a reference - to be saved in the output file.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
649 my($seeds, $distances, $included, $cco) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
650 my($i, $le, $les2, $nc, $ii, $jj, $dist, $ad, $nai, @others);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
651
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
652 $le = scalar(@{$seeds});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
653 for $ii(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
654 next if isAlready($ii->[$place{'gene'}], $included);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
655 undef $dist; undef $nc;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
656 for $jj(@{$included}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
657 next if (($jj->[$place{'species'}] eq $ii->[$place{'species'}]) and !$toMainOrthologOfSelf);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
658 if (!$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
659 print "The distance between $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
660 next;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
661 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
662 if ($jj->[$place{'main'}]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
663 print "( $ii->[$place{'gene'}] - $jj->[$place{'gene'}] ) + " if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
664 $dist += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
665 $nc++;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
666 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
667 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
668 print " / $nc = ".$dist/$nc."\n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
669 return(undef) if !$nc;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
670 if ($dist / $nc >= $cco) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
671 push @others, $ii;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
672 $others[$#others]->[$place{'IPscore'}] = $dist / $nc;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
673 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
674 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
675 return(\@others);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
676 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
677
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
678 sub getClosests ($$$$) { #used only in the deprecated 'ToAllSorted' algorithm
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
679 my($seeds, $distances, $included, $cco) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
680 my($ni, $ii, $jj, $kk, $dist, $gotnew, $toadd, $nt, $max);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
681
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
682 do {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
683 undef $gotnew;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
684 for $ii(@{$seeds}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
685 next if isAlready($ii->[$place{'gene'}], $included);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
686 undef $ni; undef $dist; undef $max;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
687 for $jj(@{$included}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
688 undef $nt; undef $toadd;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
689 print "$ii->[$place{'gene'}] vs $jj->[$place{'gene'}] \n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
690 if ($ii->[$place{'species'}] ne $jj->[$place{'species'}]) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
691 $toadd = $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
692 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
693 else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
694 for $kk(keys(%{$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}})) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
695 $toadd += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}->{$kk}; $nt++;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
696 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
697 $toadd = $toadd / $nt if $nt;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
698 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
699 $dist += $toadd; $ni++ if $toadd;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
700 print "A distance for $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if (!$toadd and $debug);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
701 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
702 if ($ni and ($dist / $ni > $max->['IPscore'])) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
703 $max = $ii; $max->['IPscore'] = $dist / $ni;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
704 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
705 if ($max->['IPscore'] >= $cco) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
706 print "Included $max->[4] as $dist / $ni > $cco\n" if $debug;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
707 push(@{$included}, $max);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
708 $gotnew = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
709 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
710 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
711 } while ($gotnew);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
712 return($included);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
713 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
714
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
715 sub array2StdOut { #for debug purpose only
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
716 my($ar, $name) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
717 my($ii, $jj);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
718
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
719 print "$name: \n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
720 return(0) if !scalar(@{$ar});
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
721 for $ii(@{$ar}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
722 for $jj(@{$ii}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
723 print "$jj\t" if defined($jj);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
724 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
725 print "\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
726 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
727 return undef;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
728 }