0
|
1 #! /usr/bin/perl
|
|
2 ###############################################################################
|
|
3 # InParanoid version 4.1
|
|
4 # Copyright (C) Erik Sonnhammer, Kristoffer Forslund, Isabella Pekkari,
|
|
5 # Ann-Charlotte Berglund, Maido Remm, 2007
|
|
6 #
|
|
7 # This program is provided under the terms of a personal license to the recipient and may only
|
|
8 # be used for the recipient's own research at an academic insititution.
|
|
9 #
|
|
10 # Distribution of the results of this program must be discussed with the authors.
|
|
11 # For using this program in a company or for commercial purposes, a commercial license is required.
|
|
12 # Contact Erik.Sonnhammer@sbc.su.se in both cases
|
|
13 #
|
|
14 # Make sure that Perl XML libraries are installed!
|
|
15 #
|
|
16 # NOTE: This script requires blastall (NCBI BLAST) version 2.2.16 or higher, that supports
|
|
17 # compositional score matrix adjustment (-C2 flag).
|
|
18
|
|
19 my $usage =" Usage: inparanoid.pl <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> [FASTAFILE with sequences of species C]
|
|
20
|
|
21 ";
|
|
22
|
|
23 ###############################################################################
|
|
24 # The program calculates orthologs between 2 datasets of proteins
|
|
25 # called A and B. Both datasets should be in multi-fasta file
|
|
26 # - Additionally, it tries to assign paralogous sequences (in-paralogs) to each
|
|
27 # thus forming paralogous clusters.
|
|
28 # - Confidence of in-paralogs is calculated in relative scale between 0-100%.
|
|
29 # This confidence value is dependent on how far is given sequence from the
|
|
30 # seed ortholog of given group
|
|
31 # - Confidence of groups can be calculated with bootstrapping. This is related
|
|
32 # to score difference between best hit and second best hit.
|
|
33 # - Optionally it can use a species C as outgroup.
|
|
34
|
|
35 ###############################################################################
|
|
36 # You may need to run the following command manually to increase your
|
|
37 # default datasize limit: 'limit datasize 500000 kb'
|
|
38
|
|
39 ###############################################################################
|
|
40 # Set following variables: #
|
|
41 ###############################################################################
|
|
42
|
|
43 # What do you want the program to do? #
|
|
44 $run_blast = 1; # Set to 1 if you don't have the 4 BLAST output files #
|
|
45 # Requires 'blastall', 'formatdb' (NCBI BLAST2) #
|
|
46 # and parser 'blast_parser.pl' #
|
|
47 $blast_two_passes = 1; # Set to 1 to run 2-pass strategy #
|
|
48 # (strongly recommended, but slower) #
|
|
49 $run_inparanoid = 1;
|
|
50 $use_bootstrap = 1;# Use bootstrapping to estimate the confidence of orthologs#
|
|
51 # Needs additional programs 'seqstat.jar' and 'blast2faa.pl'
|
|
52 $use_outgroup = 0; # Use proteins from the third genome as an outgroup #
|
|
53 # Reject best-best hit if outgroup sequence is MORE #
|
|
54 # similar to one of the sequences #
|
|
55 # (by more than $outgroup_cutoff bits) #
|
|
56
|
|
57 # Define location of files and programs:
|
|
58 #$blastall = "blastall -VT"; #Remove -VT for blast version 2.2.12 or earlier
|
|
59 $blastall = "$ARGV[0] -a $ARGV[1]"; #Add -aN to use N processors
|
|
60 $formatdb = "$ARGV[2]";
|
|
61 $seqstat = "seqstat.jar";
|
|
62 $blastParser = "blast_parser.pl";
|
|
63
|
|
64 #$matrix = "BLOSUM62"; # Reasonable default for comparison of eukaryotes.
|
|
65 $matrix = "BLOSUM45"; #(for prokaryotes),
|
|
66 #$matrix = "BLOSUM80"; #(orthologs within metazoa),
|
|
67 #$matrix = "PAM70";
|
|
68 #$matrix = "PAM30";
|
|
69
|
|
70 # Output options: #
|
|
71 $output = 0; # table_stats-format output #
|
|
72 $table = 0; # Print tab-delimited table of orthologs to file "table.txt" #
|
|
73 # Each orthologous group with all inparalogs is on one line #
|
|
74 $mysql_table = 1; # Print out sql tables for the web server #
|
|
75 # Each inparalog is on separate line #
|
|
76 $html = 0; # HTML-format output #
|
|
77
|
|
78 # Algorithm parameters:
|
|
79 # Default values should work without problems.
|
|
80 # MAKE SURE, however, that the score cutoff here matches what you used for BLAST!
|
|
81 $score_cutoff = $ARGV[3]; # In bits. Any match below this is ignored #
|
|
82 $outgroup_cutoff = 50; # In bits. Outgroup sequence hit must be this many bits#
|
|
83 # stronger to reject best-best hit between A and B #
|
|
84 $conf_cutoff = 0.05; # Include in-paralogs with this confidence or better #
|
|
85 $group_overlap_cutoff = 0.5; # Merge groups if ortholog in one group has more #
|
|
86 # than this confidence in other group #
|
|
87 $grey_zone = 0; # This many bits signifies the difference between 2 scores #
|
|
88 $show_times = 0; # Show times spent for execution of each part of the program #
|
|
89 # (This does not work properly) #
|
|
90 $debug = 0; # Print debugging messages or not. Levels 0,1,2 and 4 exist #
|
|
91
|
|
92 my $seq_overlap_cutoff = $ARGV[4]; # Match area should cover at least this much of longer sequence. Match area is defined as area from start of
|
|
93 # first segment to end of last segment, i.e segments 1-10 and 90-100 gives a match length of 100.
|
|
94 my $segment_coverage_cutoff = $ARGV[5]; # Actually matching segments must cover this much of longer sequence.
|
|
95 # For example, segments 1-10 and 90-100 gives a total length of 20.
|
|
96
|
|
97 splice(@ARGV,0,6);
|
|
98
|
|
99 ###############################################################################
|
|
100 # No changes should be required below this line #
|
|
101 ###############################################################################
|
|
102 $ENV{CLASSPATH} = "./$seqstat" if ($use_bootstrap);
|
|
103
|
|
104 if (!@ARGV){
|
|
105 print STDERR $usage;
|
|
106 exit 1;
|
|
107 }
|
|
108
|
|
109 if ((@ARGV < 2) and ($run_inparanoid)){
|
|
110 print STDERR "\n When \$run_inparanoid=1, at least two distinct FASTA files have to be specified.\n";
|
|
111
|
|
112 print STDERR $usage;
|
|
113 exit 1;
|
|
114 }
|
|
115
|
|
116 if ((!$run_blast) and (!$run_inparanoid)){
|
|
117 print STDERR "run_blast or run_inparanoid has to be set!\n";
|
|
118 exit 1;
|
|
119 }
|
|
120
|
|
121 # Input files:
|
|
122 $fasta_seq_fileA = "$ARGV[0]";
|
|
123 $fasta_seq_fileB = "$ARGV[1]";
|
|
124 $fasta_seq_fileC = "$ARGV[2]" if ($use_outgroup); # This is outgroup file
|
|
125
|
|
126 my $blast_outputAB = $fasta_seq_fileA . "-" . $fasta_seq_fileB;
|
|
127 my $blast_outputBA = $fasta_seq_fileB . "-" . $fasta_seq_fileA;
|
|
128 my $blast_outputAA = $fasta_seq_fileA . "-" . $fasta_seq_fileA;
|
|
129 my $blast_outputBB = $fasta_seq_fileB . "-" . $fasta_seq_fileB;
|
|
130
|
|
131 if ($use_outgroup){
|
|
132 $blast_outputAC = $fasta_seq_fileA . "-" . $fasta_seq_fileC;
|
|
133 $blast_outputBC = $fasta_seq_fileB . "-" . $fasta_seq_fileC;
|
|
134 }
|
|
135 my %idA; # Name -> ID combinations for species 1
|
|
136 my %idB; # Name -> ID combinations for species 2
|
|
137 my @nameA; # ID -> Name combinations for species 1
|
|
138 my @nameB; # ID -> Name combinations for species 2
|
|
139 my @nameC;
|
|
140 my %scoreAB; # Hashes with pairwise BLAST scores (in bits)
|
|
141 my %scoreBA;
|
|
142 my %scoreAA;
|
|
143 my %scoreBB;
|
|
144 my @hitnAB; # 1-D arrays that keep the number of pairwise hits
|
|
145 my @hitnBA;
|
|
146 my @hitnAA;
|
|
147 my @hitnBB;
|
|
148 my @hitAB; # 2-D arrays that keep the actual matching IDs
|
|
149 my @hitBA;
|
|
150 my @hitAA;
|
|
151 my @hitBB;
|
|
152 my @besthitAB; # IDs of best hits in other species (may contain more than one ID)
|
|
153 my @besthitBA; # IDs of best hits in other species (may contain more than one ID)
|
|
154 my @bestscoreAB; # best match A -> B
|
|
155 my @bestscoreBA; # best match B -> A
|
|
156 my @ortoA; # IDs of ortholog candidates from species A
|
|
157 my @ortoB; # IDs of ortholog candidates from species B
|
|
158 my @ortoS; # Scores between ortoA and ortoB pairs
|
|
159 my @paralogsA; # List of paralog IDs in given cluster
|
|
160 my @paralogsB; # List of paralog IDs in given cluster
|
|
161 my @confPA; # Confidence values for A paralogs
|
|
162 my @confPB; # Confidence values for B paralogs
|
|
163 my @confA; # Confidence values for orthologous groups
|
|
164 my @confB; # Confidence values for orthologous groups
|
|
165 my $prev_time = 0;
|
|
166
|
|
167 $outputfile = "Output." . $ARGV[0] . "-" . $ARGV[1];
|
|
168 if ($output){
|
|
169 open OUTPUT, ">$outputfile" or warn "Could not write to OUTPUT file $filename\n";
|
|
170 }
|
|
171
|
|
172 #################################################
|
|
173 # Assign ID numbers for species A
|
|
174 #################################################
|
|
175 open A, "$fasta_seq_fileA" or die "File A with sequences in FASTA format is missing
|
|
176 Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n";
|
|
177 $id = 0;
|
|
178 while (<A>){
|
|
179 if(/^\>/){
|
|
180 ++$id;
|
|
181 chomp;
|
|
182 s/\>//;
|
|
183 @tmp = split(/\s+/);
|
|
184 #$name = substr($tmp[0],0,25);
|
|
185 $name = $tmp[0];
|
|
186 $idA{$name} = int($id);
|
|
187 $nameA[$id] = $name;
|
|
188 }
|
|
189 }
|
|
190 close A;
|
|
191 $A = $id;
|
|
192 print "$A sequences in file $fasta_seq_fileA\n";
|
|
193
|
|
194 if ($output){
|
|
195 print OUTPUT "$A sequences in file $fasta_seq_fileA\n";
|
|
196 }
|
|
197
|
|
198 if (@ARGV >= 2) {
|
|
199 #################################################
|
|
200 # Assign ID numbers for species B
|
|
201 #################################################
|
|
202 open B, "$fasta_seq_fileB" or die "File B with sequences in FASTA format is missing
|
|
203 Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n";
|
|
204 $id = 0;
|
|
205 while (<B>){
|
|
206 if(/^\>/){
|
|
207 ++$id;
|
|
208 chomp;
|
|
209 s/\>//;
|
|
210 @tmp = split(/\s+/);
|
|
211 #$name = substr($tmp[0],0,25);
|
|
212 $name = $tmp[0];
|
|
213 $idB{$name} = int($id);
|
|
214 $nameB[$id] = $name;
|
|
215 }
|
|
216 }
|
|
217 $B = $id;
|
|
218 print "$B sequences in file $fasta_seq_fileB\n";
|
|
219 close B;
|
|
220
|
|
221 if ($output){
|
|
222 print OUTPUT "$B sequences in file $fasta_seq_fileB\n";
|
|
223 }
|
|
224 }
|
|
225 #################################################
|
|
226 # Assign ID numbers for species C (outgroup)
|
|
227 #################################################
|
|
228 if ($use_outgroup){
|
|
229 open C, "$fasta_seq_fileC" or die "File C with sequences in FASTA format is missing
|
|
230 Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n";
|
|
231 $id = 0;
|
|
232 while (<C>){
|
|
233 if(/^\>/){
|
|
234 ++$id;
|
|
235 chomp;
|
|
236 s/\>//;
|
|
237 @tmp = split(/\s+/);
|
|
238 #$name = substr($tmp[0],0,25);
|
|
239 $name = $tmp[0];
|
|
240 $idC{$name} = int($id);
|
|
241 $nameC[$id] = $name;
|
|
242 }
|
|
243 }
|
|
244 $C = $id;
|
|
245 print "$C sequences in file $fasta_seq_fileC\n";
|
|
246 close C;
|
|
247 if ($output){
|
|
248 print OUTPUT "$C sequences in file $fasta_seq_fileC\n";
|
|
249 }
|
|
250 }
|
|
251 if ($show_times){
|
|
252 ($user_time,,,) = times;
|
|
253 printf ("Indexing sequences took %.2f seconds\n", ($user_time - $prev_time));
|
|
254 $prev_time = $user_time;
|
|
255 }
|
|
256
|
|
257 #################################################
|
|
258 # Run BLAST if not done already
|
|
259 #################################################
|
|
260 if ($run_blast){
|
|
261 print "Trying to run BLAST now - this may take several hours ... or days in worst case!\n";
|
|
262 print STDERR "Formatting BLAST databases\n";
|
|
263 system ("$formatdb -i $fasta_seq_fileA");
|
|
264 system ("$formatdb -i $fasta_seq_fileB") if (@ARGV >= 2);
|
|
265 system ("$formatdb -i $fasta_seq_fileC") if ($use_outgroup);
|
|
266 print STDERR "Done formatting\nStarting BLAST searches...\n";
|
|
267
|
|
268 # Run blast only if the files do not already exist is not default.
|
|
269 # NOTE: you should have done this beforehand, because you probably
|
|
270 # want two-pass blasting anyway which is not implemented here
|
|
271 # this is also not adapted to use specific compositional adjustment settings
|
|
272 # and might not use the proper blast parser...
|
|
273
|
|
274 do_blast ($fasta_seq_fileA, $fasta_seq_fileA, $A, $A, $blast_outputAA);
|
|
275
|
|
276 if (@ARGV >= 2) {
|
|
277 do_blast ($fasta_seq_fileA, $fasta_seq_fileB, $B, $B, $blast_outputAB);
|
|
278 do_blast ($fasta_seq_fileB, $fasta_seq_fileA, $A, $A, $blast_outputBA);
|
|
279 do_blast ($fasta_seq_fileB, $fasta_seq_fileB, $B, $B, $blast_outputBB);
|
|
280 }
|
|
281
|
|
282 if ($use_outgroup){
|
|
283
|
|
284 do_blast ($fasta_seq_fileA, $fasta_seq_fileC, $A, $C, $blast_outputAC);
|
|
285 do_blast ($fasta_seq_fileB, $fasta_seq_fileC, $B, $C, $blast_outputBC);
|
|
286 }
|
|
287
|
|
288 if ($show_times){
|
|
289 ($user_time,,,) = times;
|
|
290 printf ("BLAST searches took %.2f seconds\n", ($user_time - $prev_time));
|
|
291 $prev_time = $user_time;
|
|
292 }
|
|
293 print STDERR "Done BLAST searches. ";
|
|
294
|
|
295 } else {
|
|
296 print STDERR "Skipping blast! \n";
|
|
297 }
|
|
298
|
|
299 if ($run_inparanoid){
|
|
300 print STDERR "Starting ortholog detection...\n";
|
|
301 #################################################
|
|
302 # Read in best hits from blast output file AB
|
|
303 #################################################
|
|
304 $count = 0;
|
|
305 open AB, "$blast_outputAB" or die "Blast output file A->B is missing\n";
|
|
306 $old_idQ = 0;
|
|
307 while (<AB>){
|
|
308 chomp;
|
|
309 @Fld = split(/\s+/); # Get query, match and score
|
|
310
|
|
311 if( scalar @Fld < 9 ){
|
|
312 if($Fld[0]=~/done/){
|
|
313 print STDERR "AB ok\n";
|
|
314 }
|
|
315 next;
|
|
316 }
|
|
317
|
|
318 $q = $Fld[0];
|
|
319 $m = $Fld[1];
|
|
320 $idQ = $idA{$q}; # ID of query sequence
|
|
321 $idM = $idB{$m}; # ID of match sequence
|
|
322 $score = $Fld[2];
|
|
323
|
|
324 next if (!overlap_test(@Fld));
|
|
325
|
|
326 # Score must be equal to or above cut-off
|
|
327 next if ($score < $score_cutoff);
|
|
328
|
|
329 if(!$count || $q ne $oldq){
|
|
330 print "Match $m, score $score, ID for $q is missing\n" if ($debug == 2 and !(exists($idA{$q})));
|
|
331 $hitnAB[$idA{$oldq}] = $hit if($count); # Record number of hits for previous query
|
|
332 $hit = 0;
|
|
333 ++$count;
|
|
334 $oldq = $q;
|
|
335 }
|
|
336 ++$hit;
|
|
337 $hitAB[$idQ][$hit] = int($idM);
|
|
338 # printf ("hitAB[%d][%d] = %d\n",$idQ,$hit,$idM);
|
|
339 $scoreAB{"$idQ:$idM"} = $score;
|
|
340 $scoreBA{"$idM:$idQ"} = $score_cutoff; # Initialize mutual hit score - sometimes this is below score_cutoff
|
|
341 $old_idQ = $idQ;
|
|
342 # }
|
|
343 }
|
|
344 $hitnAB[$idQ] = $hit; # For the last query
|
|
345 #printf ("hitnAB[1] = %d\n",$hitnAB[1]);
|
|
346 #printf ("hitnAB[%d] = %d\n",$idQ,$hit);
|
|
347 close AB;
|
|
348 if ($output){
|
|
349 print OUTPUT "$count sequences $fasta_seq_fileA have homologs in dataset $fasta_seq_fileB\n";
|
|
350 }
|
|
351 #################################################
|
|
352 # Read in best hits from blast output file BA
|
|
353 #################################################
|
|
354 $count = 0;
|
|
355 open BA, "$blast_outputBA" or die "Blast output file B->A is missing\n";
|
|
356 $old_idQ = 0;
|
|
357 while (<BA>){
|
|
358 chomp;
|
|
359 @Fld = split(/\s+/); # Get query, match and score
|
|
360
|
|
361 if( scalar @Fld < 9 ){
|
|
362 if($Fld[0]=~/done/){
|
|
363 print STDERR "BA ok\n";
|
|
364 }
|
|
365 next;
|
|
366 }
|
|
367
|
|
368 $q = $Fld[0];
|
|
369 $m = $Fld[1];
|
|
370 $idQ = $idB{$q};
|
|
371 $idM = $idA{$m};
|
|
372 $score = $Fld[2];
|
|
373
|
|
374 next if (!overlap_test(@Fld));
|
|
375
|
|
376 next if ($score < $score_cutoff);
|
|
377
|
|
378 if(!$count || $q ne $oldq){
|
|
379 print "ID for $q is missing\n" if ($debug == 2 and (!exists($idB{$q})));
|
|
380 $hitnBA[$idB{$oldq}] = $hit if($count); # Record number of hits for previous query
|
|
381 $hit = 0;
|
|
382 ++$count;
|
|
383 $oldq = $q;
|
|
384 }
|
|
385 ++$hit;
|
|
386 $hitBA[$idQ][$hit] = int($idM);
|
|
387 # printf ("hitBA[%d][%d] = %d\n",$idQ,$hit,$idM);
|
|
388 $scoreBA{"$idQ:$idM"} = $score;
|
|
389 $scoreAB{"$idM:$idQ"} = $score_cutoff if ($scoreAB{"$idM:$idQ"} < $score_cutoff); # Initialize missing scores
|
|
390 $old_idQ = $idQ;
|
|
391 # }
|
|
392 }
|
|
393 $hitnBA[$idQ] = $hit; # For the last query
|
|
394 #printf ("hitnBA[%d] = %d\n",$idQ,$hit);
|
|
395 close BA;
|
|
396 if ($output){
|
|
397 print OUTPUT "$count sequences $fasta_seq_fileB have homologs in dataset $fasta_seq_fileA\n";
|
|
398 }
|
|
399 ##################### Equalize AB scores and BA scores ##########################
|
|
400
|
|
401 ###################################################################################################################################### Modification by Isabella 1
|
|
402
|
|
403 # I removed the time consuming all vs all search and equalize scores for all pairs where there was a hit
|
|
404
|
|
405 foreach my $key (keys %scoreAB) {
|
|
406
|
|
407 my ($a, $b) = split(':', $key);
|
|
408 my $key2 = $b . ':' . $a;
|
|
409
|
|
410 # If debugg mod is 5 and the scores A-B and B-A are unequal
|
|
411 # the names of the two sequences and their scores are printed
|
|
412 if ($scoreAB{$key} != $scoreBA{$key2}){
|
|
413 printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{$key}, $scoreBA{$key2}) if ($debug == 5);
|
|
414 }
|
|
415
|
|
416 # Set score AB and score BA to the mean of scores AB and BA.
|
|
417 # The final score is saved as an integer so .5 needs to be added to avoid rounding errors
|
|
418 $scoreAB{$key} = $scoreBA{$key2} = int(($scoreAB{$key} + $scoreBA{$key2})/2.0 +.5);
|
|
419 }
|
|
420
|
|
421 # For all ids for sequences from organism A
|
|
422 #for $a(1..$A){
|
|
423 #For all ids for sequences from organism B
|
|
424 #for $b(1..$B){
|
|
425
|
|
426 # No need to equalize score if there was no match between sequence with id $a from species A
|
|
427 # and sequence with id $b from species B
|
|
428 # next if (!$scoreAB{"$a:$b"});
|
|
429
|
|
430 # If debugg mod is 5 and the scores A-B and B-A are unequal
|
|
431 # the names of the two sequences and their scores are printed
|
|
432 # if ($scoreAB{"$a:$b"} != $scoreBA{"$b:$a"}){
|
|
433 # printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{"$a:$b"}, $scoreBA{"$b:$a"}) if ($debug == 5);
|
|
434 # }
|
|
435
|
|
436 # Set score AB and score BA to the mean of scores AB and BA.
|
|
437 # The final score is saved as an integer so .5 needs to be added to avoid rounding errors
|
|
438 # $scoreAB{"$a:$b"} = $scoreBA{"$b:$a"} = int(($scoreAB{"$a:$b"} + $scoreBA{"$b:$a"})/2.0 +.5);
|
|
439
|
|
440 # printf ("scoreAB{%d: %d} = %d\n", $a, $b, $scoreAB{"$a:$b"});
|
|
441 # printf ("scoreBA{%d: %d} = %d\n", $b, $a, $scoreBA{"$a:$b"});
|
|
442 #}
|
|
443 # }
|
|
444
|
|
445 ####################################################################################################################################### End modification by Isabella 1
|
|
446
|
|
447 ##################### Re-sort hits, besthits and bestscore #######################
|
|
448 for $idA(1..$A){
|
|
449 # print "Loop index = $idA\n";
|
|
450 # printf ("hitnAB[%d] = %d\n",$idA, $hitnAB[$idA]);
|
|
451 next if (!($hitnAB[$idA]));
|
|
452 for $hit (1..($hitnAB[$idA]-1)){ # Sort hits by score
|
|
453 while($scoreAB{"$idA:$hitAB[$idA][$hit]"} < $scoreAB{"$idA:$hitAB[$idA][$hit+1]"}){
|
|
454 $tmp = $hitAB[$idA][$hit];
|
|
455 $hitAB[$idA][$hit] = $hitAB[$idA][$hit+1];
|
|
456 $hitAB[$idA][$hit+1] = $tmp;
|
|
457 --$hit if ($hit > 1);
|
|
458 }
|
|
459 }
|
|
460 $bestscore = $bestscoreAB[$idA] = $scoreAB{"$idA:$hitAB[$idA][1]"};
|
|
461 $besthitAB[$idA] = $hitAB[$idA][1];
|
|
462 for $hit (2..$hitnAB[$idA]){
|
|
463 if ($bestscore - $scoreAB{"$idA:$hitAB[$idA][$hit]"} <= $grey_zone){
|
|
464 $besthitAB[$idA] .= " $hitAB[$idA][$hit]";
|
|
465 }
|
|
466 else {
|
|
467 last;
|
|
468 }
|
|
469 }
|
|
470 undef $is_besthitAB[$idA]; # Create index that we can check later
|
|
471 grep (vec($is_besthitAB[$idA],$_,1) = 1, split(/ /,$besthitAB[$idA]));
|
|
472 # printf ("besthitAB[%d] = hitAB[%d][%d] = %d\n",$idA,$idA,$hit,$besthitAB[$idA]);
|
|
473
|
|
474 }
|
|
475
|
|
476 for $idB(1..$B){
|
|
477 # print "Loop index = $idB\n";
|
|
478 next if (!($hitnBA[$idB]));
|
|
479 for $hit (1..($hitnBA[$idB]-1)){
|
|
480 # Sort hits by score
|
|
481 while($scoreBA{"$idB:$hitBA[$idB][$hit]"} < $scoreBA{"$idB:$hitBA[$idB][$hit+1]"}){
|
|
482 $tmp = $hitBA[$idB][$hit];
|
|
483 $hitBA[$idB][$hit] = $hitBA[$idB][$hit+1];
|
|
484 $hitBA[$idB][$hit+1] = $tmp;
|
|
485 --$hit if ($hit > 1);
|
|
486 }
|
|
487 }
|
|
488 $bestscore = $bestscoreBA[$idB] = $scoreBA{"$idB:$hitBA[$idB][1]"};
|
|
489 $besthitBA[$idB] = $hitBA[$idB][1];
|
|
490 for $hit (2..$hitnBA[$idB]){
|
|
491 if ($bestscore - $scoreBA{"$idB:$hitBA[$idB][$hit]"} <= $grey_zone){
|
|
492 $besthitBA[$idB] .= " $hitBA[$idB][$hit]";
|
|
493 }
|
|
494 else {last;}
|
|
495 }
|
|
496 undef $is_besthitBA[$idB]; # Create index that we can check later
|
|
497 grep (vec($is_besthitBA[$idB],$_,1) = 1, split(/ /,$besthitBA[$idB]));
|
|
498 # printf ("besthitBA[%d] = %d\n",$idA,$besthitAB[$idA]);
|
|
499 }
|
|
500
|
|
501 if ($show_times){
|
|
502 ($user_time,,,) = times;
|
|
503 printf ("Reading and sorting homologs took %.2f seconds\n", ($user_time - $prev_time));
|
|
504 $prev_time = $user_time;
|
|
505 }
|
|
506
|
|
507 ######################################################
|
|
508 # Now find orthologs:
|
|
509 ######################################################
|
|
510 $o = 0;
|
|
511
|
|
512 for $i(1..$A){ # For each ID in file A
|
|
513 if (defined $besthitAB[$i]){
|
|
514 @besthits = split(/ /,$besthitAB[$i]);
|
|
515 for $hit (@besthits){
|
|
516 if (vec($is_besthitBA[$hit],$i,1)){
|
|
517 ++$o;
|
|
518 $ortoA[$o] = $i;
|
|
519 $ortoB[$o] = $hit;
|
|
520 $ortoS[$o] = $scoreAB{"$i:$hit"}; # Should be equal both ways
|
|
521 # --$o if ($ortoS[$o] == $score_cutoff); # Ignore orthologs that are exactly at score_cutoff
|
|
522 print "Accept! " if ($debug == 2);
|
|
523 }
|
|
524 else {print " " if ($debug == 2);}
|
|
525 printf ("%-20s\t%d\t%-20s\t", $nameA[$i], $bestscoreAB[$i], $nameB[$hit]) if ($debug == 2);
|
|
526 print "$bestscoreBA[$hit]\t$besthitBA[$hit]\n" if ($debug == 2);
|
|
527 }
|
|
528 }
|
|
529 }
|
|
530 print "$o ortholog candidates detected\n" if ($debug);
|
|
531 #####################################################
|
|
532 # Sort orthologs by ID and then by score:
|
|
533 #####################################################
|
|
534
|
|
535 ####################################################################################################### Modification by Isabella 2
|
|
536
|
|
537 # Removed time consuiming bubble sort. Created an index array and sort that according to id and score.
|
|
538 # The put all clusters on the right place.
|
|
539
|
|
540 # Create an array used to store the position each element shall have in the final array
|
|
541 # The elements are initialized with the position numbers
|
|
542 my @position_index_array = (1..$o);
|
|
543
|
|
544 # Sort the position list according to id
|
|
545 my @id_sorted_position_list = sort { ($ortoA[$a]+$ortoB[$a]) <=> ($ortoA[$b] + $ortoB[$b]) } @position_index_array;
|
|
546
|
|
547 # Sort the list according to score
|
|
548 my @score_id_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @id_sorted_position_list;
|
|
549
|
|
550 # Create new arrays for the sorted information
|
|
551 my @new_ortoA;
|
|
552 my @new_ortoB;
|
|
553 my @new_orthoS;
|
|
554
|
|
555 # Add the information to the new arrays in the orer specifeid by the index array
|
|
556 for (my $index_in_list = 0; $index_in_list < scalar @score_id_sorted_position_list; $index_in_list++) {
|
|
557
|
|
558
|
|
559 my $old_index = $score_id_sorted_position_list[$index_in_list];
|
|
560 $new_ortoA[$index_in_list + 1] = $ortoA[$old_index];
|
|
561 $new_ortoB[$index_in_list + 1] = $ortoB[$old_index];
|
|
562 $new_ortoS[$index_in_list + 1] = $ortoS[$old_index];
|
|
563 }
|
|
564
|
|
565 @ortoA = @new_ortoA;
|
|
566 @ortoB = @new_ortoB;
|
|
567 @ortoS = @new_ortoS;
|
|
568
|
|
569 # Use bubblesort to sort ortholog pairs by id
|
|
570 # for $i(1..($o-1)){
|
|
571 # while(($ortoA[$i]+$ortoB[$i]) > ($ortoA[$i+1] + $ortoB[$i+1])){
|
|
572 # $tempA = $ortoA[$i];
|
|
573 # $tempB = $ortoB[$i];
|
|
574 # $tempS = $ortoS[$i];
|
|
575 #
|
|
576 # $ortoA[$i] = $ortoA[$i+1];
|
|
577 # $ortoB[$i] = $ortoB[$i+1];
|
|
578 # $ortoS[$i] = $ortoS[$i+1];
|
|
579 #
|
|
580 # $ortoA[$i+1] = $tempA;
|
|
581 # $ortoB[$i+1] = $tempB;
|
|
582 # $ortoS[$i+1] = $tempS;
|
|
583 #
|
|
584 # --$i if ($i > 1);
|
|
585 # }
|
|
586 # }
|
|
587 #
|
|
588 # # Use bubblesort to sort ortholog pairs by score
|
|
589 # for $i(1..($o-1)){
|
|
590 # while($ortoS[$i] < $ortoS[$i+1]){
|
|
591 # # Swap places:
|
|
592 # $tempA = $ortoA[$i];
|
|
593 # $tempB = $ortoB[$i];
|
|
594 # $tempS = $ortoS[$i];
|
|
595 #
|
|
596 # $ortoA[$i] = $ortoA[$i+1];
|
|
597 # $ortoB[$i] = $ortoB[$i+1];
|
|
598 # $ortoS[$i] = $ortoS[$i+1];
|
|
599 #
|
|
600 # $ortoA[$i+1] = $tempA;
|
|
601 # $ortoB[$i+1] = $tempB;
|
|
602 # $ortoS[$i+1] = $tempS;
|
|
603 #
|
|
604 # --$i if ($i > 1);
|
|
605 # }
|
|
606 # }
|
|
607
|
|
608 ###################################################################################################### End modification bt Isabella 2
|
|
609
|
|
610 @all_ortologsA = ();
|
|
611 @all_ortologsB = ();
|
|
612 for $i(1..$o){
|
|
613 push(@all_ortologsA,$ortoA[$i]); # List of all orthologs
|
|
614 push(@all_ortologsB,$ortoB[$i]);
|
|
615 }
|
|
616 undef $is_ortologA; # Create index that we can check later
|
|
617 undef $is_ortologB;
|
|
618 grep (vec($is_ortologA,$_,1) = 1, @all_ortologsA);
|
|
619 grep (vec($is_ortologB,$_,1) = 1, @all_ortologsB);
|
|
620
|
|
621 if ($show_times){
|
|
622 ($user_time,,,) = times;
|
|
623 printf ("Finding and sorting orthologs took %.2f seconds\n", ($user_time - $prev_time));
|
|
624 $prev_time = $user_time;
|
|
625 }
|
|
626 #################################################
|
|
627 # Read in best hits from blast output file AC
|
|
628 #################################################
|
|
629 if ($use_outgroup){
|
|
630 $count = 0;
|
|
631 open AC, "$blast_outputAC" or die "Blast output file A->C is missing\n";
|
|
632 while (<AC>){
|
|
633 chomp;
|
|
634 @Fld = split(/\s+/); # Get query, match and score
|
|
635
|
|
636 if( scalar @Fld < 9 ){
|
|
637 if($Fld[0]=~/done/){
|
|
638 print STDERR "AC ok\n";
|
|
639 }
|
|
640 next;
|
|
641 }
|
|
642
|
|
643 $q = $Fld[0];
|
|
644 $m = $Fld[1];
|
|
645 $idQ = $idA{$q};
|
|
646 $idM = $idC{$m};
|
|
647 $score = $Fld[2];
|
|
648 next unless (vec($is_ortologA,$idQ,1));
|
|
649
|
|
650 next if (!overlap_test(@Fld));
|
|
651
|
|
652 next if ($score < $score_cutoff);
|
|
653
|
|
654 next if ($count and ($q eq $oldq));
|
|
655 # Only comes here if this is the best hit:
|
|
656 $besthitAC[$idQ] = $idM;
|
|
657 $bestscoreAC[$idQ] = $score;
|
|
658 $oldq = $q;
|
|
659 ++$count;
|
|
660 }
|
|
661 close AC;
|
|
662 #################################################
|
|
663 # Read in best hits from blast output file BC
|
|
664 #################################################
|
|
665 $count = 0;
|
|
666 open BC, "$blast_outputBC" or die "Blast output file B->C is missing\n";
|
|
667 while (<BC>){
|
|
668 chomp;
|
|
669 @Fld = split(/\s+/); # Get query, match and score
|
|
670
|
|
671 if( scalar @Fld < 9 ){
|
|
672 if($Fld[0]=~/done/){
|
|
673 print STDERR "BC ok\n";
|
|
674 }
|
|
675 next;
|
|
676 }
|
|
677
|
|
678 $q = $Fld[0];
|
|
679 $m = $Fld[1];
|
|
680 $idQ = $idB{$q};
|
|
681 $idM = $idC{$m};
|
|
682 $score = $Fld[2];
|
|
683 next unless (vec($is_ortologB,$idQ,1));
|
|
684
|
|
685 next if (!overlap_test(@Fld));
|
|
686
|
|
687 next if ($score < $score_cutoff);
|
|
688
|
|
689 next if ($count and ($q eq $oldq));
|
|
690 # Only comes here if this is the best hit:
|
|
691 $besthitBC[$idQ] = $idM;
|
|
692 $bestscoreBC[$idQ] = $score;
|
|
693 $oldq = $q;
|
|
694 ++$count;
|
|
695 }
|
|
696 close BC;
|
|
697 ################################
|
|
698 # Detect rooting problems
|
|
699 ################################
|
|
700 $rejected = 0;
|
|
701 @del = ();
|
|
702 $file = "rejected_sequences." . $fasta_seq_fileC;
|
|
703 open OUTGR, ">$file";
|
|
704 for $i (1..$o){
|
|
705 $diff1 = $diff2 = 0;
|
|
706 $idA = $ortoA[$i];
|
|
707 $idB = $ortoB[$i];
|
|
708 $diff1 = $bestscoreAC[$idA] - $ortoS[$i];
|
|
709 $diff2 = $bestscoreBC[$idB] - $ortoS[$i];
|
|
710 if ($diff1 > $outgroup_cutoff){
|
|
711 print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]).
|
|
712 $nameA[$idA] from $fasta_seq_fileA is closer to $nameC[$besthitAC[$idA]] than to $nameB[$idB]\n";
|
|
713 print OUTGR " $ortoS[$i] < $bestscoreAC[$idA] by $diff1\n";
|
|
714 }
|
|
715 if ($diff2 > $outgroup_cutoff){
|
|
716 print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]).
|
|
717 $nameB[$idB] from $fasta_seq_fileB is closer to $nameC[$besthitBC[$idB]] than to $nameA[$idA]\n";
|
|
718 print OUTGR " $ortoS[$i] < $bestscoreBC[$idB] by $diff2\n";
|
|
719 }
|
|
720 if (($diff1 > $outgroup_cutoff) or ($diff2 > $outgroup_cutoff)){
|
|
721 ++$rejected;
|
|
722 $del[$i] = 1;
|
|
723 }
|
|
724 }
|
|
725 print "Number of rejected groups: $rejected (outgroup sequence was closer by more than $outgroup_cutoff bits)\n";
|
|
726 close OUTGR;
|
|
727 } # End of $use_outgroup
|
|
728 ################################
|
|
729 # Read inside scores from AA
|
|
730 ################################
|
|
731 $count = 0;
|
|
732 $max_hit = 0;
|
|
733 open AA, "$blast_outputAA" or die "Blast output file A->A is missing\n";
|
|
734 while (<AA>) {
|
|
735 chomp; # strip newline
|
|
736
|
|
737 @Fld = split(/\s+/); # Get query and match names
|
|
738
|
|
739 if( scalar @Fld < 9 ){
|
|
740 if($Fld[0]=~/done/){
|
|
741 print STDERR "AA ok\n";
|
|
742 }
|
|
743 next;
|
|
744 }
|
|
745
|
|
746 $q = $Fld[0];
|
|
747 $m = $Fld[1];
|
|
748 $score = $Fld[2];
|
|
749 next unless (vec($is_ortologA,$idA{$q},1));
|
|
750
|
|
751 next if (!overlap_test(@Fld));
|
|
752
|
|
753 next if ($score < $score_cutoff);
|
|
754
|
|
755 if(!$count || $q ne $oldq){ # New query
|
|
756 $max_hit = $hit if ($hit > $max_hit);
|
|
757 $hit = 0;
|
|
758 $oldq = $q;
|
|
759 }
|
|
760 ++$hit;
|
|
761 ++$count;
|
|
762 $scoreAA{"$idA{$q}:$idA{$m}"} = int($score + 0.5);
|
|
763 $hitAA[$idA{$q}][$hit] = int($idA{$m});
|
|
764 $hitnAA[$idA{$q}] = $hit;
|
|
765 }
|
|
766 close AA;
|
|
767 if ($output){
|
|
768 print OUTPUT "$count $fasta_seq_fileA-$fasta_seq_fileA matches\n";
|
|
769 }
|
|
770 ################################
|
|
771 # Read inside scores from BB
|
|
772 ################################
|
|
773 $count = 0;
|
|
774 open BB, "$blast_outputBB" or die "Blast output file B->B is missing\n";
|
|
775 while (<BB>) {
|
|
776 chomp; # strip newline
|
|
777
|
|
778 @Fld = split(/\s+/); # Get query and match names
|
|
779
|
|
780 if( scalar @Fld < 9 ){
|
|
781 if($Fld[0]=~/done/){
|
|
782 print STDERR "BB ok\n";
|
|
783 }
|
|
784 next;
|
|
785 }
|
|
786
|
|
787 $q = $Fld[0];
|
|
788 $m = $Fld[1];
|
|
789 $score = $Fld[2];
|
|
790 next unless (vec($is_ortologB,$idB{$q},1));
|
|
791
|
|
792 next if (!overlap_test(@Fld));
|
|
793
|
|
794 next if ($score < $score_cutoff);
|
|
795
|
|
796 if(!$count || $q ne $oldq){ # New query
|
|
797 $max_hit = $hit if ($hit > $max_hit);
|
|
798 $oldq = $q;
|
|
799 $hit = 0;
|
|
800 }
|
|
801 ++$count;
|
|
802 ++$hit;
|
|
803 $scoreBB{"$idB{$q}:$idB{$m}"} = int($score + 0.5);
|
|
804 $hitBB[$idB{$q}][$hit] = int($idB{$m});
|
|
805 $hitnBB[$idB{$q}] = $hit;
|
|
806 }
|
|
807 close BB;
|
|
808 if ($output){
|
|
809 print OUTPUT "$count $fasta_seq_fileB-$fasta_seq_fileB matches\n";
|
|
810 }
|
|
811 if ($show_times){
|
|
812 ($user_time,,,) = times;
|
|
813 printf ("Reading paralogous hits took %.2f seconds\n", ($user_time - $prev_time));
|
|
814 $prev_time = $user_time;
|
|
815 }
|
|
816 print "Maximum number of hits per sequence was $max_hit\n" if ($debug);
|
|
817 #####################################################
|
|
818 # Find paralogs:
|
|
819 #####################################################
|
|
820 for $i(1..$o){
|
|
821 $merge[$i] = 0;
|
|
822 next if($del[$i]); # If outgroup species was closer to one of the seed orthologs
|
|
823 $idA = $ortoA[$i];
|
|
824 $idB = $ortoB[$i];
|
|
825 local @membersA = ();
|
|
826 local @membersB = ();
|
|
827
|
|
828 undef $is_paralogA[$i];
|
|
829 undef $is_paralogB[$i];
|
|
830
|
|
831 print "$i: Ortholog pair $nameA[$idA] and $nameB[$idB]. $hitnAA[$idA] hits for A and $hitnBB[$idB] hits for B\n" if ($debug);
|
|
832 # Check if current ortholog is already clustered:
|
|
833 for $j(1..($i-1)){
|
|
834 # Overlap type 1: Both orthologs already clustered here -> merge
|
|
835 if ((vec($is_paralogA[$j],$idA,1)) and (vec($is_paralogB[$j],$idB,1))){
|
|
836 $merge[$i] = $j;
|
|
837 print "Merge CASE 1: group $i ($nameB[$idB]-$nameA[$idA]) and $j ($nameB[$ortoB[$j]]-$nameA[$ortoA[$j]])\n" if ($debug);
|
|
838 last;
|
|
839 }
|
|
840 # Overlap type 2: 2 competing ortholog pairs -> merge
|
|
841 elsif (($ortoS[$j] - $ortoS[$i] <= $grey_zone)
|
|
842 and (($ortoA[$j] == $ortoA[$i]) or ($ortoB[$j] == $ortoB[$i]))
|
|
843 # and ($paralogsA[$j])
|
|
844 ){ # The last condition is false if the previous cluster has been already deleted
|
|
845 $merge[$i] = $j;
|
|
846 print "Merge CASE 2: group $i ($nameA[$ortoA[$i]]-$nameB[$ortoB[$i]]) and $j ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug);
|
|
847 last;
|
|
848 }
|
|
849 # Overlap type 3: DELETE One of the orthologs belongs to some much stronger cluster -> delete
|
|
850 elsif (((vec($is_paralogA[$j],$idA,1)) or (vec($is_paralogB[$j],$idB,1))) and
|
|
851 ($ortoS[$j] - $ortoS[$i] > $score_cutoff)){
|
|
852 print "Delete CASE 3: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug);
|
|
853 $merge[$i]= -1; # Means - do not add sequences to this cluster
|
|
854 $paralogsA[$i] = "";
|
|
855 $paralogsB[$i] = "";
|
|
856 last;
|
|
857 }
|
|
858 # Overlap type 4: One of the orthologs is close to the center of other cluster
|
|
859 elsif (((vec($is_paralogA[$j],$idA,1)) and ($confPA[$idA] > $group_overlap_cutoff)) or
|
|
860 ((vec($is_paralogB[$j],$idB,1)) and ($confPB[$idB] > $group_overlap_cutoff))){
|
|
861 print "Merge CASE 4: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug);
|
|
862 $merge[$i] = $j;
|
|
863 last;
|
|
864 }
|
|
865 # Overlap type 5:
|
|
866 # All clusters that were overlapping, but not catched by previous "if" statements will be DIVIDED!
|
|
867 }
|
|
868 next if ($merge[$i] < 0); # This cluster should be deleted
|
|
869 ##### Check for paralogs in A
|
|
870 $N = $hitnAA[$idA];
|
|
871 for $j(1..$N){
|
|
872 $hitID = $hitAA[$idA][$j]; # hit of idA
|
|
873 # print "Working with $nameA[$hitID]\n" if ($debug == 2);
|
|
874 # Decide whether this hit is inside the paralog circle:
|
|
875 if ( ($idA == $hitID) or ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$idA]) and
|
|
876 ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$hitID])){
|
|
877 if ($debug == 2){
|
|
878 print " Paralog candidates: ";
|
|
879 printf ("%-20s: %-20s", $nameA[$idA], $nameA[$hitID]);
|
|
880 print "\t$scoreAA{\"$idA:$hitID\"} : $bestscoreAB[$idA] : $bestscoreAB[$hitID]\n";
|
|
881 }
|
|
882 $paralogs = 1;
|
|
883 if ($scoreAA{"$idA:$idA"} == $ortoS[$i]){
|
|
884 if ($scoreAA{"$idA:$hitID"} == $scoreAA{"$idA:$idA"}){
|
|
885 $conf_here = 1.0; # In the center
|
|
886 }
|
|
887 else{
|
|
888 $conf_here = 0.0; # On the border
|
|
889 }
|
|
890 }
|
|
891 else {
|
|
892 $conf_here = ($scoreAA{"$idA:$hitID"} - $ortoS[$i]) /
|
|
893 ($scoreAA{"$idA:$idA"} - $ortoS[$i]);
|
|
894 }
|
|
895 # Check if this paralog candidate is already clustered in other clusters
|
|
896 for $k(1..($i-1)){
|
|
897 if (vec($is_paralogA[$k],$hitID,1)){ # Yes, found in cluster $k
|
|
898 if($debug == 2){
|
|
899 print " $nameA[$hitID] is already in cluster $k, together with:";
|
|
900 print " $nameA[$ortoA[$k]] and $nameB[$ortoB[$k]] ";
|
|
901 print "($scoreAA{\"$ortoA[$k]:$hitID\"})";
|
|
902 }
|
|
903 if (($confPA[$hitID] >= $conf_here) and
|
|
904 ($j != 1)){ # The seed ortholog CAN NOT remain there
|
|
905 print " and remains there.\n" if ($debug == 2);
|
|
906 $paralogs = 0; # No action
|
|
907 }
|
|
908 else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k
|
|
909 print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster
|
|
910 @membersAK = split(/ /, $paralogsA[$k]); # This array contains IDs
|
|
911 $paralogsA[$k] = "";# Remove all paralogs from cluster $k
|
|
912 @tmp = ();
|
|
913 for $m(@membersAK){
|
|
914 push(@tmp,$m) if ($m != $hitID); # Put other members back
|
|
915 }
|
|
916 $paralogsA[$k] = join(' ',@tmp);
|
|
917 undef $is_paralogA[$k]; # Create index that we can check later
|
|
918 grep (vec($is_paralogA[$k],$_,1) = 1, @tmp);
|
|
919 }
|
|
920 last;
|
|
921 }
|
|
922 }
|
|
923 next if (! $paralogs); # Skip that paralog - it is already in cluster $k
|
|
924 push (@membersA,$hitID); # Add this hit to paralogs of A
|
|
925 }
|
|
926 }
|
|
927 # Calculate confidence values now:
|
|
928 @tmp = ();
|
|
929 for $idP (@membersA){ # For each paralog calculate conf value
|
|
930 if($scoreAA{"$idA:$idA"} == $ortoS[$i]){
|
|
931 if ($scoreAA{"$idA:$idP"} == $scoreAA{"$idA:$idA"}){
|
|
932 $confPA[$idP] = 1.00;
|
|
933 }
|
|
934 else{
|
|
935 $confPA[$idP] = 0.00;
|
|
936 }
|
|
937 }
|
|
938 else{
|
|
939 $confPA[$idP] = ($scoreAA{"$idA:$idP"} - $ortoS[$i]) /
|
|
940 ($scoreAA{"$idA:$idA"} - $ortoS[$i]);
|
|
941 }
|
|
942 push (@tmp, $idP) if ($confPA[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs
|
|
943 }
|
|
944 @membersA = @tmp;
|
|
945 ########### Merge if necessary:
|
|
946 if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster
|
|
947 @tmp = split(/ /,$paralogsA[$merge[$i]]);
|
|
948 for $m (@membersA){
|
|
949 push (@tmp, $m) unless (vec($is_paralogA[$merge[$i]],$m,1));
|
|
950 }
|
|
951 $paralogsA[$merge[$i]] = join(' ',@tmp);
|
|
952 undef $is_paralogA[$merge[$i]];
|
|
953 grep (vec($is_paralogA[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array
|
|
954 }
|
|
955 ######### Typical new cluster:
|
|
956 else{ # Create a new cluster
|
|
957 $paralogsA[$i] = join(' ',@membersA);
|
|
958 undef $is_paralogA; # Create index that we can check later
|
|
959 grep (vec($is_paralogA[$i],$_,1) = 1, @membersA);
|
|
960 }
|
|
961 ##### The same procedure for species B:
|
|
962 $N = $hitnBB[$idB];
|
|
963 for $j(1..$N){
|
|
964 $hitID = $hitBB[$idB][$j];
|
|
965 # print "Working with $nameB[$hitID]\n" if ($debug == 2);
|
|
966 if ( ($idB == $hitID) or ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$idB]) and
|
|
967 ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$hitID])){
|
|
968 if ($debug == 2){
|
|
969 print " Paralog candidates: ";
|
|
970 printf ("%-20s: %-20s", $nameB[$idB], $nameB[$hitID]);
|
|
971 print "\t$scoreBB{\"$idB:$hitID\"} : ";
|
|
972 print "$bestscoreBA[$idB] : $bestscoreBA[$hitID]\n";
|
|
973 }
|
|
974 $paralogs = 1;
|
|
975 if ($scoreBB{"$idB:$idB"} == $ortoS[$i]){
|
|
976 if ($scoreBB{"$idB:$hitID"} == $scoreBB{"$idB:$idB"}){
|
|
977 $conf_here = 1.0;
|
|
978 }
|
|
979 else{
|
|
980 $conf_here = 0.0;
|
|
981 }
|
|
982 }
|
|
983 else{
|
|
984 $conf_here = ($scoreBB{"$idB:$hitID"} - $ortoS[$i]) /
|
|
985 ($scoreBB{"$idB:$idB"} - $ortoS[$i]);
|
|
986 }
|
|
987
|
|
988 # Check if this paralog candidate is already clustered in other clusters
|
|
989 for $k(1..($i-1)){
|
|
990 if (vec($is_paralogB[$k],$hitID,1)){ # Yes, found in cluster $k
|
|
991 if($debug == 2){
|
|
992 print " $nameB[$hitID] is already in cluster $k, together with:";
|
|
993 print " $nameB[$ortoB[$k]] and $nameA[$ortoA[$k]] ";
|
|
994 print "($scoreBB{\"$ortoB[$k]:$hitID\"})";
|
|
995 }
|
|
996 if (($confPB[$hitID] >= $conf_here) and
|
|
997 ($j != 1)){ # The seed ortholog CAN NOT remain there
|
|
998 print " and remains there.\n" if ($debug == 2);
|
|
999 $paralogs = 0; # No action
|
|
1000 }
|
|
1001 else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k
|
|
1002 print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster
|
|
1003 @membersBK = split(/ /, $paralogsB[$k]); # This array contains names, not IDs
|
|
1004 $paralogsB[$k] = "";
|
|
1005 @tmp = ();
|
|
1006 for $m(@membersBK){
|
|
1007 push(@tmp,$m) if ($m != $hitID); # Put other members back
|
|
1008 }
|
|
1009 $paralogsB[$k] = join(' ',@tmp);
|
|
1010 undef $is_paralogB[$k]; # Create index that we can check later
|
|
1011 grep (vec($is_paralogB[$k],$_,1) = 1, @tmp);
|
|
1012 }
|
|
1013 last; # Don't search in other clusters
|
|
1014 }
|
|
1015 }
|
|
1016 next if (! $paralogs); # Skip that paralog - it is already in cluster $k
|
|
1017 push (@membersB,$hitID);
|
|
1018 }
|
|
1019 }
|
|
1020 # Calculate confidence values now:
|
|
1021 @tmp = ();
|
|
1022 for $idP (@membersB){ # For each paralog calculate conf value
|
|
1023 if($scoreBB{"$idB:$idB"} == $ortoS[$i]){
|
|
1024 if ($scoreBB{"$idB:$idP"} == $scoreBB{"$idB:$idB"}){
|
|
1025 $confPB[$idP] = 1.0;
|
|
1026 }
|
|
1027 else{
|
|
1028 $confPB[$idP] = 0.0;
|
|
1029 }
|
|
1030 }
|
|
1031 else{
|
|
1032 $confPB[$idP] = ($scoreBB{"$idB:$idP"} - $ortoS[$i]) /
|
|
1033 ($scoreBB{"$idB:$idB"} - $ortoS[$i]);
|
|
1034 }
|
|
1035 push (@tmp, $idP) if ($confPB[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs
|
|
1036 }
|
|
1037 @membersB = @tmp;
|
|
1038 ########### Merge if necessary:
|
|
1039 if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster
|
|
1040 @tmp = split(/ /,$paralogsB[$merge[$i]]);
|
|
1041 for $m (@membersB){
|
|
1042 push (@tmp, $m) unless (vec($is_paralogB[$merge[$i]],$m,1));
|
|
1043 }
|
|
1044 $paralogsB[$merge[$i]] = join(' ',@tmp);
|
|
1045 undef $is_paralogB[$merge[$i]];
|
|
1046 grep (vec($is_paralogB[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array
|
|
1047 }
|
|
1048 ######### Typical new cluster:
|
|
1049 else{ # Create a new cluster
|
|
1050 $paralogsB[$i] = join(' ',@membersB);
|
|
1051 undef $is_paralogB; # Create index that we can check later
|
|
1052 grep (vec($is_paralogB[$i],$_,1) = 1, @membersB);
|
|
1053 }
|
|
1054 }
|
|
1055 if ($show_times){
|
|
1056 ($user_time,,,) = times;
|
|
1057 printf ("Finding in-paralogs took %.2f seconds\n", ($user_time - $prev_time));
|
|
1058 $prev_time = $user_time;
|
|
1059 }
|
|
1060 #####################################################
|
|
1061 &clean_up(1);
|
|
1062 ####################################################
|
|
1063 # Find group for orphans. If cluster contains only one member, find where it should go:
|
|
1064 for $i (1..$o){
|
|
1065 @membersA = split(/ /, $paralogsA[$i]);
|
|
1066 @membersB = split(/ /, $paralogsB[$i]);
|
|
1067 $na = @membersA;
|
|
1068 $nb = @membersB;
|
|
1069 if (($na == 0) and $nb){
|
|
1070 print "Warning: empty A cluster $i\n";
|
|
1071 for $m (@membersB){
|
|
1072 $bestscore = 0;
|
|
1073 $bestgroup = 0;
|
|
1074 $bestmatch = 0;
|
|
1075 for $j (1..$o) {
|
|
1076 next if ($i == $j); # Really need to check against all 100% members of the group.
|
|
1077 @membersBJ = split(/ /, $paralogsB[$j]);
|
|
1078 for $k (@membersBJ){
|
|
1079 next if ($confPB[$k] != 1); # For all 100% in-paralogs
|
|
1080 $score = $scoreBB{"$m:$k"};
|
|
1081 if ($score > $bestscore){
|
|
1082 $bestscore = $score;
|
|
1083 $bestgroup = $j;
|
|
1084 $bestmatch = $k;
|
|
1085 }
|
|
1086 }
|
|
1087 }
|
|
1088 print "Orphan $nameB[$m] goes to group $bestgroup with $nameB[$bestmatch]\n" ;
|
|
1089 @members = split(/ /, $paralogsB[$bestgroup]);
|
|
1090 push (@members, $m);
|
|
1091 $paralogsB[$bestgroup] = join(' ',@members);
|
|
1092 $paralogsB[$i] = "";
|
|
1093 undef $is_paralogB[$bestgroup];
|
|
1094 undef $is_paralogB[$i];
|
|
1095 grep (vec($is_paralogB[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array
|
|
1096 # grep (vec($is_paralogB[$i],$_,1) = 1, ());
|
|
1097 }
|
|
1098 }
|
|
1099 if ($na and ($nb == 0)){
|
|
1100 print "Warning: empty B cluster $i\n";
|
|
1101 for $m (@membersA){
|
|
1102 $bestscore = 0;
|
|
1103 $bestgroup = 0;
|
|
1104 $bestmatch = 0;
|
|
1105 for $j (1..$o) {
|
|
1106 next if ($i == $j);
|
|
1107 @membersAJ = split(/ /, $paralogsA[$j]);
|
|
1108 for $k (@membersAJ){
|
|
1109 next if ($confPA[$k] != 1); # For all 100% in-paralogs
|
|
1110 $score = $scoreAA{"$m:$k"};
|
|
1111 if ($score > $bestscore){
|
|
1112 $bestscore = $score;
|
|
1113 $bestgroup = $j;
|
|
1114 $bestmatch = $k;
|
|
1115 }
|
|
1116 }
|
|
1117 }
|
|
1118 print "Orphan $nameA[$m] goes to group $bestgroup with $nameA[$bestmatch]\n";
|
|
1119 @members = split(/ /, $paralogsA[$bestgroup]);
|
|
1120 push (@members, $m);
|
|
1121 $paralogsA[$bestgroup] = join(' ',@members);
|
|
1122 $paralogsA[$i] = "";
|
|
1123 undef $is_paralogA[$bestgroup];
|
|
1124 undef $is_paralogA[$i];
|
|
1125 grep (vec($is_paralogA[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array
|
|
1126 # grep (vec($is_paralogA[$i],$_,1) = 1, ());
|
|
1127 }
|
|
1128 }
|
|
1129 }
|
|
1130
|
|
1131 &clean_up(1);
|
|
1132 ###################
|
|
1133 $htmlfile = "orthologs." . $ARGV[0] . "-" . $ARGV[1] . ".html";
|
|
1134 if ($html){
|
|
1135 open HTML, ">$htmlfile" or warn "Could not write to HTML file $filename\n";
|
|
1136 }
|
|
1137
|
|
1138
|
|
1139 if ($output){
|
|
1140 print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
|
|
1141 print OUTPUT "$o groups of orthologs\n";
|
|
1142 print OUTPUT "$totalA in-paralogs from $fasta_seq_fileA\n";
|
|
1143 print OUTPUT "$totalB in-paralogs from $fasta_seq_fileB\n";
|
|
1144 print OUTPUT "Grey zone $grey_zone bits\n";
|
|
1145 print OUTPUT "Score cutoff $score_cutoff bits\n";
|
|
1146 print OUTPUT "In-paralogs with confidence less than $conf_cutoff not shown\n";
|
|
1147 print OUTPUT "Sequence overlap cutoff $seq_overlap_cutoff\n";
|
|
1148 print OUTPUT "Group merging cutoff $group_overlap_cutoff\n";
|
|
1149 print OUTPUT "Scoring matrix $matrix\n";
|
|
1150 print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
|
|
1151 }
|
|
1152 if ($html){
|
|
1153 print HTML "<pre>\n";
|
|
1154 print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
|
|
1155 print HTML "$o groups of orthologs\n";
|
|
1156 print HTML "$totalA in-paralogs from $fasta_seq_fileA\n";
|
|
1157 print HTML "$totalB in-paralogs from $fasta_seq_fileB\n";
|
|
1158 print HTML "Grey zone $grey_zone bits\n";
|
|
1159 print HTML "Score cutoff $score_cutoff bits\n";
|
|
1160 print HTML "In-paralogs with confidence less than $conf_cutoff not shown\n";
|
|
1161 print HTML "Sequence overlap cutoff $seq_overlap_cutoff\n";
|
|
1162 print HTML "Group merging cutoff $group_overlap_cutoff\n";
|
|
1163 print HTML "Scoring matrix $matrix\n";
|
|
1164 print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n";
|
|
1165 }
|
|
1166 # ##############################################################################
|
|
1167 # Check for alternative orthologs, sort paralogs by confidence and print results
|
|
1168 # ##############################################################################
|
|
1169 if ($use_bootstrap and $debug){
|
|
1170 open FF, ">BS_vs_bits" or warn "Could not write to file BS_vs_bits\n";
|
|
1171 }
|
|
1172 for $i(1..$o){
|
|
1173 @membersA = split(/ /, $paralogsA[$i]);
|
|
1174 @membersB = split(/ /, $paralogsB[$i]);
|
|
1175 $message = "";
|
|
1176 $htmlmessage = "";
|
|
1177
|
|
1178 $idB = $ortoB[$i];
|
|
1179 $nB = $hitnBA[$idB];
|
|
1180 for $idA(@membersA){
|
|
1181 next if ($confPA[$idA] != 1.0);
|
|
1182 $nA = $hitnAB[$idA];
|
|
1183 $confA[$i] = $ortoS[$i]; # default
|
|
1184 $bsA[$idA] = 1.0;
|
|
1185 ##############
|
|
1186 for $j(1..$nB){
|
|
1187 $idH = $hitBA[$idB][$j];
|
|
1188 ################ Some checks for alternative orthologs:
|
|
1189 # 1. Don't consider sequences that are already in this cluster
|
|
1190 next if (vec($is_paralogA[$i],$idH,1));
|
|
1191 next if ($confPA[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog
|
|
1192
|
|
1193 # 2. Check if candidate for alternative ortholog is already clustered in stronger clusters
|
|
1194 $in_other_cluster = 0;
|
|
1195 for $k(1..($i-1)){ # Check if current ortholog is already clustered
|
|
1196 if (vec($is_paralogA[$k],$idH,1)){
|
|
1197 $in_other_cluster = $k;
|
|
1198 last;
|
|
1199 }
|
|
1200 }
|
|
1201 # next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog
|
|
1202
|
|
1203 # 3. The best hit of candidate ortholog should be ortoA or at least to belong into this cluster
|
|
1204 @besthits = split (/ /,$besthitAB[$idH]);
|
|
1205 $this_family = 0;
|
|
1206 for $bh (@besthits){
|
|
1207 $this_family = 1 if ($idB == $bh);
|
|
1208 }
|
|
1209 # next unless ($this_family); # There was an alternative BA match but it's best match did not belong here
|
|
1210 ################# Done with checks - if sequence passed, then it could be an alternative ortholog
|
|
1211 $confA[$i] = $ortoS[$i] - $scoreBA{"$idB:$idH"};
|
|
1212 if ($use_bootstrap){
|
|
1213 if ($confA[$i] < $ortoS[$i]){ # Danger zone - check for bootstrap
|
|
1214 $bsA[$idA] = &bootstrap($fasta_seq_fileB,$idB,$idA,$idH);
|
|
1215 }
|
|
1216 else {
|
|
1217 $bsA[$idA] = 1.0;
|
|
1218 }
|
|
1219 }
|
|
1220 last;
|
|
1221 }
|
|
1222 $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameA[$idA], 100*$bsA[$idA]);
|
|
1223 $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75);
|
|
1224 $message .= sprintf("\n");
|
|
1225 if ($html){
|
|
1226 if ($bsA[$idA] < 0.75){
|
|
1227 $htmlmessage .= sprintf("<font color=\"red\">");
|
|
1228 }
|
|
1229 elsif ($bsA[$idA] < 0.95){
|
|
1230 $htmlmessage .= sprintf("<font color=\"\#FFCC00\">");
|
|
1231 }
|
|
1232 else {
|
|
1233 $htmlmessage .= sprintf("<font color=\"green\">");
|
|
1234 }
|
|
1235 $htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameA[$idA], 100*$bsA[$idA]);
|
|
1236 $htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75);
|
|
1237 $htmlmessage .= sprintf("</font>");
|
|
1238 }
|
|
1239 printf (FF "%s\t%d\t%d\n", $nameA[$idA], $confA[$i], 100*$bsA[$idA]) if ($use_bootstrap and $debug);
|
|
1240 }
|
|
1241 ########
|
|
1242 $idA = $ortoA[$i];
|
|
1243 $nA = $hitnAB[$idA];
|
|
1244 for $idB(@membersB){
|
|
1245 next if ($confPB[$idB] != 1.0);
|
|
1246 $nB = $hitnBA[$idB];
|
|
1247 $confB[$i] = $ortoS[$i]; # default
|
|
1248 $bsB[$idB] = 1.0;
|
|
1249
|
|
1250 for $j(1..$nA){ # For all AB hits of given ortholog
|
|
1251 $idH = $hitAB[$idA][$j];
|
|
1252 # ############### Some checks for alternative orthologs:
|
|
1253 # 1. Don't consider sequences that are already in this cluster
|
|
1254 next if (vec($is_paralogB[$i],$idH,1));
|
|
1255 next if ($confPB[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog
|
|
1256
|
|
1257 # 2. Check if candidate for alternative ortholog is already clustered in stronger clusters
|
|
1258 $in_other_cluster = 0;
|
|
1259 for $k(1..($i-1)){
|
|
1260 if (vec($is_paralogB[$k],$idH,1)){
|
|
1261 $in_other_cluster = $k;
|
|
1262 last; # out from this cycle
|
|
1263 }
|
|
1264 }
|
|
1265 # next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog
|
|
1266
|
|
1267 # 3. The best hit of candidate ortholog should be ortoA
|
|
1268 @besthits = split (/ /,$besthitBA[$idH]);
|
|
1269 $this_family = 0;
|
|
1270 for $bh (@besthits){
|
|
1271 $this_family = 1 if ($idA == $bh);
|
|
1272 }
|
|
1273 # next unless ($this_family); # There was an alternative BA match but it's best match did not belong here
|
|
1274 # ################ Done with checks - if sequence passed, then it could be an alternative ortholog
|
|
1275 $confB[$i] = $ortoS[$i] - $scoreAB{"$idA:$idH"};
|
|
1276 if ($use_bootstrap){
|
|
1277 if ($confB[$i] < $ortoS[$i]){
|
|
1278 $bsB[$idB] = &bootstrap($fasta_seq_fileA,$idA,$idB,$idH);
|
|
1279 }
|
|
1280 else {
|
|
1281 $bsB[$idB] = 1.0;
|
|
1282 }
|
|
1283 }
|
|
1284 last;
|
|
1285 }
|
|
1286 $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameB[$idB], 100*$bsB[$idB]);
|
|
1287 $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75);
|
|
1288 $message .= sprintf("\n");
|
|
1289 if ($html){
|
|
1290 if ($bsB[$idB] < 0.75){
|
|
1291 $htmlmessage .= sprintf("<font color=\"red\">");
|
|
1292 }
|
|
1293 elsif ($bsB[$idB] < 0.95){
|
|
1294 $htmlmessage .= sprintf("<font color=\"\#FFCC00\">");
|
|
1295 }
|
|
1296 else {
|
|
1297 $htmlmessage .= sprintf("<font color=\"green\">");
|
|
1298 }
|
|
1299 $htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameB[$idB], 100*$bsB[$idB]);
|
|
1300 $htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75);
|
|
1301 $htmlmessage .= sprintf("</font>");
|
|
1302 }
|
|
1303 printf (FF "%s\t%d\t%d\n", $nameB[$idB], $confB[$i], 100*$bsB[$idB]) if ($use_bootstrap and $debug);
|
|
1304 }
|
|
1305 close FF;
|
|
1306 ########### Print header ###############
|
|
1307 if ($output){
|
|
1308 print OUTPUT "___________________________________________________________________________________\n";
|
|
1309 print OUTPUT "Group of orthologs #" . $i .". Best score $ortoS[$i] bits\n";
|
|
1310 print OUTPUT "Score difference with first non-orthologous sequence - ";
|
|
1311 printf (OUTPUT "%s:%d %s:%d\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]);
|
|
1312 }
|
|
1313
|
|
1314 if ($html){
|
|
1315 print HTML "</pre>\n";
|
|
1316 print HTML "<hr WIDTH=\"100%\">";
|
|
1317 print HTML "<h3>";
|
|
1318 print HTML "Group of orthologs #" . $i .". Best score $ortoS[$i] bits<br>\n";
|
|
1319 print HTML "Score difference with first non-orthologous sequence - ";
|
|
1320 printf (HTML "%s:%d %s:%d</h3><pre>\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]);
|
|
1321 }
|
|
1322 ########### Sort and print members of A ############
|
|
1323 $nA = @membersA;
|
|
1324 $nB = @membersB;
|
|
1325 $nMAX = ($nA > $nB) ? $nA : $nB;
|
|
1326 # Sort membersA inside the cluster by confidence:
|
|
1327 for $m (0..($nA-1)){
|
|
1328 while($confPA[$membersA[$m]] < $confPA[$membersA[$m+1]]){
|
|
1329 $temp = $membersA[$m];
|
|
1330 $membersA[$m] = $membersA[$m+1];
|
|
1331 $membersA[$m+1] = $temp;
|
|
1332 --$m if ($m > 1);
|
|
1333 }
|
|
1334 }
|
|
1335 $paralogsA[$i] = join(' ',@membersA); # Put them back together
|
|
1336 # Sort membersB inside the cluster by confidence:
|
|
1337 for $m (0..($nB-1)){
|
|
1338 while($confPB[$membersB[$m]] < $confPB[$membersB[$m+1]]){
|
|
1339 $temp = $membersB[$m];
|
|
1340 $membersB[$m] = $membersB[$m+1];
|
|
1341 $membersB[$m+1] = $temp;
|
|
1342 --$m if ($m > 1);
|
|
1343 }
|
|
1344 }
|
|
1345 $paralogsB[$i] = join(' ',@membersB); # Put them back together
|
|
1346 # Print to text file and to HTML file
|
|
1347 for $m (0..($nMAX-1)){
|
|
1348 if ($m < $nA){
|
|
1349 if ($output){
|
|
1350 printf (OUTPUT "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]]));
|
|
1351 }
|
|
1352 if ($html){
|
|
1353 print HTML "<B>" if ($confPA[$membersA[$m]] == 1);
|
|
1354 printf (HTML "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]]));
|
|
1355 print HTML "</B>" if ($confPA[$membersA[$m]] == 1);
|
|
1356 }
|
|
1357 }
|
|
1358 else {
|
|
1359 printf (OUTPUT "%-20s\t%-7s\t\t", " ", " ");
|
|
1360 printf (HTML "%-20s\t%-7s\t\t", " ", " ") if ($html);
|
|
1361 }
|
|
1362 if ($m < $nB){
|
|
1363 if ($output){
|
|
1364 printf (OUTPUT "%-20s\t%.2f%%\n", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]]));
|
|
1365 }
|
|
1366 if ($html){
|
|
1367 print HTML "<B>" if ($confPB[$membersB[$m]] == 1);
|
|
1368 printf (HTML "%-20s\t%.2f%%", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]]));
|
|
1369 print HTML "</B>" if ($confPB[$membersB[$m]] == 1);
|
|
1370 print HTML "\n";
|
|
1371 }
|
|
1372 }
|
|
1373 else {
|
|
1374 printf (OUTPUT "%-20s\t%-7s\n", " ", " ") if($output);
|
|
1375 print HTML "\n" if ($html);
|
|
1376 }
|
|
1377 }
|
|
1378 print OUTPUT $message if ($use_bootstrap and $output);
|
|
1379 print HTML "$htmlmessage" if ($use_bootstrap and $html);
|
|
1380 }
|
|
1381 if ($output) {
|
|
1382 close OUTPUT;
|
|
1383 print "Output saved to file $outputfile\n";
|
|
1384 }
|
|
1385 if ($html){
|
|
1386 close HTML;
|
|
1387 print "HTML output saved to $htmlfile\n";
|
|
1388 }
|
|
1389
|
|
1390 if ($table){
|
|
1391 $filename = "table." . $ARGV[0] . "-" . $ARGV[1];
|
|
1392 open F, ">$filename" or die;
|
|
1393 print F "OrtoID\tScore\tOrtoA\tOrtoB\n";
|
|
1394 for $i(1..$o){
|
|
1395 print F "$i\t$ortoS[$i]\t";
|
|
1396 @members = split(/ /, $paralogsA[$i]);
|
|
1397 for $m (@members){
|
|
1398 $m =~ s/://g;
|
|
1399 printf (F "%s %.3f ", $nameA[$m], $confPA[$m]);
|
|
1400 }
|
|
1401 print F "\t";
|
|
1402 @members = split(/ /, $paralogsB[$i]);
|
|
1403 for $m (@members){
|
|
1404 $m =~ s/://g;
|
|
1405 printf (F "%s %.3f ", $nameB[$m], $confPB[$m]);
|
|
1406 }
|
|
1407 print F "\n";
|
|
1408 }
|
|
1409 close F;
|
|
1410 print "Table output saved to $filename\n";
|
|
1411 }
|
|
1412 if ($mysql_table){
|
|
1413 $filename2 = "sqltable." . $ARGV[0] . "-" . $ARGV[1];
|
|
1414 open F2, ">$filename2" or die;
|
|
1415 for $i(1..$o){
|
|
1416 @membersA = split(/ /, $paralogsA[$i]);
|
|
1417 for $m (@membersA){
|
|
1418 # $m =~ s/://g;
|
|
1419 if ($use_bootstrap && $bsA[$m]) {
|
|
1420 printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m], 100*$bsA[$m]);
|
|
1421 } else {
|
|
1422 printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m]);
|
|
1423 }
|
|
1424 }
|
|
1425 @membersB = split(/ /, $paralogsB[$i]);
|
|
1426 for $m (@membersB){
|
|
1427 # $m =~ s/://g;
|
|
1428 if ($use_bootstrap && $bsB[$m]) {
|
|
1429 printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m], 100*$bsB[$m]);
|
|
1430 }else {
|
|
1431 printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m]);
|
|
1432 }
|
|
1433 }
|
|
1434 }
|
|
1435 close F2;
|
|
1436 print "mysql output saved to $filename2\n";
|
|
1437 }
|
|
1438 if ($show_times){
|
|
1439 ($user_time,,,) = times;
|
|
1440 printf ("Finding bootstrap values and printing took %.2f seconds\n", ($user_time - $prev_time));
|
|
1441 printf ("The overall execution time: %.2f seconds\n", $user_time);
|
|
1442 }
|
|
1443 if ($run_blast) {
|
|
1444 unlink "formatdb.log";
|
|
1445 unlink "$fasta_seq_fileA.phr", "$fasta_seq_fileA.pin", "$fasta_seq_fileA.psq";
|
|
1446 unlink "$fasta_seq_fileB.phr", "$fasta_seq_fileB.pin", "$fasta_seq_fileB.psq" if (@ARGV >= 2);
|
|
1447 unlink "$fasta_seq_fileC.phr", "$fasta_seq_fileC.pin", "$fasta_seq_fileC.psq" if ($use_outgroup);
|
|
1448 }
|
|
1449 }
|
|
1450
|
|
1451 ##############################################################
|
|
1452 # Functions:
|
|
1453 ##############################################################
|
|
1454 sub clean_up { # Sort members within cluster and clusters by size
|
|
1455 ############################################################################################### Modification by Isabella 3
|
|
1456
|
|
1457 # Sort on index arrays with perl's built in sort instead of using bubble sort.
|
|
1458
|
|
1459 $var = shift;
|
|
1460 $totalA = $totalB = 0;
|
|
1461 # First pass: count members within each cluster
|
|
1462 foreach $i (1..$o) {
|
|
1463 @membersA = split(/ /, $paralogsA[$i]);
|
|
1464 $clusnA[$i] = @membersA; # Number of members in this cluster
|
|
1465 $totalA += $clusnA[$i];
|
|
1466 $paralogsA[$i] = join(' ',@membersA);
|
|
1467
|
|
1468 @membersB = split(/ /, $paralogsB[$i]);
|
|
1469 $clusnB[$i] = @membersB; # Number of members in this cluster
|
|
1470 $totalB += $clusnB[$i];
|
|
1471 $paralogsB[$i] = join(' ',@membersB);
|
|
1472
|
|
1473 $clusn[$i] = $clusnB[$i] + $clusnA[$i]; # Number of members in given group
|
|
1474 }
|
|
1475
|
|
1476 # Create an array used to store the position each element shall have in the final array
|
|
1477 # The elements are initialized with the position numbers
|
|
1478 my @position_index_array = (1..$o);
|
|
1479
|
|
1480 # Sort the position list according to cluster size
|
|
1481 my @cluster_sorted_position_list = sort { $clusn[$b] <=> $clusn[$a]} @position_index_array;
|
|
1482
|
|
1483 # Create new arrays for the sorted information
|
|
1484 my @new_paralogsA;
|
|
1485 my @new_paralogsB;
|
|
1486 my @new_is_paralogA;
|
|
1487 my @new_is_paralogB;
|
|
1488 my @new_clusn;
|
|
1489 my @new_ortoS;
|
|
1490 my @new_ortoA;
|
|
1491 my @new_ortoB;
|
|
1492
|
|
1493
|
|
1494 # Add the information to the new arrays in the orer specifeid by the index array
|
|
1495 for (my $index_in_list = 0; $index_in_list < scalar @cluster_sorted_position_list; $index_in_list++) {
|
|
1496
|
|
1497 my $old_index = $cluster_sorted_position_list[$index_in_list];
|
|
1498
|
|
1499 if (!$clusn[$old_index]) {
|
|
1500 $o = (scalar @new_ortoS) - 1;
|
|
1501 last;
|
|
1502 }
|
|
1503
|
|
1504 $new_paralogsA[$index_in_list + 1] = $paralogsA[$old_index];
|
|
1505 $new_paralogsB[$index_in_list + 1] = $paralogsB[$old_index];
|
|
1506 $new_is_paralogA[$index_in_list + 1] = $is_paralogA[$old_index];
|
|
1507 $new_is_paralogB[$index_in_list + 1] = $is_paralogB[$old_index];
|
|
1508 $new_clusn[$index_in_list + 1] = $clusn[$old_index];
|
|
1509 $new_ortoA[$index_in_list + 1] = $ortoA[$old_index];
|
|
1510 $new_ortoB[$index_in_list + 1] = $ortoB[$old_index];
|
|
1511 $new_ortoS[$index_in_list + 1] = $ortoS[$old_index];
|
|
1512 }
|
|
1513
|
|
1514 @paralogsA = @new_paralogsA;
|
|
1515 @paralogsB = @new_paralogsB;
|
|
1516 @is_paralogA = @new_is_paralogA;
|
|
1517 @is_paralogB = @new_is_paralogB;
|
|
1518 @clusn = @new_clusn;
|
|
1519 @ortoS = @new_ortoS;
|
|
1520 @ortoA = @new_ortoA;
|
|
1521 @ortoB = @new_ortoB;
|
|
1522
|
|
1523 # Create an array used to store the position each element shall have in the final array
|
|
1524 # The elements are initialized with the position numbers
|
|
1525 @position_index_array = (1..$o);
|
|
1526
|
|
1527 # Sort the position list according to score
|
|
1528 @score_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @position_index_array;
|
|
1529
|
|
1530 # Create new arrays for the sorted information
|
|
1531 my @new_paralogsA2 = ();
|
|
1532 my @new_paralogsB2 = ();
|
|
1533 my @new_is_paralogA2 = ();
|
|
1534 my @new_is_paralogB2 = ();
|
|
1535 my @new_clusn2 = ();
|
|
1536 my @new_ortoS2 = ();
|
|
1537 my @new_ortoA2 = ();
|
|
1538 my @new_ortoB2 = ();
|
|
1539
|
|
1540 # Add the information to the new arrays in the orer specifeid by the index array
|
|
1541 for (my $index_in_list = 0; $index_in_list < scalar @score_sorted_position_list; $index_in_list++) {
|
|
1542
|
|
1543 my $old_index = $score_sorted_position_list[$index_in_list];
|
|
1544 $new_paralogsA2[$index_in_list + 1] = $paralogsA[$old_index];
|
|
1545 $new_paralogsB2[$index_in_list + 1] = $paralogsB[$old_index];
|
|
1546 $new_is_paralogA2[$index_in_list + 1] = $is_paralogA[$old_index];
|
|
1547 $new_is_paralogB2[$index_in_list + 1] = $is_paralogB[$old_index];
|
|
1548 $new_clusn2[$index_in_list + 1] = $clusn[$old_index];
|
|
1549 $new_ortoA2[$index_in_list + 1] = $ortoA[$old_index];
|
|
1550 $new_ortoB2[$index_in_list + 1] = $ortoB[$old_index];
|
|
1551 $new_ortoS2[$index_in_list + 1] = $ortoS[$old_index];
|
|
1552 }
|
|
1553
|
|
1554 @paralogsA = @new_paralogsA2;
|
|
1555 @paralogsB = @new_paralogsB2;
|
|
1556 @is_paralogA = @new_is_paralogA2;
|
|
1557 @is_paralogB = @new_is_paralogB2;
|
|
1558 @clusn = @new_clusn2;
|
|
1559 @ortoS = @new_ortoS2;
|
|
1560 @ortoA = @new_ortoA2;
|
|
1561 @ortoB = @new_ortoB2;
|
|
1562
|
|
1563 #################################################################################### End modification by Isabella 3
|
|
1564
|
|
1565 }
|
|
1566 sub bootstrap{
|
|
1567 my $species = shift;
|
|
1568 my $seq_id1 = shift; # Query ID from $species
|
|
1569 my $seq_id2 = shift; # Best hit ID from other species
|
|
1570 my $seq_id3 = shift; # Second best hit
|
|
1571 # Retrieve sequence 1 from $species and sequence 2 from opposite species
|
|
1572 my $significance = 0.0;
|
|
1573
|
|
1574 if ($species eq $fasta_seq_fileA){
|
|
1575 $file1 = $fasta_seq_fileA;
|
|
1576 $file2 = $fasta_seq_fileB;
|
|
1577 }
|
|
1578 elsif ($species eq $fasta_seq_fileB){
|
|
1579 $file1 = $fasta_seq_fileB;
|
|
1580 $file2 = $fasta_seq_fileA;
|
|
1581 }
|
|
1582 else {
|
|
1583 print "Bootstrap values for ortholog groups are not calculated\n";
|
|
1584 return 0;
|
|
1585 }
|
|
1586
|
|
1587 open A, $file1 or die;
|
|
1588 $id = 0;
|
|
1589 $print_this_seq = 0;
|
|
1590 $seq1 = "";
|
|
1591 $seq2 = "";
|
|
1592
|
|
1593 $query_file = $seq_id1 . ".faq";
|
|
1594 open Q, ">$query_file" or die;
|
|
1595
|
|
1596 while (<A>){
|
|
1597 if(/^\>/){
|
|
1598 ++$id;
|
|
1599 $print_this_seq = ($id == $seq_id1)?1:0;
|
|
1600 }
|
|
1601 print Q if ($print_this_seq);
|
|
1602 }
|
|
1603 close A;
|
|
1604 close Q;
|
|
1605 ###
|
|
1606 open B, $file2 or die;
|
|
1607 $db_file = $seq_id2 . ".fas";
|
|
1608 open DB, ">$db_file" or die;
|
|
1609 $id = 0;
|
|
1610 $print_this_seq = 0;
|
|
1611
|
|
1612 while (<B>){
|
|
1613 if(/^\>/){
|
|
1614 ++$id;
|
|
1615 $print_this_seq = (($id == $seq_id2) or ($id == $seq_id3))?1:0;
|
|
1616 }
|
|
1617 print DB if ($print_this_seq);
|
|
1618 }
|
|
1619 close B;
|
|
1620 close DB;
|
|
1621
|
|
1622 system "$formatdb -i $db_file";
|
|
1623 # Use soft masking in 1-pass mode for simplicity.
|
|
1624 system "$blastall -F\"m S\" -i $query_file -z 5000000 -d $db_file -p blastp -M $matrix -m7 | ./$blastParser 0 -a > $seq_id2.faa";
|
|
1625
|
|
1626 # Note: Changed score cutoff 50 to 0 for blast2faa.pl (060402).
|
|
1627 # Reason: after a cluster merger a score can be less than the cutoff (50)
|
|
1628 # which will remove the sequence in blast2faa.pl. The bootstrapping will
|
|
1629 # then fail.
|
|
1630 # AGAIN, updaye
|
|
1631
|
|
1632 if (-s("$seq_id2.faa")){
|
|
1633
|
|
1634 system("java -jar $seqstat -m $matrix -n 1000 -i $seq_id2.faa > $seq_id2.bs"); # Can handle U, u
|
|
1635
|
|
1636 if (-s("$seq_id2.bs")){
|
|
1637 open BS, "$seq_id2.bs" or die "pac failed\n";
|
|
1638 $_ = <BS>;
|
|
1639 ($dummy1,$dummy2,$dummy3,$dummy4,$significance) = split(/\s+/);
|
|
1640 close BS;
|
|
1641 }
|
|
1642 else{
|
|
1643 print STDERR "pac failed\n"; # if ($debug);
|
|
1644 $significance = -0.01;
|
|
1645 }
|
|
1646 }
|
|
1647 else{
|
|
1648 print STDERR "blast2faa for $query_file / $db_file failed\n"; # if ($debug);
|
|
1649 $significance = 0.0;
|
|
1650 }
|
|
1651
|
|
1652 unlink "$seq_id2.fas", "$seq_id2.faa", "$seq_id2.bs", "$seq_id1.faq";
|
|
1653 unlink "formatdb.log", "$seq_id2.fas.psq", "$seq_id2.fas.pin", "$seq_id2.fas.phr";
|
|
1654
|
|
1655 return $significance;
|
|
1656 }
|
|
1657
|
|
1658 sub overlap_test{
|
|
1659 my @Fld = @_;
|
|
1660
|
|
1661 # Filter out fragmentary hits by:
|
|
1662 # Ignore hit if aggregate matching area covers less than $seq_overlap_cutoff of sequence.
|
|
1663 # Ignore hit if local matching segments cover less than $segment_coverage_cutoff of sequence.
|
|
1664 #
|
|
1665 # $Fld[3] and $Fld[4] are query and subject lengths.
|
|
1666 # $Fld[5] and $Fld[6] are lengths of the aggregate matching region on query and subject. (From start of first matching segment to end of last matching segment).
|
|
1667 # $Fld[7] and $Fld[8] are local matching length on query and subject (Sum of all segments length's on query).
|
|
1668
|
|
1669 $retval = 1;
|
|
1670 # if ($Fld[3] >= $Fld[4]) {
|
|
1671 if ($Fld[5] < ($seq_overlap_cutoff * $Fld[3])) {$retval = 0};
|
|
1672 if ($Fld[7] < ($segment_coverage_cutoff * $Fld[3])) {$retval = 0};
|
|
1673 # }
|
|
1674
|
|
1675 # if ($Fld[4] >= $Fld[3]) {
|
|
1676 if ($Fld[6] < ($seq_overlap_cutoff * $Fld[4])) {$retval = 0};
|
|
1677 if ($Fld[8] < ($segment_coverage_cutoff * $Fld[4])) {$retval = 0};
|
|
1678 # }
|
|
1679
|
|
1680 # print "$Fld[3] $Fld[5] $Fld[7]; $Fld[4] $Fld[6] $Fld[8]; retval=$retval\n";
|
|
1681
|
|
1682 return $retval;
|
|
1683 }
|
|
1684
|
|
1685 sub do_blast
|
|
1686 {
|
|
1687 my @parameter=@_;
|
|
1688 my $resultfile=@parameter[@parameter-1];
|
|
1689 my $go_to_blast=1;
|
|
1690 my $resultfilesize;
|
|
1691 if (-e $resultfile)
|
|
1692 {
|
|
1693 $resultfilesize= -s "$resultfile";
|
|
1694 if ($resultfilesize >10240)
|
|
1695 {
|
|
1696 $go_to_blast=0;
|
|
1697 }
|
|
1698 }
|
|
1699 if ($go_to_blast)
|
|
1700 {
|
|
1701 if ($blast_two_passes)
|
|
1702 {
|
|
1703 do_blast_2pass(@parameter);
|
|
1704 } else
|
|
1705 {
|
|
1706 do_blast_1pass(@parameter);
|
|
1707 }
|
|
1708 }
|
|
1709 }
|
|
1710
|
|
1711 sub do_blast_1pass {
|
|
1712 my @Fld = @_;
|
|
1713
|
|
1714 # $Fld [0] is query
|
|
1715 # $Fld [1] is database
|
|
1716 # $Fld [2] is query size
|
|
1717 # $Fld [3] is database size
|
|
1718 # $Fld [4] is output name
|
|
1719
|
|
1720 # Use soft masking (low complexity masking by SEG in search phase, not in alignment phase).
|
|
1721 system ("$blastall -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff > $Fld[4]");
|
|
1722 }
|
|
1723
|
|
1724 sub do_blast_2pass {
|
|
1725
|
|
1726 my @Fld = @_;
|
|
1727
|
|
1728 # $Fld [0] is query
|
|
1729 # $Fld [1] is database
|
|
1730 # $Fld [2] is query size
|
|
1731 # $Fld [3] is database size
|
|
1732 # $Fld [4] is output name
|
|
1733
|
|
1734 # assume the script has already formatted the database
|
|
1735 # we will now do 2-pass approach
|
|
1736
|
|
1737 # load sequences
|
|
1738
|
|
1739 %sequencesA = ();
|
|
1740 %sequencesB = ();
|
|
1741
|
|
1742 open (FHA, $Fld [0]);
|
|
1743 while (<FHA>) {
|
|
1744
|
|
1745 $aLine = $_;
|
|
1746 chomp ($aLine);
|
|
1747
|
|
1748 $seq = "";
|
|
1749
|
|
1750 if ($aLine =~ />/) {
|
|
1751 @words = split (/\s/, $aLine);
|
|
1752 $seqID = $words[0];
|
|
1753 $sequencesA {$seqID} = "";
|
|
1754 }
|
|
1755 else {
|
|
1756 $sequencesA {$seqID} = $sequencesA {$seqID}.$aLine;
|
|
1757 }
|
|
1758 }
|
|
1759 close (FHA);
|
|
1760
|
|
1761 open (FHB, $Fld [1]);
|
|
1762 while (<FHB>) {
|
|
1763 $aLine = $_;
|
|
1764 chomp ($aLine);
|
|
1765
|
|
1766 $seq = "";
|
|
1767
|
|
1768 if ($aLine =~ />/) {
|
|
1769 @words = split (/\s/, $aLine);
|
|
1770 $seqID = $words[0];
|
|
1771 $sequencesB {$seqID} = "";
|
|
1772 }
|
|
1773 else {
|
|
1774 $sequencesB {$seqID} = $sequencesB {$seqID}.$aLine;
|
|
1775 }
|
|
1776 }
|
|
1777 close (FHB);
|
|
1778
|
|
1779 # Do first pass with compositional adjustment on and soft masking.
|
|
1780 # This efficiently removes low complexity matches but truncates alignments,
|
|
1781 # making a second pass necessary.
|
|
1782 print STDERR "\nStarting first BLAST pass for $Fld[0] - $Fld[1] on ";
|
|
1783 system("date");
|
|
1784 open FHR, "$blastall -C3 -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff|";
|
|
1785
|
|
1786 %theHits = ();
|
|
1787 while (<FHR>) {
|
|
1788 $aLine = $_;
|
|
1789 chomp ($aLine);
|
|
1790 @words = split (/\s+/, $aLine);
|
|
1791
|
|
1792 if (exists ($theHits {$words [0]})) {
|
|
1793 $theHits {$words [0]} = $theHits {$words [0]}." ".$words [1];
|
|
1794 }
|
|
1795 else {
|
|
1796 $theHits {$words [0]} = $words [1];
|
|
1797 }
|
|
1798
|
|
1799 }
|
|
1800 close (FHR);
|
|
1801
|
|
1802 $tmpdir = "."; # May be slightly (5%) faster using the RAM disk "/dev/shm".
|
|
1803 $tmpi = "$tmpdir/tmpi";
|
|
1804 $tmpd = "$tmpdir/tmpd";
|
|
1805
|
|
1806 # Do second pass with compositional adjustment off to get full-length alignments.
|
|
1807 print STDERR "\nStarting second BLAST pass for $Fld[0] - $Fld[1] on ";
|
|
1808 system("date");
|
|
1809 unlink "$Fld[4]";
|
|
1810 foreach $aQuery (keys % theHits) {
|
|
1811
|
|
1812 # Create single-query file
|
|
1813 open (FHT, ">$tmpi");
|
|
1814 print FHT ">$aQuery\n".$sequencesA {">$aQuery"}."\n";
|
|
1815 close (FHT);
|
|
1816
|
|
1817 # Create mini-database of hit sequences
|
|
1818 open (FHT, ">$tmpd");
|
|
1819 foreach $aHit (split (/\s/, $theHits {$aQuery})) {
|
|
1820 print FHT ">$aHit\n".$sequencesB {">$aHit"}."\n";
|
|
1821 }
|
|
1822 close (FHT);
|
|
1823
|
|
1824 # Run Blast and add to output
|
|
1825 system ("$formatdb -i $tmpd");
|
|
1826 system ("$blastall -C0 -FF -i $tmpi -d $tmpd -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff >> $Fld[4]");
|
|
1827 }
|
|
1828
|
|
1829 unlink "$tmpi", "$tmpd", "formatdb.log", "$tmpd.phr", "$tmpd.pin", "$tmpd.psq";
|
|
1830 }
|
|
1831
|
|
1832
|
|
1833 # Date Modification
|
|
1834 # -------- ---------------------------------------------------
|
|
1835 #
|
|
1836 # 2006-04-02 [1.36] - Changed score cutoff 50 to 0 for blast2faa.pl.
|
|
1837 # Reason: after a cluster merger a score can be less than the cutoff (50)
|
|
1838 # which will remove the sequence in blast2faa.pl. The bootstrapping will
|
|
1839 # then fail.
|
|
1840 # - Fixed bug with index variable in bootstrap routine.
|
|
1841 #
|
|
1842 # 2006-06-01 [2.0] - Fixed bug in blast_parser.pl: fields 7 and 8 were swapped,
|
|
1843 # it was supposed to print match_area before HSP_length.
|
|
1844 # - Fixed bug in blastall call: -v param was wrong for the A-B
|
|
1845 # and B-A comparisons.
|
|
1846 # -
|
|
1847 # - Changed "cluster" to "group" consistently in output.
|
|
1848 # - Changed "main ortholog" to "seed ortholog" in output.
|
|
1849 # - Replace U -> X before running seqstat.jar, otherwise it crashes.
|
|
1850 # 2006-08-04 [2.0] - In bootstrap subroutine, replace U with X, otherwise seqstat
|
|
1851 # will crash as this is not in the matrix (should fix this in seqstat)
|
|
1852 # 2006-08-04 [2.1] - Changed to writing default output to file.
|
|
1853 # - Added options to run blast only.
|
|
1854 # - Fixed some file closing bugs.
|
|
1855 # 2007-12-14 [3.0] - Sped up sorting routines (by Isabella).
|
|
1856 # - New XML-based blast_parser.
|
|
1857 # - New seqstat.jar to handle u and U.
|
|
1858 # - Modified overlap criterion for rejecting matches. Now it agrees with the paper.
|
|
1859 # 2009-04-01 [4.0] - Further modification of overlap criteria (require that they are met for both query and subject).
|
|
1860 # - Changed bit score cutoff to 40, which is suitable for compositionally adjusted BLAST.
|
|
1861 # - Added in 2-pass algorithm.
|
|
1862 # 2009-06-11 [4.0] - Moved blasting out to subroutine.
|
|
1863 # - Changed blasting in bootstrap subroutine to use unconditional score matrix adjustment and SEG hard masking,
|
|
1864 # to be the same as first step of 2-pass blast.
|
|
1865 # 2009-06-17 [4.0] - Compensated a Blast "bug" that sometimes gives a self-match lower score than a non-identical match.
|
|
1866 # This can happen with score matrix adjustment and can lead to missed orthologs.
|
|
1867 # 2009-08-18 [4.0] - Consolidated Blast filtering parameters for 2-pass (-C3 -F\"m S\"; -C0 -FF)
|
|
1868 # 2009-10-09 [4.1] - Fixed bug that caused failure if Fasta header lines had more than one word.
|