0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
4
|
5 use File::Basename;
|
|
6
|
|
7 use lib dirname(__FILE__);
|
0
|
8
|
|
### Paths of the external programs, resolved through `which`.
### NOTE(review): presumably each value is empty when the tool is not on PATH
### (that depends on the local `which`); CheckExtraProgram is called later.

### programs from BLAST
chomp(my $formatdb = `which formatdb`);
chomp(my $blastall = `which blastall`);

### programs from mcl
chomp(my $mcl = `which mcl`);

### programs from mafft
chomp(my $mafft = `which mafft`);

### programs from PHYLIP
chomp(my $seqboot  = `which seqboot`);
chomp(my $neighbor = `which neighbor`);
chomp(my $consense = `which consense`);
chomp(my $dnaml    = `which dnaml`);
chomp(my $dnadist  = `which dnadist`);
chomp(my $dnapars  = `which dnapars`);

# Counter incremented in the variation section for every core cluster that
# contributes SNPs to the phylogenetic tree.
my $count_tree = 0;

# When the pan-genome size is calculated and the number of strain
# combinations exceeds $sampleSize, only $sampleSize combinations are sampled.
# A larger value gives a better estimate but takes longer to compute.
# Suggested range: 5000 ~ 20,000
my $sampleSize = 8000;
|
|
39
|
|
#####################################################################
# Don't modify the following code unless you know what it does
#####################################################################

# Parsed command-line options, keyed by option name.
my %opt=qw();

# GetOptions returns false when it meets an unknown or malformed option (it
# has already reported the problem on STDERR). In that case force the help
# flag so the option check below prints the usage and stops, instead of
# silently running the pipeline with default values.
GetOptions(\%opt,"strains:s","input:s","output:s","cluster!","pangenome!","variation!","evolution!","function!","method:s","thread:i","score:f","evalue:f","coverage:f","local:f","global:f","identity:f","bootstrap:i","help|h!")
    or $opt{"help"}=1;

# Usage text, kept as a one-element array because it is printed with join().
my @usage=qq(
====== Pan-Genome Analysis Pipeline (PGAP) ======
Version 1.2.1

Usage: perl PGAP.pl [Options]

Options:
--strains String Input strains nicknames, and join them with '+', for example: A+B+C
--input String Input data directory
--output String Result output directory

--cluster Run homologous gene clustering
--pangenome Run pan-genome analysis
--variation Run homologous clusters variation analysis
--evolution Run evolution analysis
--function Run Function analysis

--method String GF for GeneFamily method, and MP for MultiParanoid method
for GF: fast, but not very accurate
evalue, score, identity, coverage are employed
for MP: slow, but more accurate
score, coverage, local, global are employed
--thread Int Number of processors to use in blastall. [default:1]
--score Int Minimum score in blastall. [default:40]
--evalue Decimal Maximal E-value in blastall. [default:1e-10]
--coverage Decimal Minimum alignment coverage for two homologous proteins. [default:0.5]
--local Decimal Minimum local alignment overlap in MP method. [default:0.25]
--global Decimal Minimum global alignment overlap in MP method. [default:0.5]
--identity Decimal Minimum alignment identity for two homologous proteins. [default:0.5]
--bootstrap Int Bootstrap times for phylogenetics tree. [default:1]

--h or help Display this message
);
|
|
80
|
|
81
|
|
############# specified variable #############
# Settings filled in from the command line (or from their defaults) below.
my ($inputDIR, $outputDIR);
my ($run_cluster, $run_pangenome, $run_variation, $run_evolution, $run_function);
my $method = "";
my ($thread, $score, $identity, $evalue, $coverage);
my ($global, $local);
my $bootstrap;


# Data shared between the analysis sections.
my (%pep, %nuc);          # sequences loaded by ReadSequenceInToHash
my $spnum;                # number of strains
my @clusters;             # cluster table rows
my $Cluster;
my @SpecieCombination;    # strain combinations used for the pan-genome curve
my @spID;                 # strain indexes of the current combination
my %genenum;              # gene count per strain
my (%aaAln, %ntAln);
my (%cog, %description);  # annotation filled by ReadAnnotation
#my %aa4tree; ### AA sequence for Phylogenetic Tree
my %nt4tree;              ### nucleotide sequence for Phylogenetic Tree
my @SNPPosition;          ### SNP position
my $dieMessage = "You did not run PGAP.pl in the program directory\n";
my $section;              # concatenated run_* flags

######### common temporary variable #############
my ($i, $j, $line);
my %tmpHash;
my (@tmp, $tmp);
my $key;
my @row;
my $inparacount;
my $ClusterID;
my $orth;
my @content;
my $clusterName;
my (@xdata, @ydata, @fit);

# Results of the curve fitting in the pan-genome section.
my ($fit_A, $fit_A_interval);
my $fit_B;
my ($fit_C, $fit_C_interval);
my $fit_Rsquare;
|
|
143
|
|
144
|
|
145
|
|
#### check option

my $opt_error=0;

# No option at all, or the help flag present: print the usage and stop.
if (!%opt or exists($opt{"help"})) {
    print join("\n",@usage)."\n";
    exit;
}


###################### public info
# --strains, --input and --output are declared with an optional value (":s"),
# so a bare "--input" stores an empty string. An empty value is treated as
# missing; otherwise the trailing-slash logic below would turn "" into "/".

### strains name
my @species;
if (defined($opt{"strains"})) {
    # drop empty names produced by input such as "A++B" or "+A"
    @species = grep { $_ ne "" } split(/\+/, $opt{"strains"});
}
$spnum = scalar(@species);
if ($spnum == 0) {
    print "Please assign strains nick name!\n";
    exit;
}

### input data directory
if (defined($opt{"input"}) and $opt{"input"} ne "") {
    $inputDIR = $opt{"input"};
    $inputDIR .= "/" if $inputDIR !~ /\/$/;    # always ends with "/"
} else {
    print "Please assign input data directory!\n\n";
    exit;
}

### output data directory
if (defined($opt{"output"}) and $opt{"output"} ne "") {
    $outputDIR = $opt{"output"};
    $outputDIR .= "/" if $outputDIR !~ /\/$/;  # always ends with "/"
} else {
    print "Please assign result output directory!\n\n";
    exit;
}
|
|
198
|
|
###################### section info

# The section switches are negatable ("cluster!", "pangenome!", ...), so
# "--nocluster" stores 0 under the key. Test the stored value rather than the
# presence of the key, otherwise the negated form would switch the section on.
$run_cluster   = $opt{"cluster"}   ? 1 : 0;
$run_pangenome = $opt{"pangenome"} ? 1 : 0;
$run_variation = $opt{"variation"} ? 1 : 0;
$run_evolution = $opt{"evolution"} ? 1 : 0;
$run_function  = $opt{"function"}  ? 1 : 0;
|
|
240
|
|
# Parameters that only matter when the clustering section runs.
if ($run_cluster) {
    ### method: GF or MP, case-insensitive on the command line
    if (exists($opt{"method"})) {
        $method = uc($opt{"method"});
        if ($method ne "GF" and $method ne "MP") {
            print "Unknown method: ".$opt{"method"}."\n";
            exit;
        }
    } else {
        print "Please assign the cluster method!\n\n";
        exit;
    }

    ## thread: must be a positive integer (negative values are rejected too)
    $thread = exists($opt{"thread"}) ? $opt{"thread"} : 1;
    if ($thread <= 0) {
        print "please assign an applicable thread value.\n";
        exit;
    }

    ## score
    $score = exists($opt{"score"}) ? $opt{"score"} : 40;
    if ($score <= 0) {
        print "please assign an applicable score value.\n";
        exit;
    }

    if ($method eq "GF") {
        ### identity, in (0, 1]
        $identity = exists($opt{"identity"}) ? $opt{"identity"} : 0.5;
        if ($identity > 1 or $identity <= 0) {
            print "identity should be 0 ~ 1 \n";
            exit;
        }

        ### evalue (not range-checked)
        $evalue = exists($opt{"evalue"}) ? $opt{"evalue"} : 1e-10;

        ### coverage, in (0, 1]
        $coverage = exists($opt{"coverage"}) ? $opt{"coverage"} : 0.5;
        if ($coverage > 1 or $coverage <= 0) {
            print "coverage should be 0 ~ 1 \n";
            exit;
        }
    }

    if ($method eq "MP") {
        ### global, in (0, 1]
        $global = exists($opt{"global"}) ? $opt{"global"} : 0.5;
        if ($global > 1 or $global <= 0) {
            print "global coverage should be 0 ~ 1 \n";
            exit;
        }

        ### local: only an explicitly given value is checked against global
        if (exists($opt{"local"})) {
            $local = $opt{"local"};
            if ($local <= 0) {
                print "local coverage should be 0 ~ [global coverage value] \n";
                exit;
            }
            if ($local > $global) {
                print "local coverage should be less than global coverage!\n";
                exit;
            }
        } else {
            $local = 0.25;
        }
    }
}
|
|
362
|
|
# Bootstrap count for the evolution section (default 1).
if ($run_evolution) {
    $bootstrap = exists($opt{"bootstrap"}) ? $opt{"bootstrap"} : 1;
    if ($bootstrap <= 0) {
        print "please assign an applicable bootstrap value.\n";
        # stop here, like every other parameter check; a non-positive count
        # would otherwise be carried into the tree-building steps
        exit;
    }
}
|
|
377
|
|
# Echo the effective settings of this run.
print "Program begin at ".localtime()."\n";
print "The following are the parameters for current process:\n";
print "Strains: ", join(",", @species), "\n";
print "Input directory: $inputDIR\n";
print "Output directory: $outputDIR\n";

if ($run_cluster) {
    print "Cluster analysis: yes\n",
          " Method: $method\n",
          " Thread: $thread\n";
    if ($method eq "GF") {
        print " E-value: $evalue\n",
              " Identity: $identity\n",
              " Coverage: $coverage\n",
              " Score: $score\n";
    }
    if ($method eq "MP") {
        print " Local: $local\n",
              " Global: $global\n",
              " Score: $score\n";
    }
} else {
    print "Cluster analysis: no\n";
}

print "Pan-genome analysis: ", ($run_pangenome ? "yes" : "no"), "\n";
print "Variation analysis: ", ($run_variation ? "yes" : "no"), "\n";

if ($run_evolution) {
    print "Evolution analysis: yes\n",
          " Bootstrap: $bootstrap\n";
} else {
    print "Evolution analysis: no\n";
}

print "Function analysis: ", ($run_function ? "yes" : "no"), "\n";

# Five-character string of 0/1 flags, one per section, in pipeline order;
# handed to CheckInputFile and CheckExtraProgram.
$section = join("", $run_cluster, $run_pangenome, $run_variation, $run_evolution, $run_function);
|
|
436
|
|
###############################################
# section 0) check input file and program
###############################################
# The directory is created with the mkdir builtin and chmod is run in list
# form, so a path containing spaces or shell metacharacters is never
# re-parsed by a shell.
if (!(-e $outputDIR)) {
    mkdir($outputDIR) or print "Can not create $outputDIR: $!\n";
}
system("chmod", "+rw", $outputDIR);

if (!(-w $outputDIR)) {
    print "There is no WRITE permission in $outputDIR\n";
    exit;
}

# Both checkers receive \@tmp; any messages found in it afterwards are
# treated as errors.
@tmp=qw();
&CheckInputFile(\@species,$inputDIR,$section,$method,\@tmp);
&CheckExtraProgram($section,$method,\@tmp);

if (scalar(@tmp)>0) {
    if (open(R, ">", $outputDIR."0.error.message")) {
        print R join("",@tmp)."\n";
        close(R);
        print "error!\nlog are saved in ${outputDIR}0.error.message\n";
    } else {
        # the log could not be written: show the messages instead of losing them
        print "error!\n".join("",@tmp)."\n";
    }
    exit;
}
|
|
463
|
|
464
|
|
############################################
# section 1) cluster analysis
############################################

if ($run_cluster) {
    print "\n\n############################################\n";
    print "# section 1) cluster analysis\n";
    print "############################################\n\n\n";

    #### cluster gene and return result to the array @clusters
    if ($method eq "MP") {
        print "Begin cluster gene with MP method ...\n";
        &MP();
    } else {
        print "Begin cluster gene with GF method ...\n";
        &GF();
    }

    #### output normal cluster format
    &FormatClusterOutPut(\@species,"${outputDIR}1.Orthologs_Cluster.txt",\@clusters);

    #### Retrieve cluster
    &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);

    ##### gene distribution in each strains
    %tmpHash=();
    &GeneDistribution(\@clusters,\%tmpHash);

    # Table layout: one row per conservation level (shared by $spnum strains
    # down to 1), one column per strain; %tmpHash is read with keys of the
    # form "<level>|<strain index>", missing keys print as 0.
    open(R, ">", "${outputDIR}1.Gene_Distribution_By_Conservation.txt")
        or die "Can not write ${outputDIR}1.Gene_Distribution_By_Conservation.txt: $!\n";
    print R "SharedBy_Strains\t".join("\t",@species)."\n";

    for ($i=$spnum;$i>0;$i--) {
        print R $i;
        for ($j=0;$j<$spnum;$j++) {
            print R "\t".(exists($tmpHash{$i."|".$j}) ? $tmpHash{$i."|".$j} : 0);
        }
        print R "\n";
    }

    close(R);
} else {
    print "Homologous gene clustering is skipped!\n";
}
|
|
525
|
|
526
|
|
if ($run_pangenome) {
    print "\n\n############################################\n";
    print "# section 2) Pan-genome analysis\n";
    print "############################################\n\n\n";

    #### Retrieve cluster
    &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
    chomp(@clusters);

    #### convert file into 0-1 matrix
    # The first column (cluster ID) is dropped; "-" becomes 0, anything else 1.
    for ($line=0;$line<@clusters;$line++) {
        @row=split(/\t/,$clusters[$line]);
        splice(@row,0,1);
        @row=map { $_ eq "-" ? 0 : 1 } @row;
        $clusters[$line]=join("\t",@row);
    }

    #### fetch gene number of each strains
    # A gene is counted for every FASTA header line (">") of the .pep file.
    for ($i=0;$i<$spnum;$i++) {
        open(F, "<", "$inputDIR$species[$i].pep")
            or die "Can not read $inputDIR$species[$i].pep: $!\n";
        @tmp=grep(/^>/,<F>);
        close(F);
        $genenum{$species[$i]}=scalar(@tmp);
    }

    #### pan genome size and core genome size
    print "Deducing pan genome size and core genome size for each composition...\n\n";

    open(PAN, ">", "${outputDIR}2.PanGenome.Data.txt")
        or die "Can not write ${outputDIR}2.PanGenome.Data.txt: $!\n";
    print PAN "ClusterConservation\tTotalGeneNumber\tPanGenome\tCoreGenome\n";

    for ($i=1;$i<=scalar(@species);$i++) {
        # Non-zero from ChkCombinationValue: take every combination of $i
        # strains; zero: sample instead.
        # NOTE(review): presumably the threshold is $sampleSize - confirm in
        # ChkCombinationValue.
        if (&ChkCombinationValue($spnum,$i) != 0) {
            &Combination($spnum,$i,\@SpecieCombination);
        } else {
            &SampleCombination($spnum,$i,\@SpecieCombination);
        }

        foreach $key (@SpecieCombination) {
            ##### count total gene number in current combination
            $tmp=0;
            @spID=split(/\t/,$key); #### species id in current combination
            foreach (@spID) {
                $tmp=$tmp+$genenum{$species[$_]};
            }
            ##### scan pangenome and coregenome
            @tmp=split(/\t/,&PanGenomeNumber(\@spID));
            print PAN "$i\t$tmp\t".join("\t",@tmp)."\n";
        }
    }

    close(PAN);

    #### data fit
    if ($spnum<3) {
        print "There are $spnum strains. For pan-genome function fitting, at least 3 strains data are required.\n";
    } else {
        open(R, ">", "${outputDIR}2.PanGenome.Profile.txt")
            or die "Can not write ${outputDIR}2.PanGenome.Profile.txt: $!\n";

        ##### genome number & pan-genome size (model A, data columns 0 and 2)
        @xdata=qw();
        @ydata=qw();
        &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,2);
        &SumData(\@xdata,\@ydata,"mean");
        ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_A(\@xdata,\@ydata);
        print R "The relation bewteen genome number and pan-genome size\n\n";
        print R "Function model: y=A*x**B +C \n";
        print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n";
        print R "Fitting result:\n";
        print R "\ty = $fit_A *x**$fit_B + $fit_C\n";
        print R "\tR-square = $fit_Rsquare\n";
        print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n";
        print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n";

        ##### genome number & core genome (model B, data columns 0 and 3)
        @xdata=qw();
        @ydata=qw();
        &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,3);
        &SumData(\@xdata,\@ydata,"mean");
        ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_B(\@xdata,\@ydata);
        print R "The relation bewteen genome number and core genome size\n\n";
        print R "Function model: y=A*exp(B*x) +C \n";
        # y is the core genome size in this fit (the report used to say pan-genome)
        print R "\ty denotes core genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n";
        print R "Fitting result:\n";
        print R "\ty = $fit_A *exp($fit_B * x) + $fit_C R-square = $fit_Rsquare\n";
        print R "\tR-square = $fit_Rsquare\n";
        print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n";
        print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n";
        close(R);
    }
}
|
|
656
|
|
657
|
|
############################################
# section 3) CDS variation analysis
############################################

if ($run_variation) {
    print "\n\n############################################\n";
    print "# section 3) CDS variation analysis\n";
    print "############################################\n\n\n";

    #### Retrieve cluster
    &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
    chomp(@clusters);

    # NOTE: this section works in the current directory and removes every
    # *.pep / *.nuc file found there.

    ## protein
    system("rm -rf *.pep");
    &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
    system("cat *.pep > All.faa && rm -rf *.pep && mv All.faa All.pep");
    &ReadSequenceInToHash("All.pep",\%pep);
    ## nucleic
    system("rm -rf *.nuc");
    &PrepareFasta(\@species,$inputDIR,".nuc"); ###prepare nuc file
    system("cat *.nuc > All.ffn && rm -rf *.nuc && mv All.ffn All.nuc");
    &ReadSequenceInToHash("All.nuc",\%nuc);

    ## scanning SNP
    # one (initially empty) concatenated sequence per strain, keyed "S<index>"
    %nt4tree=();
    for ($i=0;$i<$spnum;$i++) {
        $nt4tree{"S".$i}="";
    }

    open(VAR, ">", "${outputDIR}3.CDS.variation.txt")
        or die "Can not write ${outputDIR}3.CDS.variation.txt: $!\n";
    print VAR "ClusterID\tStrains_Number\tGene_Number\tPosition\taaType\tntType\tntProfile\tVariation type\n";

    open(VA, ">", "${outputDIR}3.CDS.variation.analysis.txt")
        or die "Can not write ${outputDIR}3.CDS.variation.analysis.txt: $!\n";
    print VA "ClusterID\tInDel Base\tNonsynonymous mutation\tSynonymous mutation\n";

    for ($line=0;$line<@clusters;$line++) {
        @row=split(/\t|\,/,$clusters[$line]);
        $ClusterID=$row[0];
        splice(@row,0,1);
        @row=grep(/^S/,@row);          # keep gene IDs, drop "-" placeholders
        next if scalar(@row) < 2;      # nothing to compare within a single gene

        # per-cluster work files: protein and nucleotide FASTA
        open(PEP, ">", "$ClusterID.pep") or die "Can not write $ClusterID.pep: $!\n";
        open(NUC, ">", "$ClusterID.nuc") or die "Can not write $ClusterID.nuc: $!\n";
        foreach $key (@row) {
            print PEP ">$key\n$pep{$key}\n";
            print NUC ">$key\n$nuc{$key}\n";
        }
        close(PEP);
        close(NUC);

        system("$mafft --quiet $ClusterID.pep > $ClusterID.pal");
        $tmp=&pal2nal("$ClusterID.pal","$ClusterID.nuc","$ClusterID.nal");
        if ($tmp == 0) {
            # pal2nal reported failure: discard the work files, skip the cluster
            system("rm -rf $ClusterID.*");
            next;
        }

        @tmp=&DetectSNP();
        if (scalar(@tmp)>0) {
            print VA $ClusterID."\t".&VarAnalysis(\@tmp)."\n";
            print VAR join("",@tmp);
            ### core orthologs
            # exactly one gene in every strain: feed the cluster to the tree data
            @row=split(/\t/,$clusters[$line]);
            splice(@row,0,1);
            if ((&CountGeneInCluster(join("\t",@row)) == $spnum) and (&CountSpeicesInCluster(join("\t",@row)) == $spnum)) {
                $count_tree++;
                %tmpHash=();
                foreach (@row) {
                    $tmpHash{$_}="";
                }
                &RemoveHeadGap("$ClusterID.nal",\%tmpHash);
                &ExtractSNP4tree(\%tmpHash,\%nt4tree);
            }
        }
        system("rm -rf $ClusterID.*");
    }
    close(VAR);
    close(VA);

    open(R, ">", "${outputDIR}3.CDS.variation.for.evolution.txt")
        or die "Can not write ${outputDIR}3.CDS.variation.for.evolution.txt: $!\n";
    foreach $key (keys %nt4tree) {
        # "S<index>" -> index into @species, without touching the global $_
        (my $strain_index = $key) =~ s/s//gi;
        print R ">$species[$strain_index]\n$nt4tree{$key}\n";
    }
    close(R);
    print $count_tree."\n\n";

    ### remove the merged sequence files
    unlink("All.nuc", "All.pep");
} else {
    print "CDS variation is skipped.\n";
}
|
|
764
|
|
############################################
# section 4) Evolution analysis
############################################

if ($run_evolution) {
    #### Retrieve cluster
    &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
    chomp(@clusters);

    ##################
    ##
    ## Distance based
    ##
    ##################

    ### calculate the distance between each two strains by cluster
    &ClusterProfil4Specie(\%tmpHash); # calculate Clusters profile for each specie
    &DistanceMatrix($spnum,\%tmpHash); # calculate distance matrix according to Clusters profile

    ### output distance
    # Layout: strain count on the first line, then one row per strain with a
    # 10-character placeholder name "sp<index>sp" followed by the distances
    # read from %tmpHash under the keys "<i>-<j>". ReplaceName is called on
    # the "4*" output files at the end of this section.
    open(DIST, ">", "${outputDIR}4.Species_Distance_Clusters_Based.txt")
        or die "Can not write ${outputDIR}4.Species_Distance_Clusters_Based.txt: $!\n";
    ### header
    printf DIST "%5d", $spnum;
    print DIST "\n";
    foreach $i (0..($spnum-1)) {
        $key="sp".$i."sp";
        printf DIST "%-10s",$key;
        foreach $j (0..($spnum-1)) {
            printf DIST " %8f",$tmpHash{$i."-".$j};
        }
        print DIST "\n";
    }
    close(DIST);

    %tmpHash=();

    ### based on pan genome (distance)
    print "\nDraw pangenome based phylogenetic tree ...\n\n";

    &PanBasedTree("${outputDIR}4.Species_Distance_Clusters_Based.txt","${outputDIR}4.PanBased");

    ##################
    ##
    ## SNP based
    ##
    ##################
    %tmpHash=();
    if (!(-e "${outputDIR}3.CDS.variation.for.evolution.txt")) {
        print "Variation in core orthologs cluster is not found from ${outputDIR}3.CDS.variation.for.evolution.txt.\n";
        print "Maybe you have skipped CDS variation analysis.\n";
    } else {
        &ReadSequenceInToHash("${outputDIR}3.CDS.variation.for.evolution.txt",\%tmpHash);
        # rewrite the alignment with the placeholder names "sp<index>sp"
        open(R, ">", "mlst.aln") or die "Can not write mlst.aln: $!\n";
        for ($i=0;$i<@species;$i++) {
            print R ">sp${i}sp\n".$tmpHash{$species[$i]}."\n";
        }
        close(R);
        &fasta2phylip("mlst.aln","mlst.phylip");
        system("rm -rf mlst.aln");

        print "\nDraw SNP based phylogenetic tree ...\n\n";
        &SNPBasedTree("mlst.phylip","${outputDIR}4.SNPBased");
        system("rm -rf mlst.phylip");
    }

    ########
    # replace species name in every output file whose name starts with "4"
    ########
    opendir(DIR,"${outputDIR}") or die "Can not read directory ${outputDIR}: $!\n";
    @tmp=readdir(DIR);
    closedir(DIR);
    @tmp=grep(/^4/,@tmp);
    foreach $tmp (@tmp) {
        &ReplaceName(\@species,"${outputDIR}$tmp");
    }
} else {
    print "Evolution analysis is skipped.\n";
}
|
|
853
|
|
854
|
|
############################################
# section 5) Function analysis
############################################

if ($run_function) {
    #### Retrieve cluster
    &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
    chomp(@clusters);

    #### prepare annotation file
    &PrepareTable(\@species,$inputDIR,".function"); ###prepare location file
    &ReadAnnotation(\@species,\%cog,\%description);

    #### assign function
    my $function_table="${outputDIR}5.Orthologs_Cluster_Function.txt";
    open(R, ">", $function_table) or die "Can not write $function_table: $!\n";
    print R "ClusterID\tConservation_Level\tCOG\tDescription\n";
    for ($i=0;$i<@clusters;$i++) {
        @row=split(/\t/,$clusters[$i]);
        $ClusterID=$row[0];
        splice(@row,0,1);
        print R $ClusterID."\t".&CountSpeicesInCluster(join("\t",@row))."\t".&getCOG(\@row,\%cog)."\t".&getDescription(\@row,\%description)."\n";
    }
    close(R);

    #### COG distribution
    # scanCOG(file, max, min) counts the clusters whose conservation level
    # lies between min and max strains (inclusive).

    ### Whole Clusters COG Distribution
    &outputCOGStatistic("${outputDIR}5.Orthologs_Whole_Cluster_COG_Distribution.txt",&scanCOG($function_table,$spnum,1));

    ### Core Clusters COG Distribution
    &outputCOGStatistic("${outputDIR}5.Orthologs_Core_Cluster_COG_Distribution.txt",&scanCOG($function_table,$spnum,$spnum));

    ### Dispensable Clusters COG Distribution
    &outputCOGStatistic("${outputDIR}5.Orthologs_Dispensable_Cluster_COG_Distribution.txt",&scanCOG($function_table,($spnum-1),2));

    ### strains specific Clusters COG Distribution
    &outputCOGStatistic("${outputDIR}5.Orthologs_specifc_Cluster_COG_Distribution.txt",&scanCOG($function_table,1,1));

    system("rm -rf *.function");
} else {
    print "Function analysis is skipped.\n";
}
|
|
902
|
|
# Write a COG category report.
#   $file        - path of the report; created or overwritten
#   $subcogcount - hash ref mapping a one-letter COG category (or "-") to a
#                  count, in the shape returned by scanCOG
# For each of the four major groups the report holds a title line with the
# summed count, one "count description" line per category and a blank line.
# Categories missing from the hash are reported as 0.
# Dies when the report can not be written. Returns 1 on success.
sub outputCOGStatistic
{
    my ($file, $subcogcount) = @_;

    # major group title => its categories, in report order
    my @groups = (
        ["INFORMATION STORAGE AND PROCESSING", [qw(J A K L B)]],
        ["CELLULAR PROCESSES AND SIGNALING",   [qw(D Y V T M N Z W U O)]],
        ["METABOLISM",                         [qw(C G E F H I P Q)]],
        ["POORLY CHARACTERIZED",               [qw(R S -)]],
    );
    my %subcogdesc = (
        "J" => "[J] Translation, ribosomal structure and biogenesis",
        "A" => "[A] RNA processing and modification",
        "K" => "[K] Transcription",
        "L" => "[L] Replication, recombination and repair",
        "B" => "[B] Chromatin structure and dynamics",
        "D" => "[D] Cell cycle control, cell division, chromosome partitioning",
        "Y" => "[Y] Nuclear structure",
        "V" => "[V] Defense mechanisms",
        "T" => "[T] Signal transduction mechanisms",
        "M" => "[M] Cell wall/membrane/envelope biogenesis",
        "N" => "[N] Cell motility",
        "Z" => "[Z] Cytoskeleton",
        "W" => "[W] Extracellular structures",
        "U" => "[U] Intracellular trafficking, secretion, and vesicular transport",
        "O" => "[O] Posttranslational modification, protein turnover, chaperones",
        "C" => "[C] Energy production and conversion",
        "G" => "[G] Carbohydrate transport and metabolism",
        "E" => "[E] Amino acid transport and metabolism",
        "F" => "[F] Nucleotide transport and metabolism",
        "H" => "[H] Coenzyme transport and metabolism",
        "I" => "[I] Lipid transport and metabolism",
        "P" => "[P] Inorganic ion transport and metabolism",
        "Q" => "[Q] Secondary metabolites biosynthesis, transport and catabolism",
        "R" => "[R] General function prediction only",
        "S" => "[S] Function unknown",
        "-" => "[-] Unclassified",
    );

    open(my $out, ">", $file) or die "Can not write $file: $!\n";
    foreach my $group (@groups) {
        my ($title, $letters) = @$group;

        my $cognum = 0;
        foreach my $letter (@$letters) {
            $cognum += ($$subcogcount{$letter} || 0);
        }
        print $out $title." ( ".$cognum." )\n";

        foreach my $letter (@$letters) {
            printf $out "%-6d %s\n", ($$subcogcount{$letter} || 0), $subcogdesc{$letter};
        }
        print $out "\n";
    }
    # buffered write errors only surface at close
    close($out) or die "Can not finish writing $file: $!\n";

    return 1;
}
|
|
940
|
|
# Count COG categories in a cluster function table.
#   $file     - tab-separated table with one header line; column 2 is the
#               conservation level, column 3 the COG assignment
#   $max_orth - highest conservation level to include (inclusive)
#   $min_orth - lowest conservation level to include (inclusive)
# Returns a hash ref: one-letter category (J A K ... S) or "-" => number of
# clusters. A cluster whose COG column is exactly "-" counts under "-";
# otherwise it counts once for every category letter found in the column
# after the text "COG" has been removed.
# Dies when the table can not be read. The global $_ is left untouched.
sub scanCOG
{
    my ($file, $max_orth, $min_orth) = @_;

    my @letters = qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S);
    my %subcogcount = map { $_ => 0 } (@letters, "-");

    open(my $in, "<", $file) or die "Can not read $file: $!\n";
    my $header = <$in>;    # skip the column titles

    while (my $line = <$in>) {
        chomp($line);
        my @row = split(/\t/, $line);
        next if scalar(@row) < 3;    # incomplete row: no COG column
        next unless ($row[1] >= $min_orth and $row[1] <= $max_orth);

        if ($row[2] eq "-") {
            $subcogcount{"-"}++;
            next;
        }

        # drop "COG" first so its letters C, O and G are not counted
        (my $cog = uc($row[2])) =~ s/COG//g;
        foreach my $letter (@letters) {
            $subcogcount{$letter}++ if index($cog, $letter) >= 0;
        }
    }
    close($in);

    return \%subcogcount;
}
|
|
986
|
|
987
|
|
988
|
|
989
|
|
# Collect the COG assignments of all genes in a cluster.
#   $data    - array ref of cluster columns; a column holds "-" or one or
#              more comma-separated gene IDs (gene IDs start with "S")
#   $coghash - hash ref: gene ID => COG assignment (may itself be a
#              comma-separated list, "-" or "")
# Returns the distinct assignments joined with ",", in sorted order so the
# result is reproducible between runs, or "-" when no gene has one. Genes
# missing from $coghash are ignored.
sub getCOG
{
    my ($data, $coghash) = @_;
    my %seen;

    my @gene = grep { /^S/ } split(/\t|\,/, join("\t", @$data));
    foreach my $gene (@gene) {
        my $assigned = $$coghash{$gene};
        next unless defined($assigned);
        next if $assigned eq "-" or $assigned eq "";

        foreach my $cog (split(/,/, $assigned)) {
            $seen{$cog} = 1 if $cog ne "";
        }
    }

    return "-" unless %seen;
    return join(",", sort keys %seen);
}
|
|
1023
|
|
# Pick a functional description for a cluster.
#   $data     - array ref of cluster columns; a column holds "-" or one or
#               more comma-separated gene IDs (gene IDs start with "S")
#   $deschash - hash ref: gene ID => description
# Returns the description of the last gene (in column order) whose
# description is present, not "", not "-" and does not contain
# "hypothetical"; "hypothetical protein" when there is none. Genes missing
# from $deschash are ignored.
sub getDescription
{
    my ($data, $deschash) = @_;
    my $desc = "";

    my @gene = grep { /^S/ } split(/\t|\,/, join("\t", @$data));
    foreach my $gene (@gene) {
        my $candidate = $$deschash{$gene};
        next unless defined($candidate);
        next if $candidate eq "" or $candidate eq "-";
        next if $candidate =~ /hypothetical/;
        $desc = $candidate;    # a later informative gene overrides an earlier one
    }

    return $desc eq "" ? "hypothetical protein" : $desc;
}
|
|
1047
|
|
1048
|
|
1049
|
|
# Load the annotation tables "<strain>.function" from the current directory.
#   $species     - array ref of strain names
#   $cog         - hash ref filled with gene ID => COG assignment
#                  ("-" when the row has no second column)
#   $description - hash ref filled with gene ID => description
#                  ("hypothetical protein" when the row has no third column)
# Each row is tab-separated: gene ID, COG, description. Blank lines are
# skipped. A table that can not be opened is reported with a warning and
# skipped, so the remaining strains are still loaded.
sub ReadAnnotation
{
    my ($species, $cog, $description) = @_;

    foreach my $name (@$species) {
        my $file = "$name.function";
        my $in;
        unless (open($in, "<", $file)) {
            warn "Can not read $file: $!\n";
            next;
        }

        while (my $line = <$in>) {
            chomp($line);
            my @row = split(/\t/, $line);
            next unless @row;    # blank line: no gene ID to store under

            $$cog{$row[0]}         = (scalar(@row) >= 2) ? $row[1] : "-";
            $$description{$row[0]} = (scalar(@row) >= 3) ? $row[2] : "hypothetical protein";
        }
        close($in);
    }

    return;
}
|
|
1082
|
|
1083
|
|
# SNPBasedTree($infile, $outfileprefix)
#
# Drives the PHYLIP programs seqboot, dnaml, dnadist, neighbor and consense
# (paths and $bootstrap come from file-level variables) to build three trees
# from $infile:
#   ${outfileprefix}.ML.{outfile,tree}
#   ${outfileprefix}.Neighbor-joining.{outfile,tree}
#   ${outfileprefix}.UPGMA.{outfile,tree}
# Each program is fed its menu answers on STDIN from a small ".cmd" file and
# works on the fixed file names infile/intree/outfile/outtree in the current
# directory, so the order of the copy/move/remove steps matters.
# NOTE(review): $infile is presumably a PHYLIP-format alignment - confirm
# against the caller.
# Dies when a command file cannot be written.
sub SNPBasedTree()
{
    (my $infile,my $outfileprefix)=@_;

    # Writes the given menu answers, one per line, into a command file.
    my $write_cmd=sub
    {
        (my $cmdfile,my @answers)=@_;
        open(my $cmd,">",$cmdfile) or die "Could not write into $cmdfile: $!\n";
        print {$cmd} "$_\n" foreach (@answers);
        close($cmd) or die "Could not finish writing $cmdfile: $!\n";
    };

    #### bootstrap
    print "\n#### seqboot ...\n\n";
    &$write_cmd("seqboot.cmd","R",$bootstrap,"Y","1");

    system("cp $infile infile");
    system("$seqboot < seqboot.cmd");
    system("mv outfile 100dnaseq");
    system("rm -rf infile");
    system("rm seqboot.cmd");    # left: 100dnaseq

    #### dnaml
    print "\n#### dnaml ...\n\n";
    # "T" followed by "25" presumably sets the transition/transversion ratio;
    # "M","D" presumably switch to multiple data sets - confirm in the PHYLIP menu
    my @dnaml_answers=("T","25");
    if ($bootstrap>1)
    {
        push(@dnaml_answers,
            "M","D",$bootstrap,
            "1",     # Random number seed (must be odd)?
            "5");    # Number of times to jumble?
    }
    &$write_cmd("dnaml.cmd",@dnaml_answers,"Y");

    system("cp 100dnaseq infile");
    system("$dnaml < dnaml.cmd");
    system("rm -rf outfile");
    system("rm -rf infile");
    system("mv outtree 100dnaseqtree");    # left: 100dnaseq, 100dnaseqtree

    #### consense
    print "\n#### dnaml consense ...\n\n";
    &$write_cmd("consense.cmd","Y");

    system("cp 100dnaseqtree intree");
    system("$consense < consense.cmd");
    system("mv outfile ${outfileprefix}.ML.outfile");
    system("mv outtree ${outfileprefix}.ML.tree");
    # the temporary input of this step is "intree"; the previous code removed
    # "infile" here, which had already been deleted after dnaml
    system("rm -rf infile intree");
    system("rm -rf 100dnaseqtree");    # left: 100dnaseq

    #### dnadist
    print "\n#### dnadist ...\n\n";
    my @dnadist_answers=("T","25");
    push(@dnadist_answers,"M","D",$bootstrap) if ($bootstrap>1);
    &$write_cmd("dnadist.cmd",@dnadist_answers,"Y");

    system("cp 100dnaseq infile");
    system("$dnadist < dnadist.cmd");
    system("rm -rf 100dnaseq");
    system("rm -rf infile");
    system("mv outfile 100dnadist");    # left: 100dnadist

    #### Neighbor-joining tree
    print "\n#### Neighbor-joining ...\n\n";
    my @nj_answers;
    push(@nj_answers,"M",$bootstrap,"1") if ($bootstrap>1);
    &$write_cmd("NJ.cmd",@nj_answers,"Y");

    system("cp 100dnadist infile");
    system("$neighbor < NJ.cmd");
    system("mv outtree 100dnadistNJtree");
    system("rm outfile");
    system("rm -rf infile");
    system("rm -rf NJ.cmd");    # left: 100dnadist, 100dnadistNJtree

    #### NJ-consense
    print "\n#### NJ-consense ...\n\n";
    &$write_cmd("NJ-consense.cmd","Y");

    system("cp 100dnadistNJtree intree");
    system("$consense < NJ-consense.cmd");
    system("mv outfile ${outfileprefix}.Neighbor-joining.outfile");
    system("mv outtree ${outfileprefix}.Neighbor-joining.tree");
    system("rm -rf NJ-consense.cmd");
    system("rm -rf intree");
    system("rm -rf 100dnadistNJtree");

    #### UPGMA tree
    print "\n#### UPGMA ...\n\n";
    # the leading "N" answer is what distinguishes this run from the
    # neighbor-joining run above
    my @upgma_answers=("N");
    push(@upgma_answers,"M",$bootstrap,"1") if ($bootstrap>1);
    &$write_cmd("UPGMA.cmd",@upgma_answers,"Y");

    system("cp 100dnadist infile");
    system("$neighbor < UPGMA.cmd");
    system("mv outtree 100dnadistUPGMAtree");
    system("rm -rf outfile");
    system("rm -rf infile");
    system("rm -rf UPGMA.cmd");

    #### UPGMA-consense
    print "\n#### UPGMA-consense ...\n\n";
    &$write_cmd("UPGMA-consense.cmd","Y");

    system("cp 100dnadistUPGMAtree intree");
    system("$consense < UPGMA-consense.cmd");
    system("mv outfile ${outfileprefix}.UPGMA.outfile");
    system("mv outtree ${outfileprefix}.UPGMA.tree");
    system("rm -rf UPGMA-consense.cmd");
    system("rm -rf 100dnadistUPGMAtree");
    system("rm -rf intree");

    ### clean temporary files (removes every *.cmd in the current directory)
    system("rm -rf *.cmd");
    system("rm -rf 100dnadist");
}
|
|
1247
|
|
# PanBasedTree($infile, $outfileprefix)
#
# Runs the PHYLIP "neighbor" program (path in the file-level $neighbor) twice
# on $infile: once with the answer "Y" (results stored as
# ${outfileprefix}.Neighbor-joining.*) and once with the answers "N","Y"
# (results stored as ${outfileprefix}.UPGMA.*).  The program reads "infile"
# and writes "outfile"/"outtree" in the current directory; all temporary
# files, including every *.cmd, are removed afterwards.
sub PanBasedTree()
{
    (my $infile,my $outfileprefix)=@_;

    # command-file stem, result label, menu answers
    my @runs=(
        ["NJ","Neighbor-joining",["Y"]],
        ["UPGMA","UPGMA",["N","Y"]],
    );

    foreach my $run (@runs)
    {
        (my $stem,my $label,my $answers)=@$run;

        open(R,">$stem.cmd");
        foreach my $answer (@$answers)
        {
            print R "$answer\n";
        }
        close(R);

        system("cp $infile infile");
        system("$neighbor < $stem.cmd");
        system("mv outfile ${outfileprefix}.$label.outfile");
        system("mv outtree ${outfileprefix}.$label.tree");
        system("rm -rf $stem.cmd");
        system("rm -rf infile");
    }

    ### clean temporary files
    system("rm -rf *.cmd");
}
|
|
1288
|
|
# DistanceMatrix($spnum, \%hash)
#
# %$hash holds one string per index 0..$spnum-1.  For every pair (i,j) with
# i <= j the number of positions at which the two strings differ is counted
# over the length of string i, and stored back into the same hash under the
# keys "i-j" and "j-i".
sub DistanceMatrix()
{
    (my $spnum,my $hash)=@_;

    foreach my $first (0..($spnum-1))
    {
        foreach my $second ($first..($spnum-1))
        {
            my $ref=$$hash{$first};
            my $query=$$hash{$second};
            my $mismatches=scalar(grep { substr($ref,$_,1) ne substr($query,$_,1) } 0..(length($ref)-1));
            $$hash{$first."-".$second}=$mismatches;
            $$hash{$second."-".$first}=$mismatches;
        }
    }
}
|
|
1317
|
|
1318
|
|
# ClusterProfil4Specie(\%hash)
#
# Builds a presence/absence profile per strain from the file-level @clusters.
# %$hash is initialised with an empty string for every index 0..$spnum-1.
# For each cluster present in more than one strain (as counted by
# CountSpeicesInCluster) one character is appended per column: "0" when the
# column is "-", "1" otherwise.  The first tab-separated field of a cluster
# line is dropped before the columns are examined.
sub ClusterProfil4Specie
{
    (my $hash)=@_;

    foreach my $sp (0..($spnum-1))    # initialise the profiles
    {
        $$hash{$sp}="";
    }

    foreach my $cluster (@clusters)
    {
        # Work on a copy so that @clusters is not modified, and strip the line
        # ending: otherwise a final "-\n" column would be scored as present.
        my $line=$cluster;
        chomp($line);
        my @row=split(/\t/,$line);
        splice(@row,0,1);
        if (&CountSpeicesInCluster(join("\t",@row))>1)
        {
            for (my $i=0;$i<@row;$i++)
            {
                $$hash{$i}.=($row[$i] eq "-") ? "0" : "1";
            }
        }
    }
}
|
|
1349
|
|
1350 # &ExtractSNP4tree(\%tmpHash,\%nt4tree);
|
|
1351
|
|
# ExtractSNP4tree(\%hash, \%nt4tree)
#
# %$hash maps gene ids to sequences.  Every key containing a "G" is renamed
# to the part in front of the first "G" (the hash is modified in place);
# keys without a "G" are left alone.  Afterwards, for every index
# 0..$spnum-1 (file-level $spnum), the sequence stored under "S<index>" is
# appended to the entry of the same name in the hash referenced by the second
# argument.  A strain without a sequence contributes nothing.
# When the second argument is omitted the file-level %nt4tree is used, which
# is what the previous code always wrote to.
sub ExtractSNP4tree()
{
    (my $hash,my $nt4treeRef)=@_;
    $nt4treeRef=\%nt4tree if (!defined($nt4treeRef));

    foreach my $key (keys %$hash)
    {
        my $cut=index($key,"G");
        next if ($cut<0);    # nothing to strip; substr with -1 would eat the last character
        $$hash{substr($key,0,$cut)}=$$hash{$key};
        delete($$hash{$key});
    }

    for (my $i=0;$i<$spnum;$i++)
    {
        my $segment=$$hash{"S".$i};
        $segment="" if (!defined($segment));
        $$nt4treeRef{"S".$i}.=$segment;
    }
}
|
|
1371
|
|
1372
|
|
1373 =pod
|
|
1374 sub ExtractSNP4tree()
|
|
1375 {
|
|
1376 (my $hash,my $nt4treeRef)=@_;
|
|
1377 my $key;
|
|
1378 my @row;
|
|
1379 my $i;
|
|
1380 my $len;
|
|
1381 my @tribases;
|
|
1382 foreach $key (keys %$hash)
|
|
1383 {
|
|
1384 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key};
|
|
1385 delete($$hash{$key});
|
|
1386 }
|
|
1387 @_=(keys %$hash);
|
|
1388 $len=length($_[0]);
|
|
1389 for ($j=0;3*$j<$len;$j++)
|
|
1390 {
|
|
1391 ##### scanning each codon
|
|
1392 for ($i=0;$i<$spnum;$i++)
|
|
1393 {
|
|
1394 $tribases[$i]=substr($$hash{"S".$i},3*$j,3);
|
|
1395 }
|
|
1396 ##### checking each codon
|
|
1397 if (&IsTheSame(@tribases) ==0)
|
|
1398 {
|
|
1399 for ($i=0;$i<@tribases;$i++)
|
|
1400 {
|
|
1401 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$tribases[$i];
|
|
1402 }
|
|
1403 }
|
|
1404 }
|
|
1405 }
|
|
1406 =cut
|
|
1407
|
|
1408
|
|
# pal2nal($pal, $nuc, $nal)
#
# Converts a protein alignment into the matching codon alignment.
#   $pal - protein alignment file, read with ReadAlignmentToHash
#   $nuc - nucleotide sequences, read with ReadSequenceInToHash
#   $nal - output file; one ">name" line and one sequence line per entry
# Every "-" of the protein alignment becomes "---"; every other character
# consumes the next codon of the nucleotide sequence with the same name.
# Returns 1 on success.  Returns 0, after printing a message, when a
# nucleotide sequence length is not a multiple of three or when the output
# file cannot be written.
sub pal2nal()
{
    (my $pal,my $nuc,my $nal)=@_;
    my %aaAln=();
    my %ffn=();
    my %codons;    # name => reference to the list of codons
    my %ntAln=();

    ### read protein alignment result
    &ReadAlignmentToHash("$pal",\%aaAln);
    ### read nt sequences
    &ReadSequenceInToHash("$nuc",\%ffn);

    foreach my $key (keys %ffn)
    {
        my $dna=$ffn{$key};
        if (length($dna)%3!=0)
        {
            print "The length of nucleotide sequence is not 3 integer times.\n";
            return 0;
        }
        $codons{$key}=[unpack("(a3)*",$dna)];
    }

    ### replace aa with corresponding nt
    foreach my $key (keys %aaAln)
    {
        my $triplets=exists($codons{$key}) ? $codons{$key} : [];
        my $next=0;
        my $nt="";
        foreach my $residue (split(//,$aaAln{$key}))
        {
            if ($residue eq "-")
            {
                $nt.="---";
            }
            else
            {
                # a protein longer than its nucleotide sequence simply runs
                # out of codons; nothing is appended in that case
                my $codon=$$triplets[$next++];
                $nt.=$codon if (defined($codon));
            }
        }
        $ntAln{$key}=$nt;
    }

    ### output
    my $out;
    if (!open($out,">",$nal))
    {
        print "Could not write into $nal: $!\n";
        return 0;
    }
    foreach my $key (keys %ntAln)
    {
        print {$out} ">$key\n".$ntAln{$key}."\n";
    }
    if (!close($out))
    {
        print "Could not finish writing $nal: $!\n";
        return 0;
    }

    return 1;
}
|
|
1479
|
|
1480
|
|
1481
|
|
1482
|
|
# DetectSNP()
#
# Scans the alignments of the cluster named by the file-level $ClusterID
# ("$ClusterID.pep" for the gene list, "$ClusterID.pal" protein alignment,
# "$ClusterID.nal" codon alignment) column by column and returns a list of
# tab-separated report lines, each ending in "\n":
#   ClusterID, strain count, gene count, position, amino acids,
#   bases, base string, type
# For a codon column that differs between genes:
#   - amino acids differ and one of them is "-": one "InDel" line at
#     position i with "-" in both base fields;
#   - amino acids differ otherwise: one "Nonsynonymous mutation" line per
#     differing base, at position i+0.1, i+0.2 or i+0.3;
#   - amino acids agree: the same, labelled "Synonymous mutation".
# Unlike the earlier version this sub no longer overwrites $_ or @_.
sub DetectSNP()
{
    my %faa;
    my %ffn;
    my @cdsvar=qw();
    my @genelist;
    my @position_tag=(0.1,0.2,0.3);    # added to the codon number per base

    ### fetch gene list
    if (open(my $pep,"<","$ClusterID.pep"))
    {
        @genelist=grep(/^>/,<$pep>);
        close($pep);
    }
    else
    {
        warn "Could not open $ClusterID.pep: $!\n";
    }
    chomp(@genelist);
    # join, delete every ">" and split again on tabs, exactly as before
    my $headers=join("\t",@genelist);
    $headers=~s/>//g;
    @genelist=split(/\t/,$headers);

    ### count gene number and species number
    my $cluster_line=$clusters[$ClusterID-1];
    chomp($cluster_line);    # a trailing newline must not turn "-" into a gene
    my @row=split(/\t/,$cluster_line);
    splice(@row,0,1);
    my $count_sp=&CountSpeicesInCluster(join("\t",@row));
    my $count_gene=&CountGeneInCluster(join("\t",@row));

    ### read alignment sequences
    &ReadAlignmentToHash("$ClusterID.pal",\%faa);
    &ReadAlignmentToHash("$ClusterID.nal",\%ffn);

    my @names=(keys %faa);
    my $pepalnlen=@names ? length($faa{$names[0]}) : 0;
    my $prefix=$ClusterID."\t".$count_sp."\t".$count_gene."\t";

    ### scan SNP
    for (my $i=1;$i<=$pepalnlen;$i++)
    {
        # NOTE(review): @tmp is not declared in this sub, so this clears a
        # file-level array; kept because other code may rely on it - confirm.
        @tmp=qw();

        my $start=3*($i-1);
        my @tribases=map { substr($ffn{$_},$start,3) } @genelist;    ### fetch triplet codon
        next if (&IsTheSame(@tribases)!=0);                           ### codon is consistent

        my @aa=map { substr($faa{$_},($i-1),1) } @genelist;
        my $type;
        if (&IsTheSame(@aa)==0)    ### aa is not consistent
        {
            if (join("",@aa)=~/-/)
            {
                push(@cdsvar,$prefix.$i."\t".&CharType(\@aa)."\t-\t-\tInDel\n");
                next;
            }
            $type="Nonsynonymous mutation";
        }
        else
        {
            $type="Synonymous mutation";
        }

        #### base 1, base 2, base 3
        foreach my $offset (0..2)
        {
            my @bases=map { substr($ffn{$_},$start+$offset,1) } @genelist;
            if (&IsTheSame(@bases)==0)
            {
                push(@cdsvar,$prefix.($i+$position_tag[$offset])."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\t".$type."\n");
            }
        }
    }
    return @cdsvar;
}
|
|
1608
|
|
1609
|
|
# VarAnalysis(\@lines)
#
# Counts the variation report lines by their final label and returns
# "<InDel count>\t<Nonsynonymous count>\t<Synonymous count>".
# The patterns are case sensitive, so "Nonsynonymous mutation" is not counted
# as "Synonymous mutation".
sub VarAnalysis()
{
    (my $data)=@_;
    my %tally=("indel"=>0,"nonsyn"=>0,"syn"=>0);

    foreach my $record (@$data)
    {
        $tally{"indel"}++  if ($record=~/InDel$/);
        $tally{"nonsyn"}++ if ($record=~/Nonsynonymous mutation$/);
        $tally{"syn"}++    if ($record=~/Synonymous mutation$/);
    }

    return join("\t",$tally{"indel"},$tally{"nonsyn"},$tally{"syn"});
}
|
|
1623
|
|
1624
|
|
1625
|
|
# CharType(\@values)
#
# Returns the distinct values of the list joined by ",".  Values are listed
# in the order of their first appearance, so the same input always produces
# the same string (the earlier version used hash key order, which can differ
# between runs).  Undefined values are treated as the empty string.
sub CharType()
{
    (my $str)=@_;
    my %seen;
    my @distinct;

    foreach my $item (@$str)
    {
        my $value=defined($item) ? $item : "";
        push(@distinct,$value) if (!$seen{$value}++);
    }

    return join(",",@distinct);
}
|
|
1637
|
|
# IsTheSame(@values)
#
# Returns 1 when the list contains exactly one distinct value, 0 otherwise
# (an empty list therefore yields 0).
sub IsTheSame()
{
    my %distinct;
    @distinct{@_}=();    # hash slice: one key per distinct value
    return (scalar(keys %distinct)==1) ? 1 : 0;
}
|
|
1654
|
|
1655
|
|
1656
|
|
# FormatClusterOutPut(\@strain_names, $file, \@clusters)
#
# Writes the cluster table to $file: a header line "ClutserID" followed by
# the strain names, then one line per cluster with a running id starting at
# 1.  Columns of a cluster are separated by tabs, genes inside a column by
# commas; the internal "S<number>G" prefix is stripped from every gene id and
# "-" columns are written unchanged.
# Dies when $file cannot be written.  The caller's $_ is left untouched.
sub FormatClusterOutPut()
{
    (my $speices,my $file,my $cluster)=@_;
    my $gid=1;

    open(my $out,">",$file) or die "Could not write into $file: $!\n";
    print {$out} "ClutserID\t".join("\t",@$speices)."\n";
    foreach my $line (@$cluster)
    {
        my @row=split(/\t/,$line);
        foreach my $field (@row)    # $field aliases the element of @row
        {
            next if ($field eq "-");
            my @genes=split(/,/,$field);
            foreach my $gene (@genes)
            {
                $gene=~s/^S[0-9]+G//;
            }
            $field=join(",",@genes);
        }
        print {$out} $gid."\t".join("\t",@row)."\n";
        $gid++;
    }
    close($out) or die "Could not finish writing $file: $!\n";
}
|
|
# RetrieveClusterFromFile($file, \@clusters)
#
# Reads a cluster table from $file and stores one string per cluster in
# @$clusters, starting at index 0.  The first line of the file (header) is
# dropped and blank lines are skipped.  In every data line the first field is
# kept as the cluster id; in each following column i (counted from 0) every
# comma-separated gene gets the prefix "S<i>G", while "-" columns stay as they
# are.  Each stored string is tab-separated and ends with "\n".
# Dies when $file cannot be opened.
sub RetrieveClusterFromFile()
{
    (my $file,my $clusters)=@_;
    my $line=0;

    open(my $in,"<",$file) or die "Could not open $file: $!\n";
    my @content=<$in>;
    close($in);
    splice(@content,0,1);    # header line
    chomp(@content);

    foreach my $record (@content)
    {
        my @row=split(/\t/,$record);
        next if (!@row);     # a blank line is not a cluster
        my $id=shift(@row);
        for (my $i=0;$i<@row;$i++)
        {
            next if ($row[$i] eq "-");
            $row[$i]=join(",",map("S${i}G".$_,split(/,/,$row[$i])));
        }
        $$clusters[$line]=$id."\t".join("\t",@row)."\n";
        $line++;
    }
}
|
|
1728
|
|
1729
|
|
1730
|
|
1731
|
|
# GeneDistribution(\@clusters, \%hash)
#
# For every cluster line the number of strains present is obtained from
# CountSpeicesInCluster; then, for every gene of the cluster, the counter
# $$hash{"<strain count>|<strain id>"} is incremented.  The strain id is the
# text between the first character of the gene id and its first "G".
# The first tab-separated field of a cluster line is dropped, and the line
# ending is stripped so that a final "-\n" column is not mistaken for a gene.
sub GeneDistribution()
{
    (my $clusters,my $hash)=@_;

    foreach my $cluster (@$clusters)
    {
        my $line=$cluster;    # copy: the caller's array is not modified
        chomp($line);
        my @row=split(/\t/,$line);
        splice(@row,0,1);
        my $orth=&CountSpeicesInCluster(join("\t",@row));
        foreach my $gene (split(/\t|\,/,join("\t",@row)))
        {
            next if ($gene eq "-");
            my $spid=substr($gene,1,(index($gene,'G')-1));    ### extract strain id
            $$hash{$orth."|".$spid}++;
        }
    }
}
|
|
1761
|
|
# CountSpeicesInCluster($str)
#
# $str is a tab-separated list of columns (a trailing newline is removed).
# Returns the number of columns that are not "-".
sub CountSpeicesInCluster()
{
    (my $str)=@_;
    chomp($str);
    return scalar(grep { $_ ne "-" } split(/\t/,$str));
}
|
|
1779
|
|
# CountGeneInCluster($str)
#
# $str is a list of gene ids separated by tabs (between columns) or commas
# (inside a column); a trailing newline is removed first.
# Returns the number of entries that are not "-".
sub CountGeneInCluster()
{
    (my $str)=@_;
    # The earlier code called chomp() without an argument, which stripped $_
    # instead of $str, so a final "-\n" column was counted as a gene.
    chomp($str);

    my $count=0;
    foreach my $key (split(/\t|\,/,$str))
    {
        $count++ if ($key ne "-");
    }
    return $count;
}
|
|
1796
|
|
1797
|
|
1798
|
|
1799
|
|
# GF()
#
# Gene-family clustering driven entirely by file-level settings: prepares the
# ".pep" files, concatenates them into All.pep, extracts the header lines
# into "genelist", runs formatdb and an all-against-all blastp, pipes the
# hits filtered by Blast_Filter.pl into mcl, and finally hands All.cluster to
# FormatCluster, which fills @clusters.
sub GF()
{
    &PrepareFasta(\@species,$inputDIR,".pep");    ### prepare pep file

    my $pepfiles=join(".pep ",@species).".pep";
    foreach my $command (
        "cat $pepfiles > All.pep",
        "grep '>' All.pep > genelist",
        "$formatdb -p T -i All.pep",
        "$blastall -p blastp -i All.pep -d All.pep -M BLOSUM45 -m9 -e $evalue -o All.blastp -a $thread",
        "perl ./Blast_Filter.pl All.blastp All.pep $coverage $identity $score | $mcl - --abc -I 2.0 -o All.cluster",
    )
    {
        system($command);
    }

    &FormatCluster("All.cluster","genelist",$spnum,\@clusters);
    #system("rm -rf *.pep* All.blastp All.cluster genelist");
}
|
|
1811
|
|
# MP()
#
# Clustering with inparanoid/multiparanoid, driven by file-level settings:
# prepares the ".pep" files, records all headers in "genelist", runs
# inparanoid.pl on every unordered pair of strains, merges the pairwise
# results with multiparanoid.pl, converts MP.Cluster into All.cluster and
# hands it to FormatCluster, which fills @clusters.  Intermediate files are
# removed at the end.
sub MP()
{
    &PrepareFasta(\@species,$inputDIR,".pep");    ### prepare pep file

    my $pepfiles=join(".pep ",@species).".pep";
    system("cat $pepfiles > All.pep");
    system("grep '>' All.pep > genelist");
    system("rm -rf All.pep");

    foreach my $first (0..($spnum-1))
    {
        foreach my $second (($first+1)..($spnum-1))
        {
            system("perl ./inparanoid.pl $blastall $thread $formatdb $score $global $local $species[$first].pep $species[$second].pep");
        }
    }

    my $specieslist=join(".pep+",@species).".pep";
    system("perl ./multiparanoid.pl -species $specieslist -unique 1");

    ### convert the MP result to table list based on gene
    &MP_Result_to_Table("MP.Cluster","All.cluster");
    &FormatCluster("All.cluster","genelist",$spnum,\@clusters);
    system("rm -rf sqltable.* *.pep* MP.Cluster genelist");
}
|
|
1834
|
|
# fasta2phylip($input, $output)
#
# Converts every alignment found in the FASTA file $input to PHYLIP format
# and writes it to $output, using BioPerl's Bio::AlignIO.
sub fasta2phylip()
{
    (my $input,my $output)=@_;
    use Bio::AlignIO;

    my $in=Bio::AlignIO->new(-file   => $input,
                             -format => 'fasta');
    my $out=Bio::AlignIO->new(-file   => ">$output",
                              -format => 'phylip');
    while (my $aln=$in->next_aln())
    {
        $out->write_aln($aln);
    }
}
|
|
1849
|
|
1850
|
|
# RemoveHeadGap($nal, \%hash)
#
# Reads the sequences of $nal with ReadSequenceInToHash, determines the
# longest run of leading "-" characters found in any of them, removes that
# many leading characters from EVERY sequence and appends the remainder to
# $$hash{<sequence name>}.
# The caller's $_ is left untouched.
sub RemoveHeadGap()
{
    (my $nal,my $hash)=@_;
    my %aln;
    my $gaplength=0;

    &ReadSequenceInToHash("$nal",\%aln);

    foreach my $key (keys %aln)
    {
        my $sequence=defined($aln{$key}) ? $aln{$key} : "";
        (my $lead)=($sequence=~/^(-*)/);
        $gaplength=length($lead) if (length($lead)>$gaplength);
    }

    foreach my $key (keys %aln)
    {
        my $sequence=defined($aln{$key}) ? $aln{$key} : "";
        # a sequence shorter than the gap has nothing left to contribute
        my $kept=($gaplength<=length($sequence)) ? substr($sequence,$gaplength) : "";
        $$hash{$key}.=$kept;
    }
}
|
|
1876
|
|
# PrepareFasta(\@species, $inputDIR, $extention)
#
# For every strain i, reads "$inputDIR<name>$extention" with
# ReadSequenceInToHash and writes a copy named "<name>$extention" into the
# current directory in which every sequence id carries the prefix "S<i>G".
# Records are written in sorted id order so that repeated runs produce
# identical files.
# Dies when an output file cannot be written.
sub PrepareFasta()
{
    (my $species,my $inputDIR,my $extention)=@_;

    for (my $i=0;$i<@$species;$i++)
    {
        my $file=$inputDIR.$$species[$i].$extention;
        my $copy=$$species[$i].$extention;
        my %hash=();
        &ReadSequenceInToHash($file,\%hash);

        open(my $out,">",$copy) or die "Could not write into $copy: $!\n";
        foreach my $key (sort(keys %hash))
        {
            print {$out} ">S${i}G$key\n";
            print {$out} $hash{$key}."\n";
        }
        close($out) or die "Could not finish writing $copy: $!\n";
    }
}
|
|
1899
|
|
# PrepareTable(\@species, $inputDIR, $extention)
#
# For every strain i, reads the tab-separated table
# "$inputDIR<name>$extention" and writes a copy named "<name>$extention" into
# the current directory in which the first column carries the prefix
# "S<i>G".  For the ".location" extension the prefixed id is written twice
# (as the first two columns).  Blank lines are skipped.
# Dies when the input cannot be opened or the copy cannot be written.
sub PrepareTable()
{
    (my $species,my $inputDIR,my $extention)=@_;

    for (my $i=0;$i<@$species;$i++)
    {
        my $file=$inputDIR.$$species[$i].$extention;
        my $copy=$$species[$i].$extention;

        # the whole table is read before the copy is opened, so input and
        # copy may be the same path
        open(my $in,"<",$file) or die "Could not open $file: $!\n";
        my @content=<$in>;
        close($in);
        chomp(@content);

        open(my $out,">",$copy) or die "Could not write into $copy: $!\n";
        foreach my $line (@content)
        {
            my @row=split(/\t/,$line);
            next if (!@row);    # blank line
            $row[0]="S${i}G$row[0]";
            if ($extention eq ".location")
            {
                $row[0]=$row[0]."\t".$row[0];
            }
            print {$out} join("\t",@row)."\n";
        }
        close($out) or die "Could not finish writing $copy: $!\n";
    }
}
|
|
1928
|
|
# CheckExtraProgram()
#
# Verifies that the external programs needed by the selected analyses exist
# and are executable, and appends one message per problem to the file-level
# @tmp.  The selection is read from the file-level $section (a string of
# "0"/"1" flags) and $method:
#   flag 0 (clustering)    - formatdb, blastall, plus mcl when $method is "GF"
#   flag 2 (CDS variation) - mafft
#   flag 3 (evolution)     - mafft, seqboot, dnaml, dnadist, neighbor,
#                            consense, dnapars
# Every program is reported at most once: either "not found" or, when it
# exists, "no permission".  (Previously a missing program in the first two
# groups produced both messages, and mafft was reported twice when flags 2
# and 3 were both set.)
sub CheckExtraProgram
{
    my @error;

    # Records at most one problem for the program $name located at $path.
    my $check=sub
    {
        (my $name,my $path)=@_;
        if (!(-e $path))
        {
            push(@error,"$name is not found at $path\n");
        }
        elsif (!(-X $path))
        {
            push(@error,"there is not premission to execute $path\n");
        }
    };

    my $cluster=(substr($section,0,1) eq "1");
    my $variation=(substr($section,2,1) eq "1");
    my $evolution=(substr($section,3,1) eq "1");

    ##### cluster gene
    if ($cluster)
    {
        ### MP: blastall formatdb
        ### GF: blastall formatdb mcl
        &$check("formatdb",$formatdb);
        &$check("blastall",$blastall);
        &$check("mcl",$mcl) if ($method eq "GF");
    }

    ##### CDS variation and evolution analysis both need mafft
    if ($variation or $evolution)
    {
        &$check("mafft",$mafft);
    }

    ##### Evolution analysis
    if ($evolution)
    {
        &$check("seqboot",$seqboot);
        &$check("dnaml",$dnaml);
        &$check("dnadist",$dnadist);
        &$check("neighbor",$neighbor);
        &$check("consense",$consense);
        &$check("dnapars",$dnapars);
    }

    push(@tmp,@error);
}
|
|
2048
|
|
2049
|
|
# CheckInputFile(\@species, $inputDIR, $section, $method, \@messages)
#
# Runs the input checks required by the analyses selected in $section (a
# string of "0"/"1" flags) and appends the messages they return to
# @$tmparray:
#   flag 0 (clustering)    - method "MM": chk2SEQ and chktab(".location");
#                            any other method: chk1SEQ
#   flag 2 (CDS variation) - chk2SEQ
#   flag 4 (function)      - chktab(".function")
sub CheckInputFile()
{
    (my $species,my $inputDIR,my $section,my $method,my $tmparray)=@_;

    #### cluster
    if (substr($section,0,1) eq "1")
    {
        if ($method ne "MM")
        {
            push(@$tmparray,&chk1SEQ($species,$inputDIR));
        }
        else
        {
            push(@$tmparray,&chk2SEQ($species,$inputDIR));                ### check pep and nuc
            push(@$tmparray,&chktab($species,$inputDIR,".location"));     ### check location table
        }
    }

    ### CDS variation
    push(@$tmparray,&chk2SEQ($species,$inputDIR)) if (substr($section,2,1) eq "1");

    ### function analysis
    push(@$tmparray,&chktab($species,$inputDIR,".function")) if (substr($section,4,1) eq "1");
}
|
|
2076
|
|
2077
|
|
# chk1SEQ(\@species, $inputDIR)
#
# Reads "$inputDIR<name>.pep" for every strain with ReadSequenceInToHash and
# returns one "format error" message for each file that yields fewer than two
# sequences.  The returned list is empty when every file passes.
sub chk1SEQ()
{
    (my $species,my $inputDIR)=@_;
    my @error;

    foreach my $sp (@$species)
    {
        my $pepfile=$inputDIR.$sp.".pep";
        my %pep;
        &ReadSequenceInToHash($pepfile,\%pep);
        push(@error,"format error in $pepfile\n") if (scalar(keys %pep)<2);
    }

    return @error;
}
|
|
2098
|
|
# chk2SEQ(\@species, $inputDIR)
#
# Cross-checks "$inputDIR<name>.pep" against "$inputDIR<name>.nuc" for every
# strain and returns the list of problems found:
#   - the two files hold a different number of sequences, or otherwise
#   - a protein has no nucleotide sequence of the same name,
#   - a nucleotide sequence is not exactly (protein length + 1) * 3 long,
#   - a nucleotide sequence has no protein of the same name.
# The returned list is empty when everything is consistent.
sub chk2SEQ()
{
    (my $species,my $inputDIR)=@_;
    my @error;

    foreach my $sp (@$species)
    {
        my $pepfile=$inputDIR.$sp.".pep";
        my $nucfile=$inputDIR.$sp.".nuc";
        my %pep;
        my %nuc;
        &ReadSequenceInToHash("$pepfile",\%pep);
        &ReadSequenceInToHash("$nucfile",\%nuc);

        if (scalar(keys %pep)!=scalar(keys %nuc))
        {
            push(@error,"Sequences number is not consistent in the following two file:\n\t$pepfile\n\t$nucfile\n");
            next;
        }

        foreach my $key (keys %pep)
        {
            if (!exists($nuc{$key}))
            {
                push(@error,"$key lost in $nucfile\n");
            }
            elsif (length($nuc{$key})!=((length($pep{$key})+1)*3))
            {
                push(@error,"the length of $key in $nucfile is not consistent with its corresponding protein length\n");
            }
        }

        foreach my $key (keys %nuc)
        {
            if (!exists($pep{$key}))
            {
                # the message used to start with a stray "1048 "
                push(@error,"$key lost in $pepfile\n");
            }
        }
    }

    return @error;
}
|
|
2148
|
|
2149
|
|
# chktab(\@species, $inputDIR, $extention)
#
# Checks the tab-separated table "$inputDIR<name>$extention" of every strain
# against "$inputDIR<name>.pep" and returns the list of problems found:
#   - the table cannot be opened (the strain is then skipped),
#   - a line has fewer than three columns (one message per such line),
#   - a protein id does not appear in the first column of any valid line.
# The caller's $_ is left untouched.
sub chktab()
{
    (my $species,my $inputDIR,my $extention)=@_;
    my @error;

    foreach my $sp (@$species)
    {
        my %tab;
        my %pep;
        my $tabfile=$inputDIR.$sp.$extention;

        my $fh;
        if (!open($fh,"<",$tabfile))
        {
            # report the real cause once instead of one message per sequence
            push(@error,"Could not open $tabfile: $!\n");
            next;
        }
        while (my $line=<$fh>)
        {
            chomp($line);
            my @row=split(/\t/,$line);
            if (scalar(@row)<3)
            {
                push(@error,"format error in $tabfile\n");
            }
            else
            {
                $tab{$row[0]}=$row[1];
            }
        }
        close($fh);

        my $pepfile=$inputDIR.$sp.".pep";
        &ReadSequenceInToHash($pepfile,\%pep);
        foreach my $key (keys %pep)
        {
            if (!exists($tab{$key}))
            {
                push(@error,"sequence $key lost infomation in $tabfile\n");
            }
        }
    }

    return @error;
}
|
|
2193
|
|
2194
|
|
2195
|
|
# ReadSequenceInToHash($file, \%hash)
#
# Reads the FASTA file $file with BioPerl's Bio::SeqIO and stores every
# sequence in %$hash under its id.  A later record with the same id replaces
# an earlier one.
sub ReadSequenceInToHash()
{
    use Bio::SeqIO;
    (my $file,my $hash)=@_;

    my $reader=Bio::SeqIO->new(-file=>"$file",-format=>"fasta");
    while (my $record=$reader->next_seq())
    {
        $$hash{$record->id}=$record->seq();
    }
}
|
|
2208
|
|
# ReadAlignmentToHash($file, \%hash)
#
# Reads a FASTA-style alignment and stores every sequence in %$hash.  The key
# is the complete header line without its leading ">"; sequence lines are
# concatenated without their line endings.  Lines in front of the first
# header and records with an empty header are ignored, so an empty or
# unreadable file adds nothing to the hash (previously a bogus entry with an
# empty key was always stored).  A file that cannot be opened is reported
# with a warning.
sub ReadAlignmentToHash()
{
    (my $file,my $hash)=@_;
    my $name="";

    my $fh;
    if (!open($fh,"<",$file))
    {
        warn "Could not open $file: $!\n";
        return;
    }
    while (my $line=<$fh>)
    {
        chomp($line);
        if ($line=~/^>/)
        {
            $name=substr($line,1);
            # start a fresh record; a repeated name replaces the earlier one
            $$hash{$name}="" if ($name ne "");
        }
        elsif ($name ne "")
        {
            $$hash{$name}.=$line;
        }
    }
    close($fh);
    return;
}
|
|
2244
|
|
2245
|
|
2246
|
|
# Combination($m, $n, \@result)
#
# Enumerates every way of choosing $n items out of $m.  A selection is
# tracked as a string of $m flags ("1" = chosen), starting with all chosen
# items on the left.  Each step moves the leftmost "1" that is followed by a
# "0" one place to the right and packs all "1" flags in front of it back to
# the left, until no "10" remains.
# @$comRef is replaced by one string per selection holding the chosen
# positions (0-based) joined by tabs.  The order of the selections follows
# the hash key order and is therefore not defined.
sub Combination()
{
    (my $m,my $n,my $comRef)=@_;
    my %pattern_seen;
    my @combination;

    my $pattern=("1" x $n).("0" x ($m-$n));
    $pattern_seen{$pattern}=1;

    while ((my $fpos=index($pattern,"10"))>=0)
    {
        substr($pattern,$fpos,2,"01");
        my $head=substr($pattern,0,$fpos);
        my $ones=($head=~tr/1//);
        $pattern=("1" x $ones).("0" x ($fpos-$ones)).substr($pattern,$fpos,$m-$fpos);
        $pattern_seen{$pattern}=1;
    }

    foreach my $key (keys %pattern_seen)
    {
        push(@combination,join("\t",grep { substr($key,$_,1) eq "1" } 0..($m-1)));
    }

    @$comRef=@combination;    ### update the caller's array through the reference
}
|
|
2314
|
|
# ChkCombinationValue($m, $n)
#   Counts the combinations of $n items out of $m, but only as long as the
#   count does not exceed the file-level $sampleSize.
#   Returns the count when it is <= $sampleSize, and 0 when it is larger.
#
#   The count is obtained by repeatedly splitting every pending term
#   "size-pick" into "(size-1)-pick" and "(size-1)-(pick-1)".  Each pending
#   term carries a weight (how many times it has been produced); a term is
#   added to the total as soon as its two numbers are equal or its second
#   number is 0.  The work stops early once the total passes $sampleSize.
sub ChkCombinationValue()
{
    my ($m, $n) = @_;
    my $total = 0;

    ### initialization: one pending term for the requested combination
    my %pending;
    $pending{$m . "-" . $n} = 1;

    ### split the pending terms until none is left or the threshold is passed
    while (scalar(keys %pending) > 0 and $total <= $sampleSize)
    {
        foreach my $term (keys %pending)
        {
            last if $total > $sampleSize;          ### threshold
            next unless exists($pending{$term});

            my ($size, $pick) = split(/-/, $term);
            my $weight = $pending{$term};

            if ($size eq $pick)
            {
                $total = $total + $weight;
            }
            else
            {
                # the two terms this one splits into
                foreach my $child ([$size - 1, $pick], [$size - 1, $pick - 1])
                {
                    my ($childSize, $childPick) = @$child;
                    if (($childSize eq $childPick) or $childPick == 0)
                    {
                        $total = $total + $weight;
                    }
                    else
                    {
                        # a missing entry counts as 0, so this both creates and accumulates
                        $pending{$childSize . "-" . $childPick} += $weight;
                    }
                }
            }

            # the original term has been fully accounted for
            delete($pending{$term});
        }
    }

    return 0 if $total > $sampleSize;
    return $total;
}
|
|
2397
|
|
2398
|
|
# SampleCombination($m, $n, $comRef)
#   Draws random combinations of $n indices out of 0 .. $m-1.  Each draw is
#   stored as its indices in ascending order joined by tabs; duplicates are
#   collapsed.  Drawing stops when $sampleSize distinct combinations have been
#   collected or after 2 * $sampleSize draws, whichever comes first
#   ($sampleSize is the file-level setting).  The distinct combinations then
#   replace the contents of the array referenced by $comRef (in no particular
#   order).
#
#   Each draw takes the first $n entries of a uniformly shuffled index list
#   (List::Util::shuffle) instead of hand-sorting freshly generated random
#   keys, which cost O($m**2) comparisons per draw.
sub SampleCombination()
{
    use List::Util ();      # core module; nothing is imported into this package

    my ($m, $n, $comRef) = @_;
    my %sampled;
    my $sampleTimes = 0;

    while (scalar(keys %sampled) < $sampleSize and $sampleTimes < ($sampleSize * 2))
    {
        my @order  = List::Util::shuffle(0 .. $m - 1);
        my @picked = sort { $a <=> $b } splice(@order, 0, $n);

        # identical index sets map to the same key, so repeats are not stored twice
        $sampled{join("\t", @picked)} = 1;
        $sampleTimes++;
    }

    @$comRef = keys %sampled;
}
|
|
2437
|
|
2438
|
|
# PanGenomeNumber($spID)
#   $spID is a reference to a list of column indices.  For every entry of
#   @clusters (a variable defined outside this sub; each entry is split on
#   tabs) the values found in those columns are added up:
#     - a cluster with a sum greater than 0 is counted as a pan-genome member;
#     - if the sum also equals the number of indices in @$spID it is counted
#       as a core member as well.
#   Returns the string "<pan count>\t<core count>".
#   NOTE(review): the core test only makes sense if each column holds 0 or 1
#   -- confirm against the code that fills @clusters.
sub PanGenomeNumber()
{
    my ($spID) = @_;
    my ($pan, $core) = (0, 0);

    foreach my $cluster (@clusters)
    {
        my @row = split(/\t/, $cluster);

        my $count = 0;      #### counter over the selected columns
        foreach my $column (@$spID)
        {
            $count = $count + $row[$column];
        }

        next unless $count > 0;
        $pan++;
        $core++ if $count == scalar(@$spID);
    }

    return $pan . "\t" . $core;
}
|
|
2468
|
|
# fit_model_A(\@xdata, \@ydata)
#   Fits the model  y = A * x**B + C.
#
#   B is scanned downwards from 1 in steps of 0.001 (values between -0.02 and
#   0.02 are skipped by jumping to -0.02).  For each B a straight line is
#   fitted to (x**B, y) with Statistics::LineFit; the slope is A and the
#   intercept is C.  The scan stops at the first B whose R-squared is lower
#   than that of the previous B.
#
#   Returns ($R2, $A, $A_interval, $B, $C, $C_interval) for the best B found,
#   where each interval is
#       Statistics::Distributions::tdistr($spnum - 2, .025) * sqrt(variance)
#   with the variance of the slope / intercept reported by LineFit and $spnum
#   taken from outside this sub.
#
#   The returned B is the exponent that produced the returned A, C and R2.
#   (It used to be read after the loop, when the scan had already moved two
#   steps past the best value.)  B is 0 if no fit was accepted at all.
#   Dies with "Invalid data" if LineFit rejects the data.
sub fit_model_A()
{
    use Statistics::LineFit;
    use Statistics::Distributions;

    my ($xdata, $ydata) = @_;

    my $step     = 0.001;
    my $exponent = 1;           # current candidate for B
    my $max_R    = 0;           # R-squared of the previous candidate / best so far
    my $R        = 1e-100;      # R-squared of the current candidate
    my ($max_A, $max_B, $max_C) = (0, 0, 0);
    my ($max_A_interval, $max_C_interval);

    while ($max_R <= $R)
    {
        # stay away from exponents close to 0
        if (($exponent < 0.02) and ($exponent > -0.02))
        {
            $exponent = -0.02;
        }

        # linearise the model for this exponent
        my @xValues = map { $_ ** $exponent } @$xdata;
        my @yValues = @$ydata;

        my $lineFit = Statistics::LineFit->new();
        $lineFit->setData(\@xValues, \@yValues) or die "Invalid data";
        my $rSquared = $lineFit->rSquared();
        my ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

        $max_R = $R;
        $R     = $rSquared;
        if ($max_R <= $R)
        {
            # this exponent is at least as good as the previous one: remember it
            $max_R = $R;
            $max_B = $exponent;
            ($max_C, $max_A) = $lineFit->coefficients();
            $max_A_interval = Statistics::Distributions::tdistr(($spnum - 2), .025) * sqrt($varianceSlope);
            $max_C_interval = Statistics::Distributions::tdistr(($spnum - 2), .025) * sqrt($varianceIntercept);
        }
        $exponent = $exponent - $step;
    }

    return ($max_R, $max_A, $max_A_interval, $max_B, $max_C, $max_C_interval);
}
|
|
2534
|
|
# fit_model_B(\@xdata, \@ydata)
#   Fits the model  y = A * exp(x*B) + C.
#
#   B is scanned downwards in steps of 0.001; the scan nominally starts at 0,
#   but values between -0.02 and 0.02 are skipped, so the first B tried is
#   -0.02.  For each B a straight line is fitted to (exp(x*B), y) with
#   Statistics::LineFit; the slope is A and the intercept is C.  The scan
#   stops at the first B whose R-squared is lower than that of the previous B.
#
#   Returns ($R2, $A, $A_interval, $B, $C, $C_interval) for the best B found,
#   where each interval is
#       Statistics::Distributions::tdistr($spnum - 2, .025) * sqrt(variance)
#   with the variance of the slope / intercept reported by LineFit and $spnum
#   taken from outside this sub.
#
#   The returned B is the rate that produced the returned A, C and R2.
#   (It used to be read after the loop, when the scan had already moved two
#   steps past the best value.)  B is 0 if no fit was accepted at all.
#   Dies with "Invalid data" if LineFit rejects the data.
sub fit_model_B()
{
    use Statistics::LineFit;
    use Statistics::Distributions;

    my ($xdata, $ydata) = @_;

    my $step  = 0.001;
    my $rate  = 0;              # current candidate for B
    my $max_R = 0;              # R-squared of the previous candidate / best so far
    my $R     = 1e-100;         # R-squared of the current candidate
    my ($max_A, $max_B, $max_C) = (0, 0, 0);
    my ($max_A_interval, $max_C_interval);

    while ($max_R <= $R)
    {
        # stay away from rates close to 0
        if (($rate < 0.02) and ($rate > -0.02))
        {
            $rate = -0.02;
        }

        # linearise the model for this rate
        my @xValues = map { exp($_ * $rate) } @$xdata;
        my @yValues = @$ydata;

        my $lineFit = Statistics::LineFit->new();
        $lineFit->setData(\@xValues, \@yValues) or die "Invalid data";
        my $rSquared = $lineFit->rSquared();
        my ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

        $max_R = $R;
        $R     = $rSquared;
        if ($max_R <= $R)
        {
            # this rate is at least as good as the previous one: remember it
            $max_R = $R;
            $max_B = $rate;
            ($max_C, $max_A) = $lineFit->coefficients();
            $max_A_interval = Statistics::Distributions::tdistr(($spnum - 2), .025) * sqrt($varianceSlope);
            $max_C_interval = Statistics::Distributions::tdistr(($spnum - 2), .025) * sqrt($varianceIntercept);
        }
        $rate = $rate - $step;
    }

    return ($max_R, $max_A, $max_A_interval, $max_B, $max_C, $max_C_interval);
}
|
|
2599
|
|
2600
|
|
# ReadData2Array($file, $array1, $col1, $array2, $col2)
#   Reads the tab-separated file $file, skips its first line (header), and
#   copies column $col1 of every following line into the array referenced by
#   $array1 and column $col2 into the array referenced by $array2 (columns
#   are numbered from 0; line k of the data goes to index k-1).
#   The target arrays are not emptied first: elements beyond the number of
#   data lines keep their previous values.
#
#   If $file cannot be opened, a warning is printed and both arrays are left
#   untouched.  Neither the global $_ nor @_ is modified.
sub ReadData2Array()
{
    my ($file, $array1, $col1, $array2, $col2) = @_;

    my $fh;
    if (!open($fh, "<", $file))
    {
        warn "ReadData2Array: cannot open $file: $!\n";
        return;
    }

    my $header = <$fh>;     # the first line holds the column titles
    my $i = 0;
    while (my $line = <$fh>)
    {
        chomp($line);
        my @fields = split(/\t/, $line);
        $$array1[$i] = $fields[$col1];
        $$array2[$i] = $fields[$col2];
        $i++;
    }
    close($fh);
}
|
|
2617
|
|
# SumData($xdata, $ydata, $SumMethod)
#   Collapses paired observations in place.  All y values that share the same
#   x value are pooled (as one space-separated string); afterwards the two
#   referenced arrays are rewritten so that
#       $$xdata[k-1] = k
#       $$ydata[k-1] = summary of the y values pooled for x = k
#   for k = 1 .. largest x value seen.  The summary is computed by &median
#   when $SumMethod is "median" and by &mean otherwise.
sub SumData()
{
    my ($xdata, $ydata, $SumMethod) = @_;
    my %pooled;         # x value => "y1 y2 y3 ..."
    my $largest = 0;    # largest x value seen

    foreach my $idx (0 .. $#$xdata)
    {
        my $x = $$xdata[$idx];
        if (exists($pooled{$x}))
        {
            $pooled{$x} .= " " . $$ydata[$idx];
            next;
        }

        # first observation for this x value
        $pooled{$x} = $$ydata[$idx];
        $largest = $x if $x > $largest;
    }

    # rebuild both arrays from the pooled data
    @$xdata = ();
    @$ydata = ();
    foreach my $x (1 .. $largest)
    {
        $$xdata[$x - 1] = $x;
        $$ydata[$x - 1] = ($SumMethod eq "median")
                        ? &median($pooled{$x})
                        : &mean($pooled{$x});
    }
}
|
|
2656
|
|
# median($data)
#   $data is a string of numbers separated by single spaces.
#   Returns the median: the middle value of the numerically sorted list, or
#   the average of the two middle values when the list has an even length.
sub median()
{
    my ($data) = @_;
    my @sorted = sort { $a <=> $b } split(/ /, $data);
    my $count  = scalar(@sorted);
    my $middle = int($count / 2);

    # odd number of values: the middle one is the median
    return $sorted[$middle] if $count % 2;

    # even number of values: average the two central ones
    return ($sorted[$middle] + $sorted[$middle - 1]) / 2;
}
|
|
2671
|
|
# mean($data)
#   $data is a string of numbers separated by single spaces.
#   Returns their arithmetic mean, cut off (not rounded) after the third
#   decimal place.
sub mean()
{
    my ($data) = @_;
    my @values = split(/ /, $data);

    my $total = 0;
    foreach my $value (@values)
    {
        $total = $total + $value;
    }

    my $average = $total / scalar(@values);
    return int($average * 1000) / 1000;     # keep three decimals
}
|
|
2683
|
|
# ReplaceName($sp, $file)
#   Rewrites $file in place, replacing the placeholders "sp0sp", "sp1sp", ...
#   by the names in the array referenced by $sp ("sp<i>sp" becomes $$sp[i]).
#   On every line only the first occurrence of each placeholder is replaced,
#   and the placeholders are processed in index order.
#
#   If $file cannot be read, a warning is printed and the file is left alone
#   (it is neither created nor truncated).  A warning is also printed if the
#   file cannot be written back.  The global $_ is not modified.
sub ReplaceName()
{
    my ($sp, $file) = @_;

    my $in;
    if (!open($in, "<", $file))
    {
        warn "ReplaceName: cannot read $file: $!\n";
        return;
    }
    my @content = <$in>;
    close($in);

    foreach my $line (@content)         # $line aliases the array element
    {
        for (my $i = 0; $i < @$sp; $i++)
        {
            my $target = "sp" . $i . "sp";
            $line =~ s/\Q$target\E/$$sp[$i]/;
        }
    }

    my $out;
    if (!open($out, ">", $file))
    {
        warn "ReplaceName: cannot write $file: $!\n";
        return;
    }
    print $out @content;
    close($out) or warn "ReplaceName: error while closing $file: $!\n";
}
|
|
2708
|
|
# MP_Result_to_Table($MPresult, $outputfile)
#   Converts the tab-separated file $MPresult into a table written to
#   $outputfile.  The first line of $MPresult is skipped (header).  On every
#   other line column 0 is a numeric id and column 2 is a value; the values
#   that share an id are joined with tabs in the order they appear.
#   The output has one line per id from 1 to the largest id seen; an id that
#   never occurred produces an empty line.
#
#   Lines are chomped before they are split, so a value taken from the last
#   column of a line does not carry the line's newline into the table.
#   If either file cannot be opened, a warning is printed and the sub returns
#   without writing anything.  The global $_ is not modified.
sub MP_Result_to_Table()
{
    my ($MPresult, $outputfile) = @_;
    my %valuesOf;       # id => "value\tvalue\t..."
    my $maxid = 0;

    my $in;
    if (!open($in, "<", $MPresult))
    {
        warn "MP_Result_to_Table: cannot read $MPresult: $!\n";
        return;
    }
    my $header = <$in>;     # column titles, not part of the data
    while (my $line = <$in>)
    {
        chomp($line);
        my @row = split(/\t/, $line);
        next unless @row;                   # blank line

        my $id    = $row[0];
        my $value = defined($row[2]) ? $row[2] : "";
        if (exists($valuesOf{$id}))
        {
            $valuesOf{$id} .= "\t" . $value;
        }
        else
        {
            $valuesOf{$id} = $value;
            $maxid = $id if $id > $maxid;
        }
    }
    close($in);

    my $out;
    if (!open($out, ">", $outputfile))
    {
        warn "MP_Result_to_Table: cannot write $outputfile: $!\n";
        return;
    }
    foreach my $id (1 .. $maxid)
    {
        print $out (defined($valuesOf{$id}) ? $valuesOf{$id} : "") . "\n";
    }
    close($out) or warn "MP_Result_to_Table: error while closing $outputfile: $!\n";
}
|
|
2743
|
|
2744
|
|
2745
|
|
# FormatCluster($infile, $genelist, $spnum, $cluster)
#   Builds a per-strain table of gene clusters in the array referenced by
#   $cluster (its previous contents are discarded).
#
#   $infile    one cluster per line: gene names separated by tabs.
#   $genelist  FASTA-style file; every line starting with ">" names a gene
#              (the name is the whole line after ">").  Genes that occur in
#              no cluster of $infile are appended as one-gene clusters, in
#              the order they appear in $genelist.
#   $spnum     number of strains, i.e. number of columns in the result.
#
#   The strain of a gene is the part of its name before the first "G"
#   (e.g. "S3" for "S3G17").  Each result entry is a tab-joined row with one
#   column per strain S0 .. S<$spnum-1>: the genes of that strain in the
#   cluster joined by commas, or "-" when the strain has none.  Empty lines
#   of $infile are skipped.
#
#   If a file cannot be opened, a warning is printed and processing continues
#   with the data that is available.  The global $_ is not modified.
sub FormatCluster()
{
    my ($infile, $genelist, $spnum, $cluster) = @_;
    my @content;        # cluster lines; singleton genes are appended later
    my %gene;           # genes that already belong to a cluster

    ### record the genes that are in clusters
    my $clusterFh;
    if (open($clusterFh, "<", $infile))
    {
        @content = <$clusterFh>;
        close($clusterFh);
        chomp(@content);
    }
    else
    {
        warn "FormatCluster: cannot read $infile: $!\n";
    }
    foreach my $line (@content)
    {
        foreach my $id (split(/\t/, $line))
        {
            $gene{$id} = 1;
        }
    }

    ### retrieve the genes which are not in any cluster
    my $geneFh;
    if (open($geneFh, "<", $genelist))
    {
        while (my $header = <$geneFh>)
        {
            next unless $header =~ /^>/;
            chomp($header);
            my $id = substr($header, 1);
            push(@content, $id) unless exists($gene{$id});
        }
        close($geneFh);
    }
    else
    {
        warn "FormatCluster: cannot read $genelist: $!\n";
    }

    #### build the table: one row per cluster, one column per strain
    @$cluster = ();
    foreach my $line (@content)
    {
        next if $line eq "";

        my %genesOf;    # strain => "gene,gene,..."
        foreach my $id (split(/\t/, $line))
        {
            my $sp = substr($id, 0, index($id, "G"));
            if (exists($genesOf{$sp}))
            {
                $genesOf{$sp} .= "," . $id;
            }
            else
            {
                $genesOf{$sp} = $id;
            }
        }

        my @row = map { exists($genesOf{"S$_"}) ? $genesOf{"S$_"} : "-" } 0 .. ($spnum - 1);
        push(@$cluster, join("\t", @row));
    }
}
|
|
2832
|
|
2833
|