comparison PGAP-1.2.1/PGAP.pl @ 0:83e62a1aeeeb draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 13:51:52 +0000
parents
children 70b7a5270968
comparison
equal deleted inserted replaced
-1:000000000000 0:83e62a1aeeeb
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6 ### programs from BLAST
7 my $formatdb="formatdb";
8 my $blastall="blastall";
9
10 ### programs from mcl
11 my $mcl="mcl";
12
13 ### programs from mafft
14 my $mafft="mafft";
15
16 ### programs from PHYLIP
17 my $seqboot="seqboot";
18 my $neighbor="neighbor";
19 my $consense="consense";
20 my $dnaml="dnaml";
21 my $dnadist="dnadist";
22 my $dnapars="dnapars";
23
24 my $count_tree=0;
25
26 my $sampleSize=8000; # when calculate the pan-genome size, we will sample $sampleSize combinations
27 # if the total combination number is larger than $sampleSize for specific genomes
28 # Surely, the number of $sampleSize is, the larger, the better.
29 # However, the larger the $sampleSize is, the more time would be consumed.
30 # we suggest the range: 5000 ~ 20,000
31
32 #####################################################################
33 # DOn't modify the following code, unless you know their functions
34 #####################################################################
35
36 my %opt=qw();
37 GetOptions(\%opt,"strains:s","input:s","output:s","cluster!","pangenome!","variation!","evolution!","function!","method:s","thread:i","score:f","evalue:f","coverage:f","local:f","global:f","identity:f","bootstrap:i","help|h!");
38
39 my @usage=qq(
40 ====== Pan-Genome Analysis Pipeline (PGAP) ======
41 Version 1.2.1
42
43 Usage: perl PGAP.pl [Options]
44
45 Options:
46 --strains String Input strains nicknames, and join them with '+', for example: A+B+C
47 --input String Input data directory
48 --output String Result output directory
49
50 --cluster Run homologous gene clustering
51 --pangenome Run pan-genome analysis
52 --variation Run homologous clusters variation analysis
53 --evolution Run evolution analysis
54 --function Run Function analysis
55
56 --method String GF for GeneFamily method, and MP for MultiParanoid method
57 for GF: fast, but not very accurate
58 evalue, score, indentity, coverage are employed
59 for MP: slow, but more accurate
60 score, coverage, local, global are employed
61 --thread Int Number of processors to use in blastall. [default:1]
62 --score Int Minimum score in blastall. [default:40]
63 --evalue Decimal Maximal E-value in blastall. [default:1e-10]
64 --coverage Decimal Minimum alignment coverage for two homologous proteins. [default:0.5]
65 --local Decimal Minimum local alignment overlap in MP method. [default:0.25]
66 --global Decimal Minimum global alignment overlap in MP method. [default:0.5]
67 --identity Decimal Minimum alignment indentity for two homologous proteins. [default:0.5]
68 --bootstrap Int Bootstrap times for phylogenetics tree. [default:1]
69
70 --h or help Display this message
71 );
72
73
74 ############# specified variable #############
75 my $inputDIR;
76 my $outputDIR;
77 my $run_cluster;
78 my $run_pangenome;
79 my $run_variation;
80 my $run_evolution;
81 my $run_function;
82 my $method="";
83 my $thread;
84 my $score;
85 my $identity;
86 my $evalue;
87 my $coverage;
88 my $global;
89 my $local;
90 my $bootstrap;
91
92
93 my %pep;
94 my %nuc;
95 my $spnum;
96 my @clusters;
97 my $Cluster;
98 my @SpecieCombination;
99 my @spID;
100 my %genenum;
101 my %aaAln;
102 my %ntAln;
103 my %cog;
104 my %description;
105 #my %aa4tree; ### AA sequence for Phylogenetic Tree
106 my %nt4tree; ### nucleotide sequence for Phylogenetic Tree
107 my @SNPPosition; ### SNP position
108 my $dieMessage="You did not run PGAP.pl in the program directory\n";
109 my $section;
110
111 ######### common temporary variable #############
112 my $i;
113 my $j;
114 my $line;
115 my %tmpHash;
116 my @tmp;
117 my $tmp;
118 my $key;
119 my @row;
120 my $inparacount;
121 my $ClusterID;
122 my $orth;
123 my @content;
124 my $clusterName;
125 my @xdata;
126 my @ydata;
127 my @fit;
128
129 my $fit_A;
130 my $fit_A_interval;
131 my $fit_B;
132 my $fit_C;
133 my $fit_C_interval;
134 my $fit_Rsquare;
135
136
137
138 #### check option
139
140 my $opt_error=0;
141
142 if ((scalar(keys %opt) ==0) or (exists($opt{"help"})))
143 {
144 print join("\n",@usage)."\n";
145 exit;
146 }
147
148
149 ###################### public info
150 ### strains name
151 my @species;
152 if (exists($opt{"strains"}))
153 {
154 @species=split(/\+/,$opt{"strains"});
155 $spnum=scalar(@species);
156 }else
157 {
158 print "Please assign strains nick name!\n";
159 exit;
160 }
161
162 ### input data directory
163
164 if (exists($opt{"input"}))
165 {
166 $inputDIR=$opt{"input"};
167 if ($inputDIR!~/\/$/)
168 {
169 $inputDIR=$inputDIR."/";
170 }
171 }else
172 {
173 print "Please assign input data directory!\n\n";
174 exit;
175 }
176 ### output data directory
177
178 if (exists($opt{"output"}))
179 {
180 $outputDIR=$opt{"output"};
181 if ($outputDIR!~/\/$/)
182 {
183 $outputDIR=$outputDIR."/";
184 }
185 }else
186 {
187 print "Please assign result output directory!\n\n";
188 exit;
189 }
190
191 ###################### section info
192
193 if (exists($opt{"cluster"}))
194 {
195 $run_cluster=1;
196 }else
197 {
198 $run_cluster=0;
199 }
200
201 if (exists($opt{"pangenome"}))
202 {
203 $run_pangenome=1;
204 }else
205 {
206 $run_pangenome=0;
207 }
208
209 if (exists($opt{"variation"}))
210 {
211 $run_variation=1;
212 }else
213 {
214 $run_variation=0;
215 }
216
217 if (exists($opt{"evolution"}))
218 {
219 $run_evolution=1;
220 }else
221 {
222 $run_evolution=0;
223 }
224
225 if (exists($opt{"function"}))
226 {
227 $run_function=1;
228 }else
229 {
230 $run_function=0;
231 }
232
233 if ($run_cluster)
234 {
235 ### method
236 if (exists($opt{"method"}))
237 {
238 $method=uc($opt{"method"});
239 if ($method!~/^GF$/ and $method!~/^MP$/)
240 {
241 print "Unknown method: ".$opt{"method"}."\n";
242 exit;
243 }
244 }else
245 {
246 print "Please assign the cluster method!\n\n";
247 exit;
248 }
249
250 ##thread
251 if (exists($opt{"thread"}))
252 {
253 $thread=$opt{"thread"};
254 if ($thread==0)
255 {
256 print "please assign an applicable thread value.\n";
257 exit;
258 }
259 }else
260 {
261 $thread=1;
262 }
263
264 ##score
265 if (exists($opt{"score"}))
266 {
267 $score=$opt{"score"};
268 if ($score<=0)
269 {
270 print "please assign an applicable score value.\n";
271 exit;
272 }
273 }else
274 {
275 $score=40;
276 }
277
278 if ($method eq "GF")
279 {
280 ###identity
281 if (exists($opt{"identity"}))
282 {
283 $identity=$opt{"identity"};
284 if ($identity>1 or $identity<=0)
285 {
286 print "identity should be 0 ~ 1 \n";
287 exit;
288 }
289 }else
290 {
291 $identity=0.5;
292 }
293
294 ###evalue
295 if (exists($opt{"evalue"}))
296 {
297 $evalue=$opt{"evalue"};
298 }else
299 {
300 $evalue=1e-10;
301 }
302
303 ###coverage
304 if (exists($opt{"coverage"}))
305 {
306 $coverage=$opt{"coverage"};
307 if ($coverage>1 or $coverage<=0)
308 {
309 print "coverage should be 0 ~ 1 \n";
310 exit;
311 }
312 }else
313 {
314 $coverage=0.5;
315 }
316 }
317
318
319 if ($method eq "MP")
320 {
321 ###global
322 if (exists($opt{"global"}))
323 {
324 $global=$opt{"global"};
325 if ($global>1 or $global<=0)
326 {
327 print "global coverage should be 0 ~ 1 \n";
328 exit;
329 }
330 }else
331 {
332 $global=0.5;
333 }
334 ###local
335 if (exists($opt{"local"}))
336 {
337 $local=$opt{"local"};
338 if ($local<=0)
339 {
340 print "local coverage should be 0 ~ [global coverage value] \n";
341 exit;
342 }
343 if ($local>$global)
344 {
345 print "local coverage should be less than global coverage!\n";
346 exit;
347 }
348 }else
349 {
350 $local=0.25;
351 }
352 }
353 }
354
355 if ($run_evolution)
356 {
357 if (exists($opt{"bootstrap"}))
358 {
359 $bootstrap=$opt{"bootstrap"};
360 if ($bootstrap<=0)
361 {
362 print "please assign an applicable bootstrap value.\n";
363 }
364 }else
365 {
366 $bootstrap=1;
367 }
368 }
369
370 print "Program begin at ".localtime()."\n";
371 print "The following are the parameters for current process:\n";
372 print "Strains: ".join(",",@species)."\n";
373 print "Input directory: $inputDIR\n";
374 print "Output directory: $outputDIR\n";
375 if ($run_cluster)
376 {
377 print "Cluster analysis: yes\n";
378 print " Method: $method\n";
379 print " Thread: $thread\n";
380 if ($method eq "GF")
381 {
382 print " E-value: $evalue\n";
383 print " Identity: $identity\n";
384 print " Coverage: $coverage\n";
385 print " Score: $score\n";
386 }
387 if ($method eq "MP")
388 {
389 print " Local: $local\n";
390 print " Global: $global\n";
391 print " Score: $score\n";
392 }
393 }else
394 {
395 print "Cluster analysis: no\n";
396 }
397 if ($run_pangenome)
398 {
399 print "Pan-genome analysis: yes\n";
400 }else
401 {
402 print "Pan-genome analysis: no\n";
403 }
404 if ($run_variation)
405 {
406 print "Variation analysis: yes\n";
407 }else
408 {
409 print "Variation analysis: no\n";
410 }
411 if ($run_evolution)
412 {
413 print "Evolution analysis: yes\n";
414 print " Bootstrap: $bootstrap\n";
415 }else
416 {
417 print "Evolution analysis: no\n";
418 }
419 if ($run_function)
420 {
421 print "Function analysis: yes\n";
422 }else
423 {
424 print "Function analysis: no\n";
425 }
426
427 $section=$run_cluster.$run_pangenome.$run_variation.$run_evolution.$run_function;
428
429 ###############################################
430 # section 0) check input file and program
431 ###############################################
432 if (!(-e $outputDIR))
433 {
434 system("mkdir $outputDIR");
435 }
436 system("chmod +rw $outputDIR");
437
438 if (!(-w $outputDIR))
439 {
440 print "There is no WRITE permission in $outputDIR\n";
441 exit;
442 }
443 @tmp=qw();
444 &CheckInputFile(\@species,$inputDIR,$section,$method,\@tmp);
445 &CheckExtraProgram($section,$method,\@tmp);
446
447 if (scalar(@tmp)>0)
448 {
449 open(R,">".$outputDIR."0.error.message");
450 print R join("",@tmp)."\n";
451 close(R);
452 print "error!\nlog are saved in ${outputDIR}0.error.message\n";
453 exit;
454 }
455
456
457 ############################################
458 # section 1) cluster analysis
459 ############################################
460
461 if ($run_cluster)
462 {
463 print "\n\n############################################\n";
464 print "# section 1) cluster analysis\n";
465 print "############################################\n\n\n";
466
467 #### cluster gene and return result to the array @clusters
468
469 if ($method eq "MP")
470 {
471 print "Begin cluster gene with MP method ...\n";
472 &MP();
473 }else
474 {
475 print "Begin cluster gene with GF method ...\n";
476 &GF();
477 }
478
479 #### output normal cluster format
480
481 &FormatClusterOutPut(\@species,"${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
482
483 #### Retrieve cluster
484
485 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
486
487 ##### gene distribution in each strains
488 %tmpHash=();
489 &GeneDistribution(\@clusters,\%tmpHash);
490
491 open(R,">${outputDIR}1.Gene_Distribution_By_Conservation.txt");
492 print R "SharedBy_Strains\t".join("\t",@species)."\n";
493
494 for ($i=$spnum;$i>0;$i--)
495 {
496 print R $i;
497 for ($j=0;$j<$spnum;$j++)
498 {
499 if (exists($tmpHash{$i."|".$j}))
500 {
501 print R "\t".$tmpHash{$i."|".$j};
502 }else
503 {
504 print R "\t0";
505 }
506 }
507 print R "\n";
508 }
509
510 close(R);
511
512
513 }else
514 {
515 print "Homologous gene clustering is skipped!\n";
516 }
517
518
519 if ($run_pangenome)
520 {
521 print "\n\n############################################\n";
522 print "# section 2) Pan-genome analysis\n";
523 print "############################################\n\n\n";
524
525 #### Retrieve cluster
526 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
527 chomp(@clusters);
528
529 #### convert file into 0-1 matrix
530 for ($line=0;$line<@clusters;$line++)
531 {
532 @row=split(/\t/,$clusters[$line]);
533 splice(@row,0,1);
534 for ($i=0;$i<@row;$i++)
535 {
536 if ($row[$i] eq "-")
537 {
538 $row[$i]=0;
539 }else
540 {
541 $row[$i]=1;
542 }
543 }
544 $clusters[$line]=join("\t",@row);
545 }
546
547 #### fetch gene number of each strains
548 for ($i=0;$i<$spnum;$i++)
549 {
550 open(F,"$inputDIR$species[$i].pep");
551 @tmp=<F>;
552 close(F);
553 @tmp=grep(/^>/,@tmp);
554 $genenum{$species[$i]}=scalar(@tmp);
555 }
556
557 #### pan genome size and core genome size
558 print "Deducing pan genome size and core genome size for each composition...\n\n";
559
560 open(PAN,">${outputDIR}2.PanGenome.Data.txt");
561 print PAN "ClusterConservation\tTotalGeneNumber\tPanGenome\tCoreGenome\n";
562
563 for ($i=1;$i<=scalar(@species);$i++)
564 {
565 #@SpecieCombination=&Combination(\@species,$i);
566 #@SpecieCombination=&Combination($spnum,$i);
567 if (&ChkCombinationValue($spnum,$i) !=0) ### transfer the array reference to the subroutine
568 {
569 &Combination($spnum,$i,\@SpecieCombination); ## if the combination number is less than sampleSize, then fecth all, else sample
570 }else
571 {
572 &SampleCombination($spnum,$i,\@SpecieCombination);
573 }
574
575 foreach $key (@SpecieCombination)
576 {
577 ##### count total gene number in current combination
578 $tmp=0;
579 @spID=split(/\t/,$key); #### speices id in current combination
580 foreach (@spID)
581 {
582 $tmp=$tmp+$genenum{$species[$_]};
583 }
584 ##### scan pangenome and coregenome
585 @tmp=split(/\t/,&PanGenomeNumber(\@spID));
586 print PAN "$i\t$tmp\t".join("\t",@tmp)."\n";
587
588 }
589 }
590
591 close(PAN);
592
593 #### data fit
594
595 #### for model A
596
597 if ($spnum<3)
598 {
599 print "There are $spnum strains. For pan-genome function fitting, at least 3 strains data are required.\n";
600 }else
601 {
602 open(R,">${outputDIR}2.PanGenome.Profile.txt");
603 ##### genome number & pan-genome size
604 @xdata=qw();
605 @ydata=qw();
606 &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,2);
607 &SumData(\@xdata,\@ydata,"mean");
608 ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_A(\@xdata,\@ydata);
609 print R "The relation bewteen genome number and pan-genome size\n\n";
610 print R "Function model: y=A*x**B +C \n";
611 print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n";
612 print R "Fitting result:\n";
613 print R "\ty = $fit_A *x**$fit_B + $fit_C\n";
614 print R "\tR-square = $fit_Rsquare\n";
615 print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n";
616 print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n";
617
618 ##### total gene number & pan-genome size
619 #@xdata=qw();
620 #@ydata=qw();
621 #&ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,1,\@ydata,2);
622 #&SumDataByMedian(\@xdata,\@ydata);
623 #($fit_Rsquare, $fit_A, $fit_B, $fit_C)=&fit_model_A(\@xdata,\@ydata);
624 #print R "The relation bewteen total gene number and pan-genome size\n\n";
625 #print R "$fit_Rsquare, $fit_A, $fit_B, $fit_C\n";
626 #print R "\ty = $fit_A *x**$fit_B + $fit_C R-square = $fit_Rsquare\n";
627 #print R "\tx: total gene number\n";
628 #print R "\ty: pan-genome size\n\n\n\n\n";
629
630 ##### genome number & core genome
631 @xdata=qw();
632 @ydata=qw();
633 &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,3);
634 &SumData(\@xdata,\@ydata,"mean");
635 ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_B(\@xdata,\@ydata);
636 print R "The relation bewteen genome number and core genome size\n\n";
637 print R "Function model: y=A*exp(B*x) +C \n";
638 print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n";
639 print R "Fitting result:\n";
640 print R "\ty = $fit_A *exp($fit_B * x) + $fit_C R-square = $fit_Rsquare\n";
641 print R "\tR-square = $fit_Rsquare\n";
642 print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n";
643 print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n";
644 close(R);
645 }
646
647 }
648
649
650 ############################################
651 # section 3) CDS variation analysis
652 ############################################
653
654 if ($run_variation)
655 {
656 print "\n\n############################################\n";
657 print "# section 3) CDS variation analysis\n";
658 print "############################################\n\n\n";
659
660 #### Retrieve cluster
661 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
662 chomp(@clusters);
663
664 ## protein
665 system("rm -rf *.pep");
666 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
667 system("cat *.pep > All.faa && rm -rf *.pep && mv All.faa All.pep");
668 &ReadSequenceInToHash("All.pep",\%pep);
669 ## nucleic
670 system("rm -rf *.nuc");
671 &PrepareFasta(\@species,$inputDIR,".nuc"); ###prepare nuc file
672 system("cat *.nuc > All.ffn && rm -rf *.nuc && mv All.ffn All.nuc");
673 &ReadSequenceInToHash("All.nuc",\%nuc);
674
675 ## scanning SNP
676 %nt4tree=();
677 for ($i=0;$i<$spnum;$i++)
678 {
679 $nt4tree{"S".$i}="";
680 }
681
682 open(VAR,">${outputDIR}3.CDS.variation.txt");
683 print VAR "ClusterID\tStrains_Number\tGene_Number\tPosition\taaType\tntType\tntProfile\tVariation type\n";
684
685 open(VA,">${outputDIR}3.CDS.variation.analysis.txt");
686 print VA "ClusterID\tInDel Base\tNonsynonymous mutation\tSynonymous mutation\n";
687
688 for ($line=0;$line<@clusters;$line++)
689 {
690 @row=split(/\t|\,/,$clusters[$line]);
691 $ClusterID=$row[0];
692 splice(@row,0,1);
693 @row=grep(/^S/,@row);
694 if (scalar(@row) >=2)
695 {
696 open(PEP,">$ClusterID.pep");
697 open(NUC,">$ClusterID.nuc");
698 foreach $key (@row)
699 {
700 print PEP ">$key\n$pep{$key}\n";
701 print NUC ">$key\n$nuc{$key}\n";
702 }
703 close(PEP);
704 close(NUC);
705 system("$mafft --quiet $ClusterID.pep > $ClusterID.pal");
706 #system("perl ./pal2nal.pl $ClusterID.pal $ClusterID.nuc -output fasta > $ClusterID.nal");
707 $tmp=&pal2nal("$ClusterID.pal","$ClusterID.nuc","$ClusterID.nal");
708 if ($tmp == 0)
709 {
710 system("rm -rf $ClusterID.*");
711 next;
712 }
713
714 @tmp=&DetectSNP();
715 if (scalar(@tmp)>0)
716 {
717 print VA $ClusterID."\t".&VarAnalysis(\@tmp)."\n";
718 print VAR join("",@tmp);
719 ### core orthologs
720 @row=split(/\t/,$clusters[$line]);
721 splice(@row,0,1);
722 if ((&CountGeneInCluster(join("\t",@row)) ==$spnum) and (&CountSpeicesInCluster(join("\t",@row)) == $spnum) )
723 {
724 $count_tree++;
725 %tmpHash=();
726 foreach (@row)
727 {
728 $tmpHash{$_}="";
729 }
730 &RemoveHeadGap("$ClusterID.nal",\%tmpHash);
731 &ExtractSNP4tree(\%tmpHash,\%nt4tree);
732 }
733 }
734 system("rm -rf $ClusterID.*");
735 }
736 }
737 close(VAR);
738 close(VA);
739
740 open(R,">${outputDIR}3.CDS.variation.for.evolution.txt");
741 foreach $key (keys %nt4tree)
742 {
743 $_=$key;
744 s/s//gi;
745 print R ">$species[$_]\n$nt4tree{$key}\n";
746 }
747 close(R);
748 print $count_tree."\n\n";
749
750 ###
751 system("rm All.nuc All.pep");
752 }else
753 {
754 print "CDS variation is skipped.\n";
755 }
756
757 ############################################
758 # section 4) CDS variation analysis
759 ############################################
760
761 if ($run_evolution)
762 {
763 #### Retrieve cluster
764 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
765 chomp(@clusters);
766
767
768
769 ##################
770 ##
771 ## Distance based
772 ##
773 ##################
774
775 ### caculate the distance between each two strains by cluster
776 &ClusterProfil4Specie(\%tmpHash); # caculate Clusters profile for each specie
777 &DistanceMatrix($spnum,\%tmpHash); # caculate distance matrix acoording to Clusters profile
778 ###output distance
779 open(DIST,">${outputDIR}4.Species_Distance_Clusters_Based.txt");
780 ###header
781 printf DIST "%5d", $spnum;
782 print DIST "\n";
783 foreach $i (0..($spnum-1))
784 {
785 $key="sp".$i."sp";
786 printf DIST "%-10s",$key;
787 foreach $j (0..($spnum-1))
788 {
789 printf DIST " %8f",$tmpHash{$i."-".$j};
790 }
791 print DIST "\n";
792 }
793 close(DIST);
794
795 %tmpHash=();
796
797 ### based on pan genome (distance)
798 print "\nDraw pangenome based phylogenetic tree ...\n\n";
799
800 &PanBasedTree("${outputDIR}4.Species_Distance_Clusters_Based.txt","${outputDIR}4.PanBased");
801
802 ##################
803 ##
804 ## SNP based
805 ##
806 ##################
807 %tmpHash=();
808 if (!(-e "${outputDIR}3.CDS.variation.for.evolution.txt"))
809 {
810 print "Variation in core orthologs cluster is not found from ${outputDIR}3.CDS.variation.for.evolution.txt.\n";
811 print "Maybe you have skipped CDS variation analysis.\n";
812 }else
813 {
814 &ReadSequenceInToHash("${outputDIR}3.CDS.variation.for.evolution.txt",\%tmpHash);
815 open(R,">mlst.aln");
816 for ($i=0;$i<@species;$i++)
817 {
818 print R ">sp${i}sp\n".$tmpHash{$species[$i]}."\n";
819 }
820 close(R);
821 &fasta2phylip("mlst.aln","mlst.phylip");
822 system("rm -rf mlst.aln");
823
824 print "\nDraw SNP based phylogenetic tree ...\n\n";
825 &SNPBasedTree("mlst.phylip","${outputDIR}4.SNPBased");
826 system("rm -rf mlst.phylip")
827 }
828
829 ########
830 # replace speices name
831 ########
832
833 opendir(DIR,"${outputDIR}");
834 @tmp=readdir(DIR);
835 closedir(DIR);
836 @tmp=grep(/^4/,@tmp);
837 foreach $tmp (@tmp)
838 {
839 &ReplaceName(\@species,"${outputDIR}$tmp");
840 }
841 }else
842 {
843 print "Evolution analysis is skipped.\n";
844 }
845
846
847 ############################################
848 # section 4) Function analysis
849 ############################################
850
851 if ($run_function)
852 {
853 #### Retrieve cluster
854 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
855 chomp(@clusters);
856
857 #### prepare annotation file
858 &PrepareTable(\@species,$inputDIR,".function"); ###prepare location file
859 &ReadAnnotation(\@species,\%cog,\%description);
860
861
862 #### assign function
863 open(R,">${outputDIR}5.Orthologs_Cluster_Function.txt");
864 print R "ClusterID\tConservation_Level\tCOG\tDescription\n";
865 for ($i=0;$i<@clusters;$i++)
866 {
867 @row=split(/\t/,$clusters[$i]);
868 $ClusterID=$row[0];
869 splice(@row,0,1);
870 print R $ClusterID."\t".&CountSpeicesInCluster(join("\t",@row))."\t".&getCOG(\@row,\%cog)."\t".&getDescription(\@row,\%description)."\n";
871 }
872 close(R);
873
874 #### COG distribution
875
876 ###Whole Clusters COG Distribution
877 &outputCOGStatistic("${outputDIR}5.Orthologs_Whole_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,1));
878
879 ###Core Clusters COG Distribution
880 &outputCOGStatistic("${outputDIR}5.Orthologs_Core_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,$spnum));
881
882 ###Dispensable Clusters COG Distribution
883 &outputCOGStatistic("${outputDIR}5.Orthologs_Dispensable_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",($spnum-1),2));
884
885 ###strains specifc Clusters COG Distribution
886 &outputCOGStatistic("${outputDIR}5.Orthologs_specifc_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",1,1));
887
888 system("rm -rf *.function");
889
890 }else
891 {
892 print "Function analysis is skipped.\n";
893 }
894
895 sub outputCOGStatistic()
896 {
897 (my $file,my $subcogcount)=@_;
898 my @cogcat=("J A K L B","D Y V T M N Z W U O","C G E F H I P Q","R S -");
899 my @cogdesc=("INFORMATION STORAGE AND PROCESSING","CELLULAR PROCESSES AND SIGNALING","METABOLISM","POORLY CHARACTERIZED");
900 my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -);
901 my @subcogdesc=("[J] Translation, ribosomal structure and biogenesis","[A] RNA processing and modification","[K] Transcription","[L] Replication, recombination and repair","[B] Chromatin structure and dynamics","[D] Cell cycle control, cell division, chromosome partitioning","[Y] Nuclear structure","[V] Defense mechanisms","[T] Signal transduction mechanisms","[M] Cell wall/membrane/envelope biogenesis","[N] Cell motility","[Z] Cytoskeleton","[W] Extracellular structures","[U] Intracellular trafficking, secretion, and vesicular transport","[O] Posttranslational modification, protein turnover, chaperones","[C] Energy production and conversion","[G] Carbohydrate transport and metabolism","[E] Amino acid transport and metabolism","[F] Nucleotide transport and metabolism","[H] Coenzyme transport and metabolism","[I] Lipid transport and metabolism","[P] Inorganic ion transport and metabolism","[Q] Secondary metabolites biosynthesis, transport and catabolism","[R] General function prediction only","[S] Function unknown","[-] Unclassified");
902 my %subcogdesc;
903 my $key;
904 my @cog;
905 my $i;
906 my $cognum;
907
908 for ($i=0;$i<@subcogcat;$i++)
909 {
910 $subcogdesc{$subcogcat[$i]}=$subcogdesc[$i];
911 }
912
913 open(R,">$file");
914 for ($i=0;$i<@cogcat;$i++)
915 {
916 $cognum=0;
917 foreach $key (split(" ",$cogcat[$i]))
918 {
919 $cognum=$cognum+$$subcogcount{$key};
920 }
921 print R $cogdesc[$i]." ( ".$cognum." )\n";
922 foreach $key (split(" ",$cogcat[$i]))
923 {
924 printf R "%-6d %s\n",$$subcogcount{$key},$subcogdesc{$key};
925 }
926 print R "\n";
927
928 }
929 close(R);
930
931 }
932
933 sub scanCOG()
934 {
935 (my $file,my $max_orth,my $min_orth)=@_;
936 my @row;
937 my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -);
938 my %subcogcount;
939 my $cog;
940 my $key;
941
942 foreach $key (@subcogcat)
943 {
944 $subcogcount{$key}=0;
945 }
946
947 @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S);
948
949 open(F,"$file");
950 $_=<F>;
951 while (<F>)
952 {
953 @row=split(/\t/,$_);
954 if ($row[1]>=$min_orth and $row[1]<=$max_orth)
955 {
956 if ($row[2] eq "-")
957 {
958 $subcogcount{"-"}++;
959 }else
960 {
961 $_=uc($row[2]);
962 s/COG//gi;
963 $cog=$_;
964 foreach $key (@subcogcat)
965 {
966 if ($cog=~/$key/)
967 {
968 $subcogcount{$key}++;
969 }
970 }
971 }
972 }
973 }
974 close(F);
975
976 return \%subcogcount;
977 }
978
979
980
981
982 sub getCOG()
983 {
984 (my $data,my $coghash)=@_;
985 my $cog="";
986 my @cog;
987 my $key;
988 my %hash;
989 my @gene=split(/\t|\,/,join("\t",@$data));
990 @gene=grep(/^S/,@gene);
991
992 foreach $key (@gene)
993 {
994 if (($$coghash{$key} ne "-") and ($$coghash{$key} ne ""))
995 {
996 $cog=$cog.",".$$coghash{$key};
997 }
998 }
999 @cog=split(/,/,$cog);
1000 foreach $cog (@cog)
1001 {
1002 if ($cog ne "")
1003 {
1004 $hash{$cog}=1;
1005 }
1006 }
1007
1008 $cog=join(",",(keys %hash));
1009 if ($cog eq "")
1010 {
1011 $cog="-";
1012 }
1013 return $cog;
1014 }
1015
1016 sub getDescription()
1017 {
1018 (my $data,my $deschash)=@_;
1019 my $desc="";
1020 my $key;
1021 my @gene=split(/\t|\,/,join("\t",@$data));
1022 @gene=grep(/^S/,@gene);
1023
1024 foreach $key (@gene)
1025 {
1026 if ( ($$deschash{$key} ne "") and ($$deschash{$key} ne "-") and ($$deschash{$key}!~/hypothetical/))
1027 {
1028 $desc=$$deschash{$key};
1029 }
1030 }
1031
1032 if ($desc eq "")
1033 {
1034 $desc="hypothetical protein";
1035 }
1036
1037 return $desc;
1038 }
1039
1040
1041
1042 sub ReadAnnotation()
1043 {
1044 (my $species,my $cog,my $description)=@_;
1045 my $i;
1046 my @row;
1047
1048 for ($i=0;$i<@$species;$i++)
1049 {
1050 open(F,"$$species[$i].function");
1051 while (<F>)
1052 {
1053 chomp($_);
1054 @row=split(/\t/,$_);
1055 if (scalar(@row)>=2)
1056 {
1057 $$cog{$row[0]}=$row[1];
1058 }else
1059 {
1060 $$cog{$row[0]}="-";
1061 }
1062
1063 if (scalar(@row)>=3)
1064 {
1065 $$description{$row[0]}=$row[2];
1066 }else
1067 {
1068 $$description{$row[0]}="hypothetical protein";
1069 }
1070 }
1071 close(F);
1072 }
1073 }
1074
1075
1076 sub SNPBasedTree()
1077 {
1078 (my $infile,my $outfileprefix)=@_;
1079 my $tmpin=$infile;
1080 my $tmpout;
1081
1082
1083 #### boootstrap
1084 print "\n#### seqboot ...\n\n";
1085 open(R,">seqboot.cmd");
1086 #print R "$tmpin\n";
1087 print R "R\n";
1088 print R "$bootstrap\n";
1089 print R "Y\n";
1090 print R "1\n";
1091 close(R);
1092
1093 system("cp $tmpin infile");
1094 system("$seqboot < seqboot.cmd");
1095 system("mv outfile 100dnaseq");
1096 system("rm -rf infile");
1097 system("rm seqboot.cmd"); # 100dnasseq
1098
1099 #### dnaml
1100 print "\n#### dnaml ...\n\n";
1101 open(R,">dnaml.cmd");
1102 #print R "100dnaseq\n";
1103 print R "T\n";
1104 print R "25\n";
1105 if ($bootstrap>1)
1106 {
1107 print R "M\n";
1108 print R "D\n";
1109 #print R "100\n";
1110 print R "$bootstrap\n";
1111 print R "1\n"; # Random number seed (must be odd)?
1112 print R "5\n"; # Number of times to jumble?
1113 }
1114 print R "Y\n";
1115 close(R);
1116
1117 system("cp 100dnaseq infile");
1118 system("$dnaml < dnaml.cmd");
1119 system("rm -rf outfile");
1120 system("rm -rf infile");
1121 system("mv outtree 100dnaseqtree"); # 100dnaseq, 100dnaseqtree
1122
1123 #### consense
1124 print "\n#### dnaml consense ...\n\n";
1125
1126 open(R,">consense.cmd");
1127 #print R "100dnaseqtree\n";
1128 print R "Y\n";
1129 close(R);
1130
1131 system("cp 100dnaseqtree intree");
1132 system("$consense < consense.cmd");
1133 system("mv outfile ${outfileprefix}.ML.outfile");
1134 system("mv outtree ${outfileprefix}.ML.tree");
1135 system("rm -rf infile");
1136 system("rm -rf 100dnaseqtree"); # 100dnaseq
1137
1138 #### dnadist
1139 print "\n#### dnadist ...\n\n";
1140 open(R,">dnadist.cmd");
1141 #print R "100dnaseq\n";
1142 print R "T\n";
1143 print R "25\n";
1144 if ($bootstrap>1)
1145 {
1146 print R "M\n";
1147 print R "D\n";
1148 #print R "100\n";
1149 print R "$bootstrap\n";
1150 }
1151 print R "Y\n";
1152 close(R);
1153
1154 system("cp 100dnaseq infile");
1155 system("$dnadist < dnadist.cmd");
1156 system("rm -rf 100dnaseq");
1157 system("rm -rf infile");
1158 system("mv outfile 100dnadist"); # 100dnadist
1159
1160 #### Neighbor-joining tree
1161 print "\n#### Neighbor-joining ...\n\n";
1162 open(R,">NJ.cmd");
1163 if ($bootstrap>1)
1164 {
1165 #print R "100dnadist\n";
1166 print R "M\n";
1167 #print R "100\n";
1168 print R "$bootstrap\n";
1169 print R "1\n";
1170 }
1171 print R "Y\n";
1172 close(R);
1173
1174 system("cp 100dnadist infile");
1175 system("$neighbor < NJ.cmd");
1176 system("mv outtree 100dnadistNJtree");
1177 system("rm outfile");
1178 system("rm -rf infile");
1179 system("rm -rf NJ.cmd"); # 100dnadist,100dnadistNJtree
1180
1181 #### NJ-consense
1182 print "\n#### NJ-consense ...\n\n";
1183 open(R,">NJ-consense.cmd");
1184 #print R "100dnadistNJtree\n";
1185 print R "Y\n";
1186 close(R);
1187
1188 system("cp 100dnadistNJtree intree");
1189 system("$consense < NJ-consense.cmd");
1190 system("mv outfile ${outfileprefix}.Neighbor-joining.outfile");
1191 system("mv outtree ${outfileprefix}.Neighbor-joining.tree");
1192 system("rm -rf NJ-consense.cmd");
1193 system("rm -rf intree");
1194 system("rm -rf 100dnadistNJtree");
1195
1196
1197 #### UPGMA tree
1198 print "\n#### UPGMA ...\n\n";
1199 open(R,">UPGMA.cmd");
1200 #print R "100dnadist\n";
1201 print R "N\n";
1202 if ($bootstrap>1)
1203 {
1204 print R "M\n";
1205 #print R "100\n";
1206 print R "$bootstrap\n";
1207 print R "1\n";
1208 }
1209 print R "Y\n";
1210 close(R);
1211
1212 system("cp 100dnadist infile");
1213 system("$neighbor < UPGMA.cmd");
1214 system("mv outtree 100dnadistUPGMAtree");
1215 system("rm -rf outfile");
1216 system("rm -rf infile");
1217 system("rm -rf UPGMA.cmd");
1218
1219 #### UPGMA-consense
1220 print "\n#### UPGMA-consense ...\n\n";
1221 open(R,">UPGMA-consense.cmd");
1222 #print R "100dnadistUPGMAtree\n";
1223 print R "Y\n";
1224 close(R);
1225
1226 system("cp 100dnadistUPGMAtree intree");
1227 system("$consense < UPGMA-consense.cmd");
1228 system("mv outfile ${outfileprefix}.UPGMA.outfile");
1229 system("mv outtree ${outfileprefix}.UPGMA.tree");
1230 system("rm -rf UPGMA-consense.cmd");
1231 system("rm -rf 100dnadistUPGMAtree");
1232 system("rm -rf intree");
1233
1234 ###CLEAN TMP FILE
1235
1236 system("rm -rf *.cmd");
1237 system("rm -rf 100dnadist");
1238 }
1239
1240 sub PanBasedTree()
1241 {
1242 (my $infile,my $outfileprefix)=@_;
1243 my $tmpin;
1244 my $tmpout;
1245
1246 $tmpin=$infile;
1247 #### Neighbor-joining tree
1248
1249 open(R,">NJ.cmd");
1250 #print R "$tmpin\n";
1251 print R "Y\n";
1252 close(R);
1253
1254 system("cp $tmpin infile");
1255 system("$neighbor < NJ.cmd");
1256 system("mv outfile ${outfileprefix}.Neighbor-joining.outfile");
1257 system("mv outtree ${outfileprefix}.Neighbor-joining.tree");
1258 system("rm -rf NJ.cmd");
1259 system("rm -rf infile");
1260
1261 #### UPGMA tree
1262
1263 open(R,">UPGMA.cmd");
1264 #print R "$tmpin\n";
1265 print R "N\n";
1266 print R "Y\n";
1267 close(R);
1268
1269 system("cp $tmpin infile");
1270 system("$neighbor < UPGMA.cmd");
1271 system("mv outfile ${outfileprefix}.UPGMA.outfile");
1272 system("mv outtree ${outfileprefix}.UPGMA.tree");
1273 system("rm -rf UPGMA.cmd");
1274 system("rm -rf infile");
1275
1276
1277 ###CLEAN TMP FILE
1278 system("rm -rf *.cmd");
1279 }
1280
1281 sub DistanceMatrix()
1282 {
1283 (my $spnum,my $hash)=@_;
1284 my $i;
1285 my $j;
1286 my $k;
1287 my $dist;
1288 my $ref;
1289 my $query;
1290 foreach $i (0..($spnum-1))
1291 {
1292 foreach $j ($i..($spnum-1))
1293 {
1294 $ref=$$hash{$i};
1295 $query=$$hash{$j};
1296 $dist=0;
1297 for ($k=0;$k<length($ref);$k++)
1298 {
1299 if (substr($ref,$k,1) ne substr($query,$k,1))
1300 {
1301 $dist++;
1302 }
1303 }
1304 $$hash{$i."-".$j}=$dist;
1305 $$hash{$j."-".$i}=$dist;
1306 }
1307 }
1308 }
1309
1310
1311 sub ClusterProfil4Specie
1312 {
1313 (my $hash)=@_;
1314 my @row;
1315 my $i;
1316
1317 foreach (0..($spnum-1)) #initialization Hash
1318 {
1319 $$hash{$_}="";
1320 }
1321
1322 foreach (@clusters)
1323 {
1324 @row=split(/\t/,$_);
1325 splice(@row,0,1);
1326 if (&CountSpeicesInCluster(join("\t",@row))>1)
1327 {
1328 for ($i=0;$i<@row;$i++)
1329 {
1330 if ($row[$i] eq "-")
1331 {
1332 $$hash{$i}=$$hash{$i}."0";
1333 }else
1334 {
1335 $$hash{$i}=$$hash{$i}."1";
1336 }
1337 }
1338 }
1339 }
1340 }
1341
1342 # &ExtractSNP4tree(\%tmpHash,\%nt4tree);
1343
1344 sub ExtractSNP4tree()
1345 {
1346 (my $hash,my $nt4treeRef)=@_;
1347 my $key;
1348 my @row;
1349 my $i;
1350 my $len;
1351 my @tribases;
1352 foreach $key (keys %$hash)
1353 {
1354 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key};
1355 delete($$hash{$key});
1356 }
1357
1358 for ($i=0;$i<$spnum;$i++)
1359 {
1360 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$$hash{"S".$i};
1361 }
1362 }
1363
1364
1365 =pod
1366 sub ExtractSNP4tree()
1367 {
1368 (my $hash,my $nt4treeRef)=@_;
1369 my $key;
1370 my @row;
1371 my $i;
1372 my $len;
1373 my @tribases;
1374 foreach $key (keys %$hash)
1375 {
1376 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key};
1377 delete($$hash{$key});
1378 }
1379 @_=(keys %$hash);
1380 $len=length($_[0]);
1381 for ($j=0;3*$j<$len;$j++)
1382 {
1383 ##### scanning each codon
1384 for ($i=0;$i<$spnum;$i++)
1385 {
1386 $tribases[$i]=substr($$hash{"S".$i},3*$j,3);
1387 }
1388 ##### checking each codon
1389 if (&IsTheSame(@tribases) ==0)
1390 {
1391 for ($i=0;$i<@tribases;$i++)
1392 {
1393 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$tribases[$i];
1394 }
1395 }
1396 }
1397 }
1398 =cut
1399
1400
1401 sub pal2nal()
1402 {
1403 (my $pal,my $nuc, my $nal)=@_;
1404 my %aaAln=();
1405 my %ffn=();
1406 my %ntAln=();
1407 my %nt;
1408 my $dna;
1409 my $nt;
1410 my $key;
1411 my $flag=1;
1412 my $i=0;
1413 my $j;
1414
1415 ### read protein aligment result
1416 &ReadAlignmentToHash("$pal",\%aaAln);
1417 ### read nt sequences
1418 &ReadSequenceInToHash("$nuc",\%ffn);
1419 foreach $key (keys %ffn)
1420 {
1421 $dna=$ffn{$key};
1422 #if (int(length($nt{$key})/3)*3 ne length($nt{$key}))
1423 if (int(length($dna)/3)*3 ne length($dna))
1424 {
1425 $flag=0;
1426 print "The length of nucleotide sequence is not 3 integer times.\n";
1427 last;
1428 }else
1429 {
1430 for ($i=0;$i<(length($dna)/3);$i++)
1431 {
1432 $nt{$key."|".$i}=substr($dna,$i*3,3);
1433 }
1434 }
1435 }
1436
1437 if ($flag==0)
1438 {
1439 return 0;
1440 }else
1441 {
1442 foreach $key (keys %aaAln) ### replace aa with corresponding nt
1443 {
1444 $nt="";
1445 $i=0;
1446 for ($j=0;$j<length($aaAln{$key});$j++)
1447 {
1448 if (substr($aaAln{$key},$j,1) eq "-")
1449 {
1450 $nt=$nt."---";
1451 }else
1452 {
1453 $nt=$nt.$nt{$key."|".$i};
1454 $i++;
1455 }
1456 }
1457 $ntAln{$key}=$nt;
1458 }
1459
1460 ### output
1461 open(R,">$nal");
1462 foreach (keys %ntAln)
1463 {
1464 print R ">$_\n".$ntAln{$_}."\n";
1465 }
1466 close(R);
1467
1468 return 1;
1469 }
1470 }
1471
1472
1473
1474
1475 sub DetectSNP()
1476 {
1477 my %faa;
1478 my %ffn;
1479 my @row;
1480 my $count_gene;
1481 my $count_sp;
1482 my @genelist;
1483 my $i;
1484 my $j;
1485 my $pepalnlen;
1486 my @cdsvar=qw();
1487 my $cdi=0;
1488 my @tribases;
1489 my @bases;
1490 my @aa;
1491
1492
1493 ### fetch gene list
1494 open(F,"$ClusterID.pep");
1495 @genelist=<F>;
1496 close(F);
1497 @genelist=grep(/^>/,@genelist);
1498 chomp(@genelist);
1499 $_=join("\t",@genelist);
1500 s/>//g;
1501 @genelist=split(/\t/,$_);
1502
1503 ### count gene number and species number
1504 @row=split(/\t/,$clusters[$ClusterID-1]);
1505 splice(@row,0,1);
1506 $count_sp=&CountSpeicesInCluster(join("\t",@row));
1507 $count_gene=&CountGeneInCluster(join("\t",@row));
1508
1509 ### read alignment sequences
1510 &ReadAlignmentToHash("$ClusterID.pal",\%faa);
1511 &ReadAlignmentToHash("$ClusterID.nal",\%ffn);
1512
1513 @_=(keys %faa);
1514 $pepalnlen=length($faa{$_[0]});
1515 ### scan SNP
1516 for ($i=1;$i<=$pepalnlen;$i++)
1517 {
1518 @tmp=qw();
1519 @tribases=qw();
1520 for ($j=0;$j<@genelist;$j++) ### fetch triplet codon
1521 {
1522 $tribases[$j]=substr($ffn{$genelist[$j]},3*($i-1),3);
1523 }
1524 if (&IsTheSame(@tribases) ==0) ### if triplet codon is not consistent
1525 {
1526 @aa=qw();
1527 for ($j=0;$j<@genelist;$j++)
1528 {
1529 $aa[$j]=substr($faa{$genelist[$j]},($i-1),1);
1530 }
1531 if (&IsTheSame(@aa) ==0) ### aa is not consistent
1532 {
1533 if (join("",@aa) =~/-/)
1534 {
1535 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".$i."\t".&CharType(\@aa)."\t-\t-\tInDel\n";
1536 }else
1537 {
1538 #### base 1
1539 for ($j=0;$j<@genelist;$j++)
1540 {
1541 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1);
1542 }
1543 if (&IsTheSame(@bases) ==0)
1544 {
1545 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n";
1546 }
1547 #### base 2
1548 for ($j=0;$j<@genelist;$j++)
1549 {
1550 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1);
1551 }
1552 if (&IsTheSame(@bases) ==0)
1553 {
1554 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n";
1555 }
1556 #### base 3
1557 for ($j=0;$j<@genelist;$j++)
1558 {
1559 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1);
1560 }
1561 if (&IsTheSame(@bases) ==0)
1562 {
1563 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n";
1564 }
1565 }
1566 }else
1567 {
1568 #### base 1
1569 for ($j=0;$j<@genelist;$j++)
1570 {
1571 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1);
1572 }
1573 if (&IsTheSame(@bases) ==0)
1574 {
1575 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n";
1576 }
1577 #### base 2
1578 for ($j=0;$j<@genelist;$j++)
1579 {
1580 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1);
1581 }
1582 if (&IsTheSame(@bases) ==0)
1583 {
1584 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n";
1585 }
1586 #### base 3
1587 for ($j=0;$j<@genelist;$j++)
1588 {
1589 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1);
1590 }
1591 if (&IsTheSame(@bases) ==0)
1592 {
1593 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n";
1594 }
1595 }
1596 }
1597 }
1598 return @cdsvar;
1599 }
1600
1601
1602 sub VarAnalysis()
1603 {
1604 (my $data)=@_;
1605 my @data=@$data;
1606 my $indel=0;
1607 my $syn=0;
1608 my $nonsyn=0;
1609 my @tmp;
1610 $indel=scalar(grep(/InDel$/,@data));
1611 $nonsyn=scalar(grep(/Nonsynonymous mutation$/,@data));;
1612 $syn=scalar(grep(/Synonymous mutation$/,@data));
1613 return "$indel\t$nonsyn\t$syn";
1614 }
1615
1616
1617
1618 sub CharType()
1619 {
1620 (my $str)=@_;
1621 my %hash;
1622 my @data=@$str;
1623 foreach (@data)
1624 {
1625 $hash{$_}=1;
1626 }
1627 return join(",",(keys %hash));
1628 }
1629
1630 sub IsTheSame()
1631 {
1632 (my @data)=@_;
1633 my %hash;
1634 foreach (@data)
1635 {
1636 $hash{$_}=1;
1637 }
1638 if (scalar(keys %hash) ==1)
1639 {
1640 return 1;
1641 }else
1642 {
1643 return 0;
1644 }
1645 }
1646
1647
1648
1649 sub FormatClusterOutPut()
1650 {
1651 (my $speices,my $file,my $cluster)=@_;
1652 my @row;
1653 my $gid=1;
1654 my $key;
1655 my %hash;
1656 my $gene;
1657 my @tmp;
1658 my $i;
1659 my $j;
1660 open(R,">$file");
1661 print R "ClutserID\t".join("\t",@$speices)."\n";
1662 foreach $key (@$cluster)
1663 {
1664 @row=split(/\t/,$key);
1665 for ($i=0;$i<@row;$i++)
1666 {
1667 if ($row[$i] ne "-")
1668 {
1669 @tmp=split(/,/,$row[$i]);
1670 for ($j=0;$j<@tmp;$j++)
1671 {
1672 $_=$tmp[$j];
1673 s/^S[0-9]+G//;
1674 $tmp[$j]=$_;
1675 }
1676 $row[$i]=join(",",@tmp);
1677 }
1678 }
1679 print R $gid."\t".join("\t",@row)."\n";
1680 $gid++;
1681 }
1682 close(R);
1683 }
1684 sub RetrieveClusterFromFile()
1685 {
1686 (my $file,my $clusters)=@_;
1687 my @content;
1688 my @row;
1689 my $spid;
1690 my $line=0;
1691 my $i=0;
1692 my $j;
1693 my @tmp;
1694 open(F,$file) or die "Could open $file\n";
1695 @content=<F>;
1696 close(F);
1697 splice(@content,0,1);
1698 chomp(@content);
1699 foreach (@content)
1700 {
1701 @row=split(/\t/,$_);
1702 $$clusters[$line]=$row[0];
1703 splice(@row,0,1);
1704 for ($i=0;$i<@row;$i++)
1705 {
1706 if ($row[$i] ne "-")
1707 {
1708 @tmp=split(/,/,$row[$i]);
1709 for ($j=0;$j<@tmp;$j++)
1710 {
1711 $tmp[$j]="S${i}G".$tmp[$j];
1712 }
1713 $row[$i]=join(",",@tmp);
1714 }
1715 }
1716 $$clusters[$line]=$$clusters[$line]."\t".join("\t",@row)."\n";
1717 $line++;
1718 }
1719 }
1720
1721
1722
1723
1724 sub GeneDistribution()
1725 {
1726 (my $clusters,my $hash)=@_;
1727 my @row;
1728 my $spid;
1729 my $orth;
1730 my $key;
1731 foreach (@$clusters)
1732 {
1733 @row=split(/\t/,$_);
1734 splice(@row,0,1);
1735 $orth=&CountSpeicesInCluster(join("\t",@row));
1736 @row=split(/\t|\,/,join("\t",@row));
1737 foreach $key (@row)
1738 {
1739 if ($key ne "-")
1740 {
1741 $spid=substr($key,1,(index($key,'G')-1)); ###extract strains id
1742 if (exists($$hash{$orth."|".$spid}))
1743 {
1744 $$hash{$orth."|".$spid}++;
1745 }else
1746 {
1747 $$hash{$orth."|".$spid}=1;
1748 }
1749 }
1750 }
1751 }
1752 }
1753
1754 sub CountSpeicesInCluster()
1755 {
1756 (my $str)=@_;
1757 chomp($str);
1758 my @list=split(/\t/,$str);
1759 my $key;
1760 my $count=0;
1761
1762 foreach $key (@list)
1763 {
1764 if ($key ne "-")
1765 {
1766 $count++;
1767 }
1768 }
1769 return $count;
1770 }
1771
1772 sub CountGeneInCluster()
1773 {
1774 (my $str)=@_;
1775 chomp();
1776 my @list=split(/\t|\,/,$str);
1777 my $key;
1778 my $count=0;
1779 foreach $key (@list)
1780 {
1781 if ($key ne "-")
1782 {
1783 $count++;
1784 }
1785 }
1786 return $count;
1787 }
1788
1789
1790
1791
1792 sub GF()
1793 {
1794 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
1795 system("cat ".join(".pep ",@species).".pep > All.pep");
1796 system("grep '>' All.pep > genelist");
1797 system("$formatdb -p T -i All.pep");
1798 system("$blastall -p blastp -i All.pep -d All.pep -M BLOSUM45 -m9 -e $evalue -o All.blastp -a $thread");
1799 system("perl ./Blast_Filter.pl All.blastp All.pep $coverage $identity $score | $mcl - --abc -I 2.0 -o All.cluster");
1800 &FormatCluster("All.cluster","genelist",$spnum,\@clusters);
1801 #system("rm -rf *.pep* All.blastp All.cluster genelist");
1802 }
1803
1804 sub MP()
1805 {
1806 # (my $species,my $inputDIR,my $thread,my $evalue,my $score,my $coverage,my $identity)=@_;
1807 my $i;
1808 my $j;
1809 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
1810 system("cat ".join(".pep ",@species).".pep > All.pep");
1811 system("grep '>' All.pep > genelist");
1812 system("rm -rf All.pep");
1813 for ($i=0;$i<$spnum;$i++)
1814 {
1815 for ($j=$i+1;$j<$spnum;$j++)
1816 {
1817 system("perl ./inparanoid.pl $blastall $thread $formatdb $score $global $local $species[$i].pep $species[$j].pep");
1818 }
1819 }
1820 system("perl ./multiparanoid.pl -species ".join(".pep+",@species).".pep -unique 1");
1821 ###convert the MP result to table list based on gene
1822 &MP_Result_to_Table("MP.Cluster","All.cluster");
1823 &FormatCluster("All.cluster","genelist",$spnum,\@clusters);
1824 system("rm -rf sqltable.* *.pep* MP.Cluster genelist");
1825 }
1826
1827 sub fasta2phylip()
1828 {
1829 (my $input,my $output)=@_;
1830 use Bio::AlignIO;
1831 my $inputfilename = "10.aln";
1832 my $in= Bio::AlignIO->new(-file => $input ,
1833 -format => 'fasta');
1834 my $out = Bio::AlignIO->new(-file => ">$output" ,
1835 -format => 'phylip');
1836 while ( my $aln = $in->next_aln() )
1837 {
1838 $out->write_aln($aln);
1839 }
1840 }
1841
1842
1843 sub RemoveHeadGap()
1844 {
1845 (my $nal,my $hash)=@_;
1846 my %aln;
1847 my $key;
1848 my $gaplength=0;
1849 my $len1;
1850 my $len2;
1851 &ReadSequenceInToHash("$nal",\%aln);
1852 foreach $key (keys %aln)
1853 {
1854 $len1=length($aln{$key});
1855 $_=$aln{$key};
1856 s/^-+//;
1857 $len2=length($_);
1858 if (($len1-$len2)>$gaplength)
1859 {
1860 $gaplength=$len1-$len2;
1861 }
1862 }
1863 foreach $key (keys %aln)
1864 {
1865 $$hash{$key}=$$hash{$key}.substr($aln{$key},$gaplength,(length($aln{$key})-$gaplength));
1866 }
1867 }
1868
1869 sub PrepareFasta()
1870 {
1871 (my $species,my $inputDIR,my $extention)=@_;
1872 my $sp;
1873 my $file;
1874 my $i;
1875 my %hash;
1876 my $key;
1877 for ($i=0;$i<@$species;$i++)
1878 {
1879 $file=$inputDIR.$$species[$i].$extention;
1880 %hash=();
1881 &ReadSequenceInToHash($file,\%hash);
1882 open(R,">$$species[$i]${extention}") or die "Could write into $file\n";
1883 foreach $key (keys %hash)
1884 {
1885 print R ">S${i}G$key\n";
1886 print R $hash{$key}."\n";
1887 }
1888 close(R);
1889 }
1890 }
1891
1892 sub PrepareTable()
1893 {
1894 (my $species,my $inputDIR,my $extention)=@_;
1895 my @content;
1896 my $i;
1897 my @row;
1898 my $file;
1899 for ($i=0;$i<@$species;$i++)
1900 {
1901 $file=$inputDIR.$$species[$i].$extention;
1902 open(F,$file) or die "Could open $file\n";
1903 @content=<F>;
1904 close(F);
1905 chomp(@content);
1906 open(R,">$$species[$i]${extention}") or die "Could write into $file\n";
1907 foreach (@content)
1908 {
1909 @row=split(/\t/,$_);
1910 $row[0]="S${i}G$row[0]";
1911 if ($extention eq ".location")
1912 {
1913 $row[0]=$row[0]."\t".$row[0];
1914 }
1915 print R join("\t",@row)."\n";
1916 }
1917 close(R);
1918 }
1919 }
1920
1921 sub CheckExtraProgram
1922 {
1923 #(my $section, my $method, my $tmparray)=@_;
1924 my @error;
1925 my $ei=0;
1926
1927 #####cluster gene
1928 if (substr($section,0,1) eq "1")
1929 {
1930 ###MP: blastall formatdb
1931 ###GF: blastall formatdb mcl
1932
1933 if (!(-e $formatdb))
1934 {
1935 $error[$ei++]="formatdb is not found at $formatdb\n";
1936 }
1937
1938 if (!(-X $formatdb))
1939 {
1940 $error[$ei++]="there is not premission to execute $formatdb\n";
1941 }
1942
1943 if (!(-e $blastall))
1944 {
1945 $error[$ei++]="blastall is not found at $blastall\n";
1946 }
1947
1948 if (!(-X $blastall))
1949 {
1950 $error[$ei++]="there is not premission to execute $blastall\n";
1951 }
1952
1953 if ($method eq "GF")
1954 {
1955 if (!(-e $mcl))
1956 {
1957 $error[$ei++]="mcl is not found at $mcl\n";
1958 }
1959 if (!(-X $mcl))
1960 {
1961 $error[$ei++]="there is not premission to execute $mcl\n";
1962 }
1963 }
1964 }
1965
1966 #####CDS variation
1967 if (substr($section,2,1) eq "1")
1968 {
1969 if (!(-e $mafft))
1970 {
1971 $error[$ei++]="mafft is not found at $mafft\n";
1972 }
1973 if (!(-X $mafft))
1974 {
1975 $error[$ei++]="there is not premission to execute $mafft\n";
1976 }
1977 }
1978
1979 #####CDS variation
1980 if (substr($section,3,1) eq "1")
1981 {
1982 if (!(-e $mafft))
1983 {
1984 $error[$ei++]="mafft is not found at $mafft\n";
1985 }
1986 if (!(-X $mafft))
1987 {
1988 $error[$ei++]="there is not premission to execute $mafft\n";
1989 }
1990 }
1991 #####Evolution analysis
1992 if (substr($section,3,1) eq "1")
1993 {
1994 if (-e $seqboot)
1995 {
1996 $error[$ei++]="there is not premission to execute $seqboot\n" if(!(-X $seqboot));
1997 }else
1998 {
1999 $error[$ei++]="seqboot is not found at $seqboot\n";
2000 }
2001 if (-e $dnaml)
2002 {
2003 $error[$ei++]="there is not premission to execute $dnaml\n" if(!(-X $dnaml));
2004 }else
2005 {
2006 $error[$ei++]="dnaml is not found at $dnaml\n";
2007 }
2008 if (-e $dnadist)
2009 {
2010 $error[$ei++]="there is not premission to execute $dnadist\n" if(!(-X $dnadist));
2011 }else
2012 {
2013 $error[$ei++]="dnadist is not found at $dnadist\n";
2014 }
2015 if (-e $neighbor)
2016 {
2017 $error[$ei++]="there is not premission to execute $neighbor\n" if(!(-X $neighbor));
2018 }else
2019 {
2020 $error[$ei++]="neighbor is not found at $neighbor\n";
2021 }
2022 if (-e $consense)
2023 {
2024 $error[$ei++]="there is not premission to execute $consense\n" if(!(-X $consense));
2025 }else
2026 {
2027 $error[$ei++]="consense is not found at $consense\n";
2028 }
2029 if (-e $dnapars)
2030 {
2031 $error[$ei++]="there is not premission to execute $dnapars\n" if(!(-X $dnapars));
2032 }else
2033 {
2034 $error[$ei++]="dnapars is not found at $dnapars\n";
2035 }
2036 }
2037 #@$tmparray=(@$tmparray,@error);
2038 @tmp=(@tmp,@error);
2039 }
2040
2041
2042 sub CheckInputFile()
2043 {
2044 (my $species,my $inputDIR,my $section,my $method,my $tmparray)=@_;
2045 ####cluster
2046 if (substr($section,0,1) eq "1")
2047 {
2048 if ($method eq "MM")
2049 {
2050 @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); ### check pep and nuc
2051 @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".location"));### chk pep nuc location
2052 }else
2053 {
2054 @$tmparray=(@$tmparray,&chk1SEQ($species,$inputDIR));
2055 }
2056 }
2057 ###CDS variation
2058 if (substr($section,2,1) eq "1")
2059 {
2060 @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR));
2061 }
2062 ###function analysis
2063 if (substr($section,4,1) eq "1")
2064 {
2065 @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".function"));
2066 }
2067 }
2068
2069
2070 sub chk1SEQ()
2071 {
2072 (my $species,my $inputDIR)=@_;
2073 my @error;
2074 my $ei=0;
2075 my $sp;
2076 my $pepfile;
2077 my %pep;
2078 foreach $sp (@$species)
2079 {
2080 %pep=();
2081 $pepfile=$inputDIR.$sp.".pep";
2082 &ReadSequenceInToHash($pepfile,\%pep);
2083 if (scalar(keys %pep)<2)
2084 {
2085 $error[$ei++]="format error in $pepfile\n";
2086 }
2087 }
2088 return @error;
2089 }
2090
2091 sub chk2SEQ()
2092 {
2093 (my $species,my $inputDIR)=@_;
2094 my $sp;
2095 my %pep;
2096 my %nuc;
2097 my $pepfile;
2098 my $nucfile;
2099 my $key;
2100 my @error;
2101 my $ei=0;
2102 foreach $sp (@$species)
2103 {
2104 $pepfile=$inputDIR.$sp.".pep";
2105 $nucfile=$inputDIR.$sp.".nuc";
2106 %pep=();
2107 %nuc=();
2108 &ReadSequenceInToHash("$pepfile",\%pep);
2109 &ReadSequenceInToHash("$nucfile",\%nuc);
2110 if (scalar(keys %pep) ne scalar(keys %nuc))
2111 {
2112 $error[$ei++]="Sequences number is not consistent in the following two file:\n\t$pepfile\n\t$nucfile\n";
2113 }else
2114 {
2115 foreach $key (keys %pep)
2116 {
2117 if (exists($nuc{$key}))
2118 {
2119 if (length($nuc{$key}) ne ((length($pep{$key})+1)*3))
2120 {
2121 $error[$ei++]="the length of $key in $nucfile is not consistent with its corresponding protein length\n";
2122 }
2123 }else
2124 {
2125 $error[$ei++]="$key lost in $nucfile\n";
2126 }
2127 }
2128
2129 foreach $key (keys %nuc)
2130 {
2131 if (!exists($pep{$key}))
2132 {
2133 $error[$ei++]="1048 $key lost in $pepfile\n";
2134 }
2135 }
2136 }
2137 }
2138 return @error;
2139 }
2140
2141
2142 sub chktab()
2143 {
2144 (my $species,my $inputDIR,my $extention)=@_;
2145 my %pep;
2146 my @row;
2147 my $key;
2148 my %tab;
2149 my @error;
2150 my $ei=0;
2151 my $sp;
2152 my $tabfile;
2153 my $pepfile;
2154 foreach $sp (@$species)
2155 {
2156 %tab=();
2157 %pep=();
2158 $tabfile=$inputDIR.$sp.$extention;
2159 open(F,"$tabfile");
2160 while (<F>)
2161 {
2162 chomp();
2163 @row=split(/\t/,$_);
2164 if (scalar(@row)<3)
2165 {
2166 $error[$ei++]="format error in $tabfile\n";
2167 }else
2168 {
2169 $tab{$row[0]}=$row[1];
2170 }
2171 }
2172 close(F);
2173 $pepfile=$inputDIR.$sp.".pep";
2174 &ReadSequenceInToHash($pepfile,\%pep);
2175 foreach $key (keys %pep)
2176 {
2177 if (!exists($tab{$key}))
2178 {
2179 $error[$ei++]="sequence $key lost infomation in $tabfile\n";
2180 }
2181 }
2182 }
2183 return @error;
2184 }
2185
2186
2187
2188 sub ReadSequenceInToHash()
2189 {
2190 use Bio::SeqIO;
2191 (my $file,my $hash)=@_;
2192 my $seq;
2193 my $in=Bio::SeqIO->new(-file=>"$file",-format=>"fasta");
2194 while ($seq=$in->next_seq())
2195 {
2196 #$$hash{$id."|".$seq->id}=$seq->seq();
2197 $$hash{$seq->id}=$seq->seq();
2198 }
2199 }
2200
2201 sub ReadAlignmentToHash()
2202 {
2203 (my $file,my $hash)=@_;
2204 my $name="";
2205 my $seq="";
2206 my @content;
2207 my $line;
2208 open(F,"$file");
2209 @content=<F>;
2210 close(F);
2211 chomp(@content);
2212 for ($line=0;$line<@content;$line++)
2213 {
2214 if ($content[$line]=~/^>/)
2215 {
2216 if ($line>0)
2217 {
2218 $$hash{$name}=$seq;
2219 $name="";
2220 }
2221
2222 $_=$content[$line];
2223 s/^>//;
2224 $name=$_;
2225 $seq="";
2226 }else
2227 {
2228 if ($name ne "")
2229 {
2230 $seq=$seq.$content[$line];
2231 }
2232 }
2233 }
2234 $$hash{$name}=$seq;
2235 }
2236
2237
2238
2239 sub Combination()
2240 {
2241 (my $m,my $n,my $comRef)=@_;
2242 my $str="";
2243 my %hash;
2244 my $fpos;
2245 my $num0;
2246 my $rest;
2247 my $tmp;
2248 my $i;
2249 my $j;
2250 my $key;
2251 #my $m=scalar(@$array);
2252 my @combination;
2253
2254 for ($i=1;$i<=$n;$i++)
2255 {
2256 $str="1".$str;
2257 }
2258
2259 for ($i=1;$i<=($m-$n);$i++)
2260 {
2261 $str=$str."0";
2262 }
2263
2264 $hash{$str}=1;
2265 while ($str=~/10/)
2266 {
2267 $fpos=index($str,"10");
2268 $_=$str;
2269 s/10/01/;
2270 $str=$_;
2271 $tmp=substr($str,0,$fpos);
2272 $_=$tmp;
2273 s/0//g;
2274 $rest=$_;
2275 $num0=$fpos-length($_);
2276 for ($i=1;$i<=$num0;$i++)
2277 {
2278 $rest="$rest"."0";
2279 }
2280 $str="$rest".substr($str,$fpos,$m-$fpos);
2281 $hash{$str}=1;
2282 }
2283 $j=0;
2284 foreach $key (keys %hash)
2285 {
2286 $combination[$j]="";
2287 for ($i=0;$i<$m;$i++)
2288 {
2289 if (substr($key,$i,1) eq "1")
2290 {
2291 if ($combination[$j] ne "")
2292 {
2293 #$combination[$j]=$combination[$j]."\t".$$array[$i];
2294 $combination[$j]=$combination[$j]."\t".$i;
2295 }else
2296 {
2297 #$combination[$j]=$$array[$i]; ### For return species ID
2298 $combination[$j]=$i;
2299 }
2300 }
2301 }
2302 $j++;
2303 }
2304 @$comRef=@combination; ### update the data through the physic address
2305 }
2306
2307 sub ChkCombinationValue()
2308 {
2309 (my $m,my $n)=@_;
2310 my %hash;
2311 my %vhash;
2312 my $value=0;
2313 my $key;
2314 my @row;
2315 my @sdA;
2316 my @sdB;
2317
2318 ### initialization
2319 $hash{$m."-".$n}=1;
2320
2321 ### split combination
2322 while (scalar(keys %hash)>0 and $value<=$sampleSize)
2323 {
2324 foreach $key (keys %hash)
2325 {
2326 if ($value > $sampleSize) ### threshold
2327 {
2328 last;
2329 }
2330 if (!exists($hash{$key}))
2331 {
2332 next;
2333 }
2334 @row=split(/-/,$key);
2335 #print $row[0]."|".$row[1]."\n";
2336 if ($row[0] eq $row[1])
2337 {
2338 $value=$value+$hash{$key};
2339 }else
2340 {
2341 ##split
2342 $sdA[0]=$row[0]-1;
2343 $sdA[1]=$row[1];
2344 $sdB[0]=$row[0]-1;
2345 $sdB[1]=$row[1]-1;
2346 ##storing A
2347 if (($sdA[0] eq $sdA[1]) or $sdA[1] ==0)
2348 {
2349 $value=$value+$hash{$key};
2350 }else
2351 {
2352 if (exists($hash{$sdA[0]."-".$sdA[1]}))
2353 {
2354 $hash{$sdA[0]."-".$sdA[1]}=$hash{$sdA[0]."-".$sdA[1]}+$hash{$key};
2355 }else
2356 {
2357 $hash{$sdA[0]."-".$sdA[1]}=$hash{$key};
2358 }
2359 }
2360
2361 ##storing B
2362 if (($sdB[0] eq $sdB[1]) or $sdB[1]==0)
2363 {
2364 $value=$value+$hash{$key};
2365 }else
2366 {
2367 if (exists($hash{$sdB[0]."-".$sdB[1]}))
2368 {
2369 $hash{$sdB[0]."-".$sdB[1]}=$hash{$sdB[0]."-".$sdB[1]}+$hash{$key};
2370 }else
2371 {
2372 $hash{$sdB[0]."-".$sdB[1]}=$hash{$key};
2373 }
2374 }
2375 }
2376 #delete original combination
2377 delete($hash{$key});
2378 }
2379 }
2380
2381 if ($value>$sampleSize)
2382 {
2383 return 0;
2384 }else
2385 {
2386 return $value;
2387 }
2388 }
2389
2390
2391 sub SampleCombination()
2392 {
2393 (my $m,my $n,my $comRef)=@_;
2394 my %hash;
2395 my $sampleTimes=0;
2396 my @randNum;
2397 my @sortID;
2398 my $i;
2399 my $j;
2400 my $tmp;
2401 while ( scalar(keys %hash)<$sampleSize and $sampleTimes<($sampleSize*2))
2402 {
2403 for ($i=0;$i<$m;$i++) # generate random data
2404 {
2405 $randNum[$i]=int(100000 * rand(100));
2406 $sortID[$i]=$i;
2407 }
2408
2409 for ($i=0;$i<$m;$i++) # sorting random data
2410 {
2411 for ($j=0;$j<$m;$j++)
2412 {
2413 if ($randNum[$sortID[$i]]<$randNum[$sortID[$j]])
2414 {
2415 $tmp=$sortID[$i];
2416 $sortID[$i]=$sortID[$j];
2417 $sortID[$j]=$tmp;
2418 }
2419 }
2420 }
2421
2422 #storing data
2423 $tmp=join("\t",sort {$a<=>$b} (splice(@sortID,0,$n)));
2424 $hash{$tmp}=1;
2425 $sampleTimes++;
2426 }
2427 @$comRef=keys %hash;
2428 }
2429
2430
2431 sub PanGenomeNumber()
2432 {
2433 (my $spID)=@_;
2434 my $pan=0;
2435 my $core=0;
2436 my $count; #### counter;
2437 my @row;
2438
2439 foreach (@clusters)
2440 {
2441 $count=0;
2442 @row=split(/\t/,$_);
2443
2444 foreach (@$spID)
2445 {
2446 $count=$count+$row[$_];
2447 }
2448
2449 if ($count>0)
2450 {
2451 $pan++;
2452 if ($count == scalar(@$spID))
2453 {
2454 $core++;
2455 }
2456 }
2457 }
2458 return $pan."\t".$core;
2459 }
2460
2461 sub fit_model_A()
2462 {
2463 ### model y = A * x**B + C
2464 (my $xdata,my $ydata)=@_;
2465 my $i;
2466 my $b;
2467 my $max_B=0;
2468 my $max_R=0;
2469 my $max_A=0;
2470 my $max_A_interval;
2471 my $max_C=0;
2472 my $max_C_interval;
2473 my $R=1e-100;
2474 my $start;
2475 my $end;
2476 my $step;
2477 my @xValues;
2478 my @yValues;
2479
2480 $start=1;
2481 $step=0.001;
2482 $b=$start;
2483 $max_R=0;
2484 $R=1e-100;
2485
2486 use Statistics::LineFit;
2487 use Statistics::Distributions;
2488
2489 while ($max_R<=$R)
2490 {
2491 if (($b < 0.02) and ($b >-0.02))
2492 {
2493 $b=-0.02;
2494 }
2495
2496 for ($i=0;$i<@$xdata;$i++)
2497 {
2498 $xValues[$i]=$$xdata[$i]**$b;
2499 }
2500 @yValues=@$ydata;
2501 my $lineFit = Statistics::LineFit->new();
2502 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data";
2503 (my $intercept, my $slope) = $lineFit->coefficients();
2504 my $rSquared = $lineFit->rSquared();
2505 my $meanSquaredError = $lineFit->meanSqError();
2506 my $durbinWatson = $lineFit->durbinWatson();
2507 my $sigma = $lineFit->sigma();
2508 (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics();
2509 (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates();
2510
2511 $max_R=$R;
2512 $R=$rSquared;
2513 if ($max_R<=$R)
2514 {
2515 $max_R=$R;
2516 ($max_C,$max_A)=$lineFit->coefficients();
2517 $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope);
2518 $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept);
2519 }
2520 $b=$b-$step;
2521 }
2522 $max_B=$b;
2523 return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval);
2524
2525 }
2526
2527 sub fit_model_B()
2528 {
2529 ### model y = A * exp(x*B) + C
2530 (my $xdata,my $ydata)=@_;
2531 my $i;
2532 my $b;
2533 my $max_B=0;
2534 my $max_R=0;
2535 my $max_A=0;
2536 my $max_A_interval;
2537 my $max_C=0;
2538 my $max_C_interval;
2539 my $R=1e-100;
2540 my $start;
2541 my $end;
2542 my $step;
2543 my @xValues;
2544 my @yValues;
2545
2546 $start=0;
2547 $step=0.001;
2548 $b=$start;
2549 $max_R=0;
2550 $R=1e-100;
2551
2552 use Statistics::LineFit;
2553 use Statistics::Distributions;
2554
2555 while ($max_R<=$R)
2556 {
2557 if (($b < 0.02) and ($b >-0.02))
2558 {
2559 $b=-0.02;
2560 }
2561
2562 for ($i=0;$i<@$xdata;$i++)
2563 {
2564 $xValues[$i]=exp($$xdata[$i]*$b);
2565 }
2566 @yValues=@$ydata;
2567 my $lineFit = Statistics::LineFit->new();
2568 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data";
2569 (my $intercept, my $slope) = $lineFit->coefficients();
2570 my $rSquared = $lineFit->rSquared();
2571 my $meanSquaredError = $lineFit->meanSqError();
2572 my $durbinWatson = $lineFit->durbinWatson();
2573 my $sigma = $lineFit->sigma();
2574 (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics();
2575 (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates();
2576
2577 $max_R=$R;
2578 $R=$rSquared;
2579 if ($max_R<=$R)
2580 {
2581 $max_R=$R;
2582 ($max_C,$max_A)=$lineFit->coefficients();
2583 $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope);
2584 $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept);
2585 }
2586 $b=$b-$step;
2587 }
2588 $max_B=$b;
2589 return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval);
2590 }
2591
2592
2593 sub ReadData2Array()
2594 {
2595 (my $file, my $array1,my $col1,my $array2,my $col2)=@_;
2596 my $i=0;
2597 open(F,$file);
2598 $_=<F>;
2599 while (<F>)
2600 {
2601 chomp();
2602 @_=split(/\t/,$_);
2603 $$array1[$i]=$_[$col1];
2604 $$array2[$i]=$_[$col2];
2605 $i++;
2606 }
2607 close(F);
2608 }
2609
2610 sub SumData()
2611 {
2612 (my $xdata,my $ydata,my $SumMethod)=@_;
2613 my %hash;
2614 my $i;
2615 my $key;
2616 my $max=0;
2617 for ($i=0;$i<@$xdata;$i++)
2618 {
2619 if (exists($hash{$$xdata[$i]}))
2620 {
2621 $hash{$$xdata[$i]}=$hash{$$xdata[$i]}." ".$$ydata[$i];
2622 }else
2623 {
2624 $hash{$$xdata[$i]}=$$ydata[$i];
2625 if ($$xdata[$i]>$max)
2626 {
2627 $max=$$xdata[$i];
2628 }
2629 }
2630 }
2631 @$xdata=qw();
2632 @$ydata=qw();
2633 $i=0;
2634 foreach $i (1..$max)
2635 {
2636 $$xdata[$i-1]=$i;
2637 if ($SumMethod eq "median")
2638 {
2639 $$ydata[$i-1]=&median($hash{$i});
2640 }else
2641 {
2642 $$ydata[$i-1]=&mean($hash{$i});
2643 }
2644 }
2645 #print join(",",@$xdata)."\n";
2646 #print join(",",@$ydata)."\n";
2647 }
2648
2649 sub median()
2650 {
2651 (my $data)=@_;
2652 my @data=split(/ /,$data);
2653 my $arraylen=scalar(@data);
2654 @data=sort{$a<=>$b} @data;
2655 if (int($arraylen/2)*2 == $arraylen)
2656 {
2657 return ($data[$arraylen/2]+$data[$arraylen/2-1])/2;
2658 }else
2659 {
2660 return $data[int($arraylen/2)];
2661 }
2662 }
2663
2664 sub mean()
2665 {
2666 (my $data)=@_;
2667 my @data=split(/ /,$data);
2668 my $sum=0;
2669 foreach (@data)
2670 {
2671 $sum=$sum+$_;
2672 }
2673 return int(($sum/scalar(@data))*1000)/1000;
2674 }
2675
2676 sub ReplaceName()
2677 {
2678 (my $sp,my $file)=@_;
2679 my @content;
2680 my $line;
2681 my $i;
2682 my $target;
2683 open(F,$file);
2684 @content=<F>;
2685 close(F);
2686 for ($line=0;$line<@content;$line++)
2687 {
2688 for ($i=0;$i<@$sp;$i++)
2689 {
2690 $_=$content[$line];
2691 $target="sp".$i."sp";
2692 s/$target/$$sp[$i]/;
2693 $content[$line]=$_;
2694 }
2695 }
2696 open(R,">$file");
2697 print R @content;
2698 close(R);
2699 }
2700
2701 sub MP_Result_to_Table()
2702 {
2703 (my $MPresult, my $outputfile)=@_;
2704 my %hash;
2705 my $maxid=0;
2706 my $i;
2707 my @row;
2708
2709 open(F,"$MPresult");
2710 $_=<F>;
2711 while (<F>)
2712 {
2713 @row=split(/\t/,$_);
2714 if (exists($hash{$row[0]}))
2715 {
2716 $hash{$row[0]}=$hash{$row[0]}."\t".$row[2];
2717 }else
2718 {
2719 $hash{$row[0]}=$row[2];
2720 if ($row[0]>$maxid)
2721 {
2722 $maxid=$row[0];
2723 }
2724 }
2725 }
2726 close(F);
2727
2728 open(R,">$outputfile");
2729 foreach $i (1..$maxid)
2730 {
2731 print R $hash{$i}."\n";
2732 }
2733 close(R);
2734 }
2735
2736
2737
2738 sub FormatCluster()
2739 {
2740 (my $infile,my $genelist,my $spnum,my $cluster)=@_;
2741 my %hash;
2742 my %gene;
2743 my $key;
2744 my @row;
2745 my $sp;
2746 my $line;
2747 my $i=0;
2748 my $j=0;
2749 my @content;
2750
2751 ### record gene in clusters
2752 open(F,"$infile");
2753 @content=<F>;
2754 close(F);
2755 chomp(@content);
2756 for ($line=0;$line<@content;$line++)
2757 {
2758 @row=split(/\t/,$content[$line]);
2759 foreach $key (@row)
2760 {
2761 $gene{$key}=1;
2762 }
2763 }
2764 ###retrieves gene which is not in clutsers
2765
2766 open(F,"$genelist");
2767 while ($key=<F>)
2768 {
2769 if ($key=~/^>/)
2770 {
2771 chomp($key);
2772 $_=$key;
2773 s/^>//;
2774 $key=$_;
2775 if (!exists($gene{$key}))
2776 {
2777 $content[$line]=$key;
2778 $line++;
2779 }
2780 }
2781 }
2782 close(F);
2783
2784 #### initialization @cluster
2785 @$cluster=qw();
2786 $j=0;
2787
2788 foreach $line (@content)
2789 {
2790 if ($line ne "")
2791 {
2792 %hash=();
2793 @row=split(/\t/,$line);
2794 foreach $key (@row)
2795 {
2796 $sp=substr($key,0,index($key,"G"));
2797 $gene{$key}=1;
2798 if (exists($hash{$sp}))
2799 {
2800 $hash{$sp}=$hash{$sp}.",".$key;
2801 }else
2802 {
2803 $hash{$sp}=$key;
2804 }
2805 }
2806
2807 $i=0;
2808 @row=qw();
2809
2810 foreach $i (0..($spnum-1))
2811 {
2812 if (exists($hash{"S$i"}))
2813 {
2814 $row[$i]=$hash{"S$i"};
2815 }else
2816 {
2817 $row[$i]="-";
2818 }
2819 }
2820 $$cluster[$j++]=join("\t",@row);
2821 }
2822 }
2823 }
2824
2825