0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5
|
|
6 ### programs from BLAST
|
|
7 my $formatdb="formatdb";
|
|
8 my $blastall="blastall";
|
|
9
|
|
10 ### programs from mcl
|
|
11 my $mcl="mcl";
|
|
12
|
|
13 ### programs from mafft
|
|
14 my $mafft="mafft";
|
|
15
|
|
16 ### programs from PHYLIP
|
|
17 my $seqboot="seqboot";
|
|
18 my $neighbor="neighbor";
|
|
19 my $consense="consense";
|
|
20 my $dnaml="dnaml";
|
|
21 my $dnadist="dnadist";
|
|
22 my $dnapars="dnapars";
|
|
23
|
|
24 my $count_tree=0;
|
|
25
|
|
26 my $sampleSize=8000; # when calculate the pan-genome size, we will sample $sampleSize combinations
|
|
27 # if the total combination number is larger than $sampleSize for specific genomes
|
|
28 # Surely, the number of $sampleSize is, the larger, the better.
|
|
29 # However, the larger the $sampleSize is, the more time would be consumed.
|
|
30 # we suggest the range: 5000 ~ 20,000
|
|
31
|
|
32 #####################################################################
|
|
33 # DOn't modify the following code, unless you know their functions
|
|
34 #####################################################################
|
|
35
|
|
36 my %opt=qw();
|
|
37 GetOptions(\%opt,"strains:s","input:s","output:s","cluster!","pangenome!","variation!","evolution!","function!","method:s","thread:i","score:f","evalue:f","coverage:f","local:f","global:f","identity:f","bootstrap:i","help|h!");
|
|
38
|
|
39 my @usage=qq(
|
|
40 ====== Pan-Genome Analysis Pipeline (PGAP) ======
|
|
41 Version 1.2.1
|
|
42
|
|
43 Usage: perl PGAP.pl [Options]
|
|
44
|
|
45 Options:
|
|
46 --strains String Input strains nicknames, and join them with '+', for example: A+B+C
|
|
47 --input String Input data directory
|
|
48 --output String Result output directory
|
|
49
|
|
50 --cluster Run homologous gene clustering
|
|
51 --pangenome Run pan-genome analysis
|
|
52 --variation Run homologous clusters variation analysis
|
|
53 --evolution Run evolution analysis
|
|
54 --function Run Function analysis
|
|
55
|
|
56 --method String GF for GeneFamily method, and MP for MultiParanoid method
|
|
57 for GF: fast, but not very accurate
|
|
58 evalue, score, indentity, coverage are employed
|
|
59 for MP: slow, but more accurate
|
|
60 score, coverage, local, global are employed
|
|
61 --thread Int Number of processors to use in blastall. [default:1]
|
|
62 --score Int Minimum score in blastall. [default:40]
|
|
63 --evalue Decimal Maximal E-value in blastall. [default:1e-10]
|
|
64 --coverage Decimal Minimum alignment coverage for two homologous proteins. [default:0.5]
|
|
65 --local Decimal Minimum local alignment overlap in MP method. [default:0.25]
|
|
66 --global Decimal Minimum global alignment overlap in MP method. [default:0.5]
|
|
67 --identity Decimal Minimum alignment indentity for two homologous proteins. [default:0.5]
|
|
68 --bootstrap Int Bootstrap times for phylogenetics tree. [default:1]
|
|
69
|
|
70 --h or help Display this message
|
|
71 );
|
|
72
|
|
73
|
|
74 ############# specified variable #############
|
|
75 my $inputDIR;
|
|
76 my $outputDIR;
|
|
77 my $run_cluster;
|
|
78 my $run_pangenome;
|
|
79 my $run_variation;
|
|
80 my $run_evolution;
|
|
81 my $run_function;
|
|
82 my $method="";
|
|
83 my $thread;
|
|
84 my $score;
|
|
85 my $identity;
|
|
86 my $evalue;
|
|
87 my $coverage;
|
|
88 my $global;
|
|
89 my $local;
|
|
90 my $bootstrap;
|
|
91
|
|
92
|
|
93 my %pep;
|
|
94 my %nuc;
|
|
95 my $spnum;
|
|
96 my @clusters;
|
|
97 my $Cluster;
|
|
98 my @SpecieCombination;
|
|
99 my @spID;
|
|
100 my %genenum;
|
|
101 my %aaAln;
|
|
102 my %ntAln;
|
|
103 my %cog;
|
|
104 my %description;
|
|
105 #my %aa4tree; ### AA sequence for Phylogenetic Tree
|
|
106 my %nt4tree; ### nucleotide sequence for Phylogenetic Tree
|
|
107 my @SNPPosition; ### SNP position
|
|
108 my $dieMessage="You did not run PGAP.pl in the program directory\n";
|
|
109 my $section;
|
|
110
|
|
111 ######### common temporary variable #############
|
|
112 my $i;
|
|
113 my $j;
|
|
114 my $line;
|
|
115 my %tmpHash;
|
|
116 my @tmp;
|
|
117 my $tmp;
|
|
118 my $key;
|
|
119 my @row;
|
|
120 my $inparacount;
|
|
121 my $ClusterID;
|
|
122 my $orth;
|
|
123 my @content;
|
|
124 my $clusterName;
|
|
125 my @xdata;
|
|
126 my @ydata;
|
|
127 my @fit;
|
|
128
|
|
129 my $fit_A;
|
|
130 my $fit_A_interval;
|
|
131 my $fit_B;
|
|
132 my $fit_C;
|
|
133 my $fit_C_interval;
|
|
134 my $fit_Rsquare;
|
|
135
|
|
136
|
|
137
|
|
138 #### check option
|
|
139
|
|
140 my $opt_error=0;
|
|
141
|
|
142 if ((scalar(keys %opt) ==0) or (exists($opt{"help"})))
|
|
143 {
|
|
144 print join("\n",@usage)."\n";
|
|
145 exit;
|
|
146 }
|
|
147
|
|
148
|
|
149 ###################### public info
|
|
150 ### strains name
|
|
151 my @species;
|
|
152 if (exists($opt{"strains"}))
|
|
153 {
|
|
154 @species=split(/\+/,$opt{"strains"});
|
|
155 $spnum=scalar(@species);
|
|
156 }else
|
|
157 {
|
|
158 print "Please assign strains nick name!\n";
|
|
159 exit;
|
|
160 }
|
|
161
|
|
162 ### input data directory
|
|
163
|
|
164 if (exists($opt{"input"}))
|
|
165 {
|
|
166 $inputDIR=$opt{"input"};
|
|
167 if ($inputDIR!~/\/$/)
|
|
168 {
|
|
169 $inputDIR=$inputDIR."/";
|
|
170 }
|
|
171 }else
|
|
172 {
|
|
173 print "Please assign input data directory!\n\n";
|
|
174 exit;
|
|
175 }
|
|
176 ### output data directory
|
|
177
|
|
178 if (exists($opt{"output"}))
|
|
179 {
|
|
180 $outputDIR=$opt{"output"};
|
|
181 if ($outputDIR!~/\/$/)
|
|
182 {
|
|
183 $outputDIR=$outputDIR."/";
|
|
184 }
|
|
185 }else
|
|
186 {
|
|
187 print "Please assign result output directory!\n\n";
|
|
188 exit;
|
|
189 }
|
|
190
|
|
191 ###################### section info
|
|
192
|
|
193 if (exists($opt{"cluster"}))
|
|
194 {
|
|
195 $run_cluster=1;
|
|
196 }else
|
|
197 {
|
|
198 $run_cluster=0;
|
|
199 }
|
|
200
|
|
201 if (exists($opt{"pangenome"}))
|
|
202 {
|
|
203 $run_pangenome=1;
|
|
204 }else
|
|
205 {
|
|
206 $run_pangenome=0;
|
|
207 }
|
|
208
|
|
209 if (exists($opt{"variation"}))
|
|
210 {
|
|
211 $run_variation=1;
|
|
212 }else
|
|
213 {
|
|
214 $run_variation=0;
|
|
215 }
|
|
216
|
|
217 if (exists($opt{"evolution"}))
|
|
218 {
|
|
219 $run_evolution=1;
|
|
220 }else
|
|
221 {
|
|
222 $run_evolution=0;
|
|
223 }
|
|
224
|
|
225 if (exists($opt{"function"}))
|
|
226 {
|
|
227 $run_function=1;
|
|
228 }else
|
|
229 {
|
|
230 $run_function=0;
|
|
231 }
|
|
232
|
|
233 if ($run_cluster)
|
|
234 {
|
|
235 ### method
|
|
236 if (exists($opt{"method"}))
|
|
237 {
|
|
238 $method=uc($opt{"method"});
|
|
239 if ($method!~/^GF$/ and $method!~/^MP$/)
|
|
240 {
|
|
241 print "Unknown method: ".$opt{"method"}."\n";
|
|
242 exit;
|
|
243 }
|
|
244 }else
|
|
245 {
|
|
246 print "Please assign the cluster method!\n\n";
|
|
247 exit;
|
|
248 }
|
|
249
|
|
250 ##thread
|
|
251 if (exists($opt{"thread"}))
|
|
252 {
|
|
253 $thread=$opt{"thread"};
|
|
254 if ($thread==0)
|
|
255 {
|
|
256 print "please assign an applicable thread value.\n";
|
|
257 exit;
|
|
258 }
|
|
259 }else
|
|
260 {
|
|
261 $thread=1;
|
|
262 }
|
|
263
|
|
264 ##score
|
|
265 if (exists($opt{"score"}))
|
|
266 {
|
|
267 $score=$opt{"score"};
|
|
268 if ($score<=0)
|
|
269 {
|
|
270 print "please assign an applicable score value.\n";
|
|
271 exit;
|
|
272 }
|
|
273 }else
|
|
274 {
|
|
275 $score=40;
|
|
276 }
|
|
277
|
|
278 if ($method eq "GF")
|
|
279 {
|
|
280 ###identity
|
|
281 if (exists($opt{"identity"}))
|
|
282 {
|
|
283 $identity=$opt{"identity"};
|
|
284 if ($identity>1 or $identity<=0)
|
|
285 {
|
|
286 print "identity should be 0 ~ 1 \n";
|
|
287 exit;
|
|
288 }
|
|
289 }else
|
|
290 {
|
|
291 $identity=0.5;
|
|
292 }
|
|
293
|
|
294 ###evalue
|
|
295 if (exists($opt{"evalue"}))
|
|
296 {
|
|
297 $evalue=$opt{"evalue"};
|
|
298 }else
|
|
299 {
|
|
300 $evalue=1e-10;
|
|
301 }
|
|
302
|
|
303 ###coverage
|
|
304 if (exists($opt{"coverage"}))
|
|
305 {
|
|
306 $coverage=$opt{"coverage"};
|
|
307 if ($coverage>1 or $coverage<=0)
|
|
308 {
|
|
309 print "coverage should be 0 ~ 1 \n";
|
|
310 exit;
|
|
311 }
|
|
312 }else
|
|
313 {
|
|
314 $coverage=0.5;
|
|
315 }
|
|
316 }
|
|
317
|
|
318
|
|
319 if ($method eq "MP")
|
|
320 {
|
|
321 ###global
|
|
322 if (exists($opt{"global"}))
|
|
323 {
|
|
324 $global=$opt{"global"};
|
|
325 if ($global>1 or $global<=0)
|
|
326 {
|
|
327 print "global coverage should be 0 ~ 1 \n";
|
|
328 exit;
|
|
329 }
|
|
330 }else
|
|
331 {
|
|
332 $global=0.5;
|
|
333 }
|
|
334 ###local
|
|
335 if (exists($opt{"local"}))
|
|
336 {
|
|
337 $local=$opt{"local"};
|
|
338 if ($local<=0)
|
|
339 {
|
|
340 print "local coverage should be 0 ~ [global coverage value] \n";
|
|
341 exit;
|
|
342 }
|
|
343 if ($local>$global)
|
|
344 {
|
|
345 print "local coverage should be less than global coverage!\n";
|
|
346 exit;
|
|
347 }
|
|
348 }else
|
|
349 {
|
|
350 $local=0.25;
|
|
351 }
|
|
352 }
|
|
353 }
|
|
354
|
|
355 if ($run_evolution)
|
|
356 {
|
|
357 if (exists($opt{"bootstrap"}))
|
|
358 {
|
|
359 $bootstrap=$opt{"bootstrap"};
|
|
360 if ($bootstrap<=0)
|
|
361 {
|
|
362 print "please assign an applicable bootstrap value.\n";
|
|
363 }
|
|
364 }else
|
|
365 {
|
|
366 $bootstrap=1;
|
|
367 }
|
|
368 }
|
|
369
|
|
370 print "Program begin at ".localtime()."\n";
|
|
371 print "The following are the parameters for current process:\n";
|
|
372 print "Strains: ".join(",",@species)."\n";
|
|
373 print "Input directory: $inputDIR\n";
|
|
374 print "Output directory: $outputDIR\n";
|
|
375 if ($run_cluster)
|
|
376 {
|
|
377 print "Cluster analysis: yes\n";
|
|
378 print " Method: $method\n";
|
|
379 print " Thread: $thread\n";
|
|
380 if ($method eq "GF")
|
|
381 {
|
|
382 print " E-value: $evalue\n";
|
|
383 print " Identity: $identity\n";
|
|
384 print " Coverage: $coverage\n";
|
|
385 print " Score: $score\n";
|
|
386 }
|
|
387 if ($method eq "MP")
|
|
388 {
|
|
389 print " Local: $local\n";
|
|
390 print " Global: $global\n";
|
|
391 print " Score: $score\n";
|
|
392 }
|
|
393 }else
|
|
394 {
|
|
395 print "Cluster analysis: no\n";
|
|
396 }
|
|
397 if ($run_pangenome)
|
|
398 {
|
|
399 print "Pan-genome analysis: yes\n";
|
|
400 }else
|
|
401 {
|
|
402 print "Pan-genome analysis: no\n";
|
|
403 }
|
|
404 if ($run_variation)
|
|
405 {
|
|
406 print "Variation analysis: yes\n";
|
|
407 }else
|
|
408 {
|
|
409 print "Variation analysis: no\n";
|
|
410 }
|
|
411 if ($run_evolution)
|
|
412 {
|
|
413 print "Evolution analysis: yes\n";
|
|
414 print " Bootstrap: $bootstrap\n";
|
|
415 }else
|
|
416 {
|
|
417 print "Evolution analysis: no\n";
|
|
418 }
|
|
419 if ($run_function)
|
|
420 {
|
|
421 print "Function analysis: yes\n";
|
|
422 }else
|
|
423 {
|
|
424 print "Function analysis: no\n";
|
|
425 }
|
|
426
|
|
427 $section=$run_cluster.$run_pangenome.$run_variation.$run_evolution.$run_function;
|
|
428
|
|
429 ###############################################
|
|
430 # section 0) check input file and program
|
|
431 ###############################################
|
|
432 if (!(-e $outputDIR))
|
|
433 {
|
|
434 system("mkdir $outputDIR");
|
|
435 }
|
|
436 system("chmod +rw $outputDIR");
|
|
437
|
|
438 if (!(-w $outputDIR))
|
|
439 {
|
|
440 print "There is no WRITE permission in $outputDIR\n";
|
|
441 exit;
|
|
442 }
|
|
443 @tmp=qw();
|
|
444 &CheckInputFile(\@species,$inputDIR,$section,$method,\@tmp);
|
|
445 &CheckExtraProgram($section,$method,\@tmp);
|
|
446
|
|
447 if (scalar(@tmp)>0)
|
|
448 {
|
|
449 open(R,">".$outputDIR."0.error.message");
|
|
450 print R join("",@tmp)."\n";
|
|
451 close(R);
|
|
452 print "error!\nlog are saved in ${outputDIR}0.error.message\n";
|
|
453 exit;
|
|
454 }
|
|
455
|
|
456
|
|
457 ############################################
|
|
458 # section 1) cluster analysis
|
|
459 ############################################
|
|
460
|
|
461 if ($run_cluster)
|
|
462 {
|
|
463 print "\n\n############################################\n";
|
|
464 print "# section 1) cluster analysis\n";
|
|
465 print "############################################\n\n\n";
|
|
466
|
|
467 #### cluster gene and return result to the array @clusters
|
|
468
|
|
469 if ($method eq "MP")
|
|
470 {
|
|
471 print "Begin cluster gene with MP method ...\n";
|
|
472 &MP();
|
|
473 }else
|
|
474 {
|
|
475 print "Begin cluster gene with GF method ...\n";
|
|
476 &GF();
|
|
477 }
|
|
478
|
|
479 #### output normal cluster format
|
|
480
|
|
481 &FormatClusterOutPut(\@species,"${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
|
|
482
|
|
483 #### Retrieve cluster
|
|
484
|
|
485 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
|
|
486
|
|
487 ##### gene distribution in each strains
|
|
488 %tmpHash=();
|
|
489 &GeneDistribution(\@clusters,\%tmpHash);
|
|
490
|
|
491 open(R,">${outputDIR}1.Gene_Distribution_By_Conservation.txt");
|
|
492 print R "SharedBy_Strains\t".join("\t",@species)."\n";
|
|
493
|
|
494 for ($i=$spnum;$i>0;$i--)
|
|
495 {
|
|
496 print R $i;
|
|
497 for ($j=0;$j<$spnum;$j++)
|
|
498 {
|
|
499 if (exists($tmpHash{$i."|".$j}))
|
|
500 {
|
|
501 print R "\t".$tmpHash{$i."|".$j};
|
|
502 }else
|
|
503 {
|
|
504 print R "\t0";
|
|
505 }
|
|
506 }
|
|
507 print R "\n";
|
|
508 }
|
|
509
|
|
510 close(R);
|
|
511
|
|
512
|
|
513 }else
|
|
514 {
|
|
515 print "Homologous gene clustering is skipped!\n";
|
|
516 }
|
|
517
|
|
518
|
|
519 if ($run_pangenome)
|
|
520 {
|
|
521 print "\n\n############################################\n";
|
|
522 print "# section 2) Pan-genome analysis\n";
|
|
523 print "############################################\n\n\n";
|
|
524
|
|
525 #### Retrieve cluster
|
|
526 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
|
|
527 chomp(@clusters);
|
|
528
|
|
529 #### convert file into 0-1 matrix
|
|
530 for ($line=0;$line<@clusters;$line++)
|
|
531 {
|
|
532 @row=split(/\t/,$clusters[$line]);
|
|
533 splice(@row,0,1);
|
|
534 for ($i=0;$i<@row;$i++)
|
|
535 {
|
|
536 if ($row[$i] eq "-")
|
|
537 {
|
|
538 $row[$i]=0;
|
|
539 }else
|
|
540 {
|
|
541 $row[$i]=1;
|
|
542 }
|
|
543 }
|
|
544 $clusters[$line]=join("\t",@row);
|
|
545 }
|
|
546
|
|
547 #### fetch gene number of each strains
|
|
548 for ($i=0;$i<$spnum;$i++)
|
|
549 {
|
|
550 open(F,"$inputDIR$species[$i].pep");
|
|
551 @tmp=<F>;
|
|
552 close(F);
|
|
553 @tmp=grep(/^>/,@tmp);
|
|
554 $genenum{$species[$i]}=scalar(@tmp);
|
|
555 }
|
|
556
|
|
557 #### pan genome size and core genome size
|
|
558 print "Deducing pan genome size and core genome size for each composition...\n\n";
|
|
559
|
|
560 open(PAN,">${outputDIR}2.PanGenome.Data.txt");
|
|
561 print PAN "ClusterConservation\tTotalGeneNumber\tPanGenome\tCoreGenome\n";
|
|
562
|
|
563 for ($i=1;$i<=scalar(@species);$i++)
|
|
564 {
|
|
565 #@SpecieCombination=&Combination(\@species,$i);
|
|
566 #@SpecieCombination=&Combination($spnum,$i);
|
|
567 if (&ChkCombinationValue($spnum,$i) !=0) ### transfer the array reference to the subroutine
|
|
568 {
|
|
569 &Combination($spnum,$i,\@SpecieCombination); ## if the combination number is less than sampleSize, then fecth all, else sample
|
|
570 }else
|
|
571 {
|
|
572 &SampleCombination($spnum,$i,\@SpecieCombination);
|
|
573 }
|
|
574
|
|
575 foreach $key (@SpecieCombination)
|
|
576 {
|
|
577 ##### count total gene number in current combination
|
|
578 $tmp=0;
|
|
579 @spID=split(/\t/,$key); #### speices id in current combination
|
|
580 foreach (@spID)
|
|
581 {
|
|
582 $tmp=$tmp+$genenum{$species[$_]};
|
|
583 }
|
|
584 ##### scan pangenome and coregenome
|
|
585 @tmp=split(/\t/,&PanGenomeNumber(\@spID));
|
|
586 print PAN "$i\t$tmp\t".join("\t",@tmp)."\n";
|
|
587
|
|
588 }
|
|
589 }
|
|
590
|
|
591 close(PAN);
|
|
592
|
|
593 #### data fit
|
|
594
|
|
595 #### for model A
|
|
596
|
|
597 if ($spnum<3)
|
|
598 {
|
|
599 print "There are $spnum strains. For pan-genome function fitting, at least 3 strains data are required.\n";
|
|
600 }else
|
|
601 {
|
|
602 open(R,">${outputDIR}2.PanGenome.Profile.txt");
|
|
603 ##### genome number & pan-genome size
|
|
604 @xdata=qw();
|
|
605 @ydata=qw();
|
|
606 &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,2);
|
|
607 &SumData(\@xdata,\@ydata,"mean");
|
|
608 ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_A(\@xdata,\@ydata);
|
|
609 print R "The relation bewteen genome number and pan-genome size\n\n";
|
|
610 print R "Function model: y=A*x**B +C \n";
|
|
611 print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n";
|
|
612 print R "Fitting result:\n";
|
|
613 print R "\ty = $fit_A *x**$fit_B + $fit_C\n";
|
|
614 print R "\tR-square = $fit_Rsquare\n";
|
|
615 print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n";
|
|
616 print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n";
|
|
617
|
|
618 ##### total gene number & pan-genome size
|
|
619 #@xdata=qw();
|
|
620 #@ydata=qw();
|
|
621 #&ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,1,\@ydata,2);
|
|
622 #&SumDataByMedian(\@xdata,\@ydata);
|
|
623 #($fit_Rsquare, $fit_A, $fit_B, $fit_C)=&fit_model_A(\@xdata,\@ydata);
|
|
624 #print R "The relation bewteen total gene number and pan-genome size\n\n";
|
|
625 #print R "$fit_Rsquare, $fit_A, $fit_B, $fit_C\n";
|
|
626 #print R "\ty = $fit_A *x**$fit_B + $fit_C R-square = $fit_Rsquare\n";
|
|
627 #print R "\tx: total gene number\n";
|
|
628 #print R "\ty: pan-genome size\n\n\n\n\n";
|
|
629
|
|
630 ##### genome number & core genome
|
|
631 @xdata=qw();
|
|
632 @ydata=qw();
|
|
633 &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,3);
|
|
634 &SumData(\@xdata,\@ydata,"mean");
|
|
635 ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_B(\@xdata,\@ydata);
|
|
636 print R "The relation bewteen genome number and core genome size\n\n";
|
|
637 print R "Function model: y=A*exp(B*x) +C \n";
|
|
638 print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n";
|
|
639 print R "Fitting result:\n";
|
|
640 print R "\ty = $fit_A *exp($fit_B * x) + $fit_C R-square = $fit_Rsquare\n";
|
|
641 print R "\tR-square = $fit_Rsquare\n";
|
|
642 print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n";
|
|
643 print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n";
|
|
644 close(R);
|
|
645 }
|
|
646
|
|
647 }
|
|
648
|
|
649
|
|
650 ############################################
|
|
651 # section 3) CDS variation analysis
|
|
652 ############################################
|
|
653
|
|
654 if ($run_variation)
|
|
655 {
|
|
656 print "\n\n############################################\n";
|
|
657 print "# section 3) CDS variation analysis\n";
|
|
658 print "############################################\n\n\n";
|
|
659
|
|
660 #### Retrieve cluster
|
|
661 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
|
|
662 chomp(@clusters);
|
|
663
|
|
664 ## protein
|
|
665 system("rm -rf *.pep");
|
|
666 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
|
|
667 system("cat *.pep > All.faa && rm -rf *.pep && mv All.faa All.pep");
|
|
668 &ReadSequenceInToHash("All.pep",\%pep);
|
|
669 ## nucleic
|
|
670 system("rm -rf *.nuc");
|
|
671 &PrepareFasta(\@species,$inputDIR,".nuc"); ###prepare nuc file
|
|
672 system("cat *.nuc > All.ffn && rm -rf *.nuc && mv All.ffn All.nuc");
|
|
673 &ReadSequenceInToHash("All.nuc",\%nuc);
|
|
674
|
|
675 ## scanning SNP
|
|
676 %nt4tree=();
|
|
677 for ($i=0;$i<$spnum;$i++)
|
|
678 {
|
|
679 $nt4tree{"S".$i}="";
|
|
680 }
|
|
681
|
|
682 open(VAR,">${outputDIR}3.CDS.variation.txt");
|
|
683 print VAR "ClusterID\tStrains_Number\tGene_Number\tPosition\taaType\tntType\tntProfile\tVariation type\n";
|
|
684
|
|
685 open(VA,">${outputDIR}3.CDS.variation.analysis.txt");
|
|
686 print VA "ClusterID\tInDel Base\tNonsynonymous mutation\tSynonymous mutation\n";
|
|
687
|
|
688 for ($line=0;$line<@clusters;$line++)
|
|
689 {
|
|
690 @row=split(/\t|\,/,$clusters[$line]);
|
|
691 $ClusterID=$row[0];
|
|
692 splice(@row,0,1);
|
|
693 @row=grep(/^S/,@row);
|
|
694 if (scalar(@row) >=2)
|
|
695 {
|
|
696 open(PEP,">$ClusterID.pep");
|
|
697 open(NUC,">$ClusterID.nuc");
|
|
698 foreach $key (@row)
|
|
699 {
|
|
700 print PEP ">$key\n$pep{$key}\n";
|
|
701 print NUC ">$key\n$nuc{$key}\n";
|
|
702 }
|
|
703 close(PEP);
|
|
704 close(NUC);
|
|
705 system("$mafft --quiet $ClusterID.pep > $ClusterID.pal");
|
|
706 #system("perl ./pal2nal.pl $ClusterID.pal $ClusterID.nuc -output fasta > $ClusterID.nal");
|
|
707 $tmp=&pal2nal("$ClusterID.pal","$ClusterID.nuc","$ClusterID.nal");
|
|
708 if ($tmp == 0)
|
|
709 {
|
|
710 system("rm -rf $ClusterID.*");
|
|
711 next;
|
|
712 }
|
|
713
|
|
714 @tmp=&DetectSNP();
|
|
715 if (scalar(@tmp)>0)
|
|
716 {
|
|
717 print VA $ClusterID."\t".&VarAnalysis(\@tmp)."\n";
|
|
718 print VAR join("",@tmp);
|
|
719 ### core orthologs
|
|
720 @row=split(/\t/,$clusters[$line]);
|
|
721 splice(@row,0,1);
|
|
722 if ((&CountGeneInCluster(join("\t",@row)) ==$spnum) and (&CountSpeicesInCluster(join("\t",@row)) == $spnum) )
|
|
723 {
|
|
724 $count_tree++;
|
|
725 %tmpHash=();
|
|
726 foreach (@row)
|
|
727 {
|
|
728 $tmpHash{$_}="";
|
|
729 }
|
|
730 &RemoveHeadGap("$ClusterID.nal",\%tmpHash);
|
|
731 &ExtractSNP4tree(\%tmpHash,\%nt4tree);
|
|
732 }
|
|
733 }
|
|
734 system("rm -rf $ClusterID.*");
|
|
735 }
|
|
736 }
|
|
737 close(VAR);
|
|
738 close(VA);
|
|
739
|
|
740 open(R,">${outputDIR}3.CDS.variation.for.evolution.txt");
|
|
741 foreach $key (keys %nt4tree)
|
|
742 {
|
|
743 $_=$key;
|
|
744 s/s//gi;
|
|
745 print R ">$species[$_]\n$nt4tree{$key}\n";
|
|
746 }
|
|
747 close(R);
|
|
748 print $count_tree."\n\n";
|
|
749
|
|
750 ###
|
|
751 system("rm All.nuc All.pep");
|
|
752 }else
|
|
753 {
|
|
754 print "CDS variation is skipped.\n";
|
|
755 }
|
|
756
|
|
757 ############################################
|
|
758 # section 4) CDS variation analysis
|
|
759 ############################################
|
|
760
|
|
761 if ($run_evolution)
|
|
762 {
|
|
763 #### Retrieve cluster
|
|
764 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
|
|
765 chomp(@clusters);
|
|
766
|
|
767
|
|
768
|
|
769 ##################
|
|
770 ##
|
|
771 ## Distance based
|
|
772 ##
|
|
773 ##################
|
|
774
|
|
775 ### caculate the distance between each two strains by cluster
|
|
776 &ClusterProfil4Specie(\%tmpHash); # caculate Clusters profile for each specie
|
|
777 &DistanceMatrix($spnum,\%tmpHash); # caculate distance matrix acoording to Clusters profile
|
|
778 ###output distance
|
|
779 open(DIST,">${outputDIR}4.Species_Distance_Clusters_Based.txt");
|
|
780 ###header
|
|
781 printf DIST "%5d", $spnum;
|
|
782 print DIST "\n";
|
|
783 foreach $i (0..($spnum-1))
|
|
784 {
|
|
785 $key="sp".$i."sp";
|
|
786 printf DIST "%-10s",$key;
|
|
787 foreach $j (0..($spnum-1))
|
|
788 {
|
|
789 printf DIST " %8f",$tmpHash{$i."-".$j};
|
|
790 }
|
|
791 print DIST "\n";
|
|
792 }
|
|
793 close(DIST);
|
|
794
|
|
795 %tmpHash=();
|
|
796
|
|
797 ### based on pan genome (distance)
|
|
798 print "\nDraw pangenome based phylogenetic tree ...\n\n";
|
|
799
|
|
800 &PanBasedTree("${outputDIR}4.Species_Distance_Clusters_Based.txt","${outputDIR}4.PanBased");
|
|
801
|
|
802 ##################
|
|
803 ##
|
|
804 ## SNP based
|
|
805 ##
|
|
806 ##################
|
|
807 %tmpHash=();
|
|
808 if (!(-e "${outputDIR}3.CDS.variation.for.evolution.txt"))
|
|
809 {
|
|
810 print "Variation in core orthologs cluster is not found from ${outputDIR}3.CDS.variation.for.evolution.txt.\n";
|
|
811 print "Maybe you have skipped CDS variation analysis.\n";
|
|
812 }else
|
|
813 {
|
|
814 &ReadSequenceInToHash("${outputDIR}3.CDS.variation.for.evolution.txt",\%tmpHash);
|
|
815 open(R,">mlst.aln");
|
|
816 for ($i=0;$i<@species;$i++)
|
|
817 {
|
|
818 print R ">sp${i}sp\n".$tmpHash{$species[$i]}."\n";
|
|
819 }
|
|
820 close(R);
|
|
821 &fasta2phylip("mlst.aln","mlst.phylip");
|
|
822 system("rm -rf mlst.aln");
|
|
823
|
|
824 print "\nDraw SNP based phylogenetic tree ...\n\n";
|
|
825 &SNPBasedTree("mlst.phylip","${outputDIR}4.SNPBased");
|
|
826 system("rm -rf mlst.phylip")
|
|
827 }
|
|
828
|
|
829 ########
|
|
830 # replace speices name
|
|
831 ########
|
|
832
|
|
833 opendir(DIR,"${outputDIR}");
|
|
834 @tmp=readdir(DIR);
|
|
835 closedir(DIR);
|
|
836 @tmp=grep(/^4/,@tmp);
|
|
837 foreach $tmp (@tmp)
|
|
838 {
|
|
839 &ReplaceName(\@species,"${outputDIR}$tmp");
|
|
840 }
|
|
841 }else
|
|
842 {
|
|
843 print "Evolution analysis is skipped.\n";
|
|
844 }
|
|
845
|
|
846
|
|
847 ############################################
|
|
848 # section 4) Function analysis
|
|
849 ############################################
|
|
850
|
|
851 if ($run_function)
|
|
852 {
|
|
853 #### Retrieve cluster
|
|
854 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters);
|
|
855 chomp(@clusters);
|
|
856
|
|
857 #### prepare annotation file
|
|
858 &PrepareTable(\@species,$inputDIR,".function"); ###prepare location file
|
|
859 &ReadAnnotation(\@species,\%cog,\%description);
|
|
860
|
|
861
|
|
862 #### assign function
|
|
863 open(R,">${outputDIR}5.Orthologs_Cluster_Function.txt");
|
|
864 print R "ClusterID\tConservation_Level\tCOG\tDescription\n";
|
|
865 for ($i=0;$i<@clusters;$i++)
|
|
866 {
|
|
867 @row=split(/\t/,$clusters[$i]);
|
|
868 $ClusterID=$row[0];
|
|
869 splice(@row,0,1);
|
|
870 print R $ClusterID."\t".&CountSpeicesInCluster(join("\t",@row))."\t".&getCOG(\@row,\%cog)."\t".&getDescription(\@row,\%description)."\n";
|
|
871 }
|
|
872 close(R);
|
|
873
|
|
874 #### COG distribution
|
|
875
|
|
876 ###Whole Clusters COG Distribution
|
|
877 &outputCOGStatistic("${outputDIR}5.Orthologs_Whole_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,1));
|
|
878
|
|
879 ###Core Clusters COG Distribution
|
|
880 &outputCOGStatistic("${outputDIR}5.Orthologs_Core_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,$spnum));
|
|
881
|
|
882 ###Dispensable Clusters COG Distribution
|
|
883 &outputCOGStatistic("${outputDIR}5.Orthologs_Dispensable_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",($spnum-1),2));
|
|
884
|
|
885 ###strains specifc Clusters COG Distribution
|
|
886 &outputCOGStatistic("${outputDIR}5.Orthologs_specifc_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",1,1));
|
|
887
|
|
888 system("rm -rf *.function");
|
|
889
|
|
890 }else
|
|
891 {
|
|
892 print "Function analysis is skipped.\n";
|
|
893 }
|
|
894
|
|
895 sub outputCOGStatistic()
|
|
896 {
|
|
897 (my $file,my $subcogcount)=@_;
|
|
898 my @cogcat=("J A K L B","D Y V T M N Z W U O","C G E F H I P Q","R S -");
|
|
899 my @cogdesc=("INFORMATION STORAGE AND PROCESSING","CELLULAR PROCESSES AND SIGNALING","METABOLISM","POORLY CHARACTERIZED");
|
|
900 my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -);
|
|
901 my @subcogdesc=("[J] Translation, ribosomal structure and biogenesis","[A] RNA processing and modification","[K] Transcription","[L] Replication, recombination and repair","[B] Chromatin structure and dynamics","[D] Cell cycle control, cell division, chromosome partitioning","[Y] Nuclear structure","[V] Defense mechanisms","[T] Signal transduction mechanisms","[M] Cell wall/membrane/envelope biogenesis","[N] Cell motility","[Z] Cytoskeleton","[W] Extracellular structures","[U] Intracellular trafficking, secretion, and vesicular transport","[O] Posttranslational modification, protein turnover, chaperones","[C] Energy production and conversion","[G] Carbohydrate transport and metabolism","[E] Amino acid transport and metabolism","[F] Nucleotide transport and metabolism","[H] Coenzyme transport and metabolism","[I] Lipid transport and metabolism","[P] Inorganic ion transport and metabolism","[Q] Secondary metabolites biosynthesis, transport and catabolism","[R] General function prediction only","[S] Function unknown","[-] Unclassified");
|
|
902 my %subcogdesc;
|
|
903 my $key;
|
|
904 my @cog;
|
|
905 my $i;
|
|
906 my $cognum;
|
|
907
|
|
908 for ($i=0;$i<@subcogcat;$i++)
|
|
909 {
|
|
910 $subcogdesc{$subcogcat[$i]}=$subcogdesc[$i];
|
|
911 }
|
|
912
|
|
913 open(R,">$file");
|
|
914 for ($i=0;$i<@cogcat;$i++)
|
|
915 {
|
|
916 $cognum=0;
|
|
917 foreach $key (split(" ",$cogcat[$i]))
|
|
918 {
|
|
919 $cognum=$cognum+$$subcogcount{$key};
|
|
920 }
|
|
921 print R $cogdesc[$i]." ( ".$cognum." )\n";
|
|
922 foreach $key (split(" ",$cogcat[$i]))
|
|
923 {
|
|
924 printf R "%-6d %s\n",$$subcogcount{$key},$subcogdesc{$key};
|
|
925 }
|
|
926 print R "\n";
|
|
927
|
|
928 }
|
|
929 close(R);
|
|
930
|
|
931 }
|
|
932
|
|
933 sub scanCOG()
|
|
934 {
|
|
935 (my $file,my $max_orth,my $min_orth)=@_;
|
|
936 my @row;
|
|
937 my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -);
|
|
938 my %subcogcount;
|
|
939 my $cog;
|
|
940 my $key;
|
|
941
|
|
942 foreach $key (@subcogcat)
|
|
943 {
|
|
944 $subcogcount{$key}=0;
|
|
945 }
|
|
946
|
|
947 @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S);
|
|
948
|
|
949 open(F,"$file");
|
|
950 $_=<F>;
|
|
951 while (<F>)
|
|
952 {
|
|
953 @row=split(/\t/,$_);
|
|
954 if ($row[1]>=$min_orth and $row[1]<=$max_orth)
|
|
955 {
|
|
956 if ($row[2] eq "-")
|
|
957 {
|
|
958 $subcogcount{"-"}++;
|
|
959 }else
|
|
960 {
|
|
961 $_=uc($row[2]);
|
|
962 s/COG//gi;
|
|
963 $cog=$_;
|
|
964 foreach $key (@subcogcat)
|
|
965 {
|
|
966 if ($cog=~/$key/)
|
|
967 {
|
|
968 $subcogcount{$key}++;
|
|
969 }
|
|
970 }
|
|
971 }
|
|
972 }
|
|
973 }
|
|
974 close(F);
|
|
975
|
|
976 return \%subcogcount;
|
|
977 }
|
|
978
|
|
979
|
|
980
|
|
981
|
|
982 sub getCOG()
|
|
983 {
|
|
984 (my $data,my $coghash)=@_;
|
|
985 my $cog="";
|
|
986 my @cog;
|
|
987 my $key;
|
|
988 my %hash;
|
|
989 my @gene=split(/\t|\,/,join("\t",@$data));
|
|
990 @gene=grep(/^S/,@gene);
|
|
991
|
|
992 foreach $key (@gene)
|
|
993 {
|
|
994 if (($$coghash{$key} ne "-") and ($$coghash{$key} ne ""))
|
|
995 {
|
|
996 $cog=$cog.",".$$coghash{$key};
|
|
997 }
|
|
998 }
|
|
999 @cog=split(/,/,$cog);
|
|
1000 foreach $cog (@cog)
|
|
1001 {
|
|
1002 if ($cog ne "")
|
|
1003 {
|
|
1004 $hash{$cog}=1;
|
|
1005 }
|
|
1006 }
|
|
1007
|
|
1008 $cog=join(",",(keys %hash));
|
|
1009 if ($cog eq "")
|
|
1010 {
|
|
1011 $cog="-";
|
|
1012 }
|
|
1013 return $cog;
|
|
1014 }
|
|
1015
|
|
1016 sub getDescription()
|
|
1017 {
|
|
1018 (my $data,my $deschash)=@_;
|
|
1019 my $desc="";
|
|
1020 my $key;
|
|
1021 my @gene=split(/\t|\,/,join("\t",@$data));
|
|
1022 @gene=grep(/^S/,@gene);
|
|
1023
|
|
1024 foreach $key (@gene)
|
|
1025 {
|
|
1026 if ( ($$deschash{$key} ne "") and ($$deschash{$key} ne "-") and ($$deschash{$key}!~/hypothetical/))
|
|
1027 {
|
|
1028 $desc=$$deschash{$key};
|
|
1029 }
|
|
1030 }
|
|
1031
|
|
1032 if ($desc eq "")
|
|
1033 {
|
|
1034 $desc="hypothetical protein";
|
|
1035 }
|
|
1036
|
|
1037 return $desc;
|
|
1038 }
|
|
1039
|
|
1040
|
|
1041
|
|
1042 sub ReadAnnotation()
|
|
1043 {
|
|
1044 (my $species,my $cog,my $description)=@_;
|
|
1045 my $i;
|
|
1046 my @row;
|
|
1047
|
|
1048 for ($i=0;$i<@$species;$i++)
|
|
1049 {
|
|
1050 open(F,"$$species[$i].function");
|
|
1051 while (<F>)
|
|
1052 {
|
|
1053 chomp($_);
|
|
1054 @row=split(/\t/,$_);
|
|
1055 if (scalar(@row)>=2)
|
|
1056 {
|
|
1057 $$cog{$row[0]}=$row[1];
|
|
1058 }else
|
|
1059 {
|
|
1060 $$cog{$row[0]}="-";
|
|
1061 }
|
|
1062
|
|
1063 if (scalar(@row)>=3)
|
|
1064 {
|
|
1065 $$description{$row[0]}=$row[2];
|
|
1066 }else
|
|
1067 {
|
|
1068 $$description{$row[0]}="hypothetical protein";
|
|
1069 }
|
|
1070 }
|
|
1071 close(F);
|
|
1072 }
|
|
1073 }
|
|
1074
|
|
1075
|
|
1076 sub SNPBasedTree()
|
|
1077 {
|
|
1078 (my $infile,my $outfileprefix)=@_;
|
|
1079 my $tmpin=$infile;
|
|
1080 my $tmpout;
|
|
1081
|
|
1082
|
|
1083 #### boootstrap
|
|
1084 print "\n#### seqboot ...\n\n";
|
|
1085 open(R,">seqboot.cmd");
|
|
1086 #print R "$tmpin\n";
|
|
1087 print R "R\n";
|
|
1088 print R "$bootstrap\n";
|
|
1089 print R "Y\n";
|
|
1090 print R "1\n";
|
|
1091 close(R);
|
|
1092
|
|
1093 system("cp $tmpin infile");
|
|
1094 system("$seqboot < seqboot.cmd");
|
|
1095 system("mv outfile 100dnaseq");
|
|
1096 system("rm -rf infile");
|
|
1097 system("rm seqboot.cmd"); # 100dnasseq
|
|
1098
|
|
1099 #### dnaml
|
|
1100 print "\n#### dnaml ...\n\n";
|
|
1101 open(R,">dnaml.cmd");
|
|
1102 #print R "100dnaseq\n";
|
|
1103 print R "T\n";
|
|
1104 print R "25\n";
|
|
1105 if ($bootstrap>1)
|
|
1106 {
|
|
1107 print R "M\n";
|
|
1108 print R "D\n";
|
|
1109 #print R "100\n";
|
|
1110 print R "$bootstrap\n";
|
|
1111 print R "1\n"; # Random number seed (must be odd)?
|
|
1112 print R "5\n"; # Number of times to jumble?
|
|
1113 }
|
|
1114 print R "Y\n";
|
|
1115 close(R);
|
|
1116
|
|
1117 system("cp 100dnaseq infile");
|
|
1118 system("$dnaml < dnaml.cmd");
|
|
1119 system("rm -rf outfile");
|
|
1120 system("rm -rf infile");
|
|
1121 system("mv outtree 100dnaseqtree"); # 100dnaseq, 100dnaseqtree
|
|
1122
|
|
1123 #### consense
|
|
1124 print "\n#### dnaml consense ...\n\n";
|
|
1125
|
|
1126 open(R,">consense.cmd");
|
|
1127 #print R "100dnaseqtree\n";
|
|
1128 print R "Y\n";
|
|
1129 close(R);
|
|
1130
|
|
1131 system("cp 100dnaseqtree intree");
|
|
1132 system("$consense < consense.cmd");
|
|
1133 system("mv outfile ${outfileprefix}.ML.outfile");
|
|
1134 system("mv outtree ${outfileprefix}.ML.tree");
|
|
1135 system("rm -rf infile");
|
|
1136 system("rm -rf 100dnaseqtree"); # 100dnaseq
|
|
1137
|
|
1138 #### dnadist
|
|
1139 print "\n#### dnadist ...\n\n";
|
|
1140 open(R,">dnadist.cmd");
|
|
1141 #print R "100dnaseq\n";
|
|
1142 print R "T\n";
|
|
1143 print R "25\n";
|
|
1144 if ($bootstrap>1)
|
|
1145 {
|
|
1146 print R "M\n";
|
|
1147 print R "D\n";
|
|
1148 #print R "100\n";
|
|
1149 print R "$bootstrap\n";
|
|
1150 }
|
|
1151 print R "Y\n";
|
|
1152 close(R);
|
|
1153
|
|
1154 system("cp 100dnaseq infile");
|
|
1155 system("$dnadist < dnadist.cmd");
|
|
1156 system("rm -rf 100dnaseq");
|
|
1157 system("rm -rf infile");
|
|
1158 system("mv outfile 100dnadist"); # 100dnadist
|
|
1159
|
|
1160 #### Neighbor-joining tree
|
|
1161 print "\n#### Neighbor-joining ...\n\n";
|
|
1162 open(R,">NJ.cmd");
|
|
1163 if ($bootstrap>1)
|
|
1164 {
|
|
1165 #print R "100dnadist\n";
|
|
1166 print R "M\n";
|
|
1167 #print R "100\n";
|
|
1168 print R "$bootstrap\n";
|
|
1169 print R "1\n";
|
|
1170 }
|
|
1171 print R "Y\n";
|
|
1172 close(R);
|
|
1173
|
|
1174 system("cp 100dnadist infile");
|
|
1175 system("$neighbor < NJ.cmd");
|
|
1176 system("mv outtree 100dnadistNJtree");
|
|
1177 system("rm outfile");
|
|
1178 system("rm -rf infile");
|
|
1179 system("rm -rf NJ.cmd"); # 100dnadist,100dnadistNJtree
|
|
1180
|
|
1181 #### NJ-consense
|
|
1182 print "\n#### NJ-consense ...\n\n";
|
|
1183 open(R,">NJ-consense.cmd");
|
|
1184 #print R "100dnadistNJtree\n";
|
|
1185 print R "Y\n";
|
|
1186 close(R);
|
|
1187
|
|
1188 system("cp 100dnadistNJtree intree");
|
|
1189 system("$consense < NJ-consense.cmd");
|
|
1190 system("mv outfile ${outfileprefix}.Neighbor-joining.outfile");
|
|
1191 system("mv outtree ${outfileprefix}.Neighbor-joining.tree");
|
|
1192 system("rm -rf NJ-consense.cmd");
|
|
1193 system("rm -rf intree");
|
|
1194 system("rm -rf 100dnadistNJtree");
|
|
1195
|
|
1196
|
|
1197 #### UPGMA tree
|
|
1198 print "\n#### UPGMA ...\n\n";
|
|
1199 open(R,">UPGMA.cmd");
|
|
1200 #print R "100dnadist\n";
|
|
1201 print R "N\n";
|
|
1202 if ($bootstrap>1)
|
|
1203 {
|
|
1204 print R "M\n";
|
|
1205 #print R "100\n";
|
|
1206 print R "$bootstrap\n";
|
|
1207 print R "1\n";
|
|
1208 }
|
|
1209 print R "Y\n";
|
|
1210 close(R);
|
|
1211
|
|
1212 system("cp 100dnadist infile");
|
|
1213 system("$neighbor < UPGMA.cmd");
|
|
1214 system("mv outtree 100dnadistUPGMAtree");
|
|
1215 system("rm -rf outfile");
|
|
1216 system("rm -rf infile");
|
|
1217 system("rm -rf UPGMA.cmd");
|
|
1218
|
|
1219 #### UPGMA-consense
|
|
1220 print "\n#### UPGMA-consense ...\n\n";
|
|
1221 open(R,">UPGMA-consense.cmd");
|
|
1222 #print R "100dnadistUPGMAtree\n";
|
|
1223 print R "Y\n";
|
|
1224 close(R);
|
|
1225
|
|
1226 system("cp 100dnadistUPGMAtree intree");
|
|
1227 system("$consense < UPGMA-consense.cmd");
|
|
1228 system("mv outfile ${outfileprefix}.UPGMA.outfile");
|
|
1229 system("mv outtree ${outfileprefix}.UPGMA.tree");
|
|
1230 system("rm -rf UPGMA-consense.cmd");
|
|
1231 system("rm -rf 100dnadistUPGMAtree");
|
|
1232 system("rm -rf intree");
|
|
1233
|
|
1234 ###CLEAN TMP FILE
|
|
1235
|
|
1236 system("rm -rf *.cmd");
|
|
1237 system("rm -rf 100dnadist");
|
|
1238 }
|
|
1239
|
|
1240 sub PanBasedTree()
|
|
1241 {
|
|
1242 (my $infile,my $outfileprefix)=@_;
|
|
1243 my $tmpin;
|
|
1244 my $tmpout;
|
|
1245
|
|
1246 $tmpin=$infile;
|
|
1247 #### Neighbor-joining tree
|
|
1248
|
|
1249 open(R,">NJ.cmd");
|
|
1250 #print R "$tmpin\n";
|
|
1251 print R "Y\n";
|
|
1252 close(R);
|
|
1253
|
|
1254 system("cp $tmpin infile");
|
|
1255 system("$neighbor < NJ.cmd");
|
|
1256 system("mv outfile ${outfileprefix}.Neighbor-joining.outfile");
|
|
1257 system("mv outtree ${outfileprefix}.Neighbor-joining.tree");
|
|
1258 system("rm -rf NJ.cmd");
|
|
1259 system("rm -rf infile");
|
|
1260
|
|
1261 #### UPGMA tree
|
|
1262
|
|
1263 open(R,">UPGMA.cmd");
|
|
1264 #print R "$tmpin\n";
|
|
1265 print R "N\n";
|
|
1266 print R "Y\n";
|
|
1267 close(R);
|
|
1268
|
|
1269 system("cp $tmpin infile");
|
|
1270 system("$neighbor < UPGMA.cmd");
|
|
1271 system("mv outfile ${outfileprefix}.UPGMA.outfile");
|
|
1272 system("mv outtree ${outfileprefix}.UPGMA.tree");
|
|
1273 system("rm -rf UPGMA.cmd");
|
|
1274 system("rm -rf infile");
|
|
1275
|
|
1276
|
|
1277 ###CLEAN TMP FILE
|
|
1278 system("rm -rf *.cmd");
|
|
1279 }
|
|
1280
|
|
1281 sub DistanceMatrix()
|
|
1282 {
|
|
1283 (my $spnum,my $hash)=@_;
|
|
1284 my $i;
|
|
1285 my $j;
|
|
1286 my $k;
|
|
1287 my $dist;
|
|
1288 my $ref;
|
|
1289 my $query;
|
|
1290 foreach $i (0..($spnum-1))
|
|
1291 {
|
|
1292 foreach $j ($i..($spnum-1))
|
|
1293 {
|
|
1294 $ref=$$hash{$i};
|
|
1295 $query=$$hash{$j};
|
|
1296 $dist=0;
|
|
1297 for ($k=0;$k<length($ref);$k++)
|
|
1298 {
|
|
1299 if (substr($ref,$k,1) ne substr($query,$k,1))
|
|
1300 {
|
|
1301 $dist++;
|
|
1302 }
|
|
1303 }
|
|
1304 $$hash{$i."-".$j}=$dist;
|
|
1305 $$hash{$j."-".$i}=$dist;
|
|
1306 }
|
|
1307 }
|
|
1308 }
|
|
1309
|
|
1310
|
|
1311 sub ClusterProfil4Specie
|
|
1312 {
|
|
1313 (my $hash)=@_;
|
|
1314 my @row;
|
|
1315 my $i;
|
|
1316
|
|
1317 foreach (0..($spnum-1)) #initialization Hash
|
|
1318 {
|
|
1319 $$hash{$_}="";
|
|
1320 }
|
|
1321
|
|
1322 foreach (@clusters)
|
|
1323 {
|
|
1324 @row=split(/\t/,$_);
|
|
1325 splice(@row,0,1);
|
|
1326 if (&CountSpeicesInCluster(join("\t",@row))>1)
|
|
1327 {
|
|
1328 for ($i=0;$i<@row;$i++)
|
|
1329 {
|
|
1330 if ($row[$i] eq "-")
|
|
1331 {
|
|
1332 $$hash{$i}=$$hash{$i}."0";
|
|
1333 }else
|
|
1334 {
|
|
1335 $$hash{$i}=$$hash{$i}."1";
|
|
1336 }
|
|
1337 }
|
|
1338 }
|
|
1339 }
|
|
1340 }
|
|
1341
|
|
1342 # &ExtractSNP4tree(\%tmpHash,\%nt4tree);
|
|
1343
|
|
1344 sub ExtractSNP4tree()
|
|
1345 {
|
|
1346 (my $hash,my $nt4treeRef)=@_;
|
|
1347 my $key;
|
|
1348 my @row;
|
|
1349 my $i;
|
|
1350 my $len;
|
|
1351 my @tribases;
|
|
1352 foreach $key (keys %$hash)
|
|
1353 {
|
|
1354 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key};
|
|
1355 delete($$hash{$key});
|
|
1356 }
|
|
1357
|
|
1358 for ($i=0;$i<$spnum;$i++)
|
|
1359 {
|
|
1360 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$$hash{"S".$i};
|
|
1361 }
|
|
1362 }
|
|
1363
|
|
1364
|
|
1365 =pod
|
|
1366 sub ExtractSNP4tree()
|
|
1367 {
|
|
1368 (my $hash,my $nt4treeRef)=@_;
|
|
1369 my $key;
|
|
1370 my @row;
|
|
1371 my $i;
|
|
1372 my $len;
|
|
1373 my @tribases;
|
|
1374 foreach $key (keys %$hash)
|
|
1375 {
|
|
1376 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key};
|
|
1377 delete($$hash{$key});
|
|
1378 }
|
|
1379 @_=(keys %$hash);
|
|
1380 $len=length($_[0]);
|
|
1381 for ($j=0;3*$j<$len;$j++)
|
|
1382 {
|
|
1383 ##### scanning each codon
|
|
1384 for ($i=0;$i<$spnum;$i++)
|
|
1385 {
|
|
1386 $tribases[$i]=substr($$hash{"S".$i},3*$j,3);
|
|
1387 }
|
|
1388 ##### checking each codon
|
|
1389 if (&IsTheSame(@tribases) ==0)
|
|
1390 {
|
|
1391 for ($i=0;$i<@tribases;$i++)
|
|
1392 {
|
|
1393 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$tribases[$i];
|
|
1394 }
|
|
1395 }
|
|
1396 }
|
|
1397 }
|
|
1398 =cut
|
|
1399
|
|
1400
|
|
1401 sub pal2nal()
|
|
1402 {
|
|
1403 (my $pal,my $nuc, my $nal)=@_;
|
|
1404 my %aaAln=();
|
|
1405 my %ffn=();
|
|
1406 my %ntAln=();
|
|
1407 my %nt;
|
|
1408 my $dna;
|
|
1409 my $nt;
|
|
1410 my $key;
|
|
1411 my $flag=1;
|
|
1412 my $i=0;
|
|
1413 my $j;
|
|
1414
|
|
1415 ### read protein aligment result
|
|
1416 &ReadAlignmentToHash("$pal",\%aaAln);
|
|
1417 ### read nt sequences
|
|
1418 &ReadSequenceInToHash("$nuc",\%ffn);
|
|
1419 foreach $key (keys %ffn)
|
|
1420 {
|
|
1421 $dna=$ffn{$key};
|
|
1422 #if (int(length($nt{$key})/3)*3 ne length($nt{$key}))
|
|
1423 if (int(length($dna)/3)*3 ne length($dna))
|
|
1424 {
|
|
1425 $flag=0;
|
|
1426 print "The length of nucleotide sequence is not 3 integer times.\n";
|
|
1427 last;
|
|
1428 }else
|
|
1429 {
|
|
1430 for ($i=0;$i<(length($dna)/3);$i++)
|
|
1431 {
|
|
1432 $nt{$key."|".$i}=substr($dna,$i*3,3);
|
|
1433 }
|
|
1434 }
|
|
1435 }
|
|
1436
|
|
1437 if ($flag==0)
|
|
1438 {
|
|
1439 return 0;
|
|
1440 }else
|
|
1441 {
|
|
1442 foreach $key (keys %aaAln) ### replace aa with corresponding nt
|
|
1443 {
|
|
1444 $nt="";
|
|
1445 $i=0;
|
|
1446 for ($j=0;$j<length($aaAln{$key});$j++)
|
|
1447 {
|
|
1448 if (substr($aaAln{$key},$j,1) eq "-")
|
|
1449 {
|
|
1450 $nt=$nt."---";
|
|
1451 }else
|
|
1452 {
|
|
1453 $nt=$nt.$nt{$key."|".$i};
|
|
1454 $i++;
|
|
1455 }
|
|
1456 }
|
|
1457 $ntAln{$key}=$nt;
|
|
1458 }
|
|
1459
|
|
1460 ### output
|
|
1461 open(R,">$nal");
|
|
1462 foreach (keys %ntAln)
|
|
1463 {
|
|
1464 print R ">$_\n".$ntAln{$_}."\n";
|
|
1465 }
|
|
1466 close(R);
|
|
1467
|
|
1468 return 1;
|
|
1469 }
|
|
1470 }
|
|
1471
|
|
1472
|
|
1473
|
|
1474
|
|
1475 sub DetectSNP()
|
|
1476 {
|
|
1477 my %faa;
|
|
1478 my %ffn;
|
|
1479 my @row;
|
|
1480 my $count_gene;
|
|
1481 my $count_sp;
|
|
1482 my @genelist;
|
|
1483 my $i;
|
|
1484 my $j;
|
|
1485 my $pepalnlen;
|
|
1486 my @cdsvar=qw();
|
|
1487 my $cdi=0;
|
|
1488 my @tribases;
|
|
1489 my @bases;
|
|
1490 my @aa;
|
|
1491
|
|
1492
|
|
1493 ### fetch gene list
|
|
1494 open(F,"$ClusterID.pep");
|
|
1495 @genelist=<F>;
|
|
1496 close(F);
|
|
1497 @genelist=grep(/^>/,@genelist);
|
|
1498 chomp(@genelist);
|
|
1499 $_=join("\t",@genelist);
|
|
1500 s/>//g;
|
|
1501 @genelist=split(/\t/,$_);
|
|
1502
|
|
1503 ### count gene number and species number
|
|
1504 @row=split(/\t/,$clusters[$ClusterID-1]);
|
|
1505 splice(@row,0,1);
|
|
1506 $count_sp=&CountSpeicesInCluster(join("\t",@row));
|
|
1507 $count_gene=&CountGeneInCluster(join("\t",@row));
|
|
1508
|
|
1509 ### read alignment sequences
|
|
1510 &ReadAlignmentToHash("$ClusterID.pal",\%faa);
|
|
1511 &ReadAlignmentToHash("$ClusterID.nal",\%ffn);
|
|
1512
|
|
1513 @_=(keys %faa);
|
|
1514 $pepalnlen=length($faa{$_[0]});
|
|
1515 ### scan SNP
|
|
1516 for ($i=1;$i<=$pepalnlen;$i++)
|
|
1517 {
|
|
1518 @tmp=qw();
|
|
1519 @tribases=qw();
|
|
1520 for ($j=0;$j<@genelist;$j++) ### fetch triplet codon
|
|
1521 {
|
|
1522 $tribases[$j]=substr($ffn{$genelist[$j]},3*($i-1),3);
|
|
1523 }
|
|
1524 if (&IsTheSame(@tribases) ==0) ### if triplet codon is not consistent
|
|
1525 {
|
|
1526 @aa=qw();
|
|
1527 for ($j=0;$j<@genelist;$j++)
|
|
1528 {
|
|
1529 $aa[$j]=substr($faa{$genelist[$j]},($i-1),1);
|
|
1530 }
|
|
1531 if (&IsTheSame(@aa) ==0) ### aa is not consistent
|
|
1532 {
|
|
1533 if (join("",@aa) =~/-/)
|
|
1534 {
|
|
1535 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".$i."\t".&CharType(\@aa)."\t-\t-\tInDel\n";
|
|
1536 }else
|
|
1537 {
|
|
1538 #### base 1
|
|
1539 for ($j=0;$j<@genelist;$j++)
|
|
1540 {
|
|
1541 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1);
|
|
1542 }
|
|
1543 if (&IsTheSame(@bases) ==0)
|
|
1544 {
|
|
1545 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n";
|
|
1546 }
|
|
1547 #### base 2
|
|
1548 for ($j=0;$j<@genelist;$j++)
|
|
1549 {
|
|
1550 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1);
|
|
1551 }
|
|
1552 if (&IsTheSame(@bases) ==0)
|
|
1553 {
|
|
1554 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n";
|
|
1555 }
|
|
1556 #### base 3
|
|
1557 for ($j=0;$j<@genelist;$j++)
|
|
1558 {
|
|
1559 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1);
|
|
1560 }
|
|
1561 if (&IsTheSame(@bases) ==0)
|
|
1562 {
|
|
1563 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n";
|
|
1564 }
|
|
1565 }
|
|
1566 }else
|
|
1567 {
|
|
1568 #### base 1
|
|
1569 for ($j=0;$j<@genelist;$j++)
|
|
1570 {
|
|
1571 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1);
|
|
1572 }
|
|
1573 if (&IsTheSame(@bases) ==0)
|
|
1574 {
|
|
1575 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n";
|
|
1576 }
|
|
1577 #### base 2
|
|
1578 for ($j=0;$j<@genelist;$j++)
|
|
1579 {
|
|
1580 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1);
|
|
1581 }
|
|
1582 if (&IsTheSame(@bases) ==0)
|
|
1583 {
|
|
1584 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n";
|
|
1585 }
|
|
1586 #### base 3
|
|
1587 for ($j=0;$j<@genelist;$j++)
|
|
1588 {
|
|
1589 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1);
|
|
1590 }
|
|
1591 if (&IsTheSame(@bases) ==0)
|
|
1592 {
|
|
1593 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n";
|
|
1594 }
|
|
1595 }
|
|
1596 }
|
|
1597 }
|
|
1598 return @cdsvar;
|
|
1599 }
|
|
1600
|
|
1601
|
|
1602 sub VarAnalysis()
|
|
1603 {
|
|
1604 (my $data)=@_;
|
|
1605 my @data=@$data;
|
|
1606 my $indel=0;
|
|
1607 my $syn=0;
|
|
1608 my $nonsyn=0;
|
|
1609 my @tmp;
|
|
1610 $indel=scalar(grep(/InDel$/,@data));
|
|
1611 $nonsyn=scalar(grep(/Nonsynonymous mutation$/,@data));;
|
|
1612 $syn=scalar(grep(/Synonymous mutation$/,@data));
|
|
1613 return "$indel\t$nonsyn\t$syn";
|
|
1614 }
|
|
1615
|
|
1616
|
|
1617
|
|
1618 sub CharType()
|
|
1619 {
|
|
1620 (my $str)=@_;
|
|
1621 my %hash;
|
|
1622 my @data=@$str;
|
|
1623 foreach (@data)
|
|
1624 {
|
|
1625 $hash{$_}=1;
|
|
1626 }
|
|
1627 return join(",",(keys %hash));
|
|
1628 }
|
|
1629
|
|
1630 sub IsTheSame()
|
|
1631 {
|
|
1632 (my @data)=@_;
|
|
1633 my %hash;
|
|
1634 foreach (@data)
|
|
1635 {
|
|
1636 $hash{$_}=1;
|
|
1637 }
|
|
1638 if (scalar(keys %hash) ==1)
|
|
1639 {
|
|
1640 return 1;
|
|
1641 }else
|
|
1642 {
|
|
1643 return 0;
|
|
1644 }
|
|
1645 }
|
|
1646
|
|
1647
|
|
1648
|
|
1649 sub FormatClusterOutPut()
|
|
1650 {
|
|
1651 (my $speices,my $file,my $cluster)=@_;
|
|
1652 my @row;
|
|
1653 my $gid=1;
|
|
1654 my $key;
|
|
1655 my %hash;
|
|
1656 my $gene;
|
|
1657 my @tmp;
|
|
1658 my $i;
|
|
1659 my $j;
|
|
1660 open(R,">$file");
|
|
1661 print R "ClutserID\t".join("\t",@$speices)."\n";
|
|
1662 foreach $key (@$cluster)
|
|
1663 {
|
|
1664 @row=split(/\t/,$key);
|
|
1665 for ($i=0;$i<@row;$i++)
|
|
1666 {
|
|
1667 if ($row[$i] ne "-")
|
|
1668 {
|
|
1669 @tmp=split(/,/,$row[$i]);
|
|
1670 for ($j=0;$j<@tmp;$j++)
|
|
1671 {
|
|
1672 $_=$tmp[$j];
|
|
1673 s/^S[0-9]+G//;
|
|
1674 $tmp[$j]=$_;
|
|
1675 }
|
|
1676 $row[$i]=join(",",@tmp);
|
|
1677 }
|
|
1678 }
|
|
1679 print R $gid."\t".join("\t",@row)."\n";
|
|
1680 $gid++;
|
|
1681 }
|
|
1682 close(R);
|
|
1683 }
|
|
1684 sub RetrieveClusterFromFile()
|
|
1685 {
|
|
1686 (my $file,my $clusters)=@_;
|
|
1687 my @content;
|
|
1688 my @row;
|
|
1689 my $spid;
|
|
1690 my $line=0;
|
|
1691 my $i=0;
|
|
1692 my $j;
|
|
1693 my @tmp;
|
|
1694 open(F,$file) or die "Could open $file\n";
|
|
1695 @content=<F>;
|
|
1696 close(F);
|
|
1697 splice(@content,0,1);
|
|
1698 chomp(@content);
|
|
1699 foreach (@content)
|
|
1700 {
|
|
1701 @row=split(/\t/,$_);
|
|
1702 $$clusters[$line]=$row[0];
|
|
1703 splice(@row,0,1);
|
|
1704 for ($i=0;$i<@row;$i++)
|
|
1705 {
|
|
1706 if ($row[$i] ne "-")
|
|
1707 {
|
|
1708 @tmp=split(/,/,$row[$i]);
|
|
1709 for ($j=0;$j<@tmp;$j++)
|
|
1710 {
|
|
1711 $tmp[$j]="S${i}G".$tmp[$j];
|
|
1712 }
|
|
1713 $row[$i]=join(",",@tmp);
|
|
1714 }
|
|
1715 }
|
|
1716 $$clusters[$line]=$$clusters[$line]."\t".join("\t",@row)."\n";
|
|
1717 $line++;
|
|
1718 }
|
|
1719 }
|
|
1720
|
|
1721
|
|
1722
|
|
1723
|
|
1724 sub GeneDistribution()
|
|
1725 {
|
|
1726 (my $clusters,my $hash)=@_;
|
|
1727 my @row;
|
|
1728 my $spid;
|
|
1729 my $orth;
|
|
1730 my $key;
|
|
1731 foreach (@$clusters)
|
|
1732 {
|
|
1733 @row=split(/\t/,$_);
|
|
1734 splice(@row,0,1);
|
|
1735 $orth=&CountSpeicesInCluster(join("\t",@row));
|
|
1736 @row=split(/\t|\,/,join("\t",@row));
|
|
1737 foreach $key (@row)
|
|
1738 {
|
|
1739 if ($key ne "-")
|
|
1740 {
|
|
1741 $spid=substr($key,1,(index($key,'G')-1)); ###extract strains id
|
|
1742 if (exists($$hash{$orth."|".$spid}))
|
|
1743 {
|
|
1744 $$hash{$orth."|".$spid}++;
|
|
1745 }else
|
|
1746 {
|
|
1747 $$hash{$orth."|".$spid}=1;
|
|
1748 }
|
|
1749 }
|
|
1750 }
|
|
1751 }
|
|
1752 }
|
|
1753
|
|
1754 sub CountSpeicesInCluster()
|
|
1755 {
|
|
1756 (my $str)=@_;
|
|
1757 chomp($str);
|
|
1758 my @list=split(/\t/,$str);
|
|
1759 my $key;
|
|
1760 my $count=0;
|
|
1761
|
|
1762 foreach $key (@list)
|
|
1763 {
|
|
1764 if ($key ne "-")
|
|
1765 {
|
|
1766 $count++;
|
|
1767 }
|
|
1768 }
|
|
1769 return $count;
|
|
1770 }
|
|
1771
|
|
1772 sub CountGeneInCluster()
|
|
1773 {
|
|
1774 (my $str)=@_;
|
|
1775 chomp();
|
|
1776 my @list=split(/\t|\,/,$str);
|
|
1777 my $key;
|
|
1778 my $count=0;
|
|
1779 foreach $key (@list)
|
|
1780 {
|
|
1781 if ($key ne "-")
|
|
1782 {
|
|
1783 $count++;
|
|
1784 }
|
|
1785 }
|
|
1786 return $count;
|
|
1787 }
|
|
1788
|
|
1789
|
|
1790
|
|
1791
|
|
1792 sub GF()
|
|
1793 {
|
|
1794 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
|
|
1795 system("cat ".join(".pep ",@species).".pep > All.pep");
|
|
1796 system("grep '>' All.pep > genelist");
|
|
1797 system("$formatdb -p T -i All.pep");
|
|
1798 system("$blastall -p blastp -i All.pep -d All.pep -M BLOSUM45 -m9 -e $evalue -o All.blastp -a $thread");
|
|
1799 system("perl ./Blast_Filter.pl All.blastp All.pep $coverage $identity $score | $mcl - --abc -I 2.0 -o All.cluster");
|
|
1800 &FormatCluster("All.cluster","genelist",$spnum,\@clusters);
|
|
1801 #system("rm -rf *.pep* All.blastp All.cluster genelist");
|
|
1802 }
|
|
1803
|
|
1804 sub MP()
|
|
1805 {
|
|
1806 # (my $species,my $inputDIR,my $thread,my $evalue,my $score,my $coverage,my $identity)=@_;
|
|
1807 my $i;
|
|
1808 my $j;
|
|
1809 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file
|
|
1810 system("cat ".join(".pep ",@species).".pep > All.pep");
|
|
1811 system("grep '>' All.pep > genelist");
|
|
1812 system("rm -rf All.pep");
|
|
1813 for ($i=0;$i<$spnum;$i++)
|
|
1814 {
|
|
1815 for ($j=$i+1;$j<$spnum;$j++)
|
|
1816 {
|
|
1817 system("perl ./inparanoid.pl $blastall $thread $formatdb $score $global $local $species[$i].pep $species[$j].pep");
|
|
1818 }
|
|
1819 }
|
|
1820 system("perl ./multiparanoid.pl -species ".join(".pep+",@species).".pep -unique 1");
|
|
1821 ###convert the MP result to table list based on gene
|
|
1822 &MP_Result_to_Table("MP.Cluster","All.cluster");
|
|
1823 &FormatCluster("All.cluster","genelist",$spnum,\@clusters);
|
|
1824 system("rm -rf sqltable.* *.pep* MP.Cluster genelist");
|
|
1825 }
|
|
1826
|
|
1827 sub fasta2phylip()
|
|
1828 {
|
|
1829 (my $input,my $output)=@_;
|
|
1830 use Bio::AlignIO;
|
|
1831 my $inputfilename = "10.aln";
|
|
1832 my $in= Bio::AlignIO->new(-file => $input ,
|
|
1833 -format => 'fasta');
|
|
1834 my $out = Bio::AlignIO->new(-file => ">$output" ,
|
|
1835 -format => 'phylip');
|
|
1836 while ( my $aln = $in->next_aln() )
|
|
1837 {
|
|
1838 $out->write_aln($aln);
|
|
1839 }
|
|
1840 }
|
|
1841
|
|
1842
|
|
1843 sub RemoveHeadGap()
|
|
1844 {
|
|
1845 (my $nal,my $hash)=@_;
|
|
1846 my %aln;
|
|
1847 my $key;
|
|
1848 my $gaplength=0;
|
|
1849 my $len1;
|
|
1850 my $len2;
|
|
1851 &ReadSequenceInToHash("$nal",\%aln);
|
|
1852 foreach $key (keys %aln)
|
|
1853 {
|
|
1854 $len1=length($aln{$key});
|
|
1855 $_=$aln{$key};
|
|
1856 s/^-+//;
|
|
1857 $len2=length($_);
|
|
1858 if (($len1-$len2)>$gaplength)
|
|
1859 {
|
|
1860 $gaplength=$len1-$len2;
|
|
1861 }
|
|
1862 }
|
|
1863 foreach $key (keys %aln)
|
|
1864 {
|
|
1865 $$hash{$key}=$$hash{$key}.substr($aln{$key},$gaplength,(length($aln{$key})-$gaplength));
|
|
1866 }
|
|
1867 }
|
|
1868
|
|
1869 sub PrepareFasta()
|
|
1870 {
|
|
1871 (my $species,my $inputDIR,my $extention)=@_;
|
|
1872 my $sp;
|
|
1873 my $file;
|
|
1874 my $i;
|
|
1875 my %hash;
|
|
1876 my $key;
|
|
1877 for ($i=0;$i<@$species;$i++)
|
|
1878 {
|
|
1879 $file=$inputDIR.$$species[$i].$extention;
|
|
1880 %hash=();
|
|
1881 &ReadSequenceInToHash($file,\%hash);
|
|
1882 open(R,">$$species[$i]${extention}") or die "Could write into $file\n";
|
|
1883 foreach $key (keys %hash)
|
|
1884 {
|
|
1885 print R ">S${i}G$key\n";
|
|
1886 print R $hash{$key}."\n";
|
|
1887 }
|
|
1888 close(R);
|
|
1889 }
|
|
1890 }
|
|
1891
|
|
1892 sub PrepareTable()
|
|
1893 {
|
|
1894 (my $species,my $inputDIR,my $extention)=@_;
|
|
1895 my @content;
|
|
1896 my $i;
|
|
1897 my @row;
|
|
1898 my $file;
|
|
1899 for ($i=0;$i<@$species;$i++)
|
|
1900 {
|
|
1901 $file=$inputDIR.$$species[$i].$extention;
|
|
1902 open(F,$file) or die "Could open $file\n";
|
|
1903 @content=<F>;
|
|
1904 close(F);
|
|
1905 chomp(@content);
|
|
1906 open(R,">$$species[$i]${extention}") or die "Could write into $file\n";
|
|
1907 foreach (@content)
|
|
1908 {
|
|
1909 @row=split(/\t/,$_);
|
|
1910 $row[0]="S${i}G$row[0]";
|
|
1911 if ($extention eq ".location")
|
|
1912 {
|
|
1913 $row[0]=$row[0]."\t".$row[0];
|
|
1914 }
|
|
1915 print R join("\t",@row)."\n";
|
|
1916 }
|
|
1917 close(R);
|
|
1918 }
|
|
1919 }
|
|
1920
|
|
1921 sub CheckExtraProgram
|
|
1922 {
|
|
1923 #(my $section, my $method, my $tmparray)=@_;
|
|
1924 my @error;
|
|
1925 my $ei=0;
|
|
1926
|
|
1927 #####cluster gene
|
|
1928 if (substr($section,0,1) eq "1")
|
|
1929 {
|
|
1930 ###MP: blastall formatdb
|
|
1931 ###GF: blastall formatdb mcl
|
|
1932
|
|
1933 if (!(-e $formatdb))
|
|
1934 {
|
|
1935 $error[$ei++]="formatdb is not found at $formatdb\n";
|
|
1936 }
|
|
1937
|
|
1938 if (!(-X $formatdb))
|
|
1939 {
|
|
1940 $error[$ei++]="there is not premission to execute $formatdb\n";
|
|
1941 }
|
|
1942
|
|
1943 if (!(-e $blastall))
|
|
1944 {
|
|
1945 $error[$ei++]="blastall is not found at $blastall\n";
|
|
1946 }
|
|
1947
|
|
1948 if (!(-X $blastall))
|
|
1949 {
|
|
1950 $error[$ei++]="there is not premission to execute $blastall\n";
|
|
1951 }
|
|
1952
|
|
1953 if ($method eq "GF")
|
|
1954 {
|
|
1955 if (!(-e $mcl))
|
|
1956 {
|
|
1957 $error[$ei++]="mcl is not found at $mcl\n";
|
|
1958 }
|
|
1959 if (!(-X $mcl))
|
|
1960 {
|
|
1961 $error[$ei++]="there is not premission to execute $mcl\n";
|
|
1962 }
|
|
1963 }
|
|
1964 }
|
|
1965
|
|
1966 #####CDS variation
|
|
1967 if (substr($section,2,1) eq "1")
|
|
1968 {
|
|
1969 if (!(-e $mafft))
|
|
1970 {
|
|
1971 $error[$ei++]="mafft is not found at $mafft\n";
|
|
1972 }
|
|
1973 if (!(-X $mafft))
|
|
1974 {
|
|
1975 $error[$ei++]="there is not premission to execute $mafft\n";
|
|
1976 }
|
|
1977 }
|
|
1978
|
|
1979 #####CDS variation
|
|
1980 if (substr($section,3,1) eq "1")
|
|
1981 {
|
|
1982 if (!(-e $mafft))
|
|
1983 {
|
|
1984 $error[$ei++]="mafft is not found at $mafft\n";
|
|
1985 }
|
|
1986 if (!(-X $mafft))
|
|
1987 {
|
|
1988 $error[$ei++]="there is not premission to execute $mafft\n";
|
|
1989 }
|
|
1990 }
|
|
1991 #####Evolution analysis
|
|
1992 if (substr($section,3,1) eq "1")
|
|
1993 {
|
|
1994 if (-e $seqboot)
|
|
1995 {
|
|
1996 $error[$ei++]="there is not premission to execute $seqboot\n" if(!(-X $seqboot));
|
|
1997 }else
|
|
1998 {
|
|
1999 $error[$ei++]="seqboot is not found at $seqboot\n";
|
|
2000 }
|
|
2001 if (-e $dnaml)
|
|
2002 {
|
|
2003 $error[$ei++]="there is not premission to execute $dnaml\n" if(!(-X $dnaml));
|
|
2004 }else
|
|
2005 {
|
|
2006 $error[$ei++]="dnaml is not found at $dnaml\n";
|
|
2007 }
|
|
2008 if (-e $dnadist)
|
|
2009 {
|
|
2010 $error[$ei++]="there is not premission to execute $dnadist\n" if(!(-X $dnadist));
|
|
2011 }else
|
|
2012 {
|
|
2013 $error[$ei++]="dnadist is not found at $dnadist\n";
|
|
2014 }
|
|
2015 if (-e $neighbor)
|
|
2016 {
|
|
2017 $error[$ei++]="there is not premission to execute $neighbor\n" if(!(-X $neighbor));
|
|
2018 }else
|
|
2019 {
|
|
2020 $error[$ei++]="neighbor is not found at $neighbor\n";
|
|
2021 }
|
|
2022 if (-e $consense)
|
|
2023 {
|
|
2024 $error[$ei++]="there is not premission to execute $consense\n" if(!(-X $consense));
|
|
2025 }else
|
|
2026 {
|
|
2027 $error[$ei++]="consense is not found at $consense\n";
|
|
2028 }
|
|
2029 if (-e $dnapars)
|
|
2030 {
|
|
2031 $error[$ei++]="there is not premission to execute $dnapars\n" if(!(-X $dnapars));
|
|
2032 }else
|
|
2033 {
|
|
2034 $error[$ei++]="dnapars is not found at $dnapars\n";
|
|
2035 }
|
|
2036 }
|
|
2037 #@$tmparray=(@$tmparray,@error);
|
|
2038 @tmp=(@tmp,@error);
|
|
2039 }
|
|
2040
|
|
2041
|
|
2042 sub CheckInputFile()
|
|
2043 {
|
|
2044 (my $species,my $inputDIR,my $section,my $method,my $tmparray)=@_;
|
|
2045 ####cluster
|
|
2046 if (substr($section,0,1) eq "1")
|
|
2047 {
|
|
2048 if ($method eq "MM")
|
|
2049 {
|
|
2050 @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); ### check pep and nuc
|
|
2051 @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".location"));### chk pep nuc location
|
|
2052 }else
|
|
2053 {
|
|
2054 @$tmparray=(@$tmparray,&chk1SEQ($species,$inputDIR));
|
|
2055 }
|
|
2056 }
|
|
2057 ###CDS variation
|
|
2058 if (substr($section,2,1) eq "1")
|
|
2059 {
|
|
2060 @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR));
|
|
2061 }
|
|
2062 ###function analysis
|
|
2063 if (substr($section,4,1) eq "1")
|
|
2064 {
|
|
2065 @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".function"));
|
|
2066 }
|
|
2067 }
|
|
2068
|
|
2069
|
|
2070 sub chk1SEQ()
|
|
2071 {
|
|
2072 (my $species,my $inputDIR)=@_;
|
|
2073 my @error;
|
|
2074 my $ei=0;
|
|
2075 my $sp;
|
|
2076 my $pepfile;
|
|
2077 my %pep;
|
|
2078 foreach $sp (@$species)
|
|
2079 {
|
|
2080 %pep=();
|
|
2081 $pepfile=$inputDIR.$sp.".pep";
|
|
2082 &ReadSequenceInToHash($pepfile,\%pep);
|
|
2083 if (scalar(keys %pep)<2)
|
|
2084 {
|
|
2085 $error[$ei++]="format error in $pepfile\n";
|
|
2086 }
|
|
2087 }
|
|
2088 return @error;
|
|
2089 }
|
|
2090
|
|
2091 sub chk2SEQ()
|
|
2092 {
|
|
2093 (my $species,my $inputDIR)=@_;
|
|
2094 my $sp;
|
|
2095 my %pep;
|
|
2096 my %nuc;
|
|
2097 my $pepfile;
|
|
2098 my $nucfile;
|
|
2099 my $key;
|
|
2100 my @error;
|
|
2101 my $ei=0;
|
|
2102 foreach $sp (@$species)
|
|
2103 {
|
|
2104 $pepfile=$inputDIR.$sp.".pep";
|
|
2105 $nucfile=$inputDIR.$sp.".nuc";
|
|
2106 %pep=();
|
|
2107 %nuc=();
|
|
2108 &ReadSequenceInToHash("$pepfile",\%pep);
|
|
2109 &ReadSequenceInToHash("$nucfile",\%nuc);
|
|
2110 if (scalar(keys %pep) ne scalar(keys %nuc))
|
|
2111 {
|
|
2112 $error[$ei++]="Sequences number is not consistent in the following two file:\n\t$pepfile\n\t$nucfile\n";
|
|
2113 }else
|
|
2114 {
|
|
2115 foreach $key (keys %pep)
|
|
2116 {
|
|
2117 if (exists($nuc{$key}))
|
|
2118 {
|
|
2119 if (length($nuc{$key}) ne ((length($pep{$key})+1)*3))
|
|
2120 {
|
|
2121 $error[$ei++]="the length of $key in $nucfile is not consistent with its corresponding protein length\n";
|
|
2122 }
|
|
2123 }else
|
|
2124 {
|
|
2125 $error[$ei++]="$key lost in $nucfile\n";
|
|
2126 }
|
|
2127 }
|
|
2128
|
|
2129 foreach $key (keys %nuc)
|
|
2130 {
|
|
2131 if (!exists($pep{$key}))
|
|
2132 {
|
|
2133 $error[$ei++]="1048 $key lost in $pepfile\n";
|
|
2134 }
|
|
2135 }
|
|
2136 }
|
|
2137 }
|
|
2138 return @error;
|
|
2139 }
|
|
2140
|
|
2141
|
|
2142 sub chktab()
|
|
2143 {
|
|
2144 (my $species,my $inputDIR,my $extention)=@_;
|
|
2145 my %pep;
|
|
2146 my @row;
|
|
2147 my $key;
|
|
2148 my %tab;
|
|
2149 my @error;
|
|
2150 my $ei=0;
|
|
2151 my $sp;
|
|
2152 my $tabfile;
|
|
2153 my $pepfile;
|
|
2154 foreach $sp (@$species)
|
|
2155 {
|
|
2156 %tab=();
|
|
2157 %pep=();
|
|
2158 $tabfile=$inputDIR.$sp.$extention;
|
|
2159 open(F,"$tabfile");
|
|
2160 while (<F>)
|
|
2161 {
|
|
2162 chomp();
|
|
2163 @row=split(/\t/,$_);
|
|
2164 if (scalar(@row)<3)
|
|
2165 {
|
|
2166 $error[$ei++]="format error in $tabfile\n";
|
|
2167 }else
|
|
2168 {
|
|
2169 $tab{$row[0]}=$row[1];
|
|
2170 }
|
|
2171 }
|
|
2172 close(F);
|
|
2173 $pepfile=$inputDIR.$sp.".pep";
|
|
2174 &ReadSequenceInToHash($pepfile,\%pep);
|
|
2175 foreach $key (keys %pep)
|
|
2176 {
|
|
2177 if (!exists($tab{$key}))
|
|
2178 {
|
|
2179 $error[$ei++]="sequence $key lost infomation in $tabfile\n";
|
|
2180 }
|
|
2181 }
|
|
2182 }
|
|
2183 return @error;
|
|
2184 }
|
|
2185
|
|
2186
|
|
2187
|
|
2188 sub ReadSequenceInToHash()
|
|
2189 {
|
|
2190 use Bio::SeqIO;
|
|
2191 (my $file,my $hash)=@_;
|
|
2192 my $seq;
|
|
2193 my $in=Bio::SeqIO->new(-file=>"$file",-format=>"fasta");
|
|
2194 while ($seq=$in->next_seq())
|
|
2195 {
|
|
2196 #$$hash{$id."|".$seq->id}=$seq->seq();
|
|
2197 $$hash{$seq->id}=$seq->seq();
|
|
2198 }
|
|
2199 }
|
|
2200
|
|
2201 sub ReadAlignmentToHash()
|
|
2202 {
|
|
2203 (my $file,my $hash)=@_;
|
|
2204 my $name="";
|
|
2205 my $seq="";
|
|
2206 my @content;
|
|
2207 my $line;
|
|
2208 open(F,"$file");
|
|
2209 @content=<F>;
|
|
2210 close(F);
|
|
2211 chomp(@content);
|
|
2212 for ($line=0;$line<@content;$line++)
|
|
2213 {
|
|
2214 if ($content[$line]=~/^>/)
|
|
2215 {
|
|
2216 if ($line>0)
|
|
2217 {
|
|
2218 $$hash{$name}=$seq;
|
|
2219 $name="";
|
|
2220 }
|
|
2221
|
|
2222 $_=$content[$line];
|
|
2223 s/^>//;
|
|
2224 $name=$_;
|
|
2225 $seq="";
|
|
2226 }else
|
|
2227 {
|
|
2228 if ($name ne "")
|
|
2229 {
|
|
2230 $seq=$seq.$content[$line];
|
|
2231 }
|
|
2232 }
|
|
2233 }
|
|
2234 $$hash{$name}=$seq;
|
|
2235 }
|
|
2236
|
|
2237
|
|
2238
|
|
2239 sub Combination()
|
|
2240 {
|
|
2241 (my $m,my $n,my $comRef)=@_;
|
|
2242 my $str="";
|
|
2243 my %hash;
|
|
2244 my $fpos;
|
|
2245 my $num0;
|
|
2246 my $rest;
|
|
2247 my $tmp;
|
|
2248 my $i;
|
|
2249 my $j;
|
|
2250 my $key;
|
|
2251 #my $m=scalar(@$array);
|
|
2252 my @combination;
|
|
2253
|
|
2254 for ($i=1;$i<=$n;$i++)
|
|
2255 {
|
|
2256 $str="1".$str;
|
|
2257 }
|
|
2258
|
|
2259 for ($i=1;$i<=($m-$n);$i++)
|
|
2260 {
|
|
2261 $str=$str."0";
|
|
2262 }
|
|
2263
|
|
2264 $hash{$str}=1;
|
|
2265 while ($str=~/10/)
|
|
2266 {
|
|
2267 $fpos=index($str,"10");
|
|
2268 $_=$str;
|
|
2269 s/10/01/;
|
|
2270 $str=$_;
|
|
2271 $tmp=substr($str,0,$fpos);
|
|
2272 $_=$tmp;
|
|
2273 s/0//g;
|
|
2274 $rest=$_;
|
|
2275 $num0=$fpos-length($_);
|
|
2276 for ($i=1;$i<=$num0;$i++)
|
|
2277 {
|
|
2278 $rest="$rest"."0";
|
|
2279 }
|
|
2280 $str="$rest".substr($str,$fpos,$m-$fpos);
|
|
2281 $hash{$str}=1;
|
|
2282 }
|
|
2283 $j=0;
|
|
2284 foreach $key (keys %hash)
|
|
2285 {
|
|
2286 $combination[$j]="";
|
|
2287 for ($i=0;$i<$m;$i++)
|
|
2288 {
|
|
2289 if (substr($key,$i,1) eq "1")
|
|
2290 {
|
|
2291 if ($combination[$j] ne "")
|
|
2292 {
|
|
2293 #$combination[$j]=$combination[$j]."\t".$$array[$i];
|
|
2294 $combination[$j]=$combination[$j]."\t".$i;
|
|
2295 }else
|
|
2296 {
|
|
2297 #$combination[$j]=$$array[$i]; ### For return species ID
|
|
2298 $combination[$j]=$i;
|
|
2299 }
|
|
2300 }
|
|
2301 }
|
|
2302 $j++;
|
|
2303 }
|
|
2304 @$comRef=@combination; ### update the data through the physic address
|
|
2305 }
|
|
2306
|
|
2307 sub ChkCombinationValue()
|
|
2308 {
|
|
2309 (my $m,my $n)=@_;
|
|
2310 my %hash;
|
|
2311 my %vhash;
|
|
2312 my $value=0;
|
|
2313 my $key;
|
|
2314 my @row;
|
|
2315 my @sdA;
|
|
2316 my @sdB;
|
|
2317
|
|
2318 ### initialization
|
|
2319 $hash{$m."-".$n}=1;
|
|
2320
|
|
2321 ### split combination
|
|
2322 while (scalar(keys %hash)>0 and $value<=$sampleSize)
|
|
2323 {
|
|
2324 foreach $key (keys %hash)
|
|
2325 {
|
|
2326 if ($value > $sampleSize) ### threshold
|
|
2327 {
|
|
2328 last;
|
|
2329 }
|
|
2330 if (!exists($hash{$key}))
|
|
2331 {
|
|
2332 next;
|
|
2333 }
|
|
2334 @row=split(/-/,$key);
|
|
2335 #print $row[0]."|".$row[1]."\n";
|
|
2336 if ($row[0] eq $row[1])
|
|
2337 {
|
|
2338 $value=$value+$hash{$key};
|
|
2339 }else
|
|
2340 {
|
|
2341 ##split
|
|
2342 $sdA[0]=$row[0]-1;
|
|
2343 $sdA[1]=$row[1];
|
|
2344 $sdB[0]=$row[0]-1;
|
|
2345 $sdB[1]=$row[1]-1;
|
|
2346 ##storing A
|
|
2347 if (($sdA[0] eq $sdA[1]) or $sdA[1] ==0)
|
|
2348 {
|
|
2349 $value=$value+$hash{$key};
|
|
2350 }else
|
|
2351 {
|
|
2352 if (exists($hash{$sdA[0]."-".$sdA[1]}))
|
|
2353 {
|
|
2354 $hash{$sdA[0]."-".$sdA[1]}=$hash{$sdA[0]."-".$sdA[1]}+$hash{$key};
|
|
2355 }else
|
|
2356 {
|
|
2357 $hash{$sdA[0]."-".$sdA[1]}=$hash{$key};
|
|
2358 }
|
|
2359 }
|
|
2360
|
|
2361 ##storing B
|
|
2362 if (($sdB[0] eq $sdB[1]) or $sdB[1]==0)
|
|
2363 {
|
|
2364 $value=$value+$hash{$key};
|
|
2365 }else
|
|
2366 {
|
|
2367 if (exists($hash{$sdB[0]."-".$sdB[1]}))
|
|
2368 {
|
|
2369 $hash{$sdB[0]."-".$sdB[1]}=$hash{$sdB[0]."-".$sdB[1]}+$hash{$key};
|
|
2370 }else
|
|
2371 {
|
|
2372 $hash{$sdB[0]."-".$sdB[1]}=$hash{$key};
|
|
2373 }
|
|
2374 }
|
|
2375 }
|
|
2376 #delete original combination
|
|
2377 delete($hash{$key});
|
|
2378 }
|
|
2379 }
|
|
2380
|
|
2381 if ($value>$sampleSize)
|
|
2382 {
|
|
2383 return 0;
|
|
2384 }else
|
|
2385 {
|
|
2386 return $value;
|
|
2387 }
|
|
2388 }
|
|
2389
|
|
2390
|
|
2391 sub SampleCombination()
|
|
2392 {
|
|
2393 (my $m,my $n,my $comRef)=@_;
|
|
2394 my %hash;
|
|
2395 my $sampleTimes=0;
|
|
2396 my @randNum;
|
|
2397 my @sortID;
|
|
2398 my $i;
|
|
2399 my $j;
|
|
2400 my $tmp;
|
|
2401 while ( scalar(keys %hash)<$sampleSize and $sampleTimes<($sampleSize*2))
|
|
2402 {
|
|
2403 for ($i=0;$i<$m;$i++) # generate random data
|
|
2404 {
|
|
2405 $randNum[$i]=int(100000 * rand(100));
|
|
2406 $sortID[$i]=$i;
|
|
2407 }
|
|
2408
|
|
2409 for ($i=0;$i<$m;$i++) # sorting random data
|
|
2410 {
|
|
2411 for ($j=0;$j<$m;$j++)
|
|
2412 {
|
|
2413 if ($randNum[$sortID[$i]]<$randNum[$sortID[$j]])
|
|
2414 {
|
|
2415 $tmp=$sortID[$i];
|
|
2416 $sortID[$i]=$sortID[$j];
|
|
2417 $sortID[$j]=$tmp;
|
|
2418 }
|
|
2419 }
|
|
2420 }
|
|
2421
|
|
2422 #storing data
|
|
2423 $tmp=join("\t",sort {$a<=>$b} (splice(@sortID,0,$n)));
|
|
2424 $hash{$tmp}=1;
|
|
2425 $sampleTimes++;
|
|
2426 }
|
|
2427 @$comRef=keys %hash;
|
|
2428 }
|
|
2429
|
|
2430
|
|
2431 sub PanGenomeNumber()
|
|
2432 {
|
|
2433 (my $spID)=@_;
|
|
2434 my $pan=0;
|
|
2435 my $core=0;
|
|
2436 my $count; #### counter;
|
|
2437 my @row;
|
|
2438
|
|
2439 foreach (@clusters)
|
|
2440 {
|
|
2441 $count=0;
|
|
2442 @row=split(/\t/,$_);
|
|
2443
|
|
2444 foreach (@$spID)
|
|
2445 {
|
|
2446 $count=$count+$row[$_];
|
|
2447 }
|
|
2448
|
|
2449 if ($count>0)
|
|
2450 {
|
|
2451 $pan++;
|
|
2452 if ($count == scalar(@$spID))
|
|
2453 {
|
|
2454 $core++;
|
|
2455 }
|
|
2456 }
|
|
2457 }
|
|
2458 return $pan."\t".$core;
|
|
2459 }
|
|
2460
|
|
2461 sub fit_model_A()
|
|
2462 {
|
|
2463 ### model y = A * x**B + C
|
|
2464 (my $xdata,my $ydata)=@_;
|
|
2465 my $i;
|
|
2466 my $b;
|
|
2467 my $max_B=0;
|
|
2468 my $max_R=0;
|
|
2469 my $max_A=0;
|
|
2470 my $max_A_interval;
|
|
2471 my $max_C=0;
|
|
2472 my $max_C_interval;
|
|
2473 my $R=1e-100;
|
|
2474 my $start;
|
|
2475 my $end;
|
|
2476 my $step;
|
|
2477 my @xValues;
|
|
2478 my @yValues;
|
|
2479
|
|
2480 $start=1;
|
|
2481 $step=0.001;
|
|
2482 $b=$start;
|
|
2483 $max_R=0;
|
|
2484 $R=1e-100;
|
|
2485
|
|
2486 use Statistics::LineFit;
|
|
2487 use Statistics::Distributions;
|
|
2488
|
|
2489 while ($max_R<=$R)
|
|
2490 {
|
|
2491 if (($b < 0.02) and ($b >-0.02))
|
|
2492 {
|
|
2493 $b=-0.02;
|
|
2494 }
|
|
2495
|
|
2496 for ($i=0;$i<@$xdata;$i++)
|
|
2497 {
|
|
2498 $xValues[$i]=$$xdata[$i]**$b;
|
|
2499 }
|
|
2500 @yValues=@$ydata;
|
|
2501 my $lineFit = Statistics::LineFit->new();
|
|
2502 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data";
|
|
2503 (my $intercept, my $slope) = $lineFit->coefficients();
|
|
2504 my $rSquared = $lineFit->rSquared();
|
|
2505 my $meanSquaredError = $lineFit->meanSqError();
|
|
2506 my $durbinWatson = $lineFit->durbinWatson();
|
|
2507 my $sigma = $lineFit->sigma();
|
|
2508 (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics();
|
|
2509 (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates();
|
|
2510
|
|
2511 $max_R=$R;
|
|
2512 $R=$rSquared;
|
|
2513 if ($max_R<=$R)
|
|
2514 {
|
|
2515 $max_R=$R;
|
|
2516 ($max_C,$max_A)=$lineFit->coefficients();
|
|
2517 $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope);
|
|
2518 $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept);
|
|
2519 }
|
|
2520 $b=$b-$step;
|
|
2521 }
|
|
2522 $max_B=$b;
|
|
2523 return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval);
|
|
2524
|
|
2525 }
|
|
2526
|
|
2527 sub fit_model_B()
|
|
2528 {
|
|
2529 ### model y = A * exp(x*B) + C
|
|
2530 (my $xdata,my $ydata)=@_;
|
|
2531 my $i;
|
|
2532 my $b;
|
|
2533 my $max_B=0;
|
|
2534 my $max_R=0;
|
|
2535 my $max_A=0;
|
|
2536 my $max_A_interval;
|
|
2537 my $max_C=0;
|
|
2538 my $max_C_interval;
|
|
2539 my $R=1e-100;
|
|
2540 my $start;
|
|
2541 my $end;
|
|
2542 my $step;
|
|
2543 my @xValues;
|
|
2544 my @yValues;
|
|
2545
|
|
2546 $start=0;
|
|
2547 $step=0.001;
|
|
2548 $b=$start;
|
|
2549 $max_R=0;
|
|
2550 $R=1e-100;
|
|
2551
|
|
2552 use Statistics::LineFit;
|
|
2553 use Statistics::Distributions;
|
|
2554
|
|
2555 while ($max_R<=$R)
|
|
2556 {
|
|
2557 if (($b < 0.02) and ($b >-0.02))
|
|
2558 {
|
|
2559 $b=-0.02;
|
|
2560 }
|
|
2561
|
|
2562 for ($i=0;$i<@$xdata;$i++)
|
|
2563 {
|
|
2564 $xValues[$i]=exp($$xdata[$i]*$b);
|
|
2565 }
|
|
2566 @yValues=@$ydata;
|
|
2567 my $lineFit = Statistics::LineFit->new();
|
|
2568 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data";
|
|
2569 (my $intercept, my $slope) = $lineFit->coefficients();
|
|
2570 my $rSquared = $lineFit->rSquared();
|
|
2571 my $meanSquaredError = $lineFit->meanSqError();
|
|
2572 my $durbinWatson = $lineFit->durbinWatson();
|
|
2573 my $sigma = $lineFit->sigma();
|
|
2574 (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics();
|
|
2575 (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates();
|
|
2576
|
|
2577 $max_R=$R;
|
|
2578 $R=$rSquared;
|
|
2579 if ($max_R<=$R)
|
|
2580 {
|
|
2581 $max_R=$R;
|
|
2582 ($max_C,$max_A)=$lineFit->coefficients();
|
|
2583 $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope);
|
|
2584 $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept);
|
|
2585 }
|
|
2586 $b=$b-$step;
|
|
2587 }
|
|
2588 $max_B=$b;
|
|
2589 return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval);
|
|
2590 }
|
|
2591
|
|
2592
|
|
2593 sub ReadData2Array()
|
|
2594 {
|
|
2595 (my $file, my $array1,my $col1,my $array2,my $col2)=@_;
|
|
2596 my $i=0;
|
|
2597 open(F,$file);
|
|
2598 $_=<F>;
|
|
2599 while (<F>)
|
|
2600 {
|
|
2601 chomp();
|
|
2602 @_=split(/\t/,$_);
|
|
2603 $$array1[$i]=$_[$col1];
|
|
2604 $$array2[$i]=$_[$col2];
|
|
2605 $i++;
|
|
2606 }
|
|
2607 close(F);
|
|
2608 }
|
|
2609
|
|
2610 sub SumData()
|
|
2611 {
|
|
2612 (my $xdata,my $ydata,my $SumMethod)=@_;
|
|
2613 my %hash;
|
|
2614 my $i;
|
|
2615 my $key;
|
|
2616 my $max=0;
|
|
2617 for ($i=0;$i<@$xdata;$i++)
|
|
2618 {
|
|
2619 if (exists($hash{$$xdata[$i]}))
|
|
2620 {
|
|
2621 $hash{$$xdata[$i]}=$hash{$$xdata[$i]}." ".$$ydata[$i];
|
|
2622 }else
|
|
2623 {
|
|
2624 $hash{$$xdata[$i]}=$$ydata[$i];
|
|
2625 if ($$xdata[$i]>$max)
|
|
2626 {
|
|
2627 $max=$$xdata[$i];
|
|
2628 }
|
|
2629 }
|
|
2630 }
|
|
2631 @$xdata=qw();
|
|
2632 @$ydata=qw();
|
|
2633 $i=0;
|
|
2634 foreach $i (1..$max)
|
|
2635 {
|
|
2636 $$xdata[$i-1]=$i;
|
|
2637 if ($SumMethod eq "median")
|
|
2638 {
|
|
2639 $$ydata[$i-1]=&median($hash{$i});
|
|
2640 }else
|
|
2641 {
|
|
2642 $$ydata[$i-1]=&mean($hash{$i});
|
|
2643 }
|
|
2644 }
|
|
2645 #print join(",",@$xdata)."\n";
|
|
2646 #print join(",",@$ydata)."\n";
|
|
2647 }
|
|
2648
|
|
2649 sub median()
|
|
2650 {
|
|
2651 (my $data)=@_;
|
|
2652 my @data=split(/ /,$data);
|
|
2653 my $arraylen=scalar(@data);
|
|
2654 @data=sort{$a<=>$b} @data;
|
|
2655 if (int($arraylen/2)*2 == $arraylen)
|
|
2656 {
|
|
2657 return ($data[$arraylen/2]+$data[$arraylen/2-1])/2;
|
|
2658 }else
|
|
2659 {
|
|
2660 return $data[int($arraylen/2)];
|
|
2661 }
|
|
2662 }
|
|
2663
|
|
2664 sub mean()
|
|
2665 {
|
|
2666 (my $data)=@_;
|
|
2667 my @data=split(/ /,$data);
|
|
2668 my $sum=0;
|
|
2669 foreach (@data)
|
|
2670 {
|
|
2671 $sum=$sum+$_;
|
|
2672 }
|
|
2673 return int(($sum/scalar(@data))*1000)/1000;
|
|
2674 }
|
|
2675
|
|
2676 sub ReplaceName()
|
|
2677 {
|
|
2678 (my $sp,my $file)=@_;
|
|
2679 my @content;
|
|
2680 my $line;
|
|
2681 my $i;
|
|
2682 my $target;
|
|
2683 open(F,$file);
|
|
2684 @content=<F>;
|
|
2685 close(F);
|
|
2686 for ($line=0;$line<@content;$line++)
|
|
2687 {
|
|
2688 for ($i=0;$i<@$sp;$i++)
|
|
2689 {
|
|
2690 $_=$content[$line];
|
|
2691 $target="sp".$i."sp";
|
|
2692 s/$target/$$sp[$i]/;
|
|
2693 $content[$line]=$_;
|
|
2694 }
|
|
2695 }
|
|
2696 open(R,">$file");
|
|
2697 print R @content;
|
|
2698 close(R);
|
|
2699 }
|
|
2700
|
|
2701 sub MP_Result_to_Table()
|
|
2702 {
|
|
2703 (my $MPresult, my $outputfile)=@_;
|
|
2704 my %hash;
|
|
2705 my $maxid=0;
|
|
2706 my $i;
|
|
2707 my @row;
|
|
2708
|
|
2709 open(F,"$MPresult");
|
|
2710 $_=<F>;
|
|
2711 while (<F>)
|
|
2712 {
|
|
2713 @row=split(/\t/,$_);
|
|
2714 if (exists($hash{$row[0]}))
|
|
2715 {
|
|
2716 $hash{$row[0]}=$hash{$row[0]}."\t".$row[2];
|
|
2717 }else
|
|
2718 {
|
|
2719 $hash{$row[0]}=$row[2];
|
|
2720 if ($row[0]>$maxid)
|
|
2721 {
|
|
2722 $maxid=$row[0];
|
|
2723 }
|
|
2724 }
|
|
2725 }
|
|
2726 close(F);
|
|
2727
|
|
2728 open(R,">$outputfile");
|
|
2729 foreach $i (1..$maxid)
|
|
2730 {
|
|
2731 print R $hash{$i}."\n";
|
|
2732 }
|
|
2733 close(R);
|
|
2734 }
|
|
2735
|
|
2736
|
|
2737
|
|
2738 sub FormatCluster()
|
|
2739 {
|
|
2740 (my $infile,my $genelist,my $spnum,my $cluster)=@_;
|
|
2741 my %hash;
|
|
2742 my %gene;
|
|
2743 my $key;
|
|
2744 my @row;
|
|
2745 my $sp;
|
|
2746 my $line;
|
|
2747 my $i=0;
|
|
2748 my $j=0;
|
|
2749 my @content;
|
|
2750
|
|
2751 ### record gene in clusters
|
|
2752 open(F,"$infile");
|
|
2753 @content=<F>;
|
|
2754 close(F);
|
|
2755 chomp(@content);
|
|
2756 for ($line=0;$line<@content;$line++)
|
|
2757 {
|
|
2758 @row=split(/\t/,$content[$line]);
|
|
2759 foreach $key (@row)
|
|
2760 {
|
|
2761 $gene{$key}=1;
|
|
2762 }
|
|
2763 }
|
|
2764 ###retrieves gene which is not in clutsers
|
|
2765
|
|
2766 open(F,"$genelist");
|
|
2767 while ($key=<F>)
|
|
2768 {
|
|
2769 if ($key=~/^>/)
|
|
2770 {
|
|
2771 chomp($key);
|
|
2772 $_=$key;
|
|
2773 s/^>//;
|
|
2774 $key=$_;
|
|
2775 if (!exists($gene{$key}))
|
|
2776 {
|
|
2777 $content[$line]=$key;
|
|
2778 $line++;
|
|
2779 }
|
|
2780 }
|
|
2781 }
|
|
2782 close(F);
|
|
2783
|
|
2784 #### initialization @cluster
|
|
2785 @$cluster=qw();
|
|
2786 $j=0;
|
|
2787
|
|
2788 foreach $line (@content)
|
|
2789 {
|
|
2790 if ($line ne "")
|
|
2791 {
|
|
2792 %hash=();
|
|
2793 @row=split(/\t/,$line);
|
|
2794 foreach $key (@row)
|
|
2795 {
|
|
2796 $sp=substr($key,0,index($key,"G"));
|
|
2797 $gene{$key}=1;
|
|
2798 if (exists($hash{$sp}))
|
|
2799 {
|
|
2800 $hash{$sp}=$hash{$sp}.",".$key;
|
|
2801 }else
|
|
2802 {
|
|
2803 $hash{$sp}=$key;
|
|
2804 }
|
|
2805 }
|
|
2806
|
|
2807 $i=0;
|
|
2808 @row=qw();
|
|
2809
|
|
2810 foreach $i (0..($spnum-1))
|
|
2811 {
|
|
2812 if (exists($hash{"S$i"}))
|
|
2813 {
|
|
2814 $row[$i]=$hash{"S$i"};
|
|
2815 }else
|
|
2816 {
|
|
2817 $row[$i]="-";
|
|
2818 }
|
|
2819 }
|
|
2820 $$cluster[$j++]=join("\t",@row);
|
|
2821 }
|
|
2822 }
|
|
2823 }
|
|
2824
|
|
2825
|