Mercurial > repos > dereeper > pgap
comparison PGAP-1.2.1/PGAP.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children | 70b7a5270968 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83e62a1aeeeb |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 | |
6 ### programs from BLAST | |
7 my $formatdb="formatdb"; | |
8 my $blastall="blastall"; | |
9 | |
10 ### programs from mcl | |
11 my $mcl="mcl"; | |
12 | |
13 ### programs from mafft | |
14 my $mafft="mafft"; | |
15 | |
16 ### programs from PHYLIP | |
17 my $seqboot="seqboot"; | |
18 my $neighbor="neighbor"; | |
19 my $consense="consense"; | |
20 my $dnaml="dnaml"; | |
21 my $dnadist="dnadist"; | |
22 my $dnapars="dnapars"; | |
23 | |
24 my $count_tree=0; | |
25 | |
26 my $sampleSize=8000; # when calculate the pan-genome size, we will sample $sampleSize combinations | |
27 # if the total combination number is larger than $sampleSize for specific genomes | |
28 # Surely, the number of $sampleSize is, the larger, the better. | |
29 # However, the larger the $sampleSize is, the more time would be consumed. | |
30 # we suggest the range: 5000 ~ 20,000 | |
31 | |
32 ##################################################################### | |
33 # DOn't modify the following code, unless you know their functions | |
34 ##################################################################### | |
35 | |
36 my %opt=qw(); | |
37 GetOptions(\%opt,"strains:s","input:s","output:s","cluster!","pangenome!","variation!","evolution!","function!","method:s","thread:i","score:f","evalue:f","coverage:f","local:f","global:f","identity:f","bootstrap:i","help|h!"); | |
38 | |
39 my @usage=qq( | |
40 ====== Pan-Genome Analysis Pipeline (PGAP) ====== | |
41 Version 1.2.1 | |
42 | |
43 Usage: perl PGAP.pl [Options] | |
44 | |
45 Options: | |
46 --strains String Input strains nicknames, and join them with '+', for example: A+B+C | |
47 --input String Input data directory | |
48 --output String Result output directory | |
49 | |
50 --cluster Run homologous gene clustering | |
51 --pangenome Run pan-genome analysis | |
52 --variation Run homologous clusters variation analysis | |
53 --evolution Run evolution analysis | |
54 --function Run Function analysis | |
55 | |
56 --method String GF for GeneFamily method, and MP for MultiParanoid method | |
57 for GF: fast, but not very accurate | |
58 evalue, score, indentity, coverage are employed | |
59 for MP: slow, but more accurate | |
60 score, coverage, local, global are employed | |
61 --thread Int Number of processors to use in blastall. [default:1] | |
62 --score Int Minimum score in blastall. [default:40] | |
63 --evalue Decimal Maximal E-value in blastall. [default:1e-10] | |
64 --coverage Decimal Minimum alignment coverage for two homologous proteins. [default:0.5] | |
65 --local Decimal Minimum local alignment overlap in MP method. [default:0.25] | |
66 --global Decimal Minimum global alignment overlap in MP method. [default:0.5] | |
67 --identity Decimal Minimum alignment indentity for two homologous proteins. [default:0.5] | |
68 --bootstrap Int Bootstrap times for phylogenetics tree. [default:1] | |
69 | |
70 --h or help Display this message | |
71 ); | |
72 | |
73 | |
74 ############# specified variable ############# | |
75 my $inputDIR; | |
76 my $outputDIR; | |
77 my $run_cluster; | |
78 my $run_pangenome; | |
79 my $run_variation; | |
80 my $run_evolution; | |
81 my $run_function; | |
82 my $method=""; | |
83 my $thread; | |
84 my $score; | |
85 my $identity; | |
86 my $evalue; | |
87 my $coverage; | |
88 my $global; | |
89 my $local; | |
90 my $bootstrap; | |
91 | |
92 | |
93 my %pep; | |
94 my %nuc; | |
95 my $spnum; | |
96 my @clusters; | |
97 my $Cluster; | |
98 my @SpecieCombination; | |
99 my @spID; | |
100 my %genenum; | |
101 my %aaAln; | |
102 my %ntAln; | |
103 my %cog; | |
104 my %description; | |
105 #my %aa4tree; ### AA sequence for Phylogenetic Tree | |
106 my %nt4tree; ### nucleotide sequence for Phylogenetic Tree | |
107 my @SNPPosition; ### SNP position | |
108 my $dieMessage="You did not run PGAP.pl in the program directory\n"; | |
109 my $section; | |
110 | |
111 ######### common temporary variable ############# | |
112 my $i; | |
113 my $j; | |
114 my $line; | |
115 my %tmpHash; | |
116 my @tmp; | |
117 my $tmp; | |
118 my $key; | |
119 my @row; | |
120 my $inparacount; | |
121 my $ClusterID; | |
122 my $orth; | |
123 my @content; | |
124 my $clusterName; | |
125 my @xdata; | |
126 my @ydata; | |
127 my @fit; | |
128 | |
129 my $fit_A; | |
130 my $fit_A_interval; | |
131 my $fit_B; | |
132 my $fit_C; | |
133 my $fit_C_interval; | |
134 my $fit_Rsquare; | |
135 | |
136 | |
137 | |
138 #### check option | |
139 | |
140 my $opt_error=0; | |
141 | |
142 if ((scalar(keys %opt) ==0) or (exists($opt{"help"}))) | |
143 { | |
144 print join("\n",@usage)."\n"; | |
145 exit; | |
146 } | |
147 | |
148 | |
149 ###################### public info | |
150 ### strains name | |
151 my @species; | |
152 if (exists($opt{"strains"})) | |
153 { | |
154 @species=split(/\+/,$opt{"strains"}); | |
155 $spnum=scalar(@species); | |
156 }else | |
157 { | |
158 print "Please assign strains nick name!\n"; | |
159 exit; | |
160 } | |
161 | |
162 ### input data directory | |
163 | |
164 if (exists($opt{"input"})) | |
165 { | |
166 $inputDIR=$opt{"input"}; | |
167 if ($inputDIR!~/\/$/) | |
168 { | |
169 $inputDIR=$inputDIR."/"; | |
170 } | |
171 }else | |
172 { | |
173 print "Please assign input data directory!\n\n"; | |
174 exit; | |
175 } | |
176 ### output data directory | |
177 | |
178 if (exists($opt{"output"})) | |
179 { | |
180 $outputDIR=$opt{"output"}; | |
181 if ($outputDIR!~/\/$/) | |
182 { | |
183 $outputDIR=$outputDIR."/"; | |
184 } | |
185 }else | |
186 { | |
187 print "Please assign result output directory!\n\n"; | |
188 exit; | |
189 } | |
190 | |
191 ###################### section info | |
192 | |
193 if (exists($opt{"cluster"})) | |
194 { | |
195 $run_cluster=1; | |
196 }else | |
197 { | |
198 $run_cluster=0; | |
199 } | |
200 | |
201 if (exists($opt{"pangenome"})) | |
202 { | |
203 $run_pangenome=1; | |
204 }else | |
205 { | |
206 $run_pangenome=0; | |
207 } | |
208 | |
209 if (exists($opt{"variation"})) | |
210 { | |
211 $run_variation=1; | |
212 }else | |
213 { | |
214 $run_variation=0; | |
215 } | |
216 | |
217 if (exists($opt{"evolution"})) | |
218 { | |
219 $run_evolution=1; | |
220 }else | |
221 { | |
222 $run_evolution=0; | |
223 } | |
224 | |
225 if (exists($opt{"function"})) | |
226 { | |
227 $run_function=1; | |
228 }else | |
229 { | |
230 $run_function=0; | |
231 } | |
232 | |
233 if ($run_cluster) | |
234 { | |
235 ### method | |
236 if (exists($opt{"method"})) | |
237 { | |
238 $method=uc($opt{"method"}); | |
239 if ($method!~/^GF$/ and $method!~/^MP$/) | |
240 { | |
241 print "Unknown method: ".$opt{"method"}."\n"; | |
242 exit; | |
243 } | |
244 }else | |
245 { | |
246 print "Please assign the cluster method!\n\n"; | |
247 exit; | |
248 } | |
249 | |
250 ##thread | |
251 if (exists($opt{"thread"})) | |
252 { | |
253 $thread=$opt{"thread"}; | |
254 if ($thread==0) | |
255 { | |
256 print "please assign an applicable thread value.\n"; | |
257 exit; | |
258 } | |
259 }else | |
260 { | |
261 $thread=1; | |
262 } | |
263 | |
264 ##score | |
265 if (exists($opt{"score"})) | |
266 { | |
267 $score=$opt{"score"}; | |
268 if ($score<=0) | |
269 { | |
270 print "please assign an applicable score value.\n"; | |
271 exit; | |
272 } | |
273 }else | |
274 { | |
275 $score=40; | |
276 } | |
277 | |
278 if ($method eq "GF") | |
279 { | |
280 ###identity | |
281 if (exists($opt{"identity"})) | |
282 { | |
283 $identity=$opt{"identity"}; | |
284 if ($identity>1 or $identity<=0) | |
285 { | |
286 print "identity should be 0 ~ 1 \n"; | |
287 exit; | |
288 } | |
289 }else | |
290 { | |
291 $identity=0.5; | |
292 } | |
293 | |
294 ###evalue | |
295 if (exists($opt{"evalue"})) | |
296 { | |
297 $evalue=$opt{"evalue"}; | |
298 }else | |
299 { | |
300 $evalue=1e-10; | |
301 } | |
302 | |
303 ###coverage | |
304 if (exists($opt{"coverage"})) | |
305 { | |
306 $coverage=$opt{"coverage"}; | |
307 if ($coverage>1 or $coverage<=0) | |
308 { | |
309 print "coverage should be 0 ~ 1 \n"; | |
310 exit; | |
311 } | |
312 }else | |
313 { | |
314 $coverage=0.5; | |
315 } | |
316 } | |
317 | |
318 | |
319 if ($method eq "MP") | |
320 { | |
321 ###global | |
322 if (exists($opt{"global"})) | |
323 { | |
324 $global=$opt{"global"}; | |
325 if ($global>1 or $global<=0) | |
326 { | |
327 print "global coverage should be 0 ~ 1 \n"; | |
328 exit; | |
329 } | |
330 }else | |
331 { | |
332 $global=0.5; | |
333 } | |
334 ###local | |
335 if (exists($opt{"local"})) | |
336 { | |
337 $local=$opt{"local"}; | |
338 if ($local<=0) | |
339 { | |
340 print "local coverage should be 0 ~ [global coverage value] \n"; | |
341 exit; | |
342 } | |
343 if ($local>$global) | |
344 { | |
345 print "local coverage should be less than global coverage!\n"; | |
346 exit; | |
347 } | |
348 }else | |
349 { | |
350 $local=0.25; | |
351 } | |
352 } | |
353 } | |
354 | |
355 if ($run_evolution) | |
356 { | |
357 if (exists($opt{"bootstrap"})) | |
358 { | |
359 $bootstrap=$opt{"bootstrap"}; | |
360 if ($bootstrap<=0) | |
361 { | |
362 print "please assign an applicable bootstrap value.\n"; | |
363 } | |
364 }else | |
365 { | |
366 $bootstrap=1; | |
367 } | |
368 } | |
369 | |
370 print "Program begin at ".localtime()."\n"; | |
371 print "The following are the parameters for current process:\n"; | |
372 print "Strains: ".join(",",@species)."\n"; | |
373 print "Input directory: $inputDIR\n"; | |
374 print "Output directory: $outputDIR\n"; | |
375 if ($run_cluster) | |
376 { | |
377 print "Cluster analysis: yes\n"; | |
378 print " Method: $method\n"; | |
379 print " Thread: $thread\n"; | |
380 if ($method eq "GF") | |
381 { | |
382 print " E-value: $evalue\n"; | |
383 print " Identity: $identity\n"; | |
384 print " Coverage: $coverage\n"; | |
385 print " Score: $score\n"; | |
386 } | |
387 if ($method eq "MP") | |
388 { | |
389 print " Local: $local\n"; | |
390 print " Global: $global\n"; | |
391 print " Score: $score\n"; | |
392 } | |
393 }else | |
394 { | |
395 print "Cluster analysis: no\n"; | |
396 } | |
397 if ($run_pangenome) | |
398 { | |
399 print "Pan-genome analysis: yes\n"; | |
400 }else | |
401 { | |
402 print "Pan-genome analysis: no\n"; | |
403 } | |
404 if ($run_variation) | |
405 { | |
406 print "Variation analysis: yes\n"; | |
407 }else | |
408 { | |
409 print "Variation analysis: no\n"; | |
410 } | |
411 if ($run_evolution) | |
412 { | |
413 print "Evolution analysis: yes\n"; | |
414 print " Bootstrap: $bootstrap\n"; | |
415 }else | |
416 { | |
417 print "Evolution analysis: no\n"; | |
418 } | |
419 if ($run_function) | |
420 { | |
421 print "Function analysis: yes\n"; | |
422 }else | |
423 { | |
424 print "Function analysis: no\n"; | |
425 } | |
426 | |
427 $section=$run_cluster.$run_pangenome.$run_variation.$run_evolution.$run_function; | |
428 | |
429 ############################################### | |
430 # section 0) check input file and program | |
431 ############################################### | |
432 if (!(-e $outputDIR)) | |
433 { | |
434 system("mkdir $outputDIR"); | |
435 } | |
436 system("chmod +rw $outputDIR"); | |
437 | |
438 if (!(-w $outputDIR)) | |
439 { | |
440 print "There is no WRITE permission in $outputDIR\n"; | |
441 exit; | |
442 } | |
443 @tmp=qw(); | |
444 &CheckInputFile(\@species,$inputDIR,$section,$method,\@tmp); | |
445 &CheckExtraProgram($section,$method,\@tmp); | |
446 | |
447 if (scalar(@tmp)>0) | |
448 { | |
449 open(R,">".$outputDIR."0.error.message"); | |
450 print R join("",@tmp)."\n"; | |
451 close(R); | |
452 print "error!\nlog are saved in ${outputDIR}0.error.message\n"; | |
453 exit; | |
454 } | |
455 | |
456 | |
457 ############################################ | |
458 # section 1) cluster analysis | |
459 ############################################ | |
460 | |
461 if ($run_cluster) | |
462 { | |
463 print "\n\n############################################\n"; | |
464 print "# section 1) cluster analysis\n"; | |
465 print "############################################\n\n\n"; | |
466 | |
467 #### cluster gene and return result to the array @clusters | |
468 | |
469 if ($method eq "MP") | |
470 { | |
471 print "Begin cluster gene with MP method ...\n"; | |
472 &MP(); | |
473 }else | |
474 { | |
475 print "Begin cluster gene with GF method ...\n"; | |
476 &GF(); | |
477 } | |
478 | |
479 #### output normal cluster format | |
480 | |
481 &FormatClusterOutPut(\@species,"${outputDIR}1.Orthologs_Cluster.txt",\@clusters); | |
482 | |
483 #### Retrieve cluster | |
484 | |
485 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); | |
486 | |
487 ##### gene distribution in each strains | |
488 %tmpHash=(); | |
489 &GeneDistribution(\@clusters,\%tmpHash); | |
490 | |
491 open(R,">${outputDIR}1.Gene_Distribution_By_Conservation.txt"); | |
492 print R "SharedBy_Strains\t".join("\t",@species)."\n"; | |
493 | |
494 for ($i=$spnum;$i>0;$i--) | |
495 { | |
496 print R $i; | |
497 for ($j=0;$j<$spnum;$j++) | |
498 { | |
499 if (exists($tmpHash{$i."|".$j})) | |
500 { | |
501 print R "\t".$tmpHash{$i."|".$j}; | |
502 }else | |
503 { | |
504 print R "\t0"; | |
505 } | |
506 } | |
507 print R "\n"; | |
508 } | |
509 | |
510 close(R); | |
511 | |
512 | |
513 }else | |
514 { | |
515 print "Homologous gene clustering is skipped!\n"; | |
516 } | |
517 | |
518 | |
519 if ($run_pangenome) | |
520 { | |
521 print "\n\n############################################\n"; | |
522 print "# section 2) Pan-genome analysis\n"; | |
523 print "############################################\n\n\n"; | |
524 | |
525 #### Retrieve cluster | |
526 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); | |
527 chomp(@clusters); | |
528 | |
529 #### convert file into 0-1 matrix | |
530 for ($line=0;$line<@clusters;$line++) | |
531 { | |
532 @row=split(/\t/,$clusters[$line]); | |
533 splice(@row,0,1); | |
534 for ($i=0;$i<@row;$i++) | |
535 { | |
536 if ($row[$i] eq "-") | |
537 { | |
538 $row[$i]=0; | |
539 }else | |
540 { | |
541 $row[$i]=1; | |
542 } | |
543 } | |
544 $clusters[$line]=join("\t",@row); | |
545 } | |
546 | |
547 #### fetch gene number of each strains | |
548 for ($i=0;$i<$spnum;$i++) | |
549 { | |
550 open(F,"$inputDIR$species[$i].pep"); | |
551 @tmp=<F>; | |
552 close(F); | |
553 @tmp=grep(/^>/,@tmp); | |
554 $genenum{$species[$i]}=scalar(@tmp); | |
555 } | |
556 | |
557 #### pan genome size and core genome size | |
558 print "Deducing pan genome size and core genome size for each composition...\n\n"; | |
559 | |
560 open(PAN,">${outputDIR}2.PanGenome.Data.txt"); | |
561 print PAN "ClusterConservation\tTotalGeneNumber\tPanGenome\tCoreGenome\n"; | |
562 | |
563 for ($i=1;$i<=scalar(@species);$i++) | |
564 { | |
565 #@SpecieCombination=&Combination(\@species,$i); | |
566 #@SpecieCombination=&Combination($spnum,$i); | |
567 if (&ChkCombinationValue($spnum,$i) !=0) ### transfer the array reference to the subroutine | |
568 { | |
569 &Combination($spnum,$i,\@SpecieCombination); ## if the combination number is less than sampleSize, then fecth all, else sample | |
570 }else | |
571 { | |
572 &SampleCombination($spnum,$i,\@SpecieCombination); | |
573 } | |
574 | |
575 foreach $key (@SpecieCombination) | |
576 { | |
577 ##### count total gene number in current combination | |
578 $tmp=0; | |
579 @spID=split(/\t/,$key); #### speices id in current combination | |
580 foreach (@spID) | |
581 { | |
582 $tmp=$tmp+$genenum{$species[$_]}; | |
583 } | |
584 ##### scan pangenome and coregenome | |
585 @tmp=split(/\t/,&PanGenomeNumber(\@spID)); | |
586 print PAN "$i\t$tmp\t".join("\t",@tmp)."\n"; | |
587 | |
588 } | |
589 } | |
590 | |
591 close(PAN); | |
592 | |
593 #### data fit | |
594 | |
595 #### for model A | |
596 | |
597 if ($spnum<3) | |
598 { | |
599 print "There are $spnum strains. For pan-genome function fitting, at least 3 strains data are required.\n"; | |
600 }else | |
601 { | |
602 open(R,">${outputDIR}2.PanGenome.Profile.txt"); | |
603 ##### genome number & pan-genome size | |
604 @xdata=qw(); | |
605 @ydata=qw(); | |
606 &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,2); | |
607 &SumData(\@xdata,\@ydata,"mean"); | |
608 ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_A(\@xdata,\@ydata); | |
609 print R "The relation bewteen genome number and pan-genome size\n\n"; | |
610 print R "Function model: y=A*x**B +C \n"; | |
611 print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n"; | |
612 print R "Fitting result:\n"; | |
613 print R "\ty = $fit_A *x**$fit_B + $fit_C\n"; | |
614 print R "\tR-square = $fit_Rsquare\n"; | |
615 print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n"; | |
616 print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n"; | |
617 | |
618 ##### total gene number & pan-genome size | |
619 #@xdata=qw(); | |
620 #@ydata=qw(); | |
621 #&ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,1,\@ydata,2); | |
622 #&SumDataByMedian(\@xdata,\@ydata); | |
623 #($fit_Rsquare, $fit_A, $fit_B, $fit_C)=&fit_model_A(\@xdata,\@ydata); | |
624 #print R "The relation bewteen total gene number and pan-genome size\n\n"; | |
625 #print R "$fit_Rsquare, $fit_A, $fit_B, $fit_C\n"; | |
626 #print R "\ty = $fit_A *x**$fit_B + $fit_C R-square = $fit_Rsquare\n"; | |
627 #print R "\tx: total gene number\n"; | |
628 #print R "\ty: pan-genome size\n\n\n\n\n"; | |
629 | |
630 ##### genome number & core genome | |
631 @xdata=qw(); | |
632 @ydata=qw(); | |
633 &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,3); | |
634 &SumData(\@xdata,\@ydata,"mean"); | |
635 ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_B(\@xdata,\@ydata); | |
636 print R "The relation bewteen genome number and core genome size\n\n"; | |
637 print R "Function model: y=A*exp(B*x) +C \n"; | |
638 print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n"; | |
639 print R "Fitting result:\n"; | |
640 print R "\ty = $fit_A *exp($fit_B * x) + $fit_C R-square = $fit_Rsquare\n"; | |
641 print R "\tR-square = $fit_Rsquare\n"; | |
642 print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n"; | |
643 print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n"; | |
644 close(R); | |
645 } | |
646 | |
647 } | |
648 | |
649 | |
650 ############################################ | |
651 # section 3) CDS variation analysis | |
652 ############################################ | |
653 | |
654 if ($run_variation) | |
655 { | |
656 print "\n\n############################################\n"; | |
657 print "# section 3) CDS variation analysis\n"; | |
658 print "############################################\n\n\n"; | |
659 | |
660 #### Retrieve cluster | |
661 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); | |
662 chomp(@clusters); | |
663 | |
664 ## protein | |
665 system("rm -rf *.pep"); | |
666 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file | |
667 system("cat *.pep > All.faa && rm -rf *.pep && mv All.faa All.pep"); | |
668 &ReadSequenceInToHash("All.pep",\%pep); | |
669 ## nucleic | |
670 system("rm -rf *.nuc"); | |
671 &PrepareFasta(\@species,$inputDIR,".nuc"); ###prepare nuc file | |
672 system("cat *.nuc > All.ffn && rm -rf *.nuc && mv All.ffn All.nuc"); | |
673 &ReadSequenceInToHash("All.nuc",\%nuc); | |
674 | |
675 ## scanning SNP | |
676 %nt4tree=(); | |
677 for ($i=0;$i<$spnum;$i++) | |
678 { | |
679 $nt4tree{"S".$i}=""; | |
680 } | |
681 | |
682 open(VAR,">${outputDIR}3.CDS.variation.txt"); | |
683 print VAR "ClusterID\tStrains_Number\tGene_Number\tPosition\taaType\tntType\tntProfile\tVariation type\n"; | |
684 | |
685 open(VA,">${outputDIR}3.CDS.variation.analysis.txt"); | |
686 print VA "ClusterID\tInDel Base\tNonsynonymous mutation\tSynonymous mutation\n"; | |
687 | |
688 for ($line=0;$line<@clusters;$line++) | |
689 { | |
690 @row=split(/\t|\,/,$clusters[$line]); | |
691 $ClusterID=$row[0]; | |
692 splice(@row,0,1); | |
693 @row=grep(/^S/,@row); | |
694 if (scalar(@row) >=2) | |
695 { | |
696 open(PEP,">$ClusterID.pep"); | |
697 open(NUC,">$ClusterID.nuc"); | |
698 foreach $key (@row) | |
699 { | |
700 print PEP ">$key\n$pep{$key}\n"; | |
701 print NUC ">$key\n$nuc{$key}\n"; | |
702 } | |
703 close(PEP); | |
704 close(NUC); | |
705 system("$mafft --quiet $ClusterID.pep > $ClusterID.pal"); | |
706 #system("perl ./pal2nal.pl $ClusterID.pal $ClusterID.nuc -output fasta > $ClusterID.nal"); | |
707 $tmp=&pal2nal("$ClusterID.pal","$ClusterID.nuc","$ClusterID.nal"); | |
708 if ($tmp == 0) | |
709 { | |
710 system("rm -rf $ClusterID.*"); | |
711 next; | |
712 } | |
713 | |
714 @tmp=&DetectSNP(); | |
715 if (scalar(@tmp)>0) | |
716 { | |
717 print VA $ClusterID."\t".&VarAnalysis(\@tmp)."\n"; | |
718 print VAR join("",@tmp); | |
719 ### core orthologs | |
720 @row=split(/\t/,$clusters[$line]); | |
721 splice(@row,0,1); | |
722 if ((&CountGeneInCluster(join("\t",@row)) ==$spnum) and (&CountSpeicesInCluster(join("\t",@row)) == $spnum) ) | |
723 { | |
724 $count_tree++; | |
725 %tmpHash=(); | |
726 foreach (@row) | |
727 { | |
728 $tmpHash{$_}=""; | |
729 } | |
730 &RemoveHeadGap("$ClusterID.nal",\%tmpHash); | |
731 &ExtractSNP4tree(\%tmpHash,\%nt4tree); | |
732 } | |
733 } | |
734 system("rm -rf $ClusterID.*"); | |
735 } | |
736 } | |
737 close(VAR); | |
738 close(VA); | |
739 | |
740 open(R,">${outputDIR}3.CDS.variation.for.evolution.txt"); | |
741 foreach $key (keys %nt4tree) | |
742 { | |
743 $_=$key; | |
744 s/s//gi; | |
745 print R ">$species[$_]\n$nt4tree{$key}\n"; | |
746 } | |
747 close(R); | |
748 print $count_tree."\n\n"; | |
749 | |
750 ### | |
751 system("rm All.nuc All.pep"); | |
752 }else | |
753 { | |
754 print "CDS variation is skipped.\n"; | |
755 } | |
756 | |
757 ############################################ | |
758 # section 4) CDS variation analysis | |
759 ############################################ | |
760 | |
761 if ($run_evolution) | |
762 { | |
763 #### Retrieve cluster | |
764 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); | |
765 chomp(@clusters); | |
766 | |
767 | |
768 | |
769 ################## | |
770 ## | |
771 ## Distance based | |
772 ## | |
773 ################## | |
774 | |
775 ### caculate the distance between each two strains by cluster | |
776 &ClusterProfil4Specie(\%tmpHash); # caculate Clusters profile for each specie | |
777 &DistanceMatrix($spnum,\%tmpHash); # caculate distance matrix acoording to Clusters profile | |
778 ###output distance | |
779 open(DIST,">${outputDIR}4.Species_Distance_Clusters_Based.txt"); | |
780 ###header | |
781 printf DIST "%5d", $spnum; | |
782 print DIST "\n"; | |
783 foreach $i (0..($spnum-1)) | |
784 { | |
785 $key="sp".$i."sp"; | |
786 printf DIST "%-10s",$key; | |
787 foreach $j (0..($spnum-1)) | |
788 { | |
789 printf DIST " %8f",$tmpHash{$i."-".$j}; | |
790 } | |
791 print DIST "\n"; | |
792 } | |
793 close(DIST); | |
794 | |
795 %tmpHash=(); | |
796 | |
797 ### based on pan genome (distance) | |
798 print "\nDraw pangenome based phylogenetic tree ...\n\n"; | |
799 | |
800 &PanBasedTree("${outputDIR}4.Species_Distance_Clusters_Based.txt","${outputDIR}4.PanBased"); | |
801 | |
802 ################## | |
803 ## | |
804 ## SNP based | |
805 ## | |
806 ################## | |
807 %tmpHash=(); | |
808 if (!(-e "${outputDIR}3.CDS.variation.for.evolution.txt")) | |
809 { | |
810 print "Variation in core orthologs cluster is not found from ${outputDIR}3.CDS.variation.for.evolution.txt.\n"; | |
811 print "Maybe you have skipped CDS variation analysis.\n"; | |
812 }else | |
813 { | |
814 &ReadSequenceInToHash("${outputDIR}3.CDS.variation.for.evolution.txt",\%tmpHash); | |
815 open(R,">mlst.aln"); | |
816 for ($i=0;$i<@species;$i++) | |
817 { | |
818 print R ">sp${i}sp\n".$tmpHash{$species[$i]}."\n"; | |
819 } | |
820 close(R); | |
821 &fasta2phylip("mlst.aln","mlst.phylip"); | |
822 system("rm -rf mlst.aln"); | |
823 | |
824 print "\nDraw SNP based phylogenetic tree ...\n\n"; | |
825 &SNPBasedTree("mlst.phylip","${outputDIR}4.SNPBased"); | |
826 system("rm -rf mlst.phylip") | |
827 } | |
828 | |
829 ######## | |
830 # replace speices name | |
831 ######## | |
832 | |
833 opendir(DIR,"${outputDIR}"); | |
834 @tmp=readdir(DIR); | |
835 closedir(DIR); | |
836 @tmp=grep(/^4/,@tmp); | |
837 foreach $tmp (@tmp) | |
838 { | |
839 &ReplaceName(\@species,"${outputDIR}$tmp"); | |
840 } | |
841 }else | |
842 { | |
843 print "Evolution analysis is skipped.\n"; | |
844 } | |
845 | |
846 | |
847 ############################################ | |
848 # section 4) Function analysis | |
849 ############################################ | |
850 | |
851 if ($run_function) | |
852 { | |
853 #### Retrieve cluster | |
854 &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); | |
855 chomp(@clusters); | |
856 | |
857 #### prepare annotation file | |
858 &PrepareTable(\@species,$inputDIR,".function"); ###prepare location file | |
859 &ReadAnnotation(\@species,\%cog,\%description); | |
860 | |
861 | |
862 #### assign function | |
863 open(R,">${outputDIR}5.Orthologs_Cluster_Function.txt"); | |
864 print R "ClusterID\tConservation_Level\tCOG\tDescription\n"; | |
865 for ($i=0;$i<@clusters;$i++) | |
866 { | |
867 @row=split(/\t/,$clusters[$i]); | |
868 $ClusterID=$row[0]; | |
869 splice(@row,0,1); | |
870 print R $ClusterID."\t".&CountSpeicesInCluster(join("\t",@row))."\t".&getCOG(\@row,\%cog)."\t".&getDescription(\@row,\%description)."\n"; | |
871 } | |
872 close(R); | |
873 | |
874 #### COG distribution | |
875 | |
876 ###Whole Clusters COG Distribution | |
877 &outputCOGStatistic("${outputDIR}5.Orthologs_Whole_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,1)); | |
878 | |
879 ###Core Clusters COG Distribution | |
880 &outputCOGStatistic("${outputDIR}5.Orthologs_Core_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,$spnum)); | |
881 | |
882 ###Dispensable Clusters COG Distribution | |
883 &outputCOGStatistic("${outputDIR}5.Orthologs_Dispensable_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",($spnum-1),2)); | |
884 | |
885 ###strains specifc Clusters COG Distribution | |
886 &outputCOGStatistic("${outputDIR}5.Orthologs_specifc_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",1,1)); | |
887 | |
888 system("rm -rf *.function"); | |
889 | |
890 }else | |
891 { | |
892 print "Function analysis is skipped.\n"; | |
893 } | |
894 | |
895 sub outputCOGStatistic() | |
896 { | |
897 (my $file,my $subcogcount)=@_; | |
898 my @cogcat=("J A K L B","D Y V T M N Z W U O","C G E F H I P Q","R S -"); | |
899 my @cogdesc=("INFORMATION STORAGE AND PROCESSING","CELLULAR PROCESSES AND SIGNALING","METABOLISM","POORLY CHARACTERIZED"); | |
900 my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -); | |
901 my @subcogdesc=("[J] Translation, ribosomal structure and biogenesis","[A] RNA processing and modification","[K] Transcription","[L] Replication, recombination and repair","[B] Chromatin structure and dynamics","[D] Cell cycle control, cell division, chromosome partitioning","[Y] Nuclear structure","[V] Defense mechanisms","[T] Signal transduction mechanisms","[M] Cell wall/membrane/envelope biogenesis","[N] Cell motility","[Z] Cytoskeleton","[W] Extracellular structures","[U] Intracellular trafficking, secretion, and vesicular transport","[O] Posttranslational modification, protein turnover, chaperones","[C] Energy production and conversion","[G] Carbohydrate transport and metabolism","[E] Amino acid transport and metabolism","[F] Nucleotide transport and metabolism","[H] Coenzyme transport and metabolism","[I] Lipid transport and metabolism","[P] Inorganic ion transport and metabolism","[Q] Secondary metabolites biosynthesis, transport and catabolism","[R] General function prediction only","[S] Function unknown","[-] Unclassified"); | |
902 my %subcogdesc; | |
903 my $key; | |
904 my @cog; | |
905 my $i; | |
906 my $cognum; | |
907 | |
908 for ($i=0;$i<@subcogcat;$i++) | |
909 { | |
910 $subcogdesc{$subcogcat[$i]}=$subcogdesc[$i]; | |
911 } | |
912 | |
913 open(R,">$file"); | |
914 for ($i=0;$i<@cogcat;$i++) | |
915 { | |
916 $cognum=0; | |
917 foreach $key (split(" ",$cogcat[$i])) | |
918 { | |
919 $cognum=$cognum+$$subcogcount{$key}; | |
920 } | |
921 print R $cogdesc[$i]." ( ".$cognum." )\n"; | |
922 foreach $key (split(" ",$cogcat[$i])) | |
923 { | |
924 printf R "%-6d %s\n",$$subcogcount{$key},$subcogdesc{$key}; | |
925 } | |
926 print R "\n"; | |
927 | |
928 } | |
929 close(R); | |
930 | |
931 } | |
932 | |
933 sub scanCOG() | |
934 { | |
935 (my $file,my $max_orth,my $min_orth)=@_; | |
936 my @row; | |
937 my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -); | |
938 my %subcogcount; | |
939 my $cog; | |
940 my $key; | |
941 | |
942 foreach $key (@subcogcat) | |
943 { | |
944 $subcogcount{$key}=0; | |
945 } | |
946 | |
947 @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S); | |
948 | |
949 open(F,"$file"); | |
950 $_=<F>; | |
951 while (<F>) | |
952 { | |
953 @row=split(/\t/,$_); | |
954 if ($row[1]>=$min_orth and $row[1]<=$max_orth) | |
955 { | |
956 if ($row[2] eq "-") | |
957 { | |
958 $subcogcount{"-"}++; | |
959 }else | |
960 { | |
961 $_=uc($row[2]); | |
962 s/COG//gi; | |
963 $cog=$_; | |
964 foreach $key (@subcogcat) | |
965 { | |
966 if ($cog=~/$key/) | |
967 { | |
968 $subcogcount{$key}++; | |
969 } | |
970 } | |
971 } | |
972 } | |
973 } | |
974 close(F); | |
975 | |
976 return \%subcogcount; | |
977 } | |
978 | |
979 | |
980 | |
981 | |
982 sub getCOG() | |
983 { | |
984 (my $data,my $coghash)=@_; | |
985 my $cog=""; | |
986 my @cog; | |
987 my $key; | |
988 my %hash; | |
989 my @gene=split(/\t|\,/,join("\t",@$data)); | |
990 @gene=grep(/^S/,@gene); | |
991 | |
992 foreach $key (@gene) | |
993 { | |
994 if (($$coghash{$key} ne "-") and ($$coghash{$key} ne "")) | |
995 { | |
996 $cog=$cog.",".$$coghash{$key}; | |
997 } | |
998 } | |
999 @cog=split(/,/,$cog); | |
1000 foreach $cog (@cog) | |
1001 { | |
1002 if ($cog ne "") | |
1003 { | |
1004 $hash{$cog}=1; | |
1005 } | |
1006 } | |
1007 | |
1008 $cog=join(",",(keys %hash)); | |
1009 if ($cog eq "") | |
1010 { | |
1011 $cog="-"; | |
1012 } | |
1013 return $cog; | |
1014 } | |
1015 | |
1016 sub getDescription() | |
1017 { | |
1018 (my $data,my $deschash)=@_; | |
1019 my $desc=""; | |
1020 my $key; | |
1021 my @gene=split(/\t|\,/,join("\t",@$data)); | |
1022 @gene=grep(/^S/,@gene); | |
1023 | |
1024 foreach $key (@gene) | |
1025 { | |
1026 if ( ($$deschash{$key} ne "") and ($$deschash{$key} ne "-") and ($$deschash{$key}!~/hypothetical/)) | |
1027 { | |
1028 $desc=$$deschash{$key}; | |
1029 } | |
1030 } | |
1031 | |
1032 if ($desc eq "") | |
1033 { | |
1034 $desc="hypothetical protein"; | |
1035 } | |
1036 | |
1037 return $desc; | |
1038 } | |
1039 | |
1040 | |
1041 | |
1042 sub ReadAnnotation() | |
1043 { | |
1044 (my $species,my $cog,my $description)=@_; | |
1045 my $i; | |
1046 my @row; | |
1047 | |
1048 for ($i=0;$i<@$species;$i++) | |
1049 { | |
1050 open(F,"$$species[$i].function"); | |
1051 while (<F>) | |
1052 { | |
1053 chomp($_); | |
1054 @row=split(/\t/,$_); | |
1055 if (scalar(@row)>=2) | |
1056 { | |
1057 $$cog{$row[0]}=$row[1]; | |
1058 }else | |
1059 { | |
1060 $$cog{$row[0]}="-"; | |
1061 } | |
1062 | |
1063 if (scalar(@row)>=3) | |
1064 { | |
1065 $$description{$row[0]}=$row[2]; | |
1066 }else | |
1067 { | |
1068 $$description{$row[0]}="hypothetical protein"; | |
1069 } | |
1070 } | |
1071 close(F); | |
1072 } | |
1073 } | |
1074 | |
1075 | |
1076 sub SNPBasedTree() | |
1077 { | |
1078 (my $infile,my $outfileprefix)=@_; | |
1079 my $tmpin=$infile; | |
1080 my $tmpout; | |
1081 | |
1082 | |
1083 #### boootstrap | |
1084 print "\n#### seqboot ...\n\n"; | |
1085 open(R,">seqboot.cmd"); | |
1086 #print R "$tmpin\n"; | |
1087 print R "R\n"; | |
1088 print R "$bootstrap\n"; | |
1089 print R "Y\n"; | |
1090 print R "1\n"; | |
1091 close(R); | |
1092 | |
1093 system("cp $tmpin infile"); | |
1094 system("$seqboot < seqboot.cmd"); | |
1095 system("mv outfile 100dnaseq"); | |
1096 system("rm -rf infile"); | |
1097 system("rm seqboot.cmd"); # 100dnasseq | |
1098 | |
1099 #### dnaml | |
1100 print "\n#### dnaml ...\n\n"; | |
1101 open(R,">dnaml.cmd"); | |
1102 #print R "100dnaseq\n"; | |
1103 print R "T\n"; | |
1104 print R "25\n"; | |
1105 if ($bootstrap>1) | |
1106 { | |
1107 print R "M\n"; | |
1108 print R "D\n"; | |
1109 #print R "100\n"; | |
1110 print R "$bootstrap\n"; | |
1111 print R "1\n"; # Random number seed (must be odd)? | |
1112 print R "5\n"; # Number of times to jumble? | |
1113 } | |
1114 print R "Y\n"; | |
1115 close(R); | |
1116 | |
1117 system("cp 100dnaseq infile"); | |
1118 system("$dnaml < dnaml.cmd"); | |
1119 system("rm -rf outfile"); | |
1120 system("rm -rf infile"); | |
1121 system("mv outtree 100dnaseqtree"); # 100dnaseq, 100dnaseqtree | |
1122 | |
1123 #### consense | |
1124 print "\n#### dnaml consense ...\n\n"; | |
1125 | |
1126 open(R,">consense.cmd"); | |
1127 #print R "100dnaseqtree\n"; | |
1128 print R "Y\n"; | |
1129 close(R); | |
1130 | |
1131 system("cp 100dnaseqtree intree"); | |
1132 system("$consense < consense.cmd"); | |
1133 system("mv outfile ${outfileprefix}.ML.outfile"); | |
1134 system("mv outtree ${outfileprefix}.ML.tree"); | |
1135 system("rm -rf infile"); | |
1136 system("rm -rf 100dnaseqtree"); # 100dnaseq | |
1137 | |
1138 #### dnadist | |
1139 print "\n#### dnadist ...\n\n"; | |
1140 open(R,">dnadist.cmd"); | |
1141 #print R "100dnaseq\n"; | |
1142 print R "T\n"; | |
1143 print R "25\n"; | |
1144 if ($bootstrap>1) | |
1145 { | |
1146 print R "M\n"; | |
1147 print R "D\n"; | |
1148 #print R "100\n"; | |
1149 print R "$bootstrap\n"; | |
1150 } | |
1151 print R "Y\n"; | |
1152 close(R); | |
1153 | |
1154 system("cp 100dnaseq infile"); | |
1155 system("$dnadist < dnadist.cmd"); | |
1156 system("rm -rf 100dnaseq"); | |
1157 system("rm -rf infile"); | |
1158 system("mv outfile 100dnadist"); # 100dnadist | |
1159 | |
1160 #### Neighbor-joining tree | |
1161 print "\n#### Neighbor-joining ...\n\n"; | |
1162 open(R,">NJ.cmd"); | |
1163 if ($bootstrap>1) | |
1164 { | |
1165 #print R "100dnadist\n"; | |
1166 print R "M\n"; | |
1167 #print R "100\n"; | |
1168 print R "$bootstrap\n"; | |
1169 print R "1\n"; | |
1170 } | |
1171 print R "Y\n"; | |
1172 close(R); | |
1173 | |
1174 system("cp 100dnadist infile"); | |
1175 system("$neighbor < NJ.cmd"); | |
1176 system("mv outtree 100dnadistNJtree"); | |
1177 system("rm outfile"); | |
1178 system("rm -rf infile"); | |
1179 system("rm -rf NJ.cmd"); # 100dnadist,100dnadistNJtree | |
1180 | |
1181 #### NJ-consense | |
1182 print "\n#### NJ-consense ...\n\n"; | |
1183 open(R,">NJ-consense.cmd"); | |
1184 #print R "100dnadistNJtree\n"; | |
1185 print R "Y\n"; | |
1186 close(R); | |
1187 | |
1188 system("cp 100dnadistNJtree intree"); | |
1189 system("$consense < NJ-consense.cmd"); | |
1190 system("mv outfile ${outfileprefix}.Neighbor-joining.outfile"); | |
1191 system("mv outtree ${outfileprefix}.Neighbor-joining.tree"); | |
1192 system("rm -rf NJ-consense.cmd"); | |
1193 system("rm -rf intree"); | |
1194 system("rm -rf 100dnadistNJtree"); | |
1195 | |
1196 | |
1197 #### UPGMA tree | |
1198 print "\n#### UPGMA ...\n\n"; | |
1199 open(R,">UPGMA.cmd"); | |
1200 #print R "100dnadist\n"; | |
1201 print R "N\n"; | |
1202 if ($bootstrap>1) | |
1203 { | |
1204 print R "M\n"; | |
1205 #print R "100\n"; | |
1206 print R "$bootstrap\n"; | |
1207 print R "1\n"; | |
1208 } | |
1209 print R "Y\n"; | |
1210 close(R); | |
1211 | |
1212 system("cp 100dnadist infile"); | |
1213 system("$neighbor < UPGMA.cmd"); | |
1214 system("mv outtree 100dnadistUPGMAtree"); | |
1215 system("rm -rf outfile"); | |
1216 system("rm -rf infile"); | |
1217 system("rm -rf UPGMA.cmd"); | |
1218 | |
1219 #### UPGMA-consense | |
1220 print "\n#### UPGMA-consense ...\n\n"; | |
1221 open(R,">UPGMA-consense.cmd"); | |
1222 #print R "100dnadistUPGMAtree\n"; | |
1223 print R "Y\n"; | |
1224 close(R); | |
1225 | |
1226 system("cp 100dnadistUPGMAtree intree"); | |
1227 system("$consense < UPGMA-consense.cmd"); | |
1228 system("mv outfile ${outfileprefix}.UPGMA.outfile"); | |
1229 system("mv outtree ${outfileprefix}.UPGMA.tree"); | |
1230 system("rm -rf UPGMA-consense.cmd"); | |
1231 system("rm -rf 100dnadistUPGMAtree"); | |
1232 system("rm -rf intree"); | |
1233 | |
1234 ###CLEAN TMP FILE | |
1235 | |
1236 system("rm -rf *.cmd"); | |
1237 system("rm -rf 100dnadist"); | |
1238 } | |
1239 | |
1240 sub PanBasedTree() | |
1241 { | |
1242 (my $infile,my $outfileprefix)=@_; | |
1243 my $tmpin; | |
1244 my $tmpout; | |
1245 | |
1246 $tmpin=$infile; | |
1247 #### Neighbor-joining tree | |
1248 | |
1249 open(R,">NJ.cmd"); | |
1250 #print R "$tmpin\n"; | |
1251 print R "Y\n"; | |
1252 close(R); | |
1253 | |
1254 system("cp $tmpin infile"); | |
1255 system("$neighbor < NJ.cmd"); | |
1256 system("mv outfile ${outfileprefix}.Neighbor-joining.outfile"); | |
1257 system("mv outtree ${outfileprefix}.Neighbor-joining.tree"); | |
1258 system("rm -rf NJ.cmd"); | |
1259 system("rm -rf infile"); | |
1260 | |
1261 #### UPGMA tree | |
1262 | |
1263 open(R,">UPGMA.cmd"); | |
1264 #print R "$tmpin\n"; | |
1265 print R "N\n"; | |
1266 print R "Y\n"; | |
1267 close(R); | |
1268 | |
1269 system("cp $tmpin infile"); | |
1270 system("$neighbor < UPGMA.cmd"); | |
1271 system("mv outfile ${outfileprefix}.UPGMA.outfile"); | |
1272 system("mv outtree ${outfileprefix}.UPGMA.tree"); | |
1273 system("rm -rf UPGMA.cmd"); | |
1274 system("rm -rf infile"); | |
1275 | |
1276 | |
1277 ###CLEAN TMP FILE | |
1278 system("rm -rf *.cmd"); | |
1279 } | |
1280 | |
1281 sub DistanceMatrix() | |
1282 { | |
1283 (my $spnum,my $hash)=@_; | |
1284 my $i; | |
1285 my $j; | |
1286 my $k; | |
1287 my $dist; | |
1288 my $ref; | |
1289 my $query; | |
1290 foreach $i (0..($spnum-1)) | |
1291 { | |
1292 foreach $j ($i..($spnum-1)) | |
1293 { | |
1294 $ref=$$hash{$i}; | |
1295 $query=$$hash{$j}; | |
1296 $dist=0; | |
1297 for ($k=0;$k<length($ref);$k++) | |
1298 { | |
1299 if (substr($ref,$k,1) ne substr($query,$k,1)) | |
1300 { | |
1301 $dist++; | |
1302 } | |
1303 } | |
1304 $$hash{$i."-".$j}=$dist; | |
1305 $$hash{$j."-".$i}=$dist; | |
1306 } | |
1307 } | |
1308 } | |
1309 | |
1310 | |
1311 sub ClusterProfil4Specie | |
1312 { | |
1313 (my $hash)=@_; | |
1314 my @row; | |
1315 my $i; | |
1316 | |
1317 foreach (0..($spnum-1)) #initialization Hash | |
1318 { | |
1319 $$hash{$_}=""; | |
1320 } | |
1321 | |
1322 foreach (@clusters) | |
1323 { | |
1324 @row=split(/\t/,$_); | |
1325 splice(@row,0,1); | |
1326 if (&CountSpeicesInCluster(join("\t",@row))>1) | |
1327 { | |
1328 for ($i=0;$i<@row;$i++) | |
1329 { | |
1330 if ($row[$i] eq "-") | |
1331 { | |
1332 $$hash{$i}=$$hash{$i}."0"; | |
1333 }else | |
1334 { | |
1335 $$hash{$i}=$$hash{$i}."1"; | |
1336 } | |
1337 } | |
1338 } | |
1339 } | |
1340 } | |
1341 | |
1342 # &ExtractSNP4tree(\%tmpHash,\%nt4tree); | |
1343 | |
1344 sub ExtractSNP4tree() | |
1345 { | |
1346 (my $hash,my $nt4treeRef)=@_; | |
1347 my $key; | |
1348 my @row; | |
1349 my $i; | |
1350 my $len; | |
1351 my @tribases; | |
1352 foreach $key (keys %$hash) | |
1353 { | |
1354 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key}; | |
1355 delete($$hash{$key}); | |
1356 } | |
1357 | |
1358 for ($i=0;$i<$spnum;$i++) | |
1359 { | |
1360 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$$hash{"S".$i}; | |
1361 } | |
1362 } | |
1363 | |
1364 | |
1365 =pod | |
1366 sub ExtractSNP4tree() | |
1367 { | |
1368 (my $hash,my $nt4treeRef)=@_; | |
1369 my $key; | |
1370 my @row; | |
1371 my $i; | |
1372 my $len; | |
1373 my @tribases; | |
1374 foreach $key (keys %$hash) | |
1375 { | |
1376 $$hash{substr($key,0,index($key,"G"))}=$$hash{$key}; | |
1377 delete($$hash{$key}); | |
1378 } | |
1379 @_=(keys %$hash); | |
1380 $len=length($_[0]); | |
1381 for ($j=0;3*$j<$len;$j++) | |
1382 { | |
1383 ##### scanning each codon | |
1384 for ($i=0;$i<$spnum;$i++) | |
1385 { | |
1386 $tribases[$i]=substr($$hash{"S".$i},3*$j,3); | |
1387 } | |
1388 ##### checking each codon | |
1389 if (&IsTheSame(@tribases) ==0) | |
1390 { | |
1391 for ($i=0;$i<@tribases;$i++) | |
1392 { | |
1393 $nt4tree{"S".$i}=$nt4tree{"S".$i}.$tribases[$i]; | |
1394 } | |
1395 } | |
1396 } | |
1397 } | |
1398 =cut | |
1399 | |
1400 | |
1401 sub pal2nal() | |
1402 { | |
1403 (my $pal,my $nuc, my $nal)=@_; | |
1404 my %aaAln=(); | |
1405 my %ffn=(); | |
1406 my %ntAln=(); | |
1407 my %nt; | |
1408 my $dna; | |
1409 my $nt; | |
1410 my $key; | |
1411 my $flag=1; | |
1412 my $i=0; | |
1413 my $j; | |
1414 | |
1415 ### read protein aligment result | |
1416 &ReadAlignmentToHash("$pal",\%aaAln); | |
1417 ### read nt sequences | |
1418 &ReadSequenceInToHash("$nuc",\%ffn); | |
1419 foreach $key (keys %ffn) | |
1420 { | |
1421 $dna=$ffn{$key}; | |
1422 #if (int(length($nt{$key})/3)*3 ne length($nt{$key})) | |
1423 if (int(length($dna)/3)*3 ne length($dna)) | |
1424 { | |
1425 $flag=0; | |
1426 print "The length of nucleotide sequence is not 3 integer times.\n"; | |
1427 last; | |
1428 }else | |
1429 { | |
1430 for ($i=0;$i<(length($dna)/3);$i++) | |
1431 { | |
1432 $nt{$key."|".$i}=substr($dna,$i*3,3); | |
1433 } | |
1434 } | |
1435 } | |
1436 | |
1437 if ($flag==0) | |
1438 { | |
1439 return 0; | |
1440 }else | |
1441 { | |
1442 foreach $key (keys %aaAln) ### replace aa with corresponding nt | |
1443 { | |
1444 $nt=""; | |
1445 $i=0; | |
1446 for ($j=0;$j<length($aaAln{$key});$j++) | |
1447 { | |
1448 if (substr($aaAln{$key},$j,1) eq "-") | |
1449 { | |
1450 $nt=$nt."---"; | |
1451 }else | |
1452 { | |
1453 $nt=$nt.$nt{$key."|".$i}; | |
1454 $i++; | |
1455 } | |
1456 } | |
1457 $ntAln{$key}=$nt; | |
1458 } | |
1459 | |
1460 ### output | |
1461 open(R,">$nal"); | |
1462 foreach (keys %ntAln) | |
1463 { | |
1464 print R ">$_\n".$ntAln{$_}."\n"; | |
1465 } | |
1466 close(R); | |
1467 | |
1468 return 1; | |
1469 } | |
1470 } | |
1471 | |
1472 | |
1473 | |
1474 | |
1475 sub DetectSNP() | |
1476 { | |
1477 my %faa; | |
1478 my %ffn; | |
1479 my @row; | |
1480 my $count_gene; | |
1481 my $count_sp; | |
1482 my @genelist; | |
1483 my $i; | |
1484 my $j; | |
1485 my $pepalnlen; | |
1486 my @cdsvar=qw(); | |
1487 my $cdi=0; | |
1488 my @tribases; | |
1489 my @bases; | |
1490 my @aa; | |
1491 | |
1492 | |
1493 ### fetch gene list | |
1494 open(F,"$ClusterID.pep"); | |
1495 @genelist=<F>; | |
1496 close(F); | |
1497 @genelist=grep(/^>/,@genelist); | |
1498 chomp(@genelist); | |
1499 $_=join("\t",@genelist); | |
1500 s/>//g; | |
1501 @genelist=split(/\t/,$_); | |
1502 | |
1503 ### count gene number and species number | |
1504 @row=split(/\t/,$clusters[$ClusterID-1]); | |
1505 splice(@row,0,1); | |
1506 $count_sp=&CountSpeicesInCluster(join("\t",@row)); | |
1507 $count_gene=&CountGeneInCluster(join("\t",@row)); | |
1508 | |
1509 ### read alignment sequences | |
1510 &ReadAlignmentToHash("$ClusterID.pal",\%faa); | |
1511 &ReadAlignmentToHash("$ClusterID.nal",\%ffn); | |
1512 | |
1513 @_=(keys %faa); | |
1514 $pepalnlen=length($faa{$_[0]}); | |
1515 ### scan SNP | |
1516 for ($i=1;$i<=$pepalnlen;$i++) | |
1517 { | |
1518 @tmp=qw(); | |
1519 @tribases=qw(); | |
1520 for ($j=0;$j<@genelist;$j++) ### fetch triplet codon | |
1521 { | |
1522 $tribases[$j]=substr($ffn{$genelist[$j]},3*($i-1),3); | |
1523 } | |
1524 if (&IsTheSame(@tribases) ==0) ### if triplet codon is not consistent | |
1525 { | |
1526 @aa=qw(); | |
1527 for ($j=0;$j<@genelist;$j++) | |
1528 { | |
1529 $aa[$j]=substr($faa{$genelist[$j]},($i-1),1); | |
1530 } | |
1531 if (&IsTheSame(@aa) ==0) ### aa is not consistent | |
1532 { | |
1533 if (join("",@aa) =~/-/) | |
1534 { | |
1535 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".$i."\t".&CharType(\@aa)."\t-\t-\tInDel\n"; | |
1536 }else | |
1537 { | |
1538 #### base 1 | |
1539 for ($j=0;$j<@genelist;$j++) | |
1540 { | |
1541 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1); | |
1542 } | |
1543 if (&IsTheSame(@bases) ==0) | |
1544 { | |
1545 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; | |
1546 } | |
1547 #### base 2 | |
1548 for ($j=0;$j<@genelist;$j++) | |
1549 { | |
1550 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1); | |
1551 } | |
1552 if (&IsTheSame(@bases) ==0) | |
1553 { | |
1554 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; | |
1555 } | |
1556 #### base 3 | |
1557 for ($j=0;$j<@genelist;$j++) | |
1558 { | |
1559 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1); | |
1560 } | |
1561 if (&IsTheSame(@bases) ==0) | |
1562 { | |
1563 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; | |
1564 } | |
1565 } | |
1566 }else | |
1567 { | |
1568 #### base 1 | |
1569 for ($j=0;$j<@genelist;$j++) | |
1570 { | |
1571 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1); | |
1572 } | |
1573 if (&IsTheSame(@bases) ==0) | |
1574 { | |
1575 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; | |
1576 } | |
1577 #### base 2 | |
1578 for ($j=0;$j<@genelist;$j++) | |
1579 { | |
1580 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1); | |
1581 } | |
1582 if (&IsTheSame(@bases) ==0) | |
1583 { | |
1584 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; | |
1585 } | |
1586 #### base 3 | |
1587 for ($j=0;$j<@genelist;$j++) | |
1588 { | |
1589 $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1); | |
1590 } | |
1591 if (&IsTheSame(@bases) ==0) | |
1592 { | |
1593 $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; | |
1594 } | |
1595 } | |
1596 } | |
1597 } | |
1598 return @cdsvar; | |
1599 } | |
1600 | |
1601 | |
1602 sub VarAnalysis() | |
1603 { | |
1604 (my $data)=@_; | |
1605 my @data=@$data; | |
1606 my $indel=0; | |
1607 my $syn=0; | |
1608 my $nonsyn=0; | |
1609 my @tmp; | |
1610 $indel=scalar(grep(/InDel$/,@data)); | |
1611 $nonsyn=scalar(grep(/Nonsynonymous mutation$/,@data));; | |
1612 $syn=scalar(grep(/Synonymous mutation$/,@data)); | |
1613 return "$indel\t$nonsyn\t$syn"; | |
1614 } | |
1615 | |
1616 | |
1617 | |
1618 sub CharType() | |
1619 { | |
1620 (my $str)=@_; | |
1621 my %hash; | |
1622 my @data=@$str; | |
1623 foreach (@data) | |
1624 { | |
1625 $hash{$_}=1; | |
1626 } | |
1627 return join(",",(keys %hash)); | |
1628 } | |
1629 | |
1630 sub IsTheSame() | |
1631 { | |
1632 (my @data)=@_; | |
1633 my %hash; | |
1634 foreach (@data) | |
1635 { | |
1636 $hash{$_}=1; | |
1637 } | |
1638 if (scalar(keys %hash) ==1) | |
1639 { | |
1640 return 1; | |
1641 }else | |
1642 { | |
1643 return 0; | |
1644 } | |
1645 } | |
1646 | |
1647 | |
1648 | |
1649 sub FormatClusterOutPut() | |
1650 { | |
1651 (my $speices,my $file,my $cluster)=@_; | |
1652 my @row; | |
1653 my $gid=1; | |
1654 my $key; | |
1655 my %hash; | |
1656 my $gene; | |
1657 my @tmp; | |
1658 my $i; | |
1659 my $j; | |
1660 open(R,">$file"); | |
1661 print R "ClutserID\t".join("\t",@$speices)."\n"; | |
1662 foreach $key (@$cluster) | |
1663 { | |
1664 @row=split(/\t/,$key); | |
1665 for ($i=0;$i<@row;$i++) | |
1666 { | |
1667 if ($row[$i] ne "-") | |
1668 { | |
1669 @tmp=split(/,/,$row[$i]); | |
1670 for ($j=0;$j<@tmp;$j++) | |
1671 { | |
1672 $_=$tmp[$j]; | |
1673 s/^S[0-9]+G//; | |
1674 $tmp[$j]=$_; | |
1675 } | |
1676 $row[$i]=join(",",@tmp); | |
1677 } | |
1678 } | |
1679 print R $gid."\t".join("\t",@row)."\n"; | |
1680 $gid++; | |
1681 } | |
1682 close(R); | |
1683 } | |
1684 sub RetrieveClusterFromFile() | |
1685 { | |
1686 (my $file,my $clusters)=@_; | |
1687 my @content; | |
1688 my @row; | |
1689 my $spid; | |
1690 my $line=0; | |
1691 my $i=0; | |
1692 my $j; | |
1693 my @tmp; | |
1694 open(F,$file) or die "Could open $file\n"; | |
1695 @content=<F>; | |
1696 close(F); | |
1697 splice(@content,0,1); | |
1698 chomp(@content); | |
1699 foreach (@content) | |
1700 { | |
1701 @row=split(/\t/,$_); | |
1702 $$clusters[$line]=$row[0]; | |
1703 splice(@row,0,1); | |
1704 for ($i=0;$i<@row;$i++) | |
1705 { | |
1706 if ($row[$i] ne "-") | |
1707 { | |
1708 @tmp=split(/,/,$row[$i]); | |
1709 for ($j=0;$j<@tmp;$j++) | |
1710 { | |
1711 $tmp[$j]="S${i}G".$tmp[$j]; | |
1712 } | |
1713 $row[$i]=join(",",@tmp); | |
1714 } | |
1715 } | |
1716 $$clusters[$line]=$$clusters[$line]."\t".join("\t",@row)."\n"; | |
1717 $line++; | |
1718 } | |
1719 } | |
1720 | |
1721 | |
1722 | |
1723 | |
1724 sub GeneDistribution() | |
1725 { | |
1726 (my $clusters,my $hash)=@_; | |
1727 my @row; | |
1728 my $spid; | |
1729 my $orth; | |
1730 my $key; | |
1731 foreach (@$clusters) | |
1732 { | |
1733 @row=split(/\t/,$_); | |
1734 splice(@row,0,1); | |
1735 $orth=&CountSpeicesInCluster(join("\t",@row)); | |
1736 @row=split(/\t|\,/,join("\t",@row)); | |
1737 foreach $key (@row) | |
1738 { | |
1739 if ($key ne "-") | |
1740 { | |
1741 $spid=substr($key,1,(index($key,'G')-1)); ###extract strains id | |
1742 if (exists($$hash{$orth."|".$spid})) | |
1743 { | |
1744 $$hash{$orth."|".$spid}++; | |
1745 }else | |
1746 { | |
1747 $$hash{$orth."|".$spid}=1; | |
1748 } | |
1749 } | |
1750 } | |
1751 } | |
1752 } | |
1753 | |
1754 sub CountSpeicesInCluster() | |
1755 { | |
1756 (my $str)=@_; | |
1757 chomp($str); | |
1758 my @list=split(/\t/,$str); | |
1759 my $key; | |
1760 my $count=0; | |
1761 | |
1762 foreach $key (@list) | |
1763 { | |
1764 if ($key ne "-") | |
1765 { | |
1766 $count++; | |
1767 } | |
1768 } | |
1769 return $count; | |
1770 } | |
1771 | |
1772 sub CountGeneInCluster() | |
1773 { | |
1774 (my $str)=@_; | |
1775 chomp(); | |
1776 my @list=split(/\t|\,/,$str); | |
1777 my $key; | |
1778 my $count=0; | |
1779 foreach $key (@list) | |
1780 { | |
1781 if ($key ne "-") | |
1782 { | |
1783 $count++; | |
1784 } | |
1785 } | |
1786 return $count; | |
1787 } | |
1788 | |
1789 | |
1790 | |
1791 | |
1792 sub GF() | |
1793 { | |
1794 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file | |
1795 system("cat ".join(".pep ",@species).".pep > All.pep"); | |
1796 system("grep '>' All.pep > genelist"); | |
1797 system("$formatdb -p T -i All.pep"); | |
1798 system("$blastall -p blastp -i All.pep -d All.pep -M BLOSUM45 -m9 -e $evalue -o All.blastp -a $thread"); | |
1799 system("perl ./Blast_Filter.pl All.blastp All.pep $coverage $identity $score | $mcl - --abc -I 2.0 -o All.cluster"); | |
1800 &FormatCluster("All.cluster","genelist",$spnum,\@clusters); | |
1801 #system("rm -rf *.pep* All.blastp All.cluster genelist"); | |
1802 } | |
1803 | |
1804 sub MP() | |
1805 { | |
1806 # (my $species,my $inputDIR,my $thread,my $evalue,my $score,my $coverage,my $identity)=@_; | |
1807 my $i; | |
1808 my $j; | |
1809 &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file | |
1810 system("cat ".join(".pep ",@species).".pep > All.pep"); | |
1811 system("grep '>' All.pep > genelist"); | |
1812 system("rm -rf All.pep"); | |
1813 for ($i=0;$i<$spnum;$i++) | |
1814 { | |
1815 for ($j=$i+1;$j<$spnum;$j++) | |
1816 { | |
1817 system("perl ./inparanoid.pl $blastall $thread $formatdb $score $global $local $species[$i].pep $species[$j].pep"); | |
1818 } | |
1819 } | |
1820 system("perl ./multiparanoid.pl -species ".join(".pep+",@species).".pep -unique 1"); | |
1821 ###convert the MP result to table list based on gene | |
1822 &MP_Result_to_Table("MP.Cluster","All.cluster"); | |
1823 &FormatCluster("All.cluster","genelist",$spnum,\@clusters); | |
1824 system("rm -rf sqltable.* *.pep* MP.Cluster genelist"); | |
1825 } | |
1826 | |
1827 sub fasta2phylip() | |
1828 { | |
1829 (my $input,my $output)=@_; | |
1830 use Bio::AlignIO; | |
1831 my $inputfilename = "10.aln"; | |
1832 my $in= Bio::AlignIO->new(-file => $input , | |
1833 -format => 'fasta'); | |
1834 my $out = Bio::AlignIO->new(-file => ">$output" , | |
1835 -format => 'phylip'); | |
1836 while ( my $aln = $in->next_aln() ) | |
1837 { | |
1838 $out->write_aln($aln); | |
1839 } | |
1840 } | |
1841 | |
1842 | |
1843 sub RemoveHeadGap() | |
1844 { | |
1845 (my $nal,my $hash)=@_; | |
1846 my %aln; | |
1847 my $key; | |
1848 my $gaplength=0; | |
1849 my $len1; | |
1850 my $len2; | |
1851 &ReadSequenceInToHash("$nal",\%aln); | |
1852 foreach $key (keys %aln) | |
1853 { | |
1854 $len1=length($aln{$key}); | |
1855 $_=$aln{$key}; | |
1856 s/^-+//; | |
1857 $len2=length($_); | |
1858 if (($len1-$len2)>$gaplength) | |
1859 { | |
1860 $gaplength=$len1-$len2; | |
1861 } | |
1862 } | |
1863 foreach $key (keys %aln) | |
1864 { | |
1865 $$hash{$key}=$$hash{$key}.substr($aln{$key},$gaplength,(length($aln{$key})-$gaplength)); | |
1866 } | |
1867 } | |
1868 | |
1869 sub PrepareFasta() | |
1870 { | |
1871 (my $species,my $inputDIR,my $extention)=@_; | |
1872 my $sp; | |
1873 my $file; | |
1874 my $i; | |
1875 my %hash; | |
1876 my $key; | |
1877 for ($i=0;$i<@$species;$i++) | |
1878 { | |
1879 $file=$inputDIR.$$species[$i].$extention; | |
1880 %hash=(); | |
1881 &ReadSequenceInToHash($file,\%hash); | |
1882 open(R,">$$species[$i]${extention}") or die "Could write into $file\n"; | |
1883 foreach $key (keys %hash) | |
1884 { | |
1885 print R ">S${i}G$key\n"; | |
1886 print R $hash{$key}."\n"; | |
1887 } | |
1888 close(R); | |
1889 } | |
1890 } | |
1891 | |
1892 sub PrepareTable() | |
1893 { | |
1894 (my $species,my $inputDIR,my $extention)=@_; | |
1895 my @content; | |
1896 my $i; | |
1897 my @row; | |
1898 my $file; | |
1899 for ($i=0;$i<@$species;$i++) | |
1900 { | |
1901 $file=$inputDIR.$$species[$i].$extention; | |
1902 open(F,$file) or die "Could open $file\n"; | |
1903 @content=<F>; | |
1904 close(F); | |
1905 chomp(@content); | |
1906 open(R,">$$species[$i]${extention}") or die "Could write into $file\n"; | |
1907 foreach (@content) | |
1908 { | |
1909 @row=split(/\t/,$_); | |
1910 $row[0]="S${i}G$row[0]"; | |
1911 if ($extention eq ".location") | |
1912 { | |
1913 $row[0]=$row[0]."\t".$row[0]; | |
1914 } | |
1915 print R join("\t",@row)."\n"; | |
1916 } | |
1917 close(R); | |
1918 } | |
1919 } | |
1920 | |
1921 sub CheckExtraProgram | |
1922 { | |
1923 #(my $section, my $method, my $tmparray)=@_; | |
1924 my @error; | |
1925 my $ei=0; | |
1926 | |
1927 #####cluster gene | |
1928 if (substr($section,0,1) eq "1") | |
1929 { | |
1930 ###MP: blastall formatdb | |
1931 ###GF: blastall formatdb mcl | |
1932 | |
1933 if (!(-e $formatdb)) | |
1934 { | |
1935 $error[$ei++]="formatdb is not found at $formatdb\n"; | |
1936 } | |
1937 | |
1938 if (!(-X $formatdb)) | |
1939 { | |
1940 $error[$ei++]="there is not premission to execute $formatdb\n"; | |
1941 } | |
1942 | |
1943 if (!(-e $blastall)) | |
1944 { | |
1945 $error[$ei++]="blastall is not found at $blastall\n"; | |
1946 } | |
1947 | |
1948 if (!(-X $blastall)) | |
1949 { | |
1950 $error[$ei++]="there is not premission to execute $blastall\n"; | |
1951 } | |
1952 | |
1953 if ($method eq "GF") | |
1954 { | |
1955 if (!(-e $mcl)) | |
1956 { | |
1957 $error[$ei++]="mcl is not found at $mcl\n"; | |
1958 } | |
1959 if (!(-X $mcl)) | |
1960 { | |
1961 $error[$ei++]="there is not premission to execute $mcl\n"; | |
1962 } | |
1963 } | |
1964 } | |
1965 | |
1966 #####CDS variation | |
1967 if (substr($section,2,1) eq "1") | |
1968 { | |
1969 if (!(-e $mafft)) | |
1970 { | |
1971 $error[$ei++]="mafft is not found at $mafft\n"; | |
1972 } | |
1973 if (!(-X $mafft)) | |
1974 { | |
1975 $error[$ei++]="there is not premission to execute $mafft\n"; | |
1976 } | |
1977 } | |
1978 | |
1979 #####CDS variation | |
1980 if (substr($section,3,1) eq "1") | |
1981 { | |
1982 if (!(-e $mafft)) | |
1983 { | |
1984 $error[$ei++]="mafft is not found at $mafft\n"; | |
1985 } | |
1986 if (!(-X $mafft)) | |
1987 { | |
1988 $error[$ei++]="there is not premission to execute $mafft\n"; | |
1989 } | |
1990 } | |
1991 #####Evolution analysis | |
1992 if (substr($section,3,1) eq "1") | |
1993 { | |
1994 if (-e $seqboot) | |
1995 { | |
1996 $error[$ei++]="there is not premission to execute $seqboot\n" if(!(-X $seqboot)); | |
1997 }else | |
1998 { | |
1999 $error[$ei++]="seqboot is not found at $seqboot\n"; | |
2000 } | |
2001 if (-e $dnaml) | |
2002 { | |
2003 $error[$ei++]="there is not premission to execute $dnaml\n" if(!(-X $dnaml)); | |
2004 }else | |
2005 { | |
2006 $error[$ei++]="dnaml is not found at $dnaml\n"; | |
2007 } | |
2008 if (-e $dnadist) | |
2009 { | |
2010 $error[$ei++]="there is not premission to execute $dnadist\n" if(!(-X $dnadist)); | |
2011 }else | |
2012 { | |
2013 $error[$ei++]="dnadist is not found at $dnadist\n"; | |
2014 } | |
2015 if (-e $neighbor) | |
2016 { | |
2017 $error[$ei++]="there is not premission to execute $neighbor\n" if(!(-X $neighbor)); | |
2018 }else | |
2019 { | |
2020 $error[$ei++]="neighbor is not found at $neighbor\n"; | |
2021 } | |
2022 if (-e $consense) | |
2023 { | |
2024 $error[$ei++]="there is not premission to execute $consense\n" if(!(-X $consense)); | |
2025 }else | |
2026 { | |
2027 $error[$ei++]="consense is not found at $consense\n"; | |
2028 } | |
2029 if (-e $dnapars) | |
2030 { | |
2031 $error[$ei++]="there is not premission to execute $dnapars\n" if(!(-X $dnapars)); | |
2032 }else | |
2033 { | |
2034 $error[$ei++]="dnapars is not found at $dnapars\n"; | |
2035 } | |
2036 } | |
2037 #@$tmparray=(@$tmparray,@error); | |
2038 @tmp=(@tmp,@error); | |
2039 } | |
2040 | |
2041 | |
2042 sub CheckInputFile() | |
2043 { | |
2044 (my $species,my $inputDIR,my $section,my $method,my $tmparray)=@_; | |
2045 ####cluster | |
2046 if (substr($section,0,1) eq "1") | |
2047 { | |
2048 if ($method eq "MM") | |
2049 { | |
2050 @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); ### check pep and nuc | |
2051 @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".location"));### chk pep nuc location | |
2052 }else | |
2053 { | |
2054 @$tmparray=(@$tmparray,&chk1SEQ($species,$inputDIR)); | |
2055 } | |
2056 } | |
2057 ###CDS variation | |
2058 if (substr($section,2,1) eq "1") | |
2059 { | |
2060 @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); | |
2061 } | |
2062 ###function analysis | |
2063 if (substr($section,4,1) eq "1") | |
2064 { | |
2065 @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".function")); | |
2066 } | |
2067 } | |
2068 | |
2069 | |
2070 sub chk1SEQ() | |
2071 { | |
2072 (my $species,my $inputDIR)=@_; | |
2073 my @error; | |
2074 my $ei=0; | |
2075 my $sp; | |
2076 my $pepfile; | |
2077 my %pep; | |
2078 foreach $sp (@$species) | |
2079 { | |
2080 %pep=(); | |
2081 $pepfile=$inputDIR.$sp.".pep"; | |
2082 &ReadSequenceInToHash($pepfile,\%pep); | |
2083 if (scalar(keys %pep)<2) | |
2084 { | |
2085 $error[$ei++]="format error in $pepfile\n"; | |
2086 } | |
2087 } | |
2088 return @error; | |
2089 } | |
2090 | |
2091 sub chk2SEQ() | |
2092 { | |
2093 (my $species,my $inputDIR)=@_; | |
2094 my $sp; | |
2095 my %pep; | |
2096 my %nuc; | |
2097 my $pepfile; | |
2098 my $nucfile; | |
2099 my $key; | |
2100 my @error; | |
2101 my $ei=0; | |
2102 foreach $sp (@$species) | |
2103 { | |
2104 $pepfile=$inputDIR.$sp.".pep"; | |
2105 $nucfile=$inputDIR.$sp.".nuc"; | |
2106 %pep=(); | |
2107 %nuc=(); | |
2108 &ReadSequenceInToHash("$pepfile",\%pep); | |
2109 &ReadSequenceInToHash("$nucfile",\%nuc); | |
2110 if (scalar(keys %pep) ne scalar(keys %nuc)) | |
2111 { | |
2112 $error[$ei++]="Sequences number is not consistent in the following two file:\n\t$pepfile\n\t$nucfile\n"; | |
2113 }else | |
2114 { | |
2115 foreach $key (keys %pep) | |
2116 { | |
2117 if (exists($nuc{$key})) | |
2118 { | |
2119 if (length($nuc{$key}) ne ((length($pep{$key})+1)*3)) | |
2120 { | |
2121 $error[$ei++]="the length of $key in $nucfile is not consistent with its corresponding protein length\n"; | |
2122 } | |
2123 }else | |
2124 { | |
2125 $error[$ei++]="$key lost in $nucfile\n"; | |
2126 } | |
2127 } | |
2128 | |
2129 foreach $key (keys %nuc) | |
2130 { | |
2131 if (!exists($pep{$key})) | |
2132 { | |
2133 $error[$ei++]="1048 $key lost in $pepfile\n"; | |
2134 } | |
2135 } | |
2136 } | |
2137 } | |
2138 return @error; | |
2139 } | |
2140 | |
2141 | |
2142 sub chktab() | |
2143 { | |
2144 (my $species,my $inputDIR,my $extention)=@_; | |
2145 my %pep; | |
2146 my @row; | |
2147 my $key; | |
2148 my %tab; | |
2149 my @error; | |
2150 my $ei=0; | |
2151 my $sp; | |
2152 my $tabfile; | |
2153 my $pepfile; | |
2154 foreach $sp (@$species) | |
2155 { | |
2156 %tab=(); | |
2157 %pep=(); | |
2158 $tabfile=$inputDIR.$sp.$extention; | |
2159 open(F,"$tabfile"); | |
2160 while (<F>) | |
2161 { | |
2162 chomp(); | |
2163 @row=split(/\t/,$_); | |
2164 if (scalar(@row)<3) | |
2165 { | |
2166 $error[$ei++]="format error in $tabfile\n"; | |
2167 }else | |
2168 { | |
2169 $tab{$row[0]}=$row[1]; | |
2170 } | |
2171 } | |
2172 close(F); | |
2173 $pepfile=$inputDIR.$sp.".pep"; | |
2174 &ReadSequenceInToHash($pepfile,\%pep); | |
2175 foreach $key (keys %pep) | |
2176 { | |
2177 if (!exists($tab{$key})) | |
2178 { | |
2179 $error[$ei++]="sequence $key lost infomation in $tabfile\n"; | |
2180 } | |
2181 } | |
2182 } | |
2183 return @error; | |
2184 } | |
2185 | |
2186 | |
2187 | |
2188 sub ReadSequenceInToHash() | |
2189 { | |
2190 use Bio::SeqIO; | |
2191 (my $file,my $hash)=@_; | |
2192 my $seq; | |
2193 my $in=Bio::SeqIO->new(-file=>"$file",-format=>"fasta"); | |
2194 while ($seq=$in->next_seq()) | |
2195 { | |
2196 #$$hash{$id."|".$seq->id}=$seq->seq(); | |
2197 $$hash{$seq->id}=$seq->seq(); | |
2198 } | |
2199 } | |
2200 | |
2201 sub ReadAlignmentToHash() | |
2202 { | |
2203 (my $file,my $hash)=@_; | |
2204 my $name=""; | |
2205 my $seq=""; | |
2206 my @content; | |
2207 my $line; | |
2208 open(F,"$file"); | |
2209 @content=<F>; | |
2210 close(F); | |
2211 chomp(@content); | |
2212 for ($line=0;$line<@content;$line++) | |
2213 { | |
2214 if ($content[$line]=~/^>/) | |
2215 { | |
2216 if ($line>0) | |
2217 { | |
2218 $$hash{$name}=$seq; | |
2219 $name=""; | |
2220 } | |
2221 | |
2222 $_=$content[$line]; | |
2223 s/^>//; | |
2224 $name=$_; | |
2225 $seq=""; | |
2226 }else | |
2227 { | |
2228 if ($name ne "") | |
2229 { | |
2230 $seq=$seq.$content[$line]; | |
2231 } | |
2232 } | |
2233 } | |
2234 $$hash{$name}=$seq; | |
2235 } | |
2236 | |
2237 | |
2238 | |
2239 sub Combination() | |
2240 { | |
2241 (my $m,my $n,my $comRef)=@_; | |
2242 my $str=""; | |
2243 my %hash; | |
2244 my $fpos; | |
2245 my $num0; | |
2246 my $rest; | |
2247 my $tmp; | |
2248 my $i; | |
2249 my $j; | |
2250 my $key; | |
2251 #my $m=scalar(@$array); | |
2252 my @combination; | |
2253 | |
2254 for ($i=1;$i<=$n;$i++) | |
2255 { | |
2256 $str="1".$str; | |
2257 } | |
2258 | |
2259 for ($i=1;$i<=($m-$n);$i++) | |
2260 { | |
2261 $str=$str."0"; | |
2262 } | |
2263 | |
2264 $hash{$str}=1; | |
2265 while ($str=~/10/) | |
2266 { | |
2267 $fpos=index($str,"10"); | |
2268 $_=$str; | |
2269 s/10/01/; | |
2270 $str=$_; | |
2271 $tmp=substr($str,0,$fpos); | |
2272 $_=$tmp; | |
2273 s/0//g; | |
2274 $rest=$_; | |
2275 $num0=$fpos-length($_); | |
2276 for ($i=1;$i<=$num0;$i++) | |
2277 { | |
2278 $rest="$rest"."0"; | |
2279 } | |
2280 $str="$rest".substr($str,$fpos,$m-$fpos); | |
2281 $hash{$str}=1; | |
2282 } | |
2283 $j=0; | |
2284 foreach $key (keys %hash) | |
2285 { | |
2286 $combination[$j]=""; | |
2287 for ($i=0;$i<$m;$i++) | |
2288 { | |
2289 if (substr($key,$i,1) eq "1") | |
2290 { | |
2291 if ($combination[$j] ne "") | |
2292 { | |
2293 #$combination[$j]=$combination[$j]."\t".$$array[$i]; | |
2294 $combination[$j]=$combination[$j]."\t".$i; | |
2295 }else | |
2296 { | |
2297 #$combination[$j]=$$array[$i]; ### For return species ID | |
2298 $combination[$j]=$i; | |
2299 } | |
2300 } | |
2301 } | |
2302 $j++; | |
2303 } | |
2304 @$comRef=@combination; ### update the data through the physic address | |
2305 } | |
2306 | |
2307 sub ChkCombinationValue() | |
2308 { | |
2309 (my $m,my $n)=@_; | |
2310 my %hash; | |
2311 my %vhash; | |
2312 my $value=0; | |
2313 my $key; | |
2314 my @row; | |
2315 my @sdA; | |
2316 my @sdB; | |
2317 | |
2318 ### initialization | |
2319 $hash{$m."-".$n}=1; | |
2320 | |
2321 ### split combination | |
2322 while (scalar(keys %hash)>0 and $value<=$sampleSize) | |
2323 { | |
2324 foreach $key (keys %hash) | |
2325 { | |
2326 if ($value > $sampleSize) ### threshold | |
2327 { | |
2328 last; | |
2329 } | |
2330 if (!exists($hash{$key})) | |
2331 { | |
2332 next; | |
2333 } | |
2334 @row=split(/-/,$key); | |
2335 #print $row[0]."|".$row[1]."\n"; | |
2336 if ($row[0] eq $row[1]) | |
2337 { | |
2338 $value=$value+$hash{$key}; | |
2339 }else | |
2340 { | |
2341 ##split | |
2342 $sdA[0]=$row[0]-1; | |
2343 $sdA[1]=$row[1]; | |
2344 $sdB[0]=$row[0]-1; | |
2345 $sdB[1]=$row[1]-1; | |
2346 ##storing A | |
2347 if (($sdA[0] eq $sdA[1]) or $sdA[1] ==0) | |
2348 { | |
2349 $value=$value+$hash{$key}; | |
2350 }else | |
2351 { | |
2352 if (exists($hash{$sdA[0]."-".$sdA[1]})) | |
2353 { | |
2354 $hash{$sdA[0]."-".$sdA[1]}=$hash{$sdA[0]."-".$sdA[1]}+$hash{$key}; | |
2355 }else | |
2356 { | |
2357 $hash{$sdA[0]."-".$sdA[1]}=$hash{$key}; | |
2358 } | |
2359 } | |
2360 | |
2361 ##storing B | |
2362 if (($sdB[0] eq $sdB[1]) or $sdB[1]==0) | |
2363 { | |
2364 $value=$value+$hash{$key}; | |
2365 }else | |
2366 { | |
2367 if (exists($hash{$sdB[0]."-".$sdB[1]})) | |
2368 { | |
2369 $hash{$sdB[0]."-".$sdB[1]}=$hash{$sdB[0]."-".$sdB[1]}+$hash{$key}; | |
2370 }else | |
2371 { | |
2372 $hash{$sdB[0]."-".$sdB[1]}=$hash{$key}; | |
2373 } | |
2374 } | |
2375 } | |
2376 #delete original combination | |
2377 delete($hash{$key}); | |
2378 } | |
2379 } | |
2380 | |
2381 if ($value>$sampleSize) | |
2382 { | |
2383 return 0; | |
2384 }else | |
2385 { | |
2386 return $value; | |
2387 } | |
2388 } | |
2389 | |
2390 | |
2391 sub SampleCombination() | |
2392 { | |
2393 (my $m,my $n,my $comRef)=@_; | |
2394 my %hash; | |
2395 my $sampleTimes=0; | |
2396 my @randNum; | |
2397 my @sortID; | |
2398 my $i; | |
2399 my $j; | |
2400 my $tmp; | |
2401 while ( scalar(keys %hash)<$sampleSize and $sampleTimes<($sampleSize*2)) | |
2402 { | |
2403 for ($i=0;$i<$m;$i++) # generate random data | |
2404 { | |
2405 $randNum[$i]=int(100000 * rand(100)); | |
2406 $sortID[$i]=$i; | |
2407 } | |
2408 | |
2409 for ($i=0;$i<$m;$i++) # sorting random data | |
2410 { | |
2411 for ($j=0;$j<$m;$j++) | |
2412 { | |
2413 if ($randNum[$sortID[$i]]<$randNum[$sortID[$j]]) | |
2414 { | |
2415 $tmp=$sortID[$i]; | |
2416 $sortID[$i]=$sortID[$j]; | |
2417 $sortID[$j]=$tmp; | |
2418 } | |
2419 } | |
2420 } | |
2421 | |
2422 #storing data | |
2423 $tmp=join("\t",sort {$a<=>$b} (splice(@sortID,0,$n))); | |
2424 $hash{$tmp}=1; | |
2425 $sampleTimes++; | |
2426 } | |
2427 @$comRef=keys %hash; | |
2428 } | |
2429 | |
2430 | |
2431 sub PanGenomeNumber() | |
2432 { | |
2433 (my $spID)=@_; | |
2434 my $pan=0; | |
2435 my $core=0; | |
2436 my $count; #### counter; | |
2437 my @row; | |
2438 | |
2439 foreach (@clusters) | |
2440 { | |
2441 $count=0; | |
2442 @row=split(/\t/,$_); | |
2443 | |
2444 foreach (@$spID) | |
2445 { | |
2446 $count=$count+$row[$_]; | |
2447 } | |
2448 | |
2449 if ($count>0) | |
2450 { | |
2451 $pan++; | |
2452 if ($count == scalar(@$spID)) | |
2453 { | |
2454 $core++; | |
2455 } | |
2456 } | |
2457 } | |
2458 return $pan."\t".$core; | |
2459 } | |
2460 | |
2461 sub fit_model_A() | |
2462 { | |
2463 ### model y = A * x**B + C | |
2464 (my $xdata,my $ydata)=@_; | |
2465 my $i; | |
2466 my $b; | |
2467 my $max_B=0; | |
2468 my $max_R=0; | |
2469 my $max_A=0; | |
2470 my $max_A_interval; | |
2471 my $max_C=0; | |
2472 my $max_C_interval; | |
2473 my $R=1e-100; | |
2474 my $start; | |
2475 my $end; | |
2476 my $step; | |
2477 my @xValues; | |
2478 my @yValues; | |
2479 | |
2480 $start=1; | |
2481 $step=0.001; | |
2482 $b=$start; | |
2483 $max_R=0; | |
2484 $R=1e-100; | |
2485 | |
2486 use Statistics::LineFit; | |
2487 use Statistics::Distributions; | |
2488 | |
2489 while ($max_R<=$R) | |
2490 { | |
2491 if (($b < 0.02) and ($b >-0.02)) | |
2492 { | |
2493 $b=-0.02; | |
2494 } | |
2495 | |
2496 for ($i=0;$i<@$xdata;$i++) | |
2497 { | |
2498 $xValues[$i]=$$xdata[$i]**$b; | |
2499 } | |
2500 @yValues=@$ydata; | |
2501 my $lineFit = Statistics::LineFit->new(); | |
2502 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; | |
2503 (my $intercept, my $slope) = $lineFit->coefficients(); | |
2504 my $rSquared = $lineFit->rSquared(); | |
2505 my $meanSquaredError = $lineFit->meanSqError(); | |
2506 my $durbinWatson = $lineFit->durbinWatson(); | |
2507 my $sigma = $lineFit->sigma(); | |
2508 (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics(); | |
2509 (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates(); | |
2510 | |
2511 $max_R=$R; | |
2512 $R=$rSquared; | |
2513 if ($max_R<=$R) | |
2514 { | |
2515 $max_R=$R; | |
2516 ($max_C,$max_A)=$lineFit->coefficients(); | |
2517 $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope); | |
2518 $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept); | |
2519 } | |
2520 $b=$b-$step; | |
2521 } | |
2522 $max_B=$b; | |
2523 return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval); | |
2524 | |
2525 } | |
2526 | |
2527 sub fit_model_B() | |
2528 { | |
2529 ### model y = A * exp(x*B) + C | |
2530 (my $xdata,my $ydata)=@_; | |
2531 my $i; | |
2532 my $b; | |
2533 my $max_B=0; | |
2534 my $max_R=0; | |
2535 my $max_A=0; | |
2536 my $max_A_interval; | |
2537 my $max_C=0; | |
2538 my $max_C_interval; | |
2539 my $R=1e-100; | |
2540 my $start; | |
2541 my $end; | |
2542 my $step; | |
2543 my @xValues; | |
2544 my @yValues; | |
2545 | |
2546 $start=0; | |
2547 $step=0.001; | |
2548 $b=$start; | |
2549 $max_R=0; | |
2550 $R=1e-100; | |
2551 | |
2552 use Statistics::LineFit; | |
2553 use Statistics::Distributions; | |
2554 | |
2555 while ($max_R<=$R) | |
2556 { | |
2557 if (($b < 0.02) and ($b >-0.02)) | |
2558 { | |
2559 $b=-0.02; | |
2560 } | |
2561 | |
2562 for ($i=0;$i<@$xdata;$i++) | |
2563 { | |
2564 $xValues[$i]=exp($$xdata[$i]*$b); | |
2565 } | |
2566 @yValues=@$ydata; | |
2567 my $lineFit = Statistics::LineFit->new(); | |
2568 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; | |
2569 (my $intercept, my $slope) = $lineFit->coefficients(); | |
2570 my $rSquared = $lineFit->rSquared(); | |
2571 my $meanSquaredError = $lineFit->meanSqError(); | |
2572 my $durbinWatson = $lineFit->durbinWatson(); | |
2573 my $sigma = $lineFit->sigma(); | |
2574 (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics(); | |
2575 (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates(); | |
2576 | |
2577 $max_R=$R; | |
2578 $R=$rSquared; | |
2579 if ($max_R<=$R) | |
2580 { | |
2581 $max_R=$R; | |
2582 ($max_C,$max_A)=$lineFit->coefficients(); | |
2583 $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope); | |
2584 $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept); | |
2585 } | |
2586 $b=$b-$step; | |
2587 } | |
2588 $max_B=$b; | |
2589 return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval); | |
2590 } | |
2591 | |
2592 | |
2593 sub ReadData2Array() | |
2594 { | |
2595 (my $file, my $array1,my $col1,my $array2,my $col2)=@_; | |
2596 my $i=0; | |
2597 open(F,$file); | |
2598 $_=<F>; | |
2599 while (<F>) | |
2600 { | |
2601 chomp(); | |
2602 @_=split(/\t/,$_); | |
2603 $$array1[$i]=$_[$col1]; | |
2604 $$array2[$i]=$_[$col2]; | |
2605 $i++; | |
2606 } | |
2607 close(F); | |
2608 } | |
2609 | |
2610 sub SumData() | |
2611 { | |
2612 (my $xdata,my $ydata,my $SumMethod)=@_; | |
2613 my %hash; | |
2614 my $i; | |
2615 my $key; | |
2616 my $max=0; | |
2617 for ($i=0;$i<@$xdata;$i++) | |
2618 { | |
2619 if (exists($hash{$$xdata[$i]})) | |
2620 { | |
2621 $hash{$$xdata[$i]}=$hash{$$xdata[$i]}." ".$$ydata[$i]; | |
2622 }else | |
2623 { | |
2624 $hash{$$xdata[$i]}=$$ydata[$i]; | |
2625 if ($$xdata[$i]>$max) | |
2626 { | |
2627 $max=$$xdata[$i]; | |
2628 } | |
2629 } | |
2630 } | |
2631 @$xdata=qw(); | |
2632 @$ydata=qw(); | |
2633 $i=0; | |
2634 foreach $i (1..$max) | |
2635 { | |
2636 $$xdata[$i-1]=$i; | |
2637 if ($SumMethod eq "median") | |
2638 { | |
2639 $$ydata[$i-1]=&median($hash{$i}); | |
2640 }else | |
2641 { | |
2642 $$ydata[$i-1]=&mean($hash{$i}); | |
2643 } | |
2644 } | |
2645 #print join(",",@$xdata)."\n"; | |
2646 #print join(",",@$ydata)."\n"; | |
2647 } | |
2648 | |
2649 sub median() | |
2650 { | |
2651 (my $data)=@_; | |
2652 my @data=split(/ /,$data); | |
2653 my $arraylen=scalar(@data); | |
2654 @data=sort{$a<=>$b} @data; | |
2655 if (int($arraylen/2)*2 == $arraylen) | |
2656 { | |
2657 return ($data[$arraylen/2]+$data[$arraylen/2-1])/2; | |
2658 }else | |
2659 { | |
2660 return $data[int($arraylen/2)]; | |
2661 } | |
2662 } | |
2663 | |
2664 sub mean() | |
2665 { | |
2666 (my $data)=@_; | |
2667 my @data=split(/ /,$data); | |
2668 my $sum=0; | |
2669 foreach (@data) | |
2670 { | |
2671 $sum=$sum+$_; | |
2672 } | |
2673 return int(($sum/scalar(@data))*1000)/1000; | |
2674 } | |
2675 | |
2676 sub ReplaceName() | |
2677 { | |
2678 (my $sp,my $file)=@_; | |
2679 my @content; | |
2680 my $line; | |
2681 my $i; | |
2682 my $target; | |
2683 open(F,$file); | |
2684 @content=<F>; | |
2685 close(F); | |
2686 for ($line=0;$line<@content;$line++) | |
2687 { | |
2688 for ($i=0;$i<@$sp;$i++) | |
2689 { | |
2690 $_=$content[$line]; | |
2691 $target="sp".$i."sp"; | |
2692 s/$target/$$sp[$i]/; | |
2693 $content[$line]=$_; | |
2694 } | |
2695 } | |
2696 open(R,">$file"); | |
2697 print R @content; | |
2698 close(R); | |
2699 } | |
2700 | |
2701 sub MP_Result_to_Table() | |
2702 { | |
2703 (my $MPresult, my $outputfile)=@_; | |
2704 my %hash; | |
2705 my $maxid=0; | |
2706 my $i; | |
2707 my @row; | |
2708 | |
2709 open(F,"$MPresult"); | |
2710 $_=<F>; | |
2711 while (<F>) | |
2712 { | |
2713 @row=split(/\t/,$_); | |
2714 if (exists($hash{$row[0]})) | |
2715 { | |
2716 $hash{$row[0]}=$hash{$row[0]}."\t".$row[2]; | |
2717 }else | |
2718 { | |
2719 $hash{$row[0]}=$row[2]; | |
2720 if ($row[0]>$maxid) | |
2721 { | |
2722 $maxid=$row[0]; | |
2723 } | |
2724 } | |
2725 } | |
2726 close(F); | |
2727 | |
2728 open(R,">$outputfile"); | |
2729 foreach $i (1..$maxid) | |
2730 { | |
2731 print R $hash{$i}."\n"; | |
2732 } | |
2733 close(R); | |
2734 } | |
2735 | |
2736 | |
2737 | |
2738 sub FormatCluster() | |
2739 { | |
2740 (my $infile,my $genelist,my $spnum,my $cluster)=@_; | |
2741 my %hash; | |
2742 my %gene; | |
2743 my $key; | |
2744 my @row; | |
2745 my $sp; | |
2746 my $line; | |
2747 my $i=0; | |
2748 my $j=0; | |
2749 my @content; | |
2750 | |
2751 ### record gene in clusters | |
2752 open(F,"$infile"); | |
2753 @content=<F>; | |
2754 close(F); | |
2755 chomp(@content); | |
2756 for ($line=0;$line<@content;$line++) | |
2757 { | |
2758 @row=split(/\t/,$content[$line]); | |
2759 foreach $key (@row) | |
2760 { | |
2761 $gene{$key}=1; | |
2762 } | |
2763 } | |
2764 ###retrieves gene which is not in clutsers | |
2765 | |
2766 open(F,"$genelist"); | |
2767 while ($key=<F>) | |
2768 { | |
2769 if ($key=~/^>/) | |
2770 { | |
2771 chomp($key); | |
2772 $_=$key; | |
2773 s/^>//; | |
2774 $key=$_; | |
2775 if (!exists($gene{$key})) | |
2776 { | |
2777 $content[$line]=$key; | |
2778 $line++; | |
2779 } | |
2780 } | |
2781 } | |
2782 close(F); | |
2783 | |
2784 #### initialization @cluster | |
2785 @$cluster=qw(); | |
2786 $j=0; | |
2787 | |
2788 foreach $line (@content) | |
2789 { | |
2790 if ($line ne "") | |
2791 { | |
2792 %hash=(); | |
2793 @row=split(/\t/,$line); | |
2794 foreach $key (@row) | |
2795 { | |
2796 $sp=substr($key,0,index($key,"G")); | |
2797 $gene{$key}=1; | |
2798 if (exists($hash{$sp})) | |
2799 { | |
2800 $hash{$sp}=$hash{$sp}.",".$key; | |
2801 }else | |
2802 { | |
2803 $hash{$sp}=$key; | |
2804 } | |
2805 } | |
2806 | |
2807 $i=0; | |
2808 @row=qw(); | |
2809 | |
2810 foreach $i (0..($spnum-1)) | |
2811 { | |
2812 if (exists($hash{"S$i"})) | |
2813 { | |
2814 $row[$i]=$hash{"S$i"}; | |
2815 }else | |
2816 { | |
2817 $row[$i]="-"; | |
2818 } | |
2819 } | |
2820 $$cluster[$j++]=join("\t",@row); | |
2821 } | |
2822 } | |
2823 } | |
2824 | |
2825 |