comparison getSequenceInfo.pl @ 0:19ae17458c14 draft default tip

Uploaded
author dcouvin
date Wed, 15 Sep 2021 21:35:09 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:19ae17458c14
1 #!/usr/bin/perl
2
3 ################################################################################
4 ## "Copyright 2019 Vincent Moco and David Couvin"
5 ## licence GPL-3.0-or-later
6 ## This program is free software: you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation, either version 3 of the License, or
9 ## (at your option) any later version.
10 ## This program is distributed in the hope that it will be useful,
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ## GNU General Public License for more details.
14 ## You should have received a copy of the GNU General Public License
15 ## along with this program. If not, see <https://www.gnu.org/licenses/>.
16 ################################################################################
17
18 use strict;
19 use warnings;
20
21 my $version = "1.0.1"; # version
22
23 #my $number = 50;
24
25 # Date and time of the current day (Beginning)
26 #my ($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now();
27 my $start = time();
28
29 print "##################################################################\n";
30 print "## ---> Welcome to $0 (version $version)!\n";
31 #print "## Start Date (yyyy-mm-dd, hh:min:sec): $start_year-$start_month-$start_day, $start_hour:$start_min:$start_sec\n";
32 print "##################################################################\n\n";
33
34 #BioPerl, Date::Calc, File::Log, have been removed from the @modules
35 my @modules = qw(
36 Archive::Tar
37 Bio::SeqIO
38 Bio::Species
39 File::Copy
40 File::Path
41 Net::FTP
42 IO::Uncompress::Gunzip
43 LWP::Simple
44 POSIX
45
46 );
47
48 foreach my $module (@modules) {
49 if (isModuleInstalled($module)) {
50 print "$module is.................installed!\n";
51 }
52 else {
53 print "$module is not installed. Please install it and try again.\n";
54 print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n";
55 print "cpan -i -f $module\n";
56 #system("cpan -i -f $module");
57 exit 1;
58 }
59 }
60 print "\n";
61
62 use Archive::Tar;
63 use Bio::SeqIO;
64 use Bio::Species;
65 #use Date::Calc qw(:all);
66 use File::Copy qw(cp move);
67 use File::Path qw(rmtree);
68 use Net::FTP;
69 use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
70 use LWP::Simple qw(get);
71 use POSIX qw(floor);
72 #use File::Log;
73
74 ####################################################################################################
75 ## A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories.
76 ##
77 ####################################################################################################
78
79 ### main program
80 ### parameters
81 my $directory = "genbank";
82
83 my $kingdom = ""; # kingdom of organism
84
85 my $releaseDate = "0000-00-00"; # sequences are downloaded from this release date
86
87 my $components; # components of the assembly
88
89 my $species = ""; # species name
90
91 my $getSummaries; # indicates if a new assembly summary is required
92
93 my $assemblyLevel = "Complete Genome,Chromosome,Scaffold,Contig"; # assembly level of the genome
94
95 my $quantity; # limit number of assemblies to download
96
97 my $sequenceID;
98
99 my $ftpServor = "ftp.ncbi.nlm.nih.gov";
100
101 my $enaFtpServor = "ftp.sra.ebi.ac.uk";
102
103 my $fldSep = "/"; # folder seperation change by OS
104
105 my @availableKingdoms = (
106 "archaea",
107 "bacteria",
108 "fungi",
109 "invertebrate",
110 "plant",
111 "protozoa",
112 "vertebrate_mammalian",
113 "vertebrate_other",
114 "viral"
115 ); # list of available kingdoms
116
117 my $actualOS = "Unix"; # OS of the computer
118
119 my $mainFolder; # folder in which the assemblies results are stored
120
121 my $assemblyTaxid = ""; # taxid for assembly
122
123 my $sraID; # SRA sequence ID
124
125 my $assemblyPrjID; # assembly or prj ID
126
127 my $log; # log
128
129 my $path = "";
130
131
132 if (@ARGV<1) {
133 help_user_simple($0);
134 exit 0;
135 }
136
137 if ($ARGV[0] eq "-help" || $ARGV[0] eq "-h") {
138 help_user_advance($0);
139 exit 0;
140 }
141
142 if ($ARGV[0] eq "-version" || $ARGV[0] eq "-v") {
143 program_version($0);
144 exit 0;
145 }
146
147 ##requirements
148 for (my $i = 0; $i <= $#ARGV; $i++) {
149 if ($ARGV[$i]=~/-kingdom/i or $ARGV[$i]=~/-k/i) {
150 $kingdom = $ARGV[$i+1];
151 }
152 elsif ($ARGV[$i]=~/-path/i or $ARGV[$i]=~/-pathSummary/i) {
153 $path = $ARGV[$i+1];
154 }
155 elsif ($ARGV[$i]=~/-directory/i or $ARGV[$i]=~/-dir/i) {
156 $directory = $ARGV[$i+1];
157 }
158 elsif ($ARGV[$i]=~/-date/i) {
159 $releaseDate = $ARGV[$i+1];
160 }
161 elsif ($ARGV[$i]=~/-getSummaries/i or $ARGV[$i]=~/-get/i) {
162 $getSummaries = $ARGV[$i+1];
163 }
164 elsif ($ARGV[$i]=~/-species/i or $ARGV[$i]=~/-s/i) {
165 $species = $ARGV[$i+1];
166 }
167 elsif ($ARGV[$i]=~/-level/i or $ARGV[$i]=~/-le/i) {
168 $assemblyLevel = $ARGV[$i+1];
169 }
170 elsif ($ARGV[$i]=~/-components/i or $ARGV[$i]=~/-c/i) {
171 $components = $ARGV[$i+1];
172 }
173 elsif ($ARGV[$i]=~/-quantity/i or $ARGV[$i]=~/-q/i or $ARGV[$i]=~/-number/i or $ARGV[$i]=~/-n/i) {
174 $quantity = int($ARGV[$i+1]);
175 }
176 elsif ($ARGV[$i]=~/-ena/i) {
177 $sequenceID = $ARGV[$i+1];
178 }
179 elsif ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) {
180 $mainFolder = $ARGV[$i+1];
181 }
182 elsif ($ARGV[$i]=~/-taxid/i) {
183 $assemblyTaxid = $ARGV[$i+1];
184 }
185 elsif ($ARGV[$i]=~/-fastq/i) {
186 $sraID = $ARGV[$i+1];
187 }
188 elsif ($ARGV[$i]=~/-assembly_or_project/i) {
189 $assemblyPrjID = $ARGV[$i+1];
190 }
191 elsif ($ARGV[$i]=~/-log/i) {
192 $log = 1;
193 # $log = File::Log->new({
194 # debug => 4,
195 # logFileName => 'myLogFile.log',
196 # logFileMode => '>',
197 # dateTimeStamp => 1,
198 # stderrRedirect => 1,
199 # defaultFile => 0,
200 # logFileDateTime => 1,
201 # appName => 'getSequenceInfo',
202 # PIDstamp => 0,
203 # storeExpText => 1,
204 # msgprepend => '',
205 # say => 1,
206 # });
207 }
208 }
209
210 #define folder separator and OS
211 if ($^O =~ /MSWin32/) {
212 $fldSep = "\\";
213 $actualOS = "MSWin32";
214 }
215
216 #LOG file
217 if ($log) { open (LOG, "log.txt") or die " error open log.txt $!:"; }
218
219
220 print "Working ...\n";
221
222 if ($kingdom eq "viruses") { $kingdom = "viral"; }
223
224 if (grep(/^$kingdom$/i, @availableKingdoms)) {
225
226 my @patternParametersList;
227
228 my @levelList = split /,/, $assemblyLevel;
229
230 if ($species ne "") {
231 my @speciesList = split(/,/, $species);
232
233 foreach my $actualSpecies (@speciesList) {
234 get_assembly_summary_species( $quantity, $releaseDate, $directory,$kingdom, $actualSpecies,
235 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log,
236 $getSummaries, $components, $ftpServor);
237 }
238 }
239 elsif ($assemblyTaxid ne "") {
240 my @taxidList = split(/,/, $assemblyTaxid);
241
242 foreach my $actualID (@taxidList) {
243 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species,
244 \@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log,
245 $getSummaries, $components, $ftpServor);
246 }
247 }
248 else {
249 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species,
250 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log,
251 $getSummaries, $components, $ftpServor);
252 }
253 }
254
255
256 if ($sequenceID) {
257 my @sequenceIDList = split /,/, $sequenceID;
258
259 foreach my $enaID (@sequenceIDList) {
260 sequence_ena($enaID, $log);
261 }
262 }
263
264 if ($sraID) {
265 my @sraIDList = split /,/, $sraID;
266
267 foreach my $sra (@sraIDList) {
268 download_ena_fastq($enaFtpServor, $sra, $log);
269 }
270 }
271
272 if ($assemblyPrjID) {
273 download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log);
274 }
275
276 #my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now();
277
278 #my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) =
279 # Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec,
280 # $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
281
282 #print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n";
283 print "Thank you for using $0!\n";
284 #print "Execution time: $D_y years, $D_m months, $D_d days, $Dh:$Dm:$Ds (hours:minutes:seconds)\n";
285 my $end = time();
286
287 my $total = $end - $start;
288 my $min = $total / 60;
289 my $hrs = $min / 60;
290
291 print "Total time: $total seconds OR $min minutes OR $hrs hours ! \n";
292
293 ### subroutine
294 # display global help document
295 sub help_user_simple {
296 my $programme = shift @_;
297 print STDERR "Usage : perl $programme [options] \n";
298 print "Type perl $programme -version or perl $programme -v to get the current version\n";
299 print "Type perl $programme -help or perl $programme -h to get full help\n";
300 }
301 #------------------------------------------------------------------------------
302 # display full help document
303 sub help_user_advance {
304 print <<HEREDOC;
305
306 Name:
307 $0
308
309 Synopsis:
310 A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories.
311
312 Usage:
313 perl $0 [options]
314 examples:
315 perl $0 -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01
316 perl $0 -k viruses -n 5 -date 2019-06-01
317 perl $0 -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results
318 perl $0 -ena BN000065
319 perl $0 -fastq ERR818002
320 perl $0 -fastq ERR818002,ERR818004
321
322 Kingdoms:
323 archaea
324 bacteria
325 fungi
326 invertebrate
327 plant
328 protozoa
329 vertebrate_mammalian
330 vertebrate_other
331 viral
332
333 Assembly levels:
334 "Complete Genome"
335 Chromosome
336 Scaffold
337 Contig
338
339 General:
340 -help or -h displays this help
341 -version or -v displays the current version of the program
342
343 Options ([XXX] represents the expected value):
344 -directory or -dir [XXX] allows to indicate the NCBI's nucleotide sequences repository (default: $directory)
345 -get or -getSummaries [XXX] allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...)
346 -k or -kingdom [XXX] allows to indicate kingdom of the organism (see the examples above)
347 -s or -species [XXX] allows to indicate the species (must be combined with -k option)
348 -taxid [XXX] allows to indicate a specific taxid (must be combined with -k option)
349 -assembly_or_project [XXX] allows to indicate a specific assembly accession or bioproject (must be combined with -k option)
350 -date [XXX] indicates the release date (with format yyyy-mm-dd) from which sequence information are available
351 -l or -level [XXX] allows to select a specific assembly level (e.g. "Complete Genome")
352 -o or -output [XXX] allows users to name the output result folder
353 -n or -number [XXX] allows to limit the total number of assemblies to be downloaded
354 -c or -components [XXX] allows to select specific components of the assembly (e.g. plasmid, chromosome, ...)
355 -ena [XXX] allows to download report and fasta file given a ENA sequence ID
356 -fastq [XXX] allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html)
357 -log allows to create a log file
358 HEREDOC
359 }
360 #------------------------------------------------------------------------------
361 # display program version
362 sub program_version {
363 my $programme = shift @_;
364 print "\n $programme, version : $version\n";
365 print "\n A perl script to get sequences informations\n";
366 }
367 #------------------------------------------------------------------------------
368 sub get_assembly_summary_species {
369 my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep,
370 $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_;
371
372 # assembly_summary.txt file from NCBI FTP site
373 #my $assemblySummary = get_summaries($ftpServor, $kingdom, $getSummaries, $directory);
374
375 my $assemblySummary = "";
376 if($path){
377 $assemblySummary = $path;
378 }
379 else{
380 $assemblySummary = download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries);
381 }
382
383 #lineage folder
384 # my $lineage_file = "/pub/taxonomy/new_taxdump/new_taxdump.tar.gz";
385
386 # allow to check old summary download
387 my $oldKingdom = "";
388
389 # start of output file
390 if ($log) {
391 print LOG "...Downloading assembly_summary.txt...\n";
392 }
393
394 # if ($actualOS =~ /Unix/i) {
395 #initialiaze tar manipulation
396 # my $tar = Archive::Tar->new;
397
398 #download taxdump folder
399 # download_file($ftpServor, $lineage_file);
400 # $tar->read("new_taxdump.tar.gz");
401 # $tar->extract_file("rankedlineage.dmp");
402 # }
403
404
405 #my $kingdomRep = $kingdom."_".$start_year."_".$start_month."_".$start_day;
406 #my $kingdomRep = $kingdom."_folder";
407 my $kingdomRep = "folder";
408
409 if ($mainFolder) { $kingdomRep = $mainFolder; }
410 mkdir $kingdomRep unless -d $kingdomRep;
411
412 # Repository for request
413
414 #my $repositoryAssembly = "assembly_repository_".$assemblyTaxid;
415 my $repositoryAssembly = "result";
416 mkdir $repositoryAssembly unless -d $repositoryAssembly;
417
418 my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly;
419 if (-d $oldAssemblyRep) { rmtree($oldAssemblyRep) }
420
421 # Repository for fna file
422 my $repositoryFNA = "Assembly";
423 mkdir $repositoryFNA unless -e $repositoryFNA;
424
425 # Repository for genbank file
426 my $repositoryGenbank = "GenBank";
427 mkdir $repositoryGenbank unless -e $repositoryGenbank;
428
429 # Reposotiry for report file
430 my $repositoryReport = "Report";
431 mkdir $repositoryReport unless -e $repositoryReport ;
432
433 # Repositories for required components
434 my %componentsRepHash;
435
436 if ($components) {
437 for my $component (split /,/, $components) {
438 my $specificRep = $component."_folder";
439 #my $specificRep = $component."_".$species."_".$kingdom."_".$start_year."_".$start_month."_".$start_day;
440 mkdir $specificRep unless -d $specificRep;
441 $componentsRepHash{$component} = $specificRep;
442 }
443 }
444
445 if ($log) {
446 print LOG "...Create kingdom and components repository...\n";
447 }
448
449 my %assemblyReportList;
450 my @assemblyRepresentationList = qw/Full Partial/;
451 my @levelList = @{$levelList};
452 my $checkCompleteGenome = grep(/complete genome/i, @levelList);
453
454 if ($checkCompleteGenome > 0) {@assemblyRepresentationList = qw/Full/;}
455
456 if (-e $assemblySummary) {
457 # Read file
458 open (SUM, $assemblySummary) or die " error open assembly_summary.txt $!:";
459 while(<SUM>) {
460 chomp;
461 if ($_ !~ m/^#/) {
462 my @infoList = split /\t/, $_;
463 my $foundAssemRep = grep (/$infoList[13]/i, @assemblyRepresentationList);
464 my $checkLevel = grep(/$infoList[11]/i, @levelList); #replace 11 by 10
465
466 if ($foundAssemRep > 0 && $checkLevel > 0) {
467 my $indexInfo = 0;
468 my $searchPattern = "";
469 my $regex = "";
470
471 if ($species !~ /^$/) {
472 $indexInfo = 7;
473 $searchPattern = $species;
474 $regex = qr/$searchPattern/i;
475 }
476 elsif ($assemblyTaxid !~ /^$/) {
477 $indexInfo = 5;
478 $searchPattern = $assemblyTaxid;
479 $regex = qr/^$searchPattern$/i;
480 }
481
482 if (($infoList[$indexInfo] =~ $regex) or ($kingdom !~ /^$/ && $searchPattern =~ /^$/)) {
483 my @gcfInfo = split(/\//, $infoList[19]);
484 my $gcfName = pop(@gcfInfo);
485 my $realDate = $infoList[14];
486 $realDate =~ s/\//-/g;
487
488 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz";
489 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz";
490 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt";
491
492 if ($realDate gt $releaseDate) {
493
494 $dnaFile = obtain_file($ftpServor, $dnaFile);
495 $genbankFile = obtain_file($ftpServor, $genbankFile);
496 $assemblyReport = obtain_file($ftpServor, $assemblyReport);
497
498 download_file($ftpServor, $dnaFile);
499 download_file($ftpServor, $genbankFile);
500 download_file($ftpServor, $assemblyReport);
501
502 if ($log) {
503 print LOG "...download FASTA and GenBank report files...\n";
504 }
505
506 # download sequences and check number of "N" characters
507 my $fileFasta = $gcfName."_genomic.fna.gz";
508 my $ucpFasta = $gcfName."_genomic.fna";
509 if (-e $fileFasta) {
510 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n";
511 move($ucpFasta, $repositoryFNA) or die "move failed: $!";
512 }
513
514 if ($log) {
515 print LOG "...Unzip FASTA file named $fileFasta ...\n";
516 }
517
518 # download genome report
519 my $fileReport = $gcfName."_assembly_report.txt";
520 if (-e $fileReport) {
521 my $fileInformations = $gcfName."_informations.xls";
522 move($fileReport, $repositoryReport) or die "move failed: $!";
523 }
524
525 # download genbank files
526 my $fileGenbank = $gcfName."_genomic.gbff.gz";
527 my $ucpGenbank = $gcfName."_genomic.gbff";
528 if (-e $fileGenbank) {
529 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n";
530 move($ucpGenbank, $repositoryGenbank) or die "move failed: $!";
531 }
532
533 if ($log) {
534 print LOG "...Unzip GenBank file $fileGenbank ...\n";
535 }
536
537 # association report and fasta
538 my $fileFastaGenbank = $ucpFasta . "," . $ucpGenbank;
539 $assemblyReportList{$fileReport} = $fileFastaGenbank;
540
541 if ($quantity) { $quantity -= 1; }
542
543 }
544 }
545 }
546 }
547 if (defined $quantity && $quantity == 0) {
548 $quantity = undef;
549 last;
550 }
551 }
552 close(SUM) or die "close file error : $!";
553
554 if (!keys %assemblyReportList) {
555 print "##################################################################\n";
556 print "No results were found for the following query:\n";
557 print "perl $0 @ARGV\n";
558 print "##################################################################\n\n";
559
560 if ($actualOS =~ /unix/i) { unlink glob "*.dmp *.gz" or die "for file *.dmp *.gz $!:"; }
561
562 if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; }
563 rmdir $repositoryAssembly or die "failed to remove directory $!:";
564 rmdir $repositoryFNA or die "failed to remove directory $!:";
565 rmdir $repositoryGenbank or die "failed to remove directory $!:";
566 rmdir $repositoryReport or die "failed to remove directory $!:";
567
568 if ($components) {
569 for my $componentRep (values %componentsRepHash) {
570 rmdir $componentRep or die "failed to remove directory $!:";
571 }
572 }
573 if ($log) {
574 print LOG "...No results were found for the following query: ";
575 print LOG "perl $0 @ARGV \n";
576 }
577
578 }
579 else {
580
581 if ($log) {
582 print LOG "...Results were found for the following query: ";
583 print LOG "perl $0 @ARGV \n";
584 }
585 # write summary files
586 my %componentsSumHash;
587 my @keysList = keys %assemblyReportList;
588 my $summary = "summary.xls";
589 my $htmlSummary = "summary.html";
590
591 if ($components) {
592 for my $component (split /,/, $components) {
593 my $specificSummary = $component. "_summary.xls";
594 $componentsSumHash{$component} = $specificSummary;
595 }
596 }
597
598 my $fileReport = "." . $fldSep. $repositoryReport . $fldSep . $keysList[0];
599 my @header = ();
600
601 open(FILE, $fileReport) or die "error open file : $!";
602 while(<FILE>) {
603 chomp;
604 if($_ =~ /:/){
605 $_ =~ s/^#*//;
606 my @ligne = split /:/, $_;
607 push(@header, $ligne[0]);
608 }
609 }
610 close(FILE) or die "error close file : $!";
611
612 open(HEAD, ">", $summary) or die " error open file : $!";
613 foreach(@header) {
614 print HEAD $_ . "\t";
615 }
616
617 print HEAD "Pubmed\tNucleScore\tClassification\t";
618 print HEAD "Country\tHost\tIsolation source\tA percent\t";
619 print HEAD "T percent\tG percent\tC percent\tN percent\tGC percent\t";
620 print HEAD "ATGC ratio\tLength\tShape\n";
621 close(HEAD) or die "error close file : $!";
622
623 if ($components) {
624 foreach my $componentSummary (values %componentsSumHash) {
625 open(SUM, ">>", $componentSummary) or die "error open file : $!";
626 print SUM "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t";
627 print SUM "GC percent\tA percent\tT percent\tG percent\tC percent\n";
628 close(SUM) or die "error close file : $!";
629 }
630 }
631
632 for my $file (@keysList) {
633 my $reportFile = $repositoryReport . $fldSep . $file;
634 my @fastaGenbank = split /,/, $assemblyReportList{$file};
635 my $extFasta = $fastaGenbank[0];
636 my $extGenbank = $fastaGenbank[1];
637 my $fnaFile = $repositoryFNA . $fldSep . $extFasta;
638 my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank;
639
640 write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
641 \%componentsSumHash, $kingdom, $actualOS, \@header, $log);
642
643 if ($log) {
644 print LOG "...Call write_assembly subroutine...\n";
645 }
646 }
647
648 write_html_summary($summary);
649
650 if ($log) {
651 print LOG "...Call write_html_summary subroutine...\n";
652 }
653
654 if ($components) {
655 my @componentList = keys %componentsSumHash;
656 my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList);
657
658 if (keys %componentFastaHash && $log) { $log->msg(1,"call create_component_sequence_file subroutine");}
659
660 foreach my $component (keys %componentFastaHash) {
661 move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
662 }
663 }
664
665 move($summary, $repositoryAssembly) or die "move failed: $!";
666 move($htmlSummary, $repositoryAssembly) or die "move failed: $!";
667 move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!";
668 move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!";
669 move($repositoryReport , $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!";
670 if ($log) {
671 print LOG "...move fasta, genbank and report to query folder \n";
672 }
673
674 if ($components) {
675 for my $component (keys %componentsSumHash) {
676 move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
677 move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!"
678 }
679 if ($log) {
680 print LOG "...move component files to folders\n";
681 }
682 }
683
684 move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!";
685
686 if ($log) {
687 print LOG "...move query folder to main folder\n";
688 }
689
690 # if ($actualOS =~ /unix/i) { unlink glob "*.dmp" or die "for file *.dmp $!:"; }
691 unlink glob "*.gz sequence.txt" or die "$!: for file *.gz sequence.txt";
692 }
693 }
694 }
695 #write general assembly file
696 sub write_assembly {
697 my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
698 $componentsSumHashRef, $kingdom, $actualOS, $headerRef, $log) = @_;
699
700 my %componentsSumHash = %{$componentsSumHashRef};
701 my @header = @{$headerRef};
702 my %hashInformations = ();
703 my $seq = "";
704 my $genomeName = "";
705 my $country = "na";
706 my $GCpercent = -1;
707 my $taxId = "na";
708 my $assemblyLine;
709 my $pubmedId = "na";
710 my $host = "na";
711 my $isoSource = "na";
712 # my $species = "na";
713 # my $genus = "na";
714 # my $family = "na";
715 # my $order = "na";
716 # my $class = "na";
717 # my $phylum = "na";
718 my $shape = "na";
719 my $geneNumber = "na";
720
721 open(REP, "<", $reportFile) or die "error open file $!:";
722 while (<REP>) {
723 chomp;
724 $_ =~ s/^#*//;
725 if ($_ =~ /assembled-molecule/i) { $assemblyLine = $_; }
726 if ($_ =~ /:/) {
727 my @line = split /:/, $_;
728 if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); }
729 }
730 }
731 close(REP) or die "error close file $!:";
732
733
734 open(SUM, ">>", $summary) or die "error open file $!:";
735 foreach my $k(@header) {
736 if (grep $_ eq $k, keys %hashInformations) {
737
738 my $information = $hashInformations{$k};
739
740 if ($k =~ /Assembly name/i) { $genomeName = $information; }
741
742 if ($information =~ /^\s*$/) {
743 print SUM "na\t";
744 } else {
745 print SUM $information . "\t";
746 }
747 } else {
748 print SUM "na\t";
749 }
750 }
751 close(SUM) or die "error close file $!:";
752
753 open(FNA, "<", $fnaFile) or die "Could not open $!:";
754 while (<FNA>) {
755 chomp;
756 if ($_ !~ /^>/) { $seq .= $_; }
757 }
758 close(FNA) or die "error close file :$!";
759
760 foreach my $summaryKey (keys %hashInformations) {
761 if ($summaryKey =~ /taxid/i) {
762 $taxId = $hashInformations{$summaryKey};
763 }
764 }
765
766 my $classification = get_taxonomic_rank_genbank($genbankFile);
767
768 if ($log) {
769 print LOG "...get taxonomic rank from genbank file\n";
770 }
771
772 $GCpercent = gc_percent($seq);
773 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile);
774 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
775 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt);
776
777 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);
778
779 my $variance = shift_data_variance(@percentList);
780 my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length);
781
782 if ($log) {
783 print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n";
784 }
785
786 open(GBFF, "<", $genbankFile) or die "Error open file $!:";
787 while(<GBFF>) {
788 chomp;
789 if ($_ =~ /\/country="(.*)"/) { $country = trim($1); }
790 if ($_ =~ /PUBMED(.*)/) { $pubmedId = trim($1); }
791 if ($_ =~ /\/host="(.*)"/) { $host = trim($1); }
792 if ($_ =~ /\/isolation_source="(.*)"/) { $isoSource = trim($1); }
793 if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); }
794 if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); }
795 }
796 close(GBFF) or die "error close file $!:";
797
798
799 open(SUM, ">>", $summary) or die "error open file $!:";
800 print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ;
801 print SUM $country . "\t" . $host . "\t" . $isoSource . "\t" . $aPercent . "\t" ;
802 print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent ."\t" . $nPercent . "\t";
803 print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n";
804 close(SUM) or die "error close file: $!";
805
806 if (%componentsSumHash) {
807 write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log);
808 }
809 }
810 #------------------------------------------------------------------------------
811 # get assembly component
812 sub write_assembly_component {
813 my($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_;
814
815 my %componentsSumHash = %{$componentsSumHashRef};
816 my $status = "na";
817 my $level = "na";
818 my $gcpercent;
819 my $tmp_fasta_file = "sequence.txt";
820 my @desc = ();
821
822 # check each sequence from (multi-)fasta file
823 my ($seq, $inputfile);
824
825 if ($log) {
826 print LOG "...search specific components\n";
827 }
828
829 # extract sequence details
830 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$fnaFile);
831
832 while ($seq = $seqIO->next_seq()) {
833 my $seqID = $seq->id; # ID of sequence
834 my $seqDesc = $seq->desc; # Description of sequence
835 my $globalSeq = $seq->seq;
836 my $seqLength = $seq->length();
837
838 open(TSEQ, ">", $tmp_fasta_file) or die "Error open file: $!";
839 print TSEQ $globalSeq;
840 close(TSEQ);
841
842 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file);
843
844 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
845
846 $gcpercent = gc_percent($globalSeq);
847
848 @desc = split /,/, $seqDesc;
849
850 if ($desc[1]) { $level = $desc[1]; }
851
852 foreach my $component (keys %componentsSumHash) {
853 if ($desc[0] =~ /$component/) {
854 $status = $component;
855 my $info = $seqID . "\t" . $genomeName ."\t" . $seqDesc . "\t" . $seqLength . "\t" . $status . "\t" . $level ."\t";
856 $info.= $gcpercent."\t". $aPercent ."\t". $tPercent ."\t". $gPercent ."\t". $cPercent . "\n";
857 add_to_file($componentsSumHash{$component}, $info);
858 if ($log) {
859 print LOG "...found component $component \n";
860 }
861 }
862 }
863 }
864 }
865 #------------------------------------------------------------------------------
866 # download fasta sequence and report on ENA with assembly ID
867 sub get_fasta_and_report_sequence_ena_assembly {
868 my($sequenceID) = @_;
869 my $tmp_file = "fichier.txt";
870 my @id_list = ();
871 my $id_chain = "";
872 my $fasta_file = "";
873 my $report_file = "";
874 my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml";
875 my $output = get($url);
876
877 open(TMP, ">", $tmp_file) or die("could not open $!");
878 print TMP $output;
879 close(TMP) or die("could not close $!");
880
881 open(TMP, "<", $tmp_file) or die("could not open $!");
882 while(<TMP>){
883 if($_ =~ /<CHROMOSOME accession="(.*)">/){
884 push(@id_list, $1)
885 }
886 }
887 close(TMP) or die("could not close $!");
888
889 $id_chain = join(",", @id_list);
890 $url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta";
891 $output = get($url);
892 $fasta_file = $sequenceID . ".fasta";
893 open(FILE, ">", $fasta_file) or die("could not open $!");
894 print FILE $output;
895 close(FILE) or die("could not close $!");
896
897
898 $report_file = $sequenceID . "_report.txt";
899 for my $id (@id_list) {
900 $url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true";
901 $output = get($url);
902 open(FILE, ">>", $report_file) or die("could not open $!");
903 print FILE $output;
904 print FILE "##########################################################################\n\n";
905 close(FILE) or die("could not close $!");
906 }
907
908 unlink "fichier.txt" or die "error delete file :$!";
909
910 return ($fasta_file, $report_file);
911 }
912 #------------------------------------------------------------------------------
913 # download ENA
914 sub sequence_ena {
915 my($sequenceID, $log) = @_;
916 #my $assemblyRep = $sequenceID . "_folder";
917 my $fastaFile;
918 my $reportFile;
919
920 #if(-d $assemblyRep) { rmtree($assemblyRep); }
921 #mkdir $assemblyRep;
922
923 if ($log) {
924 print LOG "...ENA sequence downloading ...\n";
925 }
926 #if ($log) {$log->msg(1, "Creation of repository: $assemblyRep\n");}
927
928 if($sequenceID =~ /^GCA_.*/){
929 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_assembly($sequenceID);
930 }
931 else {
932 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_other($sequenceID);
933 }
934
935 if ($log) {
936 print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n";
937 }
938
939 #move($fastaFile, $assemblyRep) or die "move failed: $!";
940 #move($reportFile, $assemblyRep) or die "move failed: $!";
941
942 if ($log) {
943 print LOG "...Move fasta and report files to folder\n";
944 }
945 }
946 #------------------------------------------------------------------------------
947 # download fasta sequence and report on ENA with ENA ID
948 sub get_fasta_and_report_sequence_ena_other {
949 my($sequenceID) = @_;
950 my $fasta_file = "";
951 my $report_file = "";
952 my $url;
953 my $output;
954
955 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta";
956 #if($actualOS eq "MSWin32"){
957 $output = get($url);
958 $fasta_file = $sequenceID . ".fasta";
959 open(FILE, ">", $fasta_file) or die("could not open $!");
960 print FILE $output;
961 close(FILE) or die "could not close $!";
962 #}
963 #else{
964 # system("wget $url");
965 #}
966
967 $output = "";
968
969 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true";
970 #if($actualOS eq "MSWin32"){
971 $output = get($url); # replace by wget????
972 $report_file = $sequenceID . "_report.txt";
973 open(FILE, ">>", $report_file) or die "could not open: $!";
974 print FILE $output;
975 close(FILE) or die "could not close $!";
976 #}
977 #else{
978 # system("wget $url");
979 #}
980
981 return ($fasta_file, $report_file);
982 }
983 #------------------------------------------------------------------------------
984 # add information to file
985 sub add_to_file {
986 my ($file, $info) = @_;
987 open(FILE, ">>", $file) or die ("Could not open $!");
988 print FILE $info;
989 close(FILE);
990 }
991 #------------------------------------------------------------------------------
992 # return taxonomic rank of species by tax id
993 sub get_taxonomic_rank {
994 my($tax_id, $taxonomic_file) = @_;
995 my $species = "na";
996 my $genus = "na";
997 my $family = "na";
998 my $order = "na";
999 my $class = "na";
1000 my $phylum = "na";
1001
1002 # my ($species,$genus,$family,$order,$class,$phylum);
1003 my @tmp_array = ($species, $genus, $family, $order, $class, $phylum);
1004
1005 open(TFILE, "<", $taxonomic_file) or
1006 die("Could not open $taxonomic_file: $!");
1007
1008 while(<TFILE>) {
1009 chomp;
1010 my @tax_info = split(/\|/, $_);
1011
1012 if ($tax_info[0] == $tax_id) {
1013 @tax_info = trim_array(@tax_info);
1014
1015 $tmp_array[0] = $tax_info[1];
1016 splice(@tax_info, 0, 3);
1017
1018 for(my $i = 1; $i < $#tmp_array + 1; $i++) {
1019 if (length($tax_info[$i-1]) > 0) { $tmp_array[$i] = $tax_info[$i-1]; }
1020 }
1021 close(TFILE) or die "error close $taxonomic_file $!:";
1022 return @tmp_array;
1023 }
1024 }
1025 close(TFILE) or die "error close $taxonomic_file $!:";
1026 }
1027 #------------------------------------------------------------------------------
1028 # write html summary file
1029 sub write_html_summary {
1030 my($summary) = @_;
1031 my $htmlFile = "summary.html";
1032 my $header = "";
1033 my @fileToRead = ();
1034
1035 open(HTML, ">", $htmlFile) or die "error open HTML summary $!";
1036 print HTML "<!DOCTYPE html>\n";
1037 print HTML "<html>\n";
1038 print HTML " <head>\n";
1039 print HTML " <title>Assembly summary</title>\n";
1040 print HTML " </head>\n";
1041 print HTML " <body>\n";
1042 print HTML " <h2>Assembly Summary</h2>\n";
1043 close(HTML) or die "error close HTML summary $!";
1044
1045 open(SUM, "<", $summary) or die "error open tsv summary $!";
1046 @fileToRead = <SUM>;
1047 close(SUM) or die "error close tsv summary $!";
1048
1049 $header = splice(@fileToRead, 0, 1);
1050
1051 for my $line (@fileToRead) {
1052 write_html_table($line, $htmlFile, $header);
1053 }
1054
1055 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
1056 print HTML " </body>\n";
1057 print HTML "</html>\n";
1058 close(HTML) or die "error close HTML summary $!";
1059 }
1060 #------------------------------------------------------------------------------
1061 # write html table for summary
1062 sub write_html_table {
1063 my ($line, $htmlFile, $header) = @_;
1064
1065 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
1066 print HTML " <table border=\"1\" style=\"margin-bottom: 20px;\">\n";
1067 close(HTML) or die "error close HTML summary $!";
1068 add_table_content($line, $htmlFile, $header);
1069 }
1070 #------------------------------------------------------------------------------
1071 # add information to table
1072 sub add_table_content {
1073 my ($line, $htmlFile, $headers) = @_;
1074
1075 my @assemblyHeader = split(/\t/, $headers);
1076 my @assemblyInfo = split(/\t/, $line);
1077 my %hashHeaderInfo;
1078 my $nbOfCell = 7;
1079 my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell);
1080 my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell;
1081
1082
1083 for (my $i = 0; $i < $#assemblyHeader + 1; $i++) {
1084 $hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i];
1085 }
1086
1087 my @keysHeaderInfo = keys %hashHeaderInfo;
1088 my $cellIndex = 0;
1089
1090 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
1091 for (my $turn = 0; $turn < $fullLine; $turn++) {
1092
1093 print HTML " <tr>\n";
1094 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
1095 print HTML " <th>$header</th>\n";
1096 }
1097 print HTML " </tr>\n";
1098
1099 print HTML " <tr>\n";
1100 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
1101 if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") {
1102 print HTML " <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=".
1103 "$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>";
1104 }
1105 else {
1106 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n";
1107 }
1108 }
1109 print HTML " </tr>\n";
1110
1111 $cellIndex += $nbOfCell;
1112 }
1113
1114 print HTML " <tr>\n";
1115 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
1116 print HTML " <th>$header</th>\n";
1117 }
1118 print HTML " <tr>\n";
1119
1120 print HTML " <tr>\n";
1121 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
1122 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n";
1123 }
1124 print HTML " <tr>\n";
1125
1126 print HTML " </table>\n";
1127 close(HTML) or die "error close HTML summary $!";
1128 }
1129 #------------------------------------------------------------------------------
1130 #getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file)
1131 sub get_taxonomic_rank_genbank {
1132 my ($genbank) = @_;
1133
1134 my $seqio_object = Bio::SeqIO->new(-file => $genbank);
1135 my $seq_object = $seqio_object->next_seq;
1136
1137 # legible and long
1138 my $species_object = $seq_object->species;
1139 my $species_string = $species_object->node_name;
1140
1141 # get all taxa from the ORGANISM section in an array
1142 my @classification = $seq_object->species->classification;
1143 # my $arraySize = @classification;
1144
1145 # print "@classification\n";
1146
1147 # if($arraySize == 7){
1148 # ($species,$genus,$family,$order,$class,$phylum,$kingdomGB) = @classification;
1149 # }
1150 # elsif($arraySize == 4){
1151 # ($species,$class,$phylum,$kingdomGB) = @classification;
1152 # }
1153
1154 my $classification = join(",", @classification);
1155
1156 return ($classification);
1157 }
1158 #------------------------------------------------------------------------------
1159 #add all sequences components to file
1160 sub create_component_sequence_file {
1161 my ($fldSep, $repository, $listComponentRef) = @_;
1162
1163 my @listFnaFile;
1164 my @listComponent = @{$listComponentRef};
1165
1166 opendir(my $dh, $repository) || die "Can't opendir $repository: $!";
1167 @listFnaFile = grep{/fna$/} readdir($dh);
1168 closedir $dh;
1169
1170 my %componentFastaHash;
1171
1172 foreach my $component (@listComponent) {
1173
1174 my $componentFasta = $component.".fasta";
1175
1176 foreach my $fnaFile (@listFnaFile) {
1177
1178 # my $actualFile = $repository . $fldSep . $fnaFile;
1179
1180 my $seq;
1181 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$repository . $fldSep . $fnaFile);
1182
1183 while ($seq = $seqIO->next_seq()) {
1184
1185 my $seqDesc = $seq->desc;
1186
1187 if ($seqDesc =~ /$component/) {
1188 my $seqID = $seq->id;
1189 my $seqNuc = $seq->seq;
1190 my $shift = 60;
1191 my @seqArray = split //, $seqNuc;
1192 my $newSeqNuc = "";
1193
1194 if (length $seqNuc <= $shift) {
1195 $newSeqNuc = $seqNuc;
1196 }
1197 else {
1198 for(my $i = 0; $i < $#seqArray + 1; $i ++) {
1199 $newSeqNuc .= $seqArray[$i];
1200 if (($i + 1) % $shift == 0) { $newSeqNuc .= ","; }
1201 }
1202 }
1203
1204 open(FASTA, ">>", $componentFasta) or die "error open file $!:";
1205 print FASTA ">$seqID $seqDesc\n";
1206 foreach my $subSeqNuc (split /,/, $newSeqNuc) {
1207 print FASTA "$subSeqNuc\n";
1208 }
1209 close(FASTA) or die "error close file $!:";
1210 }
1211 }
1212 }
1213 if (-e $componentFasta) { $componentFastaHash{$component} = $componentFasta; }
1214 }
1215 return %componentFastaHash;
1216 }
1217 # remove back and front spaces
1218 sub trim {
1219 my ($string) = @_;
1220 $string =~ s/^\s+//;
1221 $string =~ s/\s+$//;
1222 return $string;
1223 }
1224 #------------------------------------------------------------------------------
1225 # use trim in array
1226 sub trim_array {
1227 my (@array) = @_;
1228 foreach my $value (@array) {
1229 $value = trim($value);
1230 }
1231 return @array;
1232 }
1233 #------------------------------------------------------------------------------
1234 # check if folder is empty
1235 sub empty_folder {
1236 my $dirname = shift;
1237 opendir(my $dholder, $dirname) or die "error not a directory";
1238 my $isEmpty = scalar(grep { $_ ne "." && $_ ne ".." } readdir($dholder));
1239 if ($isEmpty == 0) { return $isEmpty; }
1240 }
1241 #------------------------------------------------------------------------------
1242 # number nucleotid and length
1243 sub number_nuc_length_seq {
1244 my ($fastaFile) = @_;
1245 my $ade = 0;
1246 my $thy = 0;
1247 my $gua = 0;
1248 my $cyt = 0;
1249 my $n = 0;
1250 my $length = 0;
1251
1252 open (FASTA, "<", $fastaFile) or die "Could not open $!";
1253 while (<FASTA>) {
1254 chomp;
1255 if ($_ !~ />/) {
1256 my @seq = split //, $_;
1257
1258 for my $nuc (@seq) {
1259 $length +=1 ;
1260 if ($nuc =~ /a/i) {$ade+=1;}
1261 elsif ($nuc =~ /t/i) {$thy+=1;}
1262 elsif ($nuc =~ /g/i) {$gua+=1;}
1263 elsif ($nuc =~ /c/i) {$cyt+=1;}
1264 elsif ($nuc =~ /n/i) {$n+=1;}
1265 }
1266 }
1267 }
1268 close(FASTA) or die "Error close file :$!";
1269 return ($ade, $thy, $gua, $cyt, $n, $length);
1270
1271 }
1272 #------------------------------------------------------------------------------
1273 # compute percentage of nucleotid
1274 sub nucleotid_percent {
1275 my($ade, $thy, $gua, $cyt, $n, $length) = @_;
1276
1277 my $adePercent = $ade / $length * 100;
1278 my $thyPercent = $thy / $length * 100;
1279 my $guaPercent = $gua / $length * 100;
1280 my $cytPercent = $cyt / $length * 100;
1281 my $nPercent = $n / $length * 100;
1282
1283 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
1284
1285 }
1286 #------------------------------------------------------------------------------
1287 # compute ATGC ratio
1288 sub atgc_ratio {
1289 my ($ade, $thy, $gua, $cyt) = @_;
1290 return (($ade + $thy) / ($gua + $cyt));
1291 }
1292 #------------------------------------------------------------------------------
1293 # variance
1294 sub shift_data_variance {
1295 my (@data) = @_;
1296
1297 if ($#data + 1 < 2) { return 0.0; }
1298
1299 my $K = $data[0];
1300 my ($n, $Ex, $Ex2) = 0.0;
1301
1302 for my $x (@data) {
1303 $n = $n + 1;
1304 $Ex += $x - $K;
1305 $Ex2 += ($x - $K) * ($x - $K);
1306 }
1307
1308 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1)
1309
1310 return $variance;
1311
1312 }
1313 #------------------------------------------------------------------------------
1314 # nucle score
1315 sub nucle_score {
1316 my ($variance, $gcPercent, $atgcRatio, $length) = @_;
1317 #return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length));
1318 return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length));
1319 }
1320 #------------------------------------------------------------------------------
1321 sub log2 {
1322 my $n = shift;
1323 return (log($n) / log(2));
1324 }
1325 #------------------------------------------------------------------------------
1326 # compute GC pourcent
1327 sub gc_percent {
1328 my ($seq) = @_;
1329
1330 my @charSeq = split(//, uc($seq));
1331 my %hashFlank = ();
1332
1333 foreach my $v (@charSeq) {
1334 $hashFlank{$v} += 1;
1335 }
1336
1337 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;}
1338 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;}
1339
1340 if(length($seq) == 0) {
1341 return 0;
1342 }
1343 else {
1344 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100;
1345 }
1346
1347 }
1348 #------------------------------------------------------------------------------
1349 # download file from ftp protocol
1350 sub download_file {
1351 my($servor, $file) = @_;
1352
1353 #if($actualOS eq "MSWin32"){
1354 my $ftp = Net::FTP->new($servor, Debug => 0)
1355 or die "Cannot connect to $servor";
1356
1357 $ftp->login("anonymous", "-anonymous@")
1358 or die "Cannot login ", $ftp->message;
1359 $ftp->binary;
1360 $ftp->get($file) or die "get failed ", $ftp->message;
1361
1362 $ftp->quit;
1363 #}
1364 #else{
1365 # system("wget $file");
1366 #}
1367 }
1368 #------------------------------------------------------------------------------
1369 # obtain file directory
1370 sub obtain_file {
1371 my($servor, $link) = @_;
1372 if ($link =~ /$servor(.*)/) { return ($1); }
1373 }
1374 #------------------------------------------------------------------------------
1375 # download fastq file from ENA
1376 sub download_ena_fastq {
1377 my ($enaFtpServor, $sraId, $log) = @_;
1378
1379 my $fastqDir = "/vol1/fastq/";
1380 my $dir1 = substr $sraId, 0, 6;
1381 my $dir2 = "000";
1382 my $digits = substr $sraId, 3;
1383 my $fastqRep = $sraId . "_folder";
1384
1385 if ($log) {
1386 print LOG "...Downloading fastq file from ENA\n";
1387 }
1388
1389 if (length $digits == 6) {
1390 $dir2 = $sraId;
1391 $fastqDir .= $dir1 . "/" . $dir2 . "/";
1392 }
1393 elsif (length $digits > 6) {
1394 my $digitsNumber = 0;
1395 my @digitsList = split //, (substr $digits, 6);
1396
1397 foreach my $char (@digitsList) {
1398 if (length $dir2 >= 1) {
1399 $dir2 = substr $dir2, 0, (length $dir2) - 1;
1400 $digitsNumber += 1;
1401 }
1402 }
1403 $dir2 .= substr $digits, -$digitsNumber;
1404 $fastqDir .= $dir1 . "/" . $dir2 . "/" . $sraId . "/";
1405 }
1406
1407 if ($log) {
1408 print LOG "...recreate database folder path for FASTQ downloading\n";
1409 }
1410
1411 my $ftp = Net::FTP->new($enaFtpServor, Debug => 0)
1412 or die "Cannot connect to $enaFtpServor";
1413
1414 $ftp->login("anonymous", "-anonymous@")
1415 or die "Cannot login ", $ftp->message;
1416 $ftp->binary;
1417
1418 $ftp->cwd($fastqDir)
1419 or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message;
1420
1421 my @fastqFiles = $ftp->ls("$sraId*");
1422
1423 if ($log) {
1424 print LOG "...Searching fastq files in path\n";
1425 }
1426
1427 if (!grep(/^$/, @fastqFiles)) {
1428
1429 if (-d $fastqRep) { rmtree($fastqRep) }
1430 mkdir $fastqRep;
1431
1432 foreach my $fastqFile (@fastqFiles) {
1433 #if($actualOS eq "MSWin32"){
1434 $ftp->get($fastqFile) or die "get failed ", $ftp->message;
1435 #}
1436 #else{
1437 # system("wget $fastqFile");
1438 #}
1439
1440 #my @baseAndExt = split /\./, $fastqFile;
1441 #my $unzipFastq = $baseAndExt[0] . ".fastq";
1442
1443 #gunzip $fastqFile => $unzipFastq or die "gunzip failed: $GunzipError\n";
1444 move($fastqFile, $fastqRep) or die "move failed: $!"; # DC replaced $unzipFastq by $fastqFile
1445 }
1446 #unlink glob "*fastq.gz" or die "$!: for file *fastq.gz";
1447
1448 if ($log) {
1449 print LOG "...Finalizing download of FASTQ file\n";
1450 }
1451 }
1452
1453 if ($log) {
1454 print LOG "End of download\n";
1455 }
1456
1457 $ftp->quit;
1458 }
1459 #------------------------------------------------------------------------------
1460 # download fastq file from ENA
1461 sub get_assembly_or_project {
1462 my ($file, $sequence, $ftpServor, $fldSep, $log) = @_;
1463
1464 my $pattern;
1465 my $indexInfo;
1466 my %folderHash;
1467
1468 # Repository for fna file
1469 my $repositoryFNA = "Assembly";
1470
1471 # Repository for genbank file
1472 my $repositoryGenbank = "GenBank";
1473
1474 # Reposotiry for report file
1475 my $repositoryReport = "Report";
1476
1477 # global repository
1478 my $repositorySequence = $sequence;
1479
1480
1481 if ($sequence =~ /^GC[AF]_(.*)/) {
1482 $indexInfo = 0;
1483 $pattern = $1;
1484 }
1485 elsif ($sequence =~ /^PRJ/) {
1486 $indexInfo = 1;
1487 $pattern = $sequence;
1488 }
1489
1490 open(SUM, $file) or die "error open file $!:";
1491 while(<SUM>) {
1492 chomp;
1493 if ($_ !~ /^#/) {
1494 my @infoList = split /\t/, $_;
1495 if ($infoList[$indexInfo] =~ /$pattern/) {
1496 my @gcfInfo = split(/\//, $infoList[19]);
1497 my $gcfName = pop(@gcfInfo);
1498
1499
1500 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz";
1501 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz";
1502 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt";
1503
1504 $dnaFile = obtain_file($ftpServor, $dnaFile);
1505 $genbankFile = obtain_file($ftpServor, $genbankFile);
1506 $assemblyReport = obtain_file($ftpServor, $assemblyReport);
1507
1508 download_file($ftpServor, $dnaFile);
1509 download_file($ftpServor, $genbankFile);
1510 download_file($ftpServor, $assemblyReport);
1511
1512 # download sequences and check number of "N" characters
1513 my $fileFasta = $gcfName."_genomic.fna.gz";
1514 my $ucpFasta = $gcfName."_genomic.fna";
1515 if (-e $fileFasta) {
1516 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n";
1517 $folderHash{$ucpFasta} = $repositoryFNA;
1518 }
1519
1520 # download genome report
1521 my $fileReport = $gcfName."_assembly_report.txt";
1522 if (-e $fileReport) {
1523 $folderHash{$fileReport} = $repositoryReport;
1524 }
1525
1526 # download genbank files
1527 my $fileGenbank = $gcfName."_genomic.gbff.gz";
1528 my $ucpGenbank = $gcfName."_genomic.gbff";
1529 if (-e $fileGenbank) {
1530 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n";
1531 $folderHash{$ucpGenbank} = $repositoryGenbank;
1532 }
1533
1534 }
1535 }
1536 }
1537 close(SUM) or die "error close file $!";
1538
1539 if (keys %folderHash) {
1540 if (-e $repositorySequence) {rmtree($repositorySequence);}
1541
1542 if ($log) {
1543 print LOG "...Download files from GenBank or RefSeq \n";
1544 }
1545
1546 mkdir $repositorySequence;
1547 mkdir $repositoryFNA;
1548 mkdir $repositoryGenbank;
1549 mkdir $repositoryReport;
1550
1551 for my $ucpFile (keys %folderHash) {
1552 move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:";
1553 }
1554 move($repositoryFNA, $repositorySequence . $fldSep. $repositoryFNA) or die "error move file $!:";
1555 move($repositoryGenbank, $repositorySequence . $fldSep. $repositoryGenbank) or die "error move file $!:";
1556 move($repositoryReport, $repositorySequence . $fldSep. $repositoryReport) or die "error move file $!:";
1557 unlink glob "*.gz" or die "for file *.gz $!:";
1558
1559 if ($log) {
1560 print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n";
1561 }
1562 }
1563
1564 }
1565 #------------------------------------------------------------------------------
1566 sub download_assembly_or_project {
1567 my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_;
1568
1569 my $assemblySummary;
1570 my @sequenceIdList = split /,/, $sequenceId;
1571
1572 if ($directory =~ /refseq/) {
1573 $assemblySummary = "assembly_summary_refseq.txt";
1574 } elsif ($directory =~ /genbank/) {
1575 $assemblySummary = "assembly_summary_genbank.txt";
1576 }
1577
1578 my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary;
1579 download_file($ftpServor, $assemblySummaryPath);
1580
1581 foreach my $sequence (@sequenceIdList) {
1582 get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log);
1583 }
1584
1585 unlink $assemblySummary or die "error remove file $!:";
1586 }
1587 #------------------------------------------------------------------------------
1588 # check if all required module are install
1589 sub isModuleInstalled {
1590 my $mod = shift;
1591
1592 #eval("use $mod");
1593 my $commandModule = `perldoc -l $mod`;
1594
1595 if ($commandModule) {
1596 return(1);
1597 } else {
1598 return(0);
1599 }
1600 }
1601 #------------------------------------------------------------------------------
1602 # download assembly summary
1603 sub download_summaries {
1604 my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_;
1605
1606 my $assemblySummary = "assembly_summary.txt";
1607 my $assemblySummaryLink;
1608 my $fileName;
1609
1610 opendir my $workingDirectory, "." . $fldSep or die "error open dir $!:";
1611 my @filesList = readdir $workingDirectory;
1612 closedir $workingDirectory;
1613
1614 if ($getSummaries) {
1615 foreach my $summaryKingdom (split /,/, $getSummaries) {
1616 foreach my $file (@filesList) {
1617 if ($file =~ /assembly_summary.txt/i && $file =~ /$summaryKingdom/i && $file =~ /$database/) {
1618 unlink $file or die "error remove file $!:";
1619 $assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt";
1620 download_file($ftpServor, $assemblySummaryLink);
1621 $fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt";
1622 rename $assemblySummary, $fileName;
1623 print "replace assembly_summary file\n";
1624 }
1625 }
1626 }
1627 }
1628
1629 foreach my $file (@filesList) {
1630 if ($file =~ /assembly_summary.txt/i && $file =~ /$kingdom/i && $file =~ /$database/) {
1631 return $file;
1632 }
1633 }
1634
1635 $assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt";
1636 $fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt";
1637 download_file($ftpServor, $assemblySummaryLink);
1638 rename $assemblySummary, $fileName;
1639
1640 return $fileName;
1641 }
1642 #---------------------
1643 sub bioseqio {
1644
1645 my ($keyword, $file) = @_;
1646 local $/ = "\n>"; # read by FASTA record
1647
1648 my $count = 0;
1649
1650 open FASTA, $file;
1651 while (<FASTA>) {
1652 chomp;
1653 my $seq = $_;
1654 #my ($id) = $seq =~ /^>*(\S+)/; # parse ID as first word in FASTA header
1655 if ($seq =~ /^>*.*$keyword/) {
1656 #$seq =~ s/^>*.+\n//; # remove FASTA header
1657 #$seq =~ s/\n//g; # remove endlines
1658 $count++;
1659 print "\nThe sequence number is: $count \n";
1660 print ">$seq";
1661 }
1662 }
1663 close FASTA;
1664 }