Mercurial > repos > dcouvin > getsequenceinfo

diff getSequenceInfo.pl @ 0:19ae17458c14 draft default tip
Uploaded
author: dcouvin
date: Wed, 15 Sep 2021 21:35:09 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getSequenceInfo.pl	Wed Sep 15 21:35:09 2021 +0000
@@ -0,0 +1,1664 @@
+#!/usr/bin/perl
+
+################################################################################
+## "Copyright 2019 Vincent Moco and David Couvin"
+## licence GPL-3.0-or-later
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+##  (at your option) any later version.
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+## You should have received a copy of the GNU General Public License
+## along with this program.  If not, see <https://www.gnu.org/licenses/>. 
+################################################################################
+
+use strict;
+use warnings;
+
+my $version = "1.0.1"; # version
+
+#my $number = 50;
+
+# Date and time of the current day (Beginning)
+#my ($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now();
+my $start = time();
+
+print "##################################################################\n";
+print "## ---> Welcome to $0 (version $version)!\n";
+#print "## Start Date (yyyy-mm-dd, hh:min:sec): $start_year-$start_month-$start_day, $start_hour:$start_min:$start_sec\n";
+print "##################################################################\n\n";
+
+#BioPerl, Date::Calc, File::Log, have been removed from the @modules
+my @modules = qw(
+	Archive::Tar
+	Bio::SeqIO
+	Bio::Species
+	File::Copy
+	File::Path
+	Net::FTP
+	IO::Uncompress::Gunzip
+	LWP::Simple
+	POSIX
+	
+);
+
+foreach my $module (@modules) {
+	if (isModuleInstalled($module)) {
+	  print "$module is.................installed!\n";
+	} 
+	else {
+	  print "$module is not installed. Please install it and try again.\n";
+	  print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n";
+	  print "cpan -i -f $module\n";
+	  #system("cpan -i -f $module");
+	  exit 1;
+	}
+}
+print "\n";
+
+use Archive::Tar;
+use Bio::SeqIO;
+use Bio::Species;
+#use Date::Calc qw(:all);
+use File::Copy qw(cp move);
+use File::Path qw(rmtree);
+use Net::FTP;
+use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
+use LWP::Simple qw(get);
+use POSIX qw(floor);
+#use File::Log;
+
+####################################################################################################
+##  A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories.
+##  
+####################################################################################################
+
+### main program
+### parameters
+my $directory = "genbank";
+	
+my $kingdom = ""; # kingdom of organism
+
+my $releaseDate = "0000-00-00"; # sequences are downloaded from this release date
+
+my $components;  # components of the assembly
+
+my $species = ""; # species name
+
+my $getSummaries; # indicates if a new assembly summary is required
+
+my $assemblyLevel = "Complete Genome,Chromosome,Scaffold,Contig"; # assembly level of the genome
+
+my $quantity; # limit number of assemblies to download
+
+my $sequenceID;
+
+my $ftpServor = "ftp.ncbi.nlm.nih.gov";
+
+my $enaFtpServor = "ftp.sra.ebi.ac.uk";
+
+my $fldSep = "/"; # folder seperation change by OS 
+
+my @availableKingdoms = (
+	"archaea",
+	"bacteria",
+	"fungi",
+	"invertebrate",
+	"plant",
+	"protozoa",	
+	"vertebrate_mammalian",
+	"vertebrate_other",
+	"viral"
+);  # list of available kingdoms
+
+my $actualOS = "Unix"; # OS of the computer
+
+my $mainFolder; # folder in which the assemblies results are stored
+
+my $assemblyTaxid = ""; # taxid for assembly
+
+my $sraID; # SRA sequence ID
+
+my $assemblyPrjID; # assembly or prj ID
+ 
+my $log; # log 
+
+my $path = "";
+
+
+if (@ARGV<1) {
+	help_user_simple($0);
+	exit 0;
+}
+
+if ($ARGV[0] eq "-help" || $ARGV[0] eq "-h") {
+	help_user_advance($0);
+	exit 0;
+}
+
+if ($ARGV[0] eq "-version" || $ARGV[0] eq "-v") {
+	program_version($0);
+	exit 0;	
+}
+
+##requirements
+for (my $i = 0; $i <= $#ARGV; $i++) {
+    if ($ARGV[$i]=~/-kingdom/i or $ARGV[$i]=~/-k/i) {
+		$kingdom = $ARGV[$i+1];
+    }
+    elsif ($ARGV[$i]=~/-path/i or $ARGV[$i]=~/-pathSummary/i) {
+                $path = $ARGV[$i+1];
+    }
+    elsif ($ARGV[$i]=~/-directory/i or $ARGV[$i]=~/-dir/i) {
+    	$directory = $ARGV[$i+1];
+    }
+    elsif ($ARGV[$i]=~/-date/i) {
+    	$releaseDate = $ARGV[$i+1];
+    }
+	elsif ($ARGV[$i]=~/-getSummaries/i or $ARGV[$i]=~/-get/i) {
+    	$getSummaries = $ARGV[$i+1];
+    }
+	elsif ($ARGV[$i]=~/-species/i or $ARGV[$i]=~/-s/i) {
+		$species = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-level/i or $ARGV[$i]=~/-le/i) {
+		$assemblyLevel = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-components/i or $ARGV[$i]=~/-c/i) {
+		$components = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-quantity/i or $ARGV[$i]=~/-q/i or $ARGV[$i]=~/-number/i or $ARGV[$i]=~/-n/i) {
+		$quantity = int($ARGV[$i+1]);
+	}
+	elsif ($ARGV[$i]=~/-ena/i) {
+		$sequenceID = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) {
+		$mainFolder = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-taxid/i) {
+		$assemblyTaxid = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-fastq/i) {
+		$sraID = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-assembly_or_project/i) {
+		$assemblyPrjID = $ARGV[$i+1];
+	}
+	elsif ($ARGV[$i]=~/-log/i) {
+		$log = 1;
+		# $log = File::Log->new({
+			# debug           => 4,                   
+			# logFileName     => 'myLogFile.log',    
+			# logFileMode     => '>',                 
+			# dateTimeStamp   => 1,                  
+			# stderrRedirect  => 1,                   
+			# defaultFile     => 0,                  
+			# logFileDateTime => 1,                  
+			# appName         => 'getSequenceInfo',
+			# PIDstamp        => 0,            
+			# storeExpText    => 1,                 
+			# msgprepend      => '',                 
+			# say             => 1,          
+		# }); 
+	}
+}
+
+#define folder separator and OS
+if ($^O =~ /MSWin32/) { 
+	$fldSep = "\\";
+	$actualOS	= "MSWin32";
+}
+
+#LOG file 
+if ($log) { open (LOG, "log.txt") or die " error open log.txt  $!:"; }
+
+
+print "Working ...\n"; 
+
+if ($kingdom eq "viruses") { $kingdom = "viral"; }
+
+if (grep(/^$kingdom$/i, @availableKingdoms)) {
+	
+	my @patternParametersList;
+	
+	my @levelList = split /,/, $assemblyLevel;
+	
+	if ($species ne "") {
+		my @speciesList = split(/,/, $species);
+		
+		foreach my $actualSpecies (@speciesList) {
+			get_assembly_summary_species( $quantity, $releaseDate, $directory,$kingdom, $actualSpecies,
+			\@levelList,  $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, 
+			$getSummaries, $components, $ftpServor);
+		}
+	}
+	elsif ($assemblyTaxid ne "") {
+		my @taxidList = split(/,/, $assemblyTaxid);
+		
+		foreach my $actualID (@taxidList) {
+			get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, 
+			\@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log, 
+			$getSummaries, $components, $ftpServor);
+		}
+	} 
+	else {
+		get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species,
+		\@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, 
+		$getSummaries, $components, $ftpServor);
+	}	
+}
+
+
+if ($sequenceID) { 
+	my @sequenceIDList = split /,/, $sequenceID;
+	
+	foreach my $enaID (@sequenceIDList) {
+		sequence_ena($enaID, $log);
+	}
+}
+
+if ($sraID) {
+	my @sraIDList = split /,/, $sraID;
+	
+	foreach my $sra (@sraIDList) {
+		download_ena_fastq($enaFtpServor, $sra, $log);
+	}	
+}
+
+if ($assemblyPrjID) {
+	download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log);
+}
+
+#my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now();
+
+#my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) =
+#      Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec,
+#                   $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
+
+#print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n";
+print "Thank you for using $0!\n";
+#print "Execution time: $D_y years, $D_m months, $D_d days, $Dh:$Dm:$Ds (hours:minutes:seconds)\n";
+my $end = time();
+
+my $total = $end - $start;
+my $min = $total / 60;
+my $hrs = $min / 60;
+
+print "Total time: $total seconds OR $min minutes OR $hrs hours ! \n";
+
+### subroutine 
+# display global help document
+sub help_user_simple {
+	my $programme = shift @_;
+	print STDERR  "Usage : perl $programme [options] \n";
+	print "Type perl $programme -version or perl $programme -v to get the current version\n";
+	print "Type perl $programme -help or perl $programme -h to get full help\n";
+}
+#------------------------------------------------------------------------------
+# display full help document
+sub help_user_advance {
+	print <<HEREDOC;
+	
+	Name: 
+		$0
+	
+	Synopsis:
+		A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories.
+		
+	Usage:
+	  perl $0 [options]
+	  examples: 
+	     perl $0 -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01 
+	     perl $0 -k viruses -n 5 -date 2019-06-01
+	     perl $0 -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results
+	     perl $0 -ena BN000065
+	     perl $0 -fastq ERR818002
+	     perl $0 -fastq ERR818002,ERR818004
+						 	
+	Kingdoms:
+		archaea
+		bacteria
+		fungi
+		invertebrate
+		plant
+		protozoa
+		vertebrate_mammalian
+		vertebrate_other
+		viral
+	
+	Assembly levels:
+		"Complete Genome"
+		Chromosome
+		Scaffold
+		Contig 
+	
+	General:
+		-help or -h			displays this help 	
+		-version or -v			displays the current version of the program
+		
+	Options ([XXX] represents the expected value):
+		-directory or -dir [XXX]	allows to indicate the NCBI's nucleotide sequences repository (default: $directory)
+		-get or -getSummaries [XXX]	allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...)	
+		-k or -kingdom [XXX]		allows to indicate kingdom of the organism (see the examples above)
+		-s or -species [XXX]		allows to indicate the species (must be combined with -k option)
+		-taxid [XXX]			allows to indicate a specific taxid (must be combined with -k option)
+		-assembly_or_project [XXX]	allows to indicate a specific assembly accession or bioproject (must be combined with -k option)
+		-date [XXX]			indicates the release date (with format yyyy-mm-dd) from which sequence information are available
+		-l or -level [XXX]		allows to select a specific assembly level (e.g. "Complete Genome")
+		-o or -output [XXX]		allows users to name the output result folder
+		-n or -number [XXX]		allows to limit the total number of assemblies to be downloaded
+		-c or -components [XXX]		allows to select specific components of the assembly (e.g. plasmid, chromosome, ...)
+		-ena [XXX] 			allows to download report and fasta file given a ENA sequence ID 
+		-fastq [XXX]			allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html)
+		-log				allows to create a log file
+HEREDOC
+}
+#------------------------------------------------------------------------------
+# display program version 
+sub program_version {
+	my $programme = shift @_;
+	print "\n $programme, version : $version\n";
+	print "\n A perl script to get sequences informations\n";
+}
+#------------------------------------------------------------------------------
+sub get_assembly_summary_species {
+	my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep, 
+	$actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_;
+	
+	# assembly_summary.txt file from NCBI FTP site
+	#my $assemblySummary = get_summaries($ftpServor, $kingdom, $getSummaries, $directory);
+	
+        my $assemblySummary = "";
+        if($path){
+            $assemblySummary = $path;
+        }
+        else{
+	    $assemblySummary = download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries);
+	}
+
+        #lineage folder
+	# my $lineage_file = "/pub/taxonomy/new_taxdump/new_taxdump.tar.gz";
+	
+	# allow to check old summary download
+	my $oldKingdom = ""; 
+	
+	# start of output file
+	if ($log) {
+		print LOG "...Downloading assembly_summary.txt...\n";
+	}
+	
+# if ($actualOS =~ /Unix/i) {	
+		#initialiaze tar manipulation
+		# my $tar = Archive::Tar->new;
+	
+		#download taxdump folder
+		# download_file($ftpServor, $lineage_file);
+		# $tar->read("new_taxdump.tar.gz");
+		# $tar->extract_file("rankedlineage.dmp");
+	# }
+	
+	
+	#my $kingdomRep = $kingdom."_".$start_year."_".$start_month."_".$start_day;
+	#my $kingdomRep = $kingdom."_folder";
+	my $kingdomRep = "folder";
+
+	if ($mainFolder) { $kingdomRep = $mainFolder; }
+	mkdir $kingdomRep unless -d $kingdomRep;
+	
+	# Repository for request
+
+	#my $repositoryAssembly = "assembly_repository_".$assemblyTaxid;
+	my $repositoryAssembly = "result";
+	mkdir $repositoryAssembly unless -d $repositoryAssembly;
+	
+	my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly;
+	if (-d $oldAssemblyRep) { rmtree($oldAssemblyRep) }
+	
+	#  Repository for fna file
+	my $repositoryFNA = "Assembly";
+	mkdir $repositoryFNA unless -e $repositoryFNA;
+	
+	# Repository for genbank file
+	my $repositoryGenbank = "GenBank";
+	mkdir $repositoryGenbank unless -e $repositoryGenbank;
+	
+	# Reposotiry for report file
+	my $repositoryReport = "Report";
+	mkdir $repositoryReport  unless -e $repositoryReport ;
+	
+	# Repositories for required components
+	my %componentsRepHash;
+	
+	if ($components) {
+		for my $component (split /,/, $components) {
+			my $specificRep = $component."_folder";
+			#my $specificRep = $component."_".$species."_".$kingdom."_".$start_year."_".$start_month."_".$start_day;
+			mkdir $specificRep unless -d $specificRep;
+			$componentsRepHash{$component} = $specificRep;
+		}
+	}
+	
+	if ($log) {
+		print LOG "...Create kingdom and components repository...\n";
+	}
+	
+	my %assemblyReportList;
+	my @assemblyRepresentationList = qw/Full Partial/;
+	my @levelList = @{$levelList};
+	my $checkCompleteGenome = grep(/complete genome/i, @levelList);
+	
+	if ($checkCompleteGenome > 0) {@assemblyRepresentationList = qw/Full/;}
+	
+	if (-e $assemblySummary) {
+		# Read file
+		open (SUM, $assemblySummary) or die " error open assembly_summary.txt  $!:";
+		while(<SUM>) {
+			chomp;
+			if ($_ !~  m/^#/) {
+				my @infoList = split /\t/, $_;
+				my $foundAssemRep = grep (/$infoList[13]/i, @assemblyRepresentationList);
+				my $checkLevel = grep(/$infoList[11]/i, @levelList);   #replace 11 by 10
+				
+				if ($foundAssemRep > 0 && $checkLevel > 0) {
+					my $indexInfo = 0;
+					my $searchPattern = "";
+					my $regex = "";
+					
+					if ($species !~ /^$/) {
+						$indexInfo = 7;
+						$searchPattern = $species;
+						$regex = qr/$searchPattern/i;
+					}
+					elsif ($assemblyTaxid !~ /^$/) { 
+						$indexInfo = 5; 
+						$searchPattern = $assemblyTaxid;
+						$regex = qr/^$searchPattern$/i;
+					}
+					
+					if (($infoList[$indexInfo] =~ $regex) or ($kingdom !~ /^$/ && $searchPattern =~ /^$/)) { 
+						my @gcfInfo = split(/\//, $infoList[19]);  
+						my $gcfName = pop(@gcfInfo);
+						my $realDate = $infoList[14];
+						$realDate =~ s/\//-/g;
+						
+						my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz";
+						my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz";
+						my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt";
+						
+						if ($realDate gt $releaseDate) {
+							
+							$dnaFile = obtain_file($ftpServor, $dnaFile);
+							$genbankFile = obtain_file($ftpServor, $genbankFile);
+							$assemblyReport = obtain_file($ftpServor, $assemblyReport);
+						
+							download_file($ftpServor, $dnaFile);
+							download_file($ftpServor, $genbankFile);
+							download_file($ftpServor, $assemblyReport);
+							
+							if ($log) {
+								print LOG "...download FASTA and GenBank report files...\n";
+							}
+						
+							# download sequences and check number of "N" characters
+							my $fileFasta = $gcfName."_genomic.fna.gz";
+							my $ucpFasta = $gcfName."_genomic.fna";
+							if (-e $fileFasta) {
+								gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n";
+								move($ucpFasta, $repositoryFNA) or die "move failed: $!";
+							}
+							
+							if ($log) {
+								print LOG "...Unzip FASTA file named $fileFasta ...\n";
+							}
+					
+							# download genome report
+							my $fileReport =  $gcfName."_assembly_report.txt";
+							if (-e $fileReport) {
+								my $fileInformations = $gcfName."_informations.xls";
+								move($fileReport, $repositoryReport) or die "move failed: $!";
+							}
+							
+							# download genbank files
+							my $fileGenbank = $gcfName."_genomic.gbff.gz";
+							my $ucpGenbank = $gcfName."_genomic.gbff";
+							if (-e $fileGenbank) {
+								gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n";
+								move($ucpGenbank, $repositoryGenbank) or die "move failed: $!";
+							}
+							
+							if ($log) {
+								print LOG "...Unzip GenBank file $fileGenbank ...\n";
+							}
+					
+							# association report and fasta
+							my $fileFastaGenbank = $ucpFasta . "," . $ucpGenbank;
+							$assemblyReportList{$fileReport} = $fileFastaGenbank;
+						
+							if ($quantity) { $quantity -= 1; }
+							
+						}
+					}
+				}
+			}
+			if (defined $quantity && $quantity == 0) {
+				$quantity = undef;
+				last;
+			}
+		}
+		close(SUM) or die "close file error : $!";
+		
+		if (!keys %assemblyReportList) {
+			print "##################################################################\n";
+			print "No results were found for the following query:\n";
+			print "perl $0 @ARGV\n";
+			print "##################################################################\n\n";
+			
+			if ($actualOS =~ /unix/i) { unlink glob "*.dmp *.gz"  or die "for file *.dmp *.gz $!:"; }
+			
+			if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; }
+			rmdir $repositoryAssembly or die "failed to remove directory $!:";
+			rmdir $repositoryFNA or die "failed to remove directory $!:";
+			rmdir $repositoryGenbank or die "failed to remove directory $!:"; 
+			rmdir $repositoryReport or die "failed to remove directory $!:"; 
+			
+			if ($components) {
+				for my  $componentRep (values %componentsRepHash) {
+					rmdir $componentRep or die "failed to remove directory $!:"; 
+				}
+			}
+			if ($log) {
+				print LOG "...No results were found for the following query: ";
+				print LOG "perl $0 @ARGV \n";
+			}
+			
+		} 
+		else {
+			
+			if ($log) {
+				print LOG "...Results were found for the following query: ";
+				print LOG "perl $0 @ARGV \n";
+			}
+			# write summary files 
+			my %componentsSumHash;
+			my @keysList = keys %assemblyReportList;
+			my $summary = "summary.xls";
+			my $htmlSummary = "summary.html";
+			
+			if ($components) {
+				for my $component (split /,/, $components) {
+					my $specificSummary =  $component. "_summary.xls"; 
+					$componentsSumHash{$component} = $specificSummary;
+				}
+			}
+			
+			my $fileReport = "." . $fldSep. $repositoryReport . $fldSep . $keysList[0];
+			my @header = ();
+			
+			open(FILE, $fileReport) or die "error open file : $!";
+			while(<FILE>) {
+				chomp;
+				if($_ =~ /:/){
+					$_ =~ s/^#*//;
+					my @ligne = split /:/, $_;
+					push(@header, $ligne[0]);
+				}
+			}
+			close(FILE) or die "error close file : $!";
+			
+			open(HEAD, ">", $summary) or die " error open file : $!";
+			foreach(@header) {
+				print HEAD $_ . "\t";
+			}
+			
+			print HEAD "Pubmed\tNucleScore\tClassification\t"; 
+			print HEAD "Country\tHost\tIsolation source\tA percent\t";
+			print HEAD "T percent\tG percent\tC percent\tN percent\tGC percent\t";
+			print HEAD "ATGC ratio\tLength\tShape\n";
+			close(HEAD) or die "error close file : $!";
+			
+			if ($components) {
+				foreach my $componentSummary (values %componentsSumHash) {
+					open(SUM, ">>", $componentSummary) or die "error open file : $!";
+					print SUM "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t";
+					print SUM "GC percent\tA percent\tT percent\tG percent\tC percent\n";	
+					close(SUM) or die "error close file : $!";
+				}
+			}
+		 
+			for my $file (@keysList) {
+				my $reportFile =  $repositoryReport . $fldSep . $file;
+				my @fastaGenbank = split /,/, $assemblyReportList{$file};
+				my $extFasta = $fastaGenbank[0];
+				my $extGenbank = $fastaGenbank[1];
+				my $fnaFile = $repositoryFNA . $fldSep . $extFasta;
+				my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank;
+				
+				write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
+				\%componentsSumHash, $kingdom,  $actualOS, \@header, $log);
+				
+				if ($log) {
+					print LOG "...Call write_assembly subroutine...\n";
+				}
+			}
+			
+			write_html_summary($summary);
+			
+			if ($log) {
+				print LOG "...Call write_html_summary subroutine...\n";
+			}
+			
+			if ($components) {
+				my @componentList = keys %componentsSumHash;
+				my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList);
+			
+				if (keys %componentFastaHash && $log) { $log->msg(1,"call create_component_sequence_file subroutine");}
+			
+				foreach my $component (keys %componentFastaHash) { 
+					move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; 
+				}
+			} 
+			
+			move($summary, $repositoryAssembly) or die "move failed: $!";
+			move($htmlSummary, $repositoryAssembly) or die "move failed: $!";
+			move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!";
+			move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!";
+			move($repositoryReport  , $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!";
+			if ($log) {
+				print LOG "...move fasta, genbank and report to  query folder \n";
+			}
+			
+			if ($components) {
+				for my $component (keys %componentsSumHash) {
+					move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
+					move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!"
+				}
+				if ($log) {
+					print LOG "...move component files to folders\n";
+				}
+			}
+			
+			move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!";
+
+			if ($log) {
+				print LOG "...move query folder to main folder\n";
+			}
+			
+			# if ($actualOS =~ /unix/i) { unlink glob "*.dmp"  or die "for file *.dmp $!:"; }
+			unlink glob "*.gz sequence.txt"  or die "$!: for file *.gz sequence.txt";
+		}
+	} 
+}
+#write general assembly file
+sub write_assembly {
+	my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
+	$componentsSumHashRef, $kingdom,  $actualOS, $headerRef, $log) = @_;
+	
+	my %componentsSumHash = %{$componentsSumHashRef};
+	my @header = @{$headerRef};
+	my %hashInformations = ();
+	my $seq = "";
+	my $genomeName = "";
+	my $country = "na";
+	my $GCpercent = -1;
+	my $taxId = "na";
+	my $assemblyLine;
+	my $pubmedId = "na";
+	my $host = "na";
+	my $isoSource = "na";
+	# my $species = "na";
+	# my $genus = "na";
+	# my $family = "na";
+	# my $order = "na";
+	# my $class = "na";
+	# my $phylum = "na";
+	my $shape = "na";
+	my $geneNumber = "na";
+	
+	open(REP, "<", $reportFile) or die "error open file $!:";
+	while (<REP>) {
+		chomp;
+		$_ =~ s/^#*//;
+		if ($_  =~ /assembled-molecule/i) { $assemblyLine = $_; }
+		if ($_ =~ /:/) {
+			my @line = split /:/, $_;
+			if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); }
+		}
+	}
+	close(REP) or die "error close file $!:";
+	
+	
+	open(SUM, ">>", $summary) or die "error open file $!:";
+	foreach my $k(@header) {
+		if (grep $_ eq $k, keys %hashInformations) {
+			
+			my $information = $hashInformations{$k};
+			
+			if ($k =~ /Assembly name/i) { $genomeName = $information; }
+			
+			if ($information =~ /^\s*$/) {
+				print SUM "na\t";
+			} else { 
+				print SUM $information . "\t";
+			}
+		} else {
+			print SUM "na\t";
+		}
+	}
+	close(SUM) or die "error close file $!:";
+	
+	open(FNA, "<", $fnaFile) or die "Could not open $!:";
+	while (<FNA>) {
+		chomp;
+		if ($_ !~ /^>/) { $seq .= $_; }
+	}
+	close(FNA)  or die "error close file :$!";
+	
+	foreach my $summaryKey (keys %hashInformations) {
+		if ($summaryKey =~ /taxid/i) {
+			$taxId = $hashInformations{$summaryKey};
+		}
+	}
+
+	my $classification = get_taxonomic_rank_genbank($genbankFile);
+	
+	if ($log) {
+		print LOG "...get taxonomic rank from genbank file\n";
+	}
+	
+	$GCpercent = gc_percent($seq);
+	my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile);
+	my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
+	my $atgcRatio =	 atgc_ratio($ade, $thy, $gua, $cyt);
+	
+	my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);
+	
+	my $variance = shift_data_variance(@percentList);
+	my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length);
+	
+	if ($log) {
+		print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n";
+	}
+	
+	open(GBFF, "<", $genbankFile) or die "Error open file $!:";
+	while(<GBFF>) {
+		chomp;
+		if ($_ =~ /\/country="(.*)"/) { $country = trim($1); }
+		if ($_ =~ /PUBMED(.*)/) {  $pubmedId = trim($1); }
+		if ($_ =~ /\/host="(.*)"/) {  $host = trim($1); }		
+		if ($_ =~ /\/isolation_source="(.*)"/) {  $isoSource = trim($1); }
+		if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); }
+		if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); }
+	}
+	close(GBFF) or die "error close file $!:";
+
+	
+	open(SUM, ">>", $summary) or die "error open file $!:";
+	print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ; 
+	print SUM $country . "\t" . $host . "\t" . $isoSource  . "\t" . $aPercent . "\t" ; 
+	print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent  ."\t" . $nPercent . "\t";
+	print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n"; 
+	close(SUM) or die "error close file: $!";
+	
+	if (%componentsSumHash) {
+		write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log);
+	}
+}
+#------------------------------------------------------------------------------
+# get assembly component
+sub write_assembly_component {
+	my($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_;
+	
+	my %componentsSumHash = %{$componentsSumHashRef};
+	my $status = "na";
+	my $level = "na";
+	my $gcpercent;
+	my $tmp_fasta_file = "sequence.txt";
+	my @desc = ();
+	
+	# check each sequence from (multi-)fasta file
+	my ($seq, $inputfile);
+
+	if ($log) {
+		print LOG "...search specific components\n";
+	}
+
+	# extract sequence details
+	my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$fnaFile);
+	
+	while ($seq = $seqIO->next_seq()) {
+		my $seqID = $seq->id; # ID of sequence
+		my $seqDesc = $seq->desc; # Description of sequence
+		my $globalSeq = $seq->seq;
+		my $seqLength = $seq->length();
+		
+		open(TSEQ, ">", $tmp_fasta_file) or die "Error open file: $!";
+		print TSEQ $globalSeq;
+		close(TSEQ);
+		
+		my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file);
+		
+		my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
+		
+		$gcpercent = gc_percent($globalSeq);
+		
+		@desc = split /,/, $seqDesc;
+		
+		if ($desc[1]) { $level = $desc[1]; }
+		
+		foreach my $component (keys %componentsSumHash) {
+			if ($desc[0] =~ /$component/) {
+				$status = $component;
+				my $info = $seqID . "\t" . $genomeName ."\t" . $seqDesc . "\t" . $seqLength . "\t" . $status . "\t" . $level ."\t";
+				$info.= $gcpercent."\t". $aPercent ."\t". $tPercent ."\t". $gPercent ."\t". $cPercent . "\n";
+				add_to_file($componentsSumHash{$component}, $info);
+				if ($log) {
+					print LOG "...found component $component \n";
+				}
+			}
+		}
+	}
+}
+#------------------------------------------------------------------------------
+# download fasta sequence and report on ENA with assembly ID
+sub get_fasta_and_report_sequence_ena_assembly {
+	my($sequenceID) = @_;
+	my $tmp_file = "fichier.txt";
+	my @id_list = ();
+	my $id_chain = "";
+	my $fasta_file = "";
+	my $report_file = "";
+	my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml";
+	my $output = get($url);
+	
+	open(TMP, ">", $tmp_file) or die("could not open $!");
+	print TMP $output;
+	close(TMP) or die("could not close $!");
+	
+	open(TMP, "<", $tmp_file) or die("could not open $!");
+	while(<TMP>){
+		if($_ =~ /<CHROMOSOME accession="(.*)">/){
+		push(@id_list, $1)
+		}
+	}
+	close(TMP) or die("could not close $!");
+	
+	$id_chain = join(",", @id_list);
+	$url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta";
+	$output = get($url);
+	$fasta_file = $sequenceID . ".fasta";
+	open(FILE, ">", $fasta_file) or die("could not open $!");
+	print FILE $output;
+	close(FILE) or die("could not close $!");
+	
+	
+	$report_file = $sequenceID . "_report.txt";
+	for my $id (@id_list) {
+		$url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true";
+		$output = get($url);
+		open(FILE, ">>", $report_file) or die("could not open $!");
+		print FILE $output;
+		print FILE "##########################################################################\n\n";
+		close(FILE) or die("could not close $!");
+	}
+	
+	unlink "fichier.txt" or die "error delete file :$!";
+	
+	return ($fasta_file, $report_file);
+}
+#------------------------------------------------------------------------------
+# download ENA
+sub sequence_ena {
+	my($sequenceID, $log) = @_;
+	#my $assemblyRep = $sequenceID . "_folder";
+	my $fastaFile;
+	my $reportFile;
+
+	#if(-d $assemblyRep) { rmtree($assemblyRep); }
+	#mkdir $assemblyRep;
+	
+	if ($log) {
+		print LOG "...ENA sequence downloading ...\n";
+	}
+	#if ($log) {$log->msg(1, "Creation of repository: $assemblyRep\n");}
+	
+	if($sequenceID =~ /^GCA_.*/){
+		($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_assembly($sequenceID);
+	}
+	else {
+		($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_other($sequenceID);
+	}
+	
+	if ($log) {
+		print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n";
+	}
+	
+	#move($fastaFile, $assemblyRep) or die "move failed: $!";
+	#move($reportFile, $assemblyRep) or die "move failed: $!";
+	
+	if ($log) {
+		print LOG "...Move fasta and report files to folder\n";
+	}
+} 
+#------------------------------------------------------------------------------
+# download fasta sequence and report on ENA with ENA ID 
+sub get_fasta_and_report_sequence_ena_other {
+	my($sequenceID) = @_;
+	my $fasta_file = "";
+	my $report_file = "";
+	my $url;
+	my $output;
+	
+	$url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta";
+	#if($actualOS eq "MSWin32"){
+		$output = get($url);
+		$fasta_file = $sequenceID . ".fasta";
+		open(FILE, ">", $fasta_file) or die("could not open $!");
+		print FILE $output;
+		close(FILE) or die "could not close $!";
+	#}
+	#else{
+	#	system("wget $url");
+	#}
+	
+	$output = "";
+	
+	$url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true";
+	#if($actualOS eq "MSWin32"){
+		$output = get($url);  # replace by wget????
+		$report_file = $sequenceID . "_report.txt";
+		open(FILE, ">>", $report_file) or die "could not open: $!";
+		print FILE $output;
+		close(FILE) or die "could not close $!";
+	#}
+	#else{
+	#	system("wget $url");
+	#}
+	
+	return ($fasta_file, $report_file);
+}
+#------------------------------------------------------------------------------
+# add information to file
+sub add_to_file {
+	my ($file, $info) = @_;
+	open(FILE, ">>",  $file) or die ("Could not open $!");
+	print FILE $info;
+	close(FILE);
+}
+#------------------------------------------------------------------------------
+#   return taxonomic rank of species by tax id
+sub get_taxonomic_rank {
+	my($tax_id, $taxonomic_file) = @_;
+	my $species = "na";
+	my $genus = "na";
+	my $family = "na";
+	my $order = "na";
+	my $class = "na";
+	my $phylum = "na"; 
+	
+	# my ($species,$genus,$family,$order,$class,$phylum);
+	my @tmp_array = ($species, $genus, $family, $order, $class, $phylum);
+	
+	open(TFILE, "<", $taxonomic_file) or 
+		die("Could not open $taxonomic_file: $!");
+		
+	while(<TFILE>) {
+		chomp;
+		my @tax_info = split(/\|/, $_);
+		
+		if ($tax_info[0] == $tax_id) {
+			@tax_info  = trim_array(@tax_info);
+		
+			$tmp_array[0] = $tax_info[1];
+			splice(@tax_info, 0, 3);
+			
+			for(my $i = 1; $i < $#tmp_array + 1; $i++) {
+				if (length($tax_info[$i-1]) > 0) { $tmp_array[$i] = $tax_info[$i-1]; }
+			}
+			close(TFILE) or die "error close $taxonomic_file  $!:";
+			return @tmp_array;
+		}
+	}
+	close(TFILE) or die "error close $taxonomic_file  $!:";
+}
+#------------------------------------------------------------------------------
+#   write html summary file
+sub write_html_summary {
+	my($summary) = @_;
+	my $htmlFile = "summary.html";
+	my $header = "";
+	my @fileToRead = ();
+	
+	open(HTML, ">", $htmlFile) or die "error open HTML summary $!";
+	print HTML "<!DOCTYPE html>\n";
+	print HTML "<html>\n";
+	print HTML " <head>\n";
+	print HTML "  <title>Assembly summary</title>\n";
+	print HTML " </head>\n";
+	print HTML " <body>\n";
+	print HTML "  <h2>Assembly Summary</h2>\n";
+	close(HTML) or die "error close HTML summary $!";
+	
+	open(SUM, "<", $summary) or die "error open tsv summary $!";
+	@fileToRead = <SUM>;
+	close(SUM) or die "error close tsv summary $!";
+	
+	$header = splice(@fileToRead, 0, 1);
+	
+	for my $line (@fileToRead) {
+		write_html_table($line, $htmlFile, $header);
+	}
+	
+	open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
+	print HTML " </body>\n";
+	print HTML "</html>\n";
+	close(HTML) or die "error close HTML summary $!";
+}
+#------------------------------------------------------------------------------
+#   write html table for summary
+sub write_html_table {
+	my ($line, $htmlFile, $header) = @_;
+	
+	open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
+	print HTML "  <table  border=\"1\" style=\"margin-bottom: 20px;\">\n";
+	close(HTML) or die "error close HTML summary $!";
+	add_table_content($line, $htmlFile, $header);
+}
+#------------------------------------------------------------------------------
+#   add information to table
+sub add_table_content {
+	my ($line, $htmlFile, $headers) = @_;
+	
+	my @assemblyHeader = split(/\t/, $headers);
+	my @assemblyInfo = split(/\t/, $line);
+	my %hashHeaderInfo;
+	my $nbOfCell = 7;
+	my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell);
+	my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell;
+	
+	
+	for (my $i = 0; $i < $#assemblyHeader + 1; $i++) {
+		$hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i];
+	}
+	
+	my @keysHeaderInfo = keys %hashHeaderInfo;
+	my $cellIndex = 0;
+	
+	open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
+	for (my $turn = 0; $turn < $fullLine; $turn++) {
+	
+		print HTML "   <tr>\n";
+		for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
+			print  HTML "    <th>$header</th>\n";
+		}
+		print HTML "   </tr>\n";
+		
+		print HTML "   <tr>\n";
+		for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
+			if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") {
+				print HTML "   <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=".
+				"$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>";
+			}
+			else {
+				print  HTML "    <td>$hashHeaderInfo{trim($header)}</td>\n";
+			}
+		}
+		print HTML "   </tr>\n";
+		
+		$cellIndex +=  $nbOfCell;
+	}
+	
+	print HTML "   <tr>\n";
+	for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
+		print  HTML "    <th>$header</th>\n";
+	}
+	print HTML "   <tr>\n";
+	
+	print HTML "   <tr>\n";
+	for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
+		print  HTML "    <td>$hashHeaderInfo{trim($header)}</td>\n";		
+	}
+	print HTML "   <tr>\n";
+	
+	print HTML "  </table>\n";
+	close(HTML) or die "error close HTML summary $!";
+}
+#------------------------------------------------------------------------------
+#getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file)
+sub get_taxonomic_rank_genbank {
+	my ($genbank) = @_;
+
+	my $seqio_object = Bio::SeqIO->new(-file => $genbank);
+	my $seq_object = $seqio_object->next_seq;
+
+	# legible and long
+	my $species_object = $seq_object->species;
+	my $species_string = $species_object->node_name;
+
+	# get all taxa from the ORGANISM section in an array
+	my @classification = $seq_object->species->classification;
+	# my $arraySize = @classification;
+	
+	# print "@classification\n";
+
+	# if($arraySize == 7){
+		# ($species,$genus,$family,$order,$class,$phylum,$kingdomGB) = @classification;
+	# }
+	# elsif($arraySize == 4){
+		# ($species,$class,$phylum,$kingdomGB) = @classification;
+	# }
+	
+	my $classification = join(",", @classification);
+  	
+	return ($classification); 
+}
+#------------------------------------------------------------------------------
+#add all sequences components to file
+sub create_component_sequence_file {
+	my ($fldSep, $repository, $listComponentRef) = @_;
+	
+	my @listFnaFile;
+	my @listComponent = @{$listComponentRef};
+	
+	opendir(my $dh, $repository) || die "Can't opendir $repository: $!";
+	@listFnaFile = grep{/fna$/} readdir($dh);
+	closedir $dh;
+	
+	my %componentFastaHash;
+
+	foreach my $component (@listComponent) {
+	
+		my $componentFasta = $component.".fasta";
+		
+		foreach my $fnaFile (@listFnaFile) {
+		
+			# my $actualFile = $repository . $fldSep . $fnaFile;
+			
+			my $seq;
+			my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$repository . $fldSep . $fnaFile);
+			
+			while ($seq = $seqIO->next_seq()) {
+			
+				my $seqDesc = $seq->desc;
+				
+				if ($seqDesc =~ /$component/) {
+					my $seqID = $seq->id;
+					my $seqNuc = $seq->seq;
+					my $shift = 60;
+					my @seqArray = split //, $seqNuc;
+					my $newSeqNuc = "";
+					
+					if (length $seqNuc <= $shift) {
+						$newSeqNuc = $seqNuc;
+					}
+					else {
+						for(my $i = 0; $i < $#seqArray + 1; $i ++) {
+							$newSeqNuc .=  $seqArray[$i];
+							if (($i + 1) % $shift == 0) { $newSeqNuc .= ","; }
+						}
+					}
+					
+					open(FASTA, ">>", $componentFasta) or die "error open file $!:";
+					print FASTA ">$seqID $seqDesc\n";
+					foreach my $subSeqNuc (split /,/, $newSeqNuc) {
+						print FASTA "$subSeqNuc\n";
+					}
+					close(FASTA) or die "error close file $!:";
+				}
+			}
+		}
+		if (-e $componentFasta) { $componentFastaHash{$component} = $componentFasta; }
+	}
+	return %componentFastaHash;
+}
+# remove back and front spaces
+sub trim {
+my ($string) = @_;
+$string =~ s/^\s+//;
+$string =~ s/\s+$//;
+return $string;
+}
+#------------------------------------------------------------------------------
+# use trim in array
+sub trim_array {
+	my (@array) = @_;
+	foreach my $value (@array) {
+		$value = trim($value);
+	}
+	return @array;
+}
+#------------------------------------------------------------------------------
+# check if folder is empty
+sub empty_folder {
+	my $dirname = shift;
+    opendir(my $dholder, $dirname) or die "error not a directory";
+	my $isEmpty = scalar(grep { $_ ne "." && $_ ne ".." } readdir($dholder));
+	if ($isEmpty == 0) { return $isEmpty; }
+}
+#------------------------------------------------------------------------------
+# number nucleotid and length
+sub number_nuc_length_seq {
+	my ($fastaFile) = @_;
+	my $ade = 0;
+	my $thy = 0;
+	my $gua = 0;
+	my $cyt = 0;
+	my $n = 0;
+	my $length = 0;
+	
+	open (FASTA, "<", $fastaFile) or die "Could not open $!";
+	while (<FASTA>) {
+		chomp;
+		if ($_ !~ />/) {
+			my @seq = split //, $_;
+			
+			for my $nuc (@seq) {
+				$length +=1 ;
+				if ($nuc =~ /a/i) {$ade+=1;}
+				elsif ($nuc =~ /t/i) {$thy+=1;}
+				elsif ($nuc =~ /g/i) {$gua+=1;}
+				elsif ($nuc =~ /c/i) {$cyt+=1;}
+				elsif ($nuc =~ /n/i) {$n+=1;}
+			}
+		}
+	}
+	close(FASTA) or die "Error close file :$!";
+	return ($ade, $thy, $gua, $cyt, $n, $length);
+	
+}
+#------------------------------------------------------------------------------
+# compute percentage of nucleotid
+sub nucleotid_percent {
+	my($ade, $thy, $gua, $cyt, $n, $length) = @_;
+	
+	my $adePercent = $ade / $length * 100;
+	my $thyPercent = $thy / $length * 100;
+	my $guaPercent = $gua / $length * 100;
+	my $cytPercent = $cyt / $length * 100;
+	my $nPercent = $n / $length * 100;
+	
+	return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
+ 
+}
+#------------------------------------------------------------------------------
+# compute ATGC ratio 
+sub atgc_ratio {
+	my ($ade, $thy, $gua, $cyt) = @_;
+	return (($ade + $thy) / ($gua + $cyt));
+}
+#------------------------------------------------------------------------------
+# variance
+sub shift_data_variance {
+	my (@data) = @_;
+	
+	if ($#data + 1 < 2) { return 0.0; }
+
+	my $K = $data[0];
+	my ($n, $Ex, $Ex2) = 0.0;
+	
+	for my $x (@data) {
+		$n = $n + 1;
+		$Ex += $x - $K;
+		$Ex2 += ($x - $K) * ($x - $K);
+	}
+	
+	my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1)
+	
+	return $variance;
+
+}
+#------------------------------------------------------------------------------
+# nucle score
+sub nucle_score {
+	my ($variance, $gcPercent, $atgcRatio, $length) = @_;
+	#return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length));
+	return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length));
+}
+#------------------------------------------------------------------------------
+sub log2 {
+  my $n = shift;
+  return (log($n) / log(2));
+}
+#------------------------------------------------------------------------------
+# compute GC pourcent
+sub gc_percent {
+	my ($seq) = @_;
+	
+	my @charSeq = split(//, uc($seq));
+	my %hashFlank = ();
+
+	foreach my $v (@charSeq) {
+		$hashFlank{$v} += 1;  
+	}
+  
+	if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;}
+	if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;}
+
+	if(length($seq) == 0) {
+		return 0;
+	}
+	else {
+		return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100;
+	}
+	
+}
+#------------------------------------------------------------------------------
+# download file from ftp protocol
+sub download_file {
+	my($servor, $file) = @_;
+	
+	#if($actualOS eq "MSWin32"){
+		my $ftp = Net::FTP->new($servor, Debug => 0)
+		or die "Cannot connect to $servor";
+		
+		$ftp->login("anonymous", "-anonymous@")
+			or die "Cannot login ", $ftp->message;
+		$ftp->binary;	
+		$ftp->get($file) or die "get failed ", $ftp->message;
+		
+		$ftp->quit;
+	#}
+	#else{
+	#	system("wget $file");
+	#}
+}
+#------------------------------------------------------------------------------
+# obtain file directory
+sub obtain_file {
+	my($servor, $link) = @_;
+	if ($link =~ /$servor(.*)/) { return ($1); }
+}
+#------------------------------------------------------------------------------
+# download fastq file from ENA
+sub download_ena_fastq {
+	my ($enaFtpServor, $sraId, $log) = @_;
+	
+	my $fastqDir = "/vol1/fastq/";
+	my $dir1 = substr $sraId, 0, 6;
+	my $dir2 = "000";
+	my $digits = substr $sraId, 3;
+	my $fastqRep =  $sraId . "_folder";
+	
+	if ($log) {
+		print LOG "...Downloading fastq file from ENA\n";
+	}
+
+	if (length $digits == 6) {
+		$dir2 = $sraId;
+		$fastqDir .= $dir1 . "/" . $dir2 . "/";
+	}
+	elsif (length $digits  > 6) {
+		my $digitsNumber = 0;
+		my @digitsList = split //, (substr $digits, 6);
+	
+		foreach my $char (@digitsList) {
+			if (length $dir2 >= 1) {
+				$dir2 = substr $dir2, 0, (length $dir2) - 1;
+				$digitsNumber += 1;
+			}
+		}
+		$dir2 .= substr $digits, -$digitsNumber;
+		$fastqDir .= $dir1 . "/" . $dir2 . "/" . $sraId . "/";
+	}
+
+	if ($log) {
+		print LOG "...recreate database folder path for FASTQ downloading\n";
+	}
+	
+	my $ftp = Net::FTP->new($enaFtpServor, Debug => 0)
+	or die "Cannot connect to $enaFtpServor";
+	
+	$ftp->login("anonymous", "-anonymous@")
+		or die "Cannot login ", $ftp->message;
+	$ftp->binary;
+	
+	$ftp->cwd($fastqDir) 
+		or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message;
+	
+	my @fastqFiles = $ftp->ls("$sraId*"); 
+	
+	if ($log) {
+		print LOG "...Searching fastq files in path\n";
+	}
+	
+	if (!grep(/^$/, @fastqFiles)) {
+		
+		if (-d $fastqRep) { rmtree($fastqRep) }
+		mkdir $fastqRep;
+		
+		foreach my $fastqFile (@fastqFiles) {
+			#if($actualOS eq "MSWin32"){
+				$ftp->get($fastqFile) or die "get failed ", $ftp->message;
+			#}
+			#else{
+			#	system("wget $fastqFile");
+			#}
+			
+			#my @baseAndExt = split /\./, $fastqFile;
+			#my $unzipFastq = $baseAndExt[0] . ".fastq";
+		
+			#gunzip $fastqFile => $unzipFastq or die "gunzip failed: $GunzipError\n";
+			move($fastqFile, $fastqRep) or die "move failed: $!"; # DC replaced $unzipFastq by $fastqFile
+		} 
+		#unlink glob "*fastq.gz"  or die "$!: for file *fastq.gz";
+		
+		if ($log) {
+			print LOG "...Finalizing download of FASTQ file\n";
+		}
+	}
+
+	if ($log) {
+		print LOG "End of download\n";
+	}
+	
+	$ftp->quit;
+}
+#------------------------------------------------------------------------------
+# download fastq file from ENA
+sub get_assembly_or_project {
+	my ($file, $sequence, $ftpServor, $fldSep, $log) = @_;
+	
+	my $pattern;
+	my $indexInfo;
+	my %folderHash;
+	
+	#  Repository for fna file
+	my $repositoryFNA = "Assembly";
+	
+	# Repository for genbank file
+	my $repositoryGenbank = "GenBank";
+	
+	# Reposotiry for report file
+	my $repositoryReport = "Report";
+	
+	# global repository
+	my $repositorySequence = $sequence;
+
+	
+	if ($sequence =~ /^GC[AF]_(.*)/) {
+		$indexInfo = 0;
+		$pattern = $1;
+	}
+	elsif ($sequence =~ /^PRJ/) {
+		$indexInfo = 1;
+		$pattern = $sequence;
+	}
+	
+	open(SUM, $file) or die "error open file $!:";
+	while(<SUM>) {
+		chomp;
+		if ($_ !~ /^#/) {
+			my @infoList = split /\t/, $_;
+			if ($infoList[$indexInfo] =~ /$pattern/) {
+				my @gcfInfo = split(/\//, $infoList[19]);  
+				my $gcfName = pop(@gcfInfo);
+				
+				
+				my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz";
+				my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz";
+				my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt";
+				
+				$dnaFile = obtain_file($ftpServor, $dnaFile);
+				$genbankFile = obtain_file($ftpServor, $genbankFile);
+				$assemblyReport = obtain_file($ftpServor, $assemblyReport);
+				
+				download_file($ftpServor, $dnaFile);
+				download_file($ftpServor, $genbankFile);
+				download_file($ftpServor, $assemblyReport);
+				
+				# download sequences and check number of "N" characters
+				my $fileFasta = $gcfName."_genomic.fna.gz";
+				my $ucpFasta = $gcfName."_genomic.fna";
+				if (-e $fileFasta) {
+					gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n";
+					$folderHash{$ucpFasta} = $repositoryFNA;
+				}
+				
+				# download genome report
+				my $fileReport =  $gcfName."_assembly_report.txt";
+				if (-e $fileReport) {
+					$folderHash{$fileReport} = $repositoryReport;
+				}
+				
+				# download genbank files
+				my $fileGenbank = $gcfName."_genomic.gbff.gz";
+				my $ucpGenbank = $gcfName."_genomic.gbff";
+				if (-e $fileGenbank) {
+					gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n";
+					$folderHash{$ucpGenbank} = $repositoryGenbank;
+				}
+		
+			}
+		}
+	}
+	close(SUM) or die "error close file $!";	
+	
+	if (keys %folderHash) {
+		if (-e $repositorySequence) {rmtree($repositorySequence);}
+		
+		if ($log) {
+			print LOG "...Download files from GenBank or RefSeq \n";
+		}
+		
+		mkdir $repositorySequence;
+		mkdir $repositoryFNA;
+		mkdir $repositoryGenbank;
+		mkdir $repositoryReport;
+		
+		for my $ucpFile (keys %folderHash) {
+			move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:";
+		}
+		move($repositoryFNA, $repositorySequence . $fldSep. $repositoryFNA) or die "error move file $!:";
+		move($repositoryGenbank, $repositorySequence . $fldSep. $repositoryGenbank) or die "error move file $!:";
+		move($repositoryReport, $repositorySequence . $fldSep. $repositoryReport) or die "error move file $!:";
+		unlink glob "*.gz"  or die "for file *.gz $!:";
+		
+		if ($log) {
+			print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n";
+		}
+	} 
+	
+}
+#------------------------------------------------------------------------------
+sub download_assembly_or_project {
+	my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_;
+	
+	my $assemblySummary;
+	my @sequenceIdList = split /,/, $sequenceId;
+	
+	if ($directory =~ /refseq/) {
+		$assemblySummary = "assembly_summary_refseq.txt";
+	} elsif ($directory =~ /genbank/) {
+		$assemblySummary = "assembly_summary_genbank.txt";
+	}
+	
+	my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary; 
+	download_file($ftpServor, $assemblySummaryPath);
+	
+	foreach my $sequence (@sequenceIdList) {
+		get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log);
+	}
+	
+	unlink $assemblySummary or die "error remove file $!:"; 
+}
+#------------------------------------------------------------------------------
+# check if all required module are install
+sub isModuleInstalled {
+  my $mod = shift;
+
+  #eval("use $mod");
+  my $commandModule = `perldoc -l $mod`;
+  
+  if ($commandModule) {
+    return(1);
+  } else {
+    return(0);
+  }
+}
+#------------------------------------------------------------------------------
+# download assembly summary
+sub download_summaries {
+	my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_;
+	
+	my $assemblySummary = "assembly_summary.txt";
+	my $assemblySummaryLink;
+	my $fileName;
+	
+	opendir my $workingDirectory, "." . $fldSep or die "error open dir $!:";
+	my @filesList = readdir $workingDirectory; 
+	closedir $workingDirectory;
+	
+	if ($getSummaries) {
+		foreach my $summaryKingdom (split /,/, $getSummaries) {
+			foreach my $file (@filesList) {
+				if ($file =~ /assembly_summary.txt/i && $file =~ /$summaryKingdom/i && $file =~ /$database/) {
+					unlink $file or die "error remove file $!:";
+					$assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt";
+					download_file($ftpServor, $assemblySummaryLink);
+					$fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt";
+					rename $assemblySummary, $fileName;
+					print "replace assembly_summary file\n";
+				}
+			}
+		}
+	}
+	
+	foreach my $file (@filesList) {
+		if ($file =~ /assembly_summary.txt/i && $file =~ /$kingdom/i && $file =~ /$database/) {
+			return $file;
+		}
+	}
+	
+	$assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt";
+	$fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt";
+	download_file($ftpServor, $assemblySummaryLink);
+	rename $assemblySummary, $fileName;
+	
+	return $fileName;
+}
+#---------------------
+sub bioseqio {
+	
+	my ($keyword, $file) = @_;
+	local $/ = "\n>";  # read by FASTA record
+
+	my $count = 0;
+
+	open FASTA, $file;
+	while (<FASTA>) {
+		chomp;
+		my $seq = $_;
+		#my ($id) = $seq =~ /^>*(\S+)/;  # parse ID as first word in FASTA header
+		if ($seq =~ /^>*.*$keyword/) {
+			#$seq =~ s/^>*.+\n//;  # remove FASTA header
+			#$seq =~ s/\n//g;  # remove endlines
+			$count++;
+			print "\nThe sequence number is: $count \n";
+			print ">$seq";
+		}
+	}
+	close FASTA;
+}
author	dcouvin
date	Wed, 15 Sep 2021 21:35:09 +0000
parents
children