Mercurial > repos > dcouvin > getsequenceinfo
comparison getSequenceInfo.pl @ 0:19ae17458c14 draft default tip
Uploaded
| author | dcouvin |
|---|---|
| date | Wed, 15 Sep 2021 21:35:09 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:19ae17458c14 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 ################################################################################ | |
| 4 ## "Copyright 2019 Vincent Moco and David Couvin" | |
| 5 ## licence GPL-3.0-or-later | |
| 6 ## This program is free software: you can redistribute it and/or modify | |
| 7 ## it under the terms of the GNU General Public License as published by | |
| 8 ## the Free Software Foundation, either version 3 of the License, or | |
| 9 ## (at your option) any later version. | |
| 10 ## This program is distributed in the hope that it will be useful, | |
| 11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 13 ## GNU General Public License for more details. | |
| 14 ## You should have received a copy of the GNU General Public License | |
| 15 ## along with this program. If not, see <https://www.gnu.org/licenses/>. | |
| 16 ################################################################################ | |
| 17 | |
| 18 use strict; | |
| 19 use warnings; | |
| 20 | |
| 21 my $version = "1.0.1"; # version | |
| 22 | |
| 23 #my $number = 50; | |
| 24 | |
| 25 # Date and time of the current day (Beginning) | |
| 26 #my ($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now(); | |
| 27 my $start = time(); | |
| 28 | |
| 29 print "##################################################################\n"; | |
| 30 print "## ---> Welcome to $0 (version $version)!\n"; | |
| 31 #print "## Start Date (yyyy-mm-dd, hh:min:sec): $start_year-$start_month-$start_day, $start_hour:$start_min:$start_sec\n"; | |
| 32 print "##################################################################\n\n"; | |
| 33 | |
| 34 #BioPerl, Date::Calc, File::Log, have been removed from the @modules | |
| 35 my @modules = qw( | |
| 36 Archive::Tar | |
| 37 Bio::SeqIO | |
| 38 Bio::Species | |
| 39 File::Copy | |
| 40 File::Path | |
| 41 Net::FTP | |
| 42 IO::Uncompress::Gunzip | |
| 43 LWP::Simple | |
| 44 POSIX | |
| 45 | |
| 46 ); | |
| 47 | |
| 48 foreach my $module (@modules) { | |
| 49 if (isModuleInstalled($module)) { | |
| 50 print "$module is.................installed!\n"; | |
| 51 } | |
| 52 else { | |
| 53 print "$module is not installed. Please install it and try again.\n"; | |
| 54 print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n"; | |
| 55 print "cpan -i -f $module\n"; | |
| 56 #system("cpan -i -f $module"); | |
| 57 exit 1; | |
| 58 } | |
| 59 } | |
| 60 print "\n"; | |
| 61 | |
| 62 use Archive::Tar; | |
| 63 use Bio::SeqIO; | |
| 64 use Bio::Species; | |
| 65 #use Date::Calc qw(:all); | |
| 66 use File::Copy qw(cp move); | |
| 67 use File::Path qw(rmtree); | |
| 68 use Net::FTP; | |
| 69 use IO::Uncompress::Gunzip qw(gunzip $GunzipError); | |
| 70 use LWP::Simple qw(get); | |
| 71 use POSIX qw(floor); | |
| 72 #use File::Log; | |
| 73 | |
| 74 #################################################################################################### | |
| 75 ## A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories. | |
| 76 ## | |
| 77 #################################################################################################### | |
| 78 | |
| 79 ### main program | |
| 80 ### parameters | |
| 81 my $directory = "genbank"; | |
| 82 | |
| 83 my $kingdom = ""; # kingdom of organism | |
| 84 | |
| 85 my $releaseDate = "0000-00-00"; # sequences are downloaded from this release date | |
| 86 | |
| 87 my $components; # components of the assembly | |
| 88 | |
| 89 my $species = ""; # species name | |
| 90 | |
| 91 my $getSummaries; # indicates if a new assembly summary is required | |
| 92 | |
| 93 my $assemblyLevel = "Complete Genome,Chromosome,Scaffold,Contig"; # assembly level of the genome | |
| 94 | |
| 95 my $quantity; # limit number of assemblies to download | |
| 96 | |
| 97 my $sequenceID; | |
| 98 | |
| 99 my $ftpServor = "ftp.ncbi.nlm.nih.gov"; | |
| 100 | |
| 101 my $enaFtpServor = "ftp.sra.ebi.ac.uk"; | |
| 102 | |
| 103 my $fldSep = "/"; # folder seperation change by OS | |
| 104 | |
| 105 my @availableKingdoms = ( | |
| 106 "archaea", | |
| 107 "bacteria", | |
| 108 "fungi", | |
| 109 "invertebrate", | |
| 110 "plant", | |
| 111 "protozoa", | |
| 112 "vertebrate_mammalian", | |
| 113 "vertebrate_other", | |
| 114 "viral" | |
| 115 ); # list of available kingdoms | |
| 116 | |
| 117 my $actualOS = "Unix"; # OS of the computer | |
| 118 | |
| 119 my $mainFolder; # folder in which the assemblies results are stored | |
| 120 | |
| 121 my $assemblyTaxid = ""; # taxid for assembly | |
| 122 | |
| 123 my $sraID; # SRA sequence ID | |
| 124 | |
| 125 my $assemblyPrjID; # assembly or prj ID | |
| 126 | |
| 127 my $log; # log | |
| 128 | |
| 129 my $path = ""; | |
| 130 | |
| 131 | |
| 132 if (@ARGV<1) { | |
| 133 help_user_simple($0); | |
| 134 exit 0; | |
| 135 } | |
| 136 | |
| 137 if ($ARGV[0] eq "-help" || $ARGV[0] eq "-h") { | |
| 138 help_user_advance($0); | |
| 139 exit 0; | |
| 140 } | |
| 141 | |
| 142 if ($ARGV[0] eq "-version" || $ARGV[0] eq "-v") { | |
| 143 program_version($0); | |
| 144 exit 0; | |
| 145 } | |
| 146 | |
| 147 ##requirements | |
| 148 for (my $i = 0; $i <= $#ARGV; $i++) { | |
| 149 if ($ARGV[$i]=~/-kingdom/i or $ARGV[$i]=~/-k/i) { | |
| 150 $kingdom = $ARGV[$i+1]; | |
| 151 } | |
| 152 elsif ($ARGV[$i]=~/-path/i or $ARGV[$i]=~/-pathSummary/i) { | |
| 153 $path = $ARGV[$i+1]; | |
| 154 } | |
| 155 elsif ($ARGV[$i]=~/-directory/i or $ARGV[$i]=~/-dir/i) { | |
| 156 $directory = $ARGV[$i+1]; | |
| 157 } | |
| 158 elsif ($ARGV[$i]=~/-date/i) { | |
| 159 $releaseDate = $ARGV[$i+1]; | |
| 160 } | |
| 161 elsif ($ARGV[$i]=~/-getSummaries/i or $ARGV[$i]=~/-get/i) { | |
| 162 $getSummaries = $ARGV[$i+1]; | |
| 163 } | |
| 164 elsif ($ARGV[$i]=~/-species/i or $ARGV[$i]=~/-s/i) { | |
| 165 $species = $ARGV[$i+1]; | |
| 166 } | |
| 167 elsif ($ARGV[$i]=~/-level/i or $ARGV[$i]=~/-le/i) { | |
| 168 $assemblyLevel = $ARGV[$i+1]; | |
| 169 } | |
| 170 elsif ($ARGV[$i]=~/-components/i or $ARGV[$i]=~/-c/i) { | |
| 171 $components = $ARGV[$i+1]; | |
| 172 } | |
| 173 elsif ($ARGV[$i]=~/-quantity/i or $ARGV[$i]=~/-q/i or $ARGV[$i]=~/-number/i or $ARGV[$i]=~/-n/i) { | |
| 174 $quantity = int($ARGV[$i+1]); | |
| 175 } | |
| 176 elsif ($ARGV[$i]=~/-ena/i) { | |
| 177 $sequenceID = $ARGV[$i+1]; | |
| 178 } | |
| 179 elsif ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { | |
| 180 $mainFolder = $ARGV[$i+1]; | |
| 181 } | |
| 182 elsif ($ARGV[$i]=~/-taxid/i) { | |
| 183 $assemblyTaxid = $ARGV[$i+1]; | |
| 184 } | |
| 185 elsif ($ARGV[$i]=~/-fastq/i) { | |
| 186 $sraID = $ARGV[$i+1]; | |
| 187 } | |
| 188 elsif ($ARGV[$i]=~/-assembly_or_project/i) { | |
| 189 $assemblyPrjID = $ARGV[$i+1]; | |
| 190 } | |
| 191 elsif ($ARGV[$i]=~/-log/i) { | |
| 192 $log = 1; | |
| 193 # $log = File::Log->new({ | |
| 194 # debug => 4, | |
| 195 # logFileName => 'myLogFile.log', | |
| 196 # logFileMode => '>', | |
| 197 # dateTimeStamp => 1, | |
| 198 # stderrRedirect => 1, | |
| 199 # defaultFile => 0, | |
| 200 # logFileDateTime => 1, | |
| 201 # appName => 'getSequenceInfo', | |
| 202 # PIDstamp => 0, | |
| 203 # storeExpText => 1, | |
| 204 # msgprepend => '', | |
| 205 # say => 1, | |
| 206 # }); | |
| 207 } | |
| 208 } | |
| 209 | |
| 210 #define folder separator and OS | |
| 211 if ($^O =~ /MSWin32/) { | |
| 212 $fldSep = "\\"; | |
| 213 $actualOS = "MSWin32"; | |
| 214 } | |
| 215 | |
| 216 #LOG file | |
| 217 if ($log) { open (LOG, "log.txt") or die " error open log.txt $!:"; } | |
| 218 | |
| 219 | |
| 220 print "Working ...\n"; | |
| 221 | |
| 222 if ($kingdom eq "viruses") { $kingdom = "viral"; } | |
| 223 | |
| 224 if (grep(/^$kingdom$/i, @availableKingdoms)) { | |
| 225 | |
| 226 my @patternParametersList; | |
| 227 | |
| 228 my @levelList = split /,/, $assemblyLevel; | |
| 229 | |
| 230 if ($species ne "") { | |
| 231 my @speciesList = split(/,/, $species); | |
| 232 | |
| 233 foreach my $actualSpecies (@speciesList) { | |
| 234 get_assembly_summary_species( $quantity, $releaseDate, $directory,$kingdom, $actualSpecies, | |
| 235 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, | |
| 236 $getSummaries, $components, $ftpServor); | |
| 237 } | |
| 238 } | |
| 239 elsif ($assemblyTaxid ne "") { | |
| 240 my @taxidList = split(/,/, $assemblyTaxid); | |
| 241 | |
| 242 foreach my $actualID (@taxidList) { | |
| 243 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, | |
| 244 \@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log, | |
| 245 $getSummaries, $components, $ftpServor); | |
| 246 } | |
| 247 } | |
| 248 else { | |
| 249 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, | |
| 250 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, | |
| 251 $getSummaries, $components, $ftpServor); | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 | |
| 256 if ($sequenceID) { | |
| 257 my @sequenceIDList = split /,/, $sequenceID; | |
| 258 | |
| 259 foreach my $enaID (@sequenceIDList) { | |
| 260 sequence_ena($enaID, $log); | |
| 261 } | |
| 262 } | |
| 263 | |
| 264 if ($sraID) { | |
| 265 my @sraIDList = split /,/, $sraID; | |
| 266 | |
| 267 foreach my $sra (@sraIDList) { | |
| 268 download_ena_fastq($enaFtpServor, $sra, $log); | |
| 269 } | |
| 270 } | |
| 271 | |
| 272 if ($assemblyPrjID) { | |
| 273 download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log); | |
| 274 } | |
| 275 | |
| 276 #my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now(); | |
| 277 | |
| 278 #my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) = | |
| 279 # Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec, | |
| 280 # $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec); | |
| 281 | |
| 282 #print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n"; | |
| 283 print "Thank you for using $0!\n"; | |
| 284 #print "Execution time: $D_y years, $D_m months, $D_d days, $Dh:$Dm:$Ds (hours:minutes:seconds)\n"; | |
| 285 my $end = time(); | |
| 286 | |
| 287 my $total = $end - $start; | |
| 288 my $min = $total / 60; | |
| 289 my $hrs = $min / 60; | |
| 290 | |
| 291 print "Total time: $total seconds OR $min minutes OR $hrs hours ! \n"; | |
| 292 | |
| 293 ### subroutine | |
| 294 # display global help document | |
| 295 sub help_user_simple { | |
| 296 my $programme = shift @_; | |
| 297 print STDERR "Usage : perl $programme [options] \n"; | |
| 298 print "Type perl $programme -version or perl $programme -v to get the current version\n"; | |
| 299 print "Type perl $programme -help or perl $programme -h to get full help\n"; | |
| 300 } | |
| 301 #------------------------------------------------------------------------------ | |
| 302 # display full help document | |
| 303 sub help_user_advance { | |
| 304 print <<HEREDOC; | |
| 305 | |
| 306 Name: | |
| 307 $0 | |
| 308 | |
| 309 Synopsis: | |
| 310 A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories. | |
| 311 | |
| 312 Usage: | |
| 313 perl $0 [options] | |
| 314 examples: | |
| 315 perl $0 -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01 | |
| 316 perl $0 -k viruses -n 5 -date 2019-06-01 | |
| 317 perl $0 -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results | |
| 318 perl $0 -ena BN000065 | |
| 319 perl $0 -fastq ERR818002 | |
| 320 perl $0 -fastq ERR818002,ERR818004 | |
| 321 | |
| 322 Kingdoms: | |
| 323 archaea | |
| 324 bacteria | |
| 325 fungi | |
| 326 invertebrate | |
| 327 plant | |
| 328 protozoa | |
| 329 vertebrate_mammalian | |
| 330 vertebrate_other | |
| 331 viral | |
| 332 | |
| 333 Assembly levels: | |
| 334 "Complete Genome" | |
| 335 Chromosome | |
| 336 Scaffold | |
| 337 Contig | |
| 338 | |
| 339 General: | |
| 340 -help or -h displays this help | |
| 341 -version or -v displays the current version of the program | |
| 342 | |
| 343 Options ([XXX] represents the expected value): | |
| 344 -directory or -dir [XXX] allows to indicate the NCBI's nucleotide sequences repository (default: $directory) | |
| 345 -get or -getSummaries [XXX] allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...) | |
| 346 -k or -kingdom [XXX] allows to indicate kingdom of the organism (see the examples above) | |
| 347 -s or -species [XXX] allows to indicate the species (must be combined with -k option) | |
| 348 -taxid [XXX] allows to indicate a specific taxid (must be combined with -k option) | |
| 349 -assembly_or_project [XXX] allows to indicate a specific assembly accession or bioproject (must be combined with -k option) | |
| 350 -date [XXX] indicates the release date (with format yyyy-mm-dd) from which sequence information are available | |
| 351 -l or -level [XXX] allows to select a specific assembly level (e.g. "Complete Genome") | |
| 352 -o or -output [XXX] allows users to name the output result folder | |
| 353 -n or -number [XXX] allows to limit the total number of assemblies to be downloaded | |
| 354 -c or -components [XXX] allows to select specific components of the assembly (e.g. plasmid, chromosome, ...) | |
| 355 -ena [XXX] allows to download report and fasta file given a ENA sequence ID | |
| 356 -fastq [XXX] allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html) | |
| 357 -log allows to create a log file | |
| 358 HEREDOC | |
| 359 } | |
| 360 #------------------------------------------------------------------------------ | |
| 361 # display program version | |
| 362 sub program_version { | |
| 363 my $programme = shift @_; | |
| 364 print "\n $programme, version : $version\n"; | |
| 365 print "\n A perl script to get sequences informations\n"; | |
| 366 } | |
| 367 #------------------------------------------------------------------------------ | |
| 368 sub get_assembly_summary_species { | |
| 369 my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep, | |
| 370 $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_; | |
| 371 | |
| 372 # assembly_summary.txt file from NCBI FTP site | |
| 373 #my $assemblySummary = get_summaries($ftpServor, $kingdom, $getSummaries, $directory); | |
| 374 | |
| 375 my $assemblySummary = ""; | |
| 376 if($path){ | |
| 377 $assemblySummary = $path; | |
| 378 } | |
| 379 else{ | |
| 380 $assemblySummary = download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries); | |
| 381 } | |
| 382 | |
| 383 #lineage folder | |
| 384 # my $lineage_file = "/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"; | |
| 385 | |
| 386 # allow to check old summary download | |
| 387 my $oldKingdom = ""; | |
| 388 | |
| 389 # start of output file | |
| 390 if ($log) { | |
| 391 print LOG "...Downloading assembly_summary.txt...\n"; | |
| 392 } | |
| 393 | |
| 394 # if ($actualOS =~ /Unix/i) { | |
| 395 #initialiaze tar manipulation | |
| 396 # my $tar = Archive::Tar->new; | |
| 397 | |
| 398 #download taxdump folder | |
| 399 # download_file($ftpServor, $lineage_file); | |
| 400 # $tar->read("new_taxdump.tar.gz"); | |
| 401 # $tar->extract_file("rankedlineage.dmp"); | |
| 402 # } | |
| 403 | |
| 404 | |
| 405 #my $kingdomRep = $kingdom."_".$start_year."_".$start_month."_".$start_day; | |
| 406 #my $kingdomRep = $kingdom."_folder"; | |
| 407 my $kingdomRep = "folder"; | |
| 408 | |
| 409 if ($mainFolder) { $kingdomRep = $mainFolder; } | |
| 410 mkdir $kingdomRep unless -d $kingdomRep; | |
| 411 | |
| 412 # Repository for request | |
| 413 | |
| 414 #my $repositoryAssembly = "assembly_repository_".$assemblyTaxid; | |
| 415 my $repositoryAssembly = "result"; | |
| 416 mkdir $repositoryAssembly unless -d $repositoryAssembly; | |
| 417 | |
| 418 my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly; | |
| 419 if (-d $oldAssemblyRep) { rmtree($oldAssemblyRep) } | |
| 420 | |
| 421 # Repository for fna file | |
| 422 my $repositoryFNA = "Assembly"; | |
| 423 mkdir $repositoryFNA unless -e $repositoryFNA; | |
| 424 | |
| 425 # Repository for genbank file | |
| 426 my $repositoryGenbank = "GenBank"; | |
| 427 mkdir $repositoryGenbank unless -e $repositoryGenbank; | |
| 428 | |
| 429 # Reposotiry for report file | |
| 430 my $repositoryReport = "Report"; | |
| 431 mkdir $repositoryReport unless -e $repositoryReport ; | |
| 432 | |
| 433 # Repositories for required components | |
| 434 my %componentsRepHash; | |
| 435 | |
| 436 if ($components) { | |
| 437 for my $component (split /,/, $components) { | |
| 438 my $specificRep = $component."_folder"; | |
| 439 #my $specificRep = $component."_".$species."_".$kingdom."_".$start_year."_".$start_month."_".$start_day; | |
| 440 mkdir $specificRep unless -d $specificRep; | |
| 441 $componentsRepHash{$component} = $specificRep; | |
| 442 } | |
| 443 } | |
| 444 | |
| 445 if ($log) { | |
| 446 print LOG "...Create kingdom and components repository...\n"; | |
| 447 } | |
| 448 | |
| 449 my %assemblyReportList; | |
| 450 my @assemblyRepresentationList = qw/Full Partial/; | |
| 451 my @levelList = @{$levelList}; | |
| 452 my $checkCompleteGenome = grep(/complete genome/i, @levelList); | |
| 453 | |
| 454 if ($checkCompleteGenome > 0) {@assemblyRepresentationList = qw/Full/;} | |
| 455 | |
| 456 if (-e $assemblySummary) { | |
| 457 # Read file | |
| 458 open (SUM, $assemblySummary) or die " error open assembly_summary.txt $!:"; | |
| 459 while(<SUM>) { | |
| 460 chomp; | |
| 461 if ($_ !~ m/^#/) { | |
| 462 my @infoList = split /\t/, $_; | |
| 463 my $foundAssemRep = grep (/$infoList[13]/i, @assemblyRepresentationList); | |
| 464 my $checkLevel = grep(/$infoList[11]/i, @levelList); #replace 11 by 10 | |
| 465 | |
| 466 if ($foundAssemRep > 0 && $checkLevel > 0) { | |
| 467 my $indexInfo = 0; | |
| 468 my $searchPattern = ""; | |
| 469 my $regex = ""; | |
| 470 | |
| 471 if ($species !~ /^$/) { | |
| 472 $indexInfo = 7; | |
| 473 $searchPattern = $species; | |
| 474 $regex = qr/$searchPattern/i; | |
| 475 } | |
| 476 elsif ($assemblyTaxid !~ /^$/) { | |
| 477 $indexInfo = 5; | |
| 478 $searchPattern = $assemblyTaxid; | |
| 479 $regex = qr/^$searchPattern$/i; | |
| 480 } | |
| 481 | |
| 482 if (($infoList[$indexInfo] =~ $regex) or ($kingdom !~ /^$/ && $searchPattern =~ /^$/)) { | |
| 483 my @gcfInfo = split(/\//, $infoList[19]); | |
| 484 my $gcfName = pop(@gcfInfo); | |
| 485 my $realDate = $infoList[14]; | |
| 486 $realDate =~ s/\//-/g; | |
| 487 | |
| 488 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz"; | |
| 489 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz"; | |
| 490 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt"; | |
| 491 | |
| 492 if ($realDate gt $releaseDate) { | |
| 493 | |
| 494 $dnaFile = obtain_file($ftpServor, $dnaFile); | |
| 495 $genbankFile = obtain_file($ftpServor, $genbankFile); | |
| 496 $assemblyReport = obtain_file($ftpServor, $assemblyReport); | |
| 497 | |
| 498 download_file($ftpServor, $dnaFile); | |
| 499 download_file($ftpServor, $genbankFile); | |
| 500 download_file($ftpServor, $assemblyReport); | |
| 501 | |
| 502 if ($log) { | |
| 503 print LOG "...download FASTA and GenBank report files...\n"; | |
| 504 } | |
| 505 | |
| 506 # download sequences and check number of "N" characters | |
| 507 my $fileFasta = $gcfName."_genomic.fna.gz"; | |
| 508 my $ucpFasta = $gcfName."_genomic.fna"; | |
| 509 if (-e $fileFasta) { | |
| 510 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n"; | |
| 511 move($ucpFasta, $repositoryFNA) or die "move failed: $!"; | |
| 512 } | |
| 513 | |
| 514 if ($log) { | |
| 515 print LOG "...Unzip FASTA file named $fileFasta ...\n"; | |
| 516 } | |
| 517 | |
| 518 # download genome report | |
| 519 my $fileReport = $gcfName."_assembly_report.txt"; | |
| 520 if (-e $fileReport) { | |
| 521 my $fileInformations = $gcfName."_informations.xls"; | |
| 522 move($fileReport, $repositoryReport) or die "move failed: $!"; | |
| 523 } | |
| 524 | |
| 525 # download genbank files | |
| 526 my $fileGenbank = $gcfName."_genomic.gbff.gz"; | |
| 527 my $ucpGenbank = $gcfName."_genomic.gbff"; | |
| 528 if (-e $fileGenbank) { | |
| 529 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n"; | |
| 530 move($ucpGenbank, $repositoryGenbank) or die "move failed: $!"; | |
| 531 } | |
| 532 | |
| 533 if ($log) { | |
| 534 print LOG "...Unzip GenBank file $fileGenbank ...\n"; | |
| 535 } | |
| 536 | |
| 537 # association report and fasta | |
| 538 my $fileFastaGenbank = $ucpFasta . "," . $ucpGenbank; | |
| 539 $assemblyReportList{$fileReport} = $fileFastaGenbank; | |
| 540 | |
| 541 if ($quantity) { $quantity -= 1; } | |
| 542 | |
| 543 } | |
| 544 } | |
| 545 } | |
| 546 } | |
| 547 if (defined $quantity && $quantity == 0) { | |
| 548 $quantity = undef; | |
| 549 last; | |
| 550 } | |
| 551 } | |
| 552 close(SUM) or die "close file error : $!"; | |
| 553 | |
| 554 if (!keys %assemblyReportList) { | |
| 555 print "##################################################################\n"; | |
| 556 print "No results were found for the following query:\n"; | |
| 557 print "perl $0 @ARGV\n"; | |
| 558 print "##################################################################\n\n"; | |
| 559 | |
| 560 if ($actualOS =~ /unix/i) { unlink glob "*.dmp *.gz" or die "for file *.dmp *.gz $!:"; } | |
| 561 | |
| 562 if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; } | |
| 563 rmdir $repositoryAssembly or die "failed to remove directory $!:"; | |
| 564 rmdir $repositoryFNA or die "failed to remove directory $!:"; | |
| 565 rmdir $repositoryGenbank or die "failed to remove directory $!:"; | |
| 566 rmdir $repositoryReport or die "failed to remove directory $!:"; | |
| 567 | |
| 568 if ($components) { | |
| 569 for my $componentRep (values %componentsRepHash) { | |
| 570 rmdir $componentRep or die "failed to remove directory $!:"; | |
| 571 } | |
| 572 } | |
| 573 if ($log) { | |
| 574 print LOG "...No results were found for the following query: "; | |
| 575 print LOG "perl $0 @ARGV \n"; | |
| 576 } | |
| 577 | |
| 578 } | |
| 579 else { | |
| 580 | |
| 581 if ($log) { | |
| 582 print LOG "...Results were found for the following query: "; | |
| 583 print LOG "perl $0 @ARGV \n"; | |
| 584 } | |
| 585 # write summary files | |
| 586 my %componentsSumHash; | |
| 587 my @keysList = keys %assemblyReportList; | |
| 588 my $summary = "summary.xls"; | |
| 589 my $htmlSummary = "summary.html"; | |
| 590 | |
| 591 if ($components) { | |
| 592 for my $component (split /,/, $components) { | |
| 593 my $specificSummary = $component. "_summary.xls"; | |
| 594 $componentsSumHash{$component} = $specificSummary; | |
| 595 } | |
| 596 } | |
| 597 | |
| 598 my $fileReport = "." . $fldSep. $repositoryReport . $fldSep . $keysList[0]; | |
| 599 my @header = (); | |
| 600 | |
| 601 open(FILE, $fileReport) or die "error open file : $!"; | |
| 602 while(<FILE>) { | |
| 603 chomp; | |
| 604 if($_ =~ /:/){ | |
| 605 $_ =~ s/^#*//; | |
| 606 my @ligne = split /:/, $_; | |
| 607 push(@header, $ligne[0]); | |
| 608 } | |
| 609 } | |
| 610 close(FILE) or die "error close file : $!"; | |
| 611 | |
| 612 open(HEAD, ">", $summary) or die " error open file : $!"; | |
| 613 foreach(@header) { | |
| 614 print HEAD $_ . "\t"; | |
| 615 } | |
| 616 | |
| 617 print HEAD "Pubmed\tNucleScore\tClassification\t"; | |
| 618 print HEAD "Country\tHost\tIsolation source\tA percent\t"; | |
| 619 print HEAD "T percent\tG percent\tC percent\tN percent\tGC percent\t"; | |
| 620 print HEAD "ATGC ratio\tLength\tShape\n"; | |
| 621 close(HEAD) or die "error close file : $!"; | |
| 622 | |
| 623 if ($components) { | |
| 624 foreach my $componentSummary (values %componentsSumHash) { | |
| 625 open(SUM, ">>", $componentSummary) or die "error open file : $!"; | |
| 626 print SUM "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t"; | |
| 627 print SUM "GC percent\tA percent\tT percent\tG percent\tC percent\n"; | |
| 628 close(SUM) or die "error close file : $!"; | |
| 629 } | |
| 630 } | |
| 631 | |
| 632 for my $file (@keysList) { | |
| 633 my $reportFile = $repositoryReport . $fldSep . $file; | |
| 634 my @fastaGenbank = split /,/, $assemblyReportList{$file}; | |
| 635 my $extFasta = $fastaGenbank[0]; | |
| 636 my $extGenbank = $fastaGenbank[1]; | |
| 637 my $fnaFile = $repositoryFNA . $fldSep . $extFasta; | |
| 638 my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank; | |
| 639 | |
| 640 write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly, | |
| 641 \%componentsSumHash, $kingdom, $actualOS, \@header, $log); | |
| 642 | |
| 643 if ($log) { | |
| 644 print LOG "...Call write_assembly subroutine...\n"; | |
| 645 } | |
| 646 } | |
| 647 | |
| 648 write_html_summary($summary); | |
| 649 | |
| 650 if ($log) { | |
| 651 print LOG "...Call write_html_summary subroutine...\n"; | |
| 652 } | |
| 653 | |
| 654 if ($components) { | |
| 655 my @componentList = keys %componentsSumHash; | |
| 656 my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList); | |
| 657 | |
| 658 if (keys %componentFastaHash && $log) { $log->msg(1,"call create_component_sequence_file subroutine");} | |
| 659 | |
| 660 foreach my $component (keys %componentFastaHash) { | |
| 661 move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; | |
| 662 } | |
| 663 } | |
| 664 | |
| 665 move($summary, $repositoryAssembly) or die "move failed: $!"; | |
| 666 move($htmlSummary, $repositoryAssembly) or die "move failed: $!"; | |
| 667 move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!"; | |
| 668 move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!"; | |
| 669 move($repositoryReport , $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!"; | |
| 670 if ($log) { | |
| 671 print LOG "...move fasta, genbank and report to query folder \n"; | |
| 672 } | |
| 673 | |
| 674 if ($components) { | |
| 675 for my $component (keys %componentsSumHash) { | |
| 676 move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; | |
| 677 move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!" | |
| 678 } | |
| 679 if ($log) { | |
| 680 print LOG "...move component files to folders\n"; | |
| 681 } | |
| 682 } | |
| 683 | |
| 684 move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!"; | |
| 685 | |
| 686 if ($log) { | |
| 687 print LOG "...move query folder to main folder\n"; | |
| 688 } | |
| 689 | |
| 690 # if ($actualOS =~ /unix/i) { unlink glob "*.dmp" or die "for file *.dmp $!:"; } | |
| 691 unlink glob "*.gz sequence.txt" or die "$!: for file *.gz sequence.txt"; | |
| 692 } | |
| 693 } | |
| 694 } | |
| 695 #write general assembly file | |
| 696 sub write_assembly { | |
| 697 my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly, | |
| 698 $componentsSumHashRef, $kingdom, $actualOS, $headerRef, $log) = @_; | |
| 699 | |
| 700 my %componentsSumHash = %{$componentsSumHashRef}; | |
| 701 my @header = @{$headerRef}; | |
| 702 my %hashInformations = (); | |
| 703 my $seq = ""; | |
| 704 my $genomeName = ""; | |
| 705 my $country = "na"; | |
| 706 my $GCpercent = -1; | |
| 707 my $taxId = "na"; | |
| 708 my $assemblyLine; | |
| 709 my $pubmedId = "na"; | |
| 710 my $host = "na"; | |
| 711 my $isoSource = "na"; | |
| 712 # my $species = "na"; | |
| 713 # my $genus = "na"; | |
| 714 # my $family = "na"; | |
| 715 # my $order = "na"; | |
| 716 # my $class = "na"; | |
| 717 # my $phylum = "na"; | |
| 718 my $shape = "na"; | |
| 719 my $geneNumber = "na"; | |
| 720 | |
| 721 open(REP, "<", $reportFile) or die "error open file $!:"; | |
| 722 while (<REP>) { | |
| 723 chomp; | |
| 724 $_ =~ s/^#*//; | |
| 725 if ($_ =~ /assembled-molecule/i) { $assemblyLine = $_; } | |
| 726 if ($_ =~ /:/) { | |
| 727 my @line = split /:/, $_; | |
| 728 if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); } | |
| 729 } | |
| 730 } | |
| 731 close(REP) or die "error close file $!:"; | |
| 732 | |
| 733 | |
| 734 open(SUM, ">>", $summary) or die "error open file $!:"; | |
| 735 foreach my $k(@header) { | |
| 736 if (grep $_ eq $k, keys %hashInformations) { | |
| 737 | |
| 738 my $information = $hashInformations{$k}; | |
| 739 | |
| 740 if ($k =~ /Assembly name/i) { $genomeName = $information; } | |
| 741 | |
| 742 if ($information =~ /^\s*$/) { | |
| 743 print SUM "na\t"; | |
| 744 } else { | |
| 745 print SUM $information . "\t"; | |
| 746 } | |
| 747 } else { | |
| 748 print SUM "na\t"; | |
| 749 } | |
| 750 } | |
| 751 close(SUM) or die "error close file $!:"; | |
| 752 | |
| 753 open(FNA, "<", $fnaFile) or die "Could not open $!:"; | |
| 754 while (<FNA>) { | |
| 755 chomp; | |
| 756 if ($_ !~ /^>/) { $seq .= $_; } | |
| 757 } | |
| 758 close(FNA) or die "error close file :$!"; | |
| 759 | |
| 760 foreach my $summaryKey (keys %hashInformations) { | |
| 761 if ($summaryKey =~ /taxid/i) { | |
| 762 $taxId = $hashInformations{$summaryKey}; | |
| 763 } | |
| 764 } | |
| 765 | |
| 766 my $classification = get_taxonomic_rank_genbank($genbankFile); | |
| 767 | |
| 768 if ($log) { | |
| 769 print LOG "...get taxonomic rank from genbank file\n"; | |
| 770 } | |
| 771 | |
| 772 $GCpercent = gc_percent($seq); | |
| 773 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile); | |
| 774 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); | |
| 775 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt); | |
| 776 | |
| 777 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent); | |
| 778 | |
| 779 my $variance = shift_data_variance(@percentList); | |
| 780 my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length); | |
| 781 | |
| 782 if ($log) { | |
| 783 print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n"; | |
| 784 } | |
| 785 | |
| 786 open(GBFF, "<", $genbankFile) or die "Error open file $!:"; | |
| 787 while(<GBFF>) { | |
| 788 chomp; | |
| 789 if ($_ =~ /\/country="(.*)"/) { $country = trim($1); } | |
| 790 if ($_ =~ /PUBMED(.*)/) { $pubmedId = trim($1); } | |
| 791 if ($_ =~ /\/host="(.*)"/) { $host = trim($1); } | |
| 792 if ($_ =~ /\/isolation_source="(.*)"/) { $isoSource = trim($1); } | |
| 793 if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); } | |
| 794 if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); } | |
| 795 } | |
| 796 close(GBFF) or die "error close file $!:"; | |
| 797 | |
| 798 | |
| 799 open(SUM, ">>", $summary) or die "error open file $!:"; | |
| 800 print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ; | |
| 801 print SUM $country . "\t" . $host . "\t" . $isoSource . "\t" . $aPercent . "\t" ; | |
| 802 print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent ."\t" . $nPercent . "\t"; | |
| 803 print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n"; | |
| 804 close(SUM) or die "error close file: $!"; | |
| 805 | |
| 806 if (%componentsSumHash) { | |
| 807 write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log); | |
| 808 } | |
| 809 } | |
| 810 #------------------------------------------------------------------------------ | |
| 811 # get assembly component | |
| 812 sub write_assembly_component { | |
| 813 my($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_; | |
| 814 | |
| 815 my %componentsSumHash = %{$componentsSumHashRef}; | |
| 816 my $status = "na"; | |
| 817 my $level = "na"; | |
| 818 my $gcpercent; | |
| 819 my $tmp_fasta_file = "sequence.txt"; | |
| 820 my @desc = (); | |
| 821 | |
| 822 # check each sequence from (multi-)fasta file | |
| 823 my ($seq, $inputfile); | |
| 824 | |
| 825 if ($log) { | |
| 826 print LOG "...search specific components\n"; | |
| 827 } | |
| 828 | |
| 829 # extract sequence details | |
| 830 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$fnaFile); | |
| 831 | |
| 832 while ($seq = $seqIO->next_seq()) { | |
| 833 my $seqID = $seq->id; # ID of sequence | |
| 834 my $seqDesc = $seq->desc; # Description of sequence | |
| 835 my $globalSeq = $seq->seq; | |
| 836 my $seqLength = $seq->length(); | |
| 837 | |
| 838 open(TSEQ, ">", $tmp_fasta_file) or die "Error open file: $!"; | |
| 839 print TSEQ $globalSeq; | |
| 840 close(TSEQ); | |
| 841 | |
| 842 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file); | |
| 843 | |
| 844 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); | |
| 845 | |
| 846 $gcpercent = gc_percent($globalSeq); | |
| 847 | |
| 848 @desc = split /,/, $seqDesc; | |
| 849 | |
| 850 if ($desc[1]) { $level = $desc[1]; } | |
| 851 | |
| 852 foreach my $component (keys %componentsSumHash) { | |
| 853 if ($desc[0] =~ /$component/) { | |
| 854 $status = $component; | |
| 855 my $info = $seqID . "\t" . $genomeName ."\t" . $seqDesc . "\t" . $seqLength . "\t" . $status . "\t" . $level ."\t"; | |
| 856 $info.= $gcpercent."\t". $aPercent ."\t". $tPercent ."\t". $gPercent ."\t". $cPercent . "\n"; | |
| 857 add_to_file($componentsSumHash{$component}, $info); | |
| 858 if ($log) { | |
| 859 print LOG "...found component $component \n"; | |
| 860 } | |
| 861 } | |
| 862 } | |
| 863 } | |
| 864 } | |
| 865 #------------------------------------------------------------------------------ | |
| 866 # download fasta sequence and report on ENA with assembly ID | |
| 867 sub get_fasta_and_report_sequence_ena_assembly { | |
| 868 my($sequenceID) = @_; | |
| 869 my $tmp_file = "fichier.txt"; | |
| 870 my @id_list = (); | |
| 871 my $id_chain = ""; | |
| 872 my $fasta_file = ""; | |
| 873 my $report_file = ""; | |
| 874 my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml"; | |
| 875 my $output = get($url); | |
| 876 | |
| 877 open(TMP, ">", $tmp_file) or die("could not open $!"); | |
| 878 print TMP $output; | |
| 879 close(TMP) or die("could not close $!"); | |
| 880 | |
| 881 open(TMP, "<", $tmp_file) or die("could not open $!"); | |
| 882 while(<TMP>){ | |
| 883 if($_ =~ /<CHROMOSOME accession="(.*)">/){ | |
| 884 push(@id_list, $1) | |
| 885 } | |
| 886 } | |
| 887 close(TMP) or die("could not close $!"); | |
| 888 | |
| 889 $id_chain = join(",", @id_list); | |
| 890 $url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta"; | |
| 891 $output = get($url); | |
| 892 $fasta_file = $sequenceID . ".fasta"; | |
| 893 open(FILE, ">", $fasta_file) or die("could not open $!"); | |
| 894 print FILE $output; | |
| 895 close(FILE) or die("could not close $!"); | |
| 896 | |
| 897 | |
| 898 $report_file = $sequenceID . "_report.txt"; | |
| 899 for my $id (@id_list) { | |
| 900 $url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true"; | |
| 901 $output = get($url); | |
| 902 open(FILE, ">>", $report_file) or die("could not open $!"); | |
| 903 print FILE $output; | |
| 904 print FILE "##########################################################################\n\n"; | |
| 905 close(FILE) or die("could not close $!"); | |
| 906 } | |
| 907 | |
| 908 unlink "fichier.txt" or die "error delete file :$!"; | |
| 909 | |
| 910 return ($fasta_file, $report_file); | |
| 911 } | |
| 912 #------------------------------------------------------------------------------ | |
| 913 # download ENA | |
| 914 sub sequence_ena { | |
| 915 my($sequenceID, $log) = @_; | |
| 916 #my $assemblyRep = $sequenceID . "_folder"; | |
| 917 my $fastaFile; | |
| 918 my $reportFile; | |
| 919 | |
| 920 #if(-d $assemblyRep) { rmtree($assemblyRep); } | |
| 921 #mkdir $assemblyRep; | |
| 922 | |
| 923 if ($log) { | |
| 924 print LOG "...ENA sequence downloading ...\n"; | |
| 925 } | |
| 926 #if ($log) {$log->msg(1, "Creation of repository: $assemblyRep\n");} | |
| 927 | |
| 928 if($sequenceID =~ /^GCA_.*/){ | |
| 929 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_assembly($sequenceID); | |
| 930 } | |
| 931 else { | |
| 932 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_other($sequenceID); | |
| 933 } | |
| 934 | |
| 935 if ($log) { | |
| 936 print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n"; | |
| 937 } | |
| 938 | |
| 939 #move($fastaFile, $assemblyRep) or die "move failed: $!"; | |
| 940 #move($reportFile, $assemblyRep) or die "move failed: $!"; | |
| 941 | |
| 942 if ($log) { | |
| 943 print LOG "...Move fasta and report files to folder\n"; | |
| 944 } | |
| 945 } | |
| 946 #------------------------------------------------------------------------------ | |
| 947 # download fasta sequence and report on ENA with ENA ID | |
| 948 sub get_fasta_and_report_sequence_ena_other { | |
| 949 my($sequenceID) = @_; | |
| 950 my $fasta_file = ""; | |
| 951 my $report_file = ""; | |
| 952 my $url; | |
| 953 my $output; | |
| 954 | |
| 955 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta"; | |
| 956 #if($actualOS eq "MSWin32"){ | |
| 957 $output = get($url); | |
| 958 $fasta_file = $sequenceID . ".fasta"; | |
| 959 open(FILE, ">", $fasta_file) or die("could not open $!"); | |
| 960 print FILE $output; | |
| 961 close(FILE) or die "could not close $!"; | |
| 962 #} | |
| 963 #else{ | |
| 964 # system("wget $url"); | |
| 965 #} | |
| 966 | |
| 967 $output = ""; | |
| 968 | |
| 969 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true"; | |
| 970 #if($actualOS eq "MSWin32"){ | |
| 971 $output = get($url); # replace by wget???? | |
| 972 $report_file = $sequenceID . "_report.txt"; | |
| 973 open(FILE, ">>", $report_file) or die "could not open: $!"; | |
| 974 print FILE $output; | |
| 975 close(FILE) or die "could not close $!"; | |
| 976 #} | |
| 977 #else{ | |
| 978 # system("wget $url"); | |
| 979 #} | |
| 980 | |
| 981 return ($fasta_file, $report_file); | |
| 982 } | |
| 983 #------------------------------------------------------------------------------ | |
| 984 # add information to file | |
| 985 sub add_to_file { | |
| 986 my ($file, $info) = @_; | |
| 987 open(FILE, ">>", $file) or die ("Could not open $!"); | |
| 988 print FILE $info; | |
| 989 close(FILE); | |
| 990 } | |
| 991 #------------------------------------------------------------------------------ | |
| 992 # return taxonomic rank of species by tax id | |
| 993 sub get_taxonomic_rank { | |
| 994 my($tax_id, $taxonomic_file) = @_; | |
| 995 my $species = "na"; | |
| 996 my $genus = "na"; | |
| 997 my $family = "na"; | |
| 998 my $order = "na"; | |
| 999 my $class = "na"; | |
| 1000 my $phylum = "na"; | |
| 1001 | |
| 1002 # my ($species,$genus,$family,$order,$class,$phylum); | |
| 1003 my @tmp_array = ($species, $genus, $family, $order, $class, $phylum); | |
| 1004 | |
| 1005 open(TFILE, "<", $taxonomic_file) or | |
| 1006 die("Could not open $taxonomic_file: $!"); | |
| 1007 | |
| 1008 while(<TFILE>) { | |
| 1009 chomp; | |
| 1010 my @tax_info = split(/\|/, $_); | |
| 1011 | |
| 1012 if ($tax_info[0] == $tax_id) { | |
| 1013 @tax_info = trim_array(@tax_info); | |
| 1014 | |
| 1015 $tmp_array[0] = $tax_info[1]; | |
| 1016 splice(@tax_info, 0, 3); | |
| 1017 | |
| 1018 for(my $i = 1; $i < $#tmp_array + 1; $i++) { | |
| 1019 if (length($tax_info[$i-1]) > 0) { $tmp_array[$i] = $tax_info[$i-1]; } | |
| 1020 } | |
| 1021 close(TFILE) or die "error close $taxonomic_file $!:"; | |
| 1022 return @tmp_array; | |
| 1023 } | |
| 1024 } | |
| 1025 close(TFILE) or die "error close $taxonomic_file $!:"; | |
| 1026 } | |
| 1027 #------------------------------------------------------------------------------ | |
| 1028 # write html summary file | |
| 1029 sub write_html_summary { | |
| 1030 my($summary) = @_; | |
| 1031 my $htmlFile = "summary.html"; | |
| 1032 my $header = ""; | |
| 1033 my @fileToRead = (); | |
| 1034 | |
| 1035 open(HTML, ">", $htmlFile) or die "error open HTML summary $!"; | |
| 1036 print HTML "<!DOCTYPE html>\n"; | |
| 1037 print HTML "<html>\n"; | |
| 1038 print HTML " <head>\n"; | |
| 1039 print HTML " <title>Assembly summary</title>\n"; | |
| 1040 print HTML " </head>\n"; | |
| 1041 print HTML " <body>\n"; | |
| 1042 print HTML " <h2>Assembly Summary</h2>\n"; | |
| 1043 close(HTML) or die "error close HTML summary $!"; | |
| 1044 | |
| 1045 open(SUM, "<", $summary) or die "error open tsv summary $!"; | |
| 1046 @fileToRead = <SUM>; | |
| 1047 close(SUM) or die "error close tsv summary $!"; | |
| 1048 | |
| 1049 $header = splice(@fileToRead, 0, 1); | |
| 1050 | |
| 1051 for my $line (@fileToRead) { | |
| 1052 write_html_table($line, $htmlFile, $header); | |
| 1053 } | |
| 1054 | |
| 1055 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; | |
| 1056 print HTML " </body>\n"; | |
| 1057 print HTML "</html>\n"; | |
| 1058 close(HTML) or die "error close HTML summary $!"; | |
| 1059 } | |
| 1060 #------------------------------------------------------------------------------ | |
| 1061 # write html table for summary | |
| 1062 sub write_html_table { | |
| 1063 my ($line, $htmlFile, $header) = @_; | |
| 1064 | |
| 1065 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; | |
| 1066 print HTML " <table border=\"1\" style=\"margin-bottom: 20px;\">\n"; | |
| 1067 close(HTML) or die "error close HTML summary $!"; | |
| 1068 add_table_content($line, $htmlFile, $header); | |
| 1069 } | |
| 1070 #------------------------------------------------------------------------------ | |
| 1071 # add information to table | |
| 1072 sub add_table_content { | |
| 1073 my ($line, $htmlFile, $headers) = @_; | |
| 1074 | |
| 1075 my @assemblyHeader = split(/\t/, $headers); | |
| 1076 my @assemblyInfo = split(/\t/, $line); | |
| 1077 my %hashHeaderInfo; | |
| 1078 my $nbOfCell = 7; | |
| 1079 my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell); | |
| 1080 my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell; | |
| 1081 | |
| 1082 | |
| 1083 for (my $i = 0; $i < $#assemblyHeader + 1; $i++) { | |
| 1084 $hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i]; | |
| 1085 } | |
| 1086 | |
| 1087 my @keysHeaderInfo = keys %hashHeaderInfo; | |
| 1088 my $cellIndex = 0; | |
| 1089 | |
| 1090 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; | |
| 1091 for (my $turn = 0; $turn < $fullLine; $turn++) { | |
| 1092 | |
| 1093 print HTML " <tr>\n"; | |
| 1094 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) { | |
| 1095 print HTML " <th>$header</th>\n"; | |
| 1096 } | |
| 1097 print HTML " </tr>\n"; | |
| 1098 | |
| 1099 print HTML " <tr>\n"; | |
| 1100 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) { | |
| 1101 if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") { | |
| 1102 print HTML " <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=". | |
| 1103 "$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>"; | |
| 1104 } | |
| 1105 else { | |
| 1106 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n"; | |
| 1107 } | |
| 1108 } | |
| 1109 print HTML " </tr>\n"; | |
| 1110 | |
| 1111 $cellIndex += $nbOfCell; | |
| 1112 } | |
| 1113 | |
| 1114 print HTML " <tr>\n"; | |
| 1115 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) { | |
| 1116 print HTML " <th>$header</th>\n"; | |
| 1117 } | |
| 1118 print HTML " <tr>\n"; | |
| 1119 | |
| 1120 print HTML " <tr>\n"; | |
| 1121 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) { | |
| 1122 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n"; | |
| 1123 } | |
| 1124 print HTML " <tr>\n"; | |
| 1125 | |
| 1126 print HTML " </table>\n"; | |
| 1127 close(HTML) or die "error close HTML summary $!"; | |
| 1128 } | |
| 1129 #------------------------------------------------------------------------------ | |
| 1130 #getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file) | |
| 1131 sub get_taxonomic_rank_genbank { | |
| 1132 my ($genbank) = @_; | |
| 1133 | |
| 1134 my $seqio_object = Bio::SeqIO->new(-file => $genbank); | |
| 1135 my $seq_object = $seqio_object->next_seq; | |
| 1136 | |
| 1137 # legible and long | |
| 1138 my $species_object = $seq_object->species; | |
| 1139 my $species_string = $species_object->node_name; | |
| 1140 | |
| 1141 # get all taxa from the ORGANISM section in an array | |
| 1142 my @classification = $seq_object->species->classification; | |
| 1143 # my $arraySize = @classification; | |
| 1144 | |
| 1145 # print "@classification\n"; | |
| 1146 | |
| 1147 # if($arraySize == 7){ | |
| 1148 # ($species,$genus,$family,$order,$class,$phylum,$kingdomGB) = @classification; | |
| 1149 # } | |
| 1150 # elsif($arraySize == 4){ | |
| 1151 # ($species,$class,$phylum,$kingdomGB) = @classification; | |
| 1152 # } | |
| 1153 | |
| 1154 my $classification = join(",", @classification); | |
| 1155 | |
| 1156 return ($classification); | |
| 1157 } | |
| 1158 #------------------------------------------------------------------------------ | |
| 1159 #add all sequences components to file | |
| 1160 sub create_component_sequence_file { | |
| 1161 my ($fldSep, $repository, $listComponentRef) = @_; | |
| 1162 | |
| 1163 my @listFnaFile; | |
| 1164 my @listComponent = @{$listComponentRef}; | |
| 1165 | |
| 1166 opendir(my $dh, $repository) || die "Can't opendir $repository: $!"; | |
| 1167 @listFnaFile = grep{/fna$/} readdir($dh); | |
| 1168 closedir $dh; | |
| 1169 | |
| 1170 my %componentFastaHash; | |
| 1171 | |
| 1172 foreach my $component (@listComponent) { | |
| 1173 | |
| 1174 my $componentFasta = $component.".fasta"; | |
| 1175 | |
| 1176 foreach my $fnaFile (@listFnaFile) { | |
| 1177 | |
| 1178 # my $actualFile = $repository . $fldSep . $fnaFile; | |
| 1179 | |
| 1180 my $seq; | |
| 1181 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$repository . $fldSep . $fnaFile); | |
| 1182 | |
| 1183 while ($seq = $seqIO->next_seq()) { | |
| 1184 | |
| 1185 my $seqDesc = $seq->desc; | |
| 1186 | |
| 1187 if ($seqDesc =~ /$component/) { | |
| 1188 my $seqID = $seq->id; | |
| 1189 my $seqNuc = $seq->seq; | |
| 1190 my $shift = 60; | |
| 1191 my @seqArray = split //, $seqNuc; | |
| 1192 my $newSeqNuc = ""; | |
| 1193 | |
| 1194 if (length $seqNuc <= $shift) { | |
| 1195 $newSeqNuc = $seqNuc; | |
| 1196 } | |
| 1197 else { | |
| 1198 for(my $i = 0; $i < $#seqArray + 1; $i ++) { | |
| 1199 $newSeqNuc .= $seqArray[$i]; | |
| 1200 if (($i + 1) % $shift == 0) { $newSeqNuc .= ","; } | |
| 1201 } | |
| 1202 } | |
| 1203 | |
| 1204 open(FASTA, ">>", $componentFasta) or die "error open file $!:"; | |
| 1205 print FASTA ">$seqID $seqDesc\n"; | |
| 1206 foreach my $subSeqNuc (split /,/, $newSeqNuc) { | |
| 1207 print FASTA "$subSeqNuc\n"; | |
| 1208 } | |
| 1209 close(FASTA) or die "error close file $!:"; | |
| 1210 } | |
| 1211 } | |
| 1212 } | |
| 1213 if (-e $componentFasta) { $componentFastaHash{$component} = $componentFasta; } | |
| 1214 } | |
| 1215 return %componentFastaHash; | |
| 1216 } | |
| 1217 # remove back and front spaces | |
| 1218 sub trim { | |
| 1219 my ($string) = @_; | |
| 1220 $string =~ s/^\s+//; | |
| 1221 $string =~ s/\s+$//; | |
| 1222 return $string; | |
| 1223 } | |
| 1224 #------------------------------------------------------------------------------ | |
| 1225 # use trim in array | |
| 1226 sub trim_array { | |
| 1227 my (@array) = @_; | |
| 1228 foreach my $value (@array) { | |
| 1229 $value = trim($value); | |
| 1230 } | |
| 1231 return @array; | |
| 1232 } | |
| 1233 #------------------------------------------------------------------------------ | |
| 1234 # check if folder is empty | |
| 1235 sub empty_folder { | |
| 1236 my $dirname = shift; | |
| 1237 opendir(my $dholder, $dirname) or die "error not a directory"; | |
| 1238 my $isEmpty = scalar(grep { $_ ne "." && $_ ne ".." } readdir($dholder)); | |
| 1239 if ($isEmpty == 0) { return $isEmpty; } | |
| 1240 } | |
| 1241 #------------------------------------------------------------------------------ | |
| 1242 # number nucleotid and length | |
| 1243 sub number_nuc_length_seq { | |
| 1244 my ($fastaFile) = @_; | |
| 1245 my $ade = 0; | |
| 1246 my $thy = 0; | |
| 1247 my $gua = 0; | |
| 1248 my $cyt = 0; | |
| 1249 my $n = 0; | |
| 1250 my $length = 0; | |
| 1251 | |
| 1252 open (FASTA, "<", $fastaFile) or die "Could not open $!"; | |
| 1253 while (<FASTA>) { | |
| 1254 chomp; | |
| 1255 if ($_ !~ />/) { | |
| 1256 my @seq = split //, $_; | |
| 1257 | |
| 1258 for my $nuc (@seq) { | |
| 1259 $length +=1 ; | |
| 1260 if ($nuc =~ /a/i) {$ade+=1;} | |
| 1261 elsif ($nuc =~ /t/i) {$thy+=1;} | |
| 1262 elsif ($nuc =~ /g/i) {$gua+=1;} | |
| 1263 elsif ($nuc =~ /c/i) {$cyt+=1;} | |
| 1264 elsif ($nuc =~ /n/i) {$n+=1;} | |
| 1265 } | |
| 1266 } | |
| 1267 } | |
| 1268 close(FASTA) or die "Error close file :$!"; | |
| 1269 return ($ade, $thy, $gua, $cyt, $n, $length); | |
| 1270 | |
| 1271 } | |
| 1272 #------------------------------------------------------------------------------ | |
| 1273 # compute percentage of nucleotid | |
| 1274 sub nucleotid_percent { | |
| 1275 my($ade, $thy, $gua, $cyt, $n, $length) = @_; | |
| 1276 | |
| 1277 my $adePercent = $ade / $length * 100; | |
| 1278 my $thyPercent = $thy / $length * 100; | |
| 1279 my $guaPercent = $gua / $length * 100; | |
| 1280 my $cytPercent = $cyt / $length * 100; | |
| 1281 my $nPercent = $n / $length * 100; | |
| 1282 | |
| 1283 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent); | |
| 1284 | |
| 1285 } | |
| 1286 #------------------------------------------------------------------------------ | |
| 1287 # compute ATGC ratio | |
| 1288 sub atgc_ratio { | |
| 1289 my ($ade, $thy, $gua, $cyt) = @_; | |
| 1290 return (($ade + $thy) / ($gua + $cyt)); | |
| 1291 } | |
| 1292 #------------------------------------------------------------------------------ | |
| 1293 # variance | |
| 1294 sub shift_data_variance { | |
| 1295 my (@data) = @_; | |
| 1296 | |
| 1297 if ($#data + 1 < 2) { return 0.0; } | |
| 1298 | |
| 1299 my $K = $data[0]; | |
| 1300 my ($n, $Ex, $Ex2) = 0.0; | |
| 1301 | |
| 1302 for my $x (@data) { | |
| 1303 $n = $n + 1; | |
| 1304 $Ex += $x - $K; | |
| 1305 $Ex2 += ($x - $K) * ($x - $K); | |
| 1306 } | |
| 1307 | |
| 1308 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1) | |
| 1309 | |
| 1310 return $variance; | |
| 1311 | |
| 1312 } | |
| 1313 #------------------------------------------------------------------------------ | |
| 1314 # nucle score | |
| 1315 sub nucle_score { | |
| 1316 my ($variance, $gcPercent, $atgcRatio, $length) = @_; | |
| 1317 #return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length)); | |
| 1318 return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length)); | |
| 1319 } | |
| 1320 #------------------------------------------------------------------------------ | |
| 1321 sub log2 { | |
| 1322 my $n = shift; | |
| 1323 return (log($n) / log(2)); | |
| 1324 } | |
| 1325 #------------------------------------------------------------------------------ | |
| 1326 # compute GC pourcent | |
| 1327 sub gc_percent { | |
| 1328 my ($seq) = @_; | |
| 1329 | |
| 1330 my @charSeq = split(//, uc($seq)); | |
| 1331 my %hashFlank = (); | |
| 1332 | |
| 1333 foreach my $v (@charSeq) { | |
| 1334 $hashFlank{$v} += 1; | |
| 1335 } | |
| 1336 | |
| 1337 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;} | |
| 1338 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;} | |
| 1339 | |
| 1340 if(length($seq) == 0) { | |
| 1341 return 0; | |
| 1342 } | |
| 1343 else { | |
| 1344 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100; | |
| 1345 } | |
| 1346 | |
| 1347 } | |
| 1348 #------------------------------------------------------------------------------ | |
| 1349 # download file from ftp protocol | |
| 1350 sub download_file { | |
| 1351 my($servor, $file) = @_; | |
| 1352 | |
| 1353 #if($actualOS eq "MSWin32"){ | |
| 1354 my $ftp = Net::FTP->new($servor, Debug => 0) | |
| 1355 or die "Cannot connect to $servor"; | |
| 1356 | |
| 1357 $ftp->login("anonymous", "-anonymous@") | |
| 1358 or die "Cannot login ", $ftp->message; | |
| 1359 $ftp->binary; | |
| 1360 $ftp->get($file) or die "get failed ", $ftp->message; | |
| 1361 | |
| 1362 $ftp->quit; | |
| 1363 #} | |
| 1364 #else{ | |
| 1365 # system("wget $file"); | |
| 1366 #} | |
| 1367 } | |
| 1368 #------------------------------------------------------------------------------ | |
| 1369 # obtain file directory | |
| 1370 sub obtain_file { | |
| 1371 my($servor, $link) = @_; | |
| 1372 if ($link =~ /$servor(.*)/) { return ($1); } | |
| 1373 } | |
| 1374 #------------------------------------------------------------------------------ | |
| 1375 # download fastq file from ENA | |
| 1376 sub download_ena_fastq { | |
| 1377 my ($enaFtpServor, $sraId, $log) = @_; | |
| 1378 | |
| 1379 my $fastqDir = "/vol1/fastq/"; | |
| 1380 my $dir1 = substr $sraId, 0, 6; | |
| 1381 my $dir2 = "000"; | |
| 1382 my $digits = substr $sraId, 3; | |
| 1383 my $fastqRep = $sraId . "_folder"; | |
| 1384 | |
| 1385 if ($log) { | |
| 1386 print LOG "...Downloading fastq file from ENA\n"; | |
| 1387 } | |
| 1388 | |
| 1389 if (length $digits == 6) { | |
| 1390 $dir2 = $sraId; | |
| 1391 $fastqDir .= $dir1 . "/" . $dir2 . "/"; | |
| 1392 } | |
| 1393 elsif (length $digits > 6) { | |
| 1394 my $digitsNumber = 0; | |
| 1395 my @digitsList = split //, (substr $digits, 6); | |
| 1396 | |
| 1397 foreach my $char (@digitsList) { | |
| 1398 if (length $dir2 >= 1) { | |
| 1399 $dir2 = substr $dir2, 0, (length $dir2) - 1; | |
| 1400 $digitsNumber += 1; | |
| 1401 } | |
| 1402 } | |
| 1403 $dir2 .= substr $digits, -$digitsNumber; | |
| 1404 $fastqDir .= $dir1 . "/" . $dir2 . "/" . $sraId . "/"; | |
| 1405 } | |
| 1406 | |
| 1407 if ($log) { | |
| 1408 print LOG "...recreate database folder path for FASTQ downloading\n"; | |
| 1409 } | |
| 1410 | |
| 1411 my $ftp = Net::FTP->new($enaFtpServor, Debug => 0) | |
| 1412 or die "Cannot connect to $enaFtpServor"; | |
| 1413 | |
| 1414 $ftp->login("anonymous", "-anonymous@") | |
| 1415 or die "Cannot login ", $ftp->message; | |
| 1416 $ftp->binary; | |
| 1417 | |
| 1418 $ftp->cwd($fastqDir) | |
| 1419 or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message; | |
| 1420 | |
| 1421 my @fastqFiles = $ftp->ls("$sraId*"); | |
| 1422 | |
| 1423 if ($log) { | |
| 1424 print LOG "...Searching fastq files in path\n"; | |
| 1425 } | |
| 1426 | |
| 1427 if (!grep(/^$/, @fastqFiles)) { | |
| 1428 | |
| 1429 if (-d $fastqRep) { rmtree($fastqRep) } | |
| 1430 mkdir $fastqRep; | |
| 1431 | |
| 1432 foreach my $fastqFile (@fastqFiles) { | |
| 1433 #if($actualOS eq "MSWin32"){ | |
| 1434 $ftp->get($fastqFile) or die "get failed ", $ftp->message; | |
| 1435 #} | |
| 1436 #else{ | |
| 1437 # system("wget $fastqFile"); | |
| 1438 #} | |
| 1439 | |
| 1440 #my @baseAndExt = split /\./, $fastqFile; | |
| 1441 #my $unzipFastq = $baseAndExt[0] . ".fastq"; | |
| 1442 | |
| 1443 #gunzip $fastqFile => $unzipFastq or die "gunzip failed: $GunzipError\n"; | |
| 1444 move($fastqFile, $fastqRep) or die "move failed: $!"; # DC replaced $unzipFastq by $fastqFile | |
| 1445 } | |
| 1446 #unlink glob "*fastq.gz" or die "$!: for file *fastq.gz"; | |
| 1447 | |
| 1448 if ($log) { | |
| 1449 print LOG "...Finalizing download of FASTQ file\n"; | |
| 1450 } | |
| 1451 } | |
| 1452 | |
| 1453 if ($log) { | |
| 1454 print LOG "End of download\n"; | |
| 1455 } | |
| 1456 | |
| 1457 $ftp->quit; | |
| 1458 } | |
| 1459 #------------------------------------------------------------------------------ | |
| 1460 # download fastq file from ENA | |
| 1461 sub get_assembly_or_project { | |
| 1462 my ($file, $sequence, $ftpServor, $fldSep, $log) = @_; | |
| 1463 | |
| 1464 my $pattern; | |
| 1465 my $indexInfo; | |
| 1466 my %folderHash; | |
| 1467 | |
| 1468 # Repository for fna file | |
| 1469 my $repositoryFNA = "Assembly"; | |
| 1470 | |
| 1471 # Repository for genbank file | |
| 1472 my $repositoryGenbank = "GenBank"; | |
| 1473 | |
| 1474 # Reposotiry for report file | |
| 1475 my $repositoryReport = "Report"; | |
| 1476 | |
| 1477 # global repository | |
| 1478 my $repositorySequence = $sequence; | |
| 1479 | |
| 1480 | |
| 1481 if ($sequence =~ /^GC[AF]_(.*)/) { | |
| 1482 $indexInfo = 0; | |
| 1483 $pattern = $1; | |
| 1484 } | |
| 1485 elsif ($sequence =~ /^PRJ/) { | |
| 1486 $indexInfo = 1; | |
| 1487 $pattern = $sequence; | |
| 1488 } | |
| 1489 | |
| 1490 open(SUM, $file) or die "error open file $!:"; | |
| 1491 while(<SUM>) { | |
| 1492 chomp; | |
| 1493 if ($_ !~ /^#/) { | |
| 1494 my @infoList = split /\t/, $_; | |
| 1495 if ($infoList[$indexInfo] =~ /$pattern/) { | |
| 1496 my @gcfInfo = split(/\//, $infoList[19]); | |
| 1497 my $gcfName = pop(@gcfInfo); | |
| 1498 | |
| 1499 | |
| 1500 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz"; | |
| 1501 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz"; | |
| 1502 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt"; | |
| 1503 | |
| 1504 $dnaFile = obtain_file($ftpServor, $dnaFile); | |
| 1505 $genbankFile = obtain_file($ftpServor, $genbankFile); | |
| 1506 $assemblyReport = obtain_file($ftpServor, $assemblyReport); | |
| 1507 | |
| 1508 download_file($ftpServor, $dnaFile); | |
| 1509 download_file($ftpServor, $genbankFile); | |
| 1510 download_file($ftpServor, $assemblyReport); | |
| 1511 | |
| 1512 # download sequences and check number of "N" characters | |
| 1513 my $fileFasta = $gcfName."_genomic.fna.gz"; | |
| 1514 my $ucpFasta = $gcfName."_genomic.fna"; | |
| 1515 if (-e $fileFasta) { | |
| 1516 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n"; | |
| 1517 $folderHash{$ucpFasta} = $repositoryFNA; | |
| 1518 } | |
| 1519 | |
| 1520 # download genome report | |
| 1521 my $fileReport = $gcfName."_assembly_report.txt"; | |
| 1522 if (-e $fileReport) { | |
| 1523 $folderHash{$fileReport} = $repositoryReport; | |
| 1524 } | |
| 1525 | |
| 1526 # download genbank files | |
| 1527 my $fileGenbank = $gcfName."_genomic.gbff.gz"; | |
| 1528 my $ucpGenbank = $gcfName."_genomic.gbff"; | |
| 1529 if (-e $fileGenbank) { | |
| 1530 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n"; | |
| 1531 $folderHash{$ucpGenbank} = $repositoryGenbank; | |
| 1532 } | |
| 1533 | |
| 1534 } | |
| 1535 } | |
| 1536 } | |
| 1537 close(SUM) or die "error close file $!"; | |
| 1538 | |
| 1539 if (keys %folderHash) { | |
| 1540 if (-e $repositorySequence) {rmtree($repositorySequence);} | |
| 1541 | |
| 1542 if ($log) { | |
| 1543 print LOG "...Download files from GenBank or RefSeq \n"; | |
| 1544 } | |
| 1545 | |
| 1546 mkdir $repositorySequence; | |
| 1547 mkdir $repositoryFNA; | |
| 1548 mkdir $repositoryGenbank; | |
| 1549 mkdir $repositoryReport; | |
| 1550 | |
| 1551 for my $ucpFile (keys %folderHash) { | |
| 1552 move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:"; | |
| 1553 } | |
| 1554 move($repositoryFNA, $repositorySequence . $fldSep. $repositoryFNA) or die "error move file $!:"; | |
| 1555 move($repositoryGenbank, $repositorySequence . $fldSep. $repositoryGenbank) or die "error move file $!:"; | |
| 1556 move($repositoryReport, $repositorySequence . $fldSep. $repositoryReport) or die "error move file $!:"; | |
| 1557 unlink glob "*.gz" or die "for file *.gz $!:"; | |
| 1558 | |
| 1559 if ($log) { | |
| 1560 print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n"; | |
| 1561 } | |
| 1562 } | |
| 1563 | |
| 1564 } | |
| 1565 #------------------------------------------------------------------------------ | |
| 1566 sub download_assembly_or_project { | |
| 1567 my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_; | |
| 1568 | |
| 1569 my $assemblySummary; | |
| 1570 my @sequenceIdList = split /,/, $sequenceId; | |
| 1571 | |
| 1572 if ($directory =~ /refseq/) { | |
| 1573 $assemblySummary = "assembly_summary_refseq.txt"; | |
| 1574 } elsif ($directory =~ /genbank/) { | |
| 1575 $assemblySummary = "assembly_summary_genbank.txt"; | |
| 1576 } | |
| 1577 | |
| 1578 my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary; | |
| 1579 download_file($ftpServor, $assemblySummaryPath); | |
| 1580 | |
| 1581 foreach my $sequence (@sequenceIdList) { | |
| 1582 get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log); | |
| 1583 } | |
| 1584 | |
| 1585 unlink $assemblySummary or die "error remove file $!:"; | |
| 1586 } | |
| 1587 #------------------------------------------------------------------------------ | |
| 1588 # check if all required module are install | |
| 1589 sub isModuleInstalled { | |
| 1590 my $mod = shift; | |
| 1591 | |
| 1592 #eval("use $mod"); | |
| 1593 my $commandModule = `perldoc -l $mod`; | |
| 1594 | |
| 1595 if ($commandModule) { | |
| 1596 return(1); | |
| 1597 } else { | |
| 1598 return(0); | |
| 1599 } | |
| 1600 } | |
| 1601 #------------------------------------------------------------------------------ | |
| 1602 # download assembly summary | |
| 1603 sub download_summaries { | |
| 1604 my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_; | |
| 1605 | |
| 1606 my $assemblySummary = "assembly_summary.txt"; | |
| 1607 my $assemblySummaryLink; | |
| 1608 my $fileName; | |
| 1609 | |
| 1610 opendir my $workingDirectory, "." . $fldSep or die "error open dir $!:"; | |
| 1611 my @filesList = readdir $workingDirectory; | |
| 1612 closedir $workingDirectory; | |
| 1613 | |
| 1614 if ($getSummaries) { | |
| 1615 foreach my $summaryKingdom (split /,/, $getSummaries) { | |
| 1616 foreach my $file (@filesList) { | |
| 1617 if ($file =~ /assembly_summary.txt/i && $file =~ /$summaryKingdom/i && $file =~ /$database/) { | |
| 1618 unlink $file or die "error remove file $!:"; | |
| 1619 $assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt"; | |
| 1620 download_file($ftpServor, $assemblySummaryLink); | |
| 1621 $fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt"; | |
| 1622 rename $assemblySummary, $fileName; | |
| 1623 print "replace assembly_summary file\n"; | |
| 1624 } | |
| 1625 } | |
| 1626 } | |
| 1627 } | |
| 1628 | |
| 1629 foreach my $file (@filesList) { | |
| 1630 if ($file =~ /assembly_summary.txt/i && $file =~ /$kingdom/i && $file =~ /$database/) { | |
| 1631 return $file; | |
| 1632 } | |
| 1633 } | |
| 1634 | |
| 1635 $assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt"; | |
| 1636 $fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt"; | |
| 1637 download_file($ftpServor, $assemblySummaryLink); | |
| 1638 rename $assemblySummary, $fileName; | |
| 1639 | |
| 1640 return $fileName; | |
| 1641 } | |
| 1642 #--------------------- | |
| 1643 sub bioseqio { | |
| 1644 | |
| 1645 my ($keyword, $file) = @_; | |
| 1646 local $/ = "\n>"; # read by FASTA record | |
| 1647 | |
| 1648 my $count = 0; | |
| 1649 | |
| 1650 open FASTA, $file; | |
| 1651 while (<FASTA>) { | |
| 1652 chomp; | |
| 1653 my $seq = $_; | |
| 1654 #my ($id) = $seq =~ /^>*(\S+)/; # parse ID as first word in FASTA header | |
| 1655 if ($seq =~ /^>*.*$keyword/) { | |
| 1656 #$seq =~ s/^>*.+\n//; # remove FASTA header | |
| 1657 #$seq =~ s/\n//g; # remove endlines | |
| 1658 $count++; | |
| 1659 print "\nThe sequence number is: $count \n"; | |
| 1660 print ">$seq"; | |
| 1661 } | |
| 1662 } | |
| 1663 close FASTA; | |
| 1664 } |
