Mercurial > repos > dcouvin > getsequenceinfo
comparison getSequenceInfo.pl @ 0:19ae17458c14 draft default tip
Uploaded
author | dcouvin |
---|---|
date | Wed, 15 Sep 2021 21:35:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:19ae17458c14 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ################################################################################ | |
4 ## "Copyright 2019 Vincent Moco and David Couvin" | |
5 ## licence GPL-3.0-or-later | |
6 ## This program is free software: you can redistribute it and/or modify | |
7 ## it under the terms of the GNU General Public License as published by | |
8 ## the Free Software Foundation, either version 3 of the License, or | |
9 ## (at your option) any later version. | |
10 ## This program is distributed in the hope that it will be useful, | |
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 ## GNU General Public License for more details. | |
14 ## You should have received a copy of the GNU General Public License | |
15 ## along with this program. If not, see <https://www.gnu.org/licenses/>. | |
16 ################################################################################ | |
17 | |
18 use strict; | |
19 use warnings; | |
20 | |
21 my $version = "1.0.1"; # version | |
22 | |
23 #my $number = 50; | |
24 | |
25 # Date and time of the current day (Beginning) | |
26 #my ($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now(); | |
27 my $start = time(); | |
28 | |
29 print "##################################################################\n"; | |
30 print "## ---> Welcome to $0 (version $version)!\n"; | |
31 #print "## Start Date (yyyy-mm-dd, hh:min:sec): $start_year-$start_month-$start_day, $start_hour:$start_min:$start_sec\n"; | |
32 print "##################################################################\n\n"; | |
33 | |
34 #BioPerl, Date::Calc, File::Log, have been removed from the @modules | |
35 my @modules = qw( | |
36 Archive::Tar | |
37 Bio::SeqIO | |
38 Bio::Species | |
39 File::Copy | |
40 File::Path | |
41 Net::FTP | |
42 IO::Uncompress::Gunzip | |
43 LWP::Simple | |
44 POSIX | |
45 | |
46 ); | |
47 | |
48 foreach my $module (@modules) { | |
49 if (isModuleInstalled($module)) { | |
50 print "$module is.................installed!\n"; | |
51 } | |
52 else { | |
53 print "$module is not installed. Please install it and try again.\n"; | |
54 print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n"; | |
55 print "cpan -i -f $module\n"; | |
56 #system("cpan -i -f $module"); | |
57 exit 1; | |
58 } | |
59 } | |
60 print "\n"; | |
61 | |
62 use Archive::Tar; | |
63 use Bio::SeqIO; | |
64 use Bio::Species; | |
65 #use Date::Calc qw(:all); | |
66 use File::Copy qw(cp move); | |
67 use File::Path qw(rmtree); | |
68 use Net::FTP; | |
69 use IO::Uncompress::Gunzip qw(gunzip $GunzipError); | |
70 use LWP::Simple qw(get); | |
71 use POSIX qw(floor); | |
72 #use File::Log; | |
73 | |
74 #################################################################################################### | |
75 ## A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories. | |
76 ## | |
77 #################################################################################################### | |
78 | |
79 ### main program | |
80 ### parameters | |
81 my $directory = "genbank"; | |
82 | |
83 my $kingdom = ""; # kingdom of organism | |
84 | |
85 my $releaseDate = "0000-00-00"; # sequences are downloaded from this release date | |
86 | |
87 my $components; # components of the assembly | |
88 | |
89 my $species = ""; # species name | |
90 | |
91 my $getSummaries; # indicates if a new assembly summary is required | |
92 | |
93 my $assemblyLevel = "Complete Genome,Chromosome,Scaffold,Contig"; # assembly level of the genome | |
94 | |
95 my $quantity; # limit number of assemblies to download | |
96 | |
97 my $sequenceID; | |
98 | |
99 my $ftpServor = "ftp.ncbi.nlm.nih.gov"; | |
100 | |
101 my $enaFtpServor = "ftp.sra.ebi.ac.uk"; | |
102 | |
103 my $fldSep = "/"; # folder seperation change by OS | |
104 | |
105 my @availableKingdoms = ( | |
106 "archaea", | |
107 "bacteria", | |
108 "fungi", | |
109 "invertebrate", | |
110 "plant", | |
111 "protozoa", | |
112 "vertebrate_mammalian", | |
113 "vertebrate_other", | |
114 "viral" | |
115 ); # list of available kingdoms | |
116 | |
117 my $actualOS = "Unix"; # OS of the computer | |
118 | |
119 my $mainFolder; # folder in which the assemblies results are stored | |
120 | |
121 my $assemblyTaxid = ""; # taxid for assembly | |
122 | |
123 my $sraID; # SRA sequence ID | |
124 | |
125 my $assemblyPrjID; # assembly or prj ID | |
126 | |
127 my $log; # log | |
128 | |
129 my $path = ""; | |
130 | |
131 | |
132 if (@ARGV<1) { | |
133 help_user_simple($0); | |
134 exit 0; | |
135 } | |
136 | |
137 if ($ARGV[0] eq "-help" || $ARGV[0] eq "-h") { | |
138 help_user_advance($0); | |
139 exit 0; | |
140 } | |
141 | |
142 if ($ARGV[0] eq "-version" || $ARGV[0] eq "-v") { | |
143 program_version($0); | |
144 exit 0; | |
145 } | |
146 | |
147 ##requirements | |
148 for (my $i = 0; $i <= $#ARGV; $i++) { | |
149 if ($ARGV[$i]=~/-kingdom/i or $ARGV[$i]=~/-k/i) { | |
150 $kingdom = $ARGV[$i+1]; | |
151 } | |
152 elsif ($ARGV[$i]=~/-path/i or $ARGV[$i]=~/-pathSummary/i) { | |
153 $path = $ARGV[$i+1]; | |
154 } | |
155 elsif ($ARGV[$i]=~/-directory/i or $ARGV[$i]=~/-dir/i) { | |
156 $directory = $ARGV[$i+1]; | |
157 } | |
158 elsif ($ARGV[$i]=~/-date/i) { | |
159 $releaseDate = $ARGV[$i+1]; | |
160 } | |
161 elsif ($ARGV[$i]=~/-getSummaries/i or $ARGV[$i]=~/-get/i) { | |
162 $getSummaries = $ARGV[$i+1]; | |
163 } | |
164 elsif ($ARGV[$i]=~/-species/i or $ARGV[$i]=~/-s/i) { | |
165 $species = $ARGV[$i+1]; | |
166 } | |
167 elsif ($ARGV[$i]=~/-level/i or $ARGV[$i]=~/-le/i) { | |
168 $assemblyLevel = $ARGV[$i+1]; | |
169 } | |
170 elsif ($ARGV[$i]=~/-components/i or $ARGV[$i]=~/-c/i) { | |
171 $components = $ARGV[$i+1]; | |
172 } | |
173 elsif ($ARGV[$i]=~/-quantity/i or $ARGV[$i]=~/-q/i or $ARGV[$i]=~/-number/i or $ARGV[$i]=~/-n/i) { | |
174 $quantity = int($ARGV[$i+1]); | |
175 } | |
176 elsif ($ARGV[$i]=~/-ena/i) { | |
177 $sequenceID = $ARGV[$i+1]; | |
178 } | |
179 elsif ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { | |
180 $mainFolder = $ARGV[$i+1]; | |
181 } | |
182 elsif ($ARGV[$i]=~/-taxid/i) { | |
183 $assemblyTaxid = $ARGV[$i+1]; | |
184 } | |
185 elsif ($ARGV[$i]=~/-fastq/i) { | |
186 $sraID = $ARGV[$i+1]; | |
187 } | |
188 elsif ($ARGV[$i]=~/-assembly_or_project/i) { | |
189 $assemblyPrjID = $ARGV[$i+1]; | |
190 } | |
191 elsif ($ARGV[$i]=~/-log/i) { | |
192 $log = 1; | |
193 # $log = File::Log->new({ | |
194 # debug => 4, | |
195 # logFileName => 'myLogFile.log', | |
196 # logFileMode => '>', | |
197 # dateTimeStamp => 1, | |
198 # stderrRedirect => 1, | |
199 # defaultFile => 0, | |
200 # logFileDateTime => 1, | |
201 # appName => 'getSequenceInfo', | |
202 # PIDstamp => 0, | |
203 # storeExpText => 1, | |
204 # msgprepend => '', | |
205 # say => 1, | |
206 # }); | |
207 } | |
208 } | |
209 | |
210 #define folder separator and OS | |
211 if ($^O =~ /MSWin32/) { | |
212 $fldSep = "\\"; | |
213 $actualOS = "MSWin32"; | |
214 } | |
215 | |
216 #LOG file | |
217 if ($log) { open (LOG, "log.txt") or die " error open log.txt $!:"; } | |
218 | |
219 | |
220 print "Working ...\n"; | |
221 | |
222 if ($kingdom eq "viruses") { $kingdom = "viral"; } | |
223 | |
224 if (grep(/^$kingdom$/i, @availableKingdoms)) { | |
225 | |
226 my @patternParametersList; | |
227 | |
228 my @levelList = split /,/, $assemblyLevel; | |
229 | |
230 if ($species ne "") { | |
231 my @speciesList = split(/,/, $species); | |
232 | |
233 foreach my $actualSpecies (@speciesList) { | |
234 get_assembly_summary_species( $quantity, $releaseDate, $directory,$kingdom, $actualSpecies, | |
235 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, | |
236 $getSummaries, $components, $ftpServor); | |
237 } | |
238 } | |
239 elsif ($assemblyTaxid ne "") { | |
240 my @taxidList = split(/,/, $assemblyTaxid); | |
241 | |
242 foreach my $actualID (@taxidList) { | |
243 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, | |
244 \@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log, | |
245 $getSummaries, $components, $ftpServor); | |
246 } | |
247 } | |
248 else { | |
249 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, | |
250 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, | |
251 $getSummaries, $components, $ftpServor); | |
252 } | |
253 } | |
254 | |
255 | |
256 if ($sequenceID) { | |
257 my @sequenceIDList = split /,/, $sequenceID; | |
258 | |
259 foreach my $enaID (@sequenceIDList) { | |
260 sequence_ena($enaID, $log); | |
261 } | |
262 } | |
263 | |
264 if ($sraID) { | |
265 my @sraIDList = split /,/, $sraID; | |
266 | |
267 foreach my $sra (@sraIDList) { | |
268 download_ena_fastq($enaFtpServor, $sra, $log); | |
269 } | |
270 } | |
271 | |
272 if ($assemblyPrjID) { | |
273 download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log); | |
274 } | |
275 | |
276 #my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now(); | |
277 | |
278 #my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) = | |
279 # Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec, | |
280 # $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec); | |
281 | |
282 #print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n"; | |
283 print "Thank you for using $0!\n"; | |
284 #print "Execution time: $D_y years, $D_m months, $D_d days, $Dh:$Dm:$Ds (hours:minutes:seconds)\n"; | |
285 my $end = time(); | |
286 | |
287 my $total = $end - $start; | |
288 my $min = $total / 60; | |
289 my $hrs = $min / 60; | |
290 | |
291 print "Total time: $total seconds OR $min minutes OR $hrs hours ! \n"; | |
292 | |
293 ### subroutine | |
294 # display global help document | |
295 sub help_user_simple { | |
296 my $programme = shift @_; | |
297 print STDERR "Usage : perl $programme [options] \n"; | |
298 print "Type perl $programme -version or perl $programme -v to get the current version\n"; | |
299 print "Type perl $programme -help or perl $programme -h to get full help\n"; | |
300 } | |
301 #------------------------------------------------------------------------------ | |
302 # display full help document | |
303 sub help_user_advance { | |
304 print <<HEREDOC; | |
305 | |
306 Name: | |
307 $0 | |
308 | |
309 Synopsis: | |
310 A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories. | |
311 | |
312 Usage: | |
313 perl $0 [options] | |
314 examples: | |
315 perl $0 -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01 | |
316 perl $0 -k viruses -n 5 -date 2019-06-01 | |
317 perl $0 -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results | |
318 perl $0 -ena BN000065 | |
319 perl $0 -fastq ERR818002 | |
320 perl $0 -fastq ERR818002,ERR818004 | |
321 | |
322 Kingdoms: | |
323 archaea | |
324 bacteria | |
325 fungi | |
326 invertebrate | |
327 plant | |
328 protozoa | |
329 vertebrate_mammalian | |
330 vertebrate_other | |
331 viral | |
332 | |
333 Assembly levels: | |
334 "Complete Genome" | |
335 Chromosome | |
336 Scaffold | |
337 Contig | |
338 | |
339 General: | |
340 -help or -h displays this help | |
341 -version or -v displays the current version of the program | |
342 | |
343 Options ([XXX] represents the expected value): | |
344 -directory or -dir [XXX] allows to indicate the NCBI's nucleotide sequences repository (default: $directory) | |
345 -get or -getSummaries [XXX] allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...) | |
346 -k or -kingdom [XXX] allows to indicate kingdom of the organism (see the examples above) | |
347 -s or -species [XXX] allows to indicate the species (must be combined with -k option) | |
348 -taxid [XXX] allows to indicate a specific taxid (must be combined with -k option) | |
349 -assembly_or_project [XXX] allows to indicate a specific assembly accession or bioproject (must be combined with -k option) | |
350 -date [XXX] indicates the release date (with format yyyy-mm-dd) from which sequence information are available | |
351 -l or -level [XXX] allows to select a specific assembly level (e.g. "Complete Genome") | |
352 -o or -output [XXX] allows users to name the output result folder | |
353 -n or -number [XXX] allows to limit the total number of assemblies to be downloaded | |
354 -c or -components [XXX] allows to select specific components of the assembly (e.g. plasmid, chromosome, ...) | |
355 -ena [XXX] allows to download report and fasta file given a ENA sequence ID | |
356 -fastq [XXX] allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html) | |
357 -log allows to create a log file | |
358 HEREDOC | |
359 } | |
360 #------------------------------------------------------------------------------ | |
361 # display program version | |
362 sub program_version { | |
363 my $programme = shift @_; | |
364 print "\n $programme, version : $version\n"; | |
365 print "\n A perl script to get sequences informations\n"; | |
366 } | |
367 #------------------------------------------------------------------------------ | |
368 sub get_assembly_summary_species { | |
369 my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep, | |
370 $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_; | |
371 | |
372 # assembly_summary.txt file from NCBI FTP site | |
373 #my $assemblySummary = get_summaries($ftpServor, $kingdom, $getSummaries, $directory); | |
374 | |
375 my $assemblySummary = ""; | |
376 if($path){ | |
377 $assemblySummary = $path; | |
378 } | |
379 else{ | |
380 $assemblySummary = download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries); | |
381 } | |
382 | |
383 #lineage folder | |
384 # my $lineage_file = "/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"; | |
385 | |
386 # allow to check old summary download | |
387 my $oldKingdom = ""; | |
388 | |
389 # start of output file | |
390 if ($log) { | |
391 print LOG "...Downloading assembly_summary.txt...\n"; | |
392 } | |
393 | |
394 # if ($actualOS =~ /Unix/i) { | |
395 #initialiaze tar manipulation | |
396 # my $tar = Archive::Tar->new; | |
397 | |
398 #download taxdump folder | |
399 # download_file($ftpServor, $lineage_file); | |
400 # $tar->read("new_taxdump.tar.gz"); | |
401 # $tar->extract_file("rankedlineage.dmp"); | |
402 # } | |
403 | |
404 | |
405 #my $kingdomRep = $kingdom."_".$start_year."_".$start_month."_".$start_day; | |
406 #my $kingdomRep = $kingdom."_folder"; | |
407 my $kingdomRep = "folder"; | |
408 | |
409 if ($mainFolder) { $kingdomRep = $mainFolder; } | |
410 mkdir $kingdomRep unless -d $kingdomRep; | |
411 | |
412 # Repository for request | |
413 | |
414 #my $repositoryAssembly = "assembly_repository_".$assemblyTaxid; | |
415 my $repositoryAssembly = "result"; | |
416 mkdir $repositoryAssembly unless -d $repositoryAssembly; | |
417 | |
418 my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly; | |
419 if (-d $oldAssemblyRep) { rmtree($oldAssemblyRep) } | |
420 | |
421 # Repository for fna file | |
422 my $repositoryFNA = "Assembly"; | |
423 mkdir $repositoryFNA unless -e $repositoryFNA; | |
424 | |
425 # Repository for genbank file | |
426 my $repositoryGenbank = "GenBank"; | |
427 mkdir $repositoryGenbank unless -e $repositoryGenbank; | |
428 | |
429 # Reposotiry for report file | |
430 my $repositoryReport = "Report"; | |
431 mkdir $repositoryReport unless -e $repositoryReport ; | |
432 | |
433 # Repositories for required components | |
434 my %componentsRepHash; | |
435 | |
436 if ($components) { | |
437 for my $component (split /,/, $components) { | |
438 my $specificRep = $component."_folder"; | |
439 #my $specificRep = $component."_".$species."_".$kingdom."_".$start_year."_".$start_month."_".$start_day; | |
440 mkdir $specificRep unless -d $specificRep; | |
441 $componentsRepHash{$component} = $specificRep; | |
442 } | |
443 } | |
444 | |
445 if ($log) { | |
446 print LOG "...Create kingdom and components repository...\n"; | |
447 } | |
448 | |
449 my %assemblyReportList; | |
450 my @assemblyRepresentationList = qw/Full Partial/; | |
451 my @levelList = @{$levelList}; | |
452 my $checkCompleteGenome = grep(/complete genome/i, @levelList); | |
453 | |
454 if ($checkCompleteGenome > 0) {@assemblyRepresentationList = qw/Full/;} | |
455 | |
456 if (-e $assemblySummary) { | |
457 # Read file | |
458 open (SUM, $assemblySummary) or die " error open assembly_summary.txt $!:"; | |
459 while(<SUM>) { | |
460 chomp; | |
461 if ($_ !~ m/^#/) { | |
462 my @infoList = split /\t/, $_; | |
463 my $foundAssemRep = grep (/$infoList[13]/i, @assemblyRepresentationList); | |
464 my $checkLevel = grep(/$infoList[11]/i, @levelList); #replace 11 by 10 | |
465 | |
466 if ($foundAssemRep > 0 && $checkLevel > 0) { | |
467 my $indexInfo = 0; | |
468 my $searchPattern = ""; | |
469 my $regex = ""; | |
470 | |
471 if ($species !~ /^$/) { | |
472 $indexInfo = 7; | |
473 $searchPattern = $species; | |
474 $regex = qr/$searchPattern/i; | |
475 } | |
476 elsif ($assemblyTaxid !~ /^$/) { | |
477 $indexInfo = 5; | |
478 $searchPattern = $assemblyTaxid; | |
479 $regex = qr/^$searchPattern$/i; | |
480 } | |
481 | |
482 if (($infoList[$indexInfo] =~ $regex) or ($kingdom !~ /^$/ && $searchPattern =~ /^$/)) { | |
483 my @gcfInfo = split(/\//, $infoList[19]); | |
484 my $gcfName = pop(@gcfInfo); | |
485 my $realDate = $infoList[14]; | |
486 $realDate =~ s/\//-/g; | |
487 | |
488 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz"; | |
489 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz"; | |
490 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt"; | |
491 | |
492 if ($realDate gt $releaseDate) { | |
493 | |
494 $dnaFile = obtain_file($ftpServor, $dnaFile); | |
495 $genbankFile = obtain_file($ftpServor, $genbankFile); | |
496 $assemblyReport = obtain_file($ftpServor, $assemblyReport); | |
497 | |
498 download_file($ftpServor, $dnaFile); | |
499 download_file($ftpServor, $genbankFile); | |
500 download_file($ftpServor, $assemblyReport); | |
501 | |
502 if ($log) { | |
503 print LOG "...download FASTA and GenBank report files...\n"; | |
504 } | |
505 | |
506 # download sequences and check number of "N" characters | |
507 my $fileFasta = $gcfName."_genomic.fna.gz"; | |
508 my $ucpFasta = $gcfName."_genomic.fna"; | |
509 if (-e $fileFasta) { | |
510 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n"; | |
511 move($ucpFasta, $repositoryFNA) or die "move failed: $!"; | |
512 } | |
513 | |
514 if ($log) { | |
515 print LOG "...Unzip FASTA file named $fileFasta ...\n"; | |
516 } | |
517 | |
518 # download genome report | |
519 my $fileReport = $gcfName."_assembly_report.txt"; | |
520 if (-e $fileReport) { | |
521 my $fileInformations = $gcfName."_informations.xls"; | |
522 move($fileReport, $repositoryReport) or die "move failed: $!"; | |
523 } | |
524 | |
525 # download genbank files | |
526 my $fileGenbank = $gcfName."_genomic.gbff.gz"; | |
527 my $ucpGenbank = $gcfName."_genomic.gbff"; | |
528 if (-e $fileGenbank) { | |
529 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n"; | |
530 move($ucpGenbank, $repositoryGenbank) or die "move failed: $!"; | |
531 } | |
532 | |
533 if ($log) { | |
534 print LOG "...Unzip GenBank file $fileGenbank ...\n"; | |
535 } | |
536 | |
537 # association report and fasta | |
538 my $fileFastaGenbank = $ucpFasta . "," . $ucpGenbank; | |
539 $assemblyReportList{$fileReport} = $fileFastaGenbank; | |
540 | |
541 if ($quantity) { $quantity -= 1; } | |
542 | |
543 } | |
544 } | |
545 } | |
546 } | |
547 if (defined $quantity && $quantity == 0) { | |
548 $quantity = undef; | |
549 last; | |
550 } | |
551 } | |
552 close(SUM) or die "close file error : $!"; | |
553 | |
554 if (!keys %assemblyReportList) { | |
555 print "##################################################################\n"; | |
556 print "No results were found for the following query:\n"; | |
557 print "perl $0 @ARGV\n"; | |
558 print "##################################################################\n\n"; | |
559 | |
560 if ($actualOS =~ /unix/i) { unlink glob "*.dmp *.gz" or die "for file *.dmp *.gz $!:"; } | |
561 | |
562 if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; } | |
563 rmdir $repositoryAssembly or die "failed to remove directory $!:"; | |
564 rmdir $repositoryFNA or die "failed to remove directory $!:"; | |
565 rmdir $repositoryGenbank or die "failed to remove directory $!:"; | |
566 rmdir $repositoryReport or die "failed to remove directory $!:"; | |
567 | |
568 if ($components) { | |
569 for my $componentRep (values %componentsRepHash) { | |
570 rmdir $componentRep or die "failed to remove directory $!:"; | |
571 } | |
572 } | |
573 if ($log) { | |
574 print LOG "...No results were found for the following query: "; | |
575 print LOG "perl $0 @ARGV \n"; | |
576 } | |
577 | |
578 } | |
579 else { | |
580 | |
581 if ($log) { | |
582 print LOG "...Results were found for the following query: "; | |
583 print LOG "perl $0 @ARGV \n"; | |
584 } | |
585 # write summary files | |
586 my %componentsSumHash; | |
587 my @keysList = keys %assemblyReportList; | |
588 my $summary = "summary.xls"; | |
589 my $htmlSummary = "summary.html"; | |
590 | |
591 if ($components) { | |
592 for my $component (split /,/, $components) { | |
593 my $specificSummary = $component. "_summary.xls"; | |
594 $componentsSumHash{$component} = $specificSummary; | |
595 } | |
596 } | |
597 | |
598 my $fileReport = "." . $fldSep. $repositoryReport . $fldSep . $keysList[0]; | |
599 my @header = (); | |
600 | |
601 open(FILE, $fileReport) or die "error open file : $!"; | |
602 while(<FILE>) { | |
603 chomp; | |
604 if($_ =~ /:/){ | |
605 $_ =~ s/^#*//; | |
606 my @ligne = split /:/, $_; | |
607 push(@header, $ligne[0]); | |
608 } | |
609 } | |
610 close(FILE) or die "error close file : $!"; | |
611 | |
612 open(HEAD, ">", $summary) or die " error open file : $!"; | |
613 foreach(@header) { | |
614 print HEAD $_ . "\t"; | |
615 } | |
616 | |
617 print HEAD "Pubmed\tNucleScore\tClassification\t"; | |
618 print HEAD "Country\tHost\tIsolation source\tA percent\t"; | |
619 print HEAD "T percent\tG percent\tC percent\tN percent\tGC percent\t"; | |
620 print HEAD "ATGC ratio\tLength\tShape\n"; | |
621 close(HEAD) or die "error close file : $!"; | |
622 | |
623 if ($components) { | |
624 foreach my $componentSummary (values %componentsSumHash) { | |
625 open(SUM, ">>", $componentSummary) or die "error open file : $!"; | |
626 print SUM "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t"; | |
627 print SUM "GC percent\tA percent\tT percent\tG percent\tC percent\n"; | |
628 close(SUM) or die "error close file : $!"; | |
629 } | |
630 } | |
631 | |
632 for my $file (@keysList) { | |
633 my $reportFile = $repositoryReport . $fldSep . $file; | |
634 my @fastaGenbank = split /,/, $assemblyReportList{$file}; | |
635 my $extFasta = $fastaGenbank[0]; | |
636 my $extGenbank = $fastaGenbank[1]; | |
637 my $fnaFile = $repositoryFNA . $fldSep . $extFasta; | |
638 my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank; | |
639 | |
640 write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly, | |
641 \%componentsSumHash, $kingdom, $actualOS, \@header, $log); | |
642 | |
643 if ($log) { | |
644 print LOG "...Call write_assembly subroutine...\n"; | |
645 } | |
646 } | |
647 | |
648 write_html_summary($summary); | |
649 | |
650 if ($log) { | |
651 print LOG "...Call write_html_summary subroutine...\n"; | |
652 } | |
653 | |
654 if ($components) { | |
655 my @componentList = keys %componentsSumHash; | |
656 my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList); | |
657 | |
658 if (keys %componentFastaHash && $log) { $log->msg(1,"call create_component_sequence_file subroutine");} | |
659 | |
660 foreach my $component (keys %componentFastaHash) { | |
661 move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; | |
662 } | |
663 } | |
664 | |
665 move($summary, $repositoryAssembly) or die "move failed: $!"; | |
666 move($htmlSummary, $repositoryAssembly) or die "move failed: $!"; | |
667 move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!"; | |
668 move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!"; | |
669 move($repositoryReport , $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!"; | |
670 if ($log) { | |
671 print LOG "...move fasta, genbank and report to query folder \n"; | |
672 } | |
673 | |
674 if ($components) { | |
675 for my $component (keys %componentsSumHash) { | |
676 move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; | |
677 move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!" | |
678 } | |
679 if ($log) { | |
680 print LOG "...move component files to folders\n"; | |
681 } | |
682 } | |
683 | |
684 move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!"; | |
685 | |
686 if ($log) { | |
687 print LOG "...move query folder to main folder\n"; | |
688 } | |
689 | |
690 # if ($actualOS =~ /unix/i) { unlink glob "*.dmp" or die "for file *.dmp $!:"; } | |
691 unlink glob "*.gz sequence.txt" or die "$!: for file *.gz sequence.txt"; | |
692 } | |
693 } | |
694 } | |
695 #write general assembly file | |
696 sub write_assembly { | |
697 my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly, | |
698 $componentsSumHashRef, $kingdom, $actualOS, $headerRef, $log) = @_; | |
699 | |
700 my %componentsSumHash = %{$componentsSumHashRef}; | |
701 my @header = @{$headerRef}; | |
702 my %hashInformations = (); | |
703 my $seq = ""; | |
704 my $genomeName = ""; | |
705 my $country = "na"; | |
706 my $GCpercent = -1; | |
707 my $taxId = "na"; | |
708 my $assemblyLine; | |
709 my $pubmedId = "na"; | |
710 my $host = "na"; | |
711 my $isoSource = "na"; | |
712 # my $species = "na"; | |
713 # my $genus = "na"; | |
714 # my $family = "na"; | |
715 # my $order = "na"; | |
716 # my $class = "na"; | |
717 # my $phylum = "na"; | |
718 my $shape = "na"; | |
719 my $geneNumber = "na"; | |
720 | |
721 open(REP, "<", $reportFile) or die "error open file $!:"; | |
722 while (<REP>) { | |
723 chomp; | |
724 $_ =~ s/^#*//; | |
725 if ($_ =~ /assembled-molecule/i) { $assemblyLine = $_; } | |
726 if ($_ =~ /:/) { | |
727 my @line = split /:/, $_; | |
728 if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); } | |
729 } | |
730 } | |
731 close(REP) or die "error close file $!:"; | |
732 | |
733 | |
734 open(SUM, ">>", $summary) or die "error open file $!:"; | |
735 foreach my $k(@header) { | |
736 if (grep $_ eq $k, keys %hashInformations) { | |
737 | |
738 my $information = $hashInformations{$k}; | |
739 | |
740 if ($k =~ /Assembly name/i) { $genomeName = $information; } | |
741 | |
742 if ($information =~ /^\s*$/) { | |
743 print SUM "na\t"; | |
744 } else { | |
745 print SUM $information . "\t"; | |
746 } | |
747 } else { | |
748 print SUM "na\t"; | |
749 } | |
750 } | |
751 close(SUM) or die "error close file $!:"; | |
752 | |
753 open(FNA, "<", $fnaFile) or die "Could not open $!:"; | |
754 while (<FNA>) { | |
755 chomp; | |
756 if ($_ !~ /^>/) { $seq .= $_; } | |
757 } | |
758 close(FNA) or die "error close file :$!"; | |
759 | |
760 foreach my $summaryKey (keys %hashInformations) { | |
761 if ($summaryKey =~ /taxid/i) { | |
762 $taxId = $hashInformations{$summaryKey}; | |
763 } | |
764 } | |
765 | |
766 my $classification = get_taxonomic_rank_genbank($genbankFile); | |
767 | |
768 if ($log) { | |
769 print LOG "...get taxonomic rank from genbank file\n"; | |
770 } | |
771 | |
772 $GCpercent = gc_percent($seq); | |
773 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile); | |
774 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); | |
775 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt); | |
776 | |
777 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent); | |
778 | |
779 my $variance = shift_data_variance(@percentList); | |
780 my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length); | |
781 | |
782 if ($log) { | |
783 print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n"; | |
784 } | |
785 | |
786 open(GBFF, "<", $genbankFile) or die "Error open file $!:"; | |
787 while(<GBFF>) { | |
788 chomp; | |
789 if ($_ =~ /\/country="(.*)"/) { $country = trim($1); } | |
790 if ($_ =~ /PUBMED(.*)/) { $pubmedId = trim($1); } | |
791 if ($_ =~ /\/host="(.*)"/) { $host = trim($1); } | |
792 if ($_ =~ /\/isolation_source="(.*)"/) { $isoSource = trim($1); } | |
793 if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); } | |
794 if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); } | |
795 } | |
796 close(GBFF) or die "error close file $!:"; | |
797 | |
798 | |
799 open(SUM, ">>", $summary) or die "error open file $!:"; | |
800 print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ; | |
801 print SUM $country . "\t" . $host . "\t" . $isoSource . "\t" . $aPercent . "\t" ; | |
802 print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent ."\t" . $nPercent . "\t"; | |
803 print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n"; | |
804 close(SUM) or die "error close file: $!"; | |
805 | |
806 if (%componentsSumHash) { | |
807 write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log); | |
808 } | |
809 } | |
810 #------------------------------------------------------------------------------ | |
811 # get assembly component | |
812 sub write_assembly_component { | |
813 my($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_; | |
814 | |
815 my %componentsSumHash = %{$componentsSumHashRef}; | |
816 my $status = "na"; | |
817 my $level = "na"; | |
818 my $gcpercent; | |
819 my $tmp_fasta_file = "sequence.txt"; | |
820 my @desc = (); | |
821 | |
822 # check each sequence from (multi-)fasta file | |
823 my ($seq, $inputfile); | |
824 | |
825 if ($log) { | |
826 print LOG "...search specific components\n"; | |
827 } | |
828 | |
829 # extract sequence details | |
830 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$fnaFile); | |
831 | |
832 while ($seq = $seqIO->next_seq()) { | |
833 my $seqID = $seq->id; # ID of sequence | |
834 my $seqDesc = $seq->desc; # Description of sequence | |
835 my $globalSeq = $seq->seq; | |
836 my $seqLength = $seq->length(); | |
837 | |
838 open(TSEQ, ">", $tmp_fasta_file) or die "Error open file: $!"; | |
839 print TSEQ $globalSeq; | |
840 close(TSEQ); | |
841 | |
842 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file); | |
843 | |
844 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); | |
845 | |
846 $gcpercent = gc_percent($globalSeq); | |
847 | |
848 @desc = split /,/, $seqDesc; | |
849 | |
850 if ($desc[1]) { $level = $desc[1]; } | |
851 | |
852 foreach my $component (keys %componentsSumHash) { | |
853 if ($desc[0] =~ /$component/) { | |
854 $status = $component; | |
855 my $info = $seqID . "\t" . $genomeName ."\t" . $seqDesc . "\t" . $seqLength . "\t" . $status . "\t" . $level ."\t"; | |
856 $info.= $gcpercent."\t". $aPercent ."\t". $tPercent ."\t". $gPercent ."\t". $cPercent . "\n"; | |
857 add_to_file($componentsSumHash{$component}, $info); | |
858 if ($log) { | |
859 print LOG "...found component $component \n"; | |
860 } | |
861 } | |
862 } | |
863 } | |
864 } | |
865 #------------------------------------------------------------------------------ | |
866 # download fasta sequence and report on ENA with assembly ID | |
867 sub get_fasta_and_report_sequence_ena_assembly { | |
868 my($sequenceID) = @_; | |
869 my $tmp_file = "fichier.txt"; | |
870 my @id_list = (); | |
871 my $id_chain = ""; | |
872 my $fasta_file = ""; | |
873 my $report_file = ""; | |
874 my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml"; | |
875 my $output = get($url); | |
876 | |
877 open(TMP, ">", $tmp_file) or die("could not open $!"); | |
878 print TMP $output; | |
879 close(TMP) or die("could not close $!"); | |
880 | |
881 open(TMP, "<", $tmp_file) or die("could not open $!"); | |
882 while(<TMP>){ | |
883 if($_ =~ /<CHROMOSOME accession="(.*)">/){ | |
884 push(@id_list, $1) | |
885 } | |
886 } | |
887 close(TMP) or die("could not close $!"); | |
888 | |
889 $id_chain = join(",", @id_list); | |
890 $url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta"; | |
891 $output = get($url); | |
892 $fasta_file = $sequenceID . ".fasta"; | |
893 open(FILE, ">", $fasta_file) or die("could not open $!"); | |
894 print FILE $output; | |
895 close(FILE) or die("could not close $!"); | |
896 | |
897 | |
898 $report_file = $sequenceID . "_report.txt"; | |
899 for my $id (@id_list) { | |
900 $url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true"; | |
901 $output = get($url); | |
902 open(FILE, ">>", $report_file) or die("could not open $!"); | |
903 print FILE $output; | |
904 print FILE "##########################################################################\n\n"; | |
905 close(FILE) or die("could not close $!"); | |
906 } | |
907 | |
908 unlink "fichier.txt" or die "error delete file :$!"; | |
909 | |
910 return ($fasta_file, $report_file); | |
911 } | |
912 #------------------------------------------------------------------------------ | |
913 # download ENA | |
914 sub sequence_ena { | |
915 my($sequenceID, $log) = @_; | |
916 #my $assemblyRep = $sequenceID . "_folder"; | |
917 my $fastaFile; | |
918 my $reportFile; | |
919 | |
920 #if(-d $assemblyRep) { rmtree($assemblyRep); } | |
921 #mkdir $assemblyRep; | |
922 | |
923 if ($log) { | |
924 print LOG "...ENA sequence downloading ...\n"; | |
925 } | |
926 #if ($log) {$log->msg(1, "Creation of repository: $assemblyRep\n");} | |
927 | |
928 if($sequenceID =~ /^GCA_.*/){ | |
929 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_assembly($sequenceID); | |
930 } | |
931 else { | |
932 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_other($sequenceID); | |
933 } | |
934 | |
935 if ($log) { | |
936 print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n"; | |
937 } | |
938 | |
939 #move($fastaFile, $assemblyRep) or die "move failed: $!"; | |
940 #move($reportFile, $assemblyRep) or die "move failed: $!"; | |
941 | |
942 if ($log) { | |
943 print LOG "...Move fasta and report files to folder\n"; | |
944 } | |
945 } | |
946 #------------------------------------------------------------------------------ | |
947 # download fasta sequence and report on ENA with ENA ID | |
948 sub get_fasta_and_report_sequence_ena_other { | |
949 my($sequenceID) = @_; | |
950 my $fasta_file = ""; | |
951 my $report_file = ""; | |
952 my $url; | |
953 my $output; | |
954 | |
955 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta"; | |
956 #if($actualOS eq "MSWin32"){ | |
957 $output = get($url); | |
958 $fasta_file = $sequenceID . ".fasta"; | |
959 open(FILE, ">", $fasta_file) or die("could not open $!"); | |
960 print FILE $output; | |
961 close(FILE) or die "could not close $!"; | |
962 #} | |
963 #else{ | |
964 # system("wget $url"); | |
965 #} | |
966 | |
967 $output = ""; | |
968 | |
969 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true"; | |
970 #if($actualOS eq "MSWin32"){ | |
971 $output = get($url); # replace by wget???? | |
972 $report_file = $sequenceID . "_report.txt"; | |
973 open(FILE, ">>", $report_file) or die "could not open: $!"; | |
974 print FILE $output; | |
975 close(FILE) or die "could not close $!"; | |
976 #} | |
977 #else{ | |
978 # system("wget $url"); | |
979 #} | |
980 | |
981 return ($fasta_file, $report_file); | |
982 } | |
983 #------------------------------------------------------------------------------ | |
984 # add information to file | |
985 sub add_to_file { | |
986 my ($file, $info) = @_; | |
987 open(FILE, ">>", $file) or die ("Could not open $!"); | |
988 print FILE $info; | |
989 close(FILE); | |
990 } | |
991 #------------------------------------------------------------------------------ | |
992 # return taxonomic rank of species by tax id | |
993 sub get_taxonomic_rank { | |
994 my($tax_id, $taxonomic_file) = @_; | |
995 my $species = "na"; | |
996 my $genus = "na"; | |
997 my $family = "na"; | |
998 my $order = "na"; | |
999 my $class = "na"; | |
1000 my $phylum = "na"; | |
1001 | |
1002 # my ($species,$genus,$family,$order,$class,$phylum); | |
1003 my @tmp_array = ($species, $genus, $family, $order, $class, $phylum); | |
1004 | |
1005 open(TFILE, "<", $taxonomic_file) or | |
1006 die("Could not open $taxonomic_file: $!"); | |
1007 | |
1008 while(<TFILE>) { | |
1009 chomp; | |
1010 my @tax_info = split(/\|/, $_); | |
1011 | |
1012 if ($tax_info[0] == $tax_id) { | |
1013 @tax_info = trim_array(@tax_info); | |
1014 | |
1015 $tmp_array[0] = $tax_info[1]; | |
1016 splice(@tax_info, 0, 3); | |
1017 | |
1018 for(my $i = 1; $i < $#tmp_array + 1; $i++) { | |
1019 if (length($tax_info[$i-1]) > 0) { $tmp_array[$i] = $tax_info[$i-1]; } | |
1020 } | |
1021 close(TFILE) or die "error close $taxonomic_file $!:"; | |
1022 return @tmp_array; | |
1023 } | |
1024 } | |
1025 close(TFILE) or die "error close $taxonomic_file $!:"; | |
1026 } | |
1027 #------------------------------------------------------------------------------ | |
1028 # write html summary file | |
1029 sub write_html_summary { | |
1030 my($summary) = @_; | |
1031 my $htmlFile = "summary.html"; | |
1032 my $header = ""; | |
1033 my @fileToRead = (); | |
1034 | |
1035 open(HTML, ">", $htmlFile) or die "error open HTML summary $!"; | |
1036 print HTML "<!DOCTYPE html>\n"; | |
1037 print HTML "<html>\n"; | |
1038 print HTML " <head>\n"; | |
1039 print HTML " <title>Assembly summary</title>\n"; | |
1040 print HTML " </head>\n"; | |
1041 print HTML " <body>\n"; | |
1042 print HTML " <h2>Assembly Summary</h2>\n"; | |
1043 close(HTML) or die "error close HTML summary $!"; | |
1044 | |
1045 open(SUM, "<", $summary) or die "error open tsv summary $!"; | |
1046 @fileToRead = <SUM>; | |
1047 close(SUM) or die "error close tsv summary $!"; | |
1048 | |
1049 $header = splice(@fileToRead, 0, 1); | |
1050 | |
1051 for my $line (@fileToRead) { | |
1052 write_html_table($line, $htmlFile, $header); | |
1053 } | |
1054 | |
1055 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; | |
1056 print HTML " </body>\n"; | |
1057 print HTML "</html>\n"; | |
1058 close(HTML) or die "error close HTML summary $!"; | |
1059 } | |
1060 #------------------------------------------------------------------------------ | |
1061 # write html table for summary | |
1062 sub write_html_table { | |
1063 my ($line, $htmlFile, $header) = @_; | |
1064 | |
1065 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; | |
1066 print HTML " <table border=\"1\" style=\"margin-bottom: 20px;\">\n"; | |
1067 close(HTML) or die "error close HTML summary $!"; | |
1068 add_table_content($line, $htmlFile, $header); | |
1069 } | |
1070 #------------------------------------------------------------------------------ | |
1071 # add information to table | |
1072 sub add_table_content { | |
1073 my ($line, $htmlFile, $headers) = @_; | |
1074 | |
1075 my @assemblyHeader = split(/\t/, $headers); | |
1076 my @assemblyInfo = split(/\t/, $line); | |
1077 my %hashHeaderInfo; | |
1078 my $nbOfCell = 7; | |
1079 my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell); | |
1080 my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell; | |
1081 | |
1082 | |
1083 for (my $i = 0; $i < $#assemblyHeader + 1; $i++) { | |
1084 $hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i]; | |
1085 } | |
1086 | |
1087 my @keysHeaderInfo = keys %hashHeaderInfo; | |
1088 my $cellIndex = 0; | |
1089 | |
1090 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; | |
1091 for (my $turn = 0; $turn < $fullLine; $turn++) { | |
1092 | |
1093 print HTML " <tr>\n"; | |
1094 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) { | |
1095 print HTML " <th>$header</th>\n"; | |
1096 } | |
1097 print HTML " </tr>\n"; | |
1098 | |
1099 print HTML " <tr>\n"; | |
1100 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) { | |
1101 if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") { | |
1102 print HTML " <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=". | |
1103 "$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>"; | |
1104 } | |
1105 else { | |
1106 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n"; | |
1107 } | |
1108 } | |
1109 print HTML " </tr>\n"; | |
1110 | |
1111 $cellIndex += $nbOfCell; | |
1112 } | |
1113 | |
1114 print HTML " <tr>\n"; | |
1115 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) { | |
1116 print HTML " <th>$header</th>\n"; | |
1117 } | |
1118 print HTML " <tr>\n"; | |
1119 | |
1120 print HTML " <tr>\n"; | |
1121 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) { | |
1122 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n"; | |
1123 } | |
1124 print HTML " <tr>\n"; | |
1125 | |
1126 print HTML " </table>\n"; | |
1127 close(HTML) or die "error close HTML summary $!"; | |
1128 } | |
1129 #------------------------------------------------------------------------------ | |
1130 #getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file) | |
1131 sub get_taxonomic_rank_genbank { | |
1132 my ($genbank) = @_; | |
1133 | |
1134 my $seqio_object = Bio::SeqIO->new(-file => $genbank); | |
1135 my $seq_object = $seqio_object->next_seq; | |
1136 | |
1137 # legible and long | |
1138 my $species_object = $seq_object->species; | |
1139 my $species_string = $species_object->node_name; | |
1140 | |
1141 # get all taxa from the ORGANISM section in an array | |
1142 my @classification = $seq_object->species->classification; | |
1143 # my $arraySize = @classification; | |
1144 | |
1145 # print "@classification\n"; | |
1146 | |
1147 # if($arraySize == 7){ | |
1148 # ($species,$genus,$family,$order,$class,$phylum,$kingdomGB) = @classification; | |
1149 # } | |
1150 # elsif($arraySize == 4){ | |
1151 # ($species,$class,$phylum,$kingdomGB) = @classification; | |
1152 # } | |
1153 | |
1154 my $classification = join(",", @classification); | |
1155 | |
1156 return ($classification); | |
1157 } | |
1158 #------------------------------------------------------------------------------ | |
1159 #add all sequences components to file | |
1160 sub create_component_sequence_file { | |
1161 my ($fldSep, $repository, $listComponentRef) = @_; | |
1162 | |
1163 my @listFnaFile; | |
1164 my @listComponent = @{$listComponentRef}; | |
1165 | |
1166 opendir(my $dh, $repository) || die "Can't opendir $repository: $!"; | |
1167 @listFnaFile = grep{/fna$/} readdir($dh); | |
1168 closedir $dh; | |
1169 | |
1170 my %componentFastaHash; | |
1171 | |
1172 foreach my $component (@listComponent) { | |
1173 | |
1174 my $componentFasta = $component.".fasta"; | |
1175 | |
1176 foreach my $fnaFile (@listFnaFile) { | |
1177 | |
1178 # my $actualFile = $repository . $fldSep . $fnaFile; | |
1179 | |
1180 my $seq; | |
1181 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$repository . $fldSep . $fnaFile); | |
1182 | |
1183 while ($seq = $seqIO->next_seq()) { | |
1184 | |
1185 my $seqDesc = $seq->desc; | |
1186 | |
1187 if ($seqDesc =~ /$component/) { | |
1188 my $seqID = $seq->id; | |
1189 my $seqNuc = $seq->seq; | |
1190 my $shift = 60; | |
1191 my @seqArray = split //, $seqNuc; | |
1192 my $newSeqNuc = ""; | |
1193 | |
1194 if (length $seqNuc <= $shift) { | |
1195 $newSeqNuc = $seqNuc; | |
1196 } | |
1197 else { | |
1198 for(my $i = 0; $i < $#seqArray + 1; $i ++) { | |
1199 $newSeqNuc .= $seqArray[$i]; | |
1200 if (($i + 1) % $shift == 0) { $newSeqNuc .= ","; } | |
1201 } | |
1202 } | |
1203 | |
1204 open(FASTA, ">>", $componentFasta) or die "error open file $!:"; | |
1205 print FASTA ">$seqID $seqDesc\n"; | |
1206 foreach my $subSeqNuc (split /,/, $newSeqNuc) { | |
1207 print FASTA "$subSeqNuc\n"; | |
1208 } | |
1209 close(FASTA) or die "error close file $!:"; | |
1210 } | |
1211 } | |
1212 } | |
1213 if (-e $componentFasta) { $componentFastaHash{$component} = $componentFasta; } | |
1214 } | |
1215 return %componentFastaHash; | |
1216 } | |
1217 # remove back and front spaces | |
1218 sub trim { | |
1219 my ($string) = @_; | |
1220 $string =~ s/^\s+//; | |
1221 $string =~ s/\s+$//; | |
1222 return $string; | |
1223 } | |
1224 #------------------------------------------------------------------------------ | |
1225 # use trim in array | |
1226 sub trim_array { | |
1227 my (@array) = @_; | |
1228 foreach my $value (@array) { | |
1229 $value = trim($value); | |
1230 } | |
1231 return @array; | |
1232 } | |
1233 #------------------------------------------------------------------------------ | |
1234 # check if folder is empty | |
1235 sub empty_folder { | |
1236 my $dirname = shift; | |
1237 opendir(my $dholder, $dirname) or die "error not a directory"; | |
1238 my $isEmpty = scalar(grep { $_ ne "." && $_ ne ".." } readdir($dholder)); | |
1239 if ($isEmpty == 0) { return $isEmpty; } | |
1240 } | |
1241 #------------------------------------------------------------------------------ | |
1242 # number nucleotid and length | |
1243 sub number_nuc_length_seq { | |
1244 my ($fastaFile) = @_; | |
1245 my $ade = 0; | |
1246 my $thy = 0; | |
1247 my $gua = 0; | |
1248 my $cyt = 0; | |
1249 my $n = 0; | |
1250 my $length = 0; | |
1251 | |
1252 open (FASTA, "<", $fastaFile) or die "Could not open $!"; | |
1253 while (<FASTA>) { | |
1254 chomp; | |
1255 if ($_ !~ />/) { | |
1256 my @seq = split //, $_; | |
1257 | |
1258 for my $nuc (@seq) { | |
1259 $length +=1 ; | |
1260 if ($nuc =~ /a/i) {$ade+=1;} | |
1261 elsif ($nuc =~ /t/i) {$thy+=1;} | |
1262 elsif ($nuc =~ /g/i) {$gua+=1;} | |
1263 elsif ($nuc =~ /c/i) {$cyt+=1;} | |
1264 elsif ($nuc =~ /n/i) {$n+=1;} | |
1265 } | |
1266 } | |
1267 } | |
1268 close(FASTA) or die "Error close file :$!"; | |
1269 return ($ade, $thy, $gua, $cyt, $n, $length); | |
1270 | |
1271 } | |
1272 #------------------------------------------------------------------------------ | |
1273 # compute percentage of nucleotid | |
1274 sub nucleotid_percent { | |
1275 my($ade, $thy, $gua, $cyt, $n, $length) = @_; | |
1276 | |
1277 my $adePercent = $ade / $length * 100; | |
1278 my $thyPercent = $thy / $length * 100; | |
1279 my $guaPercent = $gua / $length * 100; | |
1280 my $cytPercent = $cyt / $length * 100; | |
1281 my $nPercent = $n / $length * 100; | |
1282 | |
1283 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent); | |
1284 | |
1285 } | |
1286 #------------------------------------------------------------------------------ | |
1287 # compute ATGC ratio | |
1288 sub atgc_ratio { | |
1289 my ($ade, $thy, $gua, $cyt) = @_; | |
1290 return (($ade + $thy) / ($gua + $cyt)); | |
1291 } | |
1292 #------------------------------------------------------------------------------ | |
1293 # variance | |
1294 sub shift_data_variance { | |
1295 my (@data) = @_; | |
1296 | |
1297 if ($#data + 1 < 2) { return 0.0; } | |
1298 | |
1299 my $K = $data[0]; | |
1300 my ($n, $Ex, $Ex2) = 0.0; | |
1301 | |
1302 for my $x (@data) { | |
1303 $n = $n + 1; | |
1304 $Ex += $x - $K; | |
1305 $Ex2 += ($x - $K) * ($x - $K); | |
1306 } | |
1307 | |
1308 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1) | |
1309 | |
1310 return $variance; | |
1311 | |
1312 } | |
1313 #------------------------------------------------------------------------------ | |
1314 # nucle score | |
1315 sub nucle_score { | |
1316 my ($variance, $gcPercent, $atgcRatio, $length) = @_; | |
1317 #return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length)); | |
1318 return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length)); | |
1319 } | |
1320 #------------------------------------------------------------------------------ | |
1321 sub log2 { | |
1322 my $n = shift; | |
1323 return (log($n) / log(2)); | |
1324 } | |
1325 #------------------------------------------------------------------------------ | |
1326 # compute GC pourcent | |
1327 sub gc_percent { | |
1328 my ($seq) = @_; | |
1329 | |
1330 my @charSeq = split(//, uc($seq)); | |
1331 my %hashFlank = (); | |
1332 | |
1333 foreach my $v (@charSeq) { | |
1334 $hashFlank{$v} += 1; | |
1335 } | |
1336 | |
1337 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;} | |
1338 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;} | |
1339 | |
1340 if(length($seq) == 0) { | |
1341 return 0; | |
1342 } | |
1343 else { | |
1344 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100; | |
1345 } | |
1346 | |
1347 } | |
1348 #------------------------------------------------------------------------------ | |
1349 # download file from ftp protocol | |
1350 sub download_file { | |
1351 my($servor, $file) = @_; | |
1352 | |
1353 #if($actualOS eq "MSWin32"){ | |
1354 my $ftp = Net::FTP->new($servor, Debug => 0) | |
1355 or die "Cannot connect to $servor"; | |
1356 | |
1357 $ftp->login("anonymous", "-anonymous@") | |
1358 or die "Cannot login ", $ftp->message; | |
1359 $ftp->binary; | |
1360 $ftp->get($file) or die "get failed ", $ftp->message; | |
1361 | |
1362 $ftp->quit; | |
1363 #} | |
1364 #else{ | |
1365 # system("wget $file"); | |
1366 #} | |
1367 } | |
1368 #------------------------------------------------------------------------------ | |
1369 # obtain file directory | |
1370 sub obtain_file { | |
1371 my($servor, $link) = @_; | |
1372 if ($link =~ /$servor(.*)/) { return ($1); } | |
1373 } | |
1374 #------------------------------------------------------------------------------ | |
1375 # download fastq file from ENA | |
1376 sub download_ena_fastq { | |
1377 my ($enaFtpServor, $sraId, $log) = @_; | |
1378 | |
1379 my $fastqDir = "/vol1/fastq/"; | |
1380 my $dir1 = substr $sraId, 0, 6; | |
1381 my $dir2 = "000"; | |
1382 my $digits = substr $sraId, 3; | |
1383 my $fastqRep = $sraId . "_folder"; | |
1384 | |
1385 if ($log) { | |
1386 print LOG "...Downloading fastq file from ENA\n"; | |
1387 } | |
1388 | |
1389 if (length $digits == 6) { | |
1390 $dir2 = $sraId; | |
1391 $fastqDir .= $dir1 . "/" . $dir2 . "/"; | |
1392 } | |
1393 elsif (length $digits > 6) { | |
1394 my $digitsNumber = 0; | |
1395 my @digitsList = split //, (substr $digits, 6); | |
1396 | |
1397 foreach my $char (@digitsList) { | |
1398 if (length $dir2 >= 1) { | |
1399 $dir2 = substr $dir2, 0, (length $dir2) - 1; | |
1400 $digitsNumber += 1; | |
1401 } | |
1402 } | |
1403 $dir2 .= substr $digits, -$digitsNumber; | |
1404 $fastqDir .= $dir1 . "/" . $dir2 . "/" . $sraId . "/"; | |
1405 } | |
1406 | |
1407 if ($log) { | |
1408 print LOG "...recreate database folder path for FASTQ downloading\n"; | |
1409 } | |
1410 | |
1411 my $ftp = Net::FTP->new($enaFtpServor, Debug => 0) | |
1412 or die "Cannot connect to $enaFtpServor"; | |
1413 | |
1414 $ftp->login("anonymous", "-anonymous@") | |
1415 or die "Cannot login ", $ftp->message; | |
1416 $ftp->binary; | |
1417 | |
1418 $ftp->cwd($fastqDir) | |
1419 or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message; | |
1420 | |
1421 my @fastqFiles = $ftp->ls("$sraId*"); | |
1422 | |
1423 if ($log) { | |
1424 print LOG "...Searching fastq files in path\n"; | |
1425 } | |
1426 | |
1427 if (!grep(/^$/, @fastqFiles)) { | |
1428 | |
1429 if (-d $fastqRep) { rmtree($fastqRep) } | |
1430 mkdir $fastqRep; | |
1431 | |
1432 foreach my $fastqFile (@fastqFiles) { | |
1433 #if($actualOS eq "MSWin32"){ | |
1434 $ftp->get($fastqFile) or die "get failed ", $ftp->message; | |
1435 #} | |
1436 #else{ | |
1437 # system("wget $fastqFile"); | |
1438 #} | |
1439 | |
1440 #my @baseAndExt = split /\./, $fastqFile; | |
1441 #my $unzipFastq = $baseAndExt[0] . ".fastq"; | |
1442 | |
1443 #gunzip $fastqFile => $unzipFastq or die "gunzip failed: $GunzipError\n"; | |
1444 move($fastqFile, $fastqRep) or die "move failed: $!"; # DC replaced $unzipFastq by $fastqFile | |
1445 } | |
1446 #unlink glob "*fastq.gz" or die "$!: for file *fastq.gz"; | |
1447 | |
1448 if ($log) { | |
1449 print LOG "...Finalizing download of FASTQ file\n"; | |
1450 } | |
1451 } | |
1452 | |
1453 if ($log) { | |
1454 print LOG "End of download\n"; | |
1455 } | |
1456 | |
1457 $ftp->quit; | |
1458 } | |
1459 #------------------------------------------------------------------------------ | |
1460 # download fastq file from ENA | |
1461 sub get_assembly_or_project { | |
1462 my ($file, $sequence, $ftpServor, $fldSep, $log) = @_; | |
1463 | |
1464 my $pattern; | |
1465 my $indexInfo; | |
1466 my %folderHash; | |
1467 | |
1468 # Repository for fna file | |
1469 my $repositoryFNA = "Assembly"; | |
1470 | |
1471 # Repository for genbank file | |
1472 my $repositoryGenbank = "GenBank"; | |
1473 | |
1474 # Reposotiry for report file | |
1475 my $repositoryReport = "Report"; | |
1476 | |
1477 # global repository | |
1478 my $repositorySequence = $sequence; | |
1479 | |
1480 | |
1481 if ($sequence =~ /^GC[AF]_(.*)/) { | |
1482 $indexInfo = 0; | |
1483 $pattern = $1; | |
1484 } | |
1485 elsif ($sequence =~ /^PRJ/) { | |
1486 $indexInfo = 1; | |
1487 $pattern = $sequence; | |
1488 } | |
1489 | |
1490 open(SUM, $file) or die "error open file $!:"; | |
1491 while(<SUM>) { | |
1492 chomp; | |
1493 if ($_ !~ /^#/) { | |
1494 my @infoList = split /\t/, $_; | |
1495 if ($infoList[$indexInfo] =~ /$pattern/) { | |
1496 my @gcfInfo = split(/\//, $infoList[19]); | |
1497 my $gcfName = pop(@gcfInfo); | |
1498 | |
1499 | |
1500 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz"; | |
1501 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz"; | |
1502 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt"; | |
1503 | |
1504 $dnaFile = obtain_file($ftpServor, $dnaFile); | |
1505 $genbankFile = obtain_file($ftpServor, $genbankFile); | |
1506 $assemblyReport = obtain_file($ftpServor, $assemblyReport); | |
1507 | |
1508 download_file($ftpServor, $dnaFile); | |
1509 download_file($ftpServor, $genbankFile); | |
1510 download_file($ftpServor, $assemblyReport); | |
1511 | |
1512 # download sequences and check number of "N" characters | |
1513 my $fileFasta = $gcfName."_genomic.fna.gz"; | |
1514 my $ucpFasta = $gcfName."_genomic.fna"; | |
1515 if (-e $fileFasta) { | |
1516 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n"; | |
1517 $folderHash{$ucpFasta} = $repositoryFNA; | |
1518 } | |
1519 | |
1520 # download genome report | |
1521 my $fileReport = $gcfName."_assembly_report.txt"; | |
1522 if (-e $fileReport) { | |
1523 $folderHash{$fileReport} = $repositoryReport; | |
1524 } | |
1525 | |
1526 # download genbank files | |
1527 my $fileGenbank = $gcfName."_genomic.gbff.gz"; | |
1528 my $ucpGenbank = $gcfName."_genomic.gbff"; | |
1529 if (-e $fileGenbank) { | |
1530 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n"; | |
1531 $folderHash{$ucpGenbank} = $repositoryGenbank; | |
1532 } | |
1533 | |
1534 } | |
1535 } | |
1536 } | |
1537 close(SUM) or die "error close file $!"; | |
1538 | |
1539 if (keys %folderHash) { | |
1540 if (-e $repositorySequence) {rmtree($repositorySequence);} | |
1541 | |
1542 if ($log) { | |
1543 print LOG "...Download files from GenBank or RefSeq \n"; | |
1544 } | |
1545 | |
1546 mkdir $repositorySequence; | |
1547 mkdir $repositoryFNA; | |
1548 mkdir $repositoryGenbank; | |
1549 mkdir $repositoryReport; | |
1550 | |
1551 for my $ucpFile (keys %folderHash) { | |
1552 move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:"; | |
1553 } | |
1554 move($repositoryFNA, $repositorySequence . $fldSep. $repositoryFNA) or die "error move file $!:"; | |
1555 move($repositoryGenbank, $repositorySequence . $fldSep. $repositoryGenbank) or die "error move file $!:"; | |
1556 move($repositoryReport, $repositorySequence . $fldSep. $repositoryReport) or die "error move file $!:"; | |
1557 unlink glob "*.gz" or die "for file *.gz $!:"; | |
1558 | |
1559 if ($log) { | |
1560 print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n"; | |
1561 } | |
1562 } | |
1563 | |
1564 } | |
1565 #------------------------------------------------------------------------------ | |
1566 sub download_assembly_or_project { | |
1567 my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_; | |
1568 | |
1569 my $assemblySummary; | |
1570 my @sequenceIdList = split /,/, $sequenceId; | |
1571 | |
1572 if ($directory =~ /refseq/) { | |
1573 $assemblySummary = "assembly_summary_refseq.txt"; | |
1574 } elsif ($directory =~ /genbank/) { | |
1575 $assemblySummary = "assembly_summary_genbank.txt"; | |
1576 } | |
1577 | |
1578 my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary; | |
1579 download_file($ftpServor, $assemblySummaryPath); | |
1580 | |
1581 foreach my $sequence (@sequenceIdList) { | |
1582 get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log); | |
1583 } | |
1584 | |
1585 unlink $assemblySummary or die "error remove file $!:"; | |
1586 } | |
1587 #------------------------------------------------------------------------------ | |
1588 # check if all required module are install | |
1589 sub isModuleInstalled { | |
1590 my $mod = shift; | |
1591 | |
1592 #eval("use $mod"); | |
1593 my $commandModule = `perldoc -l $mod`; | |
1594 | |
1595 if ($commandModule) { | |
1596 return(1); | |
1597 } else { | |
1598 return(0); | |
1599 } | |
1600 } | |
1601 #------------------------------------------------------------------------------ | |
1602 # download assembly summary | |
1603 sub download_summaries { | |
1604 my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_; | |
1605 | |
1606 my $assemblySummary = "assembly_summary.txt"; | |
1607 my $assemblySummaryLink; | |
1608 my $fileName; | |
1609 | |
1610 opendir my $workingDirectory, "." . $fldSep or die "error open dir $!:"; | |
1611 my @filesList = readdir $workingDirectory; | |
1612 closedir $workingDirectory; | |
1613 | |
1614 if ($getSummaries) { | |
1615 foreach my $summaryKingdom (split /,/, $getSummaries) { | |
1616 foreach my $file (@filesList) { | |
1617 if ($file =~ /assembly_summary.txt/i && $file =~ /$summaryKingdom/i && $file =~ /$database/) { | |
1618 unlink $file or die "error remove file $!:"; | |
1619 $assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt"; | |
1620 download_file($ftpServor, $assemblySummaryLink); | |
1621 $fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt"; | |
1622 rename $assemblySummary, $fileName; | |
1623 print "replace assembly_summary file\n"; | |
1624 } | |
1625 } | |
1626 } | |
1627 } | |
1628 | |
1629 foreach my $file (@filesList) { | |
1630 if ($file =~ /assembly_summary.txt/i && $file =~ /$kingdom/i && $file =~ /$database/) { | |
1631 return $file; | |
1632 } | |
1633 } | |
1634 | |
1635 $assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt"; | |
1636 $fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt"; | |
1637 download_file($ftpServor, $assemblySummaryLink); | |
1638 rename $assemblySummary, $fileName; | |
1639 | |
1640 return $fileName; | |
1641 } | |
1642 #--------------------- | |
1643 sub bioseqio { | |
1644 | |
1645 my ($keyword, $file) = @_; | |
1646 local $/ = "\n>"; # read by FASTA record | |
1647 | |
1648 my $count = 0; | |
1649 | |
1650 open FASTA, $file; | |
1651 while (<FASTA>) { | |
1652 chomp; | |
1653 my $seq = $_; | |
1654 #my ($id) = $seq =~ /^>*(\S+)/; # parse ID as first word in FASTA header | |
1655 if ($seq =~ /^>*.*$keyword/) { | |
1656 #$seq =~ s/^>*.+\n//; # remove FASTA header | |
1657 #$seq =~ s/\n//g; # remove endlines | |
1658 $count++; | |
1659 print "\nThe sequence number is: $count \n"; | |
1660 print ">$seq"; | |
1661 } | |
1662 } | |
1663 close FASTA; | |
1664 } |