Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/wget.pl @ 1:032f6b3806a3 draft
Uploaded
| author | dereeper | 
|---|---|
| date | Thu, 30 May 2024 11:16:08 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:3cbb01081cde | 1:032f6b3806a3 | 
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 use strict; | |
| 4 | |
| 5 use File::Basename; | |
| 6 my $dirname = dirname(__FILE__); | |
| 7 | |
| 8 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt"); | |
| 9 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"); | |
| 10 | |
| 11 my %continents; | |
| 12 open(F,"countries.txt"); | |
| 13 <F>; | |
| 14 while(my $line =<F>){ | |
| 15 chomp($line); | |
| 16 my ($continent,$country) = split(/,/,$line); | |
| 17 $continents{$country} = $continent; | |
| 18 } | |
| 19 close(F); | |
| 20 | |
| 21 my $input = $ARGV[0]; | |
| 22 my $outdir = $ARGV[1]; | |
| 23 my $private_genomes = $ARGV[2]; | |
| 24 | |
| 25 system("cat $input $private_genomes >$outdir/list.txt"); | |
| 26 | |
| 27 my $concat = ""; | |
| 28 open(O2,">$outdir/genbanks.txt"); | |
| 29 open(O,">$outdir/strains.txt"); | |
| 30 open(GENES,">$outdir/genes.txt"); | |
| 31 open(L,">$outdir/list_genomes.txt"); | |
| 32 open(L2,">$outdir/list_genomes2.txt"); | |
| 33 open(L3,">$outdir/genomes.txt"); | |
| 34 open(L4,">$outdir/genomes2.txt"); | |
| 35 open(SEQFILE,">$outdir/seqfile"); | |
| 36 open(PanSN,">$outdir/all_genomes.fa"); | |
| 37 open(METADATA,">$outdir/metadata_strains.txt"); | |
| 38 | |
| 39 open(F,"$outdir/list.txt"); | |
| 40 #open(TEST,">$outdir/test"); | |
| 41 while(my $line =<F>){ | |
| 42 chomp($line); | |
| 43 my $genbank = $line; | |
| 44 if (!-e "$genbank"){ | |
| 45 my $grep = `grep '$line' prokaryotes.txt`; | |
| 46 #print "$genbank $line aaa $grep\n";exit; | |
| 47 my @infos = split(/\t/,$grep); | |
| 48 my $status = $infos[15]; | |
| 49 if ($status !~/Complete Genome/ && $status !~/Chromosome/){ | |
| 50 #next; | |
| 51 } | |
| 52 my $ftp_path = $infos[$#infos -2]; | |
| 53 | |
| 54 $ftp_path =~s/ftp:/http:/g; | |
| 55 my @table = split(/\//,$ftp_path); | |
| 56 my $name = $table[$#table]; | |
| 57 my $prot_file = "$ftp_path/$name"."_protein.faa.gz"; | |
| 58 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz"; | |
| 59 my $gff = "$ftp_path/$name"."_genomic.gff.gz"; | |
| 60 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz"; | |
| 61 my @particules = split(/_/,$name); | |
| 62 | |
| 63 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`; | |
| 64 `gunzip $outdir/$genbank.fasta.gz`; | |
| 65 `wget -O $outdir/$genbank.gb.gz $gbff`; | |
| 66 system("gunzip $outdir/$genbank.gb.gz"); | |
| 67 | |
| 68 | |
| 69 ################################################################ | |
| 70 # for eukaryotes | |
| 71 ################################################################ | |
| 72 if (!$grep){ | |
| 73 $grep = `grep '$line' eukaryotes.txt`; | |
| 74 my @infos = split(/\t/,$grep); | |
| 75 my $gca = $infos[8]; | |
| 76 if ($gca =~/GCA_(\d\d\d)(\d\d\d)(\d\d\d)/){ | |
| 77 my $part1 = $1; | |
| 78 $ftp_path = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/$1/$2/$3"; | |
| 79 `wget -O $outdir/$gca.index.html $ftp_path`; | |
| 80 my $grep_name = `grep '$gca' $outdir/$gca.index.html`; | |
| 81 unlink("$outdir/$gca.index.html"); | |
| 82 my $name; | |
| 83 if ($grep_name =~/href=\"(.*)\"/){ | |
| 84 $name = $1; | |
| 85 $name=~s/\///g; | |
| 86 } | |
| 87 $ftp_path = $ftp_path."/$name"; | |
| 88 my $prot_file = "$ftp_path/$name"."_protein.faa.gz"; | |
| 89 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz"; | |
| 90 my $gff = "$ftp_path/$name"."_genomic.gff.gz"; | |
| 91 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz"; | |
| 92 my @particules = split(/_/,$name); | |
| 93 | |
| 94 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`; | |
| 95 `gunzip $outdir/$genbank.fasta.gz`; | |
| 96 `wget -O $outdir/$genbank.gb.gz $gbff`; | |
| 97 `wget -O $outdir/$genbank.faa.gz $prot_file`; | |
| 98 system("gunzip $outdir/$genbank.gb.gz"); | |
| 99 system("gunzip $outdir/$genbank.faa.gz"); | |
| 100 | |
| 101 } | |
| 102 } | |
| 103 } | |
| 104 else{ | |
| 105 my $genbank_file = $genbank; | |
| 106 my $grep = `grep 'LOCUS' $genbank_file`; | |
| 107 $genbank = "unknown"; | |
| 108 if ($grep =~/LOCUS\s+([\-\:\w]+)/){$genbank = $1;} | |
| 109 | |
| 110 #$genbank =~s/\:/_/g; | |
| 111 | |
| 112 my $cmd = "cp -rf $genbank_file $outdir/$genbank.gb"; | |
| 113 system($cmd); | |
| 114 | |
| 115 my %genome_seqs; | |
| 116 my $current_chr; | |
| 117 my $go = 0; | |
| 118 open(G,"$outdir/$genbank.gb"); | |
| 119 while(<G>){ | |
| 120 if ($go == 1 && /(\d+) (.*)$/){ | |
| 121 my $line = $2; | |
| 122 $line =~s/ //g; | |
| 123 $genome_seqs{$current_chr}.=$line; | |
| 124 } | |
| 125 if (/LOCUS ([^\s]+)/){ | |
| 126 $current_chr = $1; | |
| 127 } | |
| 128 if (/ORIGIN/){$go = 1;} | |
| 129 if (/^\/\//){$go = 0;} | |
| 130 } | |
| 131 close(G); | |
| 132 | |
| 133 open(FASTA,">$outdir/$genbank.fasta"); | |
| 134 foreach my $ch(keys(%genome_seqs)){ | |
| 135 print FASTA ">$ch\n"; | |
| 136 my $seq = $genome_seqs{$ch}; | |
| 137 print FASTA "$seq\n"; | |
| 138 } | |
| 139 close(FASTA); | |
| 140 } | |
| 141 #my $get_organism_line = `head -10 $outdir/$genbank.gb | grep DEFINITION `; | |
| 142 my $get_organism_line = `head -10 $outdir/$genbank.gb | grep -A 1 DEFINITION `; | |
| 143 | |
| 144 # if several lines for DEFINITION, concatenate the lines | |
| 145 my @lines_organism = split(/\n/,$get_organism_line); | |
| 146 my $first_line = $lines_organism[0]; | |
| 147 my $second_line = $lines_organism[1]; | |
| 148 if ($second_line =~/^ (.*)/){ | |
| 149 $get_organism_line = $first_line. " ".$1; | |
| 150 } | |
| 151 else{ | |
| 152 $get_organism_line = $first_line; | |
| 153 } | |
| 154 | |
| 155 my $strain; | |
| 156 my $genus; | |
| 157 if ($get_organism_line =~/DEFINITION (.*)$/){ | |
| 158 $strain = $1; | |
| 159 ($genus) = split(/\s/,$strain); | |
| 160 } | |
| 161 my $country = `head -100 $outdir/$genbank.gb | grep country `; | |
| 162 $country =~s/^\s+//g; | |
| 163 $country =~s/\/country=//g; | |
| 164 $country =~s/\"//g; | |
| 165 $country =~s/\n//g;$country =~s/\r//g; | |
| 166 if ($country =~/:/){ | |
| 167 my $city; | |
| 168 ($country,$city) = split(/:/,$country); | |
| 169 } | |
| 170 if ($country eq ""){$country = "unresolved";} | |
| 171 my $continent = "unresolved"; | |
| 172 if ($continents{$country}){ | |
| 173 $continent = $continents{$country}; | |
| 174 } | |
| 175 $strain =~s/\.//g; | |
| 176 my ($info1,$info2 ) = split(",",$strain); | |
| 177 $strain = $info1; | |
| 178 $strain =~s/ /_/g; | |
| 179 $strain =~s/strain_//g; | |
| 180 $strain =~s/_chromosome//g; | |
| 181 $strain =~s/_genome//g; | |
| 182 $strain =~s/_str_/_/g; | |
| 183 $strain =~s/[^\w\-\_]//g; | |
| 184 $strain =~s/\-/_/g; | |
| 185 | |
| 186 print O "$genbank $strain\n"; | |
| 187 $concat .= "$genbank,"; | |
| 188 print L "$genbank $outdir/$genbank.gb\n"; | |
| 189 print L2 "$genbank\n"; | |
| 190 print L3 "$outdir/$genbank.fasta\n"; | |
| 191 print L4 "$outdir/$genbank.fasta $strain\n"; | |
| 192 print SEQFILE "$genbank $outdir/$genbank.fasta\n"; | |
| 193 print METADATA "$strain\t$genus\t$country\t$continent\n"; | |
| 194 | |
| 195 | |
| 196 my %genome_sequences; | |
| 197 my $genome = ""; | |
| 198 my $seqid = ""; | |
| 199 open(GENOME,"$outdir/$genbank.fasta"); | |
| 200 while(<GENOME>){ | |
| 201 if (!/^>/){ | |
| 202 my $line = $_; | |
| 203 $line =~s/\n//g;$line =~s/\r//g; | |
| 204 $genome_sequences{$seqid} .= $line; | |
| 205 $genome .= $line; | |
| 206 print PanSN $_; | |
| 207 } | |
| 208 elsif (/>([^\s]+)/){ | |
| 209 $seqid = $1; | |
| 210 $seqid =~s/\.\d+$//g; | |
| 211 print PanSN ">$strain#$seqid\n"; | |
| 212 } | |
| 213 } | |
| 214 close(GENOME); | |
| 215 | |
| 216 | |
| 217 open(N,">$outdir/$genbank.nuc"); | |
| 218 open(P,">$outdir/$genbank.pep"); | |
| 219 open(FUNC,">$outdir/$genbank.func"); | |
| 220 my $go = 0; | |
| 221 my $start; | |
| 222 my $end; | |
| 223 my $product; | |
| 224 my $complement = 0; | |
| 225 my $end_gene = "no"; | |
| 226 my $protein = ""; | |
| 227 | |
| 228 #`sed -i "s/'//g" $outdir/$genbank.gb`; | |
| 229 | |
| 230 my $has_translation = `grep -c 'translation=' $outdir/$genbank.gb`; | |
| 231 $has_translation =~s/\n//g;$has_translation =~s/\r//g; | |
| 232 open(G,"$outdir/$genbank.gb"); | |
| 233 my $current_gene; | |
| 234 my $current_chrom; | |
| 235 while(<G>){ | |
| 236 if (/^\s+ORGANISM\s+(.*)$/){ | |
| 237 } | |
| 238 if (/protein_id=\"(.*)\"/){ | |
| 239 $current_gene = $1; | |
| 240 } | |
| 241 if (/LOCUS ([^\s]+)/){ | |
| 242 $current_chrom = $1; | |
| 243 } | |
| 244 if (/locus_tag=\"(.*)\"/){ | |
| 245 $current_gene = $1; | |
| 246 } | |
| 247 if ($go == 1){ | |
| 248 my $line = $_; | |
| 249 $line =~s/ //g; | |
| 250 $line =~s/\n//g;$line =~s/\r//g; | |
| 251 $protein .= $line; | |
| 252 if (/\"$/){ | |
| 253 $protein =~s/\"//g; | |
| 254 $end_gene = "yes"; | |
| 255 | |
| 256 } | |
| 257 } | |
| 258 if (/\/translation=\"(.*)/ or ($has_translation == 0 && /protein_id=/)){ | |
| 259 $go = 1; | |
| 260 $protein .= $1; | |
| 261 print P ">$current_gene\n"; | |
| 262 print N ">$current_gene\n"; | |
| 263 print GENES "$current_gene $product [$strain]\n"; | |
| 264 | |
| 265 if ($protein =~/\"$/){ | |
| 266 $end_gene = "yes"; | |
| 267 } | |
| 268 if ($has_translation == 0){$end_gene = "yes";} | |
| 269 } | |
| 270 if ($end_gene eq "yes"){ | |
| 271 $protein =~s/\"//g; | |
| 272 print P "$protein\n"; | |
| 273 $protein = ""; | |
| 274 my $length = $end - $start + 1; | |
| 275 | |
| 276 #print "okkkk $current_chrom\n"; | |
| 277 #my $geneseq = substr($genome,$start-1,$length); | |
| 278 my $geneseq = substr($genome_sequences{$current_chrom},$start-1,$length); | |
| 279 | |
| 280 | |
| 281 if ($complement){ | |
| 282 my $revcomp = reverse $geneseq; | |
| 283 $revcomp =~ tr/ATGCatgc/TACGtacg/; | |
| 284 $geneseq = $revcomp; | |
| 285 } | |
| 286 print N "$geneseq\n"; | |
| 287 print FUNC "$current_gene - $product\n"; | |
| 288 $go = 0; | |
| 289 $end_gene = "no"; | |
| 290 } | |
| 291 if (/\/product=\"(.*)\"/){ | |
| 292 $product = $1; | |
| 293 } | |
| 294 if (/^\s+CDS\s+(\d+)\.\.(\d+)\s*$/){ | |
| 295 $start = $1; | |
| 296 $end = $2; | |
| 297 $complement = 0; | |
| 298 } | |
| 299 if (/^\s+CDS\s+complement\((\d+)\.\.(\d+)\)\s*$/){ | |
| 300 $start = $1; | |
| 301 $end = $2; | |
| 302 $complement = 1; | |
| 303 } | |
| 304 } | |
| 305 close(G); | |
| 306 close(P); | |
| 307 close(N); | |
| 308 close(FUNC); | |
| 309 | |
| 310 if ($has_translation == 0){ | |
| 311 system("perl $dirname/translate.pl $outdir/$genbank.nuc $outdir/$genbank.pep"); | |
| 312 } | |
| 313 | |
| 314 my $prot_num = 0; | |
| 315 open(PRT,">$outdir/$genbank.prt"); | |
| 316 open(P,"$outdir/$genbank.pep"); | |
| 317 while(<P>){ | |
| 318 if (/>(.*)/){ | |
| 319 my $prot_id = $1; | |
| 320 $prot_num++; | |
| 321 my $new_id = "$strain"."_".$prot_num; | |
| 322 print PRT ">$new_id\n"; | |
| 323 } | |
| 324 else{ | |
| 325 print PRT $_; | |
| 326 } | |
| 327 } | |
| 328 close(P); | |
| 329 close(PRT); | |
| 330 } | |
| 331 close(F); | |
| 332 close(O); | |
| 333 close(METADATA); | |
| 334 chop($concat); | |
| 335 print O2 $concat; | |
| 336 close(O2); | |
| 337 close(L); | |
| 338 close(L2); | |
| 339 close(L3); | |
| 340 close(L4); | |
| 341 close(GENES); | |
| 342 close(SEQFILE); | |
| 343 close(PanSN); | |
| 344 #close(TEST); | |
| 345 | |
| 346 unlink("prokaryotes.txt"); | |
| 347 unlink("eukaryotes.txt"); | 
