Mercurial > repos > dereeper > pgap
view PGAP-1.2.1/Converter_NCBINewFormatData.pl @ 11:e0c913351502 draft default tip
Uploaded
author | dereeper |
---|---|
date | Thu, 01 Jul 2021 12:22:53 +0000 |
parents | 83e62a1aeeeb |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; use Getopt::Long; my %optionHash=qw(); my $inprefix=""; my $outprefix=""; my $indir=""; my $outdir=""; my @inList; my @outList; GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!"); if(scalar(keys %optionHash)==0){ &print_usage(""); } if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){ &print_usage(""); } if( exists($optionHash{"inprefix"} ) ){ $inprefix = $optionHash{"inprefix"}; @inList=split(/\+/,$inprefix); }else{ &print_usage("--inprefix should be provided!"); } if( exists($optionHash{"outprefix"} ) ){ $outprefix = $optionHash{"outprefix"}; @outList=split(/\+/,$outprefix); } if( exists($optionHash{"indir"} ) ){ $indir = $optionHash{"indir"}; }else{ &print_usage("--indir should be provided!"); } if( exists($optionHash{"outdir"} ) ){ $outdir = $optionHash{"outdir"}; }else{ &print_usage("--outdir should be provided!"); } if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){ &print_usage(""); } if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){ &print_usage("If outprefix was provided, the name number should be identical with inprefix"); } system("mkdir -p $outdir"); foreach my $idx (0 .. (scalar(@inList) -1) ){ my $inname = $inList[$idx]; my $feature_table = $indir."/".$inname."_feature_table.txt"; my $faa = $indir."/".$inname."_protein.faa"; my $fna = $indir."/".$inname."_genomic.fna"; my %genePositionHash; my %genomeSequenceHash; my %geneAnnotationHash; my %gene2genomeHash; my %genefaaHash; my $count=0; my $outname; if($outprefix eq ""){ my @tmp=split(/\./,$inname); $outname = $tmp[0]; }else{ $outname = $outList[$idx]; } # check file if( ! -e $feature_table){ print "$feature_table was not found!\n"; print "$inname was skipped!\n"; next; } if( ! -e $faa){ print "$faa was not found!\n"; print "$inname was skipped!\n"; next; } if( ! -e $fna){ print "$fna was not found!\n"; print "$inname was skipped!\n"; next; } # read feature &read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash); # get genome sequence &read_genomeSequence($fna,\%genomeSequenceHash); # get faa &read_faa($faa,\%genefaaHash); # extract nuc and output open(PEP,">$outdir/$outname.pep"); open(NUC,">$outdir/$outname.nuc"); open(FUN,">$outdir/$outname.function"); foreach my $mygene (keys %genePositionHash){ if( (!exists( $geneAnnotationHash{$mygene})) or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){ print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; next; } my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} ); if( $nuc eq ""){ print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; next; } print PEP ">$mygene\n$genefaaHash{$mygene}\n"; print NUC ">$mygene\n$nuc\n"; print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n"; $count++; } close(PEP); close(NUC); close(FUN); print $inname . " -> $outname: $count genes were extracted in total.\n"; } sub getSeq(){ (my $pos,my $genomeSeq)=@_; my @tmp=split(/\|/,$pos); if( $tmp[1]> length($genomeSeq) ){ return ""; } my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1); if($tmp[2] eq "-"){ $seq = &rcseq($seq); } return $seq; } sub rcseq(){ (my $seq)=@_; $seq=uc($seq); $_=$seq; tr/ACGT/TGCA/; $seq = $_; return reverse($_); } sub read_faa(){ (my $infile, my $seqHash)=@_; my $seq=""; my $name=""; my @content; open(F,$infile); @content=<F>; close(F); chomp(@content); for (my $line = 0; $line < @content; $line++) { if($content[$line] =~/^>/ ){ if($name ne ""){ $$seqHash{$name}=$seq; } my @tmp=split(/\s+/,$content[$line]); @tmp=split(/\./,$tmp[0]); $name=substr($tmp[0], 1, length($tmp[0])-1); $seq=""; }else{ $seq = $seq . $content[$line]; } } $$seqHash{$name}=$seq; } sub read_genomeSequence(){ (my $infile,my $seqHash)=@_; my $seq=""; my $name=""; my @content; open(F,$infile); @content=<F>; close(F); chomp(@content); for (my $line = 0; $line < @content; $line++) { if($content[$line] =~/^>/ ){ if($name ne ""){ $$seqHash{$name}=$seq; } my @tmp=split(/\s+/,$content[$line]); $name=substr($tmp[0], 1, length($tmp[0])-1); $seq=""; }else{ $seq = $seq . $content[$line]; } } $$seqHash{$name}=$seq; } sub read_feature(){ (my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_; my @content; open(F,$infile); @content=<F>; close(F); chomp(@content); for (my $line = 1; $line < @content - 1; $line=$line+1) { my @row=split(/\t/,$content[$line]); my @nextrow=split(/\t/,$content[$line+1]); if($row[1] ne "protein_coding"){ next; } # in case if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){ next; } # print $nextrow[10]."\n"; # print $row[7]."|".$row[8]."|".$row[9]."\n"; my @tmp=split(/\./,$nextrow[10]); my $geneName = $tmp[0]; $$annHash{$geneName}=$nextrow[13]; $$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9]; $$gene2genomeHash{$geneName}=$row[6]; } } sub print_usage(){ my $message=shift; my @usage=qq( Version: 2016042201 Usage: perl Converter_NCBINewFormatData.pl [options] Options: --inprefix String The prefix of the input files, such as GCF_000007545.1_ASM754v1 If two or more strains were provided, please join their prefixs with "+" Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1 --indir String The directory of those input files --outprefix String The prefix for the output files. If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function If two or more strains were provided, please join their prefixs with "+" If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545 --outdir String The directory for the output files Note: 1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain. 2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data. ); print join("\n",@usage)."\n"; print $message."\n"; exit; }