view PGAP-1.2.1/Converter_NCBINewFormatData.pl @ 0:83e62a1aeeeb draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 13:51:52 +0000
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;



my %optionHash=qw();
my $inprefix="";
my $outprefix="";
my $indir="";
my $outdir="";
my @inList;
my @outList;

GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!");


if(scalar(keys %optionHash)==0){
	&print_usage("");
}

if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){
	&print_usage("");
}

if( exists($optionHash{"inprefix"} ) ){
	$inprefix = $optionHash{"inprefix"};
	@inList=split(/\+/,$inprefix);
}else{
	&print_usage("--inprefix should be provided!");
}

if( exists($optionHash{"outprefix"} ) ){
	$outprefix = $optionHash{"outprefix"};
	@outList=split(/\+/,$outprefix);
}

if( exists($optionHash{"indir"} ) ){
	$indir = $optionHash{"indir"};
}else{
	&print_usage("--indir should be provided!");
}

if( exists($optionHash{"outdir"} ) ){
	$outdir = $optionHash{"outdir"};
}else{
	&print_usage("--outdir should be provided!");
}


if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){
	&print_usage("");
}

if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){
	&print_usage("If outprefix was provided, the name number should be identical with inprefix");
}


system("mkdir -p $outdir");

foreach my $idx (0 .. (scalar(@inList) -1)  ){
	my $inname = $inList[$idx];
	my $feature_table = $indir."/".$inname."_feature_table.txt";
	my $faa           = $indir."/".$inname."_protein.faa";
	my $fna           = $indir."/".$inname."_genomic.fna";

	my %genePositionHash;
	my %genomeSequenceHash;
	my %geneAnnotationHash;
	my %gene2genomeHash;
	my %genefaaHash;
	my $count=0;

	my $outname;

	if($outprefix eq ""){
		my @tmp=split(/\./,$inname);
		$outname = $tmp[0];
	}else{
		$outname = $outList[$idx];
	}

	# check file
	if( ! -e $feature_table){
		print "$feature_table was not found!\n";
		print "$inname was skipped!\n";
		next;
	}

	if( ! -e $faa){
		print "$faa was not found!\n";
		print "$inname was skipped!\n";
		next;
	}

	if( ! -e $fna){
		print "$fna was not found!\n";
		print "$inname was skipped!\n";
		next;
	}

	# read feature
	&read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash);

	# get genome sequence
	&read_genomeSequence($fna,\%genomeSequenceHash);

	# get faa
	&read_faa($faa,\%genefaaHash);

	# extract nuc and output

	open(PEP,">$outdir/$outname.pep");
	open(NUC,">$outdir/$outname.nuc");
	open(FUN,">$outdir/$outname.function");

	foreach my $mygene (keys %genePositionHash){
		if( (!exists( $geneAnnotationHash{$mygene}))  or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){
			print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
			next;
		}

		my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} );

		if( $nuc eq ""){
			print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
			next;
		}

		print PEP ">$mygene\n$genefaaHash{$mygene}\n";
		print NUC ">$mygene\n$nuc\n";
		print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n";
		$count++;
	}
	close(PEP);
	close(NUC);
	close(FUN);

	print $inname . " -> $outname: $count genes were extracted in total.\n";
}


sub getSeq(){
	(my $pos,my $genomeSeq)=@_;
	my @tmp=split(/\|/,$pos);
	if( $tmp[1]> length($genomeSeq) ){
		return "";
	}
	my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1);
	if($tmp[2] eq "-"){
		$seq = &rcseq($seq);
	}
	return $seq;
}



sub rcseq(){
	(my $seq)=@_;
	$seq=uc($seq);

	$_=$seq;
	tr/ACGT/TGCA/;
	$seq = $_;
	return reverse($_);
}



sub read_faa(){
	(my $infile, my $seqHash)=@_;
	my $seq="";
	my $name="";
	my @content;
	open(F,$infile);
	@content=<F>;
	close(F);
	chomp(@content);

	for (my $line = 0; $line < @content; $line++) {
		if($content[$line] =~/^>/ ){
			if($name ne ""){
				$$seqHash{$name}=$seq;
			}
			my @tmp=split(/\s+/,$content[$line]);
			@tmp=split(/\./,$tmp[0]);

			$name=substr($tmp[0], 1, length($tmp[0])-1);
			$seq="";
		}else{
			$seq = $seq . $content[$line];
		}
	}
	$$seqHash{$name}=$seq;
}






sub read_genomeSequence(){
	(my $infile,my $seqHash)=@_;
	my $seq="";
	my $name="";
	my @content;
	open(F,$infile);
	@content=<F>;
	close(F);
	chomp(@content);

	for (my $line = 0; $line < @content; $line++) {
		if($content[$line] =~/^>/ ){
			if($name ne ""){
				$$seqHash{$name}=$seq;
			}
			my @tmp=split(/\s+/,$content[$line]);
			$name=substr($tmp[0], 1, length($tmp[0])-1);
			$seq="";
		}else{
			$seq = $seq . $content[$line];
		}
	}
	$$seqHash{$name}=$seq;
}



sub read_feature(){
	(my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_;
	my @content;
	open(F,$infile);
	@content=<F>;
	close(F);
	chomp(@content);

	for (my $line = 1; $line < @content - 1; $line=$line+1) {
		my @row=split(/\t/,$content[$line]);
		my @nextrow=split(/\t/,$content[$line+1]);

		if($row[1] ne "protein_coding"){
			next;
		}

		# in case
		if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){
			next;
		}

		# print $nextrow[10]."\n";
		# print $row[7]."|".$row[8]."|".$row[9]."\n";

		my @tmp=split(/\./,$nextrow[10]);
		my $geneName = $tmp[0];

		$$annHash{$geneName}=$nextrow[13];
		$$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9];
		$$gene2genomeHash{$geneName}=$row[6];
	}
}


sub print_usage(){
	my $message=shift;
my @usage=qq(
Version: 2016042201
Usage:   perl Converter_NCBINewFormatData.pl [options]


Options: 

   --inprefix   String     The prefix of the input files, such as GCF_000007545.1_ASM754v1
                           If two or more strains were provided, please join their prefixs with "+"
                           Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1

   --indir      String     The directory of those input files

   --outprefix  String     The prefix for the output files.
                           If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function
                           If two or more strains were provided, please join their prefixs with "+"
                           If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545

   --outdir     String     The directory for the output files

Note:
   1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain.
   
   2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data. 


);

	print join("\n",@usage)."\n";
	print $message."\n";
	exit;
}