Mercurial > repos > dcouvin > nuclescore

<tool id="nuclescoretool" name="nuclescore" version="0.1.0">
  <description>nuclescore</description>


<command detect_errors="aggressive"><![CDATA[

#import re
        ## Creates symlinks for each input file based on the Galaxy 'element_identifier'
        ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat')
        #set $named_input_files = ''
        #for $input_file in $input_files
            ## Add single quotes around each input file identifier
            #set $_input_file = "'{}'".format($input_file.element_identifier)
            ln -s '${input_file}' ${_input_file} &&
            #set $named_input_files = $named_input_files + ' ' + $_input_file
        #end for

	perl '$__tool_directory__/nucleScore.pl' $_input_file $output


]]></command>
 <!-- ./nuclescore.sh ${named_input_files} > "$output" -->

<inputs>
  <param format="fasta" name="input_files" type="data" label="Genome fasta file : " multiple="true" display="checkboxes"/>
</inputs>

 <outputs>
    <data format="tabular" name="output" />
 </outputs>

<help>
No documentation
  </help>

</tool>


-------------------------------------------------------------------------------------------------------------------------------------------------------------------

#!/usr/bin/perl

use strict;
use warnings;
use Bio::SeqIO;
use Shannon::Entropy qw/entropy/;
use File::Basename;
#use Bio::Species;

#use FindBin;
#use lib "$FindBin::RealBin/../perl5";

my $input = $ARGV[0];
#chercher comment faire une liste perl pour input
my @liste = split(/,/, $input);
my $recap_total_seq = $ARGV[1];

#my ($input, $recap_total_seq) = @ARGV;

my $start = time();

#my $file = ""; #= $ARGV[0];
#my $recap_total_seq = "nucleScore_result.xls";

open (RECAP,'>', $recap_total_seq) or die "could not open $!";
  print RECAP "File\tA percent\tT percent\tC percent\tG percent\tGC percent\tAT/GC ratio\tNucleScore\tShannon Entropy\tAAA\tAAT\n";
close(RECAP);


#FASTA files
#if(@ARGV){

  #for (my $i = 0; $i <= $#ARGV; $i++) {
    #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) {
    #		$recap_total_seq = $ARGV[$i+1];
    #}
  #}


 open (RECAP,'>>', $recap_total_seq) or die "could not open $!";

#refaire le for pour la liste input
#for my $arg (@ARGV){
for my $arg (@liste){
#    if ($arg =~ m/.fasta/ or $arg =~ m/.fna/ or $arg =~ m/.fa/){

	#print "Traitement du fichier de sequence: $arg\n";
	print "Traitement du fichier de sequence: $arg\n";
	#my $file = $arg;
	my $file = $arg;


	my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
	my $globalSeq = "";
	while (my $seq = $seqIO->next_seq()) {
		my $seqID = $seq->id;
		my $seqNuc = $seq->seq;
		$globalSeq .= $seqNuc;
		#push @arrayID, $seqID;
		#$hSeq{$seqID} = $seqNuc;
		#my @seqArray = split //, $seqNuc;
	}

	my $gcpercent = gc_percent($globalSeq);
	my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file);
	my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);

	my $atgcRatio =	 atgc_ratio($ade, $thy, $gua, $cyt);

	my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);

	my $variance = shift_data_variance(@percentList);
	my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length);
	my $entropy = entropy($globalSeq);

	print "The sequence length for $file is: $length\n";
	print "A percent: $aPercent\n";
	print "T percent: $tPercent\n";
	print "G percent: $gPercent\n";
	print "C percent: $cPercent\n";
	print "N percent: $nPercent\n";

	print "GC percent: $gcpercent\n";

	print "AT/GC ratio: $atgcRatio\n";

	print "NucleScore: $nucleScore\n";

	print "Shannon Entropy: $entropy\n\n";

	print "3 digits:\n";
	my @trinucs=($globalSeq=~/(?=(.{3}))/g);
	my %tri_count=();
	$tri_count{$_}++ for @trinucs;
	print $_,":",$tri_count{$_},"\n" for sort keys(%tri_count);
	print "\n2 digits:\n";
	my @trinucs2=($globalSeq=~/(?=(.{2}))/g);
	my %tri_count2=();
	$tri_count2{$_}++ for @trinucs2;
	print $_,":",$tri_count2{$_},"\n" for sort keys(%tri_count2);

	my $aaa = $tri_count{'AAA'};
	my $aat = $tri_count{'AAT'};

	print "--------------------------------------\n\n";

	my $label = basename($file);


	#Summary file
	#print RECAP "$file\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n";
	print RECAP "$label\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n";
	#}
  }
  close (RECAP) or die "close file error : $!";
#}

my $end = time();

my $total = $end - $start;

print "***** Total time (in seconds) is: $total *****\n";

#------------------------------------------------------------------------------
# number nucleotid and length
sub number_nuc_length_seq {
	my ($fastaFile) = @_;
	my $ade = 0;
	my $thy = 0;
	my $gua = 0;
	my $cyt = 0;
	my $n = 0;
	my $length = 0;

	open (FASTA, "<", $fastaFile) or die "Could not open $!";
	while (<FASTA>) {
		chomp;
		if ($_ !~ />/) {
			my @seq = split //, $_;

			for my $nuc (@seq) {
				$length +=1 ;
				if ($nuc =~ /a/i) {$ade+=1;}
				elsif ($nuc =~ /t/i) {$thy+=1;}
				elsif ($nuc =~ /g/i) {$gua+=1;}
				elsif ($nuc =~ /c/i) {$cyt+=1;}
				elsif ($nuc =~ /n/i) {$n+=1;}
			}
		}
	}
	close(FASTA) or die "Error close file :$!";
	return ($ade, $thy, $gua, $cyt, $n, $length);

}

#------------------------------------------------------------------------------
# compute percentage of nucleotid
sub nucleotid_percent {
	my($ade, $thy, $gua, $cyt, $n, $length) = @_;

	my $adePercent = $ade / $length * 100;
	my $thyPercent = $thy / $length * 100;
	my $guaPercent = $gua / $length * 100;
	my $cytPercent = $cyt / $length * 100;
	my $nPercent = $n / $length * 100;

	return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);

}

#------------------------------------------------------------------------------
# compute GC pourcent
sub gc_percent {
	my ($seq) = @_;

	my @charSeq = split(//, uc($seq));
	my %hashFlank = ();

	foreach my $v (@charSeq) {
		$hashFlank{$v} += 1;
	}

	if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;}
	if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;}

	if(length($seq) == 0) {
		return 0;
	}
	else {
		return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100;
	}

}
#------------------------------------------------------------------------------
# compute ATGC ratio
sub atgc_ratio {
	my ($ade, $thy, $gua, $cyt) = @_;

	return (($ade + $thy) / ($gua + $cyt));

}
#------------------------------------------------------------------------------
# variance
sub shift_data_variance {
	my (@data) = @_;

	if ($#data + 1 < 2) { return 0.0; }

	my $K = $data[0];
	my ($n, $Ex, $Ex2) = 0.0;

	for my $x (@data) {
		$n = $n + 1;
		$Ex += $x - $K;
		$Ex2 += ($x - $K) * ($x - $K);
	}

	my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1)

	return $variance;

}
#------------------------------------------------------------------------------
# nucle score
#sub nucle_score {
#	my ($variance, $gcPercent, $atgcRatio, $length) = @_;
#
#	return (($variance * $gcPercent * $atgcRatio) / $length);
#}
sub nucle_score {
	my ($variance, $gcPercent, $atgcRatio, $length) = @_;
	return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length));
}

#------------------------------------------------------------------------------
sub log2 {
  my $n = shift;
  return (log($n) / log(2));
}
author	dcouvin
date	Fri, 03 Sep 2021 22:36:56 +0000
parents
children