# HG changeset patch # User iuc # Date 1618996246 0 # Node ID 16f1f3e2de426a34a75654a3697b309855548987 # Parent 9c620a950d3a7dbee1dadbac955d2130aab2a4dd "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 02d0ae7ac02425ef454d2e42a0513887596a3b4d" diff -r 9c620a950d3a -r 16f1f3e2de42 fasta-stats.pl --- a/fasta-stats.pl Thu Nov 22 04:16:35 2018 -0500 +++ b/fasta-stats.pl Wed Apr 21 09:10:46 2021 +0000 @@ -8,6 +8,16 @@ use warnings; use List::Util qw(sum min max); + +#Parameters +my $file = shift; +my $calc_ng50 = 0; +my $genome_size = 0; +if (scalar(@ARGV) > 0){ + $genome_size = $ARGV[0]; + $calc_ng50 = 1; +} + # stat storage my $n=0; @@ -17,7 +27,10 @@ # MAIN LOOP collecting sequences -while (my $line = ) { +#open the file first +open IN, $file or die{ "Couldn't open $file for reading\n$!" }; + +while (my $line = ) { chomp $line; if ($line =~ m/^\s*>/) { process($seq) if $n; @@ -46,17 +59,15 @@ $stat{'len_max'} = $len[-1]; $stat{'len_median'} = $len[int(@len/2)]; $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); + # calculate n50 - - $stat{'len_N50'} = 0; - my $cum=0; my $thresh = int 0.5 * $stat{'num_bp'}; - for my $i (0 .. $#len) { - $cum += $len[$i]; - if ($cum >= $thresh) { - $stat{'len_N50'} = $len[$i]; - last; - } + $stat{'len_N50'} = &calc_x50(@len, $thresh); + + #calculate NG50 + if ($calc_ng50) { + my $thresh = int 0.5 * $genome_size * 1000000; + $stat{'len_NG50'} = &calc_x50(@len, $thresh); } } @@ -87,3 +98,18 @@ push @len, length($s); } +# N50/NG50 calculation sub + +sub calc_x50{ + my @x = shift; + my $thresh = shift; + my $cum=0; + for my $i (0 .. $#x) { + $cum += $x[$i]; + if ($cum >= $thresh) { + return $x[$i]; + } + } + return 0; +} + diff -r 9c620a950d3a -r 16f1f3e2de42 fasta-stats.xml --- a/fasta-stats.xml Thu Nov 22 04:16:35 2018 -0500 +++ b/fasta-stats.xml Wed Apr 21 09:10:46 2021 +0000 @@ -1,4 +1,4 @@ - + Display summary statistics for a fasta file. perl @@ -6,11 +6,15 @@ '$stats' ]]> + @@ -20,6 +24,11 @@ + + + + + **Fasta Stats** @@ -36,6 +45,8 @@ GC content in % + If an optional genome size estimate is specified, then the NG50 length will also be calculated. + ------ Inputs: diff -r 9c620a950d3a -r 16f1f3e2de42 test-data/ng50_out.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ng50_out.txt Wed Apr 21 09:10:46 2021 +0000 @@ -0,0 +1,15 @@ +GC_content 52.0 +len_N50 194780 +len_NG50 0 +len_max 194780 +len_mean 194780 +len_median 194780 +len_min 194780 +num_A 46297 +num_C 50626 +num_G 50678 +num_N 0 +num_T 47179 +num_bp 194780 +num_bp_not_N 194780 +num_seq 1