Mercurial > repos > iuc > fasta_stats
changeset 1:16f1f3e2de42 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 02d0ae7ac02425ef454d2e42a0513887596a3b4d"
author | iuc |
---|---|
date | Wed, 21 Apr 2021 09:10:46 +0000 |
parents | 9c620a950d3a |
children | cd0874854f51 |
files | fasta-stats.pl fasta-stats.xml test-data/ng50_out.txt |
diffstat | 3 files changed, 63 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/fasta-stats.pl Thu Nov 22 04:16:35 2018 -0500 +++ b/fasta-stats.pl Wed Apr 21 09:10:46 2021 +0000 @@ -8,6 +8,16 @@ use warnings; use List::Util qw(sum min max); + +#Parameters +my $file = shift; +my $calc_ng50 = 0; +my $genome_size = 0; +if (scalar(@ARGV) > 0){ + $genome_size = $ARGV[0]; + $calc_ng50 = 1; +} + # stat storage my $n=0; @@ -17,7 +27,10 @@ # MAIN LOOP collecting sequences -while (my $line = <ARGV>) { +#open the file first +open IN, $file or die{ "Couldn't open $file for reading\n$!" }; + +while (my $line = <IN>) { chomp $line; if ($line =~ m/^\s*>/) { process($seq) if $n; @@ -46,17 +59,15 @@ $stat{'len_max'} = $len[-1]; $stat{'len_median'} = $len[int(@len/2)]; $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); + # calculate n50 - - $stat{'len_N50'} = 0; - my $cum=0; my $thresh = int 0.5 * $stat{'num_bp'}; - for my $i (0 .. $#len) { - $cum += $len[$i]; - if ($cum >= $thresh) { - $stat{'len_N50'} = $len[$i]; - last; - } + $stat{'len_N50'} = &calc_x50(@len, $thresh); + + #calculate NG50 + if ($calc_ng50) { + my $thresh = int 0.5 * $genome_size * 1000000; + $stat{'len_NG50'} = &calc_x50(@len, $thresh); } } @@ -87,3 +98,18 @@ push @len, length($s); } +# N50/NG50 calculation sub + +sub calc_x50{ + my @x = shift; + my $thresh = shift; + my $cum=0; + for my $i (0 .. $#x) { + $cum += $x[$i]; + if ($cum >= $thresh) { + return $x[$i]; + } + } + return 0; +} +
--- a/fasta-stats.xml Thu Nov 22 04:16:35 2018 -0500 +++ b/fasta-stats.xml Wed Apr 21 09:10:46 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="fasta-stats" name="Fasta Statistics" version="1.0.1"> +<tool id="fasta-stats" name="Fasta Statistics" version="1.0.2"> <description>Display summary statistics for a fasta file.</description> <requirements> <requirement type="package" version="5.26">perl</requirement> @@ -6,11 +6,15 @@ <command detect_errors="exit_code"><![CDATA[ perl '${__tool_directory__}/fasta-stats.pl' '$dataset' + #if $genome_size: + $genome_size + #end if > '$stats' ]]> </command> <inputs> <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/> + <param name="genome_size" type="float" optional="True" label="Genome size estimate (optional)" help="Estimate of the genome size in megabases (MB). If specified, NG50 will be calculated."/> </inputs> <outputs> <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/> @@ -20,6 +24,11 @@ <param name="dataset" value="test.fasta"/> <output name="stats" file="test_out.txt"/> </test> + <test> + <param name="dataset" value="test.fasta"/> + <param name="genome_size" value="5.0"/> + <output name="stats" file="ng50_out.txt"/> + </test> </tests> <help> **Fasta Stats** @@ -36,6 +45,8 @@ GC content in % + If an optional genome size estimate is specified, then the NG50 length will also be calculated. + ------ Inputs:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ng50_out.txt Wed Apr 21 09:10:46 2021 +0000 @@ -0,0 +1,15 @@ +GC_content 52.0 +len_N50 194780 +len_NG50 0 +len_max 194780 +len_mean 194780 +len_median 194780 +len_min 194780 +num_A 46297 +num_C 50626 +num_G 50678 +num_N 0 +num_T 47179 +num_bp 194780 +num_bp_not_N 194780 +num_seq 1