Mercurial > repos > iuc > fasta_stats
view fasta-stats.pl @ 2:cd0874854f51 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit adc5e3616c1849551c9a712b651b0d1c6b0e88f1"
author | iuc |
---|---|
date | Mon, 26 Apr 2021 10:01:43 +0000 |
parents | 16f1f3e2de42 |
children | 56022eb50bbd |
line wrap: on
line source
#!/usr/bin/env perl # fasta-stats # written by torsten.seemann@monash.edu # oct 2012 use strict; use warnings; use List::Util qw(sum min max); #Parameters my $file = shift; my $calc_ng50 = 0; my $genome_size = 0; if (scalar(@ARGV) > 0){ $genome_size = $ARGV[0]; $calc_ng50 = 1; } # stat storage my $n=0; my $seq = ''; my %stat; my @len; # MAIN LOOP collecting sequences #open the file first open IN, $file or die{ "Couldn't open $file for reading\n$!" }; while (my $line = <IN>) { chomp $line; if ($line =~ m/^\s*>/) { process($seq) if $n; $n++; $seq=''; } else { $seq .= $line; } } process($seq) if $n; # sort length array # (should use hash here for efficiency with huge no of short reads?) @len = sort { $b <=> $a } @len; # compute more stats $stat{'num_seq'} = scalar(@len); if (@len) { $stat{'num_bp'} = sum(@len); $stat{'len_min'} = $len[0]; $stat{'len_max'} = $len[-1]; $stat{'len_median'} = $len[int(@len/2)]; $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); # calculate n50 my $thresh = int 0.5 * $stat{'num_bp'}; ($stat{'len_N50'}, $stat{'L50'}) = &calc_x50(\@len, $thresh); #calculate NG50 if ($calc_ng50) { my $thresh = int 0.5 * $genome_size; ($stat{'len_NG50'}, $stat{'LG50'}) = &calc_x50(\@len, $thresh); } } #calculate GC content $stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'}; $stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100; # print stats as .tsv for my $name (sort keys %stat) { if ($name =~ m/GC_content/){ printf "%s\t%0.1f\n", $name, $stat{$name}; } else { printf "%s\t%s\n", $name, $stat{$name}; } } # run for each sequence sub process { my($s) = @_; # base composition for my $x (qw(A G T C N)) { my $count = $s =~ s/$x/$x/gi; $stat{"num_$x"} += $count; } # keep list of all lengths encountered push @len, length($s); } # N50/NG50 calculation sub sub calc_x50{ my $ref = shift; my @x = @$ref; my $thresh = shift; my $cum=0; for my $i (0 .. $#x) { $cum += $x[$i]; if ($cum >= $thresh) { return $x[$i], $i+1; } } return (0,0); }