diff fasta-stats.pl @ 4:0dbb995c7d35 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 50f5cce5a8c11001e2c59600a2b99a4243b6d06f"
author iuc
date Thu, 18 Nov 2021 20:56:57 +0000
parents 56022eb50bbd
children
line wrap: on
line diff
--- a/fasta-stats.pl	Mon Jul 05 13:36:26 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-#!/usr/bin/env perl
-
-# fasta-stats
-# written by torsten.seemann@monash.edu 
-# oct 2012
-
-use strict;
-use warnings;
-use List::Util qw(sum min max);
-
-
-#Parameters
-my $file = shift;
-my $calc_ng50 = 0;
-my $genome_size = 0;
-if (scalar(@ARGV) > 0){
-  $genome_size = $ARGV[0];
-  $calc_ng50 = 1;
-}
-
-# stat storage
-
-my $n=0;
-my $seq = '';
-my %stat;
-my @len;
-
-# MAIN LOOP collecting sequences
-
-#open the file first
-open IN, $file or die{ "Couldn't open $file for reading\n$!" };
-
-while (my $line = <IN>) {
-  chomp $line;
-  if ($line =~ m/^\s*>/) {
-    process($seq) if $n;
-    $n++;
-    $seq='';
-  }
-  else {
-    $seq .= $line;
-  }  
-}
-
-process($seq) if $n;
-
-# sort length array 
-# (should use hash here for efficiency with huge no of short reads?)
-
-@len = sort { $b <=> $a } @len;
-
-# compute more stats
-
-$stat{'num_seq'} = scalar(@len);
-
-if (@len) {
-  $stat{'num_bp'} = sum(@len);
-  $stat{'len_min'} = $len[-1];
-  $stat{'len_max'} = $len[0];
-  $stat{'len_median'} = $len[int(@len/2)];
-  $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); 
-  
-  # calculate n50
-  my $thresh = int 0.5 * $stat{'num_bp'};
-  ($stat{'len_N50'}, $stat{'L50'}) = &calc_x50(\@len, $thresh);
-
-  #calculate n90
-  my $thresh = int 0.9 * $stat{'num_bp'};
-  ($stat{'len_N90'}, $stat{'L90'}) = &calc_x50(\@len, $thresh);
-  
-  #calculate NG50
-  if ($calc_ng50) {
-    my $thresh = int 0.5 * $genome_size;
-    ($stat{'len_NG50'}, $stat{'LG50'}) = &calc_x50(\@len, $thresh);
-  }
-}
-
-#calculate GC content
-$stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'};
-$stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100;
-
-# print stats as .tsv
-
-for my $name (sort keys %stat) {
-    if ($name =~ m/GC_content/){
-        printf "%s\t%0.1f\n", $name, $stat{$name};
-    } else {
-        printf "%s\t%s\n", $name, $stat{$name};
-    }
-}
-
-# run for each sequence
-
-sub process {
-  my($s) = @_;
-  # base composition
-  for my $x (qw(A G T C N)) {
-    my $count = $s =~ s/$x/$x/gi;
-    $stat{"num_$x"} += $count;
-  }
-  # keep list of all lengths encountered
-  push @len, length($s);    
-}
-
-# N50/NG50 calculation sub
-
-sub calc_x50{
-  my $ref = shift;
-  my @x = @$ref;
-  my $thresh = shift;
-  my $cum=0;
-  for my $i (0 .. $#x) {
-    $cum += $x[$i];
-    if ($cum >= $thresh) {
-      return $x[$i], $i+1;
-    }
-  }
-  return (0,0);
-}
-