comparison fasta-stats.pl @ 4:0dbb995c7d35 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 50f5cce5a8c11001e2c59600a2b99a4243b6d06f"
author iuc
date Thu, 18 Nov 2021 20:56:57 +0000
parents 56022eb50bbd
children
comparison
equal deleted inserted replaced
3:56022eb50bbd 4:0dbb995c7d35
1 #!/usr/bin/env perl
2
3 # fasta-stats
4 # written by torsten.seemann@monash.edu
5 # oct 2012
6
7 use strict;
8 use warnings;
9 use List::Util qw(sum min max);
10
11
12 #Parameters
13 my $file = shift;
14 my $calc_ng50 = 0;
15 my $genome_size = 0;
16 if (scalar(@ARGV) > 0){
17 $genome_size = $ARGV[0];
18 $calc_ng50 = 1;
19 }
20
21 # stat storage
22
23 my $n=0;
24 my $seq = '';
25 my %stat;
26 my @len;
27
28 # MAIN LOOP collecting sequences
29
30 #open the file first
31 open IN, $file or die{ "Couldn't open $file for reading\n$!" };
32
33 while (my $line = <IN>) {
34 chomp $line;
35 if ($line =~ m/^\s*>/) {
36 process($seq) if $n;
37 $n++;
38 $seq='';
39 }
40 else {
41 $seq .= $line;
42 }
43 }
44
45 process($seq) if $n;
46
47 # sort length array
48 # (should use hash here for efficiency with huge no of short reads?)
49
50 @len = sort { $b <=> $a } @len;
51
52 # compute more stats
53
54 $stat{'num_seq'} = scalar(@len);
55
56 if (@len) {
57 $stat{'num_bp'} = sum(@len);
58 $stat{'len_min'} = $len[-1];
59 $stat{'len_max'} = $len[0];
60 $stat{'len_median'} = $len[int(@len/2)];
61 $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} );
62
63 # calculate n50
64 my $thresh = int 0.5 * $stat{'num_bp'};
65 ($stat{'len_N50'}, $stat{'L50'}) = &calc_x50(\@len, $thresh);
66
67 #calculate n90
68 my $thresh = int 0.9 * $stat{'num_bp'};
69 ($stat{'len_N90'}, $stat{'L90'}) = &calc_x50(\@len, $thresh);
70
71 #calculate NG50
72 if ($calc_ng50) {
73 my $thresh = int 0.5 * $genome_size;
74 ($stat{'len_NG50'}, $stat{'LG50'}) = &calc_x50(\@len, $thresh);
75 }
76 }
77
78 #calculate GC content
79 $stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'};
80 $stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100;
81
82 # print stats as .tsv
83
84 for my $name (sort keys %stat) {
85 if ($name =~ m/GC_content/){
86 printf "%s\t%0.1f\n", $name, $stat{$name};
87 } else {
88 printf "%s\t%s\n", $name, $stat{$name};
89 }
90 }
91
92 # run for each sequence
93
94 sub process {
95 my($s) = @_;
96 # base composition
97 for my $x (qw(A G T C N)) {
98 my $count = $s =~ s/$x/$x/gi;
99 $stat{"num_$x"} += $count;
100 }
101 # keep list of all lengths encountered
102 push @len, length($s);
103 }
104
105 # N50/NG50 calculation sub
106
107 sub calc_x50{
108 my $ref = shift;
109 my @x = @$ref;
110 my $thresh = shift;
111 my $cum=0;
112 for my $i (0 .. $#x) {
113 $cum += $x[$i];
114 if ($cum >= $thresh) {
115 return $x[$i], $i+1;
116 }
117 }
118 return (0,0);
119 }
120