1
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 # fasta-stats
|
|
4 # written by torsten.seemann@monash.edu
|
|
5 # oct 2012
|
|
6
|
|
7 use strict;
|
|
8 use warnings;
|
|
9 use List::Util qw(sum min max);
|
|
10
|
|
11 # stat storage
|
|
12
|
|
13 my $n=0;
|
|
14 my $seq = '';
|
|
15 my %stat;
|
|
16 my @len;
|
|
17
|
|
18 # MAIN LOOP collecting sequences
|
|
19
|
|
20 while (my $line = <ARGV>) {
|
|
21 chomp $line;
|
|
22 if ($line =~ m/^\s*>/) {
|
|
23 process($seq) if $n;
|
|
24 $n++;
|
|
25 $seq='';
|
|
26 }
|
|
27 else {
|
|
28 $seq .= $line;
|
|
29 }
|
|
30 }
|
|
31
|
|
32 process($seq) if $n;
|
|
33
|
|
34 # sort length array
|
|
35 # (should use hash here for efficiency with huge no of short reads?)
|
|
36
|
|
37 @len = sort { $a <=> $b } @len;
|
|
38
|
|
39 # compute more stats
|
|
40
|
|
41 $stat{'num_seq'} = scalar(@len);
|
|
42
|
|
43 if (@len) {
|
|
44 $stat{'num_bp'} = sum(@len);
|
|
45 $stat{'len_min'} = $len[0];
|
|
46 $stat{'len_max'} = $len[-1];
|
|
47 $stat{'len_median'} = $len[int(@len/2)];
|
|
48 $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} );
|
|
49 # calculate n50
|
|
50
|
|
51 $stat{'len_N50'} = 0;
|
|
52 my $cum=0;
|
|
53 my $thresh = int 0.5 * $stat{'num_bp'};
|
|
54 for my $i (0 .. $#len) {
|
|
55 $cum += $len[$i];
|
|
56 if ($cum >= $thresh) {
|
|
57 $stat{'len_N50'} = $len[$i];
|
|
58 last;
|
|
59 }
|
|
60 }
|
|
61 }
|
|
62
|
|
63 #calculate GC content
|
|
64 $stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'};
|
|
65 $stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100;
|
|
66
|
|
67 # print stats as .tsv
|
|
68
|
|
69 for my $name (sort keys %stat) {
|
|
70 if ($name =~ m/GC_content/){
|
|
71 printf "%s\t%0.1f\n", $name, $stat{$name};
|
|
72 } else {
|
|
73 printf "%s\t%s\n", $name, $stat{$name};
|
|
74 }
|
|
75 }
|
|
76
|
|
77 # run for each sequence
|
|
78
|
|
79 sub process {
|
|
80 my($s) = @_;
|
|
81 # base composition
|
|
82 for my $x (qw(A G T C N)) {
|
|
83 my $count = $s =~ s/$x/$x/gi;
|
|
84 $stat{"num_$x"} += $count;
|
|
85 }
|
|
86 # keep list of all lengths encountered
|
|
87 push @len, length($s);
|
|
88 }
|
|
89
|