Mercurial > repos > iuc > fasta_stats
comparison fasta-stats.pl @ 0:9c620a950d3a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
author | iuc |
---|---|
date | Thu, 22 Nov 2018 04:16:35 -0500 |
parents | |
children | 16f1f3e2de42 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9c620a950d3a |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 # fasta-stats | |
4 # written by torsten.seemann@monash.edu | |
5 # oct 2012 | |
6 | |
7 use strict; | |
8 use warnings; | |
9 use List::Util qw(sum min max); | |
10 | |
11 # stat storage | |
12 | |
13 my $n=0; | |
14 my $seq = ''; | |
15 my %stat; | |
16 my @len; | |
17 | |
18 # MAIN LOOP collecting sequences | |
19 | |
20 while (my $line = <ARGV>) { | |
21 chomp $line; | |
22 if ($line =~ m/^\s*>/) { | |
23 process($seq) if $n; | |
24 $n++; | |
25 $seq=''; | |
26 } | |
27 else { | |
28 $seq .= $line; | |
29 } | |
30 } | |
31 | |
32 process($seq) if $n; | |
33 | |
34 # sort length array | |
35 # (should use hash here for efficiency with huge no of short reads?) | |
36 | |
37 @len = sort { $a <=> $b } @len; | |
38 | |
39 # compute more stats | |
40 | |
41 $stat{'num_seq'} = scalar(@len); | |
42 | |
43 if (@len) { | |
44 $stat{'num_bp'} = sum(@len); | |
45 $stat{'len_min'} = $len[0]; | |
46 $stat{'len_max'} = $len[-1]; | |
47 $stat{'len_median'} = $len[int(@len/2)]; | |
48 $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); | |
49 # calculate n50 | |
50 | |
51 $stat{'len_N50'} = 0; | |
52 my $cum=0; | |
53 my $thresh = int 0.5 * $stat{'num_bp'}; | |
54 for my $i (0 .. $#len) { | |
55 $cum += $len[$i]; | |
56 if ($cum >= $thresh) { | |
57 $stat{'len_N50'} = $len[$i]; | |
58 last; | |
59 } | |
60 } | |
61 } | |
62 | |
63 #calculate GC content | |
64 $stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'}; | |
65 $stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100; | |
66 | |
67 # print stats as .tsv | |
68 | |
69 for my $name (sort keys %stat) { | |
70 if ($name =~ m/GC_content/){ | |
71 printf "%s\t%0.1f\n", $name, $stat{$name}; | |
72 } else { | |
73 printf "%s\t%s\n", $name, $stat{$name}; | |
74 } | |
75 } | |
76 | |
77 # run for each sequence | |
78 | |
79 sub process { | |
80 my($s) = @_; | |
81 # base composition | |
82 for my $x (qw(A G T C N)) { | |
83 my $count = $s =~ s/$x/$x/gi; | |
84 $stat{"num_$x"} += $count; | |
85 } | |
86 # keep list of all lengths encountered | |
87 push @len, length($s); | |
88 } | |
89 |