Mercurial > repos > iuc > fasta_stats
comparison fasta-stats.pl @ 4:0dbb995c7d35 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 50f5cce5a8c11001e2c59600a2b99a4243b6d06f"
author | iuc |
---|---|
date | Thu, 18 Nov 2021 20:56:57 +0000 |
parents | 56022eb50bbd |
children |
comparison
equal
deleted
inserted
replaced
3:56022eb50bbd | 4:0dbb995c7d35 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 # fasta-stats | |
4 # written by torsten.seemann@monash.edu | |
5 # oct 2012 | |
6 | |
7 use strict; | |
8 use warnings; | |
9 use List::Util qw(sum min max); | |
10 | |
11 | |
12 #Parameters | |
13 my $file = shift; | |
14 my $calc_ng50 = 0; | |
15 my $genome_size = 0; | |
16 if (scalar(@ARGV) > 0){ | |
17 $genome_size = $ARGV[0]; | |
18 $calc_ng50 = 1; | |
19 } | |
20 | |
21 # stat storage | |
22 | |
23 my $n=0; | |
24 my $seq = ''; | |
25 my %stat; | |
26 my @len; | |
27 | |
28 # MAIN LOOP collecting sequences | |
29 | |
30 #open the file first | |
31 open IN, $file or die{ "Couldn't open $file for reading\n$!" }; | |
32 | |
33 while (my $line = <IN>) { | |
34 chomp $line; | |
35 if ($line =~ m/^\s*>/) { | |
36 process($seq) if $n; | |
37 $n++; | |
38 $seq=''; | |
39 } | |
40 else { | |
41 $seq .= $line; | |
42 } | |
43 } | |
44 | |
45 process($seq) if $n; | |
46 | |
47 # sort length array | |
48 # (should use hash here for efficiency with huge no of short reads?) | |
49 | |
50 @len = sort { $b <=> $a } @len; | |
51 | |
52 # compute more stats | |
53 | |
54 $stat{'num_seq'} = scalar(@len); | |
55 | |
56 if (@len) { | |
57 $stat{'num_bp'} = sum(@len); | |
58 $stat{'len_min'} = $len[-1]; | |
59 $stat{'len_max'} = $len[0]; | |
60 $stat{'len_median'} = $len[int(@len/2)]; | |
61 $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); | |
62 | |
63 # calculate n50 | |
64 my $thresh = int 0.5 * $stat{'num_bp'}; | |
65 ($stat{'len_N50'}, $stat{'L50'}) = &calc_x50(\@len, $thresh); | |
66 | |
67 #calculate n90 | |
68 my $thresh = int 0.9 * $stat{'num_bp'}; | |
69 ($stat{'len_N90'}, $stat{'L90'}) = &calc_x50(\@len, $thresh); | |
70 | |
71 #calculate NG50 | |
72 if ($calc_ng50) { | |
73 my $thresh = int 0.5 * $genome_size; | |
74 ($stat{'len_NG50'}, $stat{'LG50'}) = &calc_x50(\@len, $thresh); | |
75 } | |
76 } | |
77 | |
78 #calculate GC content | |
79 $stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'}; | |
80 $stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100; | |
81 | |
82 # print stats as .tsv | |
83 | |
84 for my $name (sort keys %stat) { | |
85 if ($name =~ m/GC_content/){ | |
86 printf "%s\t%0.1f\n", $name, $stat{$name}; | |
87 } else { | |
88 printf "%s\t%s\n", $name, $stat{$name}; | |
89 } | |
90 } | |
91 | |
92 # run for each sequence | |
93 | |
94 sub process { | |
95 my($s) = @_; | |
96 # base composition | |
97 for my $x (qw(A G T C N)) { | |
98 my $count = $s =~ s/$x/$x/gi; | |
99 $stat{"num_$x"} += $count; | |
100 } | |
101 # keep list of all lengths encountered | |
102 push @len, length($s); | |
103 } | |
104 | |
105 # N50/NG50 calculation sub | |
106 | |
107 sub calc_x50{ | |
108 my $ref = shift; | |
109 my @x = @$ref; | |
110 my $thresh = shift; | |
111 my $cum=0; | |
112 for my $i (0 .. $#x) { | |
113 $cum += $x[$i]; | |
114 if ($cum >= $thresh) { | |
115 return $x[$i], $i+1; | |
116 } | |
117 } | |
118 return (0,0); | |
119 } | |
120 |