Mercurial > repos > iuc > fasta_stats
annotate fasta-stats.pl @ 0:9c620a950d3a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
author | iuc |
---|---|
date | Thu, 22 Nov 2018 04:16:35 -0500 |
parents | |
children | 16f1f3e2de42 |
rev | line source |
---|---|
0
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env perl |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
2 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
3 # fasta-stats |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
4 # written by torsten.seemann@monash.edu |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
5 # oct 2012 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
6 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
7 use strict; |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
8 use warnings; |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
9 use List::Util qw(sum min max); |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
10 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
11 # stat storage |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
12 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
# Parser state shared with process() and the statistics code further down.
my $n   = 0;     # number of FASTA header lines seen so far
my $seq = '';    # residues of the record currently being read
my %stat;        # accumulated statistics, keyed by output column name
my @len;         # length of every sequence handed to process()

# MAIN LOOP collecting sequences
#
# Reads FASTA text through the magic ARGV handle (the files named on the
# command line, or STDIN when none are given).  Every line starting with
# optional whitespace and '>' opens a new record; all other lines are
# residue data for the current record.

while (my $line = <ARGV>) {
    if ($line =~ m/^\s*>/) {
        # A new header closes the previous record, if there was one.
        process($seq) if $n;
        $n++;
        $seq = '';
        next;
    }

    # Text in front of the first header belongs to no record; it was never
    # counted, so do not bother buffering it.
    next if !$n;

    # Remove the line terminator together with any other whitespace.  A bare
    # chomp leaves the "\r" of CRLF files (and stray blanks) in the sequence,
    # which would inflate every length-based statistic.
    $line =~ s/\s+//g;
    $seq .= $line;
}

# Flush the final record: no further header follows it to trigger process().
process($seq) if $n;
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
33 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
# sort length array
# (should use hash here for efficiency with huge no of short reads?)

@len = sort { $a <=> $b } @len;

# compute more stats

$stat{'num_seq'} = scalar(@len);

# Length statistics are only defined when at least one sequence was read.
if (@len) {
    $stat{'num_bp'}     = sum(@len);
    $stat{'len_min'}    = $len[0];
    $stat{'len_max'}    = $len[-1];
    # For an even number of sequences this picks the upper of the two
    # middle values rather than averaging them.
    $stat{'len_median'} = $len[int(@len/2)];
    $stat{'len_mean'}   = int( $stat{'num_bp'} / $stat{'num_seq'} );
    $stat{'len_N50'}    = _n50(@len);
}

# calculate GC content
#
# The per-base counters only exist once process() has run, so treat a missing
# counter as 0, and report 0% when there are no A/C/G/T bases at all instead
# of dividing by zero.
my $num_gc = ($stat{'num_G'} || 0) + ($stat{'num_C'} || 0);
$stat{'num_bp_not_N'} = $num_gc + ($stat{'num_A'} || 0) + ($stat{'num_T'} || 0);
$stat{'GC_content'}   = $stat{'num_bp_not_N'}
                      ? $num_gc / $stat{'num_bp_not_N'} * 100
                      : 0;

# Return the N50 of a list of sequence lengths: the largest length L such
# that the sequences of length >= L together contain at least half of all
# bases.  The input may be in any order.  Returns 0 for an empty list.
sub _n50 {
    my @lengths = sort { $b <=> $a } @_;    # longest first
    my $total   = sum(@lengths) || 0;       # sum() yields undef for no input
    my $covered = 0;
    for my $length (@lengths) {
        $covered += $length;
        # Compare doubled values so an odd total is not rounded down.
        return $length if 2 * $covered >= $total;
    }
    return 0;
}
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
66 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
67 # print stats as .tsv |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
68 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
# Emit one "name<TAB>value" line per statistic, ordered by key name.
# GC_content is rendered with one decimal place; every other value is
# written exactly as stored.
foreach my $key (sort keys %stat) {
    my $format = ($key =~ m/GC_content/) ? "%s\t%0.1f\n" : "%s\t%s\n";
    printf($format, $key, $stat{$key});
}
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
76 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
77 # run for each sequence |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
78 |
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
# Record one sequence: add its base counts to the running totals in %stat
# and append its length to @len.
#
#   $s - the residues of a single sequence as one string
#
# Counting is case-insensitive.  Characters other than A, C, G, T and N are
# not tallied in any counter but still contribute to the recorded length.
sub process {
    my ($s) = @_;

    # base composition
    # tr/// with an empty replacement list counts matching characters
    # without modifying the string, unlike a global substitution.
    $stat{'num_A'} += ($s =~ tr/Aa//);
    $stat{'num_G'} += ($s =~ tr/Gg//);
    $stat{'num_T'} += ($s =~ tr/Tt//);
    $stat{'num_C'} += ($s =~ tr/Cc//);
    $stat{'num_N'} += ($s =~ tr/Nn//);

    # keep list of all lengths encountered
    push @len, length($s);
}
9c620a950d3a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff
changeset
|
89 |