3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
|
|
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
|
|
5 #
|
|
6 # This program is free software: you can redistribute it and/or modify
|
|
7 # it under the terms of the GNU Affero General Public License as
|
|
8 # published by the Free Software Foundation, either version 3 of the
|
|
9 # License, or (at your option) any later version.
|
|
10 #
|
|
11 # This program is distributed in the hope that it will be useful,
|
|
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 # GNU Affero General Public License for more details.
|
|
15 #
|
|
16 # You should have received a copy of the GNU Affero General Public License
|
|
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
18
|
|
19 use strict;
|
|
20 use warnings;
|
|
21 use GD::Graph::bars;
|
|
22 use Data::Dumper;
|
|
23 use PerlIO::gzip;
|
|
24
|
|
25 if (scalar @ARGV==0) {
|
|
26 print<<END;
|
|
27
|
|
28 Create a Linker Clipping Information Histogram
|
|
29
|
|
30 usage: $0 INPUT_FILE.FA OUTPUT_FILE.PNG
|
|
31
|
|
32 INPUT_FILE.FA = input file (in FASTA format, can be GZIPped)
|
|
33 OUTPUT_FILE.PNG = histogram image
|
|
34
|
|
35 END
|
|
36 exit 0;
|
|
37 }
|
|
38
|
|
39 #
|
|
40 # Read parameters
|
|
41 #
|
|
42 open(IN, "<:gzip(autopop)", "$ARGV[0]") or die "Cannot open input file $ARGV[0]\n";
|
|
43 open(OUT, ">$ARGV[1]") or die "Cannot create output file $ARGV[1]\n";
|
|
44 binmode OUT;
|
|
45
|
|
46 my %histogram ;
|
|
47
|
|
48 while (my $name = <IN>) {
|
|
49 my $sequence = <IN> ;
|
|
50 chomp $sequence;
|
|
51
|
|
52 my $sequence_length = length($sequence);
|
|
53
|
|
54 my $count;
|
|
55
|
|
56 if ( index($name, "-")==-1 ) {
|
|
57 #Assume this file is not collapsed, just count each seqeunce as 1
|
|
58 $count = 1 ;
|
|
59 } else {
|
|
60 #Assume file is collapsed (that is - sequence-ID has two numbers with a separating dash)
|
|
61 (undef, $count) = $name =~ /^\>(\d+)\-(\d+)$/ ;
|
|
62
|
|
63 # If the match failed, treat this fasta as not collapsed;
|
|
64 $count = 1 if not defined $count ;
|
|
65 }
|
|
66
|
|
67 $histogram{$sequence_length} += $count ;
|
|
68 }
|
|
69
|
|
70 #Textual Output
|
|
71 if (0) {
|
|
72 print "Length\tCount\n";
|
|
73 foreach my $length_key ( sort { $a <=> $b } keys %histogram ) {
|
|
74 print $length_key,"\t", $histogram{$length_key},"\n";
|
|
75 }
|
|
76 exit 0;
|
|
77 }
|
|
78
|
|
79 ## Build the data as required by GD::Graph::bars.
|
|
80 ## Data list has two items (each item is itself a list)
|
|
81 ## 1. a list of x-axis labels (these are the keys from the histogram)
|
|
82 ## 2. a list of values
|
|
83 my @data = (
|
|
84 [ sort { $a <=> $b } keys %histogram ],
|
|
85 [ map { $histogram{$_} } sort { $a <=> $b } keys %histogram ] ) ;
|
|
86
|
|
87 my $graph = new GD::Graph::bars (1000,800);
|
|
88
|
|
89 $graph->set(
|
|
90 x_label => 'Length',
|
|
91 y_label => 'Amount',
|
|
92 title => 'Sequences lengths Distribution (after clipping)',
|
|
93 bar_spacing => 10,
|
|
94 transparent => 0,
|
|
95 t_margin => 10,
|
|
96 y_tick_number => 20,
|
|
97 y_long_ticks => 1,
|
|
98 ) or die $graph->error;
|
|
99
|
|
100 $graph->plot(\@data) or die $graph->error;
|
|
101 print OUT $graph->gd->png;
|
|
102
|
|
103 close IN;
|
|
104 close OUT;
|