comparison fastx_toolkit-0.0.6/scripts/fasta_clipping_histogram.pl @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 #!/usr/bin/perl
2
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
15 #
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19 use strict;
20 use warnings;
21 use GD::Graph::bars;
22 use Data::Dumper;
23 use PerlIO::gzip;
24
25 if (scalar @ARGV==0) {
26 print<<END;
27
28 Create a Linker Clipping Information Histogram
29
30 usage: $0 INPUT_FILE.FA OUTPUT_FILE.PNG
31
32 INPUT_FILE.FA = input file (in FASTA format, can be GZIPped)
33 OUTPUT_FILE.PNG = histogram image
34
35 END
36 exit 0;
37 }
38
39 #
40 # Read parameters
41 #
42 open(IN, "<:gzip(autopop)", "$ARGV[0]") or die "Cannot open input file $ARGV[0]\n";
43 open(OUT, ">$ARGV[1]") or die "Cannot create output file $ARGV[1]\n";
44 binmode OUT;
45
46 my %histogram ;
47
48 while (my $name = <IN>) {
49 my $sequence = <IN> ;
50 chomp $sequence;
51
52 my $sequence_length = length($sequence);
53
54 my $count;
55
56 if ( index($name, "-")==-1 ) {
57 #Assume this file is not collapsed, just count each seqeunce as 1
58 $count = 1 ;
59 } else {
60 #Assume file is collapsed (that is - sequence-ID has two numbers with a separating dash)
61 (undef, $count) = $name =~ /^\>(\d+)\-(\d+)$/ ;
62
63 # If the match failed, treat this fasta as not collapsed;
64 $count = 1 if not defined $count ;
65 }
66
67 $histogram{$sequence_length} += $count ;
68 }
69
70 #Textual Output
71 if (0) {
72 print "Length\tCount\n";
73 foreach my $length_key ( sort { $a <=> $b } keys %histogram ) {
74 print $length_key,"\t", $histogram{$length_key},"\n";
75 }
76 exit 0;
77 }
78
79 ## Build the data as required by GD::Graph::bars.
80 ## Data list has two items (each item is itself a list)
81 ## 1. a list of x-axis labels (these are the keys from the histogram)
82 ## 2. a list of values
83 my @data = (
84 [ sort { $a <=> $b } keys %histogram ],
85 [ map { $histogram{$_} } sort { $a <=> $b } keys %histogram ] ) ;
86
87 my $graph = new GD::Graph::bars (1000,800);
88
89 $graph->set(
90 x_label => 'Length',
91 y_label => 'Amount',
92 title => 'Sequences lengths Distribution (after clipping)',
93 bar_spacing => 10,
94 transparent => 0,
95 t_margin => 10,
96 y_tick_number => 20,
97 y_long_ticks => 1,
98 ) or die $graph->error;
99
100 $graph->plot(\@data) or die $graph->error;
101 print OUT $graph->gd->png;
102
103 close IN;
104 close OUT;