diff fastx_toolkit-0.0.6/scripts/fasta_clipping_histogram.pl @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_toolkit-0.0.6/scripts/fasta_clipping_histogram.pl	Thu Aug 14 04:52:17 2014 -0400
@@ -0,0 +1,104 @@
+#!/usr/bin/perl
+
+#    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+#    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as
+#   published by the Free Software Foundation, either version 3 of the
+#   License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use strict;
+use warnings;
+use GD::Graph::bars;
+use Data::Dumper;
+use PerlIO::gzip;
+
+if (scalar @ARGV==0) {
+	print<<END;
+	
+Create a Linker Clipping Information Histogram
+
+usage: $0 INPUT_FILE.FA OUTPUT_FILE.PNG
+
+	INPUT_FILE.FA   = input file (in FASTA format, can be GZIPped)
+	OUTPUT_FILE.PNG = histogram image
+
+END
+	exit 0;
+}
+
+#
+# Read parameters
+#
+open(IN, "<:gzip(autopop)", "$ARGV[0]") or die "Cannot open input file $ARGV[0]\n";
+open(OUT, ">$ARGV[1]") or die "Cannot create output file $ARGV[1]\n";
+binmode OUT;
+
+my %histogram ;
+
+while (my $name = <IN>) {
+	my $sequence = <IN> ;
+	chomp $sequence;
+	
+	my $sequence_length = length($sequence);
+	
+	my $count;
+
+	if ( index($name, "-")==-1 ) {
+		#Assume this file is not collapsed, just count each seqeunce as 1
+		$count = 1 ;
+	} else  {
+		#Assume file is collapsed (that is - sequence-ID has two numbers with a separating dash)
+		(undef, $count) = $name =~ /^\>(\d+)\-(\d+)$/ ;
+
+		# If the match failed, treat this fasta as not collapsed;
+		$count = 1 if not defined $count ;
+	}
+	
+	$histogram{$sequence_length} += $count ;
+}
+
+#Textual Output
+if (0) {
+	print "Length\tCount\n";
+	foreach my $length_key ( sort { $a <=> $b } keys %histogram ) {
+		print $length_key,"\t", $histogram{$length_key},"\n";
+	}
+	exit 0;
+}
+
+## Build the data as required by GD::Graph::bars.
+## Data list has two items (each item is itself a list)
+##   1. a list of x-axis labels (these are the keys from the histogram)
+##   2. a list of values
+my @data = (
+	[ sort { $a <=> $b } keys %histogram ],
+	[ map { $histogram{$_} } sort { $a <=> $b } keys %histogram ] ) ;
+
+my $graph = new GD::Graph::bars (1000,800);
+
+$graph->set(
+	x_label => 'Length',
+	y_label => 'Amount',
+	title => 'Sequences lengths Distribution (after clipping)',
+	bar_spacing => 10,
+	transparent => 0,
+	t_margin => 10,
+	y_tick_number => 20,
+	y_long_ticks => 1,
+	)  or die $graph->error;
+
+$graph->plot(\@data) or die $graph->error;
+print OUT $graph->gd->png;
+
+close IN;
+close OUT;