changeset 1:b9b2da3fa7d7 draft default tip

Uploaded
author fastaptamer
date Tue, 10 Feb 2015 14:23:07 -0500 (2015-02-10)
parents 2bed8ca187a1
children
files fastaptamer_count
diffstat 1 files changed, 166 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastaptamer_count	Tue Feb 10 14:23:07 2015 -0500
@@ -0,0 +1,166 @@
+#!/usr/bin/env perl
+
+## Last Modified January 19th, 2015 22:22 CST
+
+## Citation:
+## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. 
+## FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of 
+## Combinatorial Selections. Molecular Therapy — Nucleic Acids. 2015.
+## DOI: 10.1038/mtna.2015.4
+
+## Distributed under GNU General Public License v3
+
+use Getopt::Long;    ## Core Perl module for command line arguments/options
+
+my $start = time;    ## Starts program execution timer
+my $input_fh;        ## Input file handle
+my $output_fh;       ## Output file handle
+my $quiet;           ## true/false variable for summary report supression
+my $help;            ## true/false variable for help screen
+my $version;         ## true/false variable for version screen
+
+                                           ## Take command line arguments for...
+GetOptions (    "input=s" => \$input_fh,        ## ...input file
+                "output=s" => \$output_fh,      ## ...output file
+                "quiet" => \$quiet,             ## ...supressing std. out
+                "version" => \$version,         ## ...showing version screen
+                "help" => \$help);              ## ...showing help screen
+		
+if (defined $help){                           ## Print help screen if -h is true
+    print <<"HELP";
+	
+--------------------------------------------------------------------------------
+                               FASTAptamer-Count
+--------------------------------------------------------------------------------
+
+Usage: fastaptamer_count [-h] [-q] [-v] [-i INFILE] [-o OUTFILE] 
+
+    [-h]            = Help screen.
+    [-q]            = Suppress STDOUT of run report.
+    [-v]            = Display version.
+    [-i INFILE]     = FASTQ input file. REQUIRED.
+    [-o OUTFILE]    = FASTA output file. REQUIRED.
+
+FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit.  For a given
+.FASTQ input file, FASTAptamer-Count will determine the number of times each se-
+quence was read, rank and sort sequences by decreasing total reads, and normali-
+ze sequence frequency to reads per million. Output is generated as a non-redund-
+ant FASTA file in the following format for each sequence:
+
+        >RANK-READS-RPM
+        SEQUENCE
+		
+Summary report (total reads, unique reads, and execution time) is displayed as 
+STDOUT at program completion unless [-q] is invoked.
+
+HELP
+exit;
+}
+
+if (defined $version){                     ## Print version screen if -v is true
+    print <<"VERSION";
+	
+FASTAptamer v1.0.2
+	
+VERSION
+exit;
+}
+
+##########################################	
+## Open input file or exit with warning  #
+##########################################
+
+open (INPUT, '<', $input_fh) or die		
+"\nCould not open input file or no input file was specified.\n
+See help documentation [-h], README, or User's Guide for program usage.\n";
+
+##########################################
+## Open output file or exit with warning #
+##########################################
+
+open (OUTPUT, '>', $output_fh) or die		
+"\nCould not open output file or no output file was specified.\n
+See help documentation [-h], README, or User's Guide for program usage.\n";
+
+###############################################
+## Create a hash for sequence reads where key #
+## is the sequence and value is read count    #
+###############################################
+
+my %sequence_reads;	
+
+my $entries = 0;    ## Counts the number of entries processed
+
+$/ = "@"; ## Change default input record separator to read FASTQ formatted file	
+
+################################################################################
+## Read input file and for each entry, add sequence or increment value in hash #
+################################################################################
+
+while (<INPUT>){
+    if ($_ =~ /.+\n(\S+)\n.+\n.+/){
+        my $sequence = $1;
+        $sequence_reads{$sequence} += 1;
+        $entries++;
+    }
+}
+
+close INPUT;
+
+############################################
+## Sort hash by decreasing read count by   #
+## converting to arrays of values and keys #
+############################################
+
+my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads);
+my @vals = @sequence_reads{@keys};
+
+##################################################################################
+## Using both arrays, print to output the FASTA formatted file containing        #
+## the sequence rank, reads, reads per million, and sequence. The extra          #
+## scalars here are used to ensure that sequences with equal read counts         #
+## receive the same rank value, and that the next untied value is properly       #
+## assigned a rank that takes into account the number of tied values before it,  #
+## also known as standard competition ranking.                                   #
+##################################################################################
+
+my $last_reads_count = $vals[0];
+my $rank = 1;
+
+for my $i (0..$#keys){
+    my $current_reads_count = $vals[$i];
+    if ($current_reads_count < $last_reads_count){
+        $rank = $i + 1; 
+        print OUTPUT ">$rank-$vals[$i]-";
+        printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
+        print OUTPUT "\n";
+        print OUTPUT "$keys[$i]\n";
+		
+    }
+    elsif ($current_reads_count == $last_reads_count){
+        print OUTPUT ">$rank-$vals[$i]-";
+        printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
+        print OUTPUT "\n";
+        print OUTPUT "$keys[$i]\n";
+    }
+    $last_reads_count = $vals[$i];
+	
+}
+
+close OUTPUT;
+
+##################################################################################
+## Unless the -q option is invoked, the following statistics are printed as      #
+## standard output.                                                              #
+##################################################################################
+
+unless ($quiet){
+    print "\n$entries total sequences.  " . ($#keys + 1) . " unique sequences.\n";
+    print "Input file: \"$input_fh\".\n";
+    print "Output file: \"$output_fh\".\n";
+
+    my $duration = time - $start;
+    print "Execution time: $duration s.\n";
+}
+
+exit;