view fastaptamer_count @ 1:b9b2da3fa7d7 draft default tip

Uploaded
author fastaptamer
date Tue, 10 Feb 2015 14:23:07 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env perl

## Last Modified January 19th, 2015 22:22 CST

## Citation:
## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. 
## FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of 
## Combinatorial Selections. Molecular Therapy — Nucleic Acids. 2015.
## DOI: 10.1038/mtna.2015.4

## Distributed under GNU General Public License v3

use Getopt::Long;    ## Core Perl module for command line arguments/options

my $start = time;    ## Starts program execution timer
my $input_fh;        ## Input file handle
my $output_fh;       ## Output file handle
my $quiet;           ## true/false variable for summary report supression
my $help;            ## true/false variable for help screen
my $version;         ## true/false variable for version screen

                                           ## Take command line arguments for...
GetOptions (    "input=s" => \$input_fh,        ## ...input file
                "output=s" => \$output_fh,      ## ...output file
                "quiet" => \$quiet,             ## ...supressing std. out
                "version" => \$version,         ## ...showing version screen
                "help" => \$help);              ## ...showing help screen
		
if (defined $help){                           ## Print help screen if -h is true
    print <<"HELP";
	
--------------------------------------------------------------------------------
                               FASTAptamer-Count
--------------------------------------------------------------------------------

Usage: fastaptamer_count [-h] [-q] [-v] [-i INFILE] [-o OUTFILE] 

    [-h]            = Help screen.
    [-q]            = Suppress STDOUT of run report.
    [-v]            = Display version.
    [-i INFILE]     = FASTQ input file. REQUIRED.
    [-o OUTFILE]    = FASTA output file. REQUIRED.

FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit.  For a given
.FASTQ input file, FASTAptamer-Count will determine the number of times each se-
quence was read, rank and sort sequences by decreasing total reads, and normali-
ze sequence frequency to reads per million. Output is generated as a non-redund-
ant FASTA file in the following format for each sequence:

        >RANK-READS-RPM
        SEQUENCE
		
Summary report (total reads, unique reads, and execution time) is displayed as 
STDOUT at program completion unless [-q] is invoked.

HELP
exit;
}

if (defined $version){                     ## Print version screen if -v is true
    print <<"VERSION";
	
FASTAptamer v1.0.2
	
VERSION
exit;
}

##########################################	
## Open input file or exit with warning  #
##########################################

open (INPUT, '<', $input_fh) or die		
"\nCould not open input file or no input file was specified.\n
See help documentation [-h], README, or User's Guide for program usage.\n";

##########################################
## Open output file or exit with warning #
##########################################

open (OUTPUT, '>', $output_fh) or die		
"\nCould not open output file or no output file was specified.\n
See help documentation [-h], README, or User's Guide for program usage.\n";

###############################################
## Create a hash for sequence reads where key #
## is the sequence and value is read count    #
###############################################

my %sequence_reads;	

my $entries = 0;    ## Counts the number of entries processed

$/ = "@"; ## Change default input record separator to read FASTQ formatted file	

################################################################################
## Read input file and for each entry, add sequence or increment value in hash #
################################################################################

while (<INPUT>){
    if ($_ =~ /.+\n(\S+)\n.+\n.+/){
        my $sequence = $1;
        $sequence_reads{$sequence} += 1;
        $entries++;
    }
}

close INPUT;

############################################
## Sort hash by decreasing read count by   #
## converting to arrays of values and keys #
############################################

my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads);
my @vals = @sequence_reads{@keys};

##################################################################################
## Using both arrays, print to output the FASTA formatted file containing        #
## the sequence rank, reads, reads per million, and sequence. The extra          #
## scalars here are used to ensure that sequences with equal read counts         #
## receive the same rank value, and that the next untied value is properly       #
## assigned a rank that takes into account the number of tied values before it,  #
## also known as standard competition ranking.                                   #
##################################################################################

my $last_reads_count = $vals[0];
my $rank = 1;

for my $i (0..$#keys){
    my $current_reads_count = $vals[$i];
    if ($current_reads_count < $last_reads_count){
        $rank = $i + 1; 
        print OUTPUT ">$rank-$vals[$i]-";
        printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
        print OUTPUT "\n";
        print OUTPUT "$keys[$i]\n";
		
    }
    elsif ($current_reads_count == $last_reads_count){
        print OUTPUT ">$rank-$vals[$i]-";
        printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
        print OUTPUT "\n";
        print OUTPUT "$keys[$i]\n";
    }
    $last_reads_count = $vals[$i];
	
}

close OUTPUT;

##################################################################################
## Unless the -q option is invoked, the following statistics are printed as      #
## standard output.                                                              #
##################################################################################

unless ($quiet){
    print "\n$entries total sequences.  " . ($#keys + 1) . " unique sequences.\n";
    print "Input file: \"$input_fh\".\n";
    print "Output file: \"$output_fh\".\n";

    my $duration = time - $start;
    print "Execution time: $duration s.\n";
}

exit;