comparison fastaptamer_count @ 1:b9b2da3fa7d7 draft default tip

Uploaded
author fastaptamer
date Tue, 10 Feb 2015 14:23:07 -0500
parents
children
comparison
equal deleted inserted replaced
0:2bed8ca187a1 1:b9b2da3fa7d7
1 #!/usr/bin/env perl
2
3 ## Last Modified January 19th, 2015 22:22 CST
4
5 ## Citation:
6 ## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
7 ## FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
8 ## Combinatorial Selections. Molecular Therapy — Nucleic Acids. 2015.
9 ## DOI: 10.1038/mtna.2015.4
10
11 ## Distributed under GNU General Public License v3
12
13 use Getopt::Long; ## Core Perl module for command line arguments/options
14
15 my $start = time; ## Starts program execution timer
16 my $input_fh; ## Input file handle
17 my $output_fh; ## Output file handle
18 my $quiet; ## true/false variable for summary report supression
19 my $help; ## true/false variable for help screen
20 my $version; ## true/false variable for version screen
21
22 ## Take command line arguments for...
23 GetOptions ( "input=s" => \$input_fh, ## ...input file
24 "output=s" => \$output_fh, ## ...output file
25 "quiet" => \$quiet, ## ...supressing std. out
26 "version" => \$version, ## ...showing version screen
27 "help" => \$help); ## ...showing help screen
28
29 if (defined $help){ ## Print help screen if -h is true
30 print <<"HELP";
31
32 --------------------------------------------------------------------------------
33 FASTAptamer-Count
34 --------------------------------------------------------------------------------
35
36 Usage: fastaptamer_count [-h] [-q] [-v] [-i INFILE] [-o OUTFILE]
37
38 [-h] = Help screen.
39 [-q] = Suppress STDOUT of run report.
40 [-v] = Display version.
41 [-i INFILE] = FASTQ input file. REQUIRED.
42 [-o OUTFILE] = FASTA output file. REQUIRED.
43
44 FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit. For a given
45 .FASTQ input file, FASTAptamer-Count will determine the number of times each se-
46 quence was read, rank and sort sequences by decreasing total reads, and normali-
47 ze sequence frequency to reads per million. Output is generated as a non-redund-
48 ant FASTA file in the following format for each sequence:
49
50 >RANK-READS-RPM
51 SEQUENCE
52
53 Summary report (total reads, unique reads, and execution time) is displayed as
54 STDOUT at program completion unless [-q] is invoked.
55
56 HELP
57 exit;
58 }
59
60 if (defined $version){ ## Print version screen if -v is true
61 print <<"VERSION";
62
63 FASTAptamer v1.0.2
64
65 VERSION
66 exit;
67 }
68
69 ##########################################
70 ## Open input file or exit with warning #
71 ##########################################
72
73 open (INPUT, '<', $input_fh) or die
74 "\nCould not open input file or no input file was specified.\n
75 See help documentation [-h], README, or User's Guide for program usage.\n";
76
77 ##########################################
78 ## Open output file or exit with warning #
79 ##########################################
80
81 open (OUTPUT, '>', $output_fh) or die
82 "\nCould not open output file or no output file was specified.\n
83 See help documentation [-h], README, or User's Guide for program usage.\n";
84
85 ###############################################
86 ## Create a hash for sequence reads where key #
87 ## is the sequence and value is read count #
88 ###############################################
89
90 my %sequence_reads;
91
92 my $entries = 0; ## Counts the number of entries processed
93
94 $/ = "@"; ## Change default input record separator to read FASTQ formatted file
95
96 ################################################################################
97 ## Read input file and for each entry, add sequence or increment value in hash #
98 ################################################################################
99
100 while (<INPUT>){
101 if ($_ =~ /.+\n(\S+)\n.+\n.+/){
102 my $sequence = $1;
103 $sequence_reads{$sequence} += 1;
104 $entries++;
105 }
106 }
107
108 close INPUT;
109
110 ############################################
111 ## Sort hash by decreasing read count by #
112 ## converting to arrays of values and keys #
113 ############################################
114
115 my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads);
116 my @vals = @sequence_reads{@keys};
117
118 ##################################################################################
119 ## Using both arrays, print to output the FASTA formatted file containing #
120 ## the sequence rank, reads, reads per million, and sequence. The extra #
121 ## scalars here are used to ensure that sequences with equal read counts #
122 ## receive the same rank value, and that the next untied value is properly #
123 ## assigned a rank that takes into account the number of tied values before it, #
124 ## also known as standard competition ranking. #
125 ##################################################################################
126
127 my $last_reads_count = $vals[0];
128 my $rank = 1;
129
130 for my $i (0..$#keys){
131 my $current_reads_count = $vals[$i];
132 if ($current_reads_count < $last_reads_count){
133 $rank = $i + 1;
134 print OUTPUT ">$rank-$vals[$i]-";
135 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
136 print OUTPUT "\n";
137 print OUTPUT "$keys[$i]\n";
138
139 }
140 elsif ($current_reads_count == $last_reads_count){
141 print OUTPUT ">$rank-$vals[$i]-";
142 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
143 print OUTPUT "\n";
144 print OUTPUT "$keys[$i]\n";
145 }
146 $last_reads_count = $vals[$i];
147
148 }
149
150 close OUTPUT;
151
152 ##################################################################################
153 ## Unless the -q option is invoked, the following statistics are printed as #
154 ## standard output. #
155 ##################################################################################
156
157 unless ($quiet){
158 print "\n$entries total sequences. " . ($#keys + 1) . " unique sequences.\n";
159 print "Input file: \"$input_fh\".\n";
160 print "Output file: \"$output_fh\".\n";
161
162 my $duration = time - $start;
163 print "Execution time: $duration s.\n";
164 }
165
166 exit;