annotate fastaptamer_count @ 1:b9b2da3fa7d7 draft default tip

Uploaded
author fastaptamer
date Tue, 10 Feb 2015 14:23:07 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
1 #!/usr/bin/env perl
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
2
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
3 ## Last Modified January 19th, 2015 22:22 CST
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
4
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
5 ## Citation:
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
6 ## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
7 ## FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
8 ## Combinatorial Selections. Molecular Therapy — Nucleic Acids. 2015.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
9 ## DOI: 10.1038/mtna.2015.4
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
10
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
11 ## Distributed under GNU General Public License v3
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
12
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
13 use Getopt::Long; ## Core Perl module for command line arguments/options
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
14
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
15 my $start = time; ## Starts program execution timer
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
16 my $input_fh; ## Input file handle
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
17 my $output_fh; ## Output file handle
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
18 my $quiet; ## true/false variable for summary report supression
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
19 my $help; ## true/false variable for help screen
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
20 my $version; ## true/false variable for version screen
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
21
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
22 ## Take command line arguments for...
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
23 GetOptions ( "input=s" => \$input_fh, ## ...input file
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
24 "output=s" => \$output_fh, ## ...output file
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
25 "quiet" => \$quiet, ## ...supressing std. out
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
26 "version" => \$version, ## ...showing version screen
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
27 "help" => \$help); ## ...showing help screen
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
28
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
29 if (defined $help){ ## Print help screen if -h is true
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
30 print <<"HELP";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
31
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
32 --------------------------------------------------------------------------------
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
33 FASTAptamer-Count
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
34 --------------------------------------------------------------------------------
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
35
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
36 Usage: fastaptamer_count [-h] [-q] [-v] [-i INFILE] [-o OUTFILE]
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
37
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
38 [-h] = Help screen.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
39 [-q] = Suppress STDOUT of run report.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
40 [-v] = Display version.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
41 [-i INFILE] = FASTQ input file. REQUIRED.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
42 [-o OUTFILE] = FASTA output file. REQUIRED.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
43
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
44 FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit. For a given
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
45 .FASTQ input file, FASTAptamer-Count will determine the number of times each se-
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
46 quence was read, rank and sort sequences by decreasing total reads, and normali-
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
47 ze sequence frequency to reads per million. Output is generated as a non-redund-
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
48 ant FASTA file in the following format for each sequence:
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
49
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
50 >RANK-READS-RPM
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
51 SEQUENCE
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
52
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
53 Summary report (total reads, unique reads, and execution time) is displayed as
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
54 STDOUT at program completion unless [-q] is invoked.
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
55
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
56 HELP
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
57 exit;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
58 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
59
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
60 if (defined $version){ ## Print version screen if -v is true
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
61 print <<"VERSION";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
62
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
63 FASTAptamer v1.0.2
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
64
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
65 VERSION
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
66 exit;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
67 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
68
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
69 ##########################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
70 ## Open input file or exit with warning #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
71 ##########################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
72
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
73 open (INPUT, '<', $input_fh) or die
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
74 "\nCould not open input file or no input file was specified.\n
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
75 See help documentation [-h], README, or User's Guide for program usage.\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
76
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
77 ##########################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
78 ## Open output file or exit with warning #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
79 ##########################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
80
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
81 open (OUTPUT, '>', $output_fh) or die
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
82 "\nCould not open output file or no output file was specified.\n
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
83 See help documentation [-h], README, or User's Guide for program usage.\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
84
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
85 ###############################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
86 ## Create a hash for sequence reads where key #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
87 ## is the sequence and value is read count #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
88 ###############################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
89
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
90 my %sequence_reads;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
91
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
92 my $entries = 0; ## Counts the number of entries processed
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
93
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
94 $/ = "@"; ## Change default input record separator to read FASTQ formatted file
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
95
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
96 ################################################################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
97 ## Read input file and for each entry, add sequence or increment value in hash #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
98 ################################################################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
99
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
100 while (<INPUT>){
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
101 if ($_ =~ /.+\n(\S+)\n.+\n.+/){
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
102 my $sequence = $1;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
103 $sequence_reads{$sequence} += 1;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
104 $entries++;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
105 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
106 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
107
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
108 close INPUT;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
109
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
110 ############################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
111 ## Sort hash by decreasing read count by #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
112 ## converting to arrays of values and keys #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
113 ############################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
114
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
115 my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads);
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
116 my @vals = @sequence_reads{@keys};
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
117
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
118 ##################################################################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
119 ## Using both arrays, print to output the FASTA formatted file containing #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
120 ## the sequence rank, reads, reads per million, and sequence. The extra #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
121 ## scalars here are used to ensure that sequences with equal read counts #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
122 ## receive the same rank value, and that the next untied value is properly #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
123 ## assigned a rank that takes into account the number of tied values before it, #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
124 ## also known as standard competition ranking. #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
125 ##################################################################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
126
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
127 my $last_reads_count = $vals[0];
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
128 my $rank = 1;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
129
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
130 for my $i (0..$#keys){
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
131 my $current_reads_count = $vals[$i];
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
132 if ($current_reads_count < $last_reads_count){
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
133 $rank = $i + 1;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
134 print OUTPUT ">$rank-$vals[$i]-";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
135 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
136 print OUTPUT "\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
137 print OUTPUT "$keys[$i]\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
138
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
139 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
140 elsif ($current_reads_count == $last_reads_count){
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
141 print OUTPUT ">$rank-$vals[$i]-";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
142 printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
143 print OUTPUT "\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
144 print OUTPUT "$keys[$i]\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
145 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
146 $last_reads_count = $vals[$i];
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
147
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
148 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
149
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
150 close OUTPUT;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
151
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
152 ##################################################################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
153 ## Unless the -q option is invoked, the following statistics are printed as #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
154 ## standard output. #
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
155 ##################################################################################
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
156
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
157 unless ($quiet){
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
158 print "\n$entries total sequences. " . ($#keys + 1) . " unique sequences.\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
159 print "Input file: \"$input_fh\".\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
160 print "Output file: \"$output_fh\".\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
161
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
162 my $duration = time - $start;
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
163 print "Execution time: $duration s.\n";
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
164 }
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
165
b9b2da3fa7d7 Uploaded
fastaptamer
parents:
diff changeset
166 exit;