annotate fastx_toolkit-0.0.6/scripts/fastx_barcode_splitter.pl @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
997f5136985f Uploaded
xilinxu
parents:
diff changeset
1 #!/usr/bin/perl
997f5136985f Uploaded
xilinxu
parents:
diff changeset
2
997f5136985f Uploaded
xilinxu
parents:
diff changeset
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
5 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
6 # This program is free software: you can redistribute it and/or modify
997f5136985f Uploaded
xilinxu
parents:
diff changeset
7 # it under the terms of the GNU Affero General Public License as
997f5136985f Uploaded
xilinxu
parents:
diff changeset
8 # published by the Free Software Foundation, either version 3 of the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
9 # License, or (at your option) any later version.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
10 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
11 # This program is distributed in the hope that it will be useful,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
997f5136985f Uploaded
xilinxu
parents:
diff changeset
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
14 # GNU Affero General Public License for more details.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
15 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
16 # You should have received a copy of the GNU Affero General Public License
997f5136985f Uploaded
xilinxu
parents:
diff changeset
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
18
997f5136985f Uploaded
xilinxu
parents:
diff changeset
19 use strict;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
20 use warnings;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
21 use IO::Handle;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
22 use Data::Dumper;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
23 use Getopt::Long;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
24 use Carp;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
25
997f5136985f Uploaded
xilinxu
parents:
diff changeset
26 ##
997f5136985f Uploaded
xilinxu
parents:
diff changeset
27 ## This program splits a FASTQ/FASTA file into several smaller files,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
28 ## Based on barcode matching.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
29 ##
997f5136985f Uploaded
xilinxu
parents:
diff changeset
30 ## run with "--help" for usage information
997f5136985f Uploaded
xilinxu
parents:
diff changeset
31 ##
997f5136985f Uploaded
xilinxu
parents:
diff changeset
32 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008
997f5136985f Uploaded
xilinxu
parents:
diff changeset
33
997f5136985f Uploaded
xilinxu
parents:
diff changeset
34 # Forward declarations
997f5136985f Uploaded
xilinxu
parents:
diff changeset
35 sub load_barcode_file ($);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
36 sub parse_command_line ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
37 sub match_sequences ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
38 sub mismatch_count($$) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
39 sub print_results;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
40 sub open_and_detect_input_format;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
41 sub read_record;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
42 sub write_record($);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
43 sub usage();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
44
997f5136985f Uploaded
xilinxu
parents:
diff changeset
45 # Global flags and arguments,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
46 # Set by command line argumens
997f5136985f Uploaded
xilinxu
parents:
diff changeset
47 my $barcode_file ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
48 my $barcodes_at_eol = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
49 my $barcodes_at_bol = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
50 my $exact_match = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
51 my $allow_partial_overlap = 0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
52 my $allowed_mismatches = 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
53 my $newfile_suffix = '';
997f5136985f Uploaded
xilinxu
parents:
diff changeset
54 my $newfile_prefix ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
55 my $quiet = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
56 my $debug = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
57 my $fastq_format = 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
58
997f5136985f Uploaded
xilinxu
parents:
diff changeset
59 # Global variables
997f5136985f Uploaded
xilinxu
parents:
diff changeset
60 # Populated by 'create_output_files'
997f5136985f Uploaded
xilinxu
parents:
diff changeset
61 my %filenames;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
62 my %files;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
63 my %counts = ( 'unmatched' => 0 );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
64 my $barcodes_length;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
65 my @barcodes;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
66 my $input_file_io;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
67
997f5136985f Uploaded
xilinxu
parents:
diff changeset
68
997f5136985f Uploaded
xilinxu
parents:
diff changeset
69 # The Four lines per record in FASTQ format.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
70 # (when using FASTA format, only the first two are used)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
71 my $seq_name;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
72 my $seq_bases;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
73 my $seq_name2;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
74 my $seq_qualities;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
75
997f5136985f Uploaded
xilinxu
parents:
diff changeset
76
997f5136985f Uploaded
xilinxu
parents:
diff changeset
77 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
78 # Start of Program
997f5136985f Uploaded
xilinxu
parents:
diff changeset
79 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
80 parse_command_line ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
81
997f5136985f Uploaded
xilinxu
parents:
diff changeset
82 load_barcode_file ( $barcode_file ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
83
997f5136985f Uploaded
xilinxu
parents:
diff changeset
84 open_and_detect_input_format;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
85
997f5136985f Uploaded
xilinxu
parents:
diff changeset
86 match_sequences ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
87
997f5136985f Uploaded
xilinxu
parents:
diff changeset
88 print_results unless $quiet;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
89
997f5136985f Uploaded
xilinxu
parents:
diff changeset
90 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
91 # End of program
997f5136985f Uploaded
xilinxu
parents:
diff changeset
92 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
93
997f5136985f Uploaded
xilinxu
parents:
diff changeset
94
997f5136985f Uploaded
xilinxu
parents:
diff changeset
95
997f5136985f Uploaded
xilinxu
parents:
diff changeset
96
997f5136985f Uploaded
xilinxu
parents:
diff changeset
97
997f5136985f Uploaded
xilinxu
parents:
diff changeset
98
997f5136985f Uploaded
xilinxu
parents:
diff changeset
99
997f5136985f Uploaded
xilinxu
parents:
diff changeset
100
997f5136985f Uploaded
xilinxu
parents:
diff changeset
101 sub parse_command_line {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
102 my $help;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
103
997f5136985f Uploaded
xilinxu
parents:
diff changeset
104 usage() if (scalar @ARGV==0);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
105
997f5136985f Uploaded
xilinxu
parents:
diff changeset
106 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
107 "eol" => \$barcodes_at_eol,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
108 "bol" => \$barcodes_at_bol,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
109 "exact" => \$exact_match,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
110 "prefix=s" => \$newfile_prefix,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
111 "suffix=s" => \$newfile_suffix,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
112 "quiet" => \$quiet,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
113 "partial=i" => \$allow_partial_overlap,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
114 "debug" => \$debug,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
115 "mismatches=i" => \$allowed_mismatches,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
116 "help" => \$help
997f5136985f Uploaded
xilinxu
parents:
diff changeset
117 ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
118
997f5136985f Uploaded
xilinxu
parents:
diff changeset
119 usage() if ($help);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
120
997f5136985f Uploaded
xilinxu
parents:
diff changeset
121 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
122 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
123
997f5136985f Uploaded
xilinxu
parents:
diff changeset
124 if ($barcodes_at_bol == $barcodes_at_eol) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
125 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
126 die "Error: must specify either --eol or --bol\n" ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
127 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
128
997f5136985f Uploaded
xilinxu
parents:
diff changeset
129 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
130
997f5136985f Uploaded
xilinxu
parents:
diff changeset
131 $allowed_mismatches = 0 if $exact_match;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
132
997f5136985f Uploaded
xilinxu
parents:
diff changeset
133 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
134
997f5136985f Uploaded
xilinxu
parents:
diff changeset
135 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
997f5136985f Uploaded
xilinxu
parents:
diff changeset
136 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
137
997f5136985f Uploaded
xilinxu
parents:
diff changeset
138
997f5136985f Uploaded
xilinxu
parents:
diff changeset
139 exit unless $result;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
140 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
141
997f5136985f Uploaded
xilinxu
parents:
diff changeset
142
997f5136985f Uploaded
xilinxu
parents:
diff changeset
143
997f5136985f Uploaded
xilinxu
parents:
diff changeset
144 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
145 # Read the barcode file
997f5136985f Uploaded
xilinxu
parents:
diff changeset
146 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
147 sub load_barcode_file ($) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
148 my $filename = shift or croak "Missing barcode file name";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
149
997f5136985f Uploaded
xilinxu
parents:
diff changeset
150 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
151 while (<BCFILE>) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
152 next if m/^#/;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
153 chomp;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
154 my ($ident, $barcode) = split ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
155
997f5136985f Uploaded
xilinxu
parents:
diff changeset
156 $barcode = uc($barcode);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
157
997f5136985f Uploaded
xilinxu
parents:
diff changeset
158 # Sanity checks on the barcodes
997f5136985f Uploaded
xilinxu
parents:
diff changeset
159 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
160 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
161 unless $barcode =~ m/^[AGCT]+$/;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
162
997f5136985f Uploaded
xilinxu
parents:
diff changeset
163 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
164 unless $ident =~ m/^\w+$/;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
165
997f5136985f Uploaded
xilinxu
parents:
diff changeset
166 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
997f5136985f Uploaded
xilinxu
parents:
diff changeset
167 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
168 if length($barcode)<=$allowed_mismatches;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
169
997f5136985f Uploaded
xilinxu
parents:
diff changeset
170 $barcodes_length = length($barcode) unless defined $barcodes_length;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
171 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
172 unless $barcodes_length == length($barcode);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
173
997f5136985f Uploaded
xilinxu
parents:
diff changeset
174 push @barcodes, [$ident, $barcode];
997f5136985f Uploaded
xilinxu
parents:
diff changeset
175
997f5136985f Uploaded
xilinxu
parents:
diff changeset
176 if ($allow_partial_overlap>0) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
177 foreach my $i (1 .. $allow_partial_overlap) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
178 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
997f5136985f Uploaded
xilinxu
parents:
diff changeset
179 push @barcodes, [$ident, $barcode];
997f5136985f Uploaded
xilinxu
parents:
diff changeset
180 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
181 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
182 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
183 close BCFILE;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
184
997f5136985f Uploaded
xilinxu
parents:
diff changeset
185 if ($debug) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
186 print STDERR "barcode\tsequence\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
187 foreach my $barcoderef (@barcodes) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
188 my ($ident, $seq) = @{$barcoderef};
997f5136985f Uploaded
xilinxu
parents:
diff changeset
189 print STDERR $ident,"\t", $seq ,"\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
190 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
191 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
192 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
193
997f5136985f Uploaded
xilinxu
parents:
diff changeset
194 # Create one output file for each barcode.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
195 # (Also create a file for the dummy 'unmatched' barcode)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
196 sub create_output_files {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
197 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
198 $barcodes{'unmatched'} = 1 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
199
997f5136985f Uploaded
xilinxu
parents:
diff changeset
200 foreach my $ident (keys %barcodes) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
201 my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
202 $filenames{$ident} = $new_filename;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
203 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
204 $files{$ident} = $file ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
205 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
206 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
207
997f5136985f Uploaded
xilinxu
parents:
diff changeset
208 sub match_sequences {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
209
997f5136985f Uploaded
xilinxu
parents:
diff changeset
210 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
211 $barcodes{'unmatched'} = 1 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
212
997f5136985f Uploaded
xilinxu
parents:
diff changeset
213 #reset counters
997f5136985f Uploaded
xilinxu
parents:
diff changeset
214 foreach my $ident ( keys %barcodes ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
215 $counts{$ident} = 0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
216 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
217
997f5136985f Uploaded
xilinxu
parents:
diff changeset
218 create_output_files;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
219
997f5136985f Uploaded
xilinxu
parents:
diff changeset
220 # Read file FASTQ file
997f5136985f Uploaded
xilinxu
parents:
diff changeset
221 # split accotding to barcodes
997f5136985f Uploaded
xilinxu
parents:
diff changeset
222 while ( read_record ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
223 chomp $seq_bases;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
224
997f5136985f Uploaded
xilinxu
parents:
diff changeset
225 print STDERR "sequence $seq_bases: \n" if $debug;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
226
997f5136985f Uploaded
xilinxu
parents:
diff changeset
227 my $best_barcode_mismatches_count = $barcodes_length;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
228 my $best_barcode_ident = undef;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
229
997f5136985f Uploaded
xilinxu
parents:
diff changeset
230 #Try all barcodes, find the one with the lowest mismatch count
997f5136985f Uploaded
xilinxu
parents:
diff changeset
231 foreach my $barcoderef (@barcodes) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
232 my ($ident, $barcode) = @{$barcoderef};
997f5136985f Uploaded
xilinxu
parents:
diff changeset
233
997f5136985f Uploaded
xilinxu
parents:
diff changeset
234 # Get DNA fragment (in the length of the barcodes)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
235 # The barcode will be tested only against this fragment
997f5136985f Uploaded
xilinxu
parents:
diff changeset
236 # (no point in testing the barcode against the whole sequence)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
237 my $sequence_fragment;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
238 if ($barcodes_at_bol) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
239 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
240 } else {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
241 $sequence_fragment = substr $seq_bases, - $barcodes_length;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
242 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
243
997f5136985f Uploaded
xilinxu
parents:
diff changeset
244 my $mm = mismatch_count($sequence_fragment, $barcode) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
245
997f5136985f Uploaded
xilinxu
parents:
diff changeset
246 # if this is a partial match, add the non-overlap as a mismatch
997f5136985f Uploaded
xilinxu
parents:
diff changeset
247 # (partial barcodes are shorter than the length of the original barcodes)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
248 $mm += ($barcodes_length - length($barcode));
997f5136985f Uploaded
xilinxu
parents:
diff changeset
249
997f5136985f Uploaded
xilinxu
parents:
diff changeset
250 if ( $mm < $best_barcode_mismatches_count ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
251 $best_barcode_mismatches_count = $mm ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
252 $best_barcode_ident = $ident ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
253 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
254 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
255
997f5136985f Uploaded
xilinxu
parents:
diff changeset
256 $best_barcode_ident = 'unmatched'
997f5136985f Uploaded
xilinxu
parents:
diff changeset
257 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
258
997f5136985f Uploaded
xilinxu
parents:
diff changeset
259 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
260
997f5136985f Uploaded
xilinxu
parents:
diff changeset
261 $counts{$best_barcode_ident}++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
262
997f5136985f Uploaded
xilinxu
parents:
diff changeset
263 #get the file associated with the matched barcode.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
264 #(note: there's also a file associated with 'unmatched' barcode)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
265 my $file = $files{$best_barcode_ident};
997f5136985f Uploaded
xilinxu
parents:
diff changeset
266
997f5136985f Uploaded
xilinxu
parents:
diff changeset
267 write_record($file);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
268 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
269 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
270
997f5136985f Uploaded
xilinxu
parents:
diff changeset
271 #Quickly calculate hamming distance between two strings
997f5136985f Uploaded
xilinxu
parents:
diff changeset
272 #
997f5136985f Uploaded
xilinxu
parents:
diff changeset
273 #NOTE: Strings must be same length.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
274 # returns number of different characters.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
275 #see http://www.perlmonks.org/?node_id=500235
997f5136985f Uploaded
xilinxu
parents:
diff changeset
276 sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
277
997f5136985f Uploaded
xilinxu
parents:
diff changeset
278
997f5136985f Uploaded
xilinxu
parents:
diff changeset
279
997f5136985f Uploaded
xilinxu
parents:
diff changeset
280 sub print_results
997f5136985f Uploaded
xilinxu
parents:
diff changeset
281 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
282 print "Barcode\tCount\tLocation\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
283 my $total = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
284 foreach my $ident (sort keys %counts) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
285 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
286 $total += $counts{$ident};
997f5136985f Uploaded
xilinxu
parents:
diff changeset
287 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
288 print "total\t",$total,"\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
289 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
290
997f5136985f Uploaded
xilinxu
parents:
diff changeset
291
997f5136985f Uploaded
xilinxu
parents:
diff changeset
292 sub read_record
997f5136985f Uploaded
xilinxu
parents:
diff changeset
293 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
294 $seq_name = $input_file_io->getline();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
295
997f5136985f Uploaded
xilinxu
parents:
diff changeset
296 return undef unless defined $seq_name; # End of file?
997f5136985f Uploaded
xilinxu
parents:
diff changeset
297
997f5136985f Uploaded
xilinxu
parents:
diff changeset
298 $seq_bases = $input_file_io->getline();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
299 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
300
997f5136985f Uploaded
xilinxu
parents:
diff changeset
301 # If using FASTQ format, read two more lines
997f5136985f Uploaded
xilinxu
parents:
diff changeset
302 if ($fastq_format) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
303 $seq_name2 = $input_file_io->getline();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
304 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
305
997f5136985f Uploaded
xilinxu
parents:
diff changeset
306 $seq_qualities = $input_file_io->getline();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
307 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
308 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
309 return 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
310 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
311
997f5136985f Uploaded
xilinxu
parents:
diff changeset
312 sub write_record($)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
313 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
314 my $file = shift;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
315
997f5136985f Uploaded
xilinxu
parents:
diff changeset
316 croak "Bad file handle" unless defined $file;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
317
997f5136985f Uploaded
xilinxu
parents:
diff changeset
318 print $file $seq_name;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
319 print $file $seq_bases,"\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
320
997f5136985f Uploaded
xilinxu
parents:
diff changeset
321 #if using FASTQ format, write two more lines
997f5136985f Uploaded
xilinxu
parents:
diff changeset
322 if ($fastq_format) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
323 print $file $seq_name2;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
324 print $file $seq_qualities;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
325 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
326 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
327
997f5136985f Uploaded
xilinxu
parents:
diff changeset
328 sub open_and_detect_input_format
997f5136985f Uploaded
xilinxu
parents:
diff changeset
329 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
330 $input_file_io = new IO::Handle;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
331 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
997f5136985f Uploaded
xilinxu
parents:
diff changeset
332
997f5136985f Uploaded
xilinxu
parents:
diff changeset
333 # Get the first characeter, and push it back
997f5136985f Uploaded
xilinxu
parents:
diff changeset
334 my $first_char = $input_file_io->getc();
997f5136985f Uploaded
xilinxu
parents:
diff changeset
335 $input_file_io->ungetc(ord $first_char);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
336
997f5136985f Uploaded
xilinxu
parents:
diff changeset
337 if ($first_char eq '>') {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
338 # FASTA format
997f5136985f Uploaded
xilinxu
parents:
diff changeset
339 $fastq_format = 0 ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
340 print STDERR "Detected FASTA format\n" if $debug;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
341 } elsif ($first_char eq '@') {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
342 # FASTQ format
997f5136985f Uploaded
xilinxu
parents:
diff changeset
343 $fastq_format = 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
344 print STDERR "Detected FASTQ format\n" if $debug;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
345 } else {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
346 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
347 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
348 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
349
997f5136985f Uploaded
xilinxu
parents:
diff changeset
350 sub usage()
997f5136985f Uploaded
xilinxu
parents:
diff changeset
351 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
352 print<<EOF;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
353 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
997f5136985f Uploaded
xilinxu
parents:
diff changeset
354
997f5136985f Uploaded
xilinxu
parents:
diff changeset
355 This program reads FASTA/FASTQ file and splits it into several smaller files,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
356 Based on barcode matching.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
357 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
358 Output files will be writen to disk.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
359 Summary will be printed to STDOUT.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
360
997f5136985f Uploaded
xilinxu
parents:
diff changeset
361 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol]
997f5136985f Uploaded
xilinxu
parents:
diff changeset
362 [--mismatches N] [--exact] [--partial N] [--help] [--quiet] [--debug]
997f5136985f Uploaded
xilinxu
parents:
diff changeset
363
997f5136985f Uploaded
xilinxu
parents:
diff changeset
364 Arguments:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
365
997f5136985f Uploaded
xilinxu
parents:
diff changeset
366 --bcfile FILE - Barcodes file name. (see explanation below.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
367 --prefix PREFIX - File prefix. will be added to the output files. Can be used
997f5136985f Uploaded
xilinxu
parents:
diff changeset
368 to specify output directories.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
369 --suffix SUFFIX - File suffix (optional). Can be used to specify file
997f5136985f Uploaded
xilinxu
parents:
diff changeset
370 extensions.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
371 --bol - Try to match barcodes at the BEGINNING of sequences.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
372 (What biologists would call the 5' end, and programmers
997f5136985f Uploaded
xilinxu
parents:
diff changeset
373 would call index 0.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
374 --eol - Try to match barcodes at the END of sequences.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
375 (What biologists would call the 3' end, and programmers
997f5136985f Uploaded
xilinxu
parents:
diff changeset
376 would call the end of the string.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
377 NOTE: one of --bol, --eol must be specified, but not both.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
378 --mismatches N - Max. number of mismatches allowed. default is 1.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
379 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
997f5136985f Uploaded
xilinxu
parents:
diff changeset
380 are specified, '--exact' takes precedence.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
381 --partial N - Allow partial overlap of barcodes. (see explanation below.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
382 (Default is not partial matching)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
383 --quiet - Don't print counts and summary at the end of the run.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
384 (Default is to print.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
385 --debug - Print lots of useless debug information to STDERR.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
386 --help - This helpful help screen.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
387
997f5136985f Uploaded
xilinxu
parents:
diff changeset
388 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
997f5136985f Uploaded
xilinxu
parents:
diff changeset
389 the barcodes file):
997f5136985f Uploaded
xilinxu
parents:
diff changeset
390
997f5136985f Uploaded
xilinxu
parents:
diff changeset
391 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
997f5136985f Uploaded
xilinxu
parents:
diff changeset
392 --prefix /tmp/bla_ --suffix ".txt"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
393
997f5136985f Uploaded
xilinxu
parents:
diff changeset
394 Barcode file format
997f5136985f Uploaded
xilinxu
parents:
diff changeset
395 -------------------
997f5136985f Uploaded
xilinxu
parents:
diff changeset
396 Barcode files are simple text files. Each line should contain an identifier
997f5136985f Uploaded
xilinxu
parents:
diff changeset
397 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
398 separated by a TAB character. Example:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
399
997f5136985f Uploaded
xilinxu
parents:
diff changeset
400 #This line is a comment (starts with a 'number' sign)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
401 BC1 GATCT
997f5136985f Uploaded
xilinxu
parents:
diff changeset
402 BC2 ATCGT
997f5136985f Uploaded
xilinxu
parents:
diff changeset
403 BC3 GTGAT
997f5136985f Uploaded
xilinxu
parents:
diff changeset
404 BC4 TGTCT
997f5136985f Uploaded
xilinxu
parents:
diff changeset
405
997f5136985f Uploaded
xilinxu
parents:
diff changeset
406 For each barcode, a new FASTQ file will be created (with the barcode's
997f5136985f Uploaded
xilinxu
parents:
diff changeset
407 identifier as part of the file name). Sequences matching the barcode
997f5136985f Uploaded
xilinxu
parents:
diff changeset
408 will be stored in the appropriate file.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
409
997f5136985f Uploaded
xilinxu
parents:
diff changeset
410 Running the above example (assuming "mybarcodes.txt" contains the above
997f5136985f Uploaded
xilinxu
parents:
diff changeset
411 barcodes), will create the following files:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
412 /tmp/bla_BC1.txt
997f5136985f Uploaded
xilinxu
parents:
diff changeset
413 /tmp/bla_BC2.txt
997f5136985f Uploaded
xilinxu
parents:
diff changeset
414 /tmp/bla_BC3.txt
997f5136985f Uploaded
xilinxu
parents:
diff changeset
415 /tmp/bla_BC4.txt
997f5136985f Uploaded
xilinxu
parents:
diff changeset
416 /tmp/bla_unmatched.txt
997f5136985f Uploaded
xilinxu
parents:
diff changeset
417 The 'unmatched' file will contain all sequences that didn't match any barcode.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
418
997f5136985f Uploaded
xilinxu
parents:
diff changeset
419 Barcode matching
997f5136985f Uploaded
xilinxu
parents:
diff changeset
420 ----------------
997f5136985f Uploaded
xilinxu
parents:
diff changeset
421
997f5136985f Uploaded
xilinxu
parents:
diff changeset
422 ** Without partial matching:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
423
997f5136985f Uploaded
xilinxu
parents:
diff changeset
424 Count mismatches between the FASTA/Q sequences and the barcodes.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
425 The barcode which matched with the lowest mismatches count (providing the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
426 count is small or equal to '--mismatches N') 'gets' the sequences.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
427
997f5136985f Uploaded
xilinxu
parents:
diff changeset
428 Example (using the above barcodes):
997f5136985f Uploaded
xilinxu
parents:
diff changeset
429 Input Sequence:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
430 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
997f5136985f Uploaded
xilinxu
parents:
diff changeset
431
997f5136985f Uploaded
xilinxu
parents:
diff changeset
432 Matching with '--bol --mismatches 1':
997f5136985f Uploaded
xilinxu
parents:
diff changeset
433 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
997f5136985f Uploaded
xilinxu
parents:
diff changeset
434 GATCT (1 mismatch, BC1)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
435 ATCGT (4 mismatches, BC2)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
436 GTGAT (3 mismatches, BC3)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
437 TGTCT (3 mismatches, BC4)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
438
997f5136985f Uploaded
xilinxu
parents:
diff changeset
439 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
997f5136985f Uploaded
xilinxu
parents:
diff changeset
440 If '--exact' or '--mismatches 0' were specified, this sequence would be
997f5136985f Uploaded
xilinxu
parents:
diff changeset
441 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
442 it is above the maximum allowed mismatches).
997f5136985f Uploaded
xilinxu
parents:
diff changeset
443
997f5136985f Uploaded
xilinxu
parents:
diff changeset
444 Matching with '--eol' (end of line) does the same, but from the other side
997f5136985f Uploaded
xilinxu
parents:
diff changeset
445 of the sequence.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
446
997f5136985f Uploaded
xilinxu
parents:
diff changeset
447 ** With partial matching (very similar to indels):
997f5136985f Uploaded
xilinxu
parents:
diff changeset
448
997f5136985f Uploaded
xilinxu
parents:
diff changeset
449 Same as above, with the following addition: barcodes are also checked for
997f5136985f Uploaded
xilinxu
parents:
diff changeset
450 partial overlap (number of allowed non-overlapping bases is '--partial N').
997f5136985f Uploaded
xilinxu
parents:
diff changeset
451
997f5136985f Uploaded
xilinxu
parents:
diff changeset
452 Example:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
453 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
997f5136985f Uploaded
xilinxu
parents:
diff changeset
454 (Same as above, but note the missing 'G' at the beginning.)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
455
997f5136985f Uploaded
xilinxu
parents:
diff changeset
456 Matching (without partial overlapping) against BC1 yields 4 mismatches:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
457 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
997f5136985f Uploaded
xilinxu
parents:
diff changeset
458 GATCT (4 mismatches)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
459
997f5136985f Uploaded
xilinxu
parents:
diff changeset
460 Partial overlapping would also try the following match:
997f5136985f Uploaded
xilinxu
parents:
diff changeset
461 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
997f5136985f Uploaded
xilinxu
parents:
diff changeset
462 GATCT (1 mismatch)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
463
997f5136985f Uploaded
xilinxu
parents:
diff changeset
464 Note: scoring counts a missing base as a mismatch, so the final
997f5136985f Uploaded
xilinxu
parents:
diff changeset
465 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
997f5136985f Uploaded
xilinxu
parents:
diff changeset
466 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
997f5136985f Uploaded
xilinxu
parents:
diff changeset
467 seqeunce will be classified as BC1.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
468
997f5136985f Uploaded
xilinxu
parents:
diff changeset
469 EOF
997f5136985f Uploaded
xilinxu
parents:
diff changeset
470
997f5136985f Uploaded
xilinxu
parents:
diff changeset
471 exit 1;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
472 }