Mercurial > repos > lparsons > fastx_barcode_splitter_enhanced
comparison fastx_barcode_splitter.pl @ 4:0fb7e9130a70 draft default tip
planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/fastx_barcode_splitter_enhanced commit 460463a5406419fe8e113467bdb8bd093d21e7c5
author | lparsons |
---|---|
date | Mon, 02 May 2016 17:04:32 -0400 |
parents | e7b7cdc1834d |
children |
comparison
equal
deleted
inserted
replaced
3:e7b7cdc1834d | 4:0fb7e9130a70 |
---|---|
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu) | 4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu) |
5 # | 5 # |
6 # Lance Parsons (lparsons@princeton.edu) | 6 # Lance Parsons (lparsons@princeton.edu) |
7 # 3/21/2011 - Modified to accept separate index file for barcodes | 7 # 3/21/2011 - Modified to accept separate index file for barcodes |
8 # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy) | 8 # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy) |
9 # | 9 # 4/28/2016 - Modified summary output to remove file paths and add comment |
10 # character '#' | |
11 | |
10 # This program is free software: you can redistribute it and/or modify | 12 # This program is free software: you can redistribute it and/or modify |
11 # it under the terms of the GNU Affero General Public License as | 13 # it under the terms of the GNU Affero General Public License as |
12 # published by the Free Software Foundation, either version 3 of the | 14 # published by the Free Software Foundation, either version 3 of the |
13 # License, or (at your option) any later version. | 15 # License, or (at your option) any later version. |
14 # | 16 # |
46 sub read_index_record; | 48 sub read_index_record; |
47 sub read_record; | 49 sub read_record; |
48 sub write_record($); | 50 sub write_record($); |
49 sub usage(); | 51 sub usage(); |
50 | 52 |
51 # Global flags and arguments, | 53 # Global flags and arguments, |
52 # Set by command line argumens | 54 # Set by command line argumens |
53 my $barcode_file ; | 55 my $barcode_file ; |
54 my $barcodes_at_eol = 0 ; | 56 my $barcodes_at_eol = 0 ; |
55 my $barcodes_at_bol = 0 ; | 57 my $barcodes_at_bol = 0 ; |
56 my $index_read_file ; | 58 my $index_read_file ; |
63 my $debug = 0 ; | 65 my $debug = 0 ; |
64 my $fastq_format = 1; | 66 my $fastq_format = 1; |
65 my $index_fastq_format = 1; | 67 my $index_fastq_format = 1; |
66 my $read_id_check_strip_characters = 1; | 68 my $read_id_check_strip_characters = 1; |
67 | 69 |
68 # Global variables | 70 # Global variables |
69 # Populated by 'create_output_files' | 71 # Populated by 'create_output_files' |
70 my %filenames; | 72 my %filenames; |
71 my %files; | 73 my %files; |
72 my %counts = ( 'unmatched' => 0 ); | 74 my %counts = ( 'unmatched' => 0 ); |
73 my $barcodes_length; | 75 my $barcodes_length; |
115 | 117 |
116 | 118 |
117 | 119 |
118 | 120 |
119 sub parse_command_line { | 121 sub parse_command_line { |
120 my $help; | 122 my $help; |
121 | 123 |
122 usage() if (scalar @ARGV==0); | 124 usage() if (scalar @ARGV==0); |
123 | 125 |
124 my $result = GetOptions ( "bcfile=s" => \$barcode_file, | 126 my $result = GetOptions ( "bcfile=s" => \$barcode_file, |
125 "eol" => \$barcodes_at_eol, | 127 "eol" => \$barcodes_at_eol, |
126 "bol" => \$barcodes_at_bol, | 128 "bol" => \$barcodes_at_bol, |
127 "idxfile=s" => \$index_read_file, | 129 "idxfile=s" => \$index_read_file, |
128 "idxidstrip=i" => \$read_id_check_strip_characters, | 130 "idxidstrip=i" => \$read_id_check_strip_characters, |
129 "exact" => \$exact_match, | 131 "exact" => \$exact_match, |
130 "prefix=s" => \$newfile_prefix, | 132 "prefix=s" => \$newfile_prefix, |
131 "suffix=s" => \$newfile_suffix, | 133 "suffix=s" => \$newfile_suffix, |
132 "quiet" => \$quiet, | 134 "quiet" => \$quiet, |
133 "partial=i" => \$allow_partial_overlap, | 135 "partial=i" => \$allow_partial_overlap, |
134 "debug" => \$debug, | 136 "debug" => \$debug, |
135 "mismatches=i" => \$allowed_mismatches, | 137 "mismatches=i" => \$allowed_mismatches, |
136 "help" => \$help | 138 "help" => \$help |
137 ) ; | 139 ) ; |
138 | 140 |
139 usage() if ($help); | 141 usage() if ($help); |
140 | 142 |
141 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; | 143 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; |
142 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; | 144 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; |
143 | 145 |
144 if (! defined $index_read_file) { | 146 if (! defined $index_read_file) { |
145 if ($barcodes_at_bol == $barcodes_at_eol) { | 147 if ($barcodes_at_bol == $barcodes_at_eol) { |
146 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; | 148 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; |
147 die "Error: must specify either --eol or --bol or --idxfile\n" ; | 149 die "Error: must specify either --eol or --bol or --idxfile\n" ; |
148 } | 150 } |
149 } | 151 } |
150 elsif ($barcodes_at_bol || $barcodes_at_eol) { | 152 elsif ($barcodes_at_bol || $barcodes_at_eol) { |
151 die "Error: Must specify only one of --idxfile, --eol, or --bol"; | 153 die "Error: Must specify only one of --idxfile, --eol, or --bol"; |
152 } | 154 } |
153 | 155 |
154 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; | 156 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; |
155 | 157 |
156 $allowed_mismatches = 0 if $exact_match; | 158 $allowed_mismatches = 0 if $exact_match; |
157 | 159 |
158 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); | 160 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); |
159 | 161 |
160 die "Error: partial overlap value ($allow_partial_overlap) bigger than " . | 162 die "Error: partial overlap value ($allow_partial_overlap) bigger than " . |
161 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); | 163 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); |
162 | 164 |
163 | 165 |
164 exit unless $result; | 166 exit unless $result; |
165 } | 167 } |
166 | 168 |
167 | 169 |
168 | 170 |
169 # | 171 # |
170 # Read the barcode file | 172 # Read the barcode file |
171 # | 173 # |
172 sub load_barcode_file ($) { | 174 sub load_barcode_file ($) { |
173 my $filename = shift or croak "Missing barcode file name"; | 175 my $filename = shift or croak "Missing barcode file name"; |
174 | 176 |
175 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; | 177 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; |
176 while (<BCFILE>) { | 178 while (<BCFILE>) { |
177 next if m/^#/; | 179 next if m/^#/; |
178 chomp; | 180 chomp; |
179 my ($ident, $barcode) = split('\t') ; | 181 my ($ident, $barcode) = split('\t') ; |
180 | 182 |
181 $barcode = uc($barcode); | 183 $barcode = uc($barcode); |
182 | 184 |
183 # Sanity checks on the barcodes | 185 # Sanity checks on the barcodes |
184 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; | 186 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; |
185 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" | 187 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" |
186 unless $barcode =~ m/^[AGCT]+$/; | 188 unless $barcode =~ m/^[AGCT]+$/; |
187 | 189 |
188 # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-') | 190 # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-') |
189 $ident =~ s/[^A-Za-z0-9]/-/g; | 191 $ident =~ s/[^A-Za-z0-9]/-/g; |
190 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" | 192 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" |
191 unless $ident =~ m/^[A-Za-z0-9-]+$/; | 193 unless $ident =~ m/^[A-Za-z0-9-]+$/; |
192 | 194 |
193 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . | 195 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . |
194 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" | 196 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" |
195 if length($barcode)<=$allowed_mismatches; | 197 if length($barcode)<=$allowed_mismatches; |
196 | 198 |
197 $barcodes_length = length($barcode) unless defined $barcodes_length; | 199 $barcodes_length = length($barcode) unless defined $barcodes_length; |
198 die "Error: found barcodes in different lengths. this feature is not supported yet.\n" | 200 die "Error: found barcodes in different lengths. this feature is not supported yet.\n" |
199 unless $barcodes_length == length($barcode); | 201 unless $barcodes_length == length($barcode); |
200 | 202 |
201 push @barcodes, [$ident, $barcode]; | 203 push @barcodes, [$ident, $barcode]; |
202 | 204 |
203 if ($allow_partial_overlap>0) { | 205 if ($allow_partial_overlap>0) { |
204 foreach my $i (1 .. $allow_partial_overlap) { | 206 foreach my $i (1 .. $allow_partial_overlap) { |
205 substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; | 207 substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; |
206 push @barcodes, [$ident, $barcode]; | 208 push @barcodes, [$ident, $barcode]; |
207 } | 209 } |
208 } | 210 } |
209 } | 211 } |
210 close BCFILE; | 212 close BCFILE; |
211 | 213 |
212 if ($debug) { | 214 if ($debug) { |
213 print STDERR "barcode\tsequence\n"; | 215 print STDERR "barcode\tsequence\n"; |
214 foreach my $barcoderef (@barcodes) { | 216 foreach my $barcoderef (@barcodes) { |
215 my ($ident, $seq) = @{$barcoderef}; | 217 my ($ident, $seq) = @{$barcoderef}; |
216 print STDERR $ident,"\t", $seq ,"\n"; | 218 print STDERR $ident,"\t", $seq ,"\n"; |
217 } | 219 } |
218 } | 220 } |
219 } | 221 } |
220 | 222 |
221 # Create one output file for each barcode. | 223 # Create one output file for each barcode. |
222 # (Also create a file for the dummy 'unmatched' barcode) | 224 # (Also create a file for the dummy 'unmatched' barcode) |
223 sub create_output_files { | 225 sub create_output_files { |
224 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | 226 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; |
225 $barcodes{'unmatched'} = 1 ; | 227 $barcodes{'unmatched'} = 1 ; |
226 | 228 |
227 foreach my $ident (keys %barcodes) { | 229 foreach my $ident (keys %barcodes) { |
228 my $new_filename = $newfile_prefix . $ident . $newfile_suffix; | 230 my $new_filename = $newfile_prefix . $ident . $newfile_suffix; |
229 $filenames{$ident} = $new_filename; | 231 $filenames{$ident} = $new_filename; |
230 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; | 232 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; |
231 $files{$ident} = $file ; | 233 $files{$ident} = $file ; |
232 } | 234 } |
233 } | 235 } |
234 | 236 |
235 sub match_sequences { | 237 sub match_sequences { |
236 | 238 |
237 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | 239 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; |
238 $barcodes{'unmatched'} = 1 ; | 240 $barcodes{'unmatched'} = 1 ; |
239 | 241 |
240 #reset counters | 242 #reset counters |
241 foreach my $ident ( keys %barcodes ) { | 243 foreach my $ident ( keys %barcodes ) { |
242 $counts{$ident} = 0; | 244 $counts{$ident} = 0; |
243 } | 245 } |
244 | 246 |
245 create_output_files; | 247 create_output_files; |
246 | 248 |
247 # Read file FASTQ file | 249 # Read file FASTQ file |
248 # split accotding to barcodes | 250 # split accotding to barcodes |
249 while ( read_record ) { | 251 while ( read_record ) { |
250 chomp $seq_name; | 252 chomp $seq_name; |
251 chomp $seq_bases; | 253 chomp $seq_bases; |
252 if (defined $index_read_file) { | 254 if (defined $index_read_file) { |
253 read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n"; | 255 read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n"; |
254 chomp $index_seq_name; | 256 chomp $index_seq_name; |
255 chomp $index_seq_bases; | 257 chomp $index_seq_bases; |
256 | 258 |
257 # Assert that the read ids match | 259 # Assert that the read ids match |
258 my $seq_name_match = &strip_read_id($seq_name); | 260 my $seq_name_match = &strip_read_id($seq_name); |
259 my $index_seq_name_match = &strip_read_id($index_seq_name); | 261 my $index_seq_name_match = &strip_read_id($index_seq_name); |
260 if ($seq_name_match ne $index_seq_name_match) { | 262 if ($seq_name_match ne $index_seq_name_match) { |
261 die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n"; | 263 die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n"; |
262 } | 264 } |
263 | 265 |
264 } | 266 } |
265 | 267 |
266 print STDERR "sequence $seq_bases: \n" if $debug; | 268 print STDERR "sequence $seq_bases: \n" if $debug; |
267 | 269 |
268 my $best_barcode_mismatches_count = $barcodes_length; | 270 my $best_barcode_mismatches_count = $barcodes_length; |
269 my $best_barcode_ident = undef; | 271 my $best_barcode_ident = undef; |
270 | 272 |
271 #Try all barcodes, find the one with the lowest mismatch count | 273 #Try all barcodes, find the one with the lowest mismatch count |
272 foreach my $barcoderef (@barcodes) { | 274 foreach my $barcoderef (@barcodes) { |
273 my ($ident, $barcode) = @{$barcoderef}; | 275 my ($ident, $barcode) = @{$barcoderef}; |
274 | 276 |
275 # Get DNA fragment (in the length of the barcodes) | 277 # Get DNA fragment (in the length of the barcodes) |
276 # The barcode will be tested only against this fragment | 278 # The barcode will be tested only against this fragment |
277 # (no point in testing the barcode against the whole sequence) | 279 # (no point in testing the barcode against the whole sequence) |
278 my $sequence_fragment; | 280 my $sequence_fragment; |
279 if ($barcodes_at_bol) { | 281 if ($barcodes_at_bol) { |
280 $sequence_fragment = substr $seq_bases, 0, $barcodes_length; | 282 $sequence_fragment = substr $seq_bases, 0, $barcodes_length; |
281 } elsif ($barcodes_at_eol) { | 283 } elsif ($barcodes_at_eol) { |
282 $sequence_fragment = substr $seq_bases, - $barcodes_length; | 284 $sequence_fragment = substr $seq_bases, - $barcodes_length; |
283 } else { | 285 } else { |
284 $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length; | 286 $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length; |
285 } | 287 } |
286 | 288 |
287 my $mm = mismatch_count($sequence_fragment, $barcode) ; | 289 my $mm = mismatch_count($sequence_fragment, $barcode) ; |
288 | 290 |
289 # if this is a partial match, add the non-overlap as a mismatch | 291 # if this is a partial match, add the non-overlap as a mismatch |
290 # (partial barcodes are shorter than the length of the original barcodes) | 292 # (partial barcodes are shorter than the length of the original barcodes) |
291 $mm += ($barcodes_length - length($barcode)); | 293 $mm += ($barcodes_length - length($barcode)); |
292 | 294 |
293 if ( $mm < $best_barcode_mismatches_count ) { | 295 if ( $mm < $best_barcode_mismatches_count ) { |
294 $best_barcode_mismatches_count = $mm ; | 296 $best_barcode_mismatches_count = $mm ; |
295 $best_barcode_ident = $ident ; | 297 $best_barcode_ident = $ident ; |
296 } | 298 } |
297 } | 299 } |
298 | 300 |
299 $best_barcode_ident = 'unmatched' | 301 $best_barcode_ident = 'unmatched' |
300 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; | 302 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; |
301 | 303 |
302 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; | 304 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; |
303 | 305 |
304 $counts{$best_barcode_ident}++; | 306 $counts{$best_barcode_ident}++; |
305 | 307 |
306 #get the file associated with the matched barcode. | 308 #get the file associated with the matched barcode. |
307 #(note: there's also a file associated with 'unmatched' barcode) | 309 #(note: there's also a file associated with 'unmatched' barcode) |
308 my $file = $files{$best_barcode_ident}; | 310 my $file = $files{$best_barcode_ident}; |
309 | 311 |
310 write_record($file); | 312 write_record($file); |
311 } | 313 } |
312 } | 314 } |
313 | 315 |
314 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc. | 316 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc. |
315 sub strip_read_id { | 317 sub strip_read_id { |
316 my $read_id = shift; | 318 my $read_id = shift; |
317 my $stripped_read_id = $read_id; | 319 my $stripped_read_id = $read_id; |
318 if ($read_id_check_strip_characters) { | 320 if ($read_id_check_strip_characters) { |
319 if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+ | 321 if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+ |
320 my @parts = split(/ /,$read_id); | 322 my @parts = split(/ /,$read_id); |
321 $stripped_read_id = $parts[0]; | 323 $stripped_read_id = $parts[0]; |
322 } else { # CASAVA 1.7 and earlier | 324 } else { # CASAVA 1.7 and earlier |
323 $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters); | 325 $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters); |
324 } | 326 } |
325 } | 327 } |
326 return $stripped_read_id; | 328 return $stripped_read_id; |
327 } | 329 } |
328 | 330 |
329 | 331 |
330 #Quickly calculate hamming distance between two strings | 332 #Quickly calculate hamming distance between two strings |
331 # | 333 # |
336 | 338 |
337 | 339 |
338 | 340 |
339 sub print_results | 341 sub print_results |
340 { | 342 { |
341 print "Barcode\tCount\tLocation\n"; | 343 print "# Barcode\tCount\n"; |
342 my $total = 0 ; | 344 my $total = 0 ; |
343 foreach my $ident (sort keys %counts) { | 345 foreach my $ident (sort keys %counts) { |
344 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; | 346 print $ident, "\t", $counts{$ident},"\n"; |
345 $total += $counts{$ident}; | 347 $total += $counts{$ident}; |
346 } | 348 } |
347 print "total\t",$total,"\n"; | 349 print "total\t",$total,"\n"; |
348 } | 350 } |
349 | 351 |
350 | 352 |
351 sub read_record | 353 sub read_record |
352 { | 354 { |
353 $seq_name = $input_file_io->getline(); | 355 $seq_name = $input_file_io->getline(); |
354 | 356 |
355 return undef unless defined $seq_name; # End of file? | 357 return undef unless defined $seq_name; # End of file? |
356 | 358 |
357 $seq_bases = $input_file_io->getline(); | 359 $seq_bases = $input_file_io->getline(); |
358 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; | 360 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; |
359 | 361 |
360 # If using FASTQ format, read two more lines | 362 # If using FASTQ format, read two more lines |
361 if ($fastq_format) { | 363 if ($fastq_format) { |
362 $seq_name2 = $input_file_io->getline(); | 364 $seq_name2 = $input_file_io->getline(); |
363 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; | 365 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; |
364 | 366 |
365 $seq_qualities = $input_file_io->getline(); | 367 $seq_qualities = $input_file_io->getline(); |
366 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; | 368 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; |
367 } | 369 } |
368 return 1; | 370 return 1; |
369 } | 371 } |
370 | 372 |
371 sub write_record($) | 373 sub write_record($) |
372 { | 374 { |
373 my $file = shift; | 375 my $file = shift; |
374 | 376 |
375 croak "Bad file handle" unless defined $file; | 377 croak "Bad file handle" unless defined $file; |
376 | 378 |
377 print $file $seq_name,"\n"; | 379 print $file $seq_name,"\n"; |
378 print $file $seq_bases,"\n"; | 380 print $file $seq_bases,"\n"; |
379 | 381 |
380 #if using FASTQ format, write two more lines | 382 #if using FASTQ format, write two more lines |
381 if ($fastq_format) { | 383 if ($fastq_format) { |
382 print $file $seq_name2; | 384 print $file $seq_name2; |
383 print $file $seq_qualities; | 385 print $file $seq_qualities; |
384 } | 386 } |
385 } | 387 } |
386 | 388 |
387 sub open_and_detect_input_format | 389 sub open_and_detect_input_format |
388 { | 390 { |
389 $input_file_io = new IO::Handle; | 391 $input_file_io = new IO::Handle; |
390 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); | 392 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); |
391 | 393 |
392 # Get the first characeter, and push it back | 394 # Get the first characeter, and push it back |
393 my $first_char = $input_file_io->getc(); | 395 my $first_char = $input_file_io->getc(); |
394 $input_file_io->ungetc(ord $first_char); | 396 $input_file_io->ungetc(ord $first_char); |
395 | 397 |
396 if ($first_char eq '>') { | 398 if ($first_char eq '>') { |
397 # FASTA format | 399 # FASTA format |
398 $fastq_format = 0 ; | 400 $fastq_format = 0 ; |
399 print STDERR "Detected FASTA format\n" if $debug; | 401 print STDERR "Detected FASTA format\n" if $debug; |
400 } elsif ($first_char eq '@') { | 402 } elsif ($first_char eq '@') { |
401 # FASTQ format | 403 # FASTQ format |
402 $fastq_format = 1; | 404 $fastq_format = 1; |
403 print STDERR "Detected FASTQ format\n" if $debug; | 405 print STDERR "Detected FASTQ format\n" if $debug; |
404 } else { | 406 } else { |
405 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; | 407 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; |
406 } | 408 } |
407 } | 409 } |
408 | 410 |
409 | 411 |
410 sub open_index_and_detect_input_format($) { | 412 sub open_index_and_detect_input_format($) { |
411 my $filename = shift or croak "Missing index read file name"; | 413 my $filename = shift or croak "Missing index read file name"; |
412 | 414 |
413 open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n"; | 415 open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n"; |
414 | 416 |
415 # Get the first line, and reset file pointer | 417 # Get the first line, and reset file pointer |
416 my $first_line = <IDXFILE>; | 418 my $first_line = <IDXFILE>; |
417 my $first_char = substr($first_line, 0, 1); | 419 my $first_char = substr($first_line, 0, 1); |
418 seek(IDXFILE, 0, 0); | 420 seek(IDXFILE, 0, 0); |
419 | 421 |
420 if ($first_char eq '>') { | 422 if ($first_char eq '>') { |
421 # FASTA format | 423 # FASTA format |
422 $index_fastq_format = 0 ; | 424 $index_fastq_format = 0 ; |
423 print STDERR "Detected FASTA format for index file\n" if $debug; | 425 print STDERR "Detected FASTA format for index file\n" if $debug; |
424 } elsif ($first_char eq '@') { | 426 } elsif ($first_char eq '@') { |
425 # FASTQ format | 427 # FASTQ format |
426 $index_fastq_format = 1; | 428 $index_fastq_format = 1; |
427 print STDERR "Detected FASTQ format for index file\n" if $debug; | 429 print STDERR "Detected FASTQ format for index file\n" if $debug; |
428 } else { | 430 } else { |
429 die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n"; | 431 die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n"; |
430 } | 432 } |
431 } | 433 } |
432 | 434 |
433 sub read_index_record | 435 sub read_index_record |
434 { | 436 { |
435 $index_seq_name = <IDXFILE>; | 437 $index_seq_name = <IDXFILE>; |
436 | 438 |
437 return undef unless defined $index_seq_name; # End of file? | 439 return undef unless defined $index_seq_name; # End of file? |
438 | 440 |
439 $index_seq_bases = <IDXFILE>; | 441 $index_seq_bases = <IDXFILE>; |
440 die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases; | 442 die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases; |
441 | 443 |
442 # If using FASTQ format, read two more lines | 444 # If using FASTQ format, read two more lines |
443 if ($index_fastq_format) { | 445 if ($index_fastq_format) { |
444 $index_seq_name2 = <IDXFILE>; | 446 $index_seq_name2 = <IDXFILE>; |
445 die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2; | 447 die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2; |
446 | 448 |
447 $index_seq_qualities = <IDXFILE>; | 449 $index_seq_qualities = <IDXFILE>; |
448 die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities; | 450 die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities; |
449 } | 451 } |
450 return 1; | 452 return 1; |
451 } | 453 } |
452 | 454 |
453 sub usage() | 455 sub usage() |
454 { | 456 { |
455 print<<EOF; | 457 print<<EOF; |
456 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 | 458 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 |
457 | 459 |
458 This program reads FASTA/FASTQ file and splits it into several smaller files, | 460 This program reads FASTA/FASTQ file and splits it into several smaller files, |
459 Based on barcode matching. | 461 Based on barcode matching. |
460 FASTA/FASTQ data is read from STDIN (format is auto-detected.) | 462 FASTA/FASTQ data is read from STDIN (format is auto-detected.) |
461 Output files will be writen to disk. | 463 Output files will be writen to disk. |
462 Summary will be printed to STDOUT. | 464 Summary will be printed to STDOUT. |
463 | 465 |
464 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] | 466 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] |
465 [--mismatches N] [--exact] [--partial N] [--idxidstrip N] | 467 [--mismatches N] [--exact] [--partial N] [--idxidstrip N] |
466 [--help] [--quiet] [--debug] | 468 [--help] [--quiet] [--debug] |
467 | 469 |
468 Arguments: | 470 Arguments: |
469 | 471 |
470 --bcfile FILE - Barcodes file name. (see explanation below.) | 472 --bcfile FILE - Barcodes file name. (see explanation below.) |
471 --prefix PREFIX - File prefix. will be added to the output files. Can be used | 473 --prefix PREFIX - File prefix. will be added to the output files. Can be used |
472 to specify output directories. | 474 to specify output directories. |
473 --suffix SUFFIX - File suffix (optional). Can be used to specify file | 475 --suffix SUFFIX - File suffix (optional). Can be used to specify file |
474 extensions. | 476 extensions. |
475 --bol - Try to match barcodes at the BEGINNING of sequences. | 477 --bol - Try to match barcodes at the BEGINNING of sequences. |
476 (What biologists would call the 5' end, and programmers | 478 (What biologists would call the 5' end, and programmers |
477 would call index 0.) | 479 would call index 0.) |
478 --eol - Try to match barcodes at the END of sequences. | 480 --eol - Try to match barcodes at the END of sequences. |
479 (What biologists would call the 3' end, and programmers | 481 (What biologists would call the 3' end, and programmers |
480 would call the end of the string.) | 482 would call the end of the string.) |
481 --idxfile FILE - Read barcodes from separate index file (fasta or fastq) | 483 --idxfile FILE - Read barcodes from separate index file (fasta or fastq) |
482 NOTE: one of --bol, --eol, --idxfile must be specified, | 484 NOTE: one of --bol, --eol, --idxfile must be specified, |
483 but not more than one. | 485 but not more than one. |
484 --idxidstrip N - When using index file, strip this number of characters | 486 --idxidstrip N - When using index file, strip this number of characters |
485 from the end of the sequence id before matching. | 487 from the end of the sequence id before matching. |
486 Automatically detects CASAVA 1.8 format and strips at a | 488 Automatically detects CASAVA 1.8 format and strips at a |
487 space in the id, use 0 to disable this. | 489 space in the id, use 0 to disable this. |
488 (Default is 1). | 490 (Default is 1). |
489 --mismatches N - Max. number of mismatches allowed. default is 1. | 491 --mismatches N - Max. number of mismatches allowed. default is 1. |
490 --exact - Same as '--mismatches 0'. If both --exact and --mismatches | 492 --exact - Same as '--mismatches 0'. If both --exact and --mismatches |
491 are specified, '--exact' takes precedence. | 493 are specified, '--exact' takes precedence. |
492 --partial N - Allow partial overlap of barcodes. (see explanation below.) | 494 --partial N - Allow partial overlap of barcodes. (see explanation below.) |
493 (Default is not partial matching) | 495 (Default is not partial matching) |
494 --quiet - Don't print counts and summary at the end of the run. | 496 --quiet - Don't print counts and summary at the end of the run. |
495 (Default is to print.) | 497 (Default is to print.) |
496 --debug - Print lots of useless debug information to STDERR. | 498 --debug - Print lots of useless debug information to STDERR. |
497 --help - This helpful help screen. | 499 --help - This helpful help screen. |
498 | 500 |
499 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is | 501 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is |
500 the barcodes file): | 502 the barcodes file): |
501 | 503 |
502 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ | 504 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ |
503 --prefix /tmp/bla_ --suffix ".txt" | 505 --prefix /tmp/bla_ --suffix ".txt" |
504 | 506 |
505 Barcode file format | 507 Barcode file format |
506 ------------------- | 508 ------------------- |
507 Barcode files are simple text files. Each line should contain an identifier | 509 Barcode files are simple text files. Each line should contain an identifier |
508 (descriptive name for the barcode), and the barcode itself (A/C/G/T), | 510 (descriptive name for the barcode), and the barcode itself (A/C/G/T), |
509 separated by a TAB character. Example: | 511 separated by a TAB character. Example: |
510 | 512 |
511 #This line is a comment (starts with a 'number' sign) | 513 #This line is a comment (starts with a 'number' sign) |
512 BC1 GATCT | 514 BC1 GATCT |
513 BC2 ATCGT | 515 BC2 ATCGT |
514 BC3 GTGAT | 516 BC3 GTGAT |
515 BC4 TGTCT | 517 BC4 TGTCT |
516 | 518 |
517 For each barcode, a new FASTQ file will be created (with the barcode's | 519 For each barcode, a new FASTQ file will be created (with the barcode's |
518 identifier as part of the file name). Sequences matching the barcode | 520 identifier as part of the file name). Sequences matching the barcode |
519 will be stored in the appropriate file. | 521 will be stored in the appropriate file. |
520 | 522 |
521 Running the above example (assuming "mybarcodes.txt" contains the above | 523 Running the above example (assuming "mybarcodes.txt" contains the above |
522 barcodes), will create the following files: | 524 barcodes), will create the following files: |
523 /tmp/bla_BC1.txt | 525 /tmp/bla_BC1.txt |
524 /tmp/bla_BC2.txt | 526 /tmp/bla_BC2.txt |
525 /tmp/bla_BC3.txt | 527 /tmp/bla_BC3.txt |
526 /tmp/bla_BC4.txt | 528 /tmp/bla_BC4.txt |
527 /tmp/bla_unmatched.txt | 529 /tmp/bla_unmatched.txt |
528 The 'unmatched' file will contain all sequences that didn't match any barcode. | 530 The 'unmatched' file will contain all sequences that didn't match any barcode. |
529 | 531 |
530 Barcode matching | 532 Barcode matching |
531 ---------------- | 533 ---------------- |
532 | 534 |
546 ATCGT (4 mismatches, BC2) | 548 ATCGT (4 mismatches, BC2) |
547 GTGAT (3 mismatches, BC3) | 549 GTGAT (3 mismatches, BC3) |
548 TGTCT (3 mismatches, BC4) | 550 TGTCT (3 mismatches, BC4) |
549 | 551 |
550 This sequence will be classified as 'BC1' (it has the lowest mismatch count). | 552 This sequence will be classified as 'BC1' (it has the lowest mismatch count). |
551 If '--exact' or '--mismatches 0' were specified, this sequence would be | 553 If '--exact' or '--mismatches 0' were specified, this sequence would be |
552 classified as 'unmatched' (because, although BC1 had the lowest mismatch count, | 554 classified as 'unmatched' (because, although BC1 had the lowest mismatch count, |
553 it is above the maximum allowed mismatches). | 555 it is above the maximum allowed mismatches). |
554 | 556 |
555 Matching with '--eol' (end of line) does the same, but from the other side | 557 Matching with '--eol' (end of line) does the same, but from the other side |
556 of the sequence. | 558 of the sequence. |
572 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | 574 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG |
573 GATCT (1 mismatch) | 575 GATCT (1 mismatch) |
574 | 576 |
575 Note: scoring counts a missing base as a mismatch, so the final | 577 Note: scoring counts a missing base as a mismatch, so the final |
576 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). | 578 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). |
577 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this | 579 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this |
578 seqeunce will be classified as BC1. | 580 seqeunce will be classified as BC1. |
579 | 581 |
580 EOF | 582 EOF |
581 | 583 |
582 exit 1; | 584 exit 1; |