comparison fastx_barcode_splitter.pl @ 0:84bbf4fd24c3 draft

Initial toolshed version with support for separate index reads and automatic loading of results into Galaxy history.
author lparsons
date Fri, 08 Nov 2013 09:53:39 -0500
parents
children e7b7cdc1834d
comparison
equal deleted inserted replaced
-1:000000000000 0:84bbf4fd24c3
1 #!/usr/bin/perl
2
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
5 #
6 # Lance Parsons (lparsons@princeton.edu)
7 # 3/21/2011 - Modified to accept separate index file for barcodes
8 # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy)
9 #
10 # This program is free software: you can redistribute it and/or modify
11 # it under the terms of the GNU Affero General Public License as
12 # published by the Free Software Foundation, either version 3 of the
13 # License, or (at your option) any later version.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU Affero General Public License for more details.
19 #
20 # You should have received a copy of the GNU Affero General Public License
21 # along with this program. If not, see <http://www.gnu.org/licenses/>.
22
23 use strict;
24 use warnings;
25 use IO::Handle;
26 use Data::Dumper;
27 use Getopt::Long;
28 use Carp;
29
30 ##
31 ## This program splits a FASTQ/FASTA file into several smaller files,
32 ## Based on barcode matching.
33 ##
34 ## run with "--help" for usage information
35 ##
36 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008
37
38 # Forward declarations
39 sub load_barcode_file ($);
40 sub parse_command_line ;
41 sub match_sequences ;
42 sub mismatch_count($$) ;
43 sub print_results;
44 sub open_and_detect_input_format;
45 sub open_index_and_detect_input_format($);
46 sub read_index_record;
47 sub read_record;
48 sub write_record($);
49 sub usage();
50
51 # Global flags and arguments,
52 # Set by command line argumens
53 my $barcode_file ;
54 my $barcodes_at_eol = 0 ;
55 my $barcodes_at_bol = 0 ;
56 my $index_read_file ;
57 my $exact_match = 0 ;
58 my $allow_partial_overlap = 0;
59 my $allowed_mismatches = 1;
60 my $newfile_suffix = '';
61 my $newfile_prefix ;
62 my $quiet = 0 ;
63 my $debug = 0 ;
64 my $fastq_format = 1;
65 my $index_fastq_format = 1;
66 my $read_id_check_strip_characters = 1;
67
68 # Global variables
69 # Populated by 'create_output_files'
70 my %filenames;
71 my %files;
72 my %counts = ( 'unmatched' => 0 );
73 my $barcodes_length;
74 my @barcodes;
75 my $input_file_io;
76
77
78 # The Four lines per record in FASTQ format.
79 # (when using FASTA format, only the first two are used)
80 my $seq_name;
81 my $seq_bases;
82 my $seq_name2;
83 my $seq_qualities;
84
85 # Values used for index read file
86 my $index_seq_name;
87 my $index_seq_bases;
88 my $index_seq_name2;
89 my $index_seq_qualities;
90
91
92
93 #
94 # Start of Program
95 #
96 parse_command_line ;
97
98 load_barcode_file ( $barcode_file ) ;
99
100 open_and_detect_input_format;
101
102 if (defined $index_read_file) {open_index_and_detect_input_format ( $index_read_file );}
103
104 match_sequences ;
105
106 print_results unless $quiet;
107
108 #
109 # End of program
110 #
111
112
113
114
115
116
117
118
119 sub parse_command_line {
120 my $help;
121
122 usage() if (scalar @ARGV==0);
123
124 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
125 "eol" => \$barcodes_at_eol,
126 "bol" => \$barcodes_at_bol,
127 "idxfile=s" => \$index_read_file,
128 "idxidstrip=i" => \$read_id_check_strip_characters,
129 "exact" => \$exact_match,
130 "prefix=s" => \$newfile_prefix,
131 "suffix=s" => \$newfile_suffix,
132 "quiet" => \$quiet,
133 "partial=i" => \$allow_partial_overlap,
134 "debug" => \$debug,
135 "mismatches=i" => \$allowed_mismatches,
136 "help" => \$help
137 ) ;
138
139 usage() if ($help);
140
141 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
142 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
143
144 if (! defined $index_read_file) {
145 if ($barcodes_at_bol == $barcodes_at_eol) {
146 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
147 die "Error: must specify either --eol or --bol or --idxfile\n" ;
148 }
149 }
150 elsif ($barcodes_at_bol || $barcodes_at_eol) {
151 die "Error: Must specify only one of --idxfile, --eol, or --bol";
152 }
153
154 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
155
156 $allowed_mismatches = 0 if $exact_match;
157
158 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
159
160 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
161 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
162
163
164 exit unless $result;
165 }
166
167
168
169 #
170 # Read the barcode file
171 #
172 sub load_barcode_file ($) {
173 my $filename = shift or croak "Missing barcode file name";
174
175 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
176 while (<BCFILE>) {
177 next if m/^#/;
178 chomp;
179 my ($ident, $barcode) = split('\t') ;
180
181 $barcode = uc($barcode);
182
183 # Sanity checks on the barcodes
184 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
185 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
186 unless $barcode =~ m/^[AGCT]+$/;
187
188 # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-')
189 $ident =~ s/[^A-Za-z0-9]/-/g;
190 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
191 unless $ident =~ m/^[A-Za-z0-9-]+$/;
192
193 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
194 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
195 if length($barcode)<=$allowed_mismatches;
196
197 $barcodes_length = length($barcode) unless defined $barcodes_length;
198 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
199 unless $barcodes_length == length($barcode);
200
201 push @barcodes, [$ident, $barcode];
202
203 if ($allow_partial_overlap>0) {
204 foreach my $i (1 .. $allow_partial_overlap) {
205 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
206 push @barcodes, [$ident, $barcode];
207 }
208 }
209 }
210 close BCFILE;
211
212 if ($debug) {
213 print STDERR "barcode\tsequence\n";
214 foreach my $barcoderef (@barcodes) {
215 my ($ident, $seq) = @{$barcoderef};
216 print STDERR $ident,"\t", $seq ,"\n";
217 }
218 }
219 }
220
221 # Create one output file for each barcode.
222 # (Also create a file for the dummy 'unmatched' barcode)
223 sub create_output_files {
224 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
225 $barcodes{'unmatched'} = 1 ;
226
227 foreach my $ident (keys %barcodes) {
228 my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
229 $filenames{$ident} = $new_filename;
230 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
231 $files{$ident} = $file ;
232 }
233 }
234
235 sub match_sequences {
236
237 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
238 $barcodes{'unmatched'} = 1 ;
239
240 #reset counters
241 foreach my $ident ( keys %barcodes ) {
242 $counts{$ident} = 0;
243 }
244
245 create_output_files;
246
247 # Read file FASTQ file
248 # split accotding to barcodes
249 while ( read_record ) {
250 chomp $seq_name;
251 chomp $seq_bases;
252 if (defined $index_read_file) {
253 read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n";
254 chomp $index_seq_name;
255 chomp $index_seq_bases;
256
257 # Assert that the read ids match
258 my $seq_name_match = &strip_read_id($seq_name);
259 my $index_seq_name_match = &strip_read_id($index_seq_name);
260 if ($seq_name_match ne $index_seq_name_match) {
261 die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n";
262 }
263
264 }
265
266 print STDERR "sequence $seq_bases: \n" if $debug;
267
268 my $best_barcode_mismatches_count = $barcodes_length;
269 my $best_barcode_ident = undef;
270
271 #Try all barcodes, find the one with the lowest mismatch count
272 foreach my $barcoderef (@barcodes) {
273 my ($ident, $barcode) = @{$barcoderef};
274
275 # Get DNA fragment (in the length of the barcodes)
276 # The barcode will be tested only against this fragment
277 # (no point in testing the barcode against the whole sequence)
278 my $sequence_fragment;
279 if ($barcodes_at_bol) {
280 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
281 } elsif ($barcodes_at_eol) {
282 $sequence_fragment = substr $seq_bases, - $barcodes_length;
283 } else {
284 $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length;
285 }
286
287 my $mm = mismatch_count($sequence_fragment, $barcode) ;
288
289 # if this is a partial match, add the non-overlap as a mismatch
290 # (partial barcodes are shorter than the length of the original barcodes)
291 $mm += ($barcodes_length - length($barcode));
292
293 if ( $mm < $best_barcode_mismatches_count ) {
294 $best_barcode_mismatches_count = $mm ;
295 $best_barcode_ident = $ident ;
296 }
297 }
298
299 $best_barcode_ident = 'unmatched'
300 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
301
302 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
303
304 $counts{$best_barcode_ident}++;
305
306 #get the file associated with the matched barcode.
307 #(note: there's also a file associated with 'unmatched' barcode)
308 my $file = $files{$best_barcode_ident};
309
310 write_record($file);
311 }
312 }
313
314 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc.
315 sub strip_read_id {
316 my $read_id = shift;
317 my $stripped_read_id = $read_id;
318 if ($read_id_check_strip_characters) {
319 if ($read_id =~ /@([a-zA-Z0-9_-]+):([0-9]+):([a-zA-Z0-9]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+
320 my @parts = split(/ /,$read_id);
321 $stripped_read_id = $parts[0];
322 } else { # CASAVA 1.7 and earlier
323 $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters);
324 }
325 }
326 return $stripped_read_id;
327 }
328
329
330 #Quickly calculate hamming distance between two strings
331 #
332 #NOTE: Strings must be same length.
333 # returns number of different characters.
334 #see http://www.perlmonks.org/?node_id=500235
335 sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
336
337
338
339 sub print_results
340 {
341 print "Barcode\tCount\tLocation\n";
342 my $total = 0 ;
343 foreach my $ident (sort keys %counts) {
344 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
345 $total += $counts{$ident};
346 }
347 print "total\t",$total,"\n";
348 }
349
350
351 sub read_record
352 {
353 $seq_name = $input_file_io->getline();
354
355 return undef unless defined $seq_name; # End of file?
356
357 $seq_bases = $input_file_io->getline();
358 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
359
360 # If using FASTQ format, read two more lines
361 if ($fastq_format) {
362 $seq_name2 = $input_file_io->getline();
363 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
364
365 $seq_qualities = $input_file_io->getline();
366 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
367 }
368 return 1;
369 }
370
371 sub write_record($)
372 {
373 my $file = shift;
374
375 croak "Bad file handle" unless defined $file;
376
377 print $file $seq_name,"\n";
378 print $file $seq_bases,"\n";
379
380 #if using FASTQ format, write two more lines
381 if ($fastq_format) {
382 print $file $seq_name2;
383 print $file $seq_qualities;
384 }
385 }
386
387 sub open_and_detect_input_format
388 {
389 $input_file_io = new IO::Handle;
390 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
391
392 # Get the first characeter, and push it back
393 my $first_char = $input_file_io->getc();
394 $input_file_io->ungetc(ord $first_char);
395
396 if ($first_char eq '>') {
397 # FASTA format
398 $fastq_format = 0 ;
399 print STDERR "Detected FASTA format\n" if $debug;
400 } elsif ($first_char eq '@') {
401 # FASTQ format
402 $fastq_format = 1;
403 print STDERR "Detected FASTQ format\n" if $debug;
404 } else {
405 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
406 }
407 }
408
409
410 sub open_index_and_detect_input_format($) {
411 my $filename = shift or croak "Missing index read file name";
412
413 open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n";
414
415 # Get the first line, and reset file pointer
416 my $first_line = <IDXFILE>;
417 my $first_char = substr($first_line, 0, 1);
418 seek(IDXFILE, 0, 0);
419
420 if ($first_char eq '>') {
421 # FASTA format
422 $index_fastq_format = 0 ;
423 print STDERR "Detected FASTA format for index file\n" if $debug;
424 } elsif ($first_char eq '@') {
425 # FASTQ format
426 $index_fastq_format = 1;
427 print STDERR "Detected FASTQ format for index file\n" if $debug;
428 } else {
429 die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n";
430 }
431 }
432
433 sub read_index_record
434 {
435 $index_seq_name = <IDXFILE>;
436
437 return undef unless defined $index_seq_name; # End of file?
438
439 $index_seq_bases = <IDXFILE>;
440 die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases;
441
442 # If using FASTQ format, read two more lines
443 if ($index_fastq_format) {
444 $index_seq_name2 = <IDXFILE>;
445 die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2;
446
447 $index_seq_qualities = <IDXFILE>;
448 die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities;
449 }
450 return 1;
451 }
452
453 sub usage()
454 {
455 print<<EOF;
456 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
457
458 This program reads FASTA/FASTQ file and splits it into several smaller files,
459 Based on barcode matching.
460 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
461 Output files will be writen to disk.
462 Summary will be printed to STDOUT.
463
464 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile]
465 [--mismatches N] [--exact] [--partial N] [--idxidstrip N]
466 [--help] [--quiet] [--debug]
467
468 Arguments:
469
470 --bcfile FILE - Barcodes file name. (see explanation below.)
471 --prefix PREFIX - File prefix. will be added to the output files. Can be used
472 to specify output directories.
473 --suffix SUFFIX - File suffix (optional). Can be used to specify file
474 extensions.
475 --bol - Try to match barcodes at the BEGINNING of sequences.
476 (What biologists would call the 5' end, and programmers
477 would call index 0.)
478 --eol - Try to match barcodes at the END of sequences.
479 (What biologists would call the 3' end, and programmers
480 would call the end of the string.)
481 --idxfile FILE - Read barcodes from separate index file (fasta or fastq)
482 NOTE: one of --bol, --eol, --idxfile must be specified,
483 but not more than one.
484 --idxidstrip N - When using index file, strip this number of characters
485 from the end of the sequence id before matching.
486 Automatically detects CASAVA 1.8 format and strips at a
487 space in the id, use 0 to disable this.
488 (Default is 1).
489 --mismatches N - Max. number of mismatches allowed. default is 1.
490 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
491 are specified, '--exact' takes precedence.
492 --partial N - Allow partial overlap of barcodes. (see explanation below.)
493 (Default is not partial matching)
494 --quiet - Don't print counts and summary at the end of the run.
495 (Default is to print.)
496 --debug - Print lots of useless debug information to STDERR.
497 --help - This helpful help screen.
498
499 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
500 the barcodes file):
501
502 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
503 --prefix /tmp/bla_ --suffix ".txt"
504
505 Barcode file format
506 -------------------
507 Barcode files are simple text files. Each line should contain an identifier
508 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
509 separated by a TAB character. Example:
510
511 #This line is a comment (starts with a 'number' sign)
512 BC1 GATCT
513 BC2 ATCGT
514 BC3 GTGAT
515 BC4 TGTCT
516
517 For each barcode, a new FASTQ file will be created (with the barcode's
518 identifier as part of the file name). Sequences matching the barcode
519 will be stored in the appropriate file.
520
521 Running the above example (assuming "mybarcodes.txt" contains the above
522 barcodes), will create the following files:
523 /tmp/bla_BC1.txt
524 /tmp/bla_BC2.txt
525 /tmp/bla_BC3.txt
526 /tmp/bla_BC4.txt
527 /tmp/bla_unmatched.txt
528 The 'unmatched' file will contain all sequences that didn't match any barcode.
529
530 Barcode matching
531 ----------------
532
533 ** Without partial matching:
534
535 Count mismatches between the FASTA/Q sequences and the barcodes.
536 The barcode which matched with the lowest mismatches count (providing the
537 count is small or equal to '--mismatches N') 'gets' the sequences.
538
539 Example (using the above barcodes):
540 Input Sequence:
541 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
542
543 Matching with '--bol --mismatches 1':
544 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
545 GATCT (1 mismatch, BC1)
546 ATCGT (4 mismatches, BC2)
547 GTGAT (3 mismatches, BC3)
548 TGTCT (3 mismatches, BC4)
549
550 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
551 If '--exact' or '--mismatches 0' were specified, this sequence would be
552 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
553 it is above the maximum allowed mismatches).
554
555 Matching with '--eol' (end of line) does the same, but from the other side
556 of the sequence.
557
558 ** With partial matching (very similar to indels):
559
560 Same as above, with the following addition: barcodes are also checked for
561 partial overlap (number of allowed non-overlapping bases is '--partial N').
562
563 Example:
564 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
565 (Same as above, but note the missing 'G' at the beginning.)
566
567 Matching (without partial overlapping) against BC1 yields 4 mismatches:
568 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
569 GATCT (4 mismatches)
570
571 Partial overlapping would also try the following match:
572 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
573 GATCT (1 mismatch)
574
575 Note: scoring counts a missing base as a mismatch, so the final
576 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
577 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
578 seqeunce will be classified as BC1.
579
580 EOF
581
582 exit 1;
583 }