comparison fastx_barcode_splitter.pl @ 4:0fb7e9130a70 draft default tip

planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/fastx_barcode_splitter_enhanced commit 460463a5406419fe8e113467bdb8bd093d21e7c5
author lparsons
date Mon, 02 May 2016 17:04:32 -0400
parents e7b7cdc1834d
children
comparison
equal deleted inserted replaced
3:e7b7cdc1834d 4:0fb7e9130a70
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu) 4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
5 # 5 #
6 # Lance Parsons (lparsons@princeton.edu) 6 # Lance Parsons (lparsons@princeton.edu)
7 # 3/21/2011 - Modified to accept separate index file for barcodes 7 # 3/21/2011 - Modified to accept separate index file for barcodes
8 # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy) 8 # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy)
9 # 9 # 4/28/2016 - Modified summary output to remove file paths and add comment
10 # character '#'
11
10 # This program is free software: you can redistribute it and/or modify 12 # This program is free software: you can redistribute it and/or modify
11 # it under the terms of the GNU Affero General Public License as 13 # it under the terms of the GNU Affero General Public License as
12 # published by the Free Software Foundation, either version 3 of the 14 # published by the Free Software Foundation, either version 3 of the
13 # License, or (at your option) any later version. 15 # License, or (at your option) any later version.
14 # 16 #
46 sub read_index_record; 48 sub read_index_record;
47 sub read_record; 49 sub read_record;
48 sub write_record($); 50 sub write_record($);
49 sub usage(); 51 sub usage();
50 52
51 # Global flags and arguments, 53 # Global flags and arguments,
52 # Set by command line argumens 54 # Set by command line argumens
53 my $barcode_file ; 55 my $barcode_file ;
54 my $barcodes_at_eol = 0 ; 56 my $barcodes_at_eol = 0 ;
55 my $barcodes_at_bol = 0 ; 57 my $barcodes_at_bol = 0 ;
56 my $index_read_file ; 58 my $index_read_file ;
63 my $debug = 0 ; 65 my $debug = 0 ;
64 my $fastq_format = 1; 66 my $fastq_format = 1;
65 my $index_fastq_format = 1; 67 my $index_fastq_format = 1;
66 my $read_id_check_strip_characters = 1; 68 my $read_id_check_strip_characters = 1;
67 69
68 # Global variables 70 # Global variables
69 # Populated by 'create_output_files' 71 # Populated by 'create_output_files'
70 my %filenames; 72 my %filenames;
71 my %files; 73 my %files;
72 my %counts = ( 'unmatched' => 0 ); 74 my %counts = ( 'unmatched' => 0 );
73 my $barcodes_length; 75 my $barcodes_length;
115 117
116 118
117 119
118 120
119 sub parse_command_line { 121 sub parse_command_line {
120 my $help; 122 my $help;
121 123
122 usage() if (scalar @ARGV==0); 124 usage() if (scalar @ARGV==0);
123 125
124 my $result = GetOptions ( "bcfile=s" => \$barcode_file, 126 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
125 "eol" => \$barcodes_at_eol, 127 "eol" => \$barcodes_at_eol,
126 "bol" => \$barcodes_at_bol, 128 "bol" => \$barcodes_at_bol,
127 "idxfile=s" => \$index_read_file, 129 "idxfile=s" => \$index_read_file,
128 "idxidstrip=i" => \$read_id_check_strip_characters, 130 "idxidstrip=i" => \$read_id_check_strip_characters,
129 "exact" => \$exact_match, 131 "exact" => \$exact_match,
130 "prefix=s" => \$newfile_prefix, 132 "prefix=s" => \$newfile_prefix,
131 "suffix=s" => \$newfile_suffix, 133 "suffix=s" => \$newfile_suffix,
132 "quiet" => \$quiet, 134 "quiet" => \$quiet,
133 "partial=i" => \$allow_partial_overlap, 135 "partial=i" => \$allow_partial_overlap,
134 "debug" => \$debug, 136 "debug" => \$debug,
135 "mismatches=i" => \$allowed_mismatches, 137 "mismatches=i" => \$allowed_mismatches,
136 "help" => \$help 138 "help" => \$help
137 ) ; 139 ) ;
138 140
139 usage() if ($help); 141 usage() if ($help);
140 142
141 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; 143 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
142 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; 144 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
143 145
144 if (! defined $index_read_file) { 146 if (! defined $index_read_file) {
145 if ($barcodes_at_bol == $barcodes_at_eol) { 147 if ($barcodes_at_bol == $barcodes_at_eol) {
146 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; 148 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
147 die "Error: must specify either --eol or --bol or --idxfile\n" ; 149 die "Error: must specify either --eol or --bol or --idxfile\n" ;
148 } 150 }
149 } 151 }
150 elsif ($barcodes_at_bol || $barcodes_at_eol) { 152 elsif ($barcodes_at_bol || $barcodes_at_eol) {
151 die "Error: Must specify only one of --idxfile, --eol, or --bol"; 153 die "Error: Must specify only one of --idxfile, --eol, or --bol";
152 } 154 }
153 155
154 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; 156 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
155 157
156 $allowed_mismatches = 0 if $exact_match; 158 $allowed_mismatches = 0 if $exact_match;
157 159
158 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); 160 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
159 161
160 die "Error: partial overlap value ($allow_partial_overlap) bigger than " . 162 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
161 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); 163 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
162 164
163 165
164 exit unless $result; 166 exit unless $result;
165 } 167 }
166 168
167 169
168 170
169 # 171 #
170 # Read the barcode file 172 # Read the barcode file
171 # 173 #
172 sub load_barcode_file ($) { 174 sub load_barcode_file ($) {
173 my $filename = shift or croak "Missing barcode file name"; 175 my $filename = shift or croak "Missing barcode file name";
174 176
175 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; 177 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
176 while (<BCFILE>) { 178 while (<BCFILE>) {
177 next if m/^#/; 179 next if m/^#/;
178 chomp; 180 chomp;
179 my ($ident, $barcode) = split('\t') ; 181 my ($ident, $barcode) = split('\t') ;
180 182
181 $barcode = uc($barcode); 183 $barcode = uc($barcode);
182 184
183 # Sanity checks on the barcodes 185 # Sanity checks on the barcodes
184 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; 186 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
185 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" 187 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
186 unless $barcode =~ m/^[AGCT]+$/; 188 unless $barcode =~ m/^[AGCT]+$/;
187 189
188 # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-') 190 # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-')
189 $ident =~ s/[^A-Za-z0-9]/-/g; 191 $ident =~ s/[^A-Za-z0-9]/-/g;
190 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" 192 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
191 unless $ident =~ m/^[A-Za-z0-9-]+$/; 193 unless $ident =~ m/^[A-Za-z0-9-]+$/;
192 194
193 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . 195 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
194 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" 196 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
195 if length($barcode)<=$allowed_mismatches; 197 if length($barcode)<=$allowed_mismatches;
196 198
197 $barcodes_length = length($barcode) unless defined $barcodes_length; 199 $barcodes_length = length($barcode) unless defined $barcodes_length;
198 die "Error: found barcodes in different lengths. this feature is not supported yet.\n" 200 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
199 unless $barcodes_length == length($barcode); 201 unless $barcodes_length == length($barcode);
200 202
201 push @barcodes, [$ident, $barcode]; 203 push @barcodes, [$ident, $barcode];
202 204
203 if ($allow_partial_overlap>0) { 205 if ($allow_partial_overlap>0) {
204 foreach my $i (1 .. $allow_partial_overlap) { 206 foreach my $i (1 .. $allow_partial_overlap) {
205 substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; 207 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
206 push @barcodes, [$ident, $barcode]; 208 push @barcodes, [$ident, $barcode];
207 } 209 }
208 } 210 }
209 } 211 }
210 close BCFILE; 212 close BCFILE;
211 213
212 if ($debug) { 214 if ($debug) {
213 print STDERR "barcode\tsequence\n"; 215 print STDERR "barcode\tsequence\n";
214 foreach my $barcoderef (@barcodes) { 216 foreach my $barcoderef (@barcodes) {
215 my ($ident, $seq) = @{$barcoderef}; 217 my ($ident, $seq) = @{$barcoderef};
216 print STDERR $ident,"\t", $seq ,"\n"; 218 print STDERR $ident,"\t", $seq ,"\n";
217 } 219 }
218 } 220 }
219 } 221 }
220 222
221 # Create one output file for each barcode. 223 # Create one output file for each barcode.
222 # (Also create a file for the dummy 'unmatched' barcode) 224 # (Also create a file for the dummy 'unmatched' barcode)
223 sub create_output_files { 225 sub create_output_files {
224 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; 226 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
225 $barcodes{'unmatched'} = 1 ; 227 $barcodes{'unmatched'} = 1 ;
226 228
227 foreach my $ident (keys %barcodes) { 229 foreach my $ident (keys %barcodes) {
228 my $new_filename = $newfile_prefix . $ident . $newfile_suffix; 230 my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
229 $filenames{$ident} = $new_filename; 231 $filenames{$ident} = $new_filename;
230 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; 232 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
231 $files{$ident} = $file ; 233 $files{$ident} = $file ;
232 } 234 }
233 } 235 }
234 236
235 sub match_sequences { 237 sub match_sequences {
236 238
237 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; 239 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
238 $barcodes{'unmatched'} = 1 ; 240 $barcodes{'unmatched'} = 1 ;
239 241
240 #reset counters 242 #reset counters
241 foreach my $ident ( keys %barcodes ) { 243 foreach my $ident ( keys %barcodes ) {
242 $counts{$ident} = 0; 244 $counts{$ident} = 0;
243 } 245 }
244 246
245 create_output_files; 247 create_output_files;
246 248
247 # Read file FASTQ file 249 # Read file FASTQ file
248 # split accotding to barcodes 250 # split accotding to barcodes
249 while ( read_record ) { 251 while ( read_record ) {
250 chomp $seq_name; 252 chomp $seq_name;
251 chomp $seq_bases; 253 chomp $seq_bases;
252 if (defined $index_read_file) { 254 if (defined $index_read_file) {
253 read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n"; 255 read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n";
254 chomp $index_seq_name; 256 chomp $index_seq_name;
255 chomp $index_seq_bases; 257 chomp $index_seq_bases;
256 258
257 # Assert that the read ids match 259 # Assert that the read ids match
258 my $seq_name_match = &strip_read_id($seq_name); 260 my $seq_name_match = &strip_read_id($seq_name);
259 my $index_seq_name_match = &strip_read_id($index_seq_name); 261 my $index_seq_name_match = &strip_read_id($index_seq_name);
260 if ($seq_name_match ne $index_seq_name_match) { 262 if ($seq_name_match ne $index_seq_name_match) {
261 die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n"; 263 die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n";
262 } 264 }
263 265
264 } 266 }
265 267
266 print STDERR "sequence $seq_bases: \n" if $debug; 268 print STDERR "sequence $seq_bases: \n" if $debug;
267 269
268 my $best_barcode_mismatches_count = $barcodes_length; 270 my $best_barcode_mismatches_count = $barcodes_length;
269 my $best_barcode_ident = undef; 271 my $best_barcode_ident = undef;
270 272
271 #Try all barcodes, find the one with the lowest mismatch count 273 #Try all barcodes, find the one with the lowest mismatch count
272 foreach my $barcoderef (@barcodes) { 274 foreach my $barcoderef (@barcodes) {
273 my ($ident, $barcode) = @{$barcoderef}; 275 my ($ident, $barcode) = @{$barcoderef};
274 276
275 # Get DNA fragment (in the length of the barcodes) 277 # Get DNA fragment (in the length of the barcodes)
276 # The barcode will be tested only against this fragment 278 # The barcode will be tested only against this fragment
277 # (no point in testing the barcode against the whole sequence) 279 # (no point in testing the barcode against the whole sequence)
278 my $sequence_fragment; 280 my $sequence_fragment;
279 if ($barcodes_at_bol) { 281 if ($barcodes_at_bol) {
280 $sequence_fragment = substr $seq_bases, 0, $barcodes_length; 282 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
281 } elsif ($barcodes_at_eol) { 283 } elsif ($barcodes_at_eol) {
282 $sequence_fragment = substr $seq_bases, - $barcodes_length; 284 $sequence_fragment = substr $seq_bases, - $barcodes_length;
283 } else { 285 } else {
284 $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length; 286 $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length;
285 } 287 }
286 288
287 my $mm = mismatch_count($sequence_fragment, $barcode) ; 289 my $mm = mismatch_count($sequence_fragment, $barcode) ;
288 290
289 # if this is a partial match, add the non-overlap as a mismatch 291 # if this is a partial match, add the non-overlap as a mismatch
290 # (partial barcodes are shorter than the length of the original barcodes) 292 # (partial barcodes are shorter than the length of the original barcodes)
291 $mm += ($barcodes_length - length($barcode)); 293 $mm += ($barcodes_length - length($barcode));
292 294
293 if ( $mm < $best_barcode_mismatches_count ) { 295 if ( $mm < $best_barcode_mismatches_count ) {
294 $best_barcode_mismatches_count = $mm ; 296 $best_barcode_mismatches_count = $mm ;
295 $best_barcode_ident = $ident ; 297 $best_barcode_ident = $ident ;
296 } 298 }
297 } 299 }
298 300
299 $best_barcode_ident = 'unmatched' 301 $best_barcode_ident = 'unmatched'
300 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; 302 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
301 303
302 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; 304 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
303 305
304 $counts{$best_barcode_ident}++; 306 $counts{$best_barcode_ident}++;
305 307
306 #get the file associated with the matched barcode. 308 #get the file associated with the matched barcode.
307 #(note: there's also a file associated with 'unmatched' barcode) 309 #(note: there's also a file associated with 'unmatched' barcode)
308 my $file = $files{$best_barcode_ident}; 310 my $file = $files{$best_barcode_ident};
309 311
310 write_record($file); 312 write_record($file);
311 } 313 }
312 } 314 }
313 315
314 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc. 316 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc.
315 sub strip_read_id { 317 sub strip_read_id {
316 my $read_id = shift; 318 my $read_id = shift;
317 my $stripped_read_id = $read_id; 319 my $stripped_read_id = $read_id;
318 if ($read_id_check_strip_characters) { 320 if ($read_id_check_strip_characters) {
319 if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+ 321 if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+
320 my @parts = split(/ /,$read_id); 322 my @parts = split(/ /,$read_id);
321 $stripped_read_id = $parts[0]; 323 $stripped_read_id = $parts[0];
322 } else { # CASAVA 1.7 and earlier 324 } else { # CASAVA 1.7 and earlier
323 $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters); 325 $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters);
324 } 326 }
325 } 327 }
326 return $stripped_read_id; 328 return $stripped_read_id;
327 } 329 }
328 330
329 331
330 #Quickly calculate hamming distance between two strings 332 #Quickly calculate hamming distance between two strings
331 # 333 #
336 338
337 339
338 340
339 sub print_results 341 sub print_results
340 { 342 {
341 print "Barcode\tCount\tLocation\n"; 343 print "# Barcode\tCount\n";
342 my $total = 0 ; 344 my $total = 0 ;
343 foreach my $ident (sort keys %counts) { 345 foreach my $ident (sort keys %counts) {
344 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; 346 print $ident, "\t", $counts{$ident},"\n";
345 $total += $counts{$ident}; 347 $total += $counts{$ident};
346 } 348 }
347 print "total\t",$total,"\n"; 349 print "total\t",$total,"\n";
348 } 350 }
349 351
350 352
351 sub read_record 353 sub read_record
352 { 354 {
353 $seq_name = $input_file_io->getline(); 355 $seq_name = $input_file_io->getline();
354 356
355 return undef unless defined $seq_name; # End of file? 357 return undef unless defined $seq_name; # End of file?
356 358
357 $seq_bases = $input_file_io->getline(); 359 $seq_bases = $input_file_io->getline();
358 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; 360 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
359 361
360 # If using FASTQ format, read two more lines 362 # If using FASTQ format, read two more lines
361 if ($fastq_format) { 363 if ($fastq_format) {
362 $seq_name2 = $input_file_io->getline(); 364 $seq_name2 = $input_file_io->getline();
363 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; 365 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
364 366
365 $seq_qualities = $input_file_io->getline(); 367 $seq_qualities = $input_file_io->getline();
366 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; 368 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
367 } 369 }
368 return 1; 370 return 1;
369 } 371 }
370 372
371 sub write_record($) 373 sub write_record($)
372 { 374 {
373 my $file = shift; 375 my $file = shift;
374 376
375 croak "Bad file handle" unless defined $file; 377 croak "Bad file handle" unless defined $file;
376 378
377 print $file $seq_name,"\n"; 379 print $file $seq_name,"\n";
378 print $file $seq_bases,"\n"; 380 print $file $seq_bases,"\n";
379 381
380 #if using FASTQ format, write two more lines 382 #if using FASTQ format, write two more lines
381 if ($fastq_format) { 383 if ($fastq_format) {
382 print $file $seq_name2; 384 print $file $seq_name2;
383 print $file $seq_qualities; 385 print $file $seq_qualities;
384 } 386 }
385 } 387 }
386 388
387 sub open_and_detect_input_format 389 sub open_and_detect_input_format
388 { 390 {
389 $input_file_io = new IO::Handle; 391 $input_file_io = new IO::Handle;
390 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); 392 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
391 393
392 # Get the first characeter, and push it back 394 # Get the first characeter, and push it back
393 my $first_char = $input_file_io->getc(); 395 my $first_char = $input_file_io->getc();
394 $input_file_io->ungetc(ord $first_char); 396 $input_file_io->ungetc(ord $first_char);
395 397
396 if ($first_char eq '>') { 398 if ($first_char eq '>') {
397 # FASTA format 399 # FASTA format
398 $fastq_format = 0 ; 400 $fastq_format = 0 ;
399 print STDERR "Detected FASTA format\n" if $debug; 401 print STDERR "Detected FASTA format\n" if $debug;
400 } elsif ($first_char eq '@') { 402 } elsif ($first_char eq '@') {
401 # FASTQ format 403 # FASTQ format
402 $fastq_format = 1; 404 $fastq_format = 1;
403 print STDERR "Detected FASTQ format\n" if $debug; 405 print STDERR "Detected FASTQ format\n" if $debug;
404 } else { 406 } else {
405 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; 407 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
406 } 408 }
407 } 409 }
408 410
409 411
410 sub open_index_and_detect_input_format($) { 412 sub open_index_and_detect_input_format($) {
411 my $filename = shift or croak "Missing index read file name"; 413 my $filename = shift or croak "Missing index read file name";
412 414
413 open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n"; 415 open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n";
414 416
415 # Get the first line, and reset file pointer 417 # Get the first line, and reset file pointer
416 my $first_line = <IDXFILE>; 418 my $first_line = <IDXFILE>;
417 my $first_char = substr($first_line, 0, 1); 419 my $first_char = substr($first_line, 0, 1);
418 seek(IDXFILE, 0, 0); 420 seek(IDXFILE, 0, 0);
419 421
420 if ($first_char eq '>') { 422 if ($first_char eq '>') {
421 # FASTA format 423 # FASTA format
422 $index_fastq_format = 0 ; 424 $index_fastq_format = 0 ;
423 print STDERR "Detected FASTA format for index file\n" if $debug; 425 print STDERR "Detected FASTA format for index file\n" if $debug;
424 } elsif ($first_char eq '@') { 426 } elsif ($first_char eq '@') {
425 # FASTQ format 427 # FASTQ format
426 $index_fastq_format = 1; 428 $index_fastq_format = 1;
427 print STDERR "Detected FASTQ format for index file\n" if $debug; 429 print STDERR "Detected FASTQ format for index file\n" if $debug;
428 } else { 430 } else {
429 die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n"; 431 die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n";
430 } 432 }
431 } 433 }
432 434
433 sub read_index_record 435 sub read_index_record
434 { 436 {
435 $index_seq_name = <IDXFILE>; 437 $index_seq_name = <IDXFILE>;
436 438
437 return undef unless defined $index_seq_name; # End of file? 439 return undef unless defined $index_seq_name; # End of file?
438 440
439 $index_seq_bases = <IDXFILE>; 441 $index_seq_bases = <IDXFILE>;
440 die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases; 442 die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases;
441 443
442 # If using FASTQ format, read two more lines 444 # If using FASTQ format, read two more lines
443 if ($index_fastq_format) { 445 if ($index_fastq_format) {
444 $index_seq_name2 = <IDXFILE>; 446 $index_seq_name2 = <IDXFILE>;
445 die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2; 447 die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2;
446 448
447 $index_seq_qualities = <IDXFILE>; 449 $index_seq_qualities = <IDXFILE>;
448 die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities; 450 die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities;
449 } 451 }
450 return 1; 452 return 1;
451 } 453 }
452 454
453 sub usage() 455 sub usage()
454 { 456 {
455 print<<EOF; 457 print<<EOF;
456 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 458 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
457 459
458 This program reads FASTA/FASTQ file and splits it into several smaller files, 460 This program reads FASTA/FASTQ file and splits it into several smaller files,
459 Based on barcode matching. 461 Based on barcode matching.
460 FASTA/FASTQ data is read from STDIN (format is auto-detected.) 462 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
461 Output files will be writen to disk. 463 Output files will be writen to disk.
462 Summary will be printed to STDOUT. 464 Summary will be printed to STDOUT.
463 465
464 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] 466 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile]
465 [--mismatches N] [--exact] [--partial N] [--idxidstrip N] 467 [--mismatches N] [--exact] [--partial N] [--idxidstrip N]
466 [--help] [--quiet] [--debug] 468 [--help] [--quiet] [--debug]
467 469
468 Arguments: 470 Arguments:
469 471
470 --bcfile FILE - Barcodes file name. (see explanation below.) 472 --bcfile FILE - Barcodes file name. (see explanation below.)
471 --prefix PREFIX - File prefix. will be added to the output files. Can be used 473 --prefix PREFIX - File prefix. will be added to the output files. Can be used
472 to specify output directories. 474 to specify output directories.
473 --suffix SUFFIX - File suffix (optional). Can be used to specify file 475 --suffix SUFFIX - File suffix (optional). Can be used to specify file
474 extensions. 476 extensions.
475 --bol - Try to match barcodes at the BEGINNING of sequences. 477 --bol - Try to match barcodes at the BEGINNING of sequences.
476 (What biologists would call the 5' end, and programmers 478 (What biologists would call the 5' end, and programmers
477 would call index 0.) 479 would call index 0.)
478 --eol - Try to match barcodes at the END of sequences. 480 --eol - Try to match barcodes at the END of sequences.
479 (What biologists would call the 3' end, and programmers 481 (What biologists would call the 3' end, and programmers
480 would call the end of the string.) 482 would call the end of the string.)
481 --idxfile FILE - Read barcodes from separate index file (fasta or fastq) 483 --idxfile FILE - Read barcodes from separate index file (fasta or fastq)
482 NOTE: one of --bol, --eol, --idxfile must be specified, 484 NOTE: one of --bol, --eol, --idxfile must be specified,
483 but not more than one. 485 but not more than one.
484 --idxidstrip N - When using index file, strip this number of characters 486 --idxidstrip N - When using index file, strip this number of characters
485 from the end of the sequence id before matching. 487 from the end of the sequence id before matching.
486 Automatically detects CASAVA 1.8 format and strips at a 488 Automatically detects CASAVA 1.8 format and strips at a
487 space in the id, use 0 to disable this. 489 space in the id, use 0 to disable this.
488 (Default is 1). 490 (Default is 1).
489 --mismatches N - Max. number of mismatches allowed. default is 1. 491 --mismatches N - Max. number of mismatches allowed. default is 1.
490 --exact - Same as '--mismatches 0'. If both --exact and --mismatches 492 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
491 are specified, '--exact' takes precedence. 493 are specified, '--exact' takes precedence.
492 --partial N - Allow partial overlap of barcodes. (see explanation below.) 494 --partial N - Allow partial overlap of barcodes. (see explanation below.)
493 (Default is not partial matching) 495 (Default is not partial matching)
494 --quiet - Don't print counts and summary at the end of the run. 496 --quiet - Don't print counts and summary at the end of the run.
495 (Default is to print.) 497 (Default is to print.)
496 --debug - Print lots of useless debug information to STDERR. 498 --debug - Print lots of useless debug information to STDERR.
497 --help - This helpful help screen. 499 --help - This helpful help screen.
498 500
499 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is 501 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
500 the barcodes file): 502 the barcodes file):
501 503
502 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ 504 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
503 --prefix /tmp/bla_ --suffix ".txt" 505 --prefix /tmp/bla_ --suffix ".txt"
504 506
505 Barcode file format 507 Barcode file format
506 ------------------- 508 -------------------
507 Barcode files are simple text files. Each line should contain an identifier 509 Barcode files are simple text files. Each line should contain an identifier
508 (descriptive name for the barcode), and the barcode itself (A/C/G/T), 510 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
509 separated by a TAB character. Example: 511 separated by a TAB character. Example:
510 512
511 #This line is a comment (starts with a 'number' sign) 513 #This line is a comment (starts with a 'number' sign)
512 BC1 GATCT 514 BC1 GATCT
513 BC2 ATCGT 515 BC2 ATCGT
514 BC3 GTGAT 516 BC3 GTGAT
515 BC4 TGTCT 517 BC4 TGTCT
516 518
517 For each barcode, a new FASTQ file will be created (with the barcode's 519 For each barcode, a new FASTQ file will be created (with the barcode's
518 identifier as part of the file name). Sequences matching the barcode 520 identifier as part of the file name). Sequences matching the barcode
519 will be stored in the appropriate file. 521 will be stored in the appropriate file.
520 522
521 Running the above example (assuming "mybarcodes.txt" contains the above 523 Running the above example (assuming "mybarcodes.txt" contains the above
522 barcodes), will create the following files: 524 barcodes), will create the following files:
523 /tmp/bla_BC1.txt 525 /tmp/bla_BC1.txt
524 /tmp/bla_BC2.txt 526 /tmp/bla_BC2.txt
525 /tmp/bla_BC3.txt 527 /tmp/bla_BC3.txt
526 /tmp/bla_BC4.txt 528 /tmp/bla_BC4.txt
527 /tmp/bla_unmatched.txt 529 /tmp/bla_unmatched.txt
528 The 'unmatched' file will contain all sequences that didn't match any barcode. 530 The 'unmatched' file will contain all sequences that didn't match any barcode.
529 531
530 Barcode matching 532 Barcode matching
531 ---------------- 533 ----------------
532 534
546 ATCGT (4 mismatches, BC2) 548 ATCGT (4 mismatches, BC2)
547 GTGAT (3 mismatches, BC3) 549 GTGAT (3 mismatches, BC3)
548 TGTCT (3 mismatches, BC4) 550 TGTCT (3 mismatches, BC4)
549 551
550 This sequence will be classified as 'BC1' (it has the lowest mismatch count). 552 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
551 If '--exact' or '--mismatches 0' were specified, this sequence would be 553 If '--exact' or '--mismatches 0' were specified, this sequence would be
552 classified as 'unmatched' (because, although BC1 had the lowest mismatch count, 554 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
553 it is above the maximum allowed mismatches). 555 it is above the maximum allowed mismatches).
554 556
555 Matching with '--eol' (end of line) does the same, but from the other side 557 Matching with '--eol' (end of line) does the same, but from the other side
556 of the sequence. 558 of the sequence.
572 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG 574 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
573 GATCT (1 mismatch) 575 GATCT (1 mismatch)
574 576
575 Note: scoring counts a missing base as a mismatch, so the final 577 Note: scoring counts a missing base as a mismatch, so the final
576 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). 578 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
577 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this 579 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
578 seqeunce will be classified as BC1. 580 seqeunce will be classified as BC1.
579 581
580 EOF 582 EOF
581 583
582 exit 1; 584 exit 1;