Mercurial > repos > lparsons > fastx_barcode_splitter_enhanced
changeset 4:0fb7e9130a70 draft default tip
planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/fastx_barcode_splitter_enhanced commit 460463a5406419fe8e113467bdb8bd093d21e7c5
author | lparsons |
---|---|
date | Mon, 02 May 2016 17:04:32 -0400 |
parents | e7b7cdc1834d |
children | |
files | fastx_barcode_splitter.pl fastx_barcode_splitter.xml fastx_barcode_splitter_galaxy_wrapper.sh test-data/.fastx_barcode_splitter1_BC4.out.swp test-data/fastx_barcode_splitter1.out testoutput/fastx_barcode_splitter1.out tmp/BC1 tmp/BC2 tmp/BC3 tmp/BC4 tmp/tmp_BC1_visible.fastqsanger tmp/tmp_BC1_visible_fastqsanger tmp/tmp_BC2_visible.fastqsanger tmp/tmp_BC2_visible_fastqsanger tmp/tmp_BC3_visible.fastqsanger tmp/tmp_BC3_visible_fastqsanger tmp/tmp_BC4_visible.fastqsanger tmp/tmp_BC4_visible_fastqsanger tmp/tmp_unmatched_visible.fastqsanger tmp/tmp_unmatched_visible_fastqsanger tmp/unmatched tmp_stdio.txt |
diffstat | 21 files changed, 379 insertions(+), 934 deletions(-) [+] |
line wrap: on
line diff
--- a/fastx_barcode_splitter.pl Fri Mar 11 17:24:31 2016 -0500 +++ b/fastx_barcode_splitter.pl Mon May 02 17:04:32 2016 -0400 @@ -6,7 +6,9 @@ # Lance Parsons (lparsons@princeton.edu) # 3/21/2011 - Modified to accept separate index file for barcodes # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy) -# +# 4/28/2016 - Modified summary output to remove file paths and add comment +# character '#' + # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the @@ -48,7 +50,7 @@ sub write_record($); sub usage(); -# Global flags and arguments, +# Global flags and arguments, # Set by command line argumens my $barcode_file ; my $barcodes_at_eol = 0 ; @@ -65,7 +67,7 @@ my $index_fastq_format = 1; my $read_id_check_strip_characters = 1; -# Global variables +# Global variables # Populated by 'create_output_files' my %filenames; my %files; @@ -117,51 +119,51 @@ sub parse_command_line { - my $help; + my $help; - usage() if (scalar @ARGV==0); + usage() if (scalar @ARGV==0); - my $result = GetOptions ( "bcfile=s" => \$barcode_file, - "eol" => \$barcodes_at_eol, - "bol" => \$barcodes_at_bol, - "idxfile=s" => \$index_read_file, - "idxidstrip=i" => \$read_id_check_strip_characters, - "exact" => \$exact_match, - "prefix=s" => \$newfile_prefix, - "suffix=s" => \$newfile_suffix, - "quiet" => \$quiet, - "partial=i" => \$allow_partial_overlap, - "debug" => \$debug, - "mismatches=i" => \$allowed_mismatches, - "help" => \$help - ) ; + my $result = GetOptions ( "bcfile=s" => \$barcode_file, + "eol" => \$barcodes_at_eol, + "bol" => \$barcodes_at_bol, + "idxfile=s" => \$index_read_file, + "idxidstrip=i" => \$read_id_check_strip_characters, + "exact" => \$exact_match, + "prefix=s" => \$newfile_prefix, + "suffix=s" => \$newfile_suffix, + "quiet" => \$quiet, + "partial=i" => \$allow_partial_overlap, + "debug" => \$debug, + 
"mismatches=i" => \$allowed_mismatches, + "help" => \$help + ) ; - usage() if ($help); + usage() if ($help); - die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; - die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; + die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; + die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; - if (! defined $index_read_file) { - if ($barcodes_at_bol == $barcodes_at_eol) { - die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; - die "Error: must specify either --eol or --bol or --idxfile\n" ; - } - } - elsif ($barcodes_at_bol || $barcodes_at_eol) { - die "Error: Must specify only one of --idxfile, --eol, or --bol"; - } + if (! defined $index_read_file) { + if ($barcodes_at_bol == $barcodes_at_eol) { + die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; + die "Error: must specify either --eol or --bol or --idxfile\n" ; + } + } + elsif ($barcodes_at_bol || $barcodes_at_eol) { + die "Error: Must specify only one of --idxfile, --eol, or --bol"; + } - die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; + die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; - $allowed_mismatches = 0 if $exact_match; + $allowed_mismatches = 0 if $exact_match; - die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); + die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); - die "Error: partial overlap value ($allow_partial_overlap) bigger than " . - "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); + die "Error: partial overlap value ($allow_partial_overlap) bigger than " . 
+ "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); - exit unless $result; + exit unless $result; } @@ -170,160 +172,160 @@ # Read the barcode file # sub load_barcode_file ($) { - my $filename = shift or croak "Missing barcode file name"; + my $filename = shift or croak "Missing barcode file name"; - open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; - while (<BCFILE>) { - next if m/^#/; - chomp; - my ($ident, $barcode) = split('\t') ; + open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; + while (<BCFILE>) { + next if m/^#/; + chomp; + my ($ident, $barcode) = split('\t') ; - $barcode = uc($barcode); + $barcode = uc($barcode); - # Sanity checks on the barcodes - die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; - die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" - unless $barcode =~ m/^[AGCT]+$/; + # Sanity checks on the barcodes + die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; + die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" + unless $barcode =~ m/^[AGCT]+$/; - # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-') - $ident =~ s/[^A-Za-z0-9]/-/g; - die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" - unless $ident =~ m/^[A-Za-z0-9-]+$/; + # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-') + $ident =~ s/[^A-Za-z0-9]/-/g; + die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" + unless $ident =~ m/^[A-Za-z0-9-]+$/; - die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . - "mismatches ($allowed_mismatches). This makes no sense. 
Specify fewer mismatches.\n" - if length($barcode)<=$allowed_mismatches; + die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . + "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" + if length($barcode)<=$allowed_mismatches; - $barcodes_length = length($barcode) unless defined $barcodes_length; - die "Error: found barcodes in different lengths. this feature is not supported yet.\n" - unless $barcodes_length == length($barcode); + $barcodes_length = length($barcode) unless defined $barcodes_length; + die "Error: found barcodes in different lengths. this feature is not supported yet.\n" + unless $barcodes_length == length($barcode); - push @barcodes, [$ident, $barcode]; + push @barcodes, [$ident, $barcode]; - if ($allow_partial_overlap>0) { - foreach my $i (1 .. $allow_partial_overlap) { - substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; - push @barcodes, [$ident, $barcode]; - } - } - } - close BCFILE; + if ($allow_partial_overlap>0) { + foreach my $i (1 .. $allow_partial_overlap) { + substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; + push @barcodes, [$ident, $barcode]; + } + } + } + close BCFILE; - if ($debug) { - print STDERR "barcode\tsequence\n"; - foreach my $barcoderef (@barcodes) { - my ($ident, $seq) = @{$barcoderef}; - print STDERR $ident,"\t", $seq ,"\n"; - } - } + if ($debug) { + print STDERR "barcode\tsequence\n"; + foreach my $barcoderef (@barcodes) { + my ($ident, $seq) = @{$barcoderef}; + print STDERR $ident,"\t", $seq ,"\n"; + } + } } # Create one output file for each barcode. # (Also create a file for the dummy 'unmatched' barcode) sub create_output_files { - my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; - $barcodes{'unmatched'} = 1 ; + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; - foreach my $ident (keys %barcodes) { - my $new_filename = $newfile_prefix . 
$ident . $newfile_suffix; - $filenames{$ident} = $new_filename; - open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; - $files{$ident} = $file ; - } + foreach my $ident (keys %barcodes) { + my $new_filename = $newfile_prefix . $ident . $newfile_suffix; + $filenames{$ident} = $new_filename; + open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident} = $file ; + } } sub match_sequences { - my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; - $barcodes{'unmatched'} = 1 ; + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; - #reset counters - foreach my $ident ( keys %barcodes ) { - $counts{$ident} = 0; - } + #reset counters + foreach my $ident ( keys %barcodes ) { + $counts{$ident} = 0; + } - create_output_files; + create_output_files; - # Read file FASTQ file - # split accotding to barcodes - while ( read_record ) { - chomp $seq_name; - chomp $seq_bases; - if (defined $index_read_file) { - read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n"; - chomp $index_seq_name; - chomp $index_seq_bases; + # Read file FASTQ file + # split accotding to barcodes + while ( read_record ) { + chomp $seq_name; + chomp $seq_bases; + if (defined $index_read_file) { + read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n"; + chomp $index_seq_name; + chomp $index_seq_bases; - # Assert that the read ids match - my $seq_name_match = &strip_read_id($seq_name); - my $index_seq_name_match = &strip_read_id($index_seq_name); - if ($seq_name_match ne $index_seq_name_match) { - die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n"; - } + # Assert that the read ids match + my 
$seq_name_match = &strip_read_id($seq_name); + my $index_seq_name_match = &strip_read_id($index_seq_name); + if ($seq_name_match ne $index_seq_name_match) { + die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n"; + } - } + } - print STDERR "sequence $seq_bases: \n" if $debug; + print STDERR "sequence $seq_bases: \n" if $debug; - my $best_barcode_mismatches_count = $barcodes_length; - my $best_barcode_ident = undef; + my $best_barcode_mismatches_count = $barcodes_length; + my $best_barcode_ident = undef; - #Try all barcodes, find the one with the lowest mismatch count - foreach my $barcoderef (@barcodes) { - my ($ident, $barcode) = @{$barcoderef}; + #Try all barcodes, find the one with the lowest mismatch count + foreach my $barcoderef (@barcodes) { + my ($ident, $barcode) = @{$barcoderef}; - # Get DNA fragment (in the length of the barcodes) - # The barcode will be tested only against this fragment - # (no point in testing the barcode against the whole sequence) - my $sequence_fragment; - if ($barcodes_at_bol) { - $sequence_fragment = substr $seq_bases, 0, $barcodes_length; - } elsif ($barcodes_at_eol) { - $sequence_fragment = substr $seq_bases, - $barcodes_length; - } else { - $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length; - } + # Get DNA fragment (in the length of the barcodes) + # The barcode will be tested only against this fragment + # (no point in testing the barcode against the whole sequence) + my $sequence_fragment; + if ($barcodes_at_bol) { + $sequence_fragment = substr $seq_bases, 0, $barcodes_length; + } elsif ($barcodes_at_eol) { + $sequence_fragment = substr $seq_bases, - $barcodes_length; + } else { + $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length; + } - my $mm = mismatch_count($sequence_fragment, $barcode) ; + my $mm = mismatch_count($sequence_fragment, $barcode) ; - # if this is a partial match, add the non-overlap as a mismatch - # (partial barcodes are shorter than 
the length of the original barcodes) - $mm += ($barcodes_length - length($barcode)); + # if this is a partial match, add the non-overlap as a mismatch + # (partial barcodes are shorter than the length of the original barcodes) + $mm += ($barcodes_length - length($barcode)); - if ( $mm < $best_barcode_mismatches_count ) { - $best_barcode_mismatches_count = $mm ; - $best_barcode_ident = $ident ; - } - } + if ( $mm < $best_barcode_mismatches_count ) { + $best_barcode_mismatches_count = $mm ; + $best_barcode_ident = $ident ; + } + } - $best_barcode_ident = 'unmatched' - if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; + $best_barcode_ident = 'unmatched' + if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; - print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; + print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; - $counts{$best_barcode_ident}++; + $counts{$best_barcode_ident}++; - #get the file associated with the matched barcode. - #(note: there's also a file associated with 'unmatched' barcode) - my $file = $files{$best_barcode_ident}; + #get the file associated with the matched barcode. + #(note: there's also a file associated with 'unmatched' barcode) + my $file = $files{$best_barcode_ident}; - write_record($file); - } + write_record($file); + } } # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc. 
sub strip_read_id { - my $read_id = shift; - my $stripped_read_id = $read_id; - if ($read_id_check_strip_characters) { - if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+ - my @parts = split(/ /,$read_id); - $stripped_read_id = $parts[0]; - } else { # CASAVA 1.7 and earlier - $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters); - } - } - return $stripped_read_id; + my $read_id = shift; + my $stripped_read_id = $read_id; + if ($read_id_check_strip_characters) { + if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+ + my @parts = split(/ /,$read_id); + $stripped_read_id = $parts[0]; + } else { # CASAVA 1.7 and earlier + $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters); + } + } + return $stripped_read_id; } @@ -338,121 +340,121 @@ sub print_results { - print "Barcode\tCount\tLocation\n"; - my $total = 0 ; - foreach my $ident (sort keys %counts) { - print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; - $total += $counts{$ident}; - } - print "total\t",$total,"\n"; + print "# Barcode\tCount\n"; + my $total = 0 ; + foreach my $ident (sort keys %counts) { + print $ident, "\t", $counts{$ident},"\n"; + $total += $counts{$ident}; + } + print "total\t",$total,"\n"; } sub read_record { - $seq_name = $input_file_io->getline(); + $seq_name = $input_file_io->getline(); - return undef unless defined $seq_name; # End of file? + return undef unless defined $seq_name; # End of file? 
- $seq_bases = $input_file_io->getline(); - die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; + $seq_bases = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; - # If using FASTQ format, read two more lines - if ($fastq_format) { - $seq_name2 = $input_file_io->getline(); - die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; + # If using FASTQ format, read two more lines + if ($fastq_format) { + $seq_name2 = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; - $seq_qualities = $input_file_io->getline(); - die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; - } - return 1; + $seq_qualities = $input_file_io->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; + } + return 1; } sub write_record($) { - my $file = shift; + my $file = shift; - croak "Bad file handle" unless defined $file; + croak "Bad file handle" unless defined $file; - print $file $seq_name,"\n"; - print $file $seq_bases,"\n"; + print $file $seq_name,"\n"; + print $file $seq_bases,"\n"; - #if using FASTQ format, write two more lines - if ($fastq_format) { - print $file $seq_name2; - print $file $seq_qualities; - } + #if using FASTQ format, write two more lines + if ($fastq_format) { + print $file $seq_name2; + print $file $seq_qualities; + } } sub open_and_detect_input_format { - $input_file_io = new IO::Handle; - die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); + $input_file_io = new IO::Handle; + die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); - # Get the first characeter, and push it back - my $first_char = $input_file_io->getc(); - $input_file_io->ungetc(ord $first_char); + # Get the first characeter, and push it back + my 
$first_char = $input_file_io->getc(); + $input_file_io->ungetc(ord $first_char); - if ($first_char eq '>') { - # FASTA format - $fastq_format = 0 ; - print STDERR "Detected FASTA format\n" if $debug; - } elsif ($first_char eq '@') { - # FASTQ format - $fastq_format = 1; - print STDERR "Detected FASTQ format\n" if $debug; - } else { - die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; - } + if ($first_char eq '>') { + # FASTA format + $fastq_format = 0 ; + print STDERR "Detected FASTA format\n" if $debug; + } elsif ($first_char eq '@') { + # FASTQ format + $fastq_format = 1; + print STDERR "Detected FASTQ format\n" if $debug; + } else { + die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; + } } sub open_index_and_detect_input_format($) { - my $filename = shift or croak "Missing index read file name"; + my $filename = shift or croak "Missing index read file name"; - open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n"; + open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n"; - # Get the first line, and reset file pointer - my $first_line = <IDXFILE>; - my $first_char = substr($first_line, 0, 1); - seek(IDXFILE, 0, 0); + # Get the first line, and reset file pointer + my $first_line = <IDXFILE>; + my $first_char = substr($first_line, 0, 1); + seek(IDXFILE, 0, 0); - if ($first_char eq '>') { - # FASTA format - $index_fastq_format = 0 ; - print STDERR "Detected FASTA format for index file\n" if $debug; - } elsif ($first_char eq '@') { - # FASTQ format - $index_fastq_format = 1; - print STDERR "Detected FASTQ format for index file\n" if $debug; - } else { - die "Error: unknown index file format. 
First character = '$first_char' (expecting > or \@)\n"; - } + if ($first_char eq '>') { + # FASTA format + $index_fastq_format = 0 ; + print STDERR "Detected FASTA format for index file\n" if $debug; + } elsif ($first_char eq '@') { + # FASTQ format + $index_fastq_format = 1; + print STDERR "Detected FASTQ format for index file\n" if $debug; + } else { + die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n"; + } } sub read_index_record { - $index_seq_name = <IDXFILE>; + $index_seq_name = <IDXFILE>; - return undef unless defined $index_seq_name; # End of file? + return undef unless defined $index_seq_name; # End of file? - $index_seq_bases = <IDXFILE>; - die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases; + $index_seq_bases = <IDXFILE>; + die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases; - # If using FASTQ format, read two more lines - if ($index_fastq_format) { - $index_seq_name2 = <IDXFILE>; - die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2; + # If using FASTQ format, read two more lines + if ($index_fastq_format) { + $index_seq_name2 = <IDXFILE>; + die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2; - $index_seq_qualities = <IDXFILE>; - die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities; - } - return 1; + $index_seq_qualities = <IDXFILE>; + die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities; + } + return 1; } sub usage() { - print<<EOF; + print<<EOF; Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 This program reads FASTA/FASTQ file and splits it into several smaller files, @@ -461,51 +463,51 @@ Output files will be writen to disk. Summary will be printed to STDOUT. 
-usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] - [--mismatches N] [--exact] [--partial N] [--idxidstrip N] - [--help] [--quiet] [--debug] +usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] + [--mismatches N] [--exact] [--partial N] [--idxidstrip N] + [--help] [--quiet] [--debug] Arguments: ---bcfile FILE - Barcodes file name. (see explanation below.) ---prefix PREFIX - File prefix. will be added to the output files. Can be used - to specify output directories. ---suffix SUFFIX - File suffix (optional). Can be used to specify file - extensions. ---bol - Try to match barcodes at the BEGINNING of sequences. - (What biologists would call the 5' end, and programmers - would call index 0.) ---eol - Try to match barcodes at the END of sequences. - (What biologists would call the 3' end, and programmers - would call the end of the string.) +--bcfile FILE - Barcodes file name. (see explanation below.) +--prefix PREFIX - File prefix. will be added to the output files. Can be used + to specify output directories. +--suffix SUFFIX - File suffix (optional). Can be used to specify file + extensions. +--bol - Try to match barcodes at the BEGINNING of sequences. + (What biologists would call the 5' end, and programmers + would call index 0.) +--eol - Try to match barcodes at the END of sequences. + (What biologists would call the 3' end, and programmers + would call the end of the string.) --idxfile FILE - Read barcodes from separate index file (fasta or fastq) - NOTE: one of --bol, --eol, --idxfile must be specified, - but not more than one. + NOTE: one of --bol, --eol, --idxfile must be specified, + but not more than one. --idxidstrip N - When using index file, strip this number of characters - from the end of the sequence id before matching. - Automatically detects CASAVA 1.8 format and strips at a - space in the id, use 0 to disable this. - (Default is 1). ---mismatches N - Max. number of mismatches allowed. 
default is 1. ---exact - Same as '--mismatches 0'. If both --exact and --mismatches - are specified, '--exact' takes precedence. ---partial N - Allow partial overlap of barcodes. (see explanation below.) - (Default is not partial matching) ---quiet - Don't print counts and summary at the end of the run. - (Default is to print.) ---debug - Print lots of useless debug information to STDERR. ---help - This helpful help screen. + from the end of the sequence id before matching. + Automatically detects CASAVA 1.8 format and strips at a + space in the id, use 0 to disable this. + (Default is 1). +--mismatches N - Max. number of mismatches allowed. default is 1. +--exact - Same as '--mismatches 0'. If both --exact and --mismatches + are specified, '--exact' takes precedence. +--partial N - Allow partial overlap of barcodes. (see explanation below.) + (Default is not partial matching) +--quiet - Don't print counts and summary at the end of the run. + (Default is to print.) +--debug - Print lots of useless debug information to STDERR. +--help - This helpful help screen. -Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is +Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is the barcodes file): - \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ - --prefix /tmp/bla_ --suffix ".txt" + \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ + --prefix /tmp/bla_ --suffix ".txt" Barcode file format ------------------- -Barcode files are simple text files. Each line should contain an identifier -(descriptive name for the barcode), and the barcode itself (A/C/G/T), +Barcode files are simple text files. Each line should contain an identifier +(descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character. 
Example: #This line is a comment (starts with a 'number' sign) @@ -514,17 +516,17 @@ BC3 GTGAT BC4 TGTCT -For each barcode, a new FASTQ file will be created (with the barcode's -identifier as part of the file name). Sequences matching the barcode +For each barcode, a new FASTQ file will be created (with the barcode's +identifier as part of the file name). Sequences matching the barcode will be stored in the appropriate file. -Running the above example (assuming "mybarcodes.txt" contains the above +Running the above example (assuming "mybarcodes.txt" contains the above barcodes), will create the following files: - /tmp/bla_BC1.txt - /tmp/bla_BC2.txt - /tmp/bla_BC3.txt - /tmp/bla_BC4.txt - /tmp/bla_unmatched.txt + /tmp/bla_BC1.txt + /tmp/bla_BC2.txt + /tmp/bla_BC3.txt + /tmp/bla_BC4.txt + /tmp/bla_unmatched.txt The 'unmatched' file will contain all sequences that didn't match any barcode. Barcode matching @@ -548,7 +550,7 @@ TGTCT (3 mismatches, BC4) This sequence will be classified as 'BC1' (it has the lowest mismatch count). -If '--exact' or '--mismatches 0' were specified, this sequence would be +If '--exact' or '--mismatches 0' were specified, this sequence would be classified as 'unmatched' (because, although BC1 had the lowest mismatch count, it is above the maximum allowed mismatches). @@ -574,7 +576,7 @@ Note: scoring counts a missing base as a mismatch, so the final mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). -If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this +If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this seqeunce will be classified as BC1. EOF
--- a/fastx_barcode_splitter.xml Fri Mar 11 17:24:31 2016 -0500 +++ b/fastx_barcode_splitter.xml Mon May 02 17:04:32 2016 -0400 @@ -1,82 +1,86 @@ -<tool id="cshl_princeton_fastx_barcode_splitter" version="1.1" name="Barcode Splitter"> - <description></description> - <command interpreter="bash"><![CDATA[ +<tool id="cshl_princeton_fastx_barcode_splitter" version="1.2" name="Barcode Splitter"> + <description></description> + <command interpreter="bash" detect_errors="aggressive"><![CDATA[ fastx_barcode_splitter_galaxy_wrapper.sh $BARCODE $input "split/" $input.extension --mismatches $mismatches --partial $partial #if $refBarcodeLocation.barcodeLocation == "idxfile": --idxfile $refBarcodeLocation.idxfile #else: $refBarcodeLocation.EOL #end if -> $output +> $summary ]]> - </command> + </command> - <inputs> - <param format="txt" name="BARCODE" type="data" label="Barcodes to use" /> - <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" /> + <inputs> + <param format="txt" name="BARCODE" type="data" label="Barcodes to use" /> + <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" /> - <conditional name="refBarcodeLocation"> - <param name="barcodeLocation" type="select" label="Barcodes found at"> - <option value="bol">Start of sequence (5' end)</option> - <option value="eol">End of sequence (3' end)</option> - <option value="idxfile">Separate index file</option> - </param> - <when value="bol"> - <param name="EOL" type="hidden" value="--bol" /> - </when> - <when value="eol"> - <param name="EOL" type="hidden" value="--eol" /> - </when> - <when value="idxfile"> - <param name="idxfile" type="data" format="fasta,fastq,fastqsanger" label="Select index read file" /> - </when> - </conditional> + <conditional name="refBarcodeLocation"> + <param name="barcodeLocation" type="select" label="Barcodes found at"> + <option value="bol">Start of sequence (5' end)</option> 
+ <option value="eol">End of sequence (3' end)</option> + <option value="idxfile">Separate index file</option> + </param> + <when value="bol"> + <param name="EOL" type="hidden" value="--bol" /> + </when> + <when value="eol"> + <param name="EOL" type="hidden" value="--eol" /> + </when> + <when value="idxfile"> + <param name="idxfile" type="data" format="fasta,fastq,fastqsanger" label="Select index read file" /> + </when> + </conditional> - <param name="mismatches" type="integer" size="3" value="0" label="Number of allowed mismatches" /> + <param name="mismatches" type="integer" size="3" value="0" label="Number of allowed mismatches" /> - <param name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" /> + <param name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" /> - </inputs> + </inputs> - <outputs> - <data format="html" name="output"> - <discover_datasets pattern="__designation_and_ext__" directory="split" visible="true" label="${tool.name}: ${designation}"/> - </data> - </outputs> + <outputs> + <data format="tabular" name="summary" label="${tool.name} on ${on_string}: Summary" /> + <collection name="split_output" type="list" format_source="input" label="${tool.name} on ${on_string}"> + <discover_datasets pattern="__designation_and_ext__" directory="split" visible="false" label="${designation}"/> + </collection> + </outputs> - <tests> - <test> - <!-- Split a FASTQ file --> - <param name="BARCODE" value="fastx_barcode_splitter1.txt" /> - <param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" /> - <param name="barcodeLocation" value="bol" /> - <param name="mismatches" value="2" /> - <param name="partial" value="0" /> - <output name="output" file="fastx_barcode_splitter1.out"> - <discovered_dataset designation="BC1" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC1.out" /> - <discovered_dataset designation="BC2" ftype="fastqsolexa" 
file="fastx_barcode_splitter1_BC2.out" /> - <discovered_dataset designation="BC3" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC3.out" /> - <discovered_dataset designation="BC4" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC4.out" /> - <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="fastx_barcode_splitter1_unmatched.out" /> - </output> - </test> - <test> - <!-- Split a FASTQ file, using separate index read --> - <param name="BARCODE" value="fastx_barcode_splitter1.txt" /> - <param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" /> - <param name="idxfile" value="fastx_barcode_splitter_index.fastq" ftype="fastqsolexa" /> - <param name="barcodeLocation" value="idxfile" /> - <param name="mismatches" value="2" /> - <param name="partial" value="0" /> - <output name="output" file="fastx_barcode_splitter1.out"> - <discovered_dataset designation="BC1" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC1.out" /> - <discovered_dataset designation="BC2" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC2.out" /> - <discovered_dataset designation="BC3" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC3.out" /> - <discovered_dataset designation="BC4" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC4.out" /> - <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="fastx_barcode_splitter1_unmatched.out" /> - </output> - </test> - </tests> + <tests> + <test> + <!-- Split a FASTQ file --> + <param name="BARCODE" value="fastx_barcode_splitter1.txt" /> + <param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" /> + <param name="barcodeLocation" value="bol" /> + <param name="mismatches" value="2" /> + <param name="partial" value="0" /> + <output name="summary" file="fastx_barcode_splitter1.out" /> + <collection name="output" type="list"> + <discovered_dataset designation="BC1" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC1.out" /> + <discovered_dataset designation="BC2" 
ftype="fastqsolexa" file="fastx_barcode_splitter1_BC2.out" /> + <discovered_dataset designation="BC3" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC3.out" /> + <discovered_dataset designation="BC4" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC4.out" /> + <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="fastx_barcode_splitter1_unmatched.out" /> + </collection> + </test> + + <test> + <!-- Split a FASTQ file, using separate index read --> + <param name="BARCODE" value="fastx_barcode_splitter1.txt" /> + <param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" /> + <param name="idxfile" value="fastx_barcode_splitter_index.fastq" ftype="fastqsolexa" /> + <param name="barcodeLocation" value="idxfile" /> + <param name="mismatches" value="2" /> + <param name="partial" value="0" /> + <output name="output" file="fastx_barcode_splitter1.out" /> + <collection name="split_output" type="list"> + <discovered_dataset designation="BC1" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC1.out" /> + <discovered_dataset designation="BC2" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC2.out" /> + <discovered_dataset designation="BC3" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC3.out" /> + <discovered_dataset designation="BC4" ftype="fastqsolexa" file="fastx_barcode_splitter1_BC4.out" /> + <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="fastx_barcode_splitter1_unmatched.out" /> + </collection> + </test> + </tests> <help><![CDATA[ **What it does** @@ -92,9 +96,9 @@ Example:: #This line is a comment (starts with a 'number' sign) - BC1 GATCT - BC2 ATCGT - BC3 GTGAT + BC1 GATCT + BC2 ATCGT + BC3 GTGAT BC4 TGTCT For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name). @@ -102,20 +106,20 @@ One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored. 
-The output of this tool is an HTML file, displaying the split counts and the file names. -In addition, each fastq file produced will be loaded into the galaxy history automatically. +The output of this tool is a summary table displaying the split counts for each barcode identifier. +In addition, each fastq file produced will be loaded into the galaxy history as part of a collection list. ]]> - </help> + </help> - <!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) --> - <citations> + <!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) --> + <citations> <citation type="bibtex"> - @misc{gordon_fastx-toolkit_????, - title = {{FASTX}-{Toolkit}}, - url = {http://hannonlab.cshl.edu/fastx_toolkit/index.html}, - author = {Gordon, Assaf} - } - </citation> - </citations> + @misc{gordon_fastx-toolkit_????, + title = {{FASTX}-{Toolkit}}, + url = {http://hannonlab.cshl.edu/fastx_toolkit/index.html}, + author = {Gordon, Assaf} + } + </citation> + </citations> </tool>
--- a/fastx_barcode_splitter_galaxy_wrapper.sh Fri Mar 11 17:24:31 2016 -0500 +++ b/fastx_barcode_splitter_galaxy_wrapper.sh Mon May 02 17:04:32 2016 -0400 @@ -17,21 +17,18 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # Modified by Lance Parsons (lparsons@princeton.edu) -# 2011-03-15 Adapted to allow galaxy to determine filetype -# 2015-10-21 Updated to make compatible with OSX (BSD sed) -# 2015-11-13 Removed LIBRARY_NAME, no longer needed +# 2011-03-15 Adapted to allow galaxy to determine filetype +# 2015-10-21 Updated to make compatible with OSX (BSD sed) +# 2015-11-13 Removed LIBRARY_NAME, no longer needed +# 2016-04-28 Output summary as simple tabular output -#This is a shell script wrapper for 'fastx_barcode_splitter.pl' +# This is a shell script wrapper for 'fastx_barcode_splitter.pl' # # 1. Output files are saved at the dataset's files_path directory. -# -# 2. 'fastx_barcode_splitter.pl' outputs a textual table. -# This script turns it into pretty HTML with working URL -# (so lazy users can just click on the URLs and get their files) if [ "$1x" = "x" ]; then - echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [OUTPUT_PATH] [FILETYPE]" >&2 - exit 1 + echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [OUTPUT_PATH] [FILETYPE]" >&2 + exit 1 fi BARCODE_FILE="$1" @@ -42,22 +39,22 @@ # The rest of the parameters are passed to the split program if [ "${OUTPUT_PATH}x" = "x" ]; then - echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [OUTPUT_PATH] [FILETYPE]" >&2 - exit 1 + echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [OUTPUT_PATH] [FILETYPE]" >&2 + exit 1 fi if [ ! -r "$FASTQ_FILE" ]; then - echo "Error: Input file ($FASTQ_FILE) not found!" >&2 - exit 1 + echo "Error: Input file ($FASTQ_FILE) not found!" >&2 + exit 1 fi if [ ! -r "$BARCODE_FILE" ]; then - echo "Error: barcode file ($BARCODE_FILE) not found!" >&2 - exit 1 + echo "Error: barcode file ($BARCODE_FILE) not found!" >&2 + exit 1 fi mkdir -p "$OUTPUT_PATH" if [ ! 
-d "$OUTPUT_PATH" ]; then - echo "Error: failed to create output path '$OUTPUT_PATH'" >&2 - exit 1 + echo "Error: failed to create output path '$OUTPUT_PATH'" >&2 + exit 1 fi BASEPATH="$OUTPUT_PATH/" @@ -67,13 +64,7 @@ RESULTS=$(gzip -cdf "$FASTQ_FILE" | "$DIRECTORY/fastx_barcode_splitter.pl" --bcfile "$BARCODE_FILE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@") if [ $? != 0 ]; then - echo "error" + echo "error" fi -# -# Convert the textual tab-separated table into simple HTML table -echo "<html><body><table border=1>" -echo "$RESULTS" | sed "s|$BASEPATH\\(.*\\)|\\1|" | \ -perl -n -e '$_ =~ s|\t|</td><td>|g; print "<tr><td>\n$_</td></tr>\n"' -echo "<p>" -echo "</table></body></html>" +echo "$RESULTS" \ No newline at end of file
--- a/test-data/fastx_barcode_splitter1.out Fri Mar 11 17:24:31 2016 -0500 +++ b/test-data/fastx_barcode_splitter1.out Mon May 02 17:04:32 2016 -0400 @@ -1,24 +1,7 @@ -<html><body><table border=1> -<tr><td> -Barcode</td><td>Count</td><td>Location -</td></tr> -<tr><td> -BC1</td><td>11</td><td>BC1.fastqsolexa -</td></tr> -<tr><td> -BC2</td><td>12</td><td>BC2.fastqsolexa -</td></tr> -<tr><td> -BC3</td><td>9</td><td>BC3.fastqsolexa -</td></tr> -<tr><td> -BC4</td><td>1</td><td>BC4.fastqsolexa -</td></tr> -<tr><td> -unmatched</td><td>9</td><td>unmatched.fastqsolexa -</td></tr> -<tr><td> -total</td><td>42 -</td></tr> -<p> -</table></body></html> +# Barcode Count +BC1 11 +BC2 12 +BC3 9 +BC4 1 +unmatched 9 +total 42
--- a/testoutput/fastx_barcode_splitter1.out Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -<html><body><table border=1> -<tr><td> -Barcode</td><td>Count</td><td>Location -</td></tr> -<tr><td> -BC1</td><td>11</td><td><a href="primary_3_BC1_visible_fastqsolexa">primary_3_BC1_visible_fastqsolexa<a> -</td></tr> -<tr><td> -BC2</td><td>12</td><td><a href="primary_3_BC2_visible_fastqsolexa">primary_3_BC2_visible_fastqsolexa<a> -</td></tr> -<tr><td> -BC3</td><td>9</td><td><a href="primary_3_BC3_visible_fastqsolexa">primary_3_BC3_visible_fastqsolexa<a> -</td></tr> -<tr><td> -BC4</td><td>1</td><td><a href="primary_3_BC4_visible_fastqsolexa">primary_3_BC4_visible_fastqsolexa<a> -</td></tr> -<tr><td> -unmatched</td><td>9</td><td><a href="primary_3_unmatched_visible_fastqsolexa">primary_3_unmatched_visible_fastqsolexa<a> -</td></tr> -<tr><td> -total</td><td>42 -</td></tr> -<p> -</table></body></html>
--- a/tmp/BC1 Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/BC2 Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/BC4 Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -TGTCTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa \ No newline at end of file
--- a/tmp/tmp_BC1_visible.fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_BC1_visible_fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GATCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTAGTAGTAGTAGA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTCTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_BC2_visible.fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_BC2_visible_fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCGTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_BC3_visible.fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_BC3_visible_fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_BC4_visible.fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -TGTCTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa \ No newline at end of file
--- a/tmp/tmp_BC4_visible_fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -TGTCTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa \ No newline at end of file
--- a/tmp/tmp_unmatched_visible.fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/tmp_unmatched_visible_fastqsanger Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa
--- a/tmp/unmatched Fri Mar 11 17:24:31 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGTACGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -ATCTCGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -GGAATGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 
-aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTGAGTATACACAT -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa -@CSHL_3_FC042AGLLWW:1:2:7:203 -TAGTTTCTCTATGTACA -+CSHL_3_FC042AGLLWW:1:2:7:203 -aab^V^aU]`aa^aZaa