Mercurial > repos > lparsons > fastx_barcode_splitter_enhanced

diff fastx_barcode_splitter.pl @ 4:0fb7e9130a70 draft default tip
planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/fastx_barcode_splitter_enhanced commit 460463a5406419fe8e113467bdb8bd093d21e7c5
author: lparsons
date: Mon, 02 May 2016 17:04:32 -0400
parents: e7b7cdc1834d
--- a/fastx_barcode_splitter.pl	Fri Mar 11 17:24:31 2016 -0500
+++ b/fastx_barcode_splitter.pl	Mon May 02 17:04:32 2016 -0400
@@ -6,7 +6,9 @@
 #   Lance Parsons (lparsons@princeton.edu)
 #   3/21/2011 - Modified to accept separate index file for barcodes
 #   4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy)
-#
+#   4/28/2016 - Modified summary output to remove file paths and add comment
+#               character '#'
+
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as
 #   published by the Free Software Foundation, either version 3 of the
@@ -48,7 +50,7 @@
 sub write_record($);
 sub usage();
 
-# Global flags and arguments, 
+# Global flags and arguments,
 # Set by command line argumens
 my $barcode_file ;
 my $barcodes_at_eol = 0 ;
@@ -65,7 +67,7 @@
 my $index_fastq_format = 1;
 my $read_id_check_strip_characters = 1;
 
-# Global variables 
+# Global variables
 # Populated by 'create_output_files'
 my %filenames;
 my %files;
@@ -117,51 +119,51 @@
 
 
 sub parse_command_line {
-	my $help;
+  my $help;
 
-	usage() if (scalar @ARGV==0);
+  usage() if (scalar @ARGV==0);
 
-	my $result = GetOptions ( "bcfile=s" => \$barcode_file,
-		"eol"  => \$barcodes_at_eol,
-		"bol"  => \$barcodes_at_bol,
-		"idxfile=s"  => \$index_read_file,
-		"idxidstrip=i" => \$read_id_check_strip_characters,
-		"exact" => \$exact_match,
-		"prefix=s" => \$newfile_prefix,
-		"suffix=s" => \$newfile_suffix,
-		"quiet" => \$quiet, 
-		"partial=i" => \$allow_partial_overlap,
-		"debug" => \$debug,
-		"mismatches=i" => \$allowed_mismatches,
-		"help" => \$help
-	) ;
+  my $result = GetOptions ( "bcfile=s" => \$barcode_file,
+    "eol"  => \$barcodes_at_eol,
+    "bol"  => \$barcodes_at_bol,
+    "idxfile=s"  => \$index_read_file,
+    "idxidstrip=i" => \$read_id_check_strip_characters,
+    "exact" => \$exact_match,
+    "prefix=s" => \$newfile_prefix,
+    "suffix=s" => \$newfile_suffix,
+    "quiet" => \$quiet,
+    "partial=i" => \$allow_partial_overlap,
+    "debug" => \$debug,
+    "mismatches=i" => \$allowed_mismatches,
+    "help" => \$help
+  ) ;
 
-	usage() if ($help);
+  usage() if ($help);
 
-	die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
-	die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
+  die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
+  die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
 
-	if (! defined $index_read_file) {
-		if ($barcodes_at_bol == $barcodes_at_eol) {
-			die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
-			die "Error: must specify either --eol or --bol or --idxfile\n" ;
-		}
-	}
-	elsif ($barcodes_at_bol || $barcodes_at_eol) {
-		die "Error: Must specify only one of --idxfile, --eol, or --bol";
-	}
+  if (! defined $index_read_file) {
+    if ($barcodes_at_bol == $barcodes_at_eol) {
+      die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
+      die "Error: must specify either --eol or --bol or --idxfile\n" ;
+    }
+  }
+  elsif ($barcodes_at_bol || $barcodes_at_eol) {
+    die "Error: Must specify only one of --idxfile, --eol, or --bol";
+  }
 
-	die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
+  die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
 
-	$allowed_mismatches = 0 if $exact_match;
+  $allowed_mismatches = 0 if $exact_match;
 
-	die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
+  die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
 
-	die "Error: partial overlap value ($allow_partial_overlap) bigger than " . 
-	"max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
+  die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
+  "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
 
 
-	exit unless $result;
+  exit unless $result;
 }
 
 
@@ -170,160 +172,160 @@
 # Read the barcode file
 #
 sub load_barcode_file ($) {
-	my $filename = shift or croak "Missing barcode file name";
+  my $filename = shift or croak "Missing barcode file name";
 
-	open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
-	while (<BCFILE>) {
-		next if m/^#/;
-		chomp;
-		my ($ident, $barcode) = split('\t') ;
+  open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
+  while (<BCFILE>) {
+    next if m/^#/;
+    chomp;
+    my ($ident, $barcode) = split('\t') ;
 
-		$barcode = uc($barcode);
+    $barcode = uc($barcode);
 
-		# Sanity checks on the barcodes
-		die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
-		die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
-		unless $barcode =~ m/^[AGCT]+$/;
+    # Sanity checks on the barcodes
+    die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
+    die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
+    unless $barcode =~ m/^[AGCT]+$/;
 
-		# Cleanup Identifiers (only allow alphanumeric, replace others with dash '-')
-		$ident =~ s/[^A-Za-z0-9]/-/g;
-		die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" 
-		unless $ident =~ m/^[A-Za-z0-9-]+$/;
+    # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-')
+    $ident =~ s/[^A-Za-z0-9]/-/g;
+    die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
+    unless $ident =~ m/^[A-Za-z0-9-]+$/;
 
-		die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
-		"mismatches ($allowed_mismatches). This makes no sense. Specify fewer  mismatches.\n" 
-		if length($barcode)<=$allowed_mismatches;
+    die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
+    "mismatches ($allowed_mismatches). This makes no sense. Specify fewer  mismatches.\n"
+    if length($barcode)<=$allowed_mismatches;
 
-		$barcodes_length = length($barcode) unless defined $barcodes_length;
-		die "Error: found barcodes in different lengths. this feature is not supported yet.\n" 
-		unless $barcodes_length == length($barcode);
+    $barcodes_length = length($barcode) unless defined $barcodes_length;
+    die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
+    unless $barcodes_length == length($barcode);
 
-		push @barcodes, [$ident, $barcode];
+    push @barcodes, [$ident, $barcode];
 
-		if ($allow_partial_overlap>0) {
-			foreach my $i (1 .. $allow_partial_overlap) {
-				substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
-				push @barcodes, [$ident, $barcode];
-			}
-		}
-	}
-	close BCFILE;
+    if ($allow_partial_overlap>0) {
+      foreach my $i (1 .. $allow_partial_overlap) {
+        substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
+        push @barcodes, [$ident, $barcode];
+      }
+    }
+  }
+  close BCFILE;
 
-	if ($debug) {
-		print STDERR "barcode\tsequence\n";
-		foreach my $barcoderef (@barcodes) {
-			my ($ident, $seq) = @{$barcoderef};
-			print STDERR $ident,"\t", $seq ,"\n";
-		}
-	}
+  if ($debug) {
+    print STDERR "barcode\tsequence\n";
+    foreach my $barcoderef (@barcodes) {
+      my ($ident, $seq) = @{$barcoderef};
+      print STDERR $ident,"\t", $seq ,"\n";
+    }
+  }
 }
 
 # Create one output file for each barcode.
 # (Also create a file for the dummy 'unmatched' barcode)
 sub create_output_files {
-	my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
-	$barcodes{'unmatched'} = 1 ;
+  my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
+  $barcodes{'unmatched'} = 1 ;
 
-	foreach my $ident (keys %barcodes) {
-		my $new_filename = $newfile_prefix . $ident . $newfile_suffix; 
-		$filenames{$ident} = $new_filename;
-		open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; 
-		$files{$ident} = $file ;
-	}
+  foreach my $ident (keys %barcodes) {
+    my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
+    $filenames{$ident} = $new_filename;
+    open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
+    $files{$ident} = $file ;
+  }
 }
 
 sub match_sequences {
 
-	my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
-	$barcodes{'unmatched'} = 1 ;
+  my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
+  $barcodes{'unmatched'} = 1 ;
 
-	#reset counters
-	foreach my $ident ( keys %barcodes ) {
-		$counts{$ident} = 0;
-	}
+  #reset counters
+  foreach my $ident ( keys %barcodes ) {
+    $counts{$ident} = 0;
+  }
 
-	create_output_files;
+  create_output_files;
 
-	# Read file FASTQ file
-	# split accotding to barcodes
-	while ( read_record ) {
-		chomp $seq_name;
-		chomp $seq_bases;
-		if (defined $index_read_file) {
-			read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n";
-			chomp $index_seq_name;
-			chomp $index_seq_bases;
+  # Read the FASTA/FASTQ input file
+  # and split according to barcodes
+  while ( read_record ) {
+    chomp $seq_name;
+    chomp $seq_bases;
+    if (defined $index_read_file) {
+      read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n";
+      chomp $index_seq_name;
+      chomp $index_seq_bases;
 
-			# Assert that the read ids match
-			my $seq_name_match = &strip_read_id($seq_name);
-			my $index_seq_name_match = &strip_read_id($index_seq_name);
-			if ($seq_name_match ne $index_seq_name_match) {
-				die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n";
-			}
+      # Assert that the read ids match
+      my $seq_name_match = &strip_read_id($seq_name);
+      my $index_seq_name_match = &strip_read_id($index_seq_name);
+      if ($seq_name_match ne $index_seq_name_match) {
+        die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n";
+      }
 
-		}
+    }
 
-		print STDERR "sequence $seq_bases: \n" if $debug;
+    print STDERR "sequence $seq_bases: \n" if $debug;
 
-		my $best_barcode_mismatches_count = $barcodes_length;
-		my $best_barcode_ident = undef;
+    my $best_barcode_mismatches_count = $barcodes_length;
+    my $best_barcode_ident = undef;
 
-		#Try all barcodes, find the one with the lowest mismatch count
-		foreach my $barcoderef (@barcodes) {
-			my ($ident, $barcode) = @{$barcoderef};
+    #Try all barcodes, find the one with the lowest mismatch count
+    foreach my $barcoderef (@barcodes) {
+      my ($ident, $barcode) = @{$barcoderef};
 
-			# Get DNA fragment (in the length of the barcodes)
-			# The barcode will be tested only against this fragment
-			# (no point in testing the barcode against the whole sequence)
-			my $sequence_fragment;
-			if ($barcodes_at_bol) {
-				$sequence_fragment = substr $seq_bases, 0, $barcodes_length;
-			} elsif ($barcodes_at_eol) {
-				$sequence_fragment = substr $seq_bases, - $barcodes_length;
-			} else {
-				$sequence_fragment = substr $index_seq_bases, 0, $barcodes_length;
-			}
+      # Get DNA fragment (in the length of the barcodes)
+      # The barcode will be tested only against this fragment
+      # (no point in testing the barcode against the whole sequence)
+      my $sequence_fragment;
+      if ($barcodes_at_bol) {
+        $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
+      } elsif ($barcodes_at_eol) {
+        $sequence_fragment = substr $seq_bases, - $barcodes_length;
+      } else {
+        $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length;
+      }
 
-			my $mm = mismatch_count($sequence_fragment, $barcode) ; 
+      my $mm = mismatch_count($sequence_fragment, $barcode) ;
 
-			# if this is a partial match, add the non-overlap as a mismatch
-			# (partial barcodes are shorter than the length of the original barcodes)
-			$mm += ($barcodes_length - length($barcode)); 
+      # if this is a partial match, add the non-overlap as a mismatch
+      # (partial barcodes are shorter than the length of the original barcodes)
+      $mm += ($barcodes_length - length($barcode));
 
-			if ( $mm < $best_barcode_mismatches_count ) {
-				$best_barcode_mismatches_count = $mm ;
-				$best_barcode_ident = $ident ;
-			}
-		}
+      if ( $mm < $best_barcode_mismatches_count ) {
+        $best_barcode_mismatches_count = $mm ;
+        $best_barcode_ident = $ident ;
+      }
+    }
 
-		$best_barcode_ident = 'unmatched' 
-		if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
+    $best_barcode_ident = 'unmatched'
+    if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
 
-		print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
+    print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
 
-		$counts{$best_barcode_ident}++;
+    $counts{$best_barcode_ident}++;
 
-		#get the file associated with the matched barcode.
-		#(note: there's also a file associated with 'unmatched' barcode)
-		my $file = $files{$best_barcode_ident};
+    #get the file associated with the matched barcode.
+    #(note: there's also a file associated with 'unmatched' barcode)
+    my $file = $files{$best_barcode_ident};
 
-		write_record($file);
-	}
+    write_record($file);
+  }
 }
 
 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc.
 sub strip_read_id {
-	my $read_id = shift;
-	my $stripped_read_id = $read_id;
-	if ($read_id_check_strip_characters) {
-		if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+
-			my @parts = split(/ /,$read_id);
-			$stripped_read_id = $parts[0];
-		} else { # CASAVA 1.7 and earlier
-			$stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters);
-		}
-	}
-	return $stripped_read_id;
+  my $read_id = shift;
+  my $stripped_read_id = $read_id;
+  if ($read_id_check_strip_characters) {
+    if ($read_id =~ /@([^:]+):([0-9]+):([^:]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+
+      my @parts = split(/ /,$read_id);
+      $stripped_read_id = $parts[0];
+    } else { # CASAVA 1.7 and earlier
+      $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters);
+    }
+  }
+  return $stripped_read_id;
 }
 
 
@@ -338,121 +340,121 @@
 
 sub print_results
 {
-	print "Barcode\tCount\tLocation\n";
-	my $total = 0 ;
-	foreach my $ident (sort keys %counts) {
-		print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
-		$total += $counts{$ident};
-	}
-	print "total\t",$total,"\n";
+  print "# Barcode\tCount\n";
+  my $total = 0 ;
+  foreach my $ident (sort keys %counts) {
+    print $ident, "\t", $counts{$ident},"\n";
+    $total += $counts{$ident};
+  }
+  print "total\t",$total,"\n";
 }
 
 
 sub read_record
 {
-	$seq_name = $input_file_io->getline();
+  $seq_name = $input_file_io->getline();
 
-	return undef unless defined $seq_name; # End of file?
+  return undef unless defined $seq_name; # End of file?
 
-	$seq_bases = $input_file_io->getline();
-	die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
+  $seq_bases = $input_file_io->getline();
+  die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
 
-	# If using FASTQ format, read two more lines
-	if ($fastq_format) {
-		$seq_name2  = $input_file_io->getline();
-		die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
+  # If using FASTQ format, read two more lines
+  if ($fastq_format) {
+    $seq_name2  = $input_file_io->getline();
+    die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
 
-		$seq_qualities = $input_file_io->getline();
-		die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
-	}
-	return 1;
+    $seq_qualities = $input_file_io->getline();
+    die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
+  }
+  return 1;
 }
 
 sub write_record($)
 {
-	my $file = shift;
+  my $file = shift;
 
-	croak "Bad file handle" unless defined $file;
+  croak "Bad file handle" unless defined $file;
 
-	print $file $seq_name,"\n";
-	print $file $seq_bases,"\n";
+  print $file $seq_name,"\n";
+  print $file $seq_bases,"\n";
 
-	#if using FASTQ format, write two more lines
-	if ($fastq_format) {
-		print $file $seq_name2;
-		print $file $seq_qualities;
-	}
+  #if using FASTQ format, write two more lines
+  if ($fastq_format) {
+    print $file $seq_name2;
+    print $file $seq_qualities;
+  }
 }
 
 sub open_and_detect_input_format
 {
-	$input_file_io  = new IO::Handle;
-	die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
+  $input_file_io  = new IO::Handle;
+  die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
 
-	# Get the first characeter, and push it back
-	my $first_char = $input_file_io->getc();
-	$input_file_io->ungetc(ord $first_char);
+  # Get the first character, and push it back
+  my $first_char = $input_file_io->getc();
+  $input_file_io->ungetc(ord $first_char);
 
-	if ($first_char eq '>') {
-		# FASTA format
-		$fastq_format = 0 ;
-		print STDERR "Detected FASTA format\n" if $debug;
-	} elsif ($first_char eq '@') {
-		# FASTQ format
-		$fastq_format = 1;
-		print STDERR "Detected FASTQ format\n" if $debug;
-	} else {
-		die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
-	}
+  if ($first_char eq '>') {
+    # FASTA format
+    $fastq_format = 0 ;
+    print STDERR "Detected FASTA format\n" if $debug;
+  } elsif ($first_char eq '@') {
+    # FASTQ format
+    $fastq_format = 1;
+    print STDERR "Detected FASTQ format\n" if $debug;
+  } else {
+    die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
+  }
 }
 
 
 sub open_index_and_detect_input_format($) {
-	my $filename = shift or croak "Missing index read file name";
+  my $filename = shift or croak "Missing index read file name";
 
-	open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n";
+  open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n";
 
-	# Get the first line, and reset file pointer
-	my $first_line = <IDXFILE>;
-	my $first_char = substr($first_line, 0, 1);
-	seek(IDXFILE, 0, 0);
+  # Get the first line, and reset file pointer
+  my $first_line = <IDXFILE>;
+  my $first_char = substr($first_line, 0, 1);
+  seek(IDXFILE, 0, 0);
 
-	if ($first_char eq '>') {
-		# FASTA format
-		$index_fastq_format = 0 ;
-		print STDERR "Detected FASTA format for index file\n" if $debug;
-	} elsif ($first_char eq '@') {
-		# FASTQ format
-		$index_fastq_format = 1;
-		print STDERR "Detected FASTQ format for index file\n" if $debug;
-	} else {
-		die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n";
-	}
+  if ($first_char eq '>') {
+    # FASTA format
+    $index_fastq_format = 0 ;
+    print STDERR "Detected FASTA format for index file\n" if $debug;
+  } elsif ($first_char eq '@') {
+    # FASTQ format
+    $index_fastq_format = 1;
+    print STDERR "Detected FASTQ format for index file\n" if $debug;
+  } else {
+    die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n";
+  }
 }
 
 sub read_index_record
 {
-	$index_seq_name = <IDXFILE>;
+  $index_seq_name = <IDXFILE>;
 
-	return undef unless defined $index_seq_name; # End of file?
+  return undef unless defined $index_seq_name; # End of file?
 
-	$index_seq_bases = <IDXFILE>;
-	die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases;
+  $index_seq_bases = <IDXFILE>;
+  die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases;
 
-	# If using FASTQ format, read two more lines
-	if ($index_fastq_format) {
-		$index_seq_name2  = <IDXFILE>;
-		die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2;
+  # If using FASTQ format, read two more lines
+  if ($index_fastq_format) {
+    $index_seq_name2  = <IDXFILE>;
+    die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2;
 
-		$index_seq_qualities = <IDXFILE>;
-		die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities;
-	}
-	return 1;
+    $index_seq_qualities = <IDXFILE>;
+    die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities;
+  }
+  return 1;
 }
 
 sub usage()
 {
-	print<<EOF;
+  print<<EOF;
 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
 
 This program reads FASTA/FASTQ file and splits it into several smaller files,
@@ -461,51 +463,51 @@
 Output files will be writen to disk.
 Summary will be printed to STDOUT.
 
-usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] 
-	 [--mismatches N] [--exact] [--partial N] [--idxidstrip N]
-	 [--help] [--quiet] [--debug]
+usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile]
+   [--mismatches N] [--exact] [--partial N] [--idxidstrip N]
+   [--help] [--quiet] [--debug]
 
 Arguments:
 
---bcfile FILE	- Barcodes file name. (see explanation below.)
---prefix PREFIX	- File prefix. will be added to the output files. Can be used
-		  to specify output directories.
---suffix SUFFIX	- File suffix (optional). Can be used to specify file
-		  extensions.
---bol		- Try to match barcodes at the BEGINNING of sequences.
-		  (What biologists would call the 5' end, and programmers
-		  would call index 0.)
---eol		- Try to match barcodes at the END of sequences.
-		  (What biologists would call the 3' end, and programmers
-		  would call the end of the string.)
+--bcfile FILE  - Barcodes file name. (see explanation below.)
+--prefix PREFIX  - File prefix. will be added to the output files. Can be used
+      to specify output directories.
+--suffix SUFFIX  - File suffix (optional). Can be used to specify file
+      extensions.
+--bol    - Try to match barcodes at the BEGINNING of sequences.
+      (What biologists would call the 5' end, and programmers
+      would call index 0.)
+--eol    - Try to match barcodes at the END of sequences.
+      (What biologists would call the 3' end, and programmers
+      would call the end of the string.)
 --idxfile FILE  - Read barcodes from separate index file (fasta or fastq)
-		  NOTE: one of --bol, --eol, --idxfile must be specified,
-		       but not more than one.
+      NOTE: one of --bol, --eol, --idxfile must be specified,
+           but not more than one.
 --idxidstrip N  - When using index file, strip this number of characters
-		  from the end of the sequence id before matching.
-		  Automatically detects CASAVA 1.8 format and strips at a
-		  space in the id, use 0 to disable this.
-		  (Default is 1). 
---mismatches N	- Max. number of mismatches allowed. default is 1.
---exact		- Same as '--mismatches 0'. If both --exact and --mismatches 
-		  are specified, '--exact' takes precedence.
---partial N	- Allow partial overlap of barcodes. (see explanation below.)
-		  (Default is not partial matching)
---quiet		- Don't print counts and summary at the end of the run.
-		  (Default is to print.)
---debug		- Print lots of useless debug information to STDERR.
---help		- This helpful help screen.
+      from the end of the sequence id before matching.
+      Automatically detects CASAVA 1.8 format and strips at a
+      space in the id, use 0 to disable this.
+      (Default is 1).
+--mismatches N  - Max. number of mismatches allowed. default is 1.
+--exact    - Same as '--mismatches 0'. If both --exact and --mismatches
+      are specified, '--exact' takes precedence.
+--partial N  - Allow partial overlap of barcodes. (see explanation below.)
+      (Default is not partial matching)
+--quiet    - Don't print counts and summary at the end of the run.
+      (Default is to print.)
+--debug    - Print lots of useless debug information to STDERR.
+--help    - This helpful help screen.
 
-Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is 
+Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
 the barcodes file):
 
-   \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
-	--prefix /tmp/bla_ --suffix ".txt"
+  \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
+  --prefix /tmp/bla_ --suffix ".txt"
 
 Barcode file format
 -------------------
-Barcode files are simple text files. Each line should contain an identifier 
-(descriptive name for the barcode), and the barcode itself (A/C/G/T), 
+Barcode files are simple text files. Each line should contain an identifier
+(descriptive name for the barcode), and the barcode itself (A/C/G/T),
 separated by a TAB character. Example:
 
     #This line is a comment (starts with a 'number' sign)
@@ -514,17 +516,17 @@
     BC3 GTGAT
     BC4 TGTCT
 
-For each barcode, a new FASTQ file will be created (with the barcode's 
-identifier as part of the file name). Sequences matching the barcode 
+For each barcode, a new FASTQ file will be created (with the barcode's
+identifier as part of the file name). Sequences matching the barcode
 will be stored in the appropriate file.
 
-Running the above example (assuming "mybarcodes.txt" contains the above 
+Running the above example (assuming "mybarcodes.txt" contains the above
 barcodes), will create the following files:
-	/tmp/bla_BC1.txt
-	/tmp/bla_BC2.txt
-	/tmp/bla_BC3.txt
-	/tmp/bla_BC4.txt
-	/tmp/bla_unmatched.txt
+  /tmp/bla_BC1.txt
+  /tmp/bla_BC2.txt
+  /tmp/bla_BC3.txt
+  /tmp/bla_BC4.txt
+  /tmp/bla_unmatched.txt
 The 'unmatched' file will contain all sequences that didn't match any barcode.
 
 Barcode matching
@@ -548,7 +550,7 @@
 TGTCT (3 mismatches, BC4)
 
 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
-If '--exact' or '--mismatches 0' were specified, this sequence would be 
+If '--exact' or '--mismatches 0' were specified, this sequence would be
 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
 it is above the maximum allowed mismatches).
 
@@ -574,7 +576,7 @@
 
 Note: scoring counts a missing base as a mismatch, so the final
 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
-If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this 
+If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
 seqeunce will be classified as BC1.
 
 EOF
author	lparsons
date	Mon, 02 May 2016 17:04:32 -0400
parents	e7b7cdc1834d
children