Mercurial > repos > lparsons > fastx_barcode_splitter_enhanced
comparison fastx_barcode_splitter.pl @ 0:84bbf4fd24c3 draft
Initial toolshed version with support for separate index reads and automatic loading of results into Galaxy history.
author | lparsons |
---|---|
date | Fri, 08 Nov 2013 09:53:39 -0500 |
parents | |
children | e7b7cdc1834d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:84bbf4fd24c3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools. | |
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu) | |
5 # | |
6 # Lance Parsons (lparsons@princeton.edu) | |
7 # 3/21/2011 - Modified to accept separate index file for barcodes | |
8 # 4/6/2011 - Modified to cleanup bad barcode identifiers (esp. useful for Galaxy) | |
9 # | |
10 # This program is free software: you can redistribute it and/or modify | |
11 # it under the terms of the GNU Affero General Public License as | |
12 # published by the Free Software Foundation, either version 3 of the | |
13 # License, or (at your option) any later version. | |
14 # | |
15 # This program is distributed in the hope that it will be useful, | |
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 # GNU Affero General Public License for more details. | |
19 # | |
20 # You should have received a copy of the GNU Affero General Public License | |
21 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
22 | |
23 use strict; | |
24 use warnings; | |
25 use IO::Handle; | |
26 use Data::Dumper; | |
27 use Getopt::Long; | |
28 use Carp; | |
29 | |
30 ## | |
31 ## This program splits a FASTQ/FASTA file into several smaller files, | |
32 ## Based on barcode matching. | |
33 ## | |
34 ## run with "--help" for usage information | |
35 ## | |
36 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008 | |
37 | |
38 # Forward declarations | |
39 sub load_barcode_file ($); | |
40 sub parse_command_line ; | |
41 sub match_sequences ; | |
42 sub mismatch_count($$) ; | |
43 sub print_results; | |
44 sub open_and_detect_input_format; | |
45 sub open_index_and_detect_input_format($); | |
46 sub read_index_record; | |
47 sub read_record; | |
48 sub write_record($); | |
49 sub usage(); | |
50 | |
51 # Global flags and arguments, | |
52 # Set by command line argumens | |
53 my $barcode_file ; | |
54 my $barcodes_at_eol = 0 ; | |
55 my $barcodes_at_bol = 0 ; | |
56 my $index_read_file ; | |
57 my $exact_match = 0 ; | |
58 my $allow_partial_overlap = 0; | |
59 my $allowed_mismatches = 1; | |
60 my $newfile_suffix = ''; | |
61 my $newfile_prefix ; | |
62 my $quiet = 0 ; | |
63 my $debug = 0 ; | |
64 my $fastq_format = 1; | |
65 my $index_fastq_format = 1; | |
66 my $read_id_check_strip_characters = 1; | |
67 | |
68 # Global variables | |
69 # Populated by 'create_output_files' | |
70 my %filenames; | |
71 my %files; | |
72 my %counts = ( 'unmatched' => 0 ); | |
73 my $barcodes_length; | |
74 my @barcodes; | |
75 my $input_file_io; | |
76 | |
77 | |
78 # The Four lines per record in FASTQ format. | |
79 # (when using FASTA format, only the first two are used) | |
80 my $seq_name; | |
81 my $seq_bases; | |
82 my $seq_name2; | |
83 my $seq_qualities; | |
84 | |
85 # Values used for index read file | |
86 my $index_seq_name; | |
87 my $index_seq_bases; | |
88 my $index_seq_name2; | |
89 my $index_seq_qualities; | |
90 | |
91 | |
92 | |
93 # | |
94 # Start of Program | |
95 # | |
96 parse_command_line ; | |
97 | |
98 load_barcode_file ( $barcode_file ) ; | |
99 | |
100 open_and_detect_input_format; | |
101 | |
102 if (defined $index_read_file) {open_index_and_detect_input_format ( $index_read_file );} | |
103 | |
104 match_sequences ; | |
105 | |
106 print_results unless $quiet; | |
107 | |
108 # | |
109 # End of program | |
110 # | |
111 | |
112 | |
113 | |
114 | |
115 | |
116 | |
117 | |
118 | |
119 sub parse_command_line { | |
120 my $help; | |
121 | |
122 usage() if (scalar @ARGV==0); | |
123 | |
124 my $result = GetOptions ( "bcfile=s" => \$barcode_file, | |
125 "eol" => \$barcodes_at_eol, | |
126 "bol" => \$barcodes_at_bol, | |
127 "idxfile=s" => \$index_read_file, | |
128 "idxidstrip=i" => \$read_id_check_strip_characters, | |
129 "exact" => \$exact_match, | |
130 "prefix=s" => \$newfile_prefix, | |
131 "suffix=s" => \$newfile_suffix, | |
132 "quiet" => \$quiet, | |
133 "partial=i" => \$allow_partial_overlap, | |
134 "debug" => \$debug, | |
135 "mismatches=i" => \$allowed_mismatches, | |
136 "help" => \$help | |
137 ) ; | |
138 | |
139 usage() if ($help); | |
140 | |
141 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; | |
142 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; | |
143 | |
144 if (! defined $index_read_file) { | |
145 if ($barcodes_at_bol == $barcodes_at_eol) { | |
146 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; | |
147 die "Error: must specify either --eol or --bol or --idxfile\n" ; | |
148 } | |
149 } | |
150 elsif ($barcodes_at_bol || $barcodes_at_eol) { | |
151 die "Error: Must specify only one of --idxfile, --eol, or --bol"; | |
152 } | |
153 | |
154 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; | |
155 | |
156 $allowed_mismatches = 0 if $exact_match; | |
157 | |
158 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); | |
159 | |
160 die "Error: partial overlap value ($allow_partial_overlap) bigger than " . | |
161 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); | |
162 | |
163 | |
164 exit unless $result; | |
165 } | |
166 | |
167 | |
168 | |
169 # | |
170 # Read the barcode file | |
171 # | |
172 sub load_barcode_file ($) { | |
173 my $filename = shift or croak "Missing barcode file name"; | |
174 | |
175 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; | |
176 while (<BCFILE>) { | |
177 next if m/^#/; | |
178 chomp; | |
179 my ($ident, $barcode) = split('\t') ; | |
180 | |
181 $barcode = uc($barcode); | |
182 | |
183 # Sanity checks on the barcodes | |
184 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; | |
185 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" | |
186 unless $barcode =~ m/^[AGCT]+$/; | |
187 | |
188 # Cleanup Identifiers (only allow alphanumeric, replace others with dash '-') | |
189 $ident =~ s/[^A-Za-z0-9]/-/g; | |
190 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" | |
191 unless $ident =~ m/^[A-Za-z0-9-]+$/; | |
192 | |
193 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . | |
194 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" | |
195 if length($barcode)<=$allowed_mismatches; | |
196 | |
197 $barcodes_length = length($barcode) unless defined $barcodes_length; | |
198 die "Error: found barcodes in different lengths. this feature is not supported yet.\n" | |
199 unless $barcodes_length == length($barcode); | |
200 | |
201 push @barcodes, [$ident, $barcode]; | |
202 | |
203 if ($allow_partial_overlap>0) { | |
204 foreach my $i (1 .. $allow_partial_overlap) { | |
205 substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; | |
206 push @barcodes, [$ident, $barcode]; | |
207 } | |
208 } | |
209 } | |
210 close BCFILE; | |
211 | |
212 if ($debug) { | |
213 print STDERR "barcode\tsequence\n"; | |
214 foreach my $barcoderef (@barcodes) { | |
215 my ($ident, $seq) = @{$barcoderef}; | |
216 print STDERR $ident,"\t", $seq ,"\n"; | |
217 } | |
218 } | |
219 } | |
220 | |
221 # Create one output file for each barcode. | |
222 # (Also create a file for the dummy 'unmatched' barcode) | |
223 sub create_output_files { | |
224 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | |
225 $barcodes{'unmatched'} = 1 ; | |
226 | |
227 foreach my $ident (keys %barcodes) { | |
228 my $new_filename = $newfile_prefix . $ident . $newfile_suffix; | |
229 $filenames{$ident} = $new_filename; | |
230 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; | |
231 $files{$ident} = $file ; | |
232 } | |
233 } | |
234 | |
235 sub match_sequences { | |
236 | |
237 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | |
238 $barcodes{'unmatched'} = 1 ; | |
239 | |
240 #reset counters | |
241 foreach my $ident ( keys %barcodes ) { | |
242 $counts{$ident} = 0; | |
243 } | |
244 | |
245 create_output_files; | |
246 | |
247 # Read file FASTQ file | |
248 # split accotding to barcodes | |
249 while ( read_record ) { | |
250 chomp $seq_name; | |
251 chomp $seq_bases; | |
252 if (defined $index_read_file) { | |
253 read_index_record() or die "Error: Unable to read index sequence for sequence name ($seq_name), check to make sure the file lengths match.\n"; | |
254 chomp $index_seq_name; | |
255 chomp $index_seq_bases; | |
256 | |
257 # Assert that the read ids match | |
258 my $seq_name_match = &strip_read_id($seq_name); | |
259 my $index_seq_name_match = &strip_read_id($index_seq_name); | |
260 if ($seq_name_match ne $index_seq_name_match) { | |
261 die "Error: Index sequence name ($index_seq_name) does not match sequence name ($seq_name)\n"; | |
262 } | |
263 | |
264 } | |
265 | |
266 print STDERR "sequence $seq_bases: \n" if $debug; | |
267 | |
268 my $best_barcode_mismatches_count = $barcodes_length; | |
269 my $best_barcode_ident = undef; | |
270 | |
271 #Try all barcodes, find the one with the lowest mismatch count | |
272 foreach my $barcoderef (@barcodes) { | |
273 my ($ident, $barcode) = @{$barcoderef}; | |
274 | |
275 # Get DNA fragment (in the length of the barcodes) | |
276 # The barcode will be tested only against this fragment | |
277 # (no point in testing the barcode against the whole sequence) | |
278 my $sequence_fragment; | |
279 if ($barcodes_at_bol) { | |
280 $sequence_fragment = substr $seq_bases, 0, $barcodes_length; | |
281 } elsif ($barcodes_at_eol) { | |
282 $sequence_fragment = substr $seq_bases, - $barcodes_length; | |
283 } else { | |
284 $sequence_fragment = substr $index_seq_bases, 0, $barcodes_length; | |
285 } | |
286 | |
287 my $mm = mismatch_count($sequence_fragment, $barcode) ; | |
288 | |
289 # if this is a partial match, add the non-overlap as a mismatch | |
290 # (partial barcodes are shorter than the length of the original barcodes) | |
291 $mm += ($barcodes_length - length($barcode)); | |
292 | |
293 if ( $mm < $best_barcode_mismatches_count ) { | |
294 $best_barcode_mismatches_count = $mm ; | |
295 $best_barcode_ident = $ident ; | |
296 } | |
297 } | |
298 | |
299 $best_barcode_ident = 'unmatched' | |
300 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; | |
301 | |
302 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; | |
303 | |
304 $counts{$best_barcode_ident}++; | |
305 | |
306 #get the file associated with the matched barcode. | |
307 #(note: there's also a file associated with 'unmatched' barcode) | |
308 my $file = $files{$best_barcode_ident}; | |
309 | |
310 write_record($file); | |
311 } | |
312 } | |
313 | |
314 # Strip end of readids when matching to avoid mismatch between read 1, 2, 3, etc. | |
315 sub strip_read_id { | |
316 my $read_id = shift; | |
317 my $stripped_read_id = $read_id; | |
318 if ($read_id_check_strip_characters) { | |
319 if ($read_id =~ /@([a-zA-Z0-9_-]+):([0-9]+):([a-zA-Z0-9]+):([0-9]+):([0-9]+):([0-9]+):([0-9]+) ([0-9]+):([YN]):([0-9]+):([ACGT]+){0,1}/) { # CASAVA 1.8+ | |
320 my @parts = split(/ /,$read_id); | |
321 $stripped_read_id = $parts[0]; | |
322 } else { # CASAVA 1.7 and earlier | |
323 $stripped_read_id = substr($read_id, 0, length($read_id)-$read_id_check_strip_characters); | |
324 } | |
325 } | |
326 return $stripped_read_id; | |
327 } | |
328 | |
329 | |
330 #Quickly calculate hamming distance between two strings | |
331 # | |
332 #NOTE: Strings must be same length. | |
333 # returns number of different characters. | |
334 #see http://www.perlmonks.org/?node_id=500235 | |
335 sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) } | |
336 | |
337 | |
338 | |
339 sub print_results | |
340 { | |
341 print "Barcode\tCount\tLocation\n"; | |
342 my $total = 0 ; | |
343 foreach my $ident (sort keys %counts) { | |
344 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; | |
345 $total += $counts{$ident}; | |
346 } | |
347 print "total\t",$total,"\n"; | |
348 } | |
349 | |
350 | |
351 sub read_record | |
352 { | |
353 $seq_name = $input_file_io->getline(); | |
354 | |
355 return undef unless defined $seq_name; # End of file? | |
356 | |
357 $seq_bases = $input_file_io->getline(); | |
358 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; | |
359 | |
360 # If using FASTQ format, read two more lines | |
361 if ($fastq_format) { | |
362 $seq_name2 = $input_file_io->getline(); | |
363 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; | |
364 | |
365 $seq_qualities = $input_file_io->getline(); | |
366 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; | |
367 } | |
368 return 1; | |
369 } | |
370 | |
371 sub write_record($) | |
372 { | |
373 my $file = shift; | |
374 | |
375 croak "Bad file handle" unless defined $file; | |
376 | |
377 print $file $seq_name,"\n"; | |
378 print $file $seq_bases,"\n"; | |
379 | |
380 #if using FASTQ format, write two more lines | |
381 if ($fastq_format) { | |
382 print $file $seq_name2; | |
383 print $file $seq_qualities; | |
384 } | |
385 } | |
386 | |
387 sub open_and_detect_input_format | |
388 { | |
389 $input_file_io = new IO::Handle; | |
390 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); | |
391 | |
392 # Get the first characeter, and push it back | |
393 my $first_char = $input_file_io->getc(); | |
394 $input_file_io->ungetc(ord $first_char); | |
395 | |
396 if ($first_char eq '>') { | |
397 # FASTA format | |
398 $fastq_format = 0 ; | |
399 print STDERR "Detected FASTA format\n" if $debug; | |
400 } elsif ($first_char eq '@') { | |
401 # FASTQ format | |
402 $fastq_format = 1; | |
403 print STDERR "Detected FASTQ format\n" if $debug; | |
404 } else { | |
405 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; | |
406 } | |
407 } | |
408 | |
409 | |
410 sub open_index_and_detect_input_format($) { | |
411 my $filename = shift or croak "Missing index read file name"; | |
412 | |
413 open IDXFILE,"<$filename" or die "Error: failed to open index read file ($filename)\n"; | |
414 | |
415 # Get the first line, and reset file pointer | |
416 my $first_line = <IDXFILE>; | |
417 my $first_char = substr($first_line, 0, 1); | |
418 seek(IDXFILE, 0, 0); | |
419 | |
420 if ($first_char eq '>') { | |
421 # FASTA format | |
422 $index_fastq_format = 0 ; | |
423 print STDERR "Detected FASTA format for index file\n" if $debug; | |
424 } elsif ($first_char eq '@') { | |
425 # FASTQ format | |
426 $index_fastq_format = 1; | |
427 print STDERR "Detected FASTQ format for index file\n" if $debug; | |
428 } else { | |
429 die "Error: unknown index file format. First character = '$first_char' (expecting > or \@)\n"; | |
430 } | |
431 } | |
432 | |
433 sub read_index_record | |
434 { | |
435 $index_seq_name = <IDXFILE>; | |
436 | |
437 return undef unless defined $index_seq_name; # End of file? | |
438 | |
439 $index_seq_bases = <IDXFILE>; | |
440 die "Error: bad input file, expecting line with sequences\n" unless defined $index_seq_bases; | |
441 | |
442 # If using FASTQ format, read two more lines | |
443 if ($index_fastq_format) { | |
444 $index_seq_name2 = <IDXFILE>; | |
445 die "Error: bad input file, expecting line with sequence name2\n" unless defined $index_seq_name2; | |
446 | |
447 $index_seq_qualities = <IDXFILE>; | |
448 die "Error: bad input file, expecting line with quality scores\n" unless defined $index_seq_qualities; | |
449 } | |
450 return 1; | |
451 } | |
452 | |
453 sub usage() | |
454 { | |
455 print<<EOF; | |
456 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 | |
457 | |
458 This program reads FASTA/FASTQ file and splits it into several smaller files, | |
459 Based on barcode matching. | |
460 FASTA/FASTQ data is read from STDIN (format is auto-detected.) | |
461 Output files will be writen to disk. | |
462 Summary will be printed to STDOUT. | |
463 | |
464 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol|--idxfile] | |
465 [--mismatches N] [--exact] [--partial N] [--idxidstrip N] | |
466 [--help] [--quiet] [--debug] | |
467 | |
468 Arguments: | |
469 | |
470 --bcfile FILE - Barcodes file name. (see explanation below.) | |
471 --prefix PREFIX - File prefix. will be added to the output files. Can be used | |
472 to specify output directories. | |
473 --suffix SUFFIX - File suffix (optional). Can be used to specify file | |
474 extensions. | |
475 --bol - Try to match barcodes at the BEGINNING of sequences. | |
476 (What biologists would call the 5' end, and programmers | |
477 would call index 0.) | |
478 --eol - Try to match barcodes at the END of sequences. | |
479 (What biologists would call the 3' end, and programmers | |
480 would call the end of the string.) | |
481 --idxfile FILE - Read barcodes from separate index file (fasta or fastq) | |
482 NOTE: one of --bol, --eol, --idxfile must be specified, | |
483 but not more than one. | |
484 --idxidstrip N - When using index file, strip this number of characters | |
485 from the end of the sequence id before matching. | |
486 Automatically detects CASAVA 1.8 format and strips at a | |
487 space in the id, use 0 to disable this. | |
488 (Default is 1). | |
489 --mismatches N - Max. number of mismatches allowed. default is 1. | |
490 --exact - Same as '--mismatches 0'. If both --exact and --mismatches | |
491 are specified, '--exact' takes precedence. | |
492 --partial N - Allow partial overlap of barcodes. (see explanation below.) | |
493 (Default is not partial matching) | |
494 --quiet - Don't print counts and summary at the end of the run. | |
495 (Default is to print.) | |
496 --debug - Print lots of useless debug information to STDERR. | |
497 --help - This helpful help screen. | |
498 | |
499 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is | |
500 the barcodes file): | |
501 | |
502 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ | |
503 --prefix /tmp/bla_ --suffix ".txt" | |
504 | |
505 Barcode file format | |
506 ------------------- | |
507 Barcode files are simple text files. Each line should contain an identifier | |
508 (descriptive name for the barcode), and the barcode itself (A/C/G/T), | |
509 separated by a TAB character. Example: | |
510 | |
511 #This line is a comment (starts with a 'number' sign) | |
512 BC1 GATCT | |
513 BC2 ATCGT | |
514 BC3 GTGAT | |
515 BC4 TGTCT | |
516 | |
517 For each barcode, a new FASTQ file will be created (with the barcode's | |
518 identifier as part of the file name). Sequences matching the barcode | |
519 will be stored in the appropriate file. | |
520 | |
521 Running the above example (assuming "mybarcodes.txt" contains the above | |
522 barcodes), will create the following files: | |
523 /tmp/bla_BC1.txt | |
524 /tmp/bla_BC2.txt | |
525 /tmp/bla_BC3.txt | |
526 /tmp/bla_BC4.txt | |
527 /tmp/bla_unmatched.txt | |
528 The 'unmatched' file will contain all sequences that didn't match any barcode. | |
529 | |
530 Barcode matching | |
531 ---------------- | |
532 | |
533 ** Without partial matching: | |
534 | |
535 Count mismatches between the FASTA/Q sequences and the barcodes. | |
536 The barcode which matched with the lowest mismatches count (providing the | |
537 count is small or equal to '--mismatches N') 'gets' the sequences. | |
538 | |
539 Example (using the above barcodes): | |
540 Input Sequence: | |
541 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
542 | |
543 Matching with '--bol --mismatches 1': | |
544 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
545 GATCT (1 mismatch, BC1) | |
546 ATCGT (4 mismatches, BC2) | |
547 GTGAT (3 mismatches, BC3) | |
548 TGTCT (3 mismatches, BC4) | |
549 | |
550 This sequence will be classified as 'BC1' (it has the lowest mismatch count). | |
551 If '--exact' or '--mismatches 0' were specified, this sequence would be | |
552 classified as 'unmatched' (because, although BC1 had the lowest mismatch count, | |
553 it is above the maximum allowed mismatches). | |
554 | |
555 Matching with '--eol' (end of line) does the same, but from the other side | |
556 of the sequence. | |
557 | |
558 ** With partial matching (very similar to indels): | |
559 | |
560 Same as above, with the following addition: barcodes are also checked for | |
561 partial overlap (number of allowed non-overlapping bases is '--partial N'). | |
562 | |
563 Example: | |
564 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
565 (Same as above, but note the missing 'G' at the beginning.) | |
566 | |
567 Matching (without partial overlapping) against BC1 yields 4 mismatches: | |
568 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
569 GATCT (4 mismatches) | |
570 | |
571 Partial overlapping would also try the following match: | |
572 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
573 GATCT (1 mismatch) | |
574 | |
575 Note: scoring counts a missing base as a mismatch, so the final | |
576 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). | |
577 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this | |
578 seqeunce will be classified as BC1. | |
579 | |
580 EOF | |
581 | |
582 exit 1; | |
583 } |