comparison fastx_toolkit-0.0.6/scripts/fastx_barcode_splitter.pl @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 #!/usr/bin/perl
2
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
15 #
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19 use strict;
20 use warnings;
21 use IO::Handle;
22 use Data::Dumper;
23 use Getopt::Long;
24 use Carp;
25
26 ##
27 ## This program splits a FASTQ/FASTA file into several smaller files,
28 ## Based on barcode matching.
29 ##
30 ## run with "--help" for usage information
31 ##
32 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008
33
34 # Forward declarations
35 sub load_barcode_file ($);
36 sub parse_command_line ;
37 sub match_sequences ;
38 sub mismatch_count($$) ;
39 sub print_results;
40 sub open_and_detect_input_format;
41 sub read_record;
42 sub write_record($);
43 sub usage();
44
45 # Global flags and arguments,
46 # Set by command line argumens
47 my $barcode_file ;
48 my $barcodes_at_eol = 0 ;
49 my $barcodes_at_bol = 0 ;
50 my $exact_match = 0 ;
51 my $allow_partial_overlap = 0;
52 my $allowed_mismatches = 1;
53 my $newfile_suffix = '';
54 my $newfile_prefix ;
55 my $quiet = 0 ;
56 my $debug = 0 ;
57 my $fastq_format = 1;
58
59 # Global variables
60 # Populated by 'create_output_files'
61 my %filenames;
62 my %files;
63 my %counts = ( 'unmatched' => 0 );
64 my $barcodes_length;
65 my @barcodes;
66 my $input_file_io;
67
68
69 # The Four lines per record in FASTQ format.
70 # (when using FASTA format, only the first two are used)
71 my $seq_name;
72 my $seq_bases;
73 my $seq_name2;
74 my $seq_qualities;
75
76
77 #
78 # Start of Program
79 #
80 parse_command_line ;
81
82 load_barcode_file ( $barcode_file ) ;
83
84 open_and_detect_input_format;
85
86 match_sequences ;
87
88 print_results unless $quiet;
89
90 #
91 # End of program
92 #
93
94
95
96
97
98
99
100
101 sub parse_command_line {
102 my $help;
103
104 usage() if (scalar @ARGV==0);
105
106 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
107 "eol" => \$barcodes_at_eol,
108 "bol" => \$barcodes_at_bol,
109 "exact" => \$exact_match,
110 "prefix=s" => \$newfile_prefix,
111 "suffix=s" => \$newfile_suffix,
112 "quiet" => \$quiet,
113 "partial=i" => \$allow_partial_overlap,
114 "debug" => \$debug,
115 "mismatches=i" => \$allowed_mismatches,
116 "help" => \$help
117 ) ;
118
119 usage() if ($help);
120
121 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
122 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
123
124 if ($barcodes_at_bol == $barcodes_at_eol) {
125 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
126 die "Error: must specify either --eol or --bol\n" ;
127 }
128
129 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
130
131 $allowed_mismatches = 0 if $exact_match;
132
133 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
134
135 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
136 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
137
138
139 exit unless $result;
140 }
141
142
143
144 #
145 # Read the barcode file
146 #
147 sub load_barcode_file ($) {
148 my $filename = shift or croak "Missing barcode file name";
149
150 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
151 while (<BCFILE>) {
152 next if m/^#/;
153 chomp;
154 my ($ident, $barcode) = split ;
155
156 $barcode = uc($barcode);
157
158 # Sanity checks on the barcodes
159 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
160 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
161 unless $barcode =~ m/^[AGCT]+$/;
162
163 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
164 unless $ident =~ m/^\w+$/;
165
166 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
167 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
168 if length($barcode)<=$allowed_mismatches;
169
170 $barcodes_length = length($barcode) unless defined $barcodes_length;
171 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
172 unless $barcodes_length == length($barcode);
173
174 push @barcodes, [$ident, $barcode];
175
176 if ($allow_partial_overlap>0) {
177 foreach my $i (1 .. $allow_partial_overlap) {
178 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
179 push @barcodes, [$ident, $barcode];
180 }
181 }
182 }
183 close BCFILE;
184
185 if ($debug) {
186 print STDERR "barcode\tsequence\n";
187 foreach my $barcoderef (@barcodes) {
188 my ($ident, $seq) = @{$barcoderef};
189 print STDERR $ident,"\t", $seq ,"\n";
190 }
191 }
192 }
193
194 # Create one output file for each barcode.
195 # (Also create a file for the dummy 'unmatched' barcode)
196 sub create_output_files {
197 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
198 $barcodes{'unmatched'} = 1 ;
199
200 foreach my $ident (keys %barcodes) {
201 my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
202 $filenames{$ident} = $new_filename;
203 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
204 $files{$ident} = $file ;
205 }
206 }
207
208 sub match_sequences {
209
210 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
211 $barcodes{'unmatched'} = 1 ;
212
213 #reset counters
214 foreach my $ident ( keys %barcodes ) {
215 $counts{$ident} = 0;
216 }
217
218 create_output_files;
219
220 # Read file FASTQ file
221 # split accotding to barcodes
222 while ( read_record ) {
223 chomp $seq_bases;
224
225 print STDERR "sequence $seq_bases: \n" if $debug;
226
227 my $best_barcode_mismatches_count = $barcodes_length;
228 my $best_barcode_ident = undef;
229
230 #Try all barcodes, find the one with the lowest mismatch count
231 foreach my $barcoderef (@barcodes) {
232 my ($ident, $barcode) = @{$barcoderef};
233
234 # Get DNA fragment (in the length of the barcodes)
235 # The barcode will be tested only against this fragment
236 # (no point in testing the barcode against the whole sequence)
237 my $sequence_fragment;
238 if ($barcodes_at_bol) {
239 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
240 } else {
241 $sequence_fragment = substr $seq_bases, - $barcodes_length;
242 }
243
244 my $mm = mismatch_count($sequence_fragment, $barcode) ;
245
246 # if this is a partial match, add the non-overlap as a mismatch
247 # (partial barcodes are shorter than the length of the original barcodes)
248 $mm += ($barcodes_length - length($barcode));
249
250 if ( $mm < $best_barcode_mismatches_count ) {
251 $best_barcode_mismatches_count = $mm ;
252 $best_barcode_ident = $ident ;
253 }
254 }
255
256 $best_barcode_ident = 'unmatched'
257 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
258
259 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
260
261 $counts{$best_barcode_ident}++;
262
263 #get the file associated with the matched barcode.
264 #(note: there's also a file associated with 'unmatched' barcode)
265 my $file = $files{$best_barcode_ident};
266
267 write_record($file);
268 }
269 }
270
271 #Quickly calculate hamming distance between two strings
272 #
273 #NOTE: Strings must be same length.
274 # returns number of different characters.
275 #see http://www.perlmonks.org/?node_id=500235
276 sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
277
278
279
280 sub print_results
281 {
282 print "Barcode\tCount\tLocation\n";
283 my $total = 0 ;
284 foreach my $ident (sort keys %counts) {
285 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
286 $total += $counts{$ident};
287 }
288 print "total\t",$total,"\n";
289 }
290
291
292 sub read_record
293 {
294 $seq_name = $input_file_io->getline();
295
296 return undef unless defined $seq_name; # End of file?
297
298 $seq_bases = $input_file_io->getline();
299 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
300
301 # If using FASTQ format, read two more lines
302 if ($fastq_format) {
303 $seq_name2 = $input_file_io->getline();
304 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
305
306 $seq_qualities = $input_file_io->getline();
307 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
308 }
309 return 1;
310 }
311
312 sub write_record($)
313 {
314 my $file = shift;
315
316 croak "Bad file handle" unless defined $file;
317
318 print $file $seq_name;
319 print $file $seq_bases,"\n";
320
321 #if using FASTQ format, write two more lines
322 if ($fastq_format) {
323 print $file $seq_name2;
324 print $file $seq_qualities;
325 }
326 }
327
328 sub open_and_detect_input_format
329 {
330 $input_file_io = new IO::Handle;
331 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
332
333 # Get the first characeter, and push it back
334 my $first_char = $input_file_io->getc();
335 $input_file_io->ungetc(ord $first_char);
336
337 if ($first_char eq '>') {
338 # FASTA format
339 $fastq_format = 0 ;
340 print STDERR "Detected FASTA format\n" if $debug;
341 } elsif ($first_char eq '@') {
342 # FASTQ format
343 $fastq_format = 1;
344 print STDERR "Detected FASTQ format\n" if $debug;
345 } else {
346 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
347 }
348 }
349
350 sub usage()
351 {
352 print<<EOF;
353 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
354
355 This program reads FASTA/FASTQ file and splits it into several smaller files,
356 Based on barcode matching.
357 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
358 Output files will be writen to disk.
359 Summary will be printed to STDOUT.
360
361 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol]
362 [--mismatches N] [--exact] [--partial N] [--help] [--quiet] [--debug]
363
364 Arguments:
365
366 --bcfile FILE - Barcodes file name. (see explanation below.)
367 --prefix PREFIX - File prefix. will be added to the output files. Can be used
368 to specify output directories.
369 --suffix SUFFIX - File suffix (optional). Can be used to specify file
370 extensions.
371 --bol - Try to match barcodes at the BEGINNING of sequences.
372 (What biologists would call the 5' end, and programmers
373 would call index 0.)
374 --eol - Try to match barcodes at the END of sequences.
375 (What biologists would call the 3' end, and programmers
376 would call the end of the string.)
377 NOTE: one of --bol, --eol must be specified, but not both.
378 --mismatches N - Max. number of mismatches allowed. default is 1.
379 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
380 are specified, '--exact' takes precedence.
381 --partial N - Allow partial overlap of barcodes. (see explanation below.)
382 (Default is not partial matching)
383 --quiet - Don't print counts and summary at the end of the run.
384 (Default is to print.)
385 --debug - Print lots of useless debug information to STDERR.
386 --help - This helpful help screen.
387
388 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
389 the barcodes file):
390
391 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
392 --prefix /tmp/bla_ --suffix ".txt"
393
394 Barcode file format
395 -------------------
396 Barcode files are simple text files. Each line should contain an identifier
397 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
398 separated by a TAB character. Example:
399
400 #This line is a comment (starts with a 'number' sign)
401 BC1 GATCT
402 BC2 ATCGT
403 BC3 GTGAT
404 BC4 TGTCT
405
406 For each barcode, a new FASTQ file will be created (with the barcode's
407 identifier as part of the file name). Sequences matching the barcode
408 will be stored in the appropriate file.
409
410 Running the above example (assuming "mybarcodes.txt" contains the above
411 barcodes), will create the following files:
412 /tmp/bla_BC1.txt
413 /tmp/bla_BC2.txt
414 /tmp/bla_BC3.txt
415 /tmp/bla_BC4.txt
416 /tmp/bla_unmatched.txt
417 The 'unmatched' file will contain all sequences that didn't match any barcode.
418
419 Barcode matching
420 ----------------
421
422 ** Without partial matching:
423
424 Count mismatches between the FASTA/Q sequences and the barcodes.
425 The barcode which matched with the lowest mismatches count (providing the
426 count is small or equal to '--mismatches N') 'gets' the sequences.
427
428 Example (using the above barcodes):
429 Input Sequence:
430 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
431
432 Matching with '--bol --mismatches 1':
433 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
434 GATCT (1 mismatch, BC1)
435 ATCGT (4 mismatches, BC2)
436 GTGAT (3 mismatches, BC3)
437 TGTCT (3 mismatches, BC4)
438
439 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
440 If '--exact' or '--mismatches 0' were specified, this sequence would be
441 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
442 it is above the maximum allowed mismatches).
443
444 Matching with '--eol' (end of line) does the same, but from the other side
445 of the sequence.
446
447 ** With partial matching (very similar to indels):
448
449 Same as above, with the following addition: barcodes are also checked for
450 partial overlap (number of allowed non-overlapping bases is '--partial N').
451
452 Example:
453 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
454 (Same as above, but note the missing 'G' at the beginning.)
455
456 Matching (without partial overlapping) against BC1 yields 4 mismatches:
457 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
458 GATCT (4 mismatches)
459
460 Partial overlapping would also try the following match:
461 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
462 GATCT (1 mismatch)
463
464 Note: scoring counts a missing base as a mismatch, so the final
465 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
466 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
467 seqeunce will be classified as BC1.
468
469 EOF
470
471 exit 1;
472 }