Mercurial > repos > xilinxu > xilinxu
comparison fastx_toolkit-0.0.6/scripts/fastx_barcode_splitter.pl @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:dfe9332138cf | 3:997f5136985f |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools. | |
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu) | |
5 # | |
6 # This program is free software: you can redistribute it and/or modify | |
7 # it under the terms of the GNU Affero General Public License as | |
8 # published by the Free Software Foundation, either version 3 of the | |
9 # License, or (at your option) any later version. | |
10 # | |
11 # This program is distributed in the hope that it will be useful, | |
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 # GNU Affero General Public License for more details. | |
15 # | |
16 # You should have received a copy of the GNU Affero General Public License | |
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | |
19 use strict; | |
20 use warnings; | |
21 use IO::Handle; | |
22 use Data::Dumper; | |
23 use Getopt::Long; | |
24 use Carp; | |
25 | |
26 ## | |
27 ## This program splits a FASTQ/FASTA file into several smaller files, | |
28 ## Based on barcode matching. | |
29 ## | |
30 ## run with "--help" for usage information | |
31 ## | |
32 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008 | |
33 | |
34 # Forward declarations | |
35 sub load_barcode_file ($); | |
36 sub parse_command_line ; | |
37 sub match_sequences ; | |
38 sub mismatch_count($$) ; | |
39 sub print_results; | |
40 sub open_and_detect_input_format; | |
41 sub read_record; | |
42 sub write_record($); | |
43 sub usage(); | |
44 | |
45 # Global flags and arguments, | |
46 # Set by command line argumens | |
47 my $barcode_file ; | |
48 my $barcodes_at_eol = 0 ; | |
49 my $barcodes_at_bol = 0 ; | |
50 my $exact_match = 0 ; | |
51 my $allow_partial_overlap = 0; | |
52 my $allowed_mismatches = 1; | |
53 my $newfile_suffix = ''; | |
54 my $newfile_prefix ; | |
55 my $quiet = 0 ; | |
56 my $debug = 0 ; | |
57 my $fastq_format = 1; | |
58 | |
59 # Global variables | |
60 # Populated by 'create_output_files' | |
61 my %filenames; | |
62 my %files; | |
63 my %counts = ( 'unmatched' => 0 ); | |
64 my $barcodes_length; | |
65 my @barcodes; | |
66 my $input_file_io; | |
67 | |
68 | |
69 # The Four lines per record in FASTQ format. | |
70 # (when using FASTA format, only the first two are used) | |
71 my $seq_name; | |
72 my $seq_bases; | |
73 my $seq_name2; | |
74 my $seq_qualities; | |
75 | |
76 | |
77 # | |
78 # Start of Program | |
79 # | |
80 parse_command_line ; | |
81 | |
82 load_barcode_file ( $barcode_file ) ; | |
83 | |
84 open_and_detect_input_format; | |
85 | |
86 match_sequences ; | |
87 | |
88 print_results unless $quiet; | |
89 | |
90 # | |
91 # End of program | |
92 # | |
93 | |
94 | |
95 | |
96 | |
97 | |
98 | |
99 | |
100 | |
101 sub parse_command_line { | |
102 my $help; | |
103 | |
104 usage() if (scalar @ARGV==0); | |
105 | |
106 my $result = GetOptions ( "bcfile=s" => \$barcode_file, | |
107 "eol" => \$barcodes_at_eol, | |
108 "bol" => \$barcodes_at_bol, | |
109 "exact" => \$exact_match, | |
110 "prefix=s" => \$newfile_prefix, | |
111 "suffix=s" => \$newfile_suffix, | |
112 "quiet" => \$quiet, | |
113 "partial=i" => \$allow_partial_overlap, | |
114 "debug" => \$debug, | |
115 "mismatches=i" => \$allowed_mismatches, | |
116 "help" => \$help | |
117 ) ; | |
118 | |
119 usage() if ($help); | |
120 | |
121 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; | |
122 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; | |
123 | |
124 if ($barcodes_at_bol == $barcodes_at_eol) { | |
125 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; | |
126 die "Error: must specify either --eol or --bol\n" ; | |
127 } | |
128 | |
129 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; | |
130 | |
131 $allowed_mismatches = 0 if $exact_match; | |
132 | |
133 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); | |
134 | |
135 die "Error: partial overlap value ($allow_partial_overlap) bigger than " . | |
136 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); | |
137 | |
138 | |
139 exit unless $result; | |
140 } | |
141 | |
142 | |
143 | |
144 # | |
145 # Read the barcode file | |
146 # | |
147 sub load_barcode_file ($) { | |
148 my $filename = shift or croak "Missing barcode file name"; | |
149 | |
150 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; | |
151 while (<BCFILE>) { | |
152 next if m/^#/; | |
153 chomp; | |
154 my ($ident, $barcode) = split ; | |
155 | |
156 $barcode = uc($barcode); | |
157 | |
158 # Sanity checks on the barcodes | |
159 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; | |
160 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" | |
161 unless $barcode =~ m/^[AGCT]+$/; | |
162 | |
163 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" | |
164 unless $ident =~ m/^\w+$/; | |
165 | |
166 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . | |
167 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" | |
168 if length($barcode)<=$allowed_mismatches; | |
169 | |
170 $barcodes_length = length($barcode) unless defined $barcodes_length; | |
171 die "Error: found barcodes in different lengths. this feature is not supported yet.\n" | |
172 unless $barcodes_length == length($barcode); | |
173 | |
174 push @barcodes, [$ident, $barcode]; | |
175 | |
176 if ($allow_partial_overlap>0) { | |
177 foreach my $i (1 .. $allow_partial_overlap) { | |
178 substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; | |
179 push @barcodes, [$ident, $barcode]; | |
180 } | |
181 } | |
182 } | |
183 close BCFILE; | |
184 | |
185 if ($debug) { | |
186 print STDERR "barcode\tsequence\n"; | |
187 foreach my $barcoderef (@barcodes) { | |
188 my ($ident, $seq) = @{$barcoderef}; | |
189 print STDERR $ident,"\t", $seq ,"\n"; | |
190 } | |
191 } | |
192 } | |
193 | |
194 # Create one output file for each barcode. | |
195 # (Also create a file for the dummy 'unmatched' barcode) | |
196 sub create_output_files { | |
197 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | |
198 $barcodes{'unmatched'} = 1 ; | |
199 | |
200 foreach my $ident (keys %barcodes) { | |
201 my $new_filename = $newfile_prefix . $ident . $newfile_suffix; | |
202 $filenames{$ident} = $new_filename; | |
203 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; | |
204 $files{$ident} = $file ; | |
205 } | |
206 } | |
207 | |
208 sub match_sequences { | |
209 | |
210 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | |
211 $barcodes{'unmatched'} = 1 ; | |
212 | |
213 #reset counters | |
214 foreach my $ident ( keys %barcodes ) { | |
215 $counts{$ident} = 0; | |
216 } | |
217 | |
218 create_output_files; | |
219 | |
220 # Read file FASTQ file | |
221 # split accotding to barcodes | |
222 while ( read_record ) { | |
223 chomp $seq_bases; | |
224 | |
225 print STDERR "sequence $seq_bases: \n" if $debug; | |
226 | |
227 my $best_barcode_mismatches_count = $barcodes_length; | |
228 my $best_barcode_ident = undef; | |
229 | |
230 #Try all barcodes, find the one with the lowest mismatch count | |
231 foreach my $barcoderef (@barcodes) { | |
232 my ($ident, $barcode) = @{$barcoderef}; | |
233 | |
234 # Get DNA fragment (in the length of the barcodes) | |
235 # The barcode will be tested only against this fragment | |
236 # (no point in testing the barcode against the whole sequence) | |
237 my $sequence_fragment; | |
238 if ($barcodes_at_bol) { | |
239 $sequence_fragment = substr $seq_bases, 0, $barcodes_length; | |
240 } else { | |
241 $sequence_fragment = substr $seq_bases, - $barcodes_length; | |
242 } | |
243 | |
244 my $mm = mismatch_count($sequence_fragment, $barcode) ; | |
245 | |
246 # if this is a partial match, add the non-overlap as a mismatch | |
247 # (partial barcodes are shorter than the length of the original barcodes) | |
248 $mm += ($barcodes_length - length($barcode)); | |
249 | |
250 if ( $mm < $best_barcode_mismatches_count ) { | |
251 $best_barcode_mismatches_count = $mm ; | |
252 $best_barcode_ident = $ident ; | |
253 } | |
254 } | |
255 | |
256 $best_barcode_ident = 'unmatched' | |
257 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; | |
258 | |
259 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; | |
260 | |
261 $counts{$best_barcode_ident}++; | |
262 | |
263 #get the file associated with the matched barcode. | |
264 #(note: there's also a file associated with 'unmatched' barcode) | |
265 my $file = $files{$best_barcode_ident}; | |
266 | |
267 write_record($file); | |
268 } | |
269 } | |
270 | |
271 #Quickly calculate hamming distance between two strings | |
272 # | |
273 #NOTE: Strings must be same length. | |
274 # returns number of different characters. | |
275 #see http://www.perlmonks.org/?node_id=500235 | |
276 sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) } | |
277 | |
278 | |
279 | |
280 sub print_results | |
281 { | |
282 print "Barcode\tCount\tLocation\n"; | |
283 my $total = 0 ; | |
284 foreach my $ident (sort keys %counts) { | |
285 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; | |
286 $total += $counts{$ident}; | |
287 } | |
288 print "total\t",$total,"\n"; | |
289 } | |
290 | |
291 | |
292 sub read_record | |
293 { | |
294 $seq_name = $input_file_io->getline(); | |
295 | |
296 return undef unless defined $seq_name; # End of file? | |
297 | |
298 $seq_bases = $input_file_io->getline(); | |
299 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; | |
300 | |
301 # If using FASTQ format, read two more lines | |
302 if ($fastq_format) { | |
303 $seq_name2 = $input_file_io->getline(); | |
304 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; | |
305 | |
306 $seq_qualities = $input_file_io->getline(); | |
307 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; | |
308 } | |
309 return 1; | |
310 } | |
311 | |
312 sub write_record($) | |
313 { | |
314 my $file = shift; | |
315 | |
316 croak "Bad file handle" unless defined $file; | |
317 | |
318 print $file $seq_name; | |
319 print $file $seq_bases,"\n"; | |
320 | |
321 #if using FASTQ format, write two more lines | |
322 if ($fastq_format) { | |
323 print $file $seq_name2; | |
324 print $file $seq_qualities; | |
325 } | |
326 } | |
327 | |
328 sub open_and_detect_input_format | |
329 { | |
330 $input_file_io = new IO::Handle; | |
331 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); | |
332 | |
333 # Get the first characeter, and push it back | |
334 my $first_char = $input_file_io->getc(); | |
335 $input_file_io->ungetc(ord $first_char); | |
336 | |
337 if ($first_char eq '>') { | |
338 # FASTA format | |
339 $fastq_format = 0 ; | |
340 print STDERR "Detected FASTA format\n" if $debug; | |
341 } elsif ($first_char eq '@') { | |
342 # FASTQ format | |
343 $fastq_format = 1; | |
344 print STDERR "Detected FASTQ format\n" if $debug; | |
345 } else { | |
346 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; | |
347 } | |
348 } | |
349 | |
350 sub usage() | |
351 { | |
352 print<<EOF; | |
353 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 | |
354 | |
355 This program reads FASTA/FASTQ file and splits it into several smaller files, | |
356 Based on barcode matching. | |
357 FASTA/FASTQ data is read from STDIN (format is auto-detected.) | |
358 Output files will be writen to disk. | |
359 Summary will be printed to STDOUT. | |
360 | |
361 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol] | |
362 [--mismatches N] [--exact] [--partial N] [--help] [--quiet] [--debug] | |
363 | |
364 Arguments: | |
365 | |
366 --bcfile FILE - Barcodes file name. (see explanation below.) | |
367 --prefix PREFIX - File prefix. will be added to the output files. Can be used | |
368 to specify output directories. | |
369 --suffix SUFFIX - File suffix (optional). Can be used to specify file | |
370 extensions. | |
371 --bol - Try to match barcodes at the BEGINNING of sequences. | |
372 (What biologists would call the 5' end, and programmers | |
373 would call index 0.) | |
374 --eol - Try to match barcodes at the END of sequences. | |
375 (What biologists would call the 3' end, and programmers | |
376 would call the end of the string.) | |
377 NOTE: one of --bol, --eol must be specified, but not both. | |
378 --mismatches N - Max. number of mismatches allowed. default is 1. | |
379 --exact - Same as '--mismatches 0'. If both --exact and --mismatches | |
380 are specified, '--exact' takes precedence. | |
381 --partial N - Allow partial overlap of barcodes. (see explanation below.) | |
382 (Default is not partial matching) | |
383 --quiet - Don't print counts and summary at the end of the run. | |
384 (Default is to print.) | |
385 --debug - Print lots of useless debug information to STDERR. | |
386 --help - This helpful help screen. | |
387 | |
388 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is | |
389 the barcodes file): | |
390 | |
391 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ | |
392 --prefix /tmp/bla_ --suffix ".txt" | |
393 | |
394 Barcode file format | |
395 ------------------- | |
396 Barcode files are simple text files. Each line should contain an identifier | |
397 (descriptive name for the barcode), and the barcode itself (A/C/G/T), | |
398 separated by a TAB character. Example: | |
399 | |
400 #This line is a comment (starts with a 'number' sign) | |
401 BC1 GATCT | |
402 BC2 ATCGT | |
403 BC3 GTGAT | |
404 BC4 TGTCT | |
405 | |
406 For each barcode, a new FASTQ file will be created (with the barcode's | |
407 identifier as part of the file name). Sequences matching the barcode | |
408 will be stored in the appropriate file. | |
409 | |
410 Running the above example (assuming "mybarcodes.txt" contains the above | |
411 barcodes), will create the following files: | |
412 /tmp/bla_BC1.txt | |
413 /tmp/bla_BC2.txt | |
414 /tmp/bla_BC3.txt | |
415 /tmp/bla_BC4.txt | |
416 /tmp/bla_unmatched.txt | |
417 The 'unmatched' file will contain all sequences that didn't match any barcode. | |
418 | |
419 Barcode matching | |
420 ---------------- | |
421 | |
422 ** Without partial matching: | |
423 | |
424 Count mismatches between the FASTA/Q sequences and the barcodes. | |
425 The barcode which matched with the lowest mismatches count (providing the | |
426 count is small or equal to '--mismatches N') 'gets' the sequences. | |
427 | |
428 Example (using the above barcodes): | |
429 Input Sequence: | |
430 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
431 | |
432 Matching with '--bol --mismatches 1': | |
433 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
434 GATCT (1 mismatch, BC1) | |
435 ATCGT (4 mismatches, BC2) | |
436 GTGAT (3 mismatches, BC3) | |
437 TGTCT (3 mismatches, BC4) | |
438 | |
439 This sequence will be classified as 'BC1' (it has the lowest mismatch count). | |
440 If '--exact' or '--mismatches 0' were specified, this sequence would be | |
441 classified as 'unmatched' (because, although BC1 had the lowest mismatch count, | |
442 it is above the maximum allowed mismatches). | |
443 | |
444 Matching with '--eol' (end of line) does the same, but from the other side | |
445 of the sequence. | |
446 | |
447 ** With partial matching (very similar to indels): | |
448 | |
449 Same as above, with the following addition: barcodes are also checked for | |
450 partial overlap (number of allowed non-overlapping bases is '--partial N'). | |
451 | |
452 Example: | |
453 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
454 (Same as above, but note the missing 'G' at the beginning.) | |
455 | |
456 Matching (without partial overlapping) against BC1 yields 4 mismatches: | |
457 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
458 GATCT (4 mismatches) | |
459 | |
460 Partial overlapping would also try the following match: | |
461 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
462 GATCT (1 mismatch) | |
463 | |
464 Note: scoring counts a missing base as a mismatch, so the final | |
465 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). | |
466 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this | |
467 seqeunce will be classified as BC1. | |
468 | |
469 EOF | |
470 | |
471 exit 1; | |
472 } |