annotate filter_bam_by_list @ 0:42af0b971c55 default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:33:46 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use IO::Handle;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 # Report lines of a file that have as one of the column values a value from the pattern file
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 @ARGV == 6 or die "Usage: $0 <input.bam> <named genomic regions.bed> <file of patterns> <filtered output.bam> <matched regions.bed> <output messages.txt>\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 die "Input BAM file $ARGV[0] does not exist\n" if not -e $ARGV[0];
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 die "Input BAM file $ARGV[0] is not readable\n" if not -r $ARGV[0];
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 my @alts;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 if(-e $ARGV[2]){
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 open(PATTERNS, $ARGV[2])
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 or die "Cannot open $ARGV[1] for reading: $!\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 while(<PATTERNS>){
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 chomp;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 push @alts, $_;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 }
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 close(PATTERNS);
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 }
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 else{ # else assume the arg is a list of names directly
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 @alts = split /\s+/, $ARGV[2];
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 }
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 my @regions;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my %seen;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 my $regex = "^(?:".join("|", @alts).")\$";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 open(TAB, $ARGV[1])
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 or die "Cannot open $ARGV[1] for reading: $!\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 while(<TAB>){
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 chomp;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 my @F = split /\t/, $_;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 next unless @F > 3;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 if($F[3] =~ /$regex/io){
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 next if $seen{"$F[0]:$F[1]-$F[2]"}++; # sometimes regions are repeated, don't send these repeats to samtools
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 push @regions, $_;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 }
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 }
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 close(TAB);
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 die "No matches to desired names in the provided named genomic regions BED file, aborting filtered BAM file creation" if not @regions;
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 open(BED, ">$ARGV[4]")
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 or die "Cannot open $ARGV[4] for writing: $!\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 print BED join("\n", @regions), "\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 close(BED);
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 open(ERRFILE, ">$ARGV[5]") or die "Cannot open $ARGV[5] for writing: $!\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 STDOUT->fdopen(\*ERRFILE, "w") or die "Cannot redirect stdout to $ARGV[5]: $!\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 STDERR->fdopen(\*ERRFILE, "w") or die "Cannot redirect stderr to $ARGV[5]: $!\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 system("samtools view -h -L $ARGV[4] $ARGV[0] | samtools view -S -b - > $ARGV[3]") >> 8
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 and die "Samtools failed: exit status ", ($?>>8), "\n";
42af0b971c55 intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 system("samtools index $ARGV[3]");