annotate add_names_to_bed @ 0:5e699c743e37 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:09:20 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use Set::IntervalTree;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 # This script adds names to the first BED file regions, based on the names for regions provided
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 # by the second BED file. The output BED file is the same as the first BED file, but with names added where possible.
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my %names;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 open(NAMED_BED, $ARGV[1])
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 or die "Cannot open $ARGV[1] for reading: $!\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 while(<NAMED_BED>){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 my @F = split /\t/, $_;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 next unless @F > 3;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]};
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 $names{$F[0]}->insert($F[3], $F[1], $F[2]);
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 close(NAMED_BED);
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 open(INPUT_BED, $ARGV[0])
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 or die "Cannot open $ARGV[0] for reading: $!\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 open(OUTPUT_BED, ">$ARGV[2]")
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 or die "Cannot open $ARGV[2] for writing: $!\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 my $num_datalines = 0;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my $num_changes = 0;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 while(<INPUT_BED>){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 chomp;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 my @F = split /\t/, $_;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 if(@F < 3){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 # not a data line
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 print OUTPUT_BED $_, "\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 next;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 $num_datalines++;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 my $chr = $F[0];
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 if(not exists $names{$chr}){ # must be a contig that was in the named BED file
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 if(exists $names{"chr$chr"}){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 $chr = "chr$chr";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 $chr = $1;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 else{
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 next;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 if(not @$renames){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 print OUTPUT_BED $_,"\n"; # no mod
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 else{
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 $num_changes++;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 my %h;
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 close(OUTPUT_BED);
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 close(INPUT_BED);
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 if(open(LOG, ">>$ARGV[3]")){
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 print LOG "Changed $num_changes/$num_datalines lines\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 else{
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 }
5e699c743e37 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70