annotate add_names_to_bed @ 0:7cdd13ff182a default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 15:49:28 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use Set::IntervalTree;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 # This script adds names to the first BED file regions, based on the names for regions provided
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 # by the second BED file. The output BED file is the same as the first BED file, but with names added where possible.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my %names;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 open(NAMED_BED, $ARGV[1])
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 or die "Cannot open $ARGV[1] for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 while(<NAMED_BED>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 next unless @F > 3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 $names{$F[0]}->insert($F[3], $F[1], $F[2]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 close(NAMED_BED);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 open(INPUT_BED, $ARGV[0])
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 or die "Cannot open $ARGV[0] for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 open(OUTPUT_BED, ">$ARGV[2]")
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 or die "Cannot open $ARGV[2] for writing: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 my $num_datalines = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my $num_changes = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 while(<INPUT_BED>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 if(@F < 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 # not a data line
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 print OUTPUT_BED $_, "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 $num_datalines++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 my $chr = $F[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 if(not exists $names{$chr}){ # must be a contig that was in the named BED file
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 if(exists $names{"chr$chr"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 $chr = "chr$chr";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 $chr = $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 if(not @$renames){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 print OUTPUT_BED $_,"\n"; # no mod
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 $num_changes++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 my %h;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 close(OUTPUT_BED);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 close(INPUT_BED);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 if(open(LOG, ">>$ARGV[3]")){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 print LOG "Changed $num_changes/$num_datalines lines\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70