0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use Set::IntervalTree;
|
|
6
|
|
7 # This script adds names to the first BED file regions, based on the names for regions provided
|
|
8 # by the second BED file. The output BED file is the same as the first BED file, but with names added where possible.
|
|
9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";
|
|
10
|
|
11 my %names;
|
|
12 open(NAMED_BED, $ARGV[1])
|
|
13 or die "Cannot open $ARGV[1] for reading: $!\n";
|
|
14 while(<NAMED_BED>){
|
|
15 my @F = split /\t/, $_;
|
|
16 next unless @F > 3;
|
|
17 $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]};
|
|
18 $names{$F[0]}->insert($F[3], $F[1], $F[2]);
|
|
19 }
|
|
20 close(NAMED_BED);
|
|
21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names;
|
|
22
|
|
23 open(INPUT_BED, $ARGV[0])
|
|
24 or die "Cannot open $ARGV[0] for reading: $!\n";
|
|
25 open(OUTPUT_BED, ">$ARGV[2]")
|
|
26 or die "Cannot open $ARGV[2] for writing: $!\n";
|
|
27 my $num_datalines = 0;
|
|
28 my $num_changes = 0;
|
|
29 while(<INPUT_BED>){
|
|
30 chomp;
|
|
31 my @F = split /\t/, $_;
|
|
32 if(@F < 3){
|
|
33 # not a data line
|
|
34 print OUTPUT_BED $_, "\n";
|
|
35 next;
|
|
36 }
|
|
37 $num_datalines++;
|
|
38 my $chr = $F[0];
|
|
39 if(not exists $names{$chr}){ # must be a contig that was in the named BED file
|
|
40 if(exists $names{"chr$chr"}){
|
|
41 $chr = "chr$chr";
|
|
42 }
|
|
43 elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
|
|
44 $chr = $1;
|
|
45 }
|
|
46 else{
|
|
47 next;
|
|
48 }
|
|
49 }
|
|
50 my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
|
|
51 if(not @$renames){
|
|
52 print OUTPUT_BED $_,"\n"; # no mod
|
|
53 }
|
|
54 else{
|
|
55 $num_changes++;
|
|
56 my %h;
|
|
57 my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
|
|
58 print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
|
|
59 }
|
|
60 }
|
|
61 close(OUTPUT_BED);
|
|
62 close(INPUT_BED);
|
|
63
|
|
64 if(open(LOG, ">>$ARGV[3]")){
|
|
65 print LOG "Changed $num_changes/$num_datalines lines\n";
|
|
66 }
|
|
67 else{
|
|
68 print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
|
|
69 }
|
|
70
|