| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 use Set::IntervalTree; | 
|  | 6 | 
|  | 7 # This script adds names to the first BED file regions, based on the names for regions provided | 
|  | 8 # by the second BED file.  The output BED file is the same as the first BED file, but with names added where possible. | 
|  | 9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n"; | 
|  | 10 | 
|  | 11 my %names; | 
|  | 12 open(NAMED_BED, $ARGV[1]) | 
|  | 13   or die "Cannot open $ARGV[1] for reading: $!\n"; | 
|  | 14 while(<NAMED_BED>){ | 
|  | 15   my @F = split /\t/, $_; | 
|  | 16   next unless @F > 3; | 
|  | 17   $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; | 
|  | 18   $names{$F[0]}->insert($F[3], $F[1], $F[2]); | 
|  | 19 } | 
|  | 20 close(NAMED_BED); | 
|  | 21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; | 
|  | 22 | 
|  | 23 open(INPUT_BED, $ARGV[0]) | 
|  | 24   or die "Cannot open $ARGV[0] for reading: $!\n"; | 
|  | 25 open(OUTPUT_BED, ">$ARGV[2]") | 
|  | 26   or die "Cannot open $ARGV[2] for writing: $!\n"; | 
|  | 27 my $num_datalines = 0; | 
|  | 28 my $num_changes = 0; | 
|  | 29 while(<INPUT_BED>){ | 
|  | 30   chomp; | 
|  | 31   my @F = split /\t/, $_; | 
|  | 32   if(@F < 3){ | 
|  | 33     # not a data line | 
|  | 34     print OUTPUT_BED $_, "\n"; | 
|  | 35     next; | 
|  | 36   } | 
|  | 37   $num_datalines++; | 
|  | 38   my $chr = $F[0]; | 
|  | 39   if(not exists $names{$chr}){ # must be a contig that was in the named BED file | 
|  | 40     if(exists $names{"chr$chr"}){ | 
|  | 41       $chr = "chr$chr"; | 
|  | 42     } | 
|  | 43     elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ | 
|  | 44       $chr = $1; | 
|  | 45     } | 
|  | 46     else{ | 
|  | 47       next; | 
|  | 48     } | 
|  | 49   } | 
|  | 50   my $renames = $names{$chr}->fetch($F[1], $F[2]+1); | 
|  | 51   if(not @$renames){ | 
|  | 52     print OUTPUT_BED $_,"\n"; # no mod | 
|  | 53   } | 
|  | 54   else{ | 
|  | 55     $num_changes++; | 
|  | 56     my %h; | 
|  | 57     my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names | 
|  | 58     print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name | 
|  | 59   } | 
|  | 60 } | 
|  | 61 close(OUTPUT_BED); | 
|  | 62 close(INPUT_BED); | 
|  | 63 | 
|  | 64 if(open(LOG, ">>$ARGV[3]")){ | 
|  | 65   print LOG "Changed $num_changes/$num_datalines lines\n"; | 
|  | 66 } | 
|  | 67 else{ | 
|  | 68   print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; | 
|  | 69 } | 
|  | 70 |