# HG changeset patch # User Yusuf Ali # Date 1427310560 21600 # Node ID 5e699c743e3792ec2b6cba12eb3e99026baa49ef initial commit diff -r 000000000000 -r 5e699c743e37 AddNamesToBED.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/AddNamesToBED.xml Wed Mar 25 13:09:20 2015 -0600 @@ -0,0 +1,24 @@ + + + + based on overlap with a named regions BED file + echo 1.0.0 + add_names_to_bed $in_unnamed_bed $in_named_bed $out_renamed_bed $out_log + + + + + + + + + + + + +This tool replaces names (4th column) in a BED file, based on overlap of those regions with named regions +provided in another BED file. For example, a list of exome dud regions could be annotated with the gene +names in the dud regions, by running this tool with the UCSC refFlat BED file as the named regions source. + + + diff -r 000000000000 -r 5e699c743e37 add_names_to_bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_names_to_bed Wed Mar 25 13:09:20 2015 -0600 @@ -0,0 +1,70 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Set::IntervalTree; + +# This script adds names to the first BED file regions, based on the names for regions provided +# by the second BED file. The output BED file is the same as the first BED file, but with names added where possible. +@ARGV == 4 or die "Usage: $0 \n"; + +my %names; +open(NAMED_BED, $ARGV[1]) + or die "Cannot open $ARGV[1] for reading: $!\n"; +while(){ + my @F = split /\t/, $_; + next unless @F > 3; + $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; + $names{$F[0]}->insert($F[3], $F[1], $F[2]); +} +close(NAMED_BED); +die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; + +open(INPUT_BED, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +open(OUTPUT_BED, ">$ARGV[2]") + or die "Cannot open $ARGV[2] for writing: $!\n"; +my $num_datalines = 0; +my $num_changes = 0; +while(){ + chomp; + my @F = split /\t/, $_; + if(@F < 3){ + # not a data line + print OUTPUT_BED $_, "\n"; + next; + } + $num_datalines++; + my $chr = $F[0]; + if(not exists $names{$chr}){ # must be a contig that was in the named BED file + if(exists $names{"chr$chr"}){ + $chr = "chr$chr"; + } + elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ + $chr = $1; + } + else{ + next; + } + } + my $renames = $names{$chr}->fetch($F[1], $F[2]+1); + if(not @$renames){ + print OUTPUT_BED $_,"\n"; # no mod + } + else{ + $num_changes++; + my %h; + my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names + print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name + } +} +close(OUTPUT_BED); +close(INPUT_BED); + +if(open(LOG, ">>$ARGV[3]")){ + print LOG "Changed $num_changes/$num_datalines lines\n"; +} +else{ + print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; +} +