changeset 0:5e699c743e37 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:09:20 -0600
parents
children
files AddNamesToBED.xml add_names_to_bed
diffstat 2 files changed, 94 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/AddNamesToBED.xml	Wed Mar 25 13:09:20 2015 -0600
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+
+<tool id="add_names_to_bed_1" name="Add names to a BED file">
+  <description>based on overlap with a named regions BED file</description>
+  <version_string>echo 1.0.0</version_string>
+  <command interpreter="perl">add_names_to_bed $in_unnamed_bed $in_named_bed $out_renamed_bed $out_log</command>
+  <inputs>
+    <param format="bed" name="in_unnamed_bed" type="data" label="BED file to rename" help="BED file without useful names for each region reported"/>
+    <param format="bed" name="in_named_bed" type="data" label="BED file with names to inherit" help="BED file with useful names for each region (4th column)"/>
+  </inputs>
+  <outputs>
+    <data name="out_renamed_bed" format="bed" type="data" label="BED file with renamed regions"/>
+    <data name="out_log" format="text" type="data" label="Renaming stats"/>
+  </outputs>
+
+  <tests/>
+
+  <help>
+This tool replaces names (4th column) in a BED file, based on overlap of those regions with named regions 
+provided in another BED file.  For example, a list of exome dud regions could be annotated with the gene
+names in the dud regions, by running this tool with the UCSC refFlat BED file as the named regions source.
+ </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/add_names_to_bed	Wed Mar 25 13:09:20 2015 -0600
@@ -0,0 +1,70 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Set::IntervalTree;
+
+# This script adds names to the first BED file regions, based on the names for regions provided 
+# by the second BED file.  The output BED file is the same as the first BED file, but with names added where possible.
+@ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";
+
+my %names;
+open(NAMED_BED, $ARGV[1])
+  or die "Cannot open $ARGV[1] for reading: $!\n";
+while(<NAMED_BED>){
+  my @F = split /\t/, $_;
+  next unless @F > 3;
+  $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; 
+  $names{$F[0]}->insert($F[3], $F[1], $F[2]);
+}
+close(NAMED_BED);
+die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; 
+
+open(INPUT_BED, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+open(OUTPUT_BED, ">$ARGV[2]")
+  or die "Cannot open $ARGV[2] for writing: $!\n";
+my $num_datalines = 0;
+my $num_changes = 0;
+while(<INPUT_BED>){
+  chomp;
+  my @F = split /\t/, $_;
+  if(@F < 3){
+    # not a data line
+    print OUTPUT_BED $_, "\n";
+    next;
+  }
+  $num_datalines++;
+  my $chr = $F[0];
+  if(not exists $names{$chr}){ # must be a contig that was in the named BED file
+    if(exists $names{"chr$chr"}){
+      $chr = "chr$chr";
+    }
+    elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
+      $chr = $1;
+    }
+    else{
+      next;
+    }
+  }
+  my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
+  if(not @$renames){
+    print OUTPUT_BED $_,"\n"; # no mod
+  }
+  else{
+    $num_changes++;
+    my %h;
+    my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
+    print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
+  }
+}
+close(OUTPUT_BED);
+close(INPUT_BED);
+
+if(open(LOG, ">>$ARGV[3]")){
+  print LOG "Changed $num_changes/$num_datalines lines\n";
+}
+else{
+  print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
+}
+