Mercurial > repos > yusuf > add_names_bed
changeset 0:5e699c743e37 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:09:20 -0600 |
parents | |
children | |
files | AddNamesToBED.xml add_names_to_bed |
diffstat | 2 files changed, 94 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/AddNamesToBED.xml Wed Mar 25 13:09:20 2015 -0600 @@ -0,0 +1,24 @@ +<?xml version="1.0"?> + +<tool id="add_names_to_bed_1" name="Add names to a BED file"> + <description>based on overlap with a named regions BED file</description> + <version_string>echo 1.0.0</version_string> + <command interpreter="perl">add_names_to_bed $in_unnamed_bed $in_named_bed $out_renamed_bed $out_log</command> + <inputs> + <param format="bed" name="in_unnamed_bed" type="data" label="BED file to rename" help="BED file without useful names for each region reported"/> + <param format="bed" name="in_named_bed" type="data" label="BED file with names to inherit" help="BED file with useful names for each region (4th column)"/> + </inputs> + <outputs> + <data name="out_renamed_bed" format="bed" type="data" label="BED file with renamed regions"/> + <data name="out_log" format="text" type="data" label="Renaming stats"/> + </outputs> + + <tests/> + + <help> +This tool replaces names (4th column) in a BED file, based on overlap of those regions with named regions +provided in another BED file. For example, a list of exome dud regions could be annotated with the gene +names in the dud regions, by running this tool with the UCSC refFlat BED file as the named regions source. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_names_to_bed Wed Mar 25 13:09:20 2015 -0600 @@ -0,0 +1,70 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Set::IntervalTree; + +# This script adds names to the first BED file regions, based on the names for regions provided +# by the second BED file. The output BED file is the same as the first BED file, but with names added where possible. +@ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n"; + +my %names; +open(NAMED_BED, $ARGV[1]) + or die "Cannot open $ARGV[1] for reading: $!\n"; +while(<NAMED_BED>){ + my @F = split /\t/, $_; + next unless @F > 3; + $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; + $names{$F[0]}->insert($F[3], $F[1], $F[2]); +} +close(NAMED_BED); +die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; + +open(INPUT_BED, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +open(OUTPUT_BED, ">$ARGV[2]") + or die "Cannot open $ARGV[2] for writing: $!\n"; +my $num_datalines = 0; +my $num_changes = 0; +while(<INPUT_BED>){ + chomp; + my @F = split /\t/, $_; + if(@F < 3){ + # not a data line + print OUTPUT_BED $_, "\n"; + next; + } + $num_datalines++; + my $chr = $F[0]; + if(not exists $names{$chr}){ # must be a contig that was in the named BED file + if(exists $names{"chr$chr"}){ + $chr = "chr$chr"; + } + elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ + $chr = $1; + } + else{ + next; + } + } + my $renames = $names{$chr}->fetch($F[1], $F[2]+1); + if(not @$renames){ + print OUTPUT_BED $_,"\n"; # no mod + } + else{ + $num_changes++; + my %h; + my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names + print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name + } +} +close(OUTPUT_BED); +close(INPUT_BED); + +if(open(LOG, ">>$ARGV[3]")){ + print LOG "Changed $num_changes/$num_datalines lines\n"; +} +else{ + print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; +} +