Mercurial > repos > yusuf > poor_gene_coverage
view add_names_to_bed @ 0:7cdd13ff182a default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> | 
|---|---|
| date | Wed, 25 Mar 2015 15:49:28 -0600 | 
| parents | |
| children | 
line wrap: on
 line source
#!/usr/bin/env perl use strict; use warnings; use Set::IntervalTree; # This script adds names to the first BED file regions, based on the names for regions provided # by the second BED file. The output BED file is the same as the first BED file, but with names added where possible. @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n"; my %names; open(NAMED_BED, $ARGV[1]) or die "Cannot open $ARGV[1] for reading: $!\n"; while(<NAMED_BED>){ my @F = split /\t/, $_; next unless @F > 3; $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; $names{$F[0]}->insert($F[3], $F[1], $F[2]); } close(NAMED_BED); die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; open(INPUT_BED, $ARGV[0]) or die "Cannot open $ARGV[0] for reading: $!\n"; open(OUTPUT_BED, ">$ARGV[2]") or die "Cannot open $ARGV[2] for writing: $!\n"; my $num_datalines = 0; my $num_changes = 0; while(<INPUT_BED>){ chomp; my @F = split /\t/, $_; if(@F < 3){ # not a data line print OUTPUT_BED $_, "\n"; next; } $num_datalines++; my $chr = $F[0]; if(not exists $names{$chr}){ # must be a contig that was in the named BED file if(exists $names{"chr$chr"}){ $chr = "chr$chr"; } elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ $chr = $1; } else{ next; } } my $renames = $names{$chr}->fetch($F[1], $F[2]+1); if(not @$renames){ print OUTPUT_BED $_,"\n"; # no mod } else{ $num_changes++; my %h; my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name } } close(OUTPUT_BED); close(INPUT_BED); if(open(LOG, ">>$ARGV[3]")){ print LOG "Changed $num_changes/$num_datalines lines\n"; } else{ print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; }
