Mercurial > repos > yusuf > poor_gene_coverage
comparison add_names_to_bed @ 0:7cdd13ff182a default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 15:49:28 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7cdd13ff182a |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Set::IntervalTree; | |
6 | |
7 # This script adds names to the first BED file regions, based on the names for regions provided | |
8 # by the second BED file. The output BED file is the same as the first BED file, but with names added where possible. | |
9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n"; | |
10 | |
11 my %names; | |
12 open(NAMED_BED, $ARGV[1]) | |
13 or die "Cannot open $ARGV[1] for reading: $!\n"; | |
14 while(<NAMED_BED>){ | |
15 my @F = split /\t/, $_; | |
16 next unless @F > 3; | |
17 $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; | |
18 $names{$F[0]}->insert($F[3], $F[1], $F[2]); | |
19 } | |
20 close(NAMED_BED); | |
21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; | |
22 | |
23 open(INPUT_BED, $ARGV[0]) | |
24 or die "Cannot open $ARGV[0] for reading: $!\n"; | |
25 open(OUTPUT_BED, ">$ARGV[2]") | |
26 or die "Cannot open $ARGV[2] for writing: $!\n"; | |
27 my $num_datalines = 0; | |
28 my $num_changes = 0; | |
29 while(<INPUT_BED>){ | |
30 chomp; | |
31 my @F = split /\t/, $_; | |
32 if(@F < 3){ | |
33 # not a data line | |
34 print OUTPUT_BED $_, "\n"; | |
35 next; | |
36 } | |
37 $num_datalines++; | |
38 my $chr = $F[0]; | |
39 if(not exists $names{$chr}){ # must be a contig that was in the named BED file | |
40 if(exists $names{"chr$chr"}){ | |
41 $chr = "chr$chr"; | |
42 } | |
43 elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ | |
44 $chr = $1; | |
45 } | |
46 else{ | |
47 next; | |
48 } | |
49 } | |
50 my $renames = $names{$chr}->fetch($F[1], $F[2]+1); | |
51 if(not @$renames){ | |
52 print OUTPUT_BED $_,"\n"; # no mod | |
53 } | |
54 else{ | |
55 $num_changes++; | |
56 my %h; | |
57 my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names | |
58 print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name | |
59 } | |
60 } | |
61 close(OUTPUT_BED); | |
62 close(INPUT_BED); | |
63 | |
64 if(open(LOG, ">>$ARGV[3]")){ | |
65 print LOG "Changed $num_changes/$num_datalines lines\n"; | |
66 } | |
67 else{ | |
68 print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; | |
69 } | |
70 |