comparison GeneModel.pm @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc8d8bfeb9a
1 package GeneModel;
2 use strict;
3 use Carp;
4 use Data::Dumper;
5 use Tree::Interval;
6 use Tree::DAG_Node;
7 use English;
8 use Gene;
9
10 sub new {
11 my $class = shift;
12 my $obj = {};
13 $obj->{FMODEL} = {};
14 $obj->{RMODEL} = {};
15 if (@_) {
16 my %arg = @_;
17 $obj->{FMODEL} = $arg{-FMODEL} if($arg{-FMODEL});
18 $obj->{RMODEL} = $arg{-RMODEL} if($arg{-RMODEL});
19 }
20 return bless $obj, $class;
21 }
22
23 sub get_all_chr {
24 my $self = shift;
25 return keys %{$self->{FMODEL}};
26 }
27
28 sub add_gene {
29 my ($self, $gene) = @_;
30 if($gene->strand eq '+') {
31 $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene);
32 }
33 else {
34 $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene);
35 }
36 }
37
38 sub remove_gene {
39 my $self = shift;
40 my $gene = shift;
41 if($gene->strand eq '+') {
42 $self->{FMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene);
43 }
44 else {
45 $self->{RMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene);
46 }
47 }
48
49 sub n_genes {
50 my $self = shift;
51 my $n = 0;
52 foreach my $c (keys %{$self->{FMODEL}}) {
53 $n += $self->{FMODEL}->{$c}->size;
54 }
55 foreach my $c (keys %{$self->{RMODEL}}) {
56 $n += $self->{RMODEL}->{$c}->size;
57 }
58 return $n;
59 }
60
61 sub sub_model {
62 my ($self, $chr, $strand) = @_;
63 if($strand eq '+' ) {
64 return $self->{FMODEL}->{$chr};
65 }
66 else {
67 return $self->{RMODEL}->{$chr};
68 }
69 }
70 sub look_up_gene {
71 my ($self, $chr, $gene) = @_;
72 if($gene->isa('Gene')) {
73 if($gene->strand eq '+') {
74 return $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]);
75 }
76 else {
77 return $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]);
78 }
79 }
80 elsif(ref($gene) eq 'ARRAY') {
81 return (
82 $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]),
83 $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]) );
84 }
85 else {
86 croak "Not implemented look_up_gene for $gene";
87 }
88 }
89
90 sub overlap_genes {
91 my ($self, $chr, $fea, $ort) = @_;
92 my @genes;
93
94 if(ref($fea) eq 'ARRAY') {
95 @genes = $self->_overlap_array($chr, $fea, $ort);
96 }
97 elsif($fea->isa('Transcript')) {
98 foreach my $e (@{$fea->exons}) {
99 push @genes, $self->_overlap_array($chr, $e,
100 $fea->strand ? $fea->strand : undef);
101 }
102 }
103 else {
104 croak "Not implemented overlap for $fea";
105 }
106 my @rtn;
107 foreach my $g (@genes) {
108 push @rtn, $g if($g->overlap($fea));
109 }
110 return @rtn;
111 }
112
113 sub _overlap_array {
114 my($self, $chr, $a, $ort) = @_;
115 if($ort) {
116 return $ort == '+' ? $self->{FMODEL}->{$chr}->intersect($a) :
117 $self->{RMODEL}->{$chr}->intersect($a);
118 }
119 else {
120 return ($self->{FMODEL}->{$chr}->intersect($a), $self->{RMODEL}->{$chr}->intersect($a));
121 }
122 }
123
124 sub model {
125 my ($self, $chr, $strand, $val) = @_;
126 croak "Please specify chr and strand for GeneModel"
127 unless $chr && $strand;
128 my $model = $strand eq '+' ? $self->{FMODEL} : $self->{RMODEL};
129
130 if($val) {
131 croak "The model for each chrom must be a Tree::Interval"
132 unless $val->isa('Tree::Interval');
133 $model->{$chr} = $val;
134 }
135 return $model->{$chr};
136 }
137
138 sub from_file {
139 my ($self, $file, $format) = @_;
140 croak "$format is not implemented yet!" unless uc($format) eq "REFFLAT";
141 print STDERR "Processing gene model file, please wait ... ";
142 my $transcripts = _refFlat2Transcripts($file);
143 my $model = _Transcript2GeneModel($transcripts);
144 foreach my $c (keys %{$model}) {
145 foreach my $s (keys %{$model->{$c}}) {
146 if($s eq '+') {
147 $self->{FMODEL}->{$c} = $model->{$c}{$s};
148 }
149 else {
150 $self->{RMODEL}->{$c} = $model->{$c}{$s};
151 }
152 }
153 }
154 print STDERR "Done.\n";
155 }
156
157 sub _refFlat2Transcripts {
158 my $file = shift;
159 my %transcripts;
160
161 open my $FILE, "<$file" or croak "Can't open $file: $OS_ERROR";
162 while( my $line = <$FILE> ) {
163 next if($line =~ /^#/); #annotation line
164 my ($name, $refseq_id, $chr, $strand, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonCount,
165 $exonStarts, $exonEnds) = split(/\s+/, $line);
166 my $type = $cdsEnd == $cdsStart ? "NR" : "NM";
167 my @starts = split /,/, $exonStarts;
168 my @ends = split /,/, $exonEnds;
169 if(scalar @starts != scalar @ends) {
170 warn "Number of exon starts is not same as exon ends!";
171 next;
172 }
173 $transcripts{$chr} = {} unless (exists $transcripts{$chr});
174 $transcripts{$chr}->{$strand} = [] unless (exists $transcripts{$chr}->{$strand});
175 my @exons;
176 for( my $i = 0; $i < $exonCount; $i++) {
177 push @exons, [$starts[$i], $ends[$i]];
178 }
179
180 my $t = Transcript->new(-START => $starts[0], -END => $ends[$#ends], -CHR => $chr,
181 -STRAND => $strand eq '+' ? 1 : -1, -NAME => $name, -REFSEQ_ID => $refseq_id,
182 -EXONS => \@exons, -TYPE => $type);
183 push @{$transcripts{$chr}->{$strand}}, $t;
184 }
185 #sort the list of transcripts
186 foreach my $c (keys %transcripts) {
187 foreach my $s (keys %{$transcripts{$c}}) {
188 my @tmp = @{$transcripts{$c}->{$s}};
189 @tmp = sort { $a->start <=> $b->start || $a->end <=> $b->end } @tmp;
190 $transcripts{$c}->{$s} = \@tmp;
191 }
192 }
193 return \%transcripts;
194 }
195
196 sub _Transcript2GeneModel {
197 my $transcripts = shift;
198 my %genemodel;
199
200 foreach my $c (keys %{$transcripts}) {
201 $genemodel{$c} = {};
202 foreach my $s (keys %{$transcripts->{$c}}) {
203 my $tree = $genemodel{$c}->{$s} = Tree::Interval->new();
204 foreach my $t (@{$transcripts->{$c}{$s}}) {
205 my @ol = $tree->intersect([$t->start, $t->end]);
206 if(scalar @ol == 0 ) {
207 my $gene = Gene->new();
208 $gene->add_transcript($t);
209 $tree->insert([$t->start, $t->end], $gene);
210 next;
211 }
212 my @real_ol;
213 foreach my $g (@ol) {
214 push @real_ol, $g->val if($g->val->overlap($t));
215 }
216 if(scalar @real_ol == 0) { #gene in gene?
217 # print STDERR "Gene in Gene: ", $t->name, " in ", $ol[0]->val->name, " \n";
218 my $gene = Gene->new();
219 $gene->add_transcript($t);
220 $tree->insert([$t->start, $t->end], $gene);
221 next;
222 }
223 if(scalar @real_ol == 1 ) { # transcript belongs to gene
224 my $g = $real_ol[0];
225 # print STDERR "Same Gene group: ", $t->name, " in ", $real_ol[0]->name, " \n";
226 if($t->start < $g->start || $t->end > $g->end) {
227 $tree->delete([$g->start, $g->end], $g);
228 $g->add_transcript($t);
229 $tree->insert([$g->start, $g->end], $g);
230 next;
231 }
232 $g->add_transcript($t);
233 next;
234 }
235 # a transcript make more genes same
236 # print STDERR "Join Gene group: ", $t->name, " join ", $real_ol[0]->name, "and", $real_ol[1]->name, " \n";
237 my $gene = Gene->new();
238 $gene->add_transcript($t);
239 foreach my $g (@real_ol) {
240 $tree->delete([$g->start, $g->end], $g);
241 foreach my $tp (@{$g->transcripts}) {
242 $gene->add_transcript($tp);
243 }
244 }
245 $tree->insert([$gene->start, $gene->end], $gene);
246 }
247 }
248 }
249 return \%genemodel;
250 }
251
252 sub _print_tree {
253 return;
254 my $tree = shift;
255 my $t = Tree::DAG_Node->lol_to_tree($tree->root->as_lol);
256 local $, = "\n";
257 print @{$t->draw_ascii_tree}, "\n";
258 }
259 1;