Mercurial > repos > jjohnson > crest
comparison GeneModel.pm @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 package GeneModel; | |
2 use strict; | |
3 use Carp; | |
4 use Data::Dumper; | |
5 use Tree::Interval; | |
6 use Tree::DAG_Node; | |
7 use English; | |
8 use Gene; | |
9 | |
10 sub new { | |
11 my $class = shift; | |
12 my $obj = {}; | |
13 $obj->{FMODEL} = {}; | |
14 $obj->{RMODEL} = {}; | |
15 if (@_) { | |
16 my %arg = @_; | |
17 $obj->{FMODEL} = $arg{-FMODEL} if($arg{-FMODEL}); | |
18 $obj->{RMODEL} = $arg{-RMODEL} if($arg{-RMODEL}); | |
19 } | |
20 return bless $obj, $class; | |
21 } | |
22 | |
23 sub get_all_chr { | |
24 my $self = shift; | |
25 return keys %{$self->{FMODEL}}; | |
26 } | |
27 | |
28 sub add_gene { | |
29 my ($self, $gene) = @_; | |
30 if($gene->strand eq '+') { | |
31 $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene); | |
32 } | |
33 else { | |
34 $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene); | |
35 } | |
36 } | |
37 | |
38 sub remove_gene { | |
39 my $self = shift; | |
40 my $gene = shift; | |
41 if($gene->strand eq '+') { | |
42 $self->{FMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene); | |
43 } | |
44 else { | |
45 $self->{RMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene); | |
46 } | |
47 } | |
48 | |
49 sub n_genes { | |
50 my $self = shift; | |
51 my $n = 0; | |
52 foreach my $c (keys %{$self->{FMODEL}}) { | |
53 $n += $self->{FMODEL}->{$c}->size; | |
54 } | |
55 foreach my $c (keys %{$self->{RMODEL}}) { | |
56 $n += $self->{RMODEL}->{$c}->size; | |
57 } | |
58 return $n; | |
59 } | |
60 | |
61 sub sub_model { | |
62 my ($self, $chr, $strand) = @_; | |
63 if($strand eq '+' ) { | |
64 return $self->{FMODEL}->{$chr}; | |
65 } | |
66 else { | |
67 return $self->{RMODEL}->{$chr}; | |
68 } | |
69 } | |
70 sub look_up_gene { | |
71 my ($self, $chr, $gene) = @_; | |
72 if($gene->isa('Gene')) { | |
73 if($gene->strand eq '+') { | |
74 return $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]); | |
75 } | |
76 else { | |
77 return $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]); | |
78 } | |
79 } | |
80 elsif(ref($gene) eq 'ARRAY') { | |
81 return ( | |
82 $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]), | |
83 $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]) ); | |
84 } | |
85 else { | |
86 croak "Not implemented look_up_gene for $gene"; | |
87 } | |
88 } | |
89 | |
90 sub overlap_genes { | |
91 my ($self, $chr, $fea, $ort) = @_; | |
92 my @genes; | |
93 | |
94 if(ref($fea) eq 'ARRAY') { | |
95 @genes = $self->_overlap_array($chr, $fea, $ort); | |
96 } | |
97 elsif($fea->isa('Transcript')) { | |
98 foreach my $e (@{$fea->exons}) { | |
99 push @genes, $self->_overlap_array($chr, $e, | |
100 $fea->strand ? $fea->strand : undef); | |
101 } | |
102 } | |
103 else { | |
104 croak "Not implemented overlap for $fea"; | |
105 } | |
106 my @rtn; | |
107 foreach my $g (@genes) { | |
108 push @rtn, $g if($g->overlap($fea)); | |
109 } | |
110 return @rtn; | |
111 } | |
112 | |
113 sub _overlap_array { | |
114 my($self, $chr, $a, $ort) = @_; | |
115 if($ort) { | |
116 return $ort == '+' ? $self->{FMODEL}->{$chr}->intersect($a) : | |
117 $self->{RMODEL}->{$chr}->intersect($a); | |
118 } | |
119 else { | |
120 return ($self->{FMODEL}->{$chr}->intersect($a), $self->{RMODEL}->{$chr}->intersect($a)); | |
121 } | |
122 } | |
123 | |
124 sub model { | |
125 my ($self, $chr, $strand, $val) = @_; | |
126 croak "Please specify chr and strand for GeneModel" | |
127 unless $chr && $strand; | |
128 my $model = $strand eq '+' ? $self->{FMODEL} : $self->{RMODEL}; | |
129 | |
130 if($val) { | |
131 croak "The model for each chrom must be a Tree::Interval" | |
132 unless $val->isa('Tree::Interval'); | |
133 $model->{$chr} = $val; | |
134 } | |
135 return $model->{$chr}; | |
136 } | |
137 | |
138 sub from_file { | |
139 my ($self, $file, $format) = @_; | |
140 croak "$format is not implemented yet!" unless uc($format) eq "REFFLAT"; | |
141 print STDERR "Processing gene model file, please wait ... "; | |
142 my $transcripts = _refFlat2Transcripts($file); | |
143 my $model = _Transcript2GeneModel($transcripts); | |
144 foreach my $c (keys %{$model}) { | |
145 foreach my $s (keys %{$model->{$c}}) { | |
146 if($s eq '+') { | |
147 $self->{FMODEL}->{$c} = $model->{$c}{$s}; | |
148 } | |
149 else { | |
150 $self->{RMODEL}->{$c} = $model->{$c}{$s}; | |
151 } | |
152 } | |
153 } | |
154 print STDERR "Done.\n"; | |
155 } | |
156 | |
157 sub _refFlat2Transcripts { | |
158 my $file = shift; | |
159 my %transcripts; | |
160 | |
161 open my $FILE, "<$file" or croak "Can't open $file: $OS_ERROR"; | |
162 while( my $line = <$FILE> ) { | |
163 next if($line =~ /^#/); #annotation line | |
164 my ($name, $refseq_id, $chr, $strand, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonCount, | |
165 $exonStarts, $exonEnds) = split(/\s+/, $line); | |
166 my $type = $cdsEnd == $cdsStart ? "NR" : "NM"; | |
167 my @starts = split /,/, $exonStarts; | |
168 my @ends = split /,/, $exonEnds; | |
169 if(scalar @starts != scalar @ends) { | |
170 warn "Number of exon starts is not same as exon ends!"; | |
171 next; | |
172 } | |
173 $transcripts{$chr} = {} unless (exists $transcripts{$chr}); | |
174 $transcripts{$chr}->{$strand} = [] unless (exists $transcripts{$chr}->{$strand}); | |
175 my @exons; | |
176 for( my $i = 0; $i < $exonCount; $i++) { | |
177 push @exons, [$starts[$i], $ends[$i]]; | |
178 } | |
179 | |
180 my $t = Transcript->new(-START => $starts[0], -END => $ends[$#ends], -CHR => $chr, | |
181 -STRAND => $strand eq '+' ? 1 : -1, -NAME => $name, -REFSEQ_ID => $refseq_id, | |
182 -EXONS => \@exons, -TYPE => $type); | |
183 push @{$transcripts{$chr}->{$strand}}, $t; | |
184 } | |
185 #sort the list of transcripts | |
186 foreach my $c (keys %transcripts) { | |
187 foreach my $s (keys %{$transcripts{$c}}) { | |
188 my @tmp = @{$transcripts{$c}->{$s}}; | |
189 @tmp = sort { $a->start <=> $b->start || $a->end <=> $b->end } @tmp; | |
190 $transcripts{$c}->{$s} = \@tmp; | |
191 } | |
192 } | |
193 return \%transcripts; | |
194 } | |
195 | |
196 sub _Transcript2GeneModel { | |
197 my $transcripts = shift; | |
198 my %genemodel; | |
199 | |
200 foreach my $c (keys %{$transcripts}) { | |
201 $genemodel{$c} = {}; | |
202 foreach my $s (keys %{$transcripts->{$c}}) { | |
203 my $tree = $genemodel{$c}->{$s} = Tree::Interval->new(); | |
204 foreach my $t (@{$transcripts->{$c}{$s}}) { | |
205 my @ol = $tree->intersect([$t->start, $t->end]); | |
206 if(scalar @ol == 0 ) { | |
207 my $gene = Gene->new(); | |
208 $gene->add_transcript($t); | |
209 $tree->insert([$t->start, $t->end], $gene); | |
210 next; | |
211 } | |
212 my @real_ol; | |
213 foreach my $g (@ol) { | |
214 push @real_ol, $g->val if($g->val->overlap($t)); | |
215 } | |
216 if(scalar @real_ol == 0) { #gene in gene? | |
217 # print STDERR "Gene in Gene: ", $t->name, " in ", $ol[0]->val->name, " \n"; | |
218 my $gene = Gene->new(); | |
219 $gene->add_transcript($t); | |
220 $tree->insert([$t->start, $t->end], $gene); | |
221 next; | |
222 } | |
223 if(scalar @real_ol == 1 ) { # transcript belongs to gene | |
224 my $g = $real_ol[0]; | |
225 # print STDERR "Same Gene group: ", $t->name, " in ", $real_ol[0]->name, " \n"; | |
226 if($t->start < $g->start || $t->end > $g->end) { | |
227 $tree->delete([$g->start, $g->end], $g); | |
228 $g->add_transcript($t); | |
229 $tree->insert([$g->start, $g->end], $g); | |
230 next; | |
231 } | |
232 $g->add_transcript($t); | |
233 next; | |
234 } | |
235 # a transcript make more genes same | |
236 # print STDERR "Join Gene group: ", $t->name, " join ", $real_ol[0]->name, "and", $real_ol[1]->name, " \n"; | |
237 my $gene = Gene->new(); | |
238 $gene->add_transcript($t); | |
239 foreach my $g (@real_ol) { | |
240 $tree->delete([$g->start, $g->end], $g); | |
241 foreach my $tp (@{$g->transcripts}) { | |
242 $gene->add_transcript($tp); | |
243 } | |
244 } | |
245 $tree->insert([$gene->start, $gene->end], $gene); | |
246 } | |
247 } | |
248 } | |
249 return \%genemodel; | |
250 } | |
251 | |
252 sub _print_tree { | |
253 return; | |
254 my $tree = shift; | |
255 my $t = Tree::DAG_Node->lol_to_tree($tree->root->as_lol); | |
256 local $, = "\n"; | |
257 print @{$t->draw_ascii_tree}, "\n"; | |
258 } | |
259 1; |