0
|
1 package GeneModel;
|
|
2 use strict;
|
|
3 use Carp;
|
|
4 use Data::Dumper;
|
|
5 use Tree::Interval;
|
|
6 use Tree::DAG_Node;
|
|
7 use English;
|
|
8 use Gene;
|
|
9
|
|
10 sub new {
|
|
11 my $class = shift;
|
|
12 my $obj = {};
|
|
13 $obj->{FMODEL} = {};
|
|
14 $obj->{RMODEL} = {};
|
|
15 if (@_) {
|
|
16 my %arg = @_;
|
|
17 $obj->{FMODEL} = $arg{-FMODEL} if($arg{-FMODEL});
|
|
18 $obj->{RMODEL} = $arg{-RMODEL} if($arg{-RMODEL});
|
|
19 }
|
|
20 return bless $obj, $class;
|
|
21 }
|
|
22
|
|
23 sub get_all_chr {
|
|
24 my $self = shift;
|
|
25 return keys %{$self->{FMODEL}};
|
|
26 }
|
|
27
|
|
28 sub add_gene {
|
|
29 my ($self, $gene) = @_;
|
|
30 if($gene->strand eq '+') {
|
|
31 $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene);
|
|
32 }
|
|
33 else {
|
|
34 $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene);
|
|
35 }
|
|
36 }
|
|
37
|
|
38 sub remove_gene {
|
|
39 my $self = shift;
|
|
40 my $gene = shift;
|
|
41 if($gene->strand eq '+') {
|
|
42 $self->{FMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene);
|
|
43 }
|
|
44 else {
|
|
45 $self->{RMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene);
|
|
46 }
|
|
47 }
|
|
48
|
|
49 sub n_genes {
|
|
50 my $self = shift;
|
|
51 my $n = 0;
|
|
52 foreach my $c (keys %{$self->{FMODEL}}) {
|
|
53 $n += $self->{FMODEL}->{$c}->size;
|
|
54 }
|
|
55 foreach my $c (keys %{$self->{RMODEL}}) {
|
|
56 $n += $self->{RMODEL}->{$c}->size;
|
|
57 }
|
|
58 return $n;
|
|
59 }
|
|
60
|
|
61 sub sub_model {
|
|
62 my ($self, $chr, $strand) = @_;
|
|
63 if($strand eq '+' ) {
|
|
64 return $self->{FMODEL}->{$chr};
|
|
65 }
|
|
66 else {
|
|
67 return $self->{RMODEL}->{$chr};
|
|
68 }
|
|
69 }
|
|
70 sub look_up_gene {
|
|
71 my ($self, $chr, $gene) = @_;
|
|
72 if($gene->isa('Gene')) {
|
|
73 if($gene->strand eq '+') {
|
|
74 return $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]);
|
|
75 }
|
|
76 else {
|
|
77 return $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]);
|
|
78 }
|
|
79 }
|
|
80 elsif(ref($gene) eq 'ARRAY') {
|
|
81 return (
|
|
82 $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]),
|
|
83 $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]) );
|
|
84 }
|
|
85 else {
|
|
86 croak "Not implemented look_up_gene for $gene";
|
|
87 }
|
|
88 }
|
|
89
|
|
90 sub overlap_genes {
|
|
91 my ($self, $chr, $fea, $ort) = @_;
|
|
92 my @genes;
|
|
93
|
|
94 if(ref($fea) eq 'ARRAY') {
|
|
95 @genes = $self->_overlap_array($chr, $fea, $ort);
|
|
96 }
|
|
97 elsif($fea->isa('Transcript')) {
|
|
98 foreach my $e (@{$fea->exons}) {
|
|
99 push @genes, $self->_overlap_array($chr, $e,
|
|
100 $fea->strand ? $fea->strand : undef);
|
|
101 }
|
|
102 }
|
|
103 else {
|
|
104 croak "Not implemented overlap for $fea";
|
|
105 }
|
|
106 my @rtn;
|
|
107 foreach my $g (@genes) {
|
|
108 push @rtn, $g if($g->overlap($fea));
|
|
109 }
|
|
110 return @rtn;
|
|
111 }
|
|
112
|
|
113 sub _overlap_array {
|
|
114 my($self, $chr, $a, $ort) = @_;
|
|
115 if($ort) {
|
|
116 return $ort == '+' ? $self->{FMODEL}->{$chr}->intersect($a) :
|
|
117 $self->{RMODEL}->{$chr}->intersect($a);
|
|
118 }
|
|
119 else {
|
|
120 return ($self->{FMODEL}->{$chr}->intersect($a), $self->{RMODEL}->{$chr}->intersect($a));
|
|
121 }
|
|
122 }
|
|
123
|
|
124 sub model {
|
|
125 my ($self, $chr, $strand, $val) = @_;
|
|
126 croak "Please specify chr and strand for GeneModel"
|
|
127 unless $chr && $strand;
|
|
128 my $model = $strand eq '+' ? $self->{FMODEL} : $self->{RMODEL};
|
|
129
|
|
130 if($val) {
|
|
131 croak "The model for each chrom must be a Tree::Interval"
|
|
132 unless $val->isa('Tree::Interval');
|
|
133 $model->{$chr} = $val;
|
|
134 }
|
|
135 return $model->{$chr};
|
|
136 }
|
|
137
|
|
138 sub from_file {
|
|
139 my ($self, $file, $format) = @_;
|
|
140 croak "$format is not implemented yet!" unless uc($format) eq "REFFLAT";
|
|
141 print STDERR "Processing gene model file, please wait ... ";
|
|
142 my $transcripts = _refFlat2Transcripts($file);
|
|
143 my $model = _Transcript2GeneModel($transcripts);
|
|
144 foreach my $c (keys %{$model}) {
|
|
145 foreach my $s (keys %{$model->{$c}}) {
|
|
146 if($s eq '+') {
|
|
147 $self->{FMODEL}->{$c} = $model->{$c}{$s};
|
|
148 }
|
|
149 else {
|
|
150 $self->{RMODEL}->{$c} = $model->{$c}{$s};
|
|
151 }
|
|
152 }
|
|
153 }
|
|
154 print STDERR "Done.\n";
|
|
155 }
|
|
156
|
|
157 sub _refFlat2Transcripts {
|
|
158 my $file = shift;
|
|
159 my %transcripts;
|
|
160
|
|
161 open my $FILE, "<$file" or croak "Can't open $file: $OS_ERROR";
|
|
162 while( my $line = <$FILE> ) {
|
|
163 next if($line =~ /^#/); #annotation line
|
|
164 my ($name, $refseq_id, $chr, $strand, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonCount,
|
|
165 $exonStarts, $exonEnds) = split(/\s+/, $line);
|
|
166 my $type = $cdsEnd == $cdsStart ? "NR" : "NM";
|
|
167 my @starts = split /,/, $exonStarts;
|
|
168 my @ends = split /,/, $exonEnds;
|
|
169 if(scalar @starts != scalar @ends) {
|
|
170 warn "Number of exon starts is not same as exon ends!";
|
|
171 next;
|
|
172 }
|
|
173 $transcripts{$chr} = {} unless (exists $transcripts{$chr});
|
|
174 $transcripts{$chr}->{$strand} = [] unless (exists $transcripts{$chr}->{$strand});
|
|
175 my @exons;
|
|
176 for( my $i = 0; $i < $exonCount; $i++) {
|
|
177 push @exons, [$starts[$i], $ends[$i]];
|
|
178 }
|
|
179
|
|
180 my $t = Transcript->new(-START => $starts[0], -END => $ends[$#ends], -CHR => $chr,
|
|
181 -STRAND => $strand eq '+' ? 1 : -1, -NAME => $name, -REFSEQ_ID => $refseq_id,
|
|
182 -EXONS => \@exons, -TYPE => $type);
|
|
183 push @{$transcripts{$chr}->{$strand}}, $t;
|
|
184 }
|
|
185 #sort the list of transcripts
|
|
186 foreach my $c (keys %transcripts) {
|
|
187 foreach my $s (keys %{$transcripts{$c}}) {
|
|
188 my @tmp = @{$transcripts{$c}->{$s}};
|
|
189 @tmp = sort { $a->start <=> $b->start || $a->end <=> $b->end } @tmp;
|
|
190 $transcripts{$c}->{$s} = \@tmp;
|
|
191 }
|
|
192 }
|
|
193 return \%transcripts;
|
|
194 }
|
|
195
|
|
196 sub _Transcript2GeneModel {
|
|
197 my $transcripts = shift;
|
|
198 my %genemodel;
|
|
199
|
|
200 foreach my $c (keys %{$transcripts}) {
|
|
201 $genemodel{$c} = {};
|
|
202 foreach my $s (keys %{$transcripts->{$c}}) {
|
|
203 my $tree = $genemodel{$c}->{$s} = Tree::Interval->new();
|
|
204 foreach my $t (@{$transcripts->{$c}{$s}}) {
|
|
205 my @ol = $tree->intersect([$t->start, $t->end]);
|
|
206 if(scalar @ol == 0 ) {
|
|
207 my $gene = Gene->new();
|
|
208 $gene->add_transcript($t);
|
|
209 $tree->insert([$t->start, $t->end], $gene);
|
|
210 next;
|
|
211 }
|
|
212 my @real_ol;
|
|
213 foreach my $g (@ol) {
|
|
214 push @real_ol, $g->val if($g->val->overlap($t));
|
|
215 }
|
|
216 if(scalar @real_ol == 0) { #gene in gene?
|
|
217 # print STDERR "Gene in Gene: ", $t->name, " in ", $ol[0]->val->name, " \n";
|
|
218 my $gene = Gene->new();
|
|
219 $gene->add_transcript($t);
|
|
220 $tree->insert([$t->start, $t->end], $gene);
|
|
221 next;
|
|
222 }
|
|
223 if(scalar @real_ol == 1 ) { # transcript belongs to gene
|
|
224 my $g = $real_ol[0];
|
|
225 # print STDERR "Same Gene group: ", $t->name, " in ", $real_ol[0]->name, " \n";
|
|
226 if($t->start < $g->start || $t->end > $g->end) {
|
|
227 $tree->delete([$g->start, $g->end], $g);
|
|
228 $g->add_transcript($t);
|
|
229 $tree->insert([$g->start, $g->end], $g);
|
|
230 next;
|
|
231 }
|
|
232 $g->add_transcript($t);
|
|
233 next;
|
|
234 }
|
|
235 # a transcript make more genes same
|
|
236 # print STDERR "Join Gene group: ", $t->name, " join ", $real_ol[0]->name, "and", $real_ol[1]->name, " \n";
|
|
237 my $gene = Gene->new();
|
|
238 $gene->add_transcript($t);
|
|
239 foreach my $g (@real_ol) {
|
|
240 $tree->delete([$g->start, $g->end], $g);
|
|
241 foreach my $tp (@{$g->transcripts}) {
|
|
242 $gene->add_transcript($tp);
|
|
243 }
|
|
244 }
|
|
245 $tree->insert([$gene->start, $gene->end], $gene);
|
|
246 }
|
|
247 }
|
|
248 }
|
|
249 return \%genemodel;
|
|
250 }
|
|
251
|
|
252 sub _print_tree {
|
|
253 return;
|
|
254 my $tree = shift;
|
|
255 my $t = Tree::DAG_Node->lol_to_tree($tree->root->as_lol);
|
|
256 local $, = "\n";
|
|
257 print @{$t->draw_ascii_tree}, "\n";
|
|
258 }
|
|
259 1;
|