Mercurial > repos > jjohnson > crest
diff GeneModel.pm @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GeneModel.pm Wed Feb 08 16:59:24 2012 -0500 @@ -0,0 +1,259 @@ +package GeneModel; +use strict; +use Carp; +use Data::Dumper; +use Tree::Interval; +use Tree::DAG_Node; +use English; +use Gene; + +sub new { + my $class = shift; + my $obj = {}; + $obj->{FMODEL} = {}; + $obj->{RMODEL} = {}; + if (@_) { + my %arg = @_; + $obj->{FMODEL} = $arg{-FMODEL} if($arg{-FMODEL}); + $obj->{RMODEL} = $arg{-RMODEL} if($arg{-RMODEL}); + } + return bless $obj, $class; +} + +sub get_all_chr { + my $self = shift; + return keys %{$self->{FMODEL}}; +} + +sub add_gene { + my ($self, $gene) = @_; + if($gene->strand eq '+') { + $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene); + } + else { + $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene); + } +} + +sub remove_gene { + my $self = shift; + my $gene = shift; + if($gene->strand eq '+') { + $self->{FMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene); + } + else { + $self->{RMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene); + } +} + +sub n_genes { + my $self = shift; + my $n = 0; + foreach my $c (keys %{$self->{FMODEL}}) { + $n += $self->{FMODEL}->{$c}->size; + } + foreach my $c (keys %{$self->{RMODEL}}) { + $n += $self->{RMODEL}->{$c}->size; + } + return $n; +} + +sub sub_model { + my ($self, $chr, $strand) = @_; + if($strand eq '+' ) { + return $self->{FMODEL}->{$chr}; + } + else { + return $self->{RMODEL}->{$chr}; + } +} +sub look_up_gene { + my ($self, $chr, $gene) = @_; + if($gene->isa('Gene')) { + if($gene->strand eq '+') { + return $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]); + } + else { + return $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]); + } + } + elsif(ref($gene) eq 'ARRAY') { + return ( + $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]), + $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]) ); + } + else { + croak "Not implemented look_up_gene for $gene"; + } +} + +sub overlap_genes { + my ($self, $chr, $fea, $ort) = @_; + my @genes; + + if(ref($fea) eq 'ARRAY') { + @genes = $self->_overlap_array($chr, $fea, $ort); + } + elsif($fea->isa('Transcript')) { + foreach my $e (@{$fea->exons}) { + push @genes, $self->_overlap_array($chr, $e, + $fea->strand ? $fea->strand : undef); + } + } + else { + croak "Not implemented overlap for $fea"; + } + my @rtn; + foreach my $g (@genes) { + push @rtn, $g if($g->overlap($fea)); + } + return @rtn; +} + +sub _overlap_array { + my($self, $chr, $a, $ort) = @_; + if($ort) { + return $ort == '+' ? $self->{FMODEL}->{$chr}->intersect($a) : + $self->{RMODEL}->{$chr}->intersect($a); + } + else { + return ($self->{FMODEL}->{$chr}->intersect($a), $self->{RMODEL}->{$chr}->intersect($a)); + } +} + +sub model { + my ($self, $chr, $strand, $val) = @_; + croak "Please specify chr and strand for GeneModel" + unless $chr && $strand; + my $model = $strand eq '+' ? $self->{FMODEL} : $self->{RMODEL}; + + if($val) { + croak "The model for each chrom must be a Tree::Interval" + unless $val->isa('Tree::Interval'); + $model->{$chr} = $val; + } + return $model->{$chr}; +} + +sub from_file { + my ($self, $file, $format) = @_; + croak "$format is not implemented yet!" unless uc($format) eq "REFFLAT"; + print STDERR "Processing gene model file, please wait ... "; + my $transcripts = _refFlat2Transcripts($file); + my $model = _Transcript2GeneModel($transcripts); + foreach my $c (keys %{$model}) { + foreach my $s (keys %{$model->{$c}}) { + if($s eq '+') { + $self->{FMODEL}->{$c} = $model->{$c}{$s}; + } + else { + $self->{RMODEL}->{$c} = $model->{$c}{$s}; + } + } + } + print STDERR "Done.\n"; +} + +sub _refFlat2Transcripts { + my $file = shift; + my %transcripts; + + open my $FILE, "<$file" or croak "Can't open $file: $OS_ERROR"; + while( my $line = <$FILE> ) { + next if($line =~ /^#/); #annotation line + my ($name, $refseq_id, $chr, $strand, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonCount, + $exonStarts, $exonEnds) = split(/\s+/, $line); + my $type = $cdsEnd == $cdsStart ? "NR" : "NM"; + my @starts = split /,/, $exonStarts; + my @ends = split /,/, $exonEnds; + if(scalar @starts != scalar @ends) { + warn "Number of exon starts is not same as exon ends!"; + next; + } + $transcripts{$chr} = {} unless (exists $transcripts{$chr}); + $transcripts{$chr}->{$strand} = [] unless (exists $transcripts{$chr}->{$strand}); + my @exons; + for( my $i = 0; $i < $exonCount; $i++) { + push @exons, [$starts[$i], $ends[$i]]; + } + + my $t = Transcript->new(-START => $starts[0], -END => $ends[$#ends], -CHR => $chr, + -STRAND => $strand eq '+' ? 1 : -1, -NAME => $name, -REFSEQ_ID => $refseq_id, + -EXONS => \@exons, -TYPE => $type); + push @{$transcripts{$chr}->{$strand}}, $t; + } + #sort the list of transcripts + foreach my $c (keys %transcripts) { + foreach my $s (keys %{$transcripts{$c}}) { + my @tmp = @{$transcripts{$c}->{$s}}; + @tmp = sort { $a->start <=> $b->start || $a->end <=> $b->end } @tmp; + $transcripts{$c}->{$s} = \@tmp; + } + } + return \%transcripts; +} + +sub _Transcript2GeneModel { + my $transcripts = shift; + my %genemodel; + + foreach my $c (keys %{$transcripts}) { + $genemodel{$c} = {}; + foreach my $s (keys %{$transcripts->{$c}}) { + my $tree = $genemodel{$c}->{$s} = Tree::Interval->new(); + foreach my $t (@{$transcripts->{$c}{$s}}) { + my @ol = $tree->intersect([$t->start, $t->end]); + if(scalar @ol == 0 ) { + my $gene = Gene->new(); + $gene->add_transcript($t); + $tree->insert([$t->start, $t->end], $gene); + next; + } + my @real_ol; + foreach my $g (@ol) { + push @real_ol, $g->val if($g->val->overlap($t)); + } + if(scalar @real_ol == 0) { #gene in gene? +# print STDERR "Gene in Gene: ", $t->name, " in ", $ol[0]->val->name, " \n"; + my $gene = Gene->new(); + $gene->add_transcript($t); + $tree->insert([$t->start, $t->end], $gene); + next; + } + if(scalar @real_ol == 1 ) { # transcript belongs to gene + my $g = $real_ol[0]; +# print STDERR "Same Gene group: ", $t->name, " in ", $real_ol[0]->name, " \n"; + if($t->start < $g->start || $t->end > $g->end) { + $tree->delete([$g->start, $g->end], $g); + $g->add_transcript($t); + $tree->insert([$g->start, $g->end], $g); + next; + } + $g->add_transcript($t); + next; + } + # a transcript make more genes same +# print STDERR "Join Gene group: ", $t->name, " join ", $real_ol[0]->name, "and", $real_ol[1]->name, " \n"; + my $gene = Gene->new(); + $gene->add_transcript($t); + foreach my $g (@real_ol) { + $tree->delete([$g->start, $g->end], $g); + foreach my $tp (@{$g->transcripts}) { + $gene->add_transcript($tp); + } + } + $tree->insert([$gene->start, $gene->end], $gene); + } + } + } + return \%genemodel; +} + +sub _print_tree { + return; + my $tree = shift; + my $t = Tree::DAG_Node->lol_to_tree($tree->root->as_lol); + local $, = "\n"; + print @{$t->draw_ascii_tree}, "\n"; +} +1;