Mercurial > repos > jjohnson > crest
view GeneModel.pm @ 1:4f6952e0af48 default tip
CREST - add crest.loc.sample
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 08 Feb 2012 16:08:01 -0600 |
parents | acc8d8bfeb9a |
children |
line wrap: on
line source
package GeneModel; use strict; use Carp; use Data::Dumper; use Tree::Interval; use Tree::DAG_Node; use English; use Gene; sub new { my $class = shift; my $obj = {}; $obj->{FMODEL} = {}; $obj->{RMODEL} = {}; if (@_) { my %arg = @_; $obj->{FMODEL} = $arg{-FMODEL} if($arg{-FMODEL}); $obj->{RMODEL} = $arg{-RMODEL} if($arg{-RMODEL}); } return bless $obj, $class; } sub get_all_chr { my $self = shift; return keys %{$self->{FMODEL}}; } sub add_gene { my ($self, $gene) = @_; if($gene->strand eq '+') { $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene); } else { $self->{FMODEL}->{$gene->chr}->insert([$gene->start, $gene->end], $gene); } } sub remove_gene { my $self = shift; my $gene = shift; if($gene->strand eq '+') { $self->{FMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene); } else { $self->{RMODEL}->{$gene->chr}->remove([$gene->start, $gene->end], $gene); } } sub n_genes { my $self = shift; my $n = 0; foreach my $c (keys %{$self->{FMODEL}}) { $n += $self->{FMODEL}->{$c}->size; } foreach my $c (keys %{$self->{RMODEL}}) { $n += $self->{RMODEL}->{$c}->size; } return $n; } sub sub_model { my ($self, $chr, $strand) = @_; if($strand eq '+' ) { return $self->{FMODEL}->{$chr}; } else { return $self->{RMODEL}->{$chr}; } } sub look_up_gene { my ($self, $chr, $gene) = @_; if($gene->isa('Gene')) { if($gene->strand eq '+') { return $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]); } else { return $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]); } } elsif(ref($gene) eq 'ARRAY') { return ( $self->{FMODEL}->{$chr}->lookup([$gene->start, $gene->end]), $self->{RMODEL}->{$chr}->lookup([$gene->start, $gene->end]) ); } else { croak "Not implemented look_up_gene for $gene"; } } sub overlap_genes { my ($self, $chr, $fea, $ort) = @_; my @genes; if(ref($fea) eq 'ARRAY') { @genes = $self->_overlap_array($chr, $fea, $ort); } elsif($fea->isa('Transcript')) { foreach my $e (@{$fea->exons}) { push @genes, $self->_overlap_array($chr, $e, $fea->strand ? $fea->strand : undef); } } else { croak "Not implemented overlap for $fea"; } my @rtn; foreach my $g (@genes) { push @rtn, $g if($g->overlap($fea)); } return @rtn; } sub _overlap_array { my($self, $chr, $a, $ort) = @_; if($ort) { return $ort == '+' ? $self->{FMODEL}->{$chr}->intersect($a) : $self->{RMODEL}->{$chr}->intersect($a); } else { return ($self->{FMODEL}->{$chr}->intersect($a), $self->{RMODEL}->{$chr}->intersect($a)); } } sub model { my ($self, $chr, $strand, $val) = @_; croak "Please specify chr and strand for GeneModel" unless $chr && $strand; my $model = $strand eq '+' ? $self->{FMODEL} : $self->{RMODEL}; if($val) { croak "The model for each chrom must be a Tree::Interval" unless $val->isa('Tree::Interval'); $model->{$chr} = $val; } return $model->{$chr}; } sub from_file { my ($self, $file, $format) = @_; croak "$format is not implemented yet!" unless uc($format) eq "REFFLAT"; print STDERR "Processing gene model file, please wait ... "; my $transcripts = _refFlat2Transcripts($file); my $model = _Transcript2GeneModel($transcripts); foreach my $c (keys %{$model}) { foreach my $s (keys %{$model->{$c}}) { if($s eq '+') { $self->{FMODEL}->{$c} = $model->{$c}{$s}; } else { $self->{RMODEL}->{$c} = $model->{$c}{$s}; } } } print STDERR "Done.\n"; } sub _refFlat2Transcripts { my $file = shift; my %transcripts; open my $FILE, "<$file" or croak "Can't open $file: $OS_ERROR"; while( my $line = <$FILE> ) { next if($line =~ /^#/); #annotation line my ($name, $refseq_id, $chr, $strand, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonCount, $exonStarts, $exonEnds) = split(/\s+/, $line); my $type = $cdsEnd == $cdsStart ? "NR" : "NM"; my @starts = split /,/, $exonStarts; my @ends = split /,/, $exonEnds; if(scalar @starts != scalar @ends) { warn "Number of exon starts is not same as exon ends!"; next; } $transcripts{$chr} = {} unless (exists $transcripts{$chr}); $transcripts{$chr}->{$strand} = [] unless (exists $transcripts{$chr}->{$strand}); my @exons; for( my $i = 0; $i < $exonCount; $i++) { push @exons, [$starts[$i], $ends[$i]]; } my $t = Transcript->new(-START => $starts[0], -END => $ends[$#ends], -CHR => $chr, -STRAND => $strand eq '+' ? 1 : -1, -NAME => $name, -REFSEQ_ID => $refseq_id, -EXONS => \@exons, -TYPE => $type); push @{$transcripts{$chr}->{$strand}}, $t; } #sort the list of transcripts foreach my $c (keys %transcripts) { foreach my $s (keys %{$transcripts{$c}}) { my @tmp = @{$transcripts{$c}->{$s}}; @tmp = sort { $a->start <=> $b->start || $a->end <=> $b->end } @tmp; $transcripts{$c}->{$s} = \@tmp; } } return \%transcripts; } sub _Transcript2GeneModel { my $transcripts = shift; my %genemodel; foreach my $c (keys %{$transcripts}) { $genemodel{$c} = {}; foreach my $s (keys %{$transcripts->{$c}}) { my $tree = $genemodel{$c}->{$s} = Tree::Interval->new(); foreach my $t (@{$transcripts->{$c}{$s}}) { my @ol = $tree->intersect([$t->start, $t->end]); if(scalar @ol == 0 ) { my $gene = Gene->new(); $gene->add_transcript($t); $tree->insert([$t->start, $t->end], $gene); next; } my @real_ol; foreach my $g (@ol) { push @real_ol, $g->val if($g->val->overlap($t)); } if(scalar @real_ol == 0) { #gene in gene? # print STDERR "Gene in Gene: ", $t->name, " in ", $ol[0]->val->name, " \n"; my $gene = Gene->new(); $gene->add_transcript($t); $tree->insert([$t->start, $t->end], $gene); next; } if(scalar @real_ol == 1 ) { # transcript belongs to gene my $g = $real_ol[0]; # print STDERR "Same Gene group: ", $t->name, " in ", $real_ol[0]->name, " \n"; if($t->start < $g->start || $t->end > $g->end) { $tree->delete([$g->start, $g->end], $g); $g->add_transcript($t); $tree->insert([$g->start, $g->end], $g); next; } $g->add_transcript($t); next; } # a transcript make more genes same # print STDERR "Join Gene group: ", $t->name, " join ", $real_ol[0]->name, "and", $real_ol[1]->name, " \n"; my $gene = Gene->new(); $gene->add_transcript($t); foreach my $g (@real_ol) { $tree->delete([$g->start, $g->end], $g); foreach my $tp (@{$g->transcripts}) { $gene->add_transcript($tp); } } $tree->insert([$gene->start, $gene->end], $gene); } } } return \%genemodel; } sub _print_tree { return; my $tree = shift; my $t = Tree::DAG_Node->lol_to_tree($tree->root->as_lol); local $, = "\n"; print @{$t->draw_ascii_tree}, "\n"; } 1;