Mercurial > repos > siyuan > prada
view pyPRADA_1.2/bioclass.py @ 3:f17965495ec9 draft default tip
Uploaded
author | siyuan |
---|---|
date | Tue, 11 Mar 2014 12:14:01 -0400 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#Module defines exon, transcript and gene object. #It is extendable for more attributes and functions. #It is part of py-PRADA. #Author: Siyuan Zheng (szheng2@mdanderson.org) #Last modified at 03/07/2013 class Exon(object): """Is an object with information about Exons and the Transcripts it is found in. If given an exon name, returns an object with exon location inforamtion and, gene and transcript information. e.g. >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene') exon.start exon.end exon.chr exon.strand exon.gene exon.transcript exon.name exon.length """ def __init__(self,chr,start,end,strand,tx,gene): if not all([isinstance(x,int) for x in [start,end]]): raise Exception('start,end must be int!') self.start=start self.end=end self.chr=chr self.strand=strand #'1' or '-1' self.gene=gene self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too self.length=self._length() def _length(self): return abs(self.end-self.start)+1 class Transcript(object): """ Is an object with information about Transcript and the Gene it is found in. If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. . e.g. >>> tx=bioclass.Transcript('tx','gene') tx.start tx.end tx.chr tx.strand tx.gene tx.exon tx.name tx.length #Add new exons to the transcript: tx.add_exon(exon) #Relative position of the transcript based on the exons defined: tx.exon_relative_pos() """ def __init__(self,name,gene): self.exon=[] self.name=name self.gene=gene self.length=0 self.strand=None self.start=None self.end=None self.cds_start=None self.cds_end=None self.is_primary=None def _update(self): self._basics() self._sort_exon() def _basics(self): assert len(self.exon)>0, 'no exon in the transcript' self.strand=self.exon[0].strand self.chr=self.exon[0].chr self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon]) self.start=min([e.start for e in self.exon]) self.end=max([e.end for e in self.exon]) def _sort_exon(self): sorted_exons=sorted(self.exon, key=lambda x:x.start) self.exon=sorted_exons def add_exon(self,exon): nameset=[x.name for x in self.exon] if exon.name not in nameset: self.exon.append(exon) self._update() def set_cds(self,start,end): self.cds_start=start self.cds_end=end def set_primary(self,isprim): self.is_primary=isprim def exon_relative_pos(self): L=[x.length for x in self.exon] pos=[] if self.strand=='1': init=0 for item in L: region=(init+1, init+item) init=init+item pos.append(region) if self.strand=='-1': init=self.length for item in L: region=(init-item+1,init) init=init-item pos.append(region) relpos=dict(zip([x.name for x in self.exon],pos)) return relpos class Gene(object): """ Is an object with information about Gene. If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. . e.g. >>> gene=bioclass.Gene('gene') gene.start gene.end gene.chr gene.strand gene.transcript gene.name gene.length #Add new transcript to the gene: gene.add_transcript() #Obtain a list of all the exons define within that gene: gene.get_exons() """ def __init__(self,name): self.name=name self.transcript=[] def _update(self): self._basics() def _basics(self): assert len(self.transcript)>0, 'no transcript in the gene' self.strand=self.transcript[0].strand self.chr=self.transcript[0].chr self.start=min([t.start for t in self.transcript]) self.end=max([t.end for t in self.transcript]) def add_transcript(self,tx): nameset=[x.name for x in self.transcript] if tx.name not in nameset: self.transcript.append(tx) self._update() def get_exons(self): exons={} for t in self.transcript: for e in t.exon: exons[e.name]=e return exons if __name__=='__main__': #below for testing purpose only. infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt') txdb={} #keep track of all transcripts exdb={} #keep track of all exons genedb={} #keep track of all genes i=0 for line in infile: i+=1 if i%100000==0: print '%d exon records loaded'%i chr,start,end,tx,gene,strand,cat=line.split() if cat != 'protein_coding': continue exon=Exon(chr,int(start),int(end),strand,tx,gene) exdb[exon.name]=exon if not txdb.has_key(tx): txdb[tx]=Transcript(tx,gene) txdb[tx].add_exon(exon) for txname in txdb: t=txdb[txname] if not genedb.has_key(t.gene): genedb[t.gene]=Gene(t.gene) genedb[t.gene].add_transcript(t) infile.close()