view pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line source

#Module defines exon, transcript and gene object.
#It is extendable for more attributes and functions. 
#It is part of py-PRADA. 
#Author: Siyuan Zheng (szheng2@mdanderson.org)
#Last modified at 03/07/2013

class Exon(object):
    """Is an object with information about Exons and the Transcripts it is found in.

    If given an exon name, returns an object with exon location inforamtion and, gene and transcript information.

    e.g.

    >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene')
    exon.start
    exon.end
    exon.chr
    exon.strand
    exon.gene
    exon.transcript
    exon.name
    exon.length
    """

    def __init__(self,chr,start,end,strand,tx,gene):
        if not all([isinstance(x,int) for x in [start,end]]):
            raise Exception('start,end must be int!')
        self.start=start
        self.end=end
        self.chr=chr
        self.strand=strand  #'1' or '-1'
        self.gene=gene
        self.transcript=tx  #exon may map to multi-transcripts, but for simplicity, use one
        self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too
        self.length=self._length()
    def _length(self):
        return abs(self.end-self.start)+1

class Transcript(object):
    """
    Is an object with information about Transcript and the Gene it is found in.

    If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects.   .

    e.g.

    >>> tx=bioclass.Transcript('tx','gene')
    tx.start
    tx.end
    tx.chr
    tx.strand
    tx.gene
    tx.exon
    tx.name
    tx.length

    #Add new exons to the transcript:
    tx.add_exon(exon)

    #Relative position of the transcript based on the exons defined:
    tx.exon_relative_pos()
    """

    def __init__(self,name,gene):
        self.exon=[]
        self.name=name
        self.gene=gene
        self.length=0
        self.strand=None
        self.start=None
        self.end=None
        self.cds_start=None
        self.cds_end=None
        self.is_primary=None
    def _update(self):
        self._basics()
        self._sort_exon()
    def _basics(self):
        assert len(self.exon)>0, 'no exon in the transcript'
        self.strand=self.exon[0].strand
        self.chr=self.exon[0].chr
        self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon])
        self.start=min([e.start for e in self.exon])
        self.end=max([e.end for e in self.exon])
    def _sort_exon(self):
        sorted_exons=sorted(self.exon, key=lambda x:x.start)
        self.exon=sorted_exons
    def add_exon(self,exon):
        nameset=[x.name for x in self.exon]
        if exon.name not in nameset:
            self.exon.append(exon)
        self._update()
    def set_cds(self,start,end):
        self.cds_start=start
        self.cds_end=end 
    def set_primary(self,isprim):
        self.is_primary=isprim
    def exon_relative_pos(self):
        L=[x.length for x in self.exon]
        pos=[]
        if self.strand=='1':
            init=0
            for item in L:
                region=(init+1, init+item)
                init=init+item
                pos.append(region)
        if self.strand=='-1':
            init=self.length
            for item in L:
                region=(init-item+1,init)
                init=init-item
                pos.append(region)
        relpos=dict(zip([x.name for x in self.exon],pos))
        return relpos

class Gene(object):
    """
    Is an object with information about Gene.

    If given an gene name, returns an object with gene location inforamtion, transcript and exon objects.   .

    e.g.

    >>> gene=bioclass.Gene('gene')
    gene.start
    gene.end
    gene.chr
    gene.strand
    gene.transcript
    gene.name
    gene.length

    #Add new transcript to the gene:
    gene.add_transcript()

    #Obtain a list of all the exons define within that gene:
    gene.get_exons()
    """

    def __init__(self,name):
        self.name=name
        self.transcript=[]
    def _update(self):
        self._basics()
    def _basics(self):
        assert len(self.transcript)>0, 'no transcript in the gene'
        self.strand=self.transcript[0].strand
        self.chr=self.transcript[0].chr
        self.start=min([t.start for t in self.transcript])
        self.end=max([t.end for t in self.transcript]) 
    
    def add_transcript(self,tx):
        nameset=[x.name for x in self.transcript]
        if tx.name not in nameset:
            self.transcript.append(tx)
        self._update()
    def get_exons(self):
        exons={}
        for t in self.transcript:
            for e in t.exon:
                exons[e.name]=e
        return exons

 
if __name__=='__main__':
    #below for testing purpose only.
    infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt')
    txdb={}    #keep track of all transcripts
    exdb={}    #keep track of all exons
    genedb={}  #keep track of all genes
    i=0
    for line in infile:
        i+=1
        if i%100000==0:
            print '%d exon records loaded'%i
        chr,start,end,tx,gene,strand,cat=line.split()
        if cat != 'protein_coding':
            continue
        exon=Exon(chr,int(start),int(end),strand,tx,gene)
        exdb[exon.name]=exon
        if not txdb.has_key(tx):
            txdb[tx]=Transcript(tx,gene)
        txdb[tx].add_exon(exon)
    for txname in txdb:
        t=txdb[txname]
        if not genedb.has_key(t.gene):
            genedb[t.gene]=Gene(t.gene)
        genedb[t.gene].add_transcript(t)
    infile.close()