view pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line source

import bioclass

featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'

infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf')
txdb={}    #keep track of all transcripts
exdb={}    #keep track of all exons
genedb={}  #keep track of all genes
verbose=True
i=0
for line in infile:
    i+=1
    if verbose:
        if i%100000==0:
            print '%d features scanned'%i
    info=line.strip().split('\t')
    chr,cat,entity,start,end=info[0:5]
    strand=info[6]
    attributes=info[8]
    if cat != 'protein_coding':
        continue
    attinfo=attributes.split(';')
    atts=dict()
    for item in attinfo:
        item=item.strip()
        if not item:
            continue
        tmp=item.split()
        kk=tmp[0]
        vv=tmp[1][1:-1]
        atts[kk]=vv
    tx=atts['transcript_id']
    gene=atts['gene_name']
    if entity=='exon':
        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
        exdb[exon.name]=exon
        if not txdb.has_key(tx):
            txdb[tx]=bioclass.Transcript(tx,gene)
        txdb[tx].add_exon(exon)
    for txname in txdb:
        if not genedb.has_key(t.gene):
            genedb[t.gene]=bioclass.Gene(t.gene)
        genedb[t.gene].add_transcript(t)

infile.close()



def read_feature(featurefile,verbose=True):
    '''Read exon/transcript/gene info and return bioclass obj'''
    infile=open(featurefile)
    txdb={}    #keep track of all transcripts
    exdb={}    #keep track of all exons
    genedb={}  #keep track of all genes
    i=0
    for line in infile:
        i+=1
        if verbose:
            if i%100000==0:
                print '%d exon records loaded'%i
        chr,start,end,tx,gene,strand,cat=line.split()
        if cat != 'protein_coding':
            continue
        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
        exdb[exon.name]=exon
        if not txdb.has_key(tx):
            txdb[tx]=bioclass.Transcript(tx,gene)
        txdb[tx].add_exon(exon)
    for txname in txdb:
        t=txdb[txname]
        if not genedb.has_key(t.gene):
            genedb[t.gene]=bioclass.Gene(t.gene)
        genedb[t.gene].add_transcript(t)
    infile.close()
    return (txdb,genedb)

def read_feature_genes(featurefile,*args):
    '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
    infile=open(featurefile)
    txdb={}
    exdb={}
    genedb={}
    for line in infile:
        chr,start,end,tx,gene,strand,cat=line.split()
        if gene not in args:
            continue
        if cat != 'protein_coding':
            continue
        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
        exdb[exon.name]=exon
        if not txdb.has_key(tx):
            txdb[tx]=bioclass.Transcript(tx,gene)
        txdb[tx].add_exon(exon)
    for txname in txdb:
        t=txdb[txname]
        if not genedb.has_key(t.gene):
            genedb[t.gene]=bioclass.Gene(t.gene)
        genedb[t.gene].add_transcript(t)
    res=[]
    for g in args:
        if genedb.has_key(g):
            gobj=genedb[g]
        else:
            gobj=None
        res.append(gobj)
    infile.close()
    return res

if __name__=='__main__':
    pass