Mercurial > repos > siyuan > prada
view pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line source
import bioclass featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf') txdb={} #keep track of all transcripts exdb={} #keep track of all exons genedb={} #keep track of all genes verbose=True i=0 for line in infile: i+=1 if verbose: if i%100000==0: print '%d features scanned'%i info=line.strip().split('\t') chr,cat,entity,start,end=info[0:5] strand=info[6] attributes=info[8] if cat != 'protein_coding': continue attinfo=attributes.split(';') atts=dict() for item in attinfo: item=item.strip() if not item: continue tmp=item.split() kk=tmp[0] vv=tmp[1][1:-1] atts[kk]=vv tx=atts['transcript_id'] gene=atts['gene_name'] if entity=='exon': exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) exdb[exon.name]=exon if not txdb.has_key(tx): txdb[tx]=bioclass.Transcript(tx,gene) txdb[tx].add_exon(exon) for txname in txdb: if not genedb.has_key(t.gene): genedb[t.gene]=bioclass.Gene(t.gene) genedb[t.gene].add_transcript(t) infile.close() def read_feature(featurefile,verbose=True): '''Read exon/transcript/gene info and return bioclass obj''' infile=open(featurefile) txdb={} #keep track of all transcripts exdb={} #keep track of all exons genedb={} #keep track of all genes i=0 for line in infile: i+=1 if verbose: if i%100000==0: print '%d exon records loaded'%i chr,start,end,tx,gene,strand,cat=line.split() if cat != 'protein_coding': continue exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) exdb[exon.name]=exon if not txdb.has_key(tx): txdb[tx]=bioclass.Transcript(tx,gene) txdb[tx].add_exon(exon) for txname in txdb: t=txdb[txname] if not genedb.has_key(t.gene): genedb[t.gene]=bioclass.Gene(t.gene) genedb[t.gene].add_transcript(t) infile.close() return (txdb,genedb) def read_feature_genes(featurefile,*args): '''Read exon/transcript/gene info for spec genes and return bioclass obj''' infile=open(featurefile) txdb={} exdb={} genedb={} for line in infile: chr,start,end,tx,gene,strand,cat=line.split() if gene not in args: continue if cat != 'protein_coding': continue exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) exdb[exon.name]=exon if not txdb.has_key(tx): txdb[tx]=bioclass.Transcript(tx,gene) txdb[tx].add_exon(exon) for txname in txdb: t=txdb[txname] if not genedb.has_key(t.gene): genedb[t.gene]=bioclass.Gene(t.gene) genedb[t.gene].add_transcript(t) res=[] for g in args: if genedb.has_key(g): gobj=genedb[g] else: gobj=None res.append(gobj) infile.close() return res if __name__=='__main__': pass