Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/parse_gft.py Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,110 @@ +import bioclass + +featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' + +infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf') +txdb={} #keep track of all transcripts +exdb={} #keep track of all exons +genedb={} #keep track of all genes +verbose=True +i=0 +for line in infile: + i+=1 + if verbose: + if i%100000==0: + print '%d features scanned'%i + info=line.strip().split('\t') + chr,cat,entity,start,end=info[0:5] + strand=info[6] + attributes=info[8] + if cat != 'protein_coding': + continue + attinfo=attributes.split(';') + atts=dict() + for item in attinfo: + item=item.strip() + if not item: + continue + tmp=item.split() + kk=tmp[0] + vv=tmp[1][1:-1] + atts[kk]=vv + tx=atts['transcript_id'] + gene=atts['gene_name'] + if entity=='exon': + exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) + exdb[exon.name]=exon + if not txdb.has_key(tx): + txdb[tx]=bioclass.Transcript(tx,gene) + txdb[tx].add_exon(exon) + for txname in txdb: + if not genedb.has_key(t.gene): + genedb[t.gene]=bioclass.Gene(t.gene) + genedb[t.gene].add_transcript(t) + +infile.close() + + + +def read_feature(featurefile,verbose=True): + '''Read exon/transcript/gene info and return bioclass obj''' + infile=open(featurefile) + txdb={} #keep track of all transcripts + exdb={} #keep track of all exons + genedb={} #keep track of all genes + i=0 + for line in infile: + i+=1 + if verbose: + if i%100000==0: + print '%d exon records loaded'%i + chr,start,end,tx,gene,strand,cat=line.split() + if cat != 'protein_coding': + continue + exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) + exdb[exon.name]=exon + if not txdb.has_key(tx): + txdb[tx]=bioclass.Transcript(tx,gene) + txdb[tx].add_exon(exon) + for txname in txdb: + t=txdb[txname] + if not genedb.has_key(t.gene): + genedb[t.gene]=bioclass.Gene(t.gene) + genedb[t.gene].add_transcript(t) + infile.close() + return (txdb,genedb) + +def read_feature_genes(featurefile,*args): + '''Read exon/transcript/gene info for spec genes and return bioclass obj''' + infile=open(featurefile) + txdb={} + exdb={} + genedb={} + for line in infile: + chr,start,end,tx,gene,strand,cat=line.split() + if gene not in args: + continue + if cat != 'protein_coding': + continue + exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) + exdb[exon.name]=exon + if not txdb.has_key(tx): + txdb[tx]=bioclass.Transcript(tx,gene) + txdb[tx].add_exon(exon) + for txname in txdb: + t=txdb[txname] + if not genedb.has_key(t.gene): + genedb[t.gene]=bioclass.Gene(t.gene) + genedb[t.gene].add_transcript(t) + res=[] + for g in args: + if genedb.has_key(g): + gobj=genedb[g] + else: + gobj=None + res.append(gobj) + infile.close() + return res + +if __name__=='__main__': + pass