prada: pyPRADA_1.2/parse_gft.py comparison

comparison pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4

Uploaded

author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:acc2ca1a3ba4
+import bioclass
+featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
+infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf')
+txdb={}    #keep track of all transcripts
+exdb={}    #keep track of all exons
+genedb={}  #keep track of all genes
+verbose=True
+i=0
+for line in infile:
+i+=1
+if verbose:
+if i%100000==0:
+print '%d features scanned'%i
+info=line.strip().split('\t')
+chr,cat,entity,start,end=info[0:5]
+strand=info[6]
+attributes=info[8]
+if cat != 'protein_coding':
+continue
+attinfo=attributes.split(';')
+atts=dict()
+for item in attinfo:
+item=item.strip()
+if not item:
+continue
+tmp=item.split()
+kk=tmp[0]
+vv=tmp[1][1:-1]
+atts[kk]=vv
+tx=atts['transcript_id']
+gene=atts['gene_name']
+if entity=='exon':
+exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+exdb[exon.name]=exon
+if not txdb.has_key(tx):
+txdb[tx]=bioclass.Transcript(tx,gene)
+txdb[tx].add_exon(exon)
+for txname in txdb:
+if not genedb.has_key(t.gene):
+genedb[t.gene]=bioclass.Gene(t.gene)
+genedb[t.gene].add_transcript(t)
+infile.close()
+def read_feature(featurefile,verbose=True):
+'''Read exon/transcript/gene info and return bioclass obj'''
+infile=open(featurefile)
+txdb={}    #keep track of all transcripts
+exdb={}    #keep track of all exons
+genedb={}  #keep track of all genes
+i=0
+for line in infile:
+i+=1
+if verbose:
+if i%100000==0:
+print '%d exon records loaded'%i
+chr,start,end,tx,gene,strand,cat=line.split()
+if cat != 'protein_coding':
+continue
+exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+exdb[exon.name]=exon
+if not txdb.has_key(tx):
+txdb[tx]=bioclass.Transcript(tx,gene)
+txdb[tx].add_exon(exon)
+for txname in txdb:
+t=txdb[txname]
+if not genedb.has_key(t.gene):
+genedb[t.gene]=bioclass.Gene(t.gene)
+genedb[t.gene].add_transcript(t)
+infile.close()
+return (txdb,genedb)
+def read_feature_genes(featurefile,*args):
+'''Read exon/transcript/gene info for spec genes and return bioclass obj'''
+infile=open(featurefile)
+txdb={}
+exdb={}
+genedb={}
+for line in infile:
+chr,start,end,tx,gene,strand,cat=line.split()
+if gene not in args:
+continue
+if cat != 'protein_coding':
+continue
+exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+exdb[exon.name]=exon
+if not txdb.has_key(tx):
+txdb[tx]=bioclass.Transcript(tx,gene)
+txdb[tx].add_exon(exon)
+for txname in txdb:
+t=txdb[txname]
+if not genedb.has_key(t.gene):
+genedb[t.gene]=bioclass.Gene(t.gene)
+genedb[t.gene].add_transcript(t)
+res=[]
+for g in args:
+if genedb.has_key(g):
+gobj=genedb[g]
+else:
+gobj=None
+res.append(gobj)
+infile.close()
+return res
+if __name__=='__main__':
+pass

Mercurial > repos > siyuan > prada

comparison pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4