diff pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/parse_gft.py	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,110 @@
+import bioclass
+
+featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
+
+infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf')
+txdb={}    #keep track of all transcripts
+exdb={}    #keep track of all exons
+genedb={}  #keep track of all genes
+verbose=True
+i=0
+for line in infile:
+    i+=1
+    if verbose:
+        if i%100000==0:
+            print '%d features scanned'%i
+    info=line.strip().split('\t')
+    chr,cat,entity,start,end=info[0:5]
+    strand=info[6]
+    attributes=info[8]
+    if cat != 'protein_coding':
+        continue
+    attinfo=attributes.split(';')
+    atts=dict()
+    for item in attinfo:
+        item=item.strip()
+        if not item:
+            continue
+        tmp=item.split()
+        kk=tmp[0]
+        vv=tmp[1][1:-1]
+        atts[kk]=vv
+    tx=atts['transcript_id']
+    gene=atts['gene_name']
+    if entity=='exon':
+        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+        exdb[exon.name]=exon
+        if not txdb.has_key(tx):
+            txdb[tx]=bioclass.Transcript(tx,gene)
+        txdb[tx].add_exon(exon)
+    for txname in txdb:
+        if not genedb.has_key(t.gene):
+            genedb[t.gene]=bioclass.Gene(t.gene)
+        genedb[t.gene].add_transcript(t)
+
+infile.close()
+
+
+
+def read_feature(featurefile,verbose=True):
+    '''Read exon/transcript/gene info and return bioclass obj'''
+    infile=open(featurefile)
+    txdb={}    #keep track of all transcripts
+    exdb={}    #keep track of all exons
+    genedb={}  #keep track of all genes
+    i=0
+    for line in infile:
+        i+=1
+        if verbose:
+            if i%100000==0:
+                print '%d exon records loaded'%i
+        chr,start,end,tx,gene,strand,cat=line.split()
+        if cat != 'protein_coding':
+            continue
+        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+        exdb[exon.name]=exon
+        if not txdb.has_key(tx):
+            txdb[tx]=bioclass.Transcript(tx,gene)
+        txdb[tx].add_exon(exon)
+    for txname in txdb:
+        t=txdb[txname]
+        if not genedb.has_key(t.gene):
+            genedb[t.gene]=bioclass.Gene(t.gene)
+        genedb[t.gene].add_transcript(t)
+    infile.close()
+    return (txdb,genedb)
+
+def read_feature_genes(featurefile,*args):
+    '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
+    infile=open(featurefile)
+    txdb={}
+    exdb={}
+    genedb={}
+    for line in infile:
+        chr,start,end,tx,gene,strand,cat=line.split()
+        if gene not in args:
+            continue
+        if cat != 'protein_coding':
+            continue
+        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+        exdb[exon.name]=exon
+        if not txdb.has_key(tx):
+            txdb[tx]=bioclass.Transcript(tx,gene)
+        txdb[tx].add_exon(exon)
+    for txname in txdb:
+        t=txdb[txname]
+        if not genedb.has_key(t.gene):
+            genedb[t.gene]=bioclass.Gene(t.gene)
+        genedb[t.gene].add_transcript(t)
+    res=[]
+    for g in args:
+        if genedb.has_key(g):
+            gobj=genedb[g]
+        else:
+            gobj=None
+        res.append(gobj)
+    infile.close()
+    return res
+
+if __name__=='__main__':
+    pass