diff pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/ioprada.py	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,133 @@
+#define IO functions. May add more gradually. 
+import bioclass
+
+def read_conf(conffile):
+    '''read configure file (conf.txt)'''
+    ref_input=open(conffile)
+    refdict={}
+    for line in ref_input:
+        if not line.strip():
+            continue
+        if '#' in line:
+            contline=line.split('#')[0]
+            content=contline.strip()
+        else:
+            content=line.strip()
+        if not content:
+            continue
+        if content.startswith('--'):
+            key=content
+            refdict[key]={}
+        else:
+            idx=content.index('=')
+            a=content[0:idx].strip()
+            b=content[(idx+1):].strip()
+            refdict[key][a]=b
+    ref_input.close()
+    return refdict
+
+def read_feature(featurefile,verbose=True):
+    '''Read exon/transcript/gene info and return bioclass obj'''
+    infile=open(featurefile)
+    txdb={}    #keep track of all transcripts
+    exdb={}    #keep track of all exons
+    genedb={}  #keep track of all genes
+    i=0
+    for line in infile:
+        i+=1
+        if verbose:
+            if i%100000==0:
+                print '%d exon records loaded'%i
+        chr,start,end,tx,gene,strand,cat=line.split()
+        if cat != 'protein_coding':
+            continue
+        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+        exdb[exon.name]=exon
+        if not txdb.has_key(tx):
+            txdb[tx]=bioclass.Transcript(tx,gene)
+        txdb[tx].add_exon(exon)
+    for txname in txdb:
+        t=txdb[txname]
+        if not genedb.has_key(t.gene):
+            genedb[t.gene]=bioclass.Gene(t.gene)
+        genedb[t.gene].add_transcript(t)
+    infile.close()
+    return (txdb,genedb)
+
+def read_feature_genes(featurefile,*args):
+    '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
+    infile=open(featurefile)
+    txdb={}
+    exdb={}
+    genedb={}
+    for line in infile:
+        chr,start,end,tx,gene,strand,cat=line.split()
+        if gene not in args:
+            continue
+        if cat != 'protein_coding':
+            continue
+        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+        exdb[exon.name]=exon
+        if not txdb.has_key(tx):
+            txdb[tx]=bioclass.Transcript(tx,gene)
+        txdb[tx].add_exon(exon)
+    for txname in txdb:
+        t=txdb[txname]
+        if not genedb.has_key(t.gene):
+            genedb[t.gene]=bioclass.Gene(t.gene)
+        genedb[t.gene].add_transcript(t)
+    res=[]
+    for g in args:
+        if genedb.has_key(g):
+            gobj=genedb[g]
+        else:
+            gobj=None
+        res.append(gobj)
+    infile.close()
+    return res
+
+def read_cds(cdsfile):
+    '''read CDS start/end for transcripts'''
+    infile=open(cdsfile)
+    tx_cds={}
+    for line in infile:
+        info=line.split()
+        tx=info[0]
+        cds_start,cds_end=int(info[3]),int(info[4])
+        tx_cds[tx]=(cds_start,cds_end)
+    infile.close()
+    return tx_cds
+
+def read_tx_cat(txcatfile):
+    '''find primary transcript for each gene'''
+    tx_primary={}
+    tx_cat={}
+    infile=open(txcatfile)
+    for line in infile:
+        if not line.strip():
+            continue
+        info=line.split()
+        if len(info) != 4:
+            continue
+        gene=info[2]
+        txid=info[1]
+        acc=int(info[3][-3:])
+        if tx_cat.has_key(gene):
+            tx_cat[gene].append([txid,acc])
+        else:
+            tx_cat[gene]=[[txid,acc]]
+    infile.close()
+    for item in tx_cat:
+        vv=tx_cat[item]
+        vv=sorted(vv,key=lambda x: x[1])
+        tx_primary[item]=vv[0][0]
+    return tx_primary
+
+
+if __name__=='__main__':
+    #The following lines are for module testing purpose only.
+    #They would not affect the program.
+    featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
+    cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
+    txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'
+