prada: pyPRADA_1.2/ioprada.py comparison

comparison pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4

Uploaded

author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:acc2ca1a3ba4
+#define IO functions. May add more gradually.
+import bioclass
+def read_conf(conffile):
+'''read configure file (conf.txt)'''
+ref_input=open(conffile)
+refdict={}
+for line in ref_input:
+if not line.strip():
+continue
+if '#' in line:
+contline=line.split('#')[0]
+content=contline.strip()
+else:
+content=line.strip()
+if not content:
+continue
+if content.startswith('--'):
+key=content
+refdict[key]={}
+else:
+idx=content.index('=')
+a=content[0:idx].strip()
+b=content[(idx+1):].strip()
+refdict[key][a]=b
+ref_input.close()
+return refdict
+def read_feature(featurefile,verbose=True):
+'''Read exon/transcript/gene info and return bioclass obj'''
+infile=open(featurefile)
+txdb={}    #keep track of all transcripts
+exdb={}    #keep track of all exons
+genedb={}  #keep track of all genes
+i=0
+for line in infile:
+i+=1
+if verbose:
+if i%100000==0:
+print '%d exon records loaded'%i
+chr,start,end,tx,gene,strand,cat=line.split()
+if cat != 'protein_coding':
+continue
+exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+exdb[exon.name]=exon
+if not txdb.has_key(tx):
+txdb[tx]=bioclass.Transcript(tx,gene)
+txdb[tx].add_exon(exon)
+for txname in txdb:
+t=txdb[txname]
+if not genedb.has_key(t.gene):
+genedb[t.gene]=bioclass.Gene(t.gene)
+genedb[t.gene].add_transcript(t)
+infile.close()
+return (txdb,genedb)
+def read_feature_genes(featurefile,*args):
+'''Read exon/transcript/gene info for spec genes and return bioclass obj'''
+infile=open(featurefile)
+txdb={}
+exdb={}
+genedb={}
+for line in infile:
+chr,start,end,tx,gene,strand,cat=line.split()
+if gene not in args:
+continue
+if cat != 'protein_coding':
+continue
+exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
+exdb[exon.name]=exon
+if not txdb.has_key(tx):
+txdb[tx]=bioclass.Transcript(tx,gene)
+txdb[tx].add_exon(exon)
+for txname in txdb:
+t=txdb[txname]
+if not genedb.has_key(t.gene):
+genedb[t.gene]=bioclass.Gene(t.gene)
+genedb[t.gene].add_transcript(t)
+res=[]
+for g in args:
+if genedb.has_key(g):
+gobj=genedb[g]
+else:
+gobj=None
+res.append(gobj)
+infile.close()
+return res
+def read_cds(cdsfile):
+'''read CDS start/end for transcripts'''
+infile=open(cdsfile)
+tx_cds={}
+for line in infile:
+info=line.split()
+tx=info[0]
+cds_start,cds_end=int(info[3]),int(info[4])
+tx_cds[tx]=(cds_start,cds_end)
+infile.close()
+return tx_cds
+def read_tx_cat(txcatfile):
+'''find primary transcript for each gene'''
+tx_primary={}
+tx_cat={}
+infile=open(txcatfile)
+for line in infile:
+if not line.strip():
+continue
+info=line.split()
+if len(info) != 4:
+continue
+gene=info[2]
+txid=info[1]
+acc=int(info[3][-3:])
+if tx_cat.has_key(gene):
+tx_cat[gene].append([txid,acc])
+else:
+tx_cat[gene]=[[txid,acc]]
+infile.close()
+for item in tx_cat:
+vv=tx_cat[item]
+vv=sorted(vv,key=lambda x: x[1])
+tx_primary[item]=vv[0][0]
+return tx_primary
+if __name__=='__main__':
+#The following lines are for module testing purpose only.
+#They would not affect the program.
+featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
+cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
+txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'

Mercurial > repos > siyuan > prada

comparison pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4