comparison pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #define IO functions. May add more gradually.
2 import bioclass
3
4 def read_conf(conffile):
5 '''read configure file (conf.txt)'''
6 ref_input=open(conffile)
7 refdict={}
8 for line in ref_input:
9 if not line.strip():
10 continue
11 if '#' in line:
12 contline=line.split('#')[0]
13 content=contline.strip()
14 else:
15 content=line.strip()
16 if not content:
17 continue
18 if content.startswith('--'):
19 key=content
20 refdict[key]={}
21 else:
22 idx=content.index('=')
23 a=content[0:idx].strip()
24 b=content[(idx+1):].strip()
25 refdict[key][a]=b
26 ref_input.close()
27 return refdict
28
29 def read_feature(featurefile,verbose=True):
30 '''Read exon/transcript/gene info and return bioclass obj'''
31 infile=open(featurefile)
32 txdb={} #keep track of all transcripts
33 exdb={} #keep track of all exons
34 genedb={} #keep track of all genes
35 i=0
36 for line in infile:
37 i+=1
38 if verbose:
39 if i%100000==0:
40 print '%d exon records loaded'%i
41 chr,start,end,tx,gene,strand,cat=line.split()
42 if cat != 'protein_coding':
43 continue
44 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
45 exdb[exon.name]=exon
46 if not txdb.has_key(tx):
47 txdb[tx]=bioclass.Transcript(tx,gene)
48 txdb[tx].add_exon(exon)
49 for txname in txdb:
50 t=txdb[txname]
51 if not genedb.has_key(t.gene):
52 genedb[t.gene]=bioclass.Gene(t.gene)
53 genedb[t.gene].add_transcript(t)
54 infile.close()
55 return (txdb,genedb)
56
57 def read_feature_genes(featurefile,*args):
58 '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
59 infile=open(featurefile)
60 txdb={}
61 exdb={}
62 genedb={}
63 for line in infile:
64 chr,start,end,tx,gene,strand,cat=line.split()
65 if gene not in args:
66 continue
67 if cat != 'protein_coding':
68 continue
69 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
70 exdb[exon.name]=exon
71 if not txdb.has_key(tx):
72 txdb[tx]=bioclass.Transcript(tx,gene)
73 txdb[tx].add_exon(exon)
74 for txname in txdb:
75 t=txdb[txname]
76 if not genedb.has_key(t.gene):
77 genedb[t.gene]=bioclass.Gene(t.gene)
78 genedb[t.gene].add_transcript(t)
79 res=[]
80 for g in args:
81 if genedb.has_key(g):
82 gobj=genedb[g]
83 else:
84 gobj=None
85 res.append(gobj)
86 infile.close()
87 return res
88
89 def read_cds(cdsfile):
90 '''read CDS start/end for transcripts'''
91 infile=open(cdsfile)
92 tx_cds={}
93 for line in infile:
94 info=line.split()
95 tx=info[0]
96 cds_start,cds_end=int(info[3]),int(info[4])
97 tx_cds[tx]=(cds_start,cds_end)
98 infile.close()
99 return tx_cds
100
101 def read_tx_cat(txcatfile):
102 '''find primary transcript for each gene'''
103 tx_primary={}
104 tx_cat={}
105 infile=open(txcatfile)
106 for line in infile:
107 if not line.strip():
108 continue
109 info=line.split()
110 if len(info) != 4:
111 continue
112 gene=info[2]
113 txid=info[1]
114 acc=int(info[3][-3:])
115 if tx_cat.has_key(gene):
116 tx_cat[gene].append([txid,acc])
117 else:
118 tx_cat[gene]=[[txid,acc]]
119 infile.close()
120 for item in tx_cat:
121 vv=tx_cat[item]
122 vv=sorted(vv,key=lambda x: x[1])
123 tx_primary[item]=vv[0][0]
124 return tx_primary
125
126
127 if __name__=='__main__':
128 #The following lines are for module testing purpose only.
129 #They would not affect the program.
130 featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
131 cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
132 txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'
133