annotate pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #define IO functions. May add more gradually.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 import bioclass
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 def read_conf(conffile):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 '''read configure file (conf.txt)'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 ref_input=open(conffile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 refdict={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 for line in ref_input:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 if not line.strip():
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 if '#' in line:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 contline=line.split('#')[0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 content=contline.strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 content=line.strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 if not content:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 if content.startswith('--'):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 key=content
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 refdict[key]={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 idx=content.index('=')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 a=content[0:idx].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 b=content[(idx+1):].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 refdict[key][a]=b
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 ref_input.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 return refdict
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 def read_feature(featurefile,verbose=True):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 '''Read exon/transcript/gene info and return bioclass obj'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 infile=open(featurefile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 txdb={} #keep track of all transcripts
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 exdb={} #keep track of all exons
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 genedb={} #keep track of all genes
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 i=0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 i+=1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 if verbose:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 if i%100000==0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 print '%d exon records loaded'%i
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 chr,start,end,tx,gene,strand,cat=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 if cat != 'protein_coding':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 exdb[exon.name]=exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 if not txdb.has_key(tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 txdb[tx]=bioclass.Transcript(tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 txdb[tx].add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 for txname in txdb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 t=txdb[txname]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 if not genedb.has_key(t.gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 genedb[t.gene]=bioclass.Gene(t.gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 genedb[t.gene].add_transcript(t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 return (txdb,genedb)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 def read_feature_genes(featurefile,*args):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 infile=open(featurefile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 txdb={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 exdb={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 genedb={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 chr,start,end,tx,gene,strand,cat=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 if gene not in args:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 if cat != 'protein_coding':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 exdb[exon.name]=exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 if not txdb.has_key(tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 txdb[tx]=bioclass.Transcript(tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 txdb[tx].add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 for txname in txdb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 t=txdb[txname]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 if not genedb.has_key(t.gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 genedb[t.gene]=bioclass.Gene(t.gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 genedb[t.gene].add_transcript(t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 res=[]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 for g in args:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 if genedb.has_key(g):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 gobj=genedb[g]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 gobj=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 res.append(gobj)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 return res
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 def read_cds(cdsfile):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 '''read CDS start/end for transcripts'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 infile=open(cdsfile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 tx_cds={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 info=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 tx=info[0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 cds_start,cds_end=int(info[3]),int(info[4])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 tx_cds[tx]=(cds_start,cds_end)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 return tx_cds
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 def read_tx_cat(txcatfile):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 '''find primary transcript for each gene'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 tx_primary={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 tx_cat={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 infile=open(txcatfile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 if not line.strip():
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 info=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 if len(info) != 4:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 gene=info[2]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 txid=info[1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 acc=int(info[3][-3:])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 if tx_cat.has_key(gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 tx_cat[gene].append([txid,acc])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 tx_cat[gene]=[[txid,acc]]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 for item in tx_cat:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 vv=tx_cat[item]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 vv=sorted(vv,key=lambda x: x[1])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 tx_primary[item]=vv[0][0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 return tx_primary
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 if __name__=='__main__':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 #The following lines are for module testing purpose only.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 #They would not affect the program.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133