Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #define IO functions. May add more gradually. | |
2 import bioclass | |
3 | |
4 def read_conf(conffile): | |
5 '''read configure file (conf.txt)''' | |
6 ref_input=open(conffile) | |
7 refdict={} | |
8 for line in ref_input: | |
9 if not line.strip(): | |
10 continue | |
11 if '#' in line: | |
12 contline=line.split('#')[0] | |
13 content=contline.strip() | |
14 else: | |
15 content=line.strip() | |
16 if not content: | |
17 continue | |
18 if content.startswith('--'): | |
19 key=content | |
20 refdict[key]={} | |
21 else: | |
22 idx=content.index('=') | |
23 a=content[0:idx].strip() | |
24 b=content[(idx+1):].strip() | |
25 refdict[key][a]=b | |
26 ref_input.close() | |
27 return refdict | |
28 | |
29 def read_feature(featurefile,verbose=True): | |
30 '''Read exon/transcript/gene info and return bioclass obj''' | |
31 infile=open(featurefile) | |
32 txdb={} #keep track of all transcripts | |
33 exdb={} #keep track of all exons | |
34 genedb={} #keep track of all genes | |
35 i=0 | |
36 for line in infile: | |
37 i+=1 | |
38 if verbose: | |
39 if i%100000==0: | |
40 print '%d exon records loaded'%i | |
41 chr,start,end,tx,gene,strand,cat=line.split() | |
42 if cat != 'protein_coding': | |
43 continue | |
44 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
45 exdb[exon.name]=exon | |
46 if not txdb.has_key(tx): | |
47 txdb[tx]=bioclass.Transcript(tx,gene) | |
48 txdb[tx].add_exon(exon) | |
49 for txname in txdb: | |
50 t=txdb[txname] | |
51 if not genedb.has_key(t.gene): | |
52 genedb[t.gene]=bioclass.Gene(t.gene) | |
53 genedb[t.gene].add_transcript(t) | |
54 infile.close() | |
55 return (txdb,genedb) | |
56 | |
57 def read_feature_genes(featurefile,*args): | |
58 '''Read exon/transcript/gene info for spec genes and return bioclass obj''' | |
59 infile=open(featurefile) | |
60 txdb={} | |
61 exdb={} | |
62 genedb={} | |
63 for line in infile: | |
64 chr,start,end,tx,gene,strand,cat=line.split() | |
65 if gene not in args: | |
66 continue | |
67 if cat != 'protein_coding': | |
68 continue | |
69 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
70 exdb[exon.name]=exon | |
71 if not txdb.has_key(tx): | |
72 txdb[tx]=bioclass.Transcript(tx,gene) | |
73 txdb[tx].add_exon(exon) | |
74 for txname in txdb: | |
75 t=txdb[txname] | |
76 if not genedb.has_key(t.gene): | |
77 genedb[t.gene]=bioclass.Gene(t.gene) | |
78 genedb[t.gene].add_transcript(t) | |
79 res=[] | |
80 for g in args: | |
81 if genedb.has_key(g): | |
82 gobj=genedb[g] | |
83 else: | |
84 gobj=None | |
85 res.append(gobj) | |
86 infile.close() | |
87 return res | |
88 | |
89 def read_cds(cdsfile): | |
90 '''read CDS start/end for transcripts''' | |
91 infile=open(cdsfile) | |
92 tx_cds={} | |
93 for line in infile: | |
94 info=line.split() | |
95 tx=info[0] | |
96 cds_start,cds_end=int(info[3]),int(info[4]) | |
97 tx_cds[tx]=(cds_start,cds_end) | |
98 infile.close() | |
99 return tx_cds | |
100 | |
101 def read_tx_cat(txcatfile): | |
102 '''find primary transcript for each gene''' | |
103 tx_primary={} | |
104 tx_cat={} | |
105 infile=open(txcatfile) | |
106 for line in infile: | |
107 if not line.strip(): | |
108 continue | |
109 info=line.split() | |
110 if len(info) != 4: | |
111 continue | |
112 gene=info[2] | |
113 txid=info[1] | |
114 acc=int(info[3][-3:]) | |
115 if tx_cat.has_key(gene): | |
116 tx_cat[gene].append([txid,acc]) | |
117 else: | |
118 tx_cat[gene]=[[txid,acc]] | |
119 infile.close() | |
120 for item in tx_cat: | |
121 vv=tx_cat[item] | |
122 vv=sorted(vv,key=lambda x: x[1]) | |
123 tx_primary[item]=vv[0][0] | |
124 return tx_primary | |
125 | |
126 | |
127 if __name__=='__main__': | |
128 #The following lines are for module testing purpose only. | |
129 #They would not affect the program. | |
130 featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' | |
131 cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt' | |
132 txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt' | |
133 |