0
|
1 import bioclass
|
|
2
|
|
3 featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
|
|
4
|
|
5 infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf')
|
|
6 txdb={} #keep track of all transcripts
|
|
7 exdb={} #keep track of all exons
|
|
8 genedb={} #keep track of all genes
|
|
9 verbose=True
|
|
10 i=0
|
|
11 for line in infile:
|
|
12 i+=1
|
|
13 if verbose:
|
|
14 if i%100000==0:
|
|
15 print '%d features scanned'%i
|
|
16 info=line.strip().split('\t')
|
|
17 chr,cat,entity,start,end=info[0:5]
|
|
18 strand=info[6]
|
|
19 attributes=info[8]
|
|
20 if cat != 'protein_coding':
|
|
21 continue
|
|
22 attinfo=attributes.split(';')
|
|
23 atts=dict()
|
|
24 for item in attinfo:
|
|
25 item=item.strip()
|
|
26 if not item:
|
|
27 continue
|
|
28 tmp=item.split()
|
|
29 kk=tmp[0]
|
|
30 vv=tmp[1][1:-1]
|
|
31 atts[kk]=vv
|
|
32 tx=atts['transcript_id']
|
|
33 gene=atts['gene_name']
|
|
34 if entity=='exon':
|
|
35 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
|
|
36 exdb[exon.name]=exon
|
|
37 if not txdb.has_key(tx):
|
|
38 txdb[tx]=bioclass.Transcript(tx,gene)
|
|
39 txdb[tx].add_exon(exon)
|
|
40 for txname in txdb:
|
|
41 if not genedb.has_key(t.gene):
|
|
42 genedb[t.gene]=bioclass.Gene(t.gene)
|
|
43 genedb[t.gene].add_transcript(t)
|
|
44
|
|
45 infile.close()
|
|
46
|
|
47
|
|
48
|
|
49 def read_feature(featurefile,verbose=True):
|
|
50 '''Read exon/transcript/gene info and return bioclass obj'''
|
|
51 infile=open(featurefile)
|
|
52 txdb={} #keep track of all transcripts
|
|
53 exdb={} #keep track of all exons
|
|
54 genedb={} #keep track of all genes
|
|
55 i=0
|
|
56 for line in infile:
|
|
57 i+=1
|
|
58 if verbose:
|
|
59 if i%100000==0:
|
|
60 print '%d exon records loaded'%i
|
|
61 chr,start,end,tx,gene,strand,cat=line.split()
|
|
62 if cat != 'protein_coding':
|
|
63 continue
|
|
64 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
|
|
65 exdb[exon.name]=exon
|
|
66 if not txdb.has_key(tx):
|
|
67 txdb[tx]=bioclass.Transcript(tx,gene)
|
|
68 txdb[tx].add_exon(exon)
|
|
69 for txname in txdb:
|
|
70 t=txdb[txname]
|
|
71 if not genedb.has_key(t.gene):
|
|
72 genedb[t.gene]=bioclass.Gene(t.gene)
|
|
73 genedb[t.gene].add_transcript(t)
|
|
74 infile.close()
|
|
75 return (txdb,genedb)
|
|
76
|
|
77 def read_feature_genes(featurefile,*args):
|
|
78 '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
|
|
79 infile=open(featurefile)
|
|
80 txdb={}
|
|
81 exdb={}
|
|
82 genedb={}
|
|
83 for line in infile:
|
|
84 chr,start,end,tx,gene,strand,cat=line.split()
|
|
85 if gene not in args:
|
|
86 continue
|
|
87 if cat != 'protein_coding':
|
|
88 continue
|
|
89 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
|
|
90 exdb[exon.name]=exon
|
|
91 if not txdb.has_key(tx):
|
|
92 txdb[tx]=bioclass.Transcript(tx,gene)
|
|
93 txdb[tx].add_exon(exon)
|
|
94 for txname in txdb:
|
|
95 t=txdb[txname]
|
|
96 if not genedb.has_key(t.gene):
|
|
97 genedb[t.gene]=bioclass.Gene(t.gene)
|
|
98 genedb[t.gene].add_transcript(t)
|
|
99 res=[]
|
|
100 for g in args:
|
|
101 if genedb.has_key(g):
|
|
102 gobj=genedb[g]
|
|
103 else:
|
|
104 gobj=None
|
|
105 res.append(gobj)
|
|
106 infile.close()
|
|
107 return res
|
|
108
|
|
109 if __name__=='__main__':
|
|
110 pass
|