annotate pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 import bioclass
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 txdb={} #keep track of all transcripts
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 exdb={} #keep track of all exons
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 genedb={} #keep track of all genes
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 verbose=True
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 i=0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 i+=1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 if verbose:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 if i%100000==0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 print '%d features scanned'%i
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 info=line.strip().split('\t')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 chr,cat,entity,start,end=info[0:5]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 strand=info[6]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 attributes=info[8]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 if cat != 'protein_coding':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 attinfo=attributes.split(';')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 atts=dict()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 for item in attinfo:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 item=item.strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 if not item:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 tmp=item.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 kk=tmp[0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 vv=tmp[1][1:-1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 atts[kk]=vv
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 tx=atts['transcript_id']
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 gene=atts['gene_name']
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 if entity=='exon':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 exdb[exon.name]=exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 if not txdb.has_key(tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 txdb[tx]=bioclass.Transcript(tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 txdb[tx].add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 for txname in txdb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 if not genedb.has_key(t.gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 genedb[t.gene]=bioclass.Gene(t.gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 genedb[t.gene].add_transcript(t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 def read_feature(featurefile,verbose=True):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 '''Read exon/transcript/gene info and return bioclass obj'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 infile=open(featurefile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 txdb={} #keep track of all transcripts
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 exdb={} #keep track of all exons
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 genedb={} #keep track of all genes
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 i=0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 i+=1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 if verbose:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 if i%100000==0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 print '%d exon records loaded'%i
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 chr,start,end,tx,gene,strand,cat=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 if cat != 'protein_coding':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 exdb[exon.name]=exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 if not txdb.has_key(tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 txdb[tx]=bioclass.Transcript(tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 txdb[tx].add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 for txname in txdb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 t=txdb[txname]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 if not genedb.has_key(t.gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 genedb[t.gene]=bioclass.Gene(t.gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 genedb[t.gene].add_transcript(t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 return (txdb,genedb)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 def read_feature_genes(featurefile,*args):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 infile=open(featurefile)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 txdb={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 exdb={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 genedb={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 chr,start,end,tx,gene,strand,cat=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 if gene not in args:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 if cat != 'protein_coding':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 exdb[exon.name]=exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 if not txdb.has_key(tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 txdb[tx]=bioclass.Transcript(tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 txdb[tx].add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 for txname in txdb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 t=txdb[txname]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 if not genedb.has_key(t.gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 genedb[t.gene]=bioclass.Gene(t.gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 genedb[t.gene].add_transcript(t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 res=[]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 for g in args:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 if genedb.has_key(g):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 gobj=genedb[g]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 gobj=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 res.append(gobj)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 return res
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 if __name__=='__main__':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 pass