comparison pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 import bioclass
2
3 featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
4
5 infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf')
6 txdb={} #keep track of all transcripts
7 exdb={} #keep track of all exons
8 genedb={} #keep track of all genes
9 verbose=True
10 i=0
11 for line in infile:
12 i+=1
13 if verbose:
14 if i%100000==0:
15 print '%d features scanned'%i
16 info=line.strip().split('\t')
17 chr,cat,entity,start,end=info[0:5]
18 strand=info[6]
19 attributes=info[8]
20 if cat != 'protein_coding':
21 continue
22 attinfo=attributes.split(';')
23 atts=dict()
24 for item in attinfo:
25 item=item.strip()
26 if not item:
27 continue
28 tmp=item.split()
29 kk=tmp[0]
30 vv=tmp[1][1:-1]
31 atts[kk]=vv
32 tx=atts['transcript_id']
33 gene=atts['gene_name']
34 if entity=='exon':
35 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
36 exdb[exon.name]=exon
37 if not txdb.has_key(tx):
38 txdb[tx]=bioclass.Transcript(tx,gene)
39 txdb[tx].add_exon(exon)
40 for txname in txdb:
41 if not genedb.has_key(t.gene):
42 genedb[t.gene]=bioclass.Gene(t.gene)
43 genedb[t.gene].add_transcript(t)
44
45 infile.close()
46
47
48
49 def read_feature(featurefile,verbose=True):
50 '''Read exon/transcript/gene info and return bioclass obj'''
51 infile=open(featurefile)
52 txdb={} #keep track of all transcripts
53 exdb={} #keep track of all exons
54 genedb={} #keep track of all genes
55 i=0
56 for line in infile:
57 i+=1
58 if verbose:
59 if i%100000==0:
60 print '%d exon records loaded'%i
61 chr,start,end,tx,gene,strand,cat=line.split()
62 if cat != 'protein_coding':
63 continue
64 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
65 exdb[exon.name]=exon
66 if not txdb.has_key(tx):
67 txdb[tx]=bioclass.Transcript(tx,gene)
68 txdb[tx].add_exon(exon)
69 for txname in txdb:
70 t=txdb[txname]
71 if not genedb.has_key(t.gene):
72 genedb[t.gene]=bioclass.Gene(t.gene)
73 genedb[t.gene].add_transcript(t)
74 infile.close()
75 return (txdb,genedb)
76
77 def read_feature_genes(featurefile,*args):
78 '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
79 infile=open(featurefile)
80 txdb={}
81 exdb={}
82 genedb={}
83 for line in infile:
84 chr,start,end,tx,gene,strand,cat=line.split()
85 if gene not in args:
86 continue
87 if cat != 'protein_coding':
88 continue
89 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
90 exdb[exon.name]=exon
91 if not txdb.has_key(tx):
92 txdb[tx]=bioclass.Transcript(tx,gene)
93 txdb[tx].add_exon(exon)
94 for txname in txdb:
95 t=txdb[txname]
96 if not genedb.has_key(t.gene):
97 genedb[t.gene]=bioclass.Gene(t.gene)
98 genedb[t.gene].add_transcript(t)
99 res=[]
100 for g in args:
101 if genedb.has_key(g):
102 gobj=genedb[g]
103 else:
104 gobj=None
105 res.append(gobj)
106 infile.close()
107 return res
108
109 if __name__=='__main__':
110 pass