Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 import bioclass | |
2 | |
3 featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' | |
4 | |
5 infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf') | |
6 txdb={} #keep track of all transcripts | |
7 exdb={} #keep track of all exons | |
8 genedb={} #keep track of all genes | |
9 verbose=True | |
10 i=0 | |
11 for line in infile: | |
12 i+=1 | |
13 if verbose: | |
14 if i%100000==0: | |
15 print '%d features scanned'%i | |
16 info=line.strip().split('\t') | |
17 chr,cat,entity,start,end=info[0:5] | |
18 strand=info[6] | |
19 attributes=info[8] | |
20 if cat != 'protein_coding': | |
21 continue | |
22 attinfo=attributes.split(';') | |
23 atts=dict() | |
24 for item in attinfo: | |
25 item=item.strip() | |
26 if not item: | |
27 continue | |
28 tmp=item.split() | |
29 kk=tmp[0] | |
30 vv=tmp[1][1:-1] | |
31 atts[kk]=vv | |
32 tx=atts['transcript_id'] | |
33 gene=atts['gene_name'] | |
34 if entity=='exon': | |
35 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
36 exdb[exon.name]=exon | |
37 if not txdb.has_key(tx): | |
38 txdb[tx]=bioclass.Transcript(tx,gene) | |
39 txdb[tx].add_exon(exon) | |
40 for txname in txdb: | |
41 if not genedb.has_key(t.gene): | |
42 genedb[t.gene]=bioclass.Gene(t.gene) | |
43 genedb[t.gene].add_transcript(t) | |
44 | |
45 infile.close() | |
46 | |
47 | |
48 | |
49 def read_feature(featurefile,verbose=True): | |
50 '''Read exon/transcript/gene info and return bioclass obj''' | |
51 infile=open(featurefile) | |
52 txdb={} #keep track of all transcripts | |
53 exdb={} #keep track of all exons | |
54 genedb={} #keep track of all genes | |
55 i=0 | |
56 for line in infile: | |
57 i+=1 | |
58 if verbose: | |
59 if i%100000==0: | |
60 print '%d exon records loaded'%i | |
61 chr,start,end,tx,gene,strand,cat=line.split() | |
62 if cat != 'protein_coding': | |
63 continue | |
64 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
65 exdb[exon.name]=exon | |
66 if not txdb.has_key(tx): | |
67 txdb[tx]=bioclass.Transcript(tx,gene) | |
68 txdb[tx].add_exon(exon) | |
69 for txname in txdb: | |
70 t=txdb[txname] | |
71 if not genedb.has_key(t.gene): | |
72 genedb[t.gene]=bioclass.Gene(t.gene) | |
73 genedb[t.gene].add_transcript(t) | |
74 infile.close() | |
75 return (txdb,genedb) | |
76 | |
77 def read_feature_genes(featurefile,*args): | |
78 '''Read exon/transcript/gene info for spec genes and return bioclass obj''' | |
79 infile=open(featurefile) | |
80 txdb={} | |
81 exdb={} | |
82 genedb={} | |
83 for line in infile: | |
84 chr,start,end,tx,gene,strand,cat=line.split() | |
85 if gene not in args: | |
86 continue | |
87 if cat != 'protein_coding': | |
88 continue | |
89 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
90 exdb[exon.name]=exon | |
91 if not txdb.has_key(tx): | |
92 txdb[tx]=bioclass.Transcript(tx,gene) | |
93 txdb[tx].add_exon(exon) | |
94 for txname in txdb: | |
95 t=txdb[txname] | |
96 if not genedb.has_key(t.gene): | |
97 genedb[t.gene]=bioclass.Gene(t.gene) | |
98 genedb[t.gene].add_transcript(t) | |
99 res=[] | |
100 for g in args: | |
101 if genedb.has_key(g): | |
102 gobj=genedb[g] | |
103 else: | |
104 gobj=None | |
105 res.append(gobj) | |
106 infile.close() | |
107 return res | |
108 | |
109 if __name__=='__main__': | |
110 pass |