Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/parse_gft.py @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 import bioclass | |
| 2 | |
| 3 featurefile='/RIS/home/verhaakgroup/PRADA/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' | |
| 4 | |
| 5 infile=open('/RIS/home/verhaakgroup/PRADA/hg19broad/Homo_sapiens.GRCh37.64.gtf') | |
| 6 txdb={} #keep track of all transcripts | |
| 7 exdb={} #keep track of all exons | |
| 8 genedb={} #keep track of all genes | |
| 9 verbose=True | |
| 10 i=0 | |
| 11 for line in infile: | |
| 12 i+=1 | |
| 13 if verbose: | |
| 14 if i%100000==0: | |
| 15 print '%d features scanned'%i | |
| 16 info=line.strip().split('\t') | |
| 17 chr,cat,entity,start,end=info[0:5] | |
| 18 strand=info[6] | |
| 19 attributes=info[8] | |
| 20 if cat != 'protein_coding': | |
| 21 continue | |
| 22 attinfo=attributes.split(';') | |
| 23 atts=dict() | |
| 24 for item in attinfo: | |
| 25 item=item.strip() | |
| 26 if not item: | |
| 27 continue | |
| 28 tmp=item.split() | |
| 29 kk=tmp[0] | |
| 30 vv=tmp[1][1:-1] | |
| 31 atts[kk]=vv | |
| 32 tx=atts['transcript_id'] | |
| 33 gene=atts['gene_name'] | |
| 34 if entity=='exon': | |
| 35 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
| 36 exdb[exon.name]=exon | |
| 37 if not txdb.has_key(tx): | |
| 38 txdb[tx]=bioclass.Transcript(tx,gene) | |
| 39 txdb[tx].add_exon(exon) | |
| 40 for txname in txdb: | |
| 41 if not genedb.has_key(t.gene): | |
| 42 genedb[t.gene]=bioclass.Gene(t.gene) | |
| 43 genedb[t.gene].add_transcript(t) | |
| 44 | |
| 45 infile.close() | |
| 46 | |
| 47 | |
| 48 | |
| 49 def read_feature(featurefile,verbose=True): | |
| 50 '''Read exon/transcript/gene info and return bioclass obj''' | |
| 51 infile=open(featurefile) | |
| 52 txdb={} #keep track of all transcripts | |
| 53 exdb={} #keep track of all exons | |
| 54 genedb={} #keep track of all genes | |
| 55 i=0 | |
| 56 for line in infile: | |
| 57 i+=1 | |
| 58 if verbose: | |
| 59 if i%100000==0: | |
| 60 print '%d exon records loaded'%i | |
| 61 chr,start,end,tx,gene,strand,cat=line.split() | |
| 62 if cat != 'protein_coding': | |
| 63 continue | |
| 64 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
| 65 exdb[exon.name]=exon | |
| 66 if not txdb.has_key(tx): | |
| 67 txdb[tx]=bioclass.Transcript(tx,gene) | |
| 68 txdb[tx].add_exon(exon) | |
| 69 for txname in txdb: | |
| 70 t=txdb[txname] | |
| 71 if not genedb.has_key(t.gene): | |
| 72 genedb[t.gene]=bioclass.Gene(t.gene) | |
| 73 genedb[t.gene].add_transcript(t) | |
| 74 infile.close() | |
| 75 return (txdb,genedb) | |
| 76 | |
| 77 def read_feature_genes(featurefile,*args): | |
| 78 '''Read exon/transcript/gene info for spec genes and return bioclass obj''' | |
| 79 infile=open(featurefile) | |
| 80 txdb={} | |
| 81 exdb={} | |
| 82 genedb={} | |
| 83 for line in infile: | |
| 84 chr,start,end,tx,gene,strand,cat=line.split() | |
| 85 if gene not in args: | |
| 86 continue | |
| 87 if cat != 'protein_coding': | |
| 88 continue | |
| 89 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
| 90 exdb[exon.name]=exon | |
| 91 if not txdb.has_key(tx): | |
| 92 txdb[tx]=bioclass.Transcript(tx,gene) | |
| 93 txdb[tx].add_exon(exon) | |
| 94 for txname in txdb: | |
| 95 t=txdb[txname] | |
| 96 if not genedb.has_key(t.gene): | |
| 97 genedb[t.gene]=bioclass.Gene(t.gene) | |
| 98 genedb[t.gene].add_transcript(t) | |
| 99 res=[] | |
| 100 for g in args: | |
| 101 if genedb.has_key(g): | |
| 102 gobj=genedb[g] | |
| 103 else: | |
| 104 gobj=None | |
| 105 res.append(gobj) | |
| 106 infile.close() | |
| 107 return res | |
| 108 | |
| 109 if __name__=='__main__': | |
| 110 pass |
