Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #define IO functions. May add more gradually. | |
| 2 import bioclass | |
| 3 | |
| 4 def read_conf(conffile): | |
| 5 '''read configure file (conf.txt)''' | |
| 6 ref_input=open(conffile) | |
| 7 refdict={} | |
| 8 for line in ref_input: | |
| 9 if not line.strip(): | |
| 10 continue | |
| 11 if '#' in line: | |
| 12 contline=line.split('#')[0] | |
| 13 content=contline.strip() | |
| 14 else: | |
| 15 content=line.strip() | |
| 16 if not content: | |
| 17 continue | |
| 18 if content.startswith('--'): | |
| 19 key=content | |
| 20 refdict[key]={} | |
| 21 else: | |
| 22 idx=content.index('=') | |
| 23 a=content[0:idx].strip() | |
| 24 b=content[(idx+1):].strip() | |
| 25 refdict[key][a]=b | |
| 26 ref_input.close() | |
| 27 return refdict | |
| 28 | |
| 29 def read_feature(featurefile,verbose=True): | |
| 30 '''Read exon/transcript/gene info and return bioclass obj''' | |
| 31 infile=open(featurefile) | |
| 32 txdb={} #keep track of all transcripts | |
| 33 exdb={} #keep track of all exons | |
| 34 genedb={} #keep track of all genes | |
| 35 i=0 | |
| 36 for line in infile: | |
| 37 i+=1 | |
| 38 if verbose: | |
| 39 if i%100000==0: | |
| 40 print '%d exon records loaded'%i | |
| 41 chr,start,end,tx,gene,strand,cat=line.split() | |
| 42 if cat != 'protein_coding': | |
| 43 continue | |
| 44 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
| 45 exdb[exon.name]=exon | |
| 46 if not txdb.has_key(tx): | |
| 47 txdb[tx]=bioclass.Transcript(tx,gene) | |
| 48 txdb[tx].add_exon(exon) | |
| 49 for txname in txdb: | |
| 50 t=txdb[txname] | |
| 51 if not genedb.has_key(t.gene): | |
| 52 genedb[t.gene]=bioclass.Gene(t.gene) | |
| 53 genedb[t.gene].add_transcript(t) | |
| 54 infile.close() | |
| 55 return (txdb,genedb) | |
| 56 | |
| 57 def read_feature_genes(featurefile,*args): | |
| 58 '''Read exon/transcript/gene info for spec genes and return bioclass obj''' | |
| 59 infile=open(featurefile) | |
| 60 txdb={} | |
| 61 exdb={} | |
| 62 genedb={} | |
| 63 for line in infile: | |
| 64 chr,start,end,tx,gene,strand,cat=line.split() | |
| 65 if gene not in args: | |
| 66 continue | |
| 67 if cat != 'protein_coding': | |
| 68 continue | |
| 69 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | |
| 70 exdb[exon.name]=exon | |
| 71 if not txdb.has_key(tx): | |
| 72 txdb[tx]=bioclass.Transcript(tx,gene) | |
| 73 txdb[tx].add_exon(exon) | |
| 74 for txname in txdb: | |
| 75 t=txdb[txname] | |
| 76 if not genedb.has_key(t.gene): | |
| 77 genedb[t.gene]=bioclass.Gene(t.gene) | |
| 78 genedb[t.gene].add_transcript(t) | |
| 79 res=[] | |
| 80 for g in args: | |
| 81 if genedb.has_key(g): | |
| 82 gobj=genedb[g] | |
| 83 else: | |
| 84 gobj=None | |
| 85 res.append(gobj) | |
| 86 infile.close() | |
| 87 return res | |
| 88 | |
| 89 def read_cds(cdsfile): | |
| 90 '''read CDS start/end for transcripts''' | |
| 91 infile=open(cdsfile) | |
| 92 tx_cds={} | |
| 93 for line in infile: | |
| 94 info=line.split() | |
| 95 tx=info[0] | |
| 96 cds_start,cds_end=int(info[3]),int(info[4]) | |
| 97 tx_cds[tx]=(cds_start,cds_end) | |
| 98 infile.close() | |
| 99 return tx_cds | |
| 100 | |
| 101 def read_tx_cat(txcatfile): | |
| 102 '''find primary transcript for each gene''' | |
| 103 tx_primary={} | |
| 104 tx_cat={} | |
| 105 infile=open(txcatfile) | |
| 106 for line in infile: | |
| 107 if not line.strip(): | |
| 108 continue | |
| 109 info=line.split() | |
| 110 if len(info) != 4: | |
| 111 continue | |
| 112 gene=info[2] | |
| 113 txid=info[1] | |
| 114 acc=int(info[3][-3:]) | |
| 115 if tx_cat.has_key(gene): | |
| 116 tx_cat[gene].append([txid,acc]) | |
| 117 else: | |
| 118 tx_cat[gene]=[[txid,acc]] | |
| 119 infile.close() | |
| 120 for item in tx_cat: | |
| 121 vv=tx_cat[item] | |
| 122 vv=sorted(vv,key=lambda x: x[1]) | |
| 123 tx_primary[item]=vv[0][0] | |
| 124 return tx_primary | |
| 125 | |
| 126 | |
| 127 if __name__=='__main__': | |
| 128 #The following lines are for module testing purpose only. | |
| 129 #They would not affect the program. | |
| 130 featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' | |
| 131 cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt' | |
| 132 txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt' | |
| 133 |
