| 0 | 1 #define IO functions. May add more gradually. | 
|  | 2 import bioclass | 
|  | 3 | 
|  | 4 def read_conf(conffile): | 
|  | 5     '''read configure file (conf.txt)''' | 
|  | 6     ref_input=open(conffile) | 
|  | 7     refdict={} | 
|  | 8     for line in ref_input: | 
|  | 9         if not line.strip(): | 
|  | 10             continue | 
|  | 11         if '#' in line: | 
|  | 12             contline=line.split('#')[0] | 
|  | 13             content=contline.strip() | 
|  | 14         else: | 
|  | 15             content=line.strip() | 
|  | 16         if not content: | 
|  | 17             continue | 
|  | 18         if content.startswith('--'): | 
|  | 19             key=content | 
|  | 20             refdict[key]={} | 
|  | 21         else: | 
|  | 22             idx=content.index('=') | 
|  | 23             a=content[0:idx].strip() | 
|  | 24             b=content[(idx+1):].strip() | 
|  | 25             refdict[key][a]=b | 
|  | 26     ref_input.close() | 
|  | 27     return refdict | 
|  | 28 | 
|  | 29 def read_feature(featurefile,verbose=True): | 
|  | 30     '''Read exon/transcript/gene info and return bioclass obj''' | 
|  | 31     infile=open(featurefile) | 
|  | 32     txdb={}    #keep track of all transcripts | 
|  | 33     exdb={}    #keep track of all exons | 
|  | 34     genedb={}  #keep track of all genes | 
|  | 35     i=0 | 
|  | 36     for line in infile: | 
|  | 37         i+=1 | 
|  | 38         if verbose: | 
|  | 39             if i%100000==0: | 
|  | 40                 print '%d exon records loaded'%i | 
|  | 41         chr,start,end,tx,gene,strand,cat=line.split() | 
|  | 42         if cat != 'protein_coding': | 
|  | 43             continue | 
|  | 44         exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | 
|  | 45         exdb[exon.name]=exon | 
|  | 46         if not txdb.has_key(tx): | 
|  | 47             txdb[tx]=bioclass.Transcript(tx,gene) | 
|  | 48         txdb[tx].add_exon(exon) | 
|  | 49     for txname in txdb: | 
|  | 50         t=txdb[txname] | 
|  | 51         if not genedb.has_key(t.gene): | 
|  | 52             genedb[t.gene]=bioclass.Gene(t.gene) | 
|  | 53         genedb[t.gene].add_transcript(t) | 
|  | 54     infile.close() | 
|  | 55     return (txdb,genedb) | 
|  | 56 | 
|  | 57 def read_feature_genes(featurefile,*args): | 
|  | 58     '''Read exon/transcript/gene info for spec genes and return bioclass obj''' | 
|  | 59     infile=open(featurefile) | 
|  | 60     txdb={} | 
|  | 61     exdb={} | 
|  | 62     genedb={} | 
|  | 63     for line in infile: | 
|  | 64         chr,start,end,tx,gene,strand,cat=line.split() | 
|  | 65         if gene not in args: | 
|  | 66             continue | 
|  | 67         if cat != 'protein_coding': | 
|  | 68             continue | 
|  | 69         exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) | 
|  | 70         exdb[exon.name]=exon | 
|  | 71         if not txdb.has_key(tx): | 
|  | 72             txdb[tx]=bioclass.Transcript(tx,gene) | 
|  | 73         txdb[tx].add_exon(exon) | 
|  | 74     for txname in txdb: | 
|  | 75         t=txdb[txname] | 
|  | 76         if not genedb.has_key(t.gene): | 
|  | 77             genedb[t.gene]=bioclass.Gene(t.gene) | 
|  | 78         genedb[t.gene].add_transcript(t) | 
|  | 79     res=[] | 
|  | 80     for g in args: | 
|  | 81         if genedb.has_key(g): | 
|  | 82             gobj=genedb[g] | 
|  | 83         else: | 
|  | 84             gobj=None | 
|  | 85         res.append(gobj) | 
|  | 86     infile.close() | 
|  | 87     return res | 
|  | 88 | 
|  | 89 def read_cds(cdsfile): | 
|  | 90     '''read CDS start/end for transcripts''' | 
|  | 91     infile=open(cdsfile) | 
|  | 92     tx_cds={} | 
|  | 93     for line in infile: | 
|  | 94         info=line.split() | 
|  | 95         tx=info[0] | 
|  | 96         cds_start,cds_end=int(info[3]),int(info[4]) | 
|  | 97         tx_cds[tx]=(cds_start,cds_end) | 
|  | 98     infile.close() | 
|  | 99     return tx_cds | 
|  | 100 | 
|  | 101 def read_tx_cat(txcatfile): | 
|  | 102     '''find primary transcript for each gene''' | 
|  | 103     tx_primary={} | 
|  | 104     tx_cat={} | 
|  | 105     infile=open(txcatfile) | 
|  | 106     for line in infile: | 
|  | 107         if not line.strip(): | 
|  | 108             continue | 
|  | 109         info=line.split() | 
|  | 110         if len(info) != 4: | 
|  | 111             continue | 
|  | 112         gene=info[2] | 
|  | 113         txid=info[1] | 
|  | 114         acc=int(info[3][-3:]) | 
|  | 115         if tx_cat.has_key(gene): | 
|  | 116             tx_cat[gene].append([txid,acc]) | 
|  | 117         else: | 
|  | 118             tx_cat[gene]=[[txid,acc]] | 
|  | 119     infile.close() | 
|  | 120     for item in tx_cat: | 
|  | 121         vv=tx_cat[item] | 
|  | 122         vv=sorted(vv,key=lambda x: x[1]) | 
|  | 123         tx_primary[item]=vv[0][0] | 
|  | 124     return tx_primary | 
|  | 125 | 
|  | 126 | 
|  | 127 if __name__=='__main__': | 
|  | 128     #The following lines are for module testing purpose only. | 
|  | 129     #They would not affect the program. | 
|  | 130     featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' | 
|  | 131     cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt' | 
|  | 132     txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt' | 
|  | 133 |