| 
0
 | 
     1 #define IO functions. May add more gradually. 
 | 
| 
 | 
     2 import bioclass
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 def read_conf(conffile):
 | 
| 
 | 
     5     '''read configure file (conf.txt)'''
 | 
| 
 | 
     6     ref_input=open(conffile)
 | 
| 
 | 
     7     refdict={}
 | 
| 
 | 
     8     for line in ref_input:
 | 
| 
 | 
     9         if not line.strip():
 | 
| 
 | 
    10             continue
 | 
| 
 | 
    11         if '#' in line:
 | 
| 
 | 
    12             contline=line.split('#')[0]
 | 
| 
 | 
    13             content=contline.strip()
 | 
| 
 | 
    14         else:
 | 
| 
 | 
    15             content=line.strip()
 | 
| 
 | 
    16         if not content:
 | 
| 
 | 
    17             continue
 | 
| 
 | 
    18         if content.startswith('--'):
 | 
| 
 | 
    19             key=content
 | 
| 
 | 
    20             refdict[key]={}
 | 
| 
 | 
    21         else:
 | 
| 
 | 
    22             idx=content.index('=')
 | 
| 
 | 
    23             a=content[0:idx].strip()
 | 
| 
 | 
    24             b=content[(idx+1):].strip()
 | 
| 
 | 
    25             refdict[key][a]=b
 | 
| 
 | 
    26     ref_input.close()
 | 
| 
 | 
    27     return refdict
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 def read_feature(featurefile,verbose=True):
 | 
| 
 | 
    30     '''Read exon/transcript/gene info and return bioclass obj'''
 | 
| 
 | 
    31     infile=open(featurefile)
 | 
| 
 | 
    32     txdb={}    #keep track of all transcripts
 | 
| 
 | 
    33     exdb={}    #keep track of all exons
 | 
| 
 | 
    34     genedb={}  #keep track of all genes
 | 
| 
 | 
    35     i=0
 | 
| 
 | 
    36     for line in infile:
 | 
| 
 | 
    37         i+=1
 | 
| 
 | 
    38         if verbose:
 | 
| 
 | 
    39             if i%100000==0:
 | 
| 
 | 
    40                 print '%d exon records loaded'%i
 | 
| 
 | 
    41         chr,start,end,tx,gene,strand,cat=line.split()
 | 
| 
 | 
    42         if cat != 'protein_coding':
 | 
| 
 | 
    43             continue
 | 
| 
 | 
    44         exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
 | 
| 
 | 
    45         exdb[exon.name]=exon
 | 
| 
 | 
    46         if not txdb.has_key(tx):
 | 
| 
 | 
    47             txdb[tx]=bioclass.Transcript(tx,gene)
 | 
| 
 | 
    48         txdb[tx].add_exon(exon)
 | 
| 
 | 
    49     for txname in txdb:
 | 
| 
 | 
    50         t=txdb[txname]
 | 
| 
 | 
    51         if not genedb.has_key(t.gene):
 | 
| 
 | 
    52             genedb[t.gene]=bioclass.Gene(t.gene)
 | 
| 
 | 
    53         genedb[t.gene].add_transcript(t)
 | 
| 
 | 
    54     infile.close()
 | 
| 
 | 
    55     return (txdb,genedb)
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 def read_feature_genes(featurefile,*args):
 | 
| 
 | 
    58     '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
 | 
| 
 | 
    59     infile=open(featurefile)
 | 
| 
 | 
    60     txdb={}
 | 
| 
 | 
    61     exdb={}
 | 
| 
 | 
    62     genedb={}
 | 
| 
 | 
    63     for line in infile:
 | 
| 
 | 
    64         chr,start,end,tx,gene,strand,cat=line.split()
 | 
| 
 | 
    65         if gene not in args:
 | 
| 
 | 
    66             continue
 | 
| 
 | 
    67         if cat != 'protein_coding':
 | 
| 
 | 
    68             continue
 | 
| 
 | 
    69         exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
 | 
| 
 | 
    70         exdb[exon.name]=exon
 | 
| 
 | 
    71         if not txdb.has_key(tx):
 | 
| 
 | 
    72             txdb[tx]=bioclass.Transcript(tx,gene)
 | 
| 
 | 
    73         txdb[tx].add_exon(exon)
 | 
| 
 | 
    74     for txname in txdb:
 | 
| 
 | 
    75         t=txdb[txname]
 | 
| 
 | 
    76         if not genedb.has_key(t.gene):
 | 
| 
 | 
    77             genedb[t.gene]=bioclass.Gene(t.gene)
 | 
| 
 | 
    78         genedb[t.gene].add_transcript(t)
 | 
| 
 | 
    79     res=[]
 | 
| 
 | 
    80     for g in args:
 | 
| 
 | 
    81         if genedb.has_key(g):
 | 
| 
 | 
    82             gobj=genedb[g]
 | 
| 
 | 
    83         else:
 | 
| 
 | 
    84             gobj=None
 | 
| 
 | 
    85         res.append(gobj)
 | 
| 
 | 
    86     infile.close()
 | 
| 
 | 
    87     return res
 | 
| 
 | 
    88 
 | 
| 
 | 
    89 def read_cds(cdsfile):
 | 
| 
 | 
    90     '''read CDS start/end for transcripts'''
 | 
| 
 | 
    91     infile=open(cdsfile)
 | 
| 
 | 
    92     tx_cds={}
 | 
| 
 | 
    93     for line in infile:
 | 
| 
 | 
    94         info=line.split()
 | 
| 
 | 
    95         tx=info[0]
 | 
| 
 | 
    96         cds_start,cds_end=int(info[3]),int(info[4])
 | 
| 
 | 
    97         tx_cds[tx]=(cds_start,cds_end)
 | 
| 
 | 
    98     infile.close()
 | 
| 
 | 
    99     return tx_cds
 | 
| 
 | 
   100 
 | 
| 
 | 
   101 def read_tx_cat(txcatfile):
 | 
| 
 | 
   102     '''find primary transcript for each gene'''
 | 
| 
 | 
   103     tx_primary={}
 | 
| 
 | 
   104     tx_cat={}
 | 
| 
 | 
   105     infile=open(txcatfile)
 | 
| 
 | 
   106     for line in infile:
 | 
| 
 | 
   107         if not line.strip():
 | 
| 
 | 
   108             continue
 | 
| 
 | 
   109         info=line.split()
 | 
| 
 | 
   110         if len(info) != 4:
 | 
| 
 | 
   111             continue
 | 
| 
 | 
   112         gene=info[2]
 | 
| 
 | 
   113         txid=info[1]
 | 
| 
 | 
   114         acc=int(info[3][-3:])
 | 
| 
 | 
   115         if tx_cat.has_key(gene):
 | 
| 
 | 
   116             tx_cat[gene].append([txid,acc])
 | 
| 
 | 
   117         else:
 | 
| 
 | 
   118             tx_cat[gene]=[[txid,acc]]
 | 
| 
 | 
   119     infile.close()
 | 
| 
 | 
   120     for item in tx_cat:
 | 
| 
 | 
   121         vv=tx_cat[item]
 | 
| 
 | 
   122         vv=sorted(vv,key=lambda x: x[1])
 | 
| 
 | 
   123         tx_primary[item]=vv[0][0]
 | 
| 
 | 
   124     return tx_primary
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 
 | 
| 
 | 
   127 if __name__=='__main__':
 | 
| 
 | 
   128     #The following lines are for module testing purpose only.
 | 
| 
 | 
   129     #They would not affect the program.
 | 
| 
 | 
   130     featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
 | 
| 
 | 
   131     cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
 | 
| 
 | 
   132     txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'
 | 
| 
 | 
   133     
 |