Mercurial > repos > miller-lab > genome_diversity
diff rank_pathways_pct.py @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Mon, 15 Jul 2013 10:47:35 -0400 |
parents | 95a05c1ef5d5 |
children |
line wrap: on
line diff
--- a/rank_pathways_pct.py Mon Jun 03 12:29:29 2013 -0400 +++ b/rank_pathways_pct.py Mon Jul 15 10:47:35 2013 -0400 @@ -1,12 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# calcfreq.py +# KEGGFisher.py # -# Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu> +# Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> # # This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the pathways of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # @@ -20,119 +20,187 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. -import argparse,os,sys +import argparse +import os +import sys +from fisher import pvalue as fisher from decimal import Decimal,getcontext -from LocationFile import LocationFile -from fisher import pvalue +from math import lgamma,exp,factorial -#method to rank the the pthways by mut. freq. -def rankd(ltfreqs): - ordvals=sorted(ltfreqs)#sort and reverse freqs. +def binProb(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG): + """ + Returns binomial probability. + """ + def comb(CntKEGG_All,k): + return factorial(CntKEGG_All) / Decimal(str(factorial(k)*factorial(CntKEGG_All-k))) + probLow = 0 + for k in range(0, SAPs_KEGG+1): + cp=Decimal(str(comb(CntKEGG_All,k))) + bp=Decimal(str(pKEGG**k)) + dp=Decimal(str(1.0-pKEGG))**Decimal(str(CntKEGG_All-k)) + probLow+=cp*bp*dp + #~ + probHigh = 0 + for k in range(int(SAPs_KEGG),CntKEGG_All+1): + cp=Decimal(str(comb(CntKEGG_All,k))) + bp=Decimal(str(pKEGG**k)) + dp=Decimal(str(1.0-pKEGG))**Decimal(str(CntKEGG_All-k)) + probHigh+=cp*bp*dp + return probLow,probHigh + +def gauss_hypergeom(X, CntKEGG_All, SAPs_all, totalSAPs): + CntKEGG_All,SAPs_all,totalSAPs + """ + Returns the probability of drawing X successes of SAPs_all marked items + in CntKEGG_All draws from a bin of totalSAPs total items + """ + def logchoose(ni, ki): + try: + lgn1 = lgamma(ni+1) + lgk1 = lgamma(ki+1) + lgnk1 = lgamma(ni-ki+1) + except ValueError: + raise ValueError + return lgn1 - (lgnk1 + lgk1) #~ - outrnk=[] - tmpFreq0,tmpCount,pval,tmpPthw=ordvals.pop()#the highest possible value - crank=1 - outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),str(pval),tmpPthw])) - totalnvals=len(ordvals) + r1 = logchoose(SAPs_all, X) + try: + r2 = logchoose(totalSAPs-SAPs_all, CntKEGG_All-X) + except ValueError: + return 0 + r3 = logchoose(totalSAPs,CntKEGG_All) + return exp(r1 + r2 - r3) + +def hypergeo_sf(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG): + """ + Runs Hypergeometric probability test + """ + s = 0 + t=0 + for i in range(SAPs_KEGG,min(SAPs_all,CntKEGG_All)+1): + s += max(gauss_hypergeom(i,CntKEGG_All,SAPs_all,totalSAPs), 0.0) + for i in range(0, SAPs_KEGG+1): + t += max(gauss_hypergeom(i,CntKEGG_All,SAPs_all,totalSAPs), 0.0) + return min(max(t,0.0), 1),min(max(s,0.0), 1) + +def fisherexct(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG): + """ + Runs Fisher's exact test + """ + ftest=fisher(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all) + probLow,probHigh=ftest.left_tail,ftest.right_tail + return probLow,probHigh + +def rtrnKEGGcENSEMBLc(inBckgrndfile,columnENSEMBLTBckgrnd,columnKEGGBckgrnd): + """ + """ + dKEGGTENSEMBLT={} + for eachl in open(inBckgrndfile,'r'): + if eachl.strip(): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTBckgrnd] + KEGGTs=set(eachl.splitlines()[0].split('\t')[columnKEGGBckgrnd].split('.')) + KEGGTs=KEGGTs.difference(set(['','U','N'])) + for KEGGT in KEGGTs: + try: + dKEGGTENSEMBLT[KEGGT].add(ENSEMBLT) + except: + dKEGGTENSEMBLT[KEGGT]=set([ENSEMBLT]) + ENSEMBLTGinKEGG=set.union(*dKEGGTENSEMBLT.values()) + return dKEGGTENSEMBLT,ENSEMBLTGinKEGG + +def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinKEGG): + """ + returns a set of the ENSEMBLT codes present in the input list and + in the KEGG file + """ + sENSEMBLTSAPsinKEGG=set() + for eachl in open(inSAPsfile,'r'): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] + if ENSEMBLT in ENSEMBLTGinKEGG: + sENSEMBLTSAPsinKEGG.add(ENSEMBLT) + return sENSEMBLTSAPsinKEGG + +def rtrnCounts(dKEGGTENSEMBLT,ENSEMBLTGinKEGG,sENSEMBLTSAPsinKEGG,statsTest): + """ + returns a list of the ENSEMBLT codes present in the input list and + in the KEGG file. The pathways in this list are: 'Go Term','# Genes in + the KEGG Term','# Genes in the list and in the KEGG Term','Enrichement + of the KEGG Term for genes in the input list','Genes in the input list + present in the KEGG term' + """ + totalSAPs=len(ENSEMBLTGinKEGG) + SAPs_all=len(sENSEMBLTSAPsinKEGG) + NoSAPs_all=totalSAPs-SAPs_all + pKEGG=SAPs_all/float(totalSAPs) + #~ + lp=len(dKEGGTENSEMBLT) cnt=0 - while totalnvals>cnt: + #~ + if statsTest=='fisher': + ptest=fisherexct + elif statsTest=='hypergeometric': + ptest=hypergeo_sf + elif statsTest=='binomial': + ptest=binProb + #~ + ltfreqs=[] + for echKEGGT in dKEGGTENSEMBLT: cnt+=1 - tmpFreq,tmpCount,pval,tmpPthw=ordvals.pop() - if tmpFreq!=tmpFreq0: - crank=len(outrnk)+1 - tmpFreq0=tmpFreq - outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),str(pval),tmpPthw])) - return outrnk - + CntKEGG_All=len(dKEGGTENSEMBLT[echKEGGT]) + SAPs_KEGG=len(dKEGGTENSEMBLT[echKEGGT].intersection(sENSEMBLTSAPsinKEGG)) + NoSAPs_KEGG=CntKEGG_All-SAPs_KEGG + probLow,probHigh=ptest(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG) + ltfreqs.append([(SAPs_KEGG/Decimal(CntKEGG_All)),SAPs_KEGG,probLow,probHigh,echKEGGT]) + #~ + ltfreqs.sort() + ltfreqs.reverse() + outl=[] + cper,crank=Decimal('2'),0 + #~ + getcontext().prec=2#set 2 decimal places + for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs: + if perc<cper: + crank+=1 + cper=perc + outl.append('\t'.join([str(cnt_go),str(Decimal(perc)*Decimal('1.0')),str(crank),str(Decimal(pvalLow)*Decimal('1.0')),str(Decimal(pvalHigh)*Decimal('1.0')),goTerm])) + #~ + return outl + def main(): - parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.') - parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') - parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the count of genes in the list, Column 2 is the percentage of the pathway genes present on the list. Column 3 is the rank based on column 2') - parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') - parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') - parser.add_argument('--loc_file',metavar='location file',type=str,help='location file') - parser.add_argument('--species',metavar='species',type=str,help='species') - #~Open arguments - class C(object): - pass - fulargs=C() - parser.parse_args(sys.argv[1:],namespace=fulargs) - #test input vars - inputf,outputf,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn - locf,species=fulargs.loc_file,fulargs.species - #make a dictionary of valid genes - posKEGGclmn-=1 - Kgeneposcolmn-=1 - dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set(x.split('\t')[posKEGGclmn].split('.'))) for x in open(inputf).read().splitlines()[1:] if x.split('\t')[posKEGGclmn] not in set(['U','N'])]) - for u in ['U','N']: - try: - a=dKEGGcPthws.pop(u) - except: - pass - getcontext().prec=2#set 2 decimal places - sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) - while True:#to correct names with more than one gene - try: - mgenes=sdGenes.pop() - pthwsAssotd=dKEGGcPthws.pop(mgenes) - mgenes=mgenes.split('.') - for eachg in mgenes: - dKEGGcPthws[eachg]=pthwsAssotd - except: - break - #~ Count genes + #~ + parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) + parser.add_argument('--inBckgrndfile',metavar='input TXT file',type=str,help='the input file with the background table in txt format.',required=True) + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) + parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) + parser.add_argument('--columnENSEMBLTBckgrnd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the background file.',required=True) + parser.add_argument('--columnKEGGBckgrnd',metavar='column number',type=int,help='column with the KEGG pathways in the background file.',required=True) + parser.add_argument('--statsTest',metavar='input TXT file',type=str,help='statistical test to compare KEGG pathways (i.e. fisher, hypergeometric, binomial).',required=True) + + args = parser.parse_args() - location_file = LocationFile(locf) - prefix, kxml_dir_path, dict_file = location_file.get_values(species) - dPthContsTotls = {} - try: - with open(dict_file) as fh: - for line in fh: - line = line.rstrip('\r\n') - value, key = line.split('\t') - dPthContsTotls[key] = int(value) - except IOError, err: - print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror) - sys.exit(1) - - dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes - sdGenes=set(dKEGGcPthws.keys())#list of all genes - cntGens=0 - ltGens=len(sdGenes) - while cntGens<ltGens: - cGen=sdGenes.pop() - sKEGGcPthws=dKEGGcPthws.pop(cGen) - for eachP in sKEGGcPthws: - if eachP!='N': - if eachP in dPthContsTmp: - dPthContsTmp[eachP]+=1 - else: - print >> sys.stderr, "Error: pathway not found in database: '{0}'".format(eachP) - sys.exit(1) - cntGens+=1 - #~ Calculate Freqs. - ltfreqs=[] - cntAllKEGGinGnm=sum(dPthContsTotls.values()) - cntListKEGGinGnm=sum(dPthContsTmp.values()) - cntAllKEGGNOTinGnm=cntAllKEGGinGnm-cntListKEGGinGnm - for pthw in dPthContsTotls: - cntAllKEGGinpthw=Decimal(dPthContsTotls[pthw]) - try: - cntListKEGGinpthw=Decimal(dPthContsTmp[pthw]) - except: - cntListKEGGinpthw=Decimal('0') - cntAllKEGGNOTinpthw=cntAllKEGGinpthw-cntListKEGGinpthw - pval=pvalue(cntListKEGGinpthw,cntAllKEGGNOTinpthw,cntListKEGGinGnm,cntAllKEGGNOTinGnm) - - ltfreqs.append([(cntListKEGGinpthw/cntAllKEGGinpthw),cntListKEGGinpthw,Decimal(str(pval.two_tail))*1,pthw]) - #~ ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls] - tabllfreqs='\n'.join(rankd(ltfreqs)) - salef=open(outputf,'w') - salef.write(tabllfreqs) - salef.close() + inSAPsfile = args.input + inBckgrndfile = args.inBckgrndfile + saleKEGGPCount = args.output + columnENSEMBLT = args.columnENSEMBLT + columnENSEMBLTBckgrnd = args.columnENSEMBLTBckgrnd + columnKEGGBckgrnd = args.columnKEGGBckgrnd + statsTest = args.statsTest + columnENSEMBLT-=1 + columnENSEMBLTBckgrnd-=1 + columnKEGGBckgrnd=-1 + #~ + dKEGGTENSEMBLT,ENSEMBLTGinKEGG=rtrnKEGGcENSEMBLc(inBckgrndfile,columnENSEMBLTBckgrnd,columnKEGGBckgrnd) + sENSEMBLTSAPsinKEGG=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinKEGG) + outl=rtrnCounts(dKEGGTENSEMBLT,ENSEMBLTGinKEGG,sENSEMBLTSAPsinKEGG,statsTest) + #~ + saleKEGGPCount=open(saleKEGGPCount,'w') + saleKEGGPCount.write('\n'.join(outl)) + saleKEGGPCount.close() + #~ return 0 - if __name__ == '__main__': main() +