Mercurial > repos > miller-lab > genome_diversity
view rank_pathways_pct.py @ 32:03c22b722882
remove BeautifulSoup dependency
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:54:23 -0400 |
parents | 8997f2ca8c7a |
children |
line wrap: on
line source
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#       KEGGFisher.py
#
#       Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
#
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.

import argparse
import os
import sys
from decimal import Decimal, getcontext, localcontext
from math import exp, factorial, lgamma

try:
    from fisher import pvalue as fisher
except ImportError:
    # Third-party package, only needed for --statsTest fisher.  The other
    # tests must keep working when it is not installed.
    fisher = None


def binProb(SAPs_KEGG, NoSAPs_KEGG, SAPs_all, NoSAPs_all, CntKEGG_All, totalSAPs, pKEGG):
    """Return the lower and upper cumulative binomial probabilities.

    The distribution has ``CntKEGG_All`` trials with success probability
    ``pKEGG``.  Only ``SAPs_KEGG``, ``CntKEGG_All`` and ``pKEGG`` are used;
    the remaining parameters exist so that every statistical test shares
    one call signature.

    Returns:
        (probLow, probHigh) as ``Decimal`` values, where
        probLow  = P(X <= SAPs_KEGG) and probHigh = P(X >= SAPs_KEGG).
    """
    n = CntKEGG_All
    p = Decimal(str(pKEGG))
    q = Decimal(str(1.0 - pKEGG))

    def power(base, exponent):
        # Decimal signals InvalidOperation for 0 ** 0, which is hit whenever
        # pKEGG is exactly 0.0 or 1.0; the binomial formula needs it to be 1.
        return Decimal(1) if exponent == 0 else base ** exponent

    # Probability mass for k = 0..n.  The coefficient C(n, k) is carried
    # forward as an exact integer instead of recomputing three factorials
    # per term.
    terms = []
    coeff = 1
    for k in range(n + 1):
        terms.append(coeff * power(p, k) * power(q, n - k))
        coeff = coeff * (n - k) // (k + 1)

    low_end = max(SAPs_KEGG + 1, 0)
    high_start = max(int(SAPs_KEGG), 0)
    probLow = sum(terms[:low_end], Decimal(0))
    probHigh = sum(terms[high_start:], Decimal(0))
    return probLow, probHigh


def gauss_hypergeom(X, CntKEGG_All, SAPs_all, totalSAPs):
    """Return the hypergeometric probability of exactly ``X`` successes.

    This is the probability of drawing ``X`` of the ``SAPs_all`` marked
    items in ``CntKEGG_All`` draws from a bin of ``totalSAPs`` items,
    evaluated through log-gamma so that large counts do not overflow.

    Returns 0.0 when the requested combination is impossible (``lgamma``
    raises ``ValueError`` at non-positive integers, which is what an
    out-of-range count produces).
    """
    def logchoose(ni, ki):
        # log of the binomial coefficient C(ni, ki)
        return lgamma(ni + 1) - (lgamma(ni - ki + 1) + lgamma(ki + 1))

    try:
        r1 = logchoose(SAPs_all, X)
        r2 = logchoose(totalSAPs - SAPs_all, CntKEGG_All - X)
        r3 = logchoose(totalSAPs, CntKEGG_All)
    except ValueError:
        return 0.0
    return exp(r1 + r2 - r3)


def hypergeo_sf(SAPs_KEGG, NoSAPs_KEGG, SAPs_all, NoSAPs_all, CntKEGG_All, totalSAPs, pKEGG):
    """Run the hypergeometric probability test.

    Only ``SAPs_KEGG``, ``SAPs_all``, ``CntKEGG_All`` and ``totalSAPs`` are
    used; the other parameters keep the signature shared by all tests.

    Returns:
        (probLow, probHigh), each clamped to [0, 1], where
        probLow  = P(X <= SAPs_KEGG) and probHigh = P(X >= SAPs_KEGG).
    """
    upper = 0
    lower = 0
    # Explicit accumulation loops: the summation order of the floats is part
    # of the numeric result, so it is kept strictly left to right.
    for i in range(SAPs_KEGG, min(SAPs_all, CntKEGG_All) + 1):
        upper += max(gauss_hypergeom(i, CntKEGG_All, SAPs_all, totalSAPs), 0.0)
    for i in range(0, SAPs_KEGG + 1):
        lower += max(gauss_hypergeom(i, CntKEGG_All, SAPs_all, totalSAPs), 0.0)
    return min(max(lower, 0.0), 1), min(max(upper, 0.0), 1)


def fisherexct(SAPs_KEGG, NoSAPs_KEGG, SAPs_all, NoSAPs_all, CntKEGG_All, totalSAPs, pKEGG):
    """Run Fisher's exact test.

    The 2x2 table handed to the ``fisher`` package is
    (SAPs_KEGG, NoSAPs_KEGG, SAPs_all, NoSAPs_all).

    Returns:
        (probLow, probHigh): the left-tail and right-tail p-values.

    Raises:
        ImportError: if the third-party ``fisher`` package is not installed.
    """
    # NOTE(review): the second row is the whole-background totals, not the
    # counts outside the pathway -- confirm this is the intended table.
    if fisher is None:
        raise ImportError("the 'fisher' package is required for the fisher test")
    ftest = fisher(SAPs_KEGG, NoSAPs_KEGG, SAPs_all, NoSAPs_all)
    return ftest.left_tail, ftest.right_tail


# Maps the --statsTest value to the function implementing it.  All three
# functions take the same seven arguments and return (probLow, probHigh).
_STATS_TESTS = {
    'fisher': fisherexct,
    'hypergeometric': hypergeo_sf,
    'binomial': binProb,
}


def rtrnKEGGcENSEMBLc(inBckgrndfile, columnENSEMBLTBckgrnd, columnKEGGBckgrnd):
    """Parse the tab-separated background file.

    Args:
        inBckgrndfile: path of the background table.
        columnENSEMBLTBckgrnd: 0-based index of the ENSEMBL transcript column.
        columnKEGGBckgrnd: 0-based index of the KEGG column; its value is
            split on '.' into individual pathway identifiers.

    Returns:
        (dKEGGTENSEMBLT, ENSEMBLTGinKEGG): a dict mapping each pathway to
        the set of transcripts listed for it, and the set of every
        transcript that appears in at least one pathway.  Both are empty
        when the file holds no usable rows.
    """
    # Identifiers dropped from the KEGG column (presumably "unknown" / "none"
    # placeholders -- confirm against the file format).
    ignored = set(['', 'U', 'N'])
    dKEGGTENSEMBLT = {}
    with open(inBckgrndfile, 'r') as handle:
        for eachl in handle:
            if not eachl.strip():
                continue  # skip blank lines
            fields = eachl.splitlines()[0].split('\t')
            ENSEMBLT = fields[columnENSEMBLTBckgrnd]
            KEGGTs = set(fields[columnKEGGBckgrnd].split('.')) - ignored
            for KEGGT in KEGGTs:
                dKEGGTENSEMBLT.setdefault(KEGGT, set()).add(ENSEMBLT)
    # set().union(...) rather than set.union(...) so an empty table yields an
    # empty set instead of a TypeError.
    ENSEMBLTGinKEGG = set().union(*dKEGGTENSEMBLT.values())
    return dKEGGTENSEMBLT, ENSEMBLTGinKEGG


def rtrnENSEMBLcSAPs(inSAPsfile, columnENSEMBLT, ENSEMBLTGinKEGG):
    """Return the ENSEMBL codes of the input list that are also in the KEGG file.

    Args:
        inSAPsfile: path of the tab-separated input table.
        columnENSEMBLT: 0-based index of the ENSEMBL transcript column.
        ENSEMBLTGinKEGG: set of transcripts known to the background file.

    Returns:
        The set of transcripts found in both the input table and
        ``ENSEMBLTGinKEGG``.
    """
    sENSEMBLTSAPsinKEGG = set()
    with open(inSAPsfile, 'r') as handle:
        for eachl in handle:
            if not eachl.strip():
                continue  # blank lines have no columns to index
            ENSEMBLT = eachl.splitlines()[0].split('\t')[columnENSEMBLT]
            if ENSEMBLT in ENSEMBLTGinKEGG:
                sENSEMBLTSAPsinKEGG.add(ENSEMBLT)
    return sENSEMBLTSAPsinKEGG


def rtrnCounts(dKEGGTENSEMBLT, ENSEMBLTGinKEGG, sENSEMBLTSAPsinKEGG, statsTest):
    """Build the ranked output rows, one per KEGG pathway.

    Each row is a tab-joined string with the fields:
    number of input-list genes in the pathway, fraction of the pathway's
    genes that are in the input list, rank of that fraction (1 = highest,
    ties share a rank), lower-tail probability, upper-tail probability,
    and the pathway identifier.  Rows are ordered by decreasing fraction.
    Fractions and probabilities are printed with 2 significant digits.

    Args:
        dKEGGTENSEMBLT: dict mapping pathway -> set of transcripts.
        ENSEMBLTGinKEGG: set of all transcripts present in any pathway.
        sENSEMBLTSAPsinKEGG: set of input-list transcripts present in
            ``ENSEMBLTGinKEGG``.
        statsTest: 'fisher', 'hypergeometric' or 'binomial'.

    Raises:
        ValueError: if ``statsTest`` is not one of the supported names.
    """
    try:
        ptest = _STATS_TESTS[statsTest]
    except KeyError:
        raise ValueError('unknown statistical test %r (expected one of: %s)'
                         % (statsTest, ', '.join(sorted(_STATS_TESTS))))

    totalSAPs = len(ENSEMBLTGinKEGG)
    SAPs_all = len(sENSEMBLTSAPsinKEGG)
    NoSAPs_all = totalSAPs - SAPs_all
    # An empty background has no pathways to test; avoid dividing by zero.
    pKEGG = SAPs_all / float(totalSAPs) if totalSAPs else 0.0

    ltfreqs = []
    for echKEGGT, members in dKEGGTENSEMBLT.items():
        CntKEGG_All = len(members)
        SAPs_KEGG = len(members.intersection(sENSEMBLTSAPsinKEGG))
        NoSAPs_KEGG = CntKEGG_All - SAPs_KEGG
        probLow, probHigh = ptest(SAPs_KEGG, NoSAPs_KEGG, SAPs_all, NoSAPs_all,
                                  CntKEGG_All, totalSAPs, pKEGG)
        ltfreqs.append([SAPs_KEGG / Decimal(CntKEGG_All), SAPs_KEGG,
                        probLow, probHigh, echKEGGT])
    # Pathway names are unique, so no two rows compare equal and a single
    # descending sort gives the same order as sort-then-reverse.
    ltfreqs.sort(reverse=True)

    outl = []
    cper, crank = Decimal('2'), 0  # 2 is above any possible fraction
    # The 2-digit precision is only for formatting the output, so it is
    # applied in a local context instead of changing the global one.
    with localcontext() as ctx:
        ctx.prec = 2
        one = Decimal('1.0')  # multiplying by it rounds to the context precision
        for perc, cnt_go, pvalLow, pvalHigh, goTerm in ltfreqs:
            if perc < cper:
                crank += 1
                cper = perc
            outl.append('\t'.join([str(cnt_go),
                                   str(Decimal(perc) * one),
                                   str(crank),
                                   str(Decimal(pvalLow) * one),
                                   str(Decimal(pvalHigh) * one),
                                   goTerm]))
    return outl


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: optional list of command-line arguments; when None the
            arguments are taken from ``sys.argv``.

    Returns:
        0 on success.
    """
    parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
    parser.add_argument('--input', metavar='input TXT file', type=str, help='the input file with the table in txt format.', required=True)
    parser.add_argument('--inBckgrndfile', metavar='input TXT file', type=str, help='the input file with the background table in txt format.', required=True)
    parser.add_argument('--output', metavar='output TXT file', type=str, help='the output file with the table in txt format.', required=True)
    parser.add_argument('--columnENSEMBLT', metavar='column number', type=int, help='column with the ENSEMBL transcript code in the input file.', required=True)
    parser.add_argument('--columnENSEMBLTBckgrnd', metavar='column number', type=int, help='column with the ENSEMBL transcript code in the background file.', required=True)
    parser.add_argument('--columnKEGGBckgrnd', metavar='column number', type=int, help='column with the KEGG pathways in the background file.', required=True)
    parser.add_argument('--statsTest', metavar='test name', type=str, choices=sorted(_STATS_TESTS), help='statistical test to compare KEGG pathways (i.e. fisher, hypergeometric, binomial).', required=True)

    args = parser.parse_args(argv)

    inSAPsfile = args.input
    inBckgrndfile = args.inBckgrndfile
    saleKEGGPCount = args.output
    statsTest = args.statsTest
    # Columns are 1-based on the command line and 0-based internally.
    columnENSEMBLT = args.columnENSEMBLT - 1
    columnENSEMBLTBckgrnd = args.columnENSEMBLTBckgrnd - 1
    columnKEGGBckgrnd = args.columnKEGGBckgrnd - 1

    dKEGGTENSEMBLT, ENSEMBLTGinKEGG = rtrnKEGGcENSEMBLc(inBckgrndfile, columnENSEMBLTBckgrnd, columnKEGGBckgrnd)
    sENSEMBLTSAPsinKEGG = rtrnENSEMBLcSAPs(inSAPsfile, columnENSEMBLT, ENSEMBLTGinKEGG)
    outl = rtrnCounts(dKEGGTENSEMBLT, ENSEMBLTGinKEGG, sENSEMBLTSAPsinKEGG, statsTest)

    # Rows are newline-separated with no trailing newline.
    with open(saleKEGGPCount, 'w') as handle:
        handle.write('\n'.join(outl))

    return 0


if __name__ == '__main__':
    main()