Mercurial > repos > miller-lab > genome_diversity
view rank_terms.py @ 28:184d14e4270d
Update to Miller Lab devshed revision 4ede22dd5500
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 17 Jul 2013 12:46:46 -0400 |
parents | 8997f2ca8c7a |
children |
line wrap: on
line source
#!/usr/bin/env python # -*- coding: utf-8 -*- # # GOFisher.py # # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. import argparse import os import sys from fisher import pvalue as fisher from decimal import Decimal,getcontext from math import lgamma,exp,factorial def binProb(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): """ Returns binomial probability. """ def comb(CntGO_All,k): return factorial(CntGO_All) / Decimal(str(factorial(k)*factorial(CntGO_All-k))) probLow = 0 for k in range(0, SAPs_GO+1): cp=Decimal(str(comb(CntGO_All,k))) bp=Decimal(str(pGO**k)) dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k)) probLow+=cp*bp*dp #~ probHigh = 0 for k in range(int(SAPs_GO),CntGO_All+1): cp=Decimal(str(comb(CntGO_All,k))) bp=Decimal(str(pGO**k)) dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k)) probHigh+=cp*bp*dp return probLow,probHigh def gauss_hypergeom(X, CntGO_All, SAPs_all, totalSAPs): CntGO_All,SAPs_all,totalSAPs """ Returns the probability of drawing X successes of SAPs_all marked items in CntGO_All draws from a bin of totalSAPs total items """ def logchoose(ni, ki): try: lgn1 = lgamma(ni+1) lgk1 = lgamma(ki+1) lgnk1 = lgamma(ni-ki+1) except ValueError: raise ValueError return lgn1 - (lgnk1 + lgk1) #~ r1 = logchoose(SAPs_all, X) try: r2 = logchoose(totalSAPs-SAPs_all, CntGO_All-X) except ValueError: return 0 r3 = logchoose(totalSAPs,CntGO_All) return exp(r1 + r2 - r3) def hypergeo_sf(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): """ Runs Hypergeometric probability test """ s = 0 t=0 for i in range(SAPs_GO,min(SAPs_all,CntGO_All)+1): s += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0) for i in range(0, SAPs_GO+1): t += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0) return min(max(t,0.0), 1),min(max(s,0.0), 1) def fisherexct(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): """ Runs Fisher's exact test """ ftest=fisher(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) probLow,probHigh=ftest.left_tail,ftest.right_tail return probLow,probHigh def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): """ """ dGOTENSEMBLT={} for eachl in open(inExtnddfile,'r'): if eachl.strip(): ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) GOTs=GOTs.difference(set(['','U','N'])) for GOT in GOTs: try: dGOTENSEMBLT[GOT].add(ENSEMBLT) except: dGOTENSEMBLT[GOT]=set([ENSEMBLT]) ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) return dGOTENSEMBLT,ENSEMBLTGinGO def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): """ returns a set of the ENSEMBLT codes present in the input list and in the GO file """ sENSEMBLTSAPsinGO=set() for eachl in open(inSAPsfile,'r'): ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] if ENSEMBLT in ENSEMBLTGinGO: sENSEMBLTSAPsinGO.add(ENSEMBLT) return sENSEMBLTSAPsinGO def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest): """ returns a list of the ENSEMBLT codes present in the input list and in the GO file. The terms in this list are: 'Go Term','# Genes in the GO Term','# Genes in the list and in the GO Term','Enrichement of the GO Term for genes in the input list','Genes in the input list present in the GO term' """ totalSAPs=len(ENSEMBLTGinGO) SAPs_all=len(sENSEMBLTSAPsinGO) NoSAPs_all=totalSAPs-SAPs_all pGO=SAPs_all/float(totalSAPs) #~ lp=len(dGOTENSEMBLT) cnt=0 #~ if statsTest=='fisher': ptest=fisherexct elif statsTest=='hypergeometric': ptest=hypergeo_sf elif statsTest=='binomial': ptest=binProb #~ ltfreqs=[] for echGOT in dGOTENSEMBLT: cnt+=1 CntGO_All=len(dGOTENSEMBLT[echGOT]) SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) NoSAPs_GO=CntGO_All-SAPs_GO probLow,probHigh=ptest(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO) ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,probLow,probHigh,echGOT]) #~ ltfreqs.sort() ltfreqs.reverse() outl=[] cper,crank=Decimal('2'),0 #~ getcontext().prec=2#set 2 decimal places for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs: if perc<cper: crank+=1 cper=perc outl.append('\t'.join([str(cnt_go),str(Decimal(perc)*Decimal('1.0')),str(crank),str(Decimal(pvalLow)*Decimal('1.0')),str(Decimal(pvalHigh)*Decimal('1.0')),goTerm])) #~ return outl def main(): #~ parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) parser.add_argument('--statsTest',metavar='input TXT file',type=str,help='statistical test to compare GO terms (i.e. fisher, hypergeometric, binomial).',required=True) args = parser.parse_args() inSAPsfile = args.input inExtnddfile = args.inExtnddfile saleGOPCount = args.output columnENSEMBLT = args.columnENSEMBLT columnENSEMBLTExtndd = args.columnENSEMBLTExtndd columnGOExtndd = args.columnGOExtndd statsTest = args.statsTest #~ dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest) #~ saleGOPCount=open(saleGOPCount,'w') saleGOPCount.write('\n'.join(outl)) saleGOPCount.close() #~ return 0 if __name__ == '__main__': main()