comparison rank_terms.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 248b06e86022
children
comparison
equal deleted inserted replaced
26:91e835060ad2 27:8997f2ca8c7a
21 # MA 02110-1301, USA. 21 # MA 02110-1301, USA.
22 22
23 import argparse 23 import argparse
24 import os 24 import os
25 import sys 25 import sys
26 from fisher import pvalue 26 from fisher import pvalue as fisher
27 from decimal import Decimal,getcontext 27 from decimal import Decimal,getcontext
28 from math import lgamma,exp,factorial
29
30 def binProb(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO):
31 """
32 Returns binomial probability.
33 """
34 def comb(CntGO_All,k):
35 return factorial(CntGO_All) / Decimal(str(factorial(k)*factorial(CntGO_All-k)))
36 probLow = 0
37 for k in range(0, SAPs_GO+1):
38 cp=Decimal(str(comb(CntGO_All,k)))
39 bp=Decimal(str(pGO**k))
40 dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k))
41 probLow+=cp*bp*dp
42 #~
43 probHigh = 0
44 for k in range(int(SAPs_GO),CntGO_All+1):
45 cp=Decimal(str(comb(CntGO_All,k)))
46 bp=Decimal(str(pGO**k))
47 dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k))
48 probHigh+=cp*bp*dp
49 return probLow,probHigh
50
51 def gauss_hypergeom(X, CntGO_All, SAPs_all, totalSAPs):
52 CntGO_All,SAPs_all,totalSAPs
53 """
54 Returns the probability of drawing X successes of SAPs_all marked items
55 in CntGO_All draws from a bin of totalSAPs total items
56 """
57 def logchoose(ni, ki):
58 try:
59 lgn1 = lgamma(ni+1)
60 lgk1 = lgamma(ki+1)
61 lgnk1 = lgamma(ni-ki+1)
62 except ValueError:
63 raise ValueError
64 return lgn1 - (lgnk1 + lgk1)
65 #~
66 r1 = logchoose(SAPs_all, X)
67 try:
68 r2 = logchoose(totalSAPs-SAPs_all, CntGO_All-X)
69 except ValueError:
70 return 0
71 r3 = logchoose(totalSAPs,CntGO_All)
72 return exp(r1 + r2 - r3)
73
74 def hypergeo_sf(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO):
75 """
76 Runs Hypergeometric probability test
77 """
78 s = 0
79 t=0
80 for i in range(SAPs_GO,min(SAPs_all,CntGO_All)+1):
81 s += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0)
82 for i in range(0, SAPs_GO+1):
83 t += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0)
84 return min(max(t,0.0), 1),min(max(s,0.0), 1)
85
86 def fisherexct(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO):
87 """
88 Runs Fisher's exact test
89 """
90 ftest=fisher(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all)
91 probLow,probHigh=ftest.left_tail,ftest.right_tail
92 return probLow,probHigh
28 93
29 def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): 94 def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd):
30 """ 95 """
31 """ 96 """
32 dGOTENSEMBLT={} 97 dGOTENSEMBLT={}
38 for GOT in GOTs: 103 for GOT in GOTs:
39 try: 104 try:
40 dGOTENSEMBLT[GOT].add(ENSEMBLT) 105 dGOTENSEMBLT[GOT].add(ENSEMBLT)
41 except: 106 except:
42 dGOTENSEMBLT[GOT]=set([ENSEMBLT]) 107 dGOTENSEMBLT[GOT]=set([ENSEMBLT])
43 #~
44 ##dGOTENSEMBLT.pop('')
45 ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) 108 ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values())
46 return dGOTENSEMBLT,ENSEMBLTGinGO 109 return dGOTENSEMBLT,ENSEMBLTGinGO
47 110
48 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): 111 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO):
49 """ 112 """
55 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] 118 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT]
56 if ENSEMBLT in ENSEMBLTGinGO: 119 if ENSEMBLT in ENSEMBLTGinGO:
57 sENSEMBLTSAPsinGO.add(ENSEMBLT) 120 sENSEMBLTSAPsinGO.add(ENSEMBLT)
58 return sENSEMBLTSAPsinGO 121 return sENSEMBLTSAPsinGO
59 122
60 def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): 123 def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest):
61 """ 124 """
62 returns a list of the ENSEMBLT codes present in the input list and 125 returns a list of the ENSEMBLT codes present in the input list and
63 in the GO file. The terms in this list are: 'Go Term','# Genes in 126 in the GO file. The terms in this list are: 'Go Term','# Genes in
64 the GO Term','# Genes in the list and in the GO Term','Enrichement 127 the GO Term','# Genes in the list and in the GO Term','Enrichement
65 of the GO Term for genes in the input list','Genes in the input list 128 of the GO Term for genes in the input list','Genes in the input list
66 present in the GO term' 129 present in the GO term'
67 """ 130 """
68 getcontext().prec=2#set 2 decimal places 131 totalSAPs=len(ENSEMBLTGinGO)
69 SAPs_all=len(sENSEMBLTSAPsinGO) 132 SAPs_all=len(sENSEMBLTSAPsinGO)
70 NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all 133 NoSAPs_all=totalSAPs-SAPs_all
134 pGO=SAPs_all/float(totalSAPs)
71 #~ 135 #~
72 lp=len(dGOTENSEMBLT) 136 lp=len(dGOTENSEMBLT)
73 cnt=0 137 cnt=0
138 #~
139 if statsTest=='fisher':
140 ptest=fisherexct
141 elif statsTest=='hypergeometric':
142 ptest=hypergeo_sf
143 elif statsTest=='binomial':
144 ptest=binProb
74 #~ 145 #~
75 ltfreqs=[] 146 ltfreqs=[]
76 for echGOT in dGOTENSEMBLT: 147 for echGOT in dGOTENSEMBLT:
77 cnt+=1 148 cnt+=1
78 ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp)
79 CntGO_All=len(dGOTENSEMBLT[echGOT]) 149 CntGO_All=len(dGOTENSEMBLT[echGOT])
80 SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) 150 SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))
81 NoSAPs_GO=CntGO_All-SAPs_GO 151 NoSAPs_GO=CntGO_All-SAPs_GO
82 pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) 152 probLow,probHigh=ptest(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO)
83 #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) 153 ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,probLow,probHigh,echGOT])
84 ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT])
85 #~ 154 #~
86 ltfreqs.sort() 155 ltfreqs.sort()
87 ltfreqs.reverse() 156 ltfreqs.reverse()
88 outl=[] 157 outl=[]
89 cper,crank=Decimal('2'),0 158 cper,crank=Decimal('2'),0
90 #~ 159 #~
91 for perc,cnt_go,pval,goTerm in ltfreqs: 160 getcontext().prec=2#set 2 decimal places
161 for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs:
92 if perc<cper: 162 if perc<cper:
93 crank+=1 163 crank+=1
94 cper=perc 164 cper=perc
95 outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) 165 outl.append('\t'.join([str(cnt_go),str(Decimal(perc)*Decimal('1.0')),str(crank),str(Decimal(pvalLow)*Decimal('1.0')),str(Decimal(pvalHigh)*Decimal('1.0')),goTerm]))
96 #~ 166 #~
97 return outl 167 return outl
98 168
99 169
100 def main(): 170 def main():
104 parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) 174 parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True)
105 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) 175 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True)
106 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) 176 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True)
107 parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) 177 parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True)
108 parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) 178 parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True)
179 parser.add_argument('--statsTest',metavar='input TXT file',type=str,help='statistical test to compare GO terms (i.e. fisher, hypergeometric, binomial).',required=True)
109 180
110 args = parser.parse_args() 181 args = parser.parse_args()
111 182
112 inSAPsfile = args.input 183 inSAPsfile = args.input
113 inExtnddfile = args.inExtnddfile 184 inExtnddfile = args.inExtnddfile
114 saleGOPCount = args.output 185 saleGOPCount = args.output
115 columnENSEMBLT = args.columnENSEMBLT 186 columnENSEMBLT = args.columnENSEMBLT
116 columnENSEMBLTExtndd = args.columnENSEMBLTExtndd 187 columnENSEMBLTExtndd = args.columnENSEMBLTExtndd
117 columnGOExtndd = args.columnGOExtndd 188 columnGOExtndd = args.columnGOExtndd
189 statsTest = args.statsTest
118 190
119 #~ 191 #~
120 dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) 192 dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd)
121 sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) 193 sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO)
122 outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) 194 outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest)
123 #~ 195 #~
124 saleGOPCount=open(saleGOPCount,'w') 196 saleGOPCount=open(saleGOPCount,'w')
125 saleGOPCount.write('\n'.join(outl)) 197 saleGOPCount.write('\n'.join(outl))
126 saleGOPCount.close() 198 saleGOPCount.close()
127 #~ 199 #~