Mercurial > repos > miller-lab > genome_diversity
comparison rank_terms.py @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Mon, 15 Jul 2013 10:47:35 -0400 |
parents | 248b06e86022 |
children |
comparison
equal
deleted
inserted
replaced
26:91e835060ad2 | 27:8997f2ca8c7a |
---|---|
21 # MA 02110-1301, USA. | 21 # MA 02110-1301, USA. |
22 | 22 |
23 import argparse | 23 import argparse |
24 import os | 24 import os |
25 import sys | 25 import sys |
26 from fisher import pvalue | 26 from fisher import pvalue as fisher |
27 from decimal import Decimal,getcontext | 27 from decimal import Decimal,getcontext |
28 from math import lgamma,exp,factorial | |
29 | |
30 def binProb(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): | |
31 """ | |
32 Returns binomial probability. | |
33 """ | |
34 def comb(CntGO_All,k): | |
35 return factorial(CntGO_All) / Decimal(str(factorial(k)*factorial(CntGO_All-k))) | |
36 probLow = 0 | |
37 for k in range(0, SAPs_GO+1): | |
38 cp=Decimal(str(comb(CntGO_All,k))) | |
39 bp=Decimal(str(pGO**k)) | |
40 dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k)) | |
41 probLow+=cp*bp*dp | |
42 #~ | |
43 probHigh = 0 | |
44 for k in range(int(SAPs_GO),CntGO_All+1): | |
45 cp=Decimal(str(comb(CntGO_All,k))) | |
46 bp=Decimal(str(pGO**k)) | |
47 dp=Decimal(str(1.0-pGO))**Decimal(str(CntGO_All-k)) | |
48 probHigh+=cp*bp*dp | |
49 return probLow,probHigh | |
50 | |
51 def gauss_hypergeom(X, CntGO_All, SAPs_all, totalSAPs): | |
52 CntGO_All,SAPs_all,totalSAPs | |
53 """ | |
54 Returns the probability of drawing X successes of SAPs_all marked items | |
55 in CntGO_All draws from a bin of totalSAPs total items | |
56 """ | |
57 def logchoose(ni, ki): | |
58 try: | |
59 lgn1 = lgamma(ni+1) | |
60 lgk1 = lgamma(ki+1) | |
61 lgnk1 = lgamma(ni-ki+1) | |
62 except ValueError: | |
63 raise ValueError | |
64 return lgn1 - (lgnk1 + lgk1) | |
65 #~ | |
66 r1 = logchoose(SAPs_all, X) | |
67 try: | |
68 r2 = logchoose(totalSAPs-SAPs_all, CntGO_All-X) | |
69 except ValueError: | |
70 return 0 | |
71 r3 = logchoose(totalSAPs,CntGO_All) | |
72 return exp(r1 + r2 - r3) | |
73 | |
74 def hypergeo_sf(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): | |
75 """ | |
76 Runs Hypergeometric probability test | |
77 """ | |
78 s = 0 | |
79 t=0 | |
80 for i in range(SAPs_GO,min(SAPs_all,CntGO_All)+1): | |
81 s += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0) | |
82 for i in range(0, SAPs_GO+1): | |
83 t += max(gauss_hypergeom(i,CntGO_All,SAPs_all,totalSAPs), 0.0) | |
84 return min(max(t,0.0), 1),min(max(s,0.0), 1) | |
85 | |
86 def fisherexct(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO): | |
87 """ | |
88 Runs Fisher's exact test | |
89 """ | |
90 ftest=fisher(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) | |
91 probLow,probHigh=ftest.left_tail,ftest.right_tail | |
92 return probLow,probHigh | |
28 | 93 |
29 def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): | 94 def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): |
30 """ | 95 """ |
31 """ | 96 """ |
32 dGOTENSEMBLT={} | 97 dGOTENSEMBLT={} |
38 for GOT in GOTs: | 103 for GOT in GOTs: |
39 try: | 104 try: |
40 dGOTENSEMBLT[GOT].add(ENSEMBLT) | 105 dGOTENSEMBLT[GOT].add(ENSEMBLT) |
41 except: | 106 except: |
42 dGOTENSEMBLT[GOT]=set([ENSEMBLT]) | 107 dGOTENSEMBLT[GOT]=set([ENSEMBLT]) |
43 #~ | |
44 ##dGOTENSEMBLT.pop('') | |
45 ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) | 108 ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) |
46 return dGOTENSEMBLT,ENSEMBLTGinGO | 109 return dGOTENSEMBLT,ENSEMBLTGinGO |
47 | 110 |
48 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): | 111 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): |
49 """ | 112 """ |
55 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] | 118 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] |
56 if ENSEMBLT in ENSEMBLTGinGO: | 119 if ENSEMBLT in ENSEMBLTGinGO: |
57 sENSEMBLTSAPsinGO.add(ENSEMBLT) | 120 sENSEMBLTSAPsinGO.add(ENSEMBLT) |
58 return sENSEMBLTSAPsinGO | 121 return sENSEMBLTSAPsinGO |
59 | 122 |
60 def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): | 123 def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest): |
61 """ | 124 """ |
62 returns a list of the ENSEMBLT codes present in the input list and | 125 returns a list of the ENSEMBLT codes present in the input list and |
63 in the GO file. The terms in this list are: 'Go Term','# Genes in | 126 in the GO file. The terms in this list are: 'Go Term','# Genes in |
64 the GO Term','# Genes in the list and in the GO Term','Enrichement | 127 the GO Term','# Genes in the list and in the GO Term','Enrichement |
65 of the GO Term for genes in the input list','Genes in the input list | 128 of the GO Term for genes in the input list','Genes in the input list |
66 present in the GO term' | 129 present in the GO term' |
67 """ | 130 """ |
68 getcontext().prec=2#set 2 decimal places | 131 totalSAPs=len(ENSEMBLTGinGO) |
69 SAPs_all=len(sENSEMBLTSAPsinGO) | 132 SAPs_all=len(sENSEMBLTSAPsinGO) |
70 NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all | 133 NoSAPs_all=totalSAPs-SAPs_all |
134 pGO=SAPs_all/float(totalSAPs) | |
71 #~ | 135 #~ |
72 lp=len(dGOTENSEMBLT) | 136 lp=len(dGOTENSEMBLT) |
73 cnt=0 | 137 cnt=0 |
138 #~ | |
139 if statsTest=='fisher': | |
140 ptest=fisherexct | |
141 elif statsTest=='hypergeometric': | |
142 ptest=hypergeo_sf | |
143 elif statsTest=='binomial': | |
144 ptest=binProb | |
74 #~ | 145 #~ |
75 ltfreqs=[] | 146 ltfreqs=[] |
76 for echGOT in dGOTENSEMBLT: | 147 for echGOT in dGOTENSEMBLT: |
77 cnt+=1 | 148 cnt+=1 |
78 ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) | |
79 CntGO_All=len(dGOTENSEMBLT[echGOT]) | 149 CntGO_All=len(dGOTENSEMBLT[echGOT]) |
80 SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) | 150 SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) |
81 NoSAPs_GO=CntGO_All-SAPs_GO | 151 NoSAPs_GO=CntGO_All-SAPs_GO |
82 pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) | 152 probLow,probHigh=ptest(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all,CntGO_All,totalSAPs,pGO) |
83 #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) | 153 ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,probLow,probHigh,echGOT]) |
84 ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) | |
85 #~ | 154 #~ |
86 ltfreqs.sort() | 155 ltfreqs.sort() |
87 ltfreqs.reverse() | 156 ltfreqs.reverse() |
88 outl=[] | 157 outl=[] |
89 cper,crank=Decimal('2'),0 | 158 cper,crank=Decimal('2'),0 |
90 #~ | 159 #~ |
91 for perc,cnt_go,pval,goTerm in ltfreqs: | 160 getcontext().prec=2#set 2 decimal places |
161 for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs: | |
92 if perc<cper: | 162 if perc<cper: |
93 crank+=1 | 163 crank+=1 |
94 cper=perc | 164 cper=perc |
95 outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) | 165 outl.append('\t'.join([str(cnt_go),str(Decimal(perc)*Decimal('1.0')),str(crank),str(Decimal(pvalLow)*Decimal('1.0')),str(Decimal(pvalHigh)*Decimal('1.0')),goTerm])) |
96 #~ | 166 #~ |
97 return outl | 167 return outl |
98 | 168 |
99 | 169 |
100 def main(): | 170 def main(): |
104 parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) | 174 parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) |
105 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) | 175 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) |
106 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) | 176 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) |
107 parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) | 177 parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) |
108 parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) | 178 parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) |
179 parser.add_argument('--statsTest',metavar='input TXT file',type=str,help='statistical test to compare GO terms (i.e. fisher, hypergeometric, binomial).',required=True) | |
109 | 180 |
110 args = parser.parse_args() | 181 args = parser.parse_args() |
111 | 182 |
112 inSAPsfile = args.input | 183 inSAPsfile = args.input |
113 inExtnddfile = args.inExtnddfile | 184 inExtnddfile = args.inExtnddfile |
114 saleGOPCount = args.output | 185 saleGOPCount = args.output |
115 columnENSEMBLT = args.columnENSEMBLT | 186 columnENSEMBLT = args.columnENSEMBLT |
116 columnENSEMBLTExtndd = args.columnENSEMBLTExtndd | 187 columnENSEMBLTExtndd = args.columnENSEMBLTExtndd |
117 columnGOExtndd = args.columnGOExtndd | 188 columnGOExtndd = args.columnGOExtndd |
189 statsTest = args.statsTest | |
118 | 190 |
119 #~ | 191 #~ |
120 dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) | 192 dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) |
121 sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) | 193 sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) |
122 outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) | 194 outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO,statsTest) |
123 #~ | 195 #~ |
124 saleGOPCount=open(saleGOPCount,'w') | 196 saleGOPCount=open(saleGOPCount,'w') |
125 saleGOPCount.write('\n'.join(outl)) | 197 saleGOPCount.write('\n'.join(outl)) |
126 saleGOPCount.close() | 198 saleGOPCount.close() |
127 #~ | 199 #~ |