Mercurial > repos > miller-lab > genome_diversity
comparison rank_terms.py @ 22:95a05c1ef5d5
update to devshed revision aaece207bd01
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Mon, 11 Mar 2013 11:28:06 -0400 |
parents | |
children | 248b06e86022 |
comparison
equal
deleted
inserted
replaced
21:d6b961721037 | 22:95a05c1ef5d5 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 # | |
4 # GOFisher.py | |
5 # | |
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> | |
7 # | |
8 # This program is free software; you can redistribute it and/or modify | |
9 # it under the terms of the GNU General Public License as published by | |
10 # the Free Software Foundation; either version 2 of the License, or | |
11 # (at your option) any later version. | |
12 # | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU General Public License for more details. | |
17 # | |
18 # You should have received a copy of the GNU General Public License | |
19 # along with this program; if not, write to the Free Software | |
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
21 # MA 02110-1301, USA. | |
22 | |
23 import argparse | |
24 import os | |
25 import sys | |
26 from fisher import pvalue | |
27 from decimal import Decimal,getcontext | |
28 | |
29 def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): | |
30 """ | |
31 """ | |
32 dGOTENSEMBLT={} | |
33 for eachl in open(inExtnddfile,'r'): | |
34 if eachl.strip(): | |
35 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] | |
36 GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) | |
37 GOTs=GOTs.difference(set(['','U','N'])) | |
38 for GOT in GOTs: | |
39 try: | |
40 dGOTENSEMBLT[GOT].add(ENSEMBLT) | |
41 except: | |
42 dGOTENSEMBLT[GOT]=set([ENSEMBLT]) | |
43 #~ | |
44 ##dGOTENSEMBLT.pop('') | |
45 ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) | |
46 return dGOTENSEMBLT,ENSEMBLTGinGO | |
47 | |
48 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): | |
49 """ | |
50 returns a set of the ENSEMBLT codes present in the input list and | |
51 in the GO file | |
52 """ | |
53 sENSEMBLTSAPsinGO=set() | |
54 for eachl in open(inSAPsfile,'r'): | |
55 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] | |
56 if ENSEMBLT in ENSEMBLTGinGO: | |
57 sENSEMBLTSAPsinGO.add(ENSEMBLT) | |
58 return sENSEMBLTSAPsinGO | |
59 | |
60 def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): | |
61 """ | |
62 returns a list of the ENSEMBLT codes present in the input list and | |
63 in the GO file. The terms in this list are: 'Go Term','# Genes in | |
64 the GO Term','# Genes in the list and in the GO Term','Enrichement | |
65 of the GO Term for genes in the input list','Genes in the input list | |
66 present in the GO term' | |
67 """ | |
68 getcontext().prec=2#set 2 decimal places | |
69 SAPs_all=len(sENSEMBLTSAPsinGO) | |
70 NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all | |
71 #~ | |
72 lp=len(dGOTENSEMBLT) | |
73 cnt=0 | |
74 #~ | |
75 ltfreqs=[] | |
76 for echGOT in dGOTENSEMBLT: | |
77 cnt+=1 | |
78 ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) | |
79 CntGO_All=len(dGOTENSEMBLT[echGOT]) | |
80 SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) | |
81 NoSAPs_GO=CntGO_All-SAPs_GO | |
82 pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) | |
83 #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) | |
84 ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) | |
85 #~ | |
86 ltfreqs.sort() | |
87 ltfreqs.reverse() | |
88 outl=[] | |
89 cper,crank=Decimal('2'),0 | |
90 #~ | |
91 for perc,cnt_go,pval,goTerm in ltfreqs: | |
92 if perc<cper: | |
93 crank+=1 | |
94 cper=perc | |
95 outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) | |
96 #~ | |
97 return outl | |
98 | |
99 | |
100 def main(): | |
101 #~ | |
102 parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') | |
103 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) | |
104 parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) | |
105 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) | |
106 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) | |
107 parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) | |
108 parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) | |
109 | |
110 args = parser.parse_args() | |
111 | |
112 inSAPsfile = args.input | |
113 inExtnddfile = args.inExtnddfile | |
114 saleGOPCount = args.output | |
115 columnENSEMBLT = args.columnENSEMBLT | |
116 columnENSEMBLTExtndd = args.columnENSEMBLTExtndd | |
117 columnGOExtndd = args.columnGOExtndd | |
118 | |
119 #~ | |
120 dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) | |
121 sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) | |
122 outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) | |
123 #~ | |
124 saleGOPCount=open(saleGOPCount,'w') | |
125 saleGOPCount.write('\n'.join(outl)) | |
126 saleGOPCount.close() | |
127 #~ | |
128 return 0 | |
129 | |
130 if __name__ == '__main__': | |
131 main() | |
132 |