Mercurial > repos > miller-lab > genome_diversity
view rank_terms.py @ 25:cba0d7a63b82
workaround for gd_genotype datatype
admix shift int -> float
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 29 May 2013 13:49:19 -0400 |
parents | 248b06e86022 |
children | 8997f2ca8c7a |
line wrap: on
line source
#!/usr/bin/env python # -*- coding: utf-8 -*- # # GOFisher.py # # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. import argparse import os import sys from fisher import pvalue from decimal import Decimal,getcontext def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): """ """ dGOTENSEMBLT={} for eachl in open(inExtnddfile,'r'): if eachl.strip(): ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) GOTs=GOTs.difference(set(['','U','N'])) for GOT in GOTs: try: dGOTENSEMBLT[GOT].add(ENSEMBLT) except: dGOTENSEMBLT[GOT]=set([ENSEMBLT]) #~ ##dGOTENSEMBLT.pop('') ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) return dGOTENSEMBLT,ENSEMBLTGinGO def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): """ returns a set of the ENSEMBLT codes present in the input list and in the GO file """ sENSEMBLTSAPsinGO=set() for eachl in open(inSAPsfile,'r'): ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] if ENSEMBLT in ENSEMBLTGinGO: sENSEMBLTSAPsinGO.add(ENSEMBLT) return sENSEMBLTSAPsinGO def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): """ returns a list of the ENSEMBLT codes present in the input list and in the GO file. The terms in this list are: 'Go Term','# Genes in the GO Term','# Genes in the list and in the GO Term','Enrichement of the GO Term for genes in the input list','Genes in the input list present in the GO term' """ getcontext().prec=2#set 2 decimal places SAPs_all=len(sENSEMBLTSAPsinGO) NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all #~ lp=len(dGOTENSEMBLT) cnt=0 #~ ltfreqs=[] for echGOT in dGOTENSEMBLT: cnt+=1 ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) CntGO_All=len(dGOTENSEMBLT[echGOT]) SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) NoSAPs_GO=CntGO_All-SAPs_GO pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) #~ ltfreqs.sort() ltfreqs.reverse() outl=[] cper,crank=Decimal('2'),0 #~ for perc,cnt_go,pval,goTerm in ltfreqs: if perc<cper: crank+=1 cper=perc outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) #~ return outl def main(): #~ parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) args = parser.parse_args() inSAPsfile = args.input inExtnddfile = args.inExtnddfile saleGOPCount = args.output columnENSEMBLT = args.columnENSEMBLT columnENSEMBLTExtndd = args.columnENSEMBLTExtndd columnGOExtndd = args.columnGOExtndd #~ dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) #~ saleGOPCount=open(saleGOPCount,'w') saleGOPCount.write('\n'.join(outl)) saleGOPCount.close() #~ return 0 if __name__ == '__main__': main()