Mercurial > repos > miller-lab > genome_diversity
diff calclenchange.py @ 12:4b6590dd7250
Uploaded
author | miller-lab |
---|---|
date | Wed, 12 Sep 2012 17:10:26 -0400 |
parents | 2c498d40ecde |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/calclenchange.py Wed Sep 12 17:10:26 2012 -0400 @@ -0,0 +1,280 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# calclenchange.py +# +# Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse,mechanize,os,sys +from decimal import Decimal,getcontext +from xml.etree.ElementTree import ElementTree,tostring +import networkx as nx +from copy import copy + +#method to rank the the pthways by mut. freq. +def rankdN(ltfreqs): + ordvals=sorted(ltfreqs)#sort and reverse freqs. + #~ + outrnk=[] + tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value + if tmpOri=='C': + if tmpMut!='C': + tmpChng0='C-%s'%tmpMut + else: + tmpChng0=Decimal('0') + crank=1 + outrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw]) + totalnvals=len(ordvals) + cnt=0 + while totalnvals>cnt: + cnt+=1 + tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop() + if tmpOri=='C': + if tmpMut!='C': + tmpChng='C-%s'%tmpMut + else: + tmpChng=Decimal('0') + if tmpChng!=tmpChng0: + crank=len(outrnk)+1 + tmpChng0=tmpChng + outrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw]) + return outrnk + +#method to rank the the pthways by mut. freq. +def rankdAvr(ltfreqs): + ordvals=sorted(ltfreqs)#sort and reverse freqs. + #~ + outrnk={} + tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value + if tmpOri=='I': + if tmpMut!='I': + tmpChng0='I-%s'%tmpMut + else: + tmpChng0=Decimal('0') + crank=1 + outrnk[tmpPthw]='\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)]) + totalnvals=len(ordvals) + cnt=0 + while totalnvals>cnt: + cnt+=1 + tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop() + if tmpOri=='I': + if tmpMut!='I': + tmpChng='I-%s'%tmpMut + else: + tmpChng=Decimal('0') + if tmpChng!=tmpChng0: + crank=len(outrnk)+1 + tmpChng0=tmpChng + outrnk[tmpPthw]='\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)]) + return outrnk + +#this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0 +def returnstartanendnodes(edges): + listID0st=set()#starts + listOD0en=set()#end + for beginNod,endNod in edges:# O(n) + listID0st.add(beginNod) + listOD0en.add(endNod) + startNdsID0=listID0st.difference(listOD0en) + endNdsOD0=listOD0en.difference(listID0st) + return startNdsID0,endNdsOD0 + +#~ Method to return nodes and edges +def returnNodesNEdgesfKXML(fpthwKGXML): + #~ + tree = ElementTree() + ptree=tree.parse(fpthwKGXML) + #~ + title=ptree.get('title') + prots=ptree.findall('entry') + reactns=ptree.findall('reaction') + #~ + edges,ndstmp=set(),set() + nreactns=len(reactns) + cr=0#count reacts + while nreactns>cr: + cr+=1 + reactn=reactns.pop() + mainid=reactn.get('id') + ndstmp.add(mainid)#add node + reacttyp=reactn.get('type') + sbstrts=reactn.findall('substrate') + while len(sbstrts)>0: + csbstrt=sbstrts.pop() + csbtsid=csbstrt.get('id') + ndstmp.add(csbtsid)#add node + if reacttyp=='irreversible': + edges.add((csbtsid,mainid))#add edges + elif reacttyp=='reversible': + edges.add((mainid,csbtsid))#add edges + edges.add((csbtsid,mainid))#add edges + #~ + prdcts=reactn.findall('product') + while len(prdcts)>0: + prdct=prdcts.pop() + prodctid=prdct.get('id') + ndstmp.add(prodctid)#add node + if reacttyp=='irreversible': + edges.add((mainid,prodctid))#add edges + elif reacttyp=='reversible': + edges.add((mainid,prodctid))#add edges + edges.add((prodctid,mainid))#add edges + #~ Nodes + nprots=len(prots) + cp=0#count prots + dnodes={} + while nprots>cp: + cp+=1 + prot=prots.pop() + tmpProtnm=prot.get('id') + if tmpProtnm in ndstmp: + dnodes[prot.get('id')]=set(prot.get('name').split())#each genename for each Id + return dnodes,edges,title + +#~ make calculation on pathways +def rtrnAvrgLen(edges,strNds,endNds): + wG=nx.DiGraph()#reference graph + wG.add_edges_from(edges) + dPairsSrcSnks=nx.all_pairs_shortest_path_length(wG)#dictionary between sources and sink and length + nstartNdsID0=len(strNds) + cstrtNds=0 + nPaths=0 + lPathLen=[] + while nstartNdsID0>cstrtNds: + cStartNd=strNds.pop()#current start node + dEndNdsLen=dPairsSrcSnks.pop(cStartNd) + for cendNd in dEndNdsLen: + if cendNd in endNds: + lPathLen.append(dEndNdsLen[cendNd]) + nPaths+=1 + cstrtNds+=1 + AvrgPthLen=0 + if nPaths!=0: + AvrgPthLen=Decimal(sum(lPathLen))/Decimal(str(nPaths)) + return nPaths,AvrgPthLen + +def main(): + parser = argparse.ArgumentParser(description='Rank pathways based on the change in length and number of paths connecting sources and sinks.') + parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database') + parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file') + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the diference between column 2 and column 3, Column 2 is the pathway average length (between sources and sinks) including the genes in the input list, Column 3 is the pathway average length EXCLUDING the genes in the input list, Column 4 is the rank based on column 1. Column 5 is the diference between column 6 and column 7, Column 6 is the number of paths between sources and sinks, including the genes in the input list, Column 7 is the number of paths between sources and sinks EXCLUDING the genes in the input list, Column 8 is the rank based on column 5. Column 9 I the pathway name' ) + parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') + parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') + #~ + #~Open arguments + class C(object): + pass + fulargs=C() + parser.parse_args(sys.argv[1:],namespace=fulargs) + #test input vars + inputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn + posKEGGclmn-=1#correct pos + Kgeneposcolmn-=1 + #~ Get the extra variables + crDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0] + sppPrefx,dinput=crDB[1],crDB[2] + #~ set decimal positions + getcontext().prec = 3 + #make a dictionary of valid genes + dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()]) + sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) + while True:#to crrect names with more than one gene + try: + mgenes=sdGenes.pop() + pthwsAssotd=dKEGGcPthws.pop(mgenes) + mgenes=mgenes.split('.') + for eachg in mgenes: + dKEGGcPthws[eachg]=pthwsAssotd + except: + break + #~ + lPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']] + nPthws=len(lPthwsF) + cPthw=0 + lPthwPthN=[]#the output list for number of paths + lPthwPthAvr=[]#the output list for the length of paths + #~ + while cPthw<nPthws: + cPthw+=1 + KEGGpathw=lPthwsF.pop() + comdKEGGpathw=KEGGpathw.split('.')[0] + tmpddGenrcgenPresent=set() + sKEGGc=dKEGGcPthws.keys() + lsKEGGc=len(sKEGGc) + ctPthw=0 + while ctPthw < lsKEGGc:#to save memory + eachK=sKEGGc.pop() + alPthws=dKEGGcPthws[eachK] + if comdKEGGpathw in alPthws: + tmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK])) + ctPthw+=1 + #~ Make graph calculations + dnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw))) + startNdsID0,endNdsOD0=returnstartanendnodes(edges) + startNdsOri=copy(startNdsID0) + #~ + nPaths='C'#stands for circuit + AvrgPthLen='I'#stand for infinite + if len(startNdsID0)>0 and len(endNdsOD0)>0: + nPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0) + #~ work with the genes in the list + genestodel=set() + lnodes=len(dnodes) + sNds=set(dnodes) + ctPthw=0 + while ctPthw<lnodes: + ctPthw+=1 + cNod=sNds.pop() + sgenes=dnodes.pop(cNod) + if len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes): + genestodel.add(cNod) + #~ del nodes from graph edges + wnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen) + if len(genestodel)>0: + wedges=set([x for x in edges if len(set(x).intersection(genestodel))==0]) + wstartNds,wendNds=returnstartanendnodes(wedges) + if nPaths!='C': + wstartNds=[x for x in wstartNds if x in startNdsOri] + wendNds=[x for x in wendNds if x in endNdsOD0] + if len(wstartNds)>0 and len(wendNds)>0: + wnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds) + #~ Calculate the differences + orNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen + if nPaths=='C': + orNP=Decimal('1000') + oriLen=Decimal('1000') + if wnPaths=='C': + mutNP=Decimal('1000') + mutLen=Decimal('1000') + lPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen + lPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen + doutrnkPthN=rankdN(lPthwPthN) + doutrnkPthAvr=rankdAvr(lPthwPthAvr) + #~ + sall=['\t'.join([doutrnkPthAvr[x[4]],'\t'.join(x)]) for x in doutrnkPthN] + salef=open(output,'w') + salef.write('\n'.join(sall)) + salef.close() + return 0 + + +if __name__ == '__main__': + main() +