comparison calclenchange.py @ 17:a3af29edcce2

Uploaded Miller Lab Devshed version a51c894f5bed
author miller-lab
date Fri, 28 Sep 2012 11:57:18 -0400
parents 2c498d40ecde
children
comparison
equal deleted inserted replaced
16:be0e2223c531 17:a3af29edcce2
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # calclenchange.py
5 #
6 # Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 # MA 02110-1301, USA.
22
23 import argparse,mechanize,os,sys
24 from decimal import Decimal,getcontext
25 from xml.etree.ElementTree import ElementTree,tostring
26 import networkx as nx
27 from copy import copy
28
29 #method to rank the the pthways by mut. freq.
30 def rankdN(ltfreqs):
31 ordvals=sorted(ltfreqs)#sort and reverse freqs.
32 #~
33 outrnk=[]
34 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value
35 if tmpOri=='C':
36 if tmpMut!='C':
37 tmpChng0='C-%s'%tmpMut
38 else:
39 tmpChng0=Decimal('0')
40 crank=1
41 outrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw])
42 totalnvals=len(ordvals)
43 cnt=0
44 while totalnvals>cnt:
45 cnt+=1
46 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()
47 if tmpOri=='C':
48 if tmpMut!='C':
49 tmpChng='C-%s'%tmpMut
50 else:
51 tmpChng=Decimal('0')
52 if tmpChng!=tmpChng0:
53 crank=len(outrnk)+1
54 tmpChng0=tmpChng
55 outrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw])
56 return outrnk
57
58 #method to rank the the pthways by mut. freq.
59 def rankdAvr(ltfreqs):
60 ordvals=sorted(ltfreqs)#sort and reverse freqs.
61 #~
62 outrnk={}
63 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value
64 if tmpOri=='I':
65 if tmpMut!='I':
66 tmpChng0='I-%s'%tmpMut
67 else:
68 tmpChng0=Decimal('0')
69 crank=1
70 outrnk[tmpPthw]='\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)])
71 totalnvals=len(ordvals)
72 cnt=0
73 while totalnvals>cnt:
74 cnt+=1
75 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()
76 if tmpOri=='I':
77 if tmpMut!='I':
78 tmpChng='I-%s'%tmpMut
79 else:
80 tmpChng=Decimal('0')
81 if tmpChng!=tmpChng0:
82 crank=len(outrnk)+1
83 tmpChng0=tmpChng
84 outrnk[tmpPthw]='\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)])
85 return outrnk
86
87 #this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0
88 def returnstartanendnodes(edges):
89 listID0st=set()#starts
90 listOD0en=set()#end
91 for beginNod,endNod in edges:# O(n)
92 listID0st.add(beginNod)
93 listOD0en.add(endNod)
94 startNdsID0=listID0st.difference(listOD0en)
95 endNdsOD0=listOD0en.difference(listID0st)
96 return startNdsID0,endNdsOD0
97
98 #~ Method to return nodes and edges
99 def returnNodesNEdgesfKXML(fpthwKGXML):
100 #~
101 tree = ElementTree()
102 ptree=tree.parse(fpthwKGXML)
103 #~
104 title=ptree.get('title')
105 prots=ptree.findall('entry')
106 reactns=ptree.findall('reaction')
107 #~
108 edges,ndstmp=set(),set()
109 nreactns=len(reactns)
110 cr=0#count reacts
111 while nreactns>cr:
112 cr+=1
113 reactn=reactns.pop()
114 mainid=reactn.get('id')
115 ndstmp.add(mainid)#add node
116 reacttyp=reactn.get('type')
117 sbstrts=reactn.findall('substrate')
118 while len(sbstrts)>0:
119 csbstrt=sbstrts.pop()
120 csbtsid=csbstrt.get('id')
121 ndstmp.add(csbtsid)#add node
122 if reacttyp=='irreversible':
123 edges.add((csbtsid,mainid))#add edges
124 elif reacttyp=='reversible':
125 edges.add((mainid,csbtsid))#add edges
126 edges.add((csbtsid,mainid))#add edges
127 #~
128 prdcts=reactn.findall('product')
129 while len(prdcts)>0:
130 prdct=prdcts.pop()
131 prodctid=prdct.get('id')
132 ndstmp.add(prodctid)#add node
133 if reacttyp=='irreversible':
134 edges.add((mainid,prodctid))#add edges
135 elif reacttyp=='reversible':
136 edges.add((mainid,prodctid))#add edges
137 edges.add((prodctid,mainid))#add edges
138 #~ Nodes
139 nprots=len(prots)
140 cp=0#count prots
141 dnodes={}
142 while nprots>cp:
143 cp+=1
144 prot=prots.pop()
145 tmpProtnm=prot.get('id')
146 if tmpProtnm in ndstmp:
147 dnodes[prot.get('id')]=set(prot.get('name').split())#each genename for each Id
148 return dnodes,edges,title
149
150 #~ make calculation on pathways
151 def rtrnAvrgLen(edges,strNds,endNds):
152 wG=nx.DiGraph()#reference graph
153 wG.add_edges_from(edges)
154 dPairsSrcSnks=nx.all_pairs_shortest_path_length(wG)#dictionary between sources and sink and length
155 nstartNdsID0=len(strNds)
156 cstrtNds=0
157 nPaths=0
158 lPathLen=[]
159 while nstartNdsID0>cstrtNds:
160 cStartNd=strNds.pop()#current start node
161 dEndNdsLen=dPairsSrcSnks.pop(cStartNd)
162 for cendNd in dEndNdsLen:
163 if cendNd in endNds:
164 lPathLen.append(dEndNdsLen[cendNd])
165 nPaths+=1
166 cstrtNds+=1
167 AvrgPthLen=0
168 if nPaths!=0:
169 AvrgPthLen=Decimal(sum(lPathLen))/Decimal(str(nPaths))
170 return nPaths,AvrgPthLen
171
172 def main():
173 parser = argparse.ArgumentParser(description='Rank pathways based on the change in length and number of paths connecting sources and sinks.')
174 parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database')
175 parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file')
176 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the diference between column 2 and column 3, Column 2 is the pathway average length (between sources and sinks) including the genes in the input list, Column 3 is the pathway average length EXCLUDING the genes in the input list, Column 4 is the rank based on column 1. Column 5 is the diference between column 6 and column 7, Column 6 is the number of paths between sources and sinks, including the genes in the input list, Column 7 is the number of paths between sources and sinks EXCLUDING the genes in the input list, Column 8 is the rank based on column 5. Column 9 I the pathway name' )
177 parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name')
178 parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code')
179 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')
180 #~
181 #~Open arguments
182 class C(object):
183 pass
184 fulargs=C()
185 parser.parse_args(sys.argv[1:],namespace=fulargs)
186 #test input vars
187 inputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn
188 posKEGGclmn-=1#correct pos
189 Kgeneposcolmn-=1
190 #~ Get the extra variables
191 crDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0]
192 sppPrefx,dinput=crDB[1],crDB[2]
193 #~ set decimal positions
194 getcontext().prec = 3
195 #make a dictionary of valid genes
196 dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()])
197 sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1])
198 while True:#to crrect names with more than one gene
199 try:
200 mgenes=sdGenes.pop()
201 pthwsAssotd=dKEGGcPthws.pop(mgenes)
202 mgenes=mgenes.split('.')
203 for eachg in mgenes:
204 dKEGGcPthws[eachg]=pthwsAssotd
205 except:
206 break
207 #~
208 lPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']]
209 nPthws=len(lPthwsF)
210 cPthw=0
211 lPthwPthN=[]#the output list for number of paths
212 lPthwPthAvr=[]#the output list for the length of paths
213 #~
214 while cPthw<nPthws:
215 cPthw+=1
216 KEGGpathw=lPthwsF.pop()
217 comdKEGGpathw=KEGGpathw.split('.')[0]
218 tmpddGenrcgenPresent=set()
219 sKEGGc=dKEGGcPthws.keys()
220 lsKEGGc=len(sKEGGc)
221 ctPthw=0
222 while ctPthw < lsKEGGc:#to save memory
223 eachK=sKEGGc.pop()
224 alPthws=dKEGGcPthws[eachK]
225 if comdKEGGpathw in alPthws:
226 tmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK]))
227 ctPthw+=1
228 #~ Make graph calculations
229 dnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw)))
230 startNdsID0,endNdsOD0=returnstartanendnodes(edges)
231 startNdsOri=copy(startNdsID0)
232 #~
233 nPaths='C'#stands for circuit
234 AvrgPthLen='I'#stand for infinite
235 if len(startNdsID0)>0 and len(endNdsOD0)>0:
236 nPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0)
237 #~ work with the genes in the list
238 genestodel=set()
239 lnodes=len(dnodes)
240 sNds=set(dnodes)
241 ctPthw=0
242 while ctPthw<lnodes:
243 ctPthw+=1
244 cNod=sNds.pop()
245 sgenes=dnodes.pop(cNod)
246 if len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes):
247 genestodel.add(cNod)
248 #~ del nodes from graph edges
249 wnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen)
250 if len(genestodel)>0:
251 wedges=set([x for x in edges if len(set(x).intersection(genestodel))==0])
252 wstartNds,wendNds=returnstartanendnodes(wedges)
253 if nPaths!='C':
254 wstartNds=[x for x in wstartNds if x in startNdsOri]
255 wendNds=[x for x in wendNds if x in endNdsOD0]
256 if len(wstartNds)>0 and len(wendNds)>0:
257 wnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds)
258 #~ Calculate the differences
259 orNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen
260 if nPaths=='C':
261 orNP=Decimal('1000')
262 oriLen=Decimal('1000')
263 if wnPaths=='C':
264 mutNP=Decimal('1000')
265 mutLen=Decimal('1000')
266 lPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen
267 lPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen
268 doutrnkPthN=rankdN(lPthwPthN)
269 doutrnkPthAvr=rankdAvr(lPthwPthAvr)
270 #~
271 sall=['\t'.join([doutrnkPthAvr[x[4]],'\t'.join(x)]) for x in doutrnkPthN]
272 salef=open(output,'w')
273 salef.write('\n'.join(sall))
274 salef.close()
275 return 0
276
277
278 if __name__ == '__main__':
279 main()
280