Mercurial > repos > miller-lab > genome_diversity
comparison calclenchange.py @ 0:2c498d40ecde
Uploaded
author | miller-lab |
---|---|
date | Mon, 09 Apr 2012 12:03:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2c498d40ecde |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 # | |
4 # calclenchange.py | |
5 # | |
6 # Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu> | |
7 # | |
8 # This program is free software; you can redistribute it and/or modify | |
9 # it under the terms of the GNU General Public License as published by | |
10 # the Free Software Foundation; either version 2 of the License, or | |
11 # (at your option) any later version. | |
12 # | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU General Public License for more details. | |
17 # | |
18 # You should have received a copy of the GNU General Public License | |
19 # along with this program; if not, write to the Free Software | |
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
21 # MA 02110-1301, USA. | |
22 | |
23 import argparse,mechanize,os,sys | |
24 from decimal import Decimal,getcontext | |
25 from xml.etree.ElementTree import ElementTree,tostring | |
26 import networkx as nx | |
27 from copy import copy | |
28 | |
29 #method to rank the the pthways by mut. freq. | |
30 def rankdN(ltfreqs): | |
31 ordvals=sorted(ltfreqs)#sort and reverse freqs. | |
32 #~ | |
33 outrnk=[] | |
34 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value | |
35 if tmpOri=='C': | |
36 if tmpMut!='C': | |
37 tmpChng0='C-%s'%tmpMut | |
38 else: | |
39 tmpChng0=Decimal('0') | |
40 crank=1 | |
41 outrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw]) | |
42 totalnvals=len(ordvals) | |
43 cnt=0 | |
44 while totalnvals>cnt: | |
45 cnt+=1 | |
46 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop() | |
47 if tmpOri=='C': | |
48 if tmpMut!='C': | |
49 tmpChng='C-%s'%tmpMut | |
50 else: | |
51 tmpChng=Decimal('0') | |
52 if tmpChng!=tmpChng0: | |
53 crank=len(outrnk)+1 | |
54 tmpChng0=tmpChng | |
55 outrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw]) | |
56 return outrnk | |
57 | |
58 #method to rank the the pthways by mut. freq. | |
59 def rankdAvr(ltfreqs): | |
60 ordvals=sorted(ltfreqs)#sort and reverse freqs. | |
61 #~ | |
62 outrnk={} | |
63 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value | |
64 if tmpOri=='I': | |
65 if tmpMut!='I': | |
66 tmpChng0='I-%s'%tmpMut | |
67 else: | |
68 tmpChng0=Decimal('0') | |
69 crank=1 | |
70 outrnk[tmpPthw]='\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)]) | |
71 totalnvals=len(ordvals) | |
72 cnt=0 | |
73 while totalnvals>cnt: | |
74 cnt+=1 | |
75 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop() | |
76 if tmpOri=='I': | |
77 if tmpMut!='I': | |
78 tmpChng='I-%s'%tmpMut | |
79 else: | |
80 tmpChng=Decimal('0') | |
81 if tmpChng!=tmpChng0: | |
82 crank=len(outrnk)+1 | |
83 tmpChng0=tmpChng | |
84 outrnk[tmpPthw]='\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)]) | |
85 return outrnk | |
86 | |
87 #this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0 | |
88 def returnstartanendnodes(edges): | |
89 listID0st=set()#starts | |
90 listOD0en=set()#end | |
91 for beginNod,endNod in edges:# O(n) | |
92 listID0st.add(beginNod) | |
93 listOD0en.add(endNod) | |
94 startNdsID0=listID0st.difference(listOD0en) | |
95 endNdsOD0=listOD0en.difference(listID0st) | |
96 return startNdsID0,endNdsOD0 | |
97 | |
98 #~ Method to return nodes and edges | |
99 def returnNodesNEdgesfKXML(fpthwKGXML): | |
100 #~ | |
101 tree = ElementTree() | |
102 ptree=tree.parse(fpthwKGXML) | |
103 #~ | |
104 title=ptree.get('title') | |
105 prots=ptree.findall('entry') | |
106 reactns=ptree.findall('reaction') | |
107 #~ | |
108 edges,ndstmp=set(),set() | |
109 nreactns=len(reactns) | |
110 cr=0#count reacts | |
111 while nreactns>cr: | |
112 cr+=1 | |
113 reactn=reactns.pop() | |
114 mainid=reactn.get('id') | |
115 ndstmp.add(mainid)#add node | |
116 reacttyp=reactn.get('type') | |
117 sbstrts=reactn.findall('substrate') | |
118 while len(sbstrts)>0: | |
119 csbstrt=sbstrts.pop() | |
120 csbtsid=csbstrt.get('id') | |
121 ndstmp.add(csbtsid)#add node | |
122 if reacttyp=='irreversible': | |
123 edges.add((csbtsid,mainid))#add edges | |
124 elif reacttyp=='reversible': | |
125 edges.add((mainid,csbtsid))#add edges | |
126 edges.add((csbtsid,mainid))#add edges | |
127 #~ | |
128 prdcts=reactn.findall('product') | |
129 while len(prdcts)>0: | |
130 prdct=prdcts.pop() | |
131 prodctid=prdct.get('id') | |
132 ndstmp.add(prodctid)#add node | |
133 if reacttyp=='irreversible': | |
134 edges.add((mainid,prodctid))#add edges | |
135 elif reacttyp=='reversible': | |
136 edges.add((mainid,prodctid))#add edges | |
137 edges.add((prodctid,mainid))#add edges | |
138 #~ Nodes | |
139 nprots=len(prots) | |
140 cp=0#count prots | |
141 dnodes={} | |
142 while nprots>cp: | |
143 cp+=1 | |
144 prot=prots.pop() | |
145 tmpProtnm=prot.get('id') | |
146 if tmpProtnm in ndstmp: | |
147 dnodes[prot.get('id')]=set(prot.get('name').split())#each genename for each Id | |
148 return dnodes,edges,title | |
149 | |
150 #~ make calculation on pathways | |
151 def rtrnAvrgLen(edges,strNds,endNds): | |
152 wG=nx.DiGraph()#reference graph | |
153 wG.add_edges_from(edges) | |
154 dPairsSrcSnks=nx.all_pairs_shortest_path_length(wG)#dictionary between sources and sink and length | |
155 nstartNdsID0=len(strNds) | |
156 cstrtNds=0 | |
157 nPaths=0 | |
158 lPathLen=[] | |
159 while nstartNdsID0>cstrtNds: | |
160 cStartNd=strNds.pop()#current start node | |
161 dEndNdsLen=dPairsSrcSnks.pop(cStartNd) | |
162 for cendNd in dEndNdsLen: | |
163 if cendNd in endNds: | |
164 lPathLen.append(dEndNdsLen[cendNd]) | |
165 nPaths+=1 | |
166 cstrtNds+=1 | |
167 AvrgPthLen=0 | |
168 if nPaths!=0: | |
169 AvrgPthLen=Decimal(sum(lPathLen))/Decimal(str(nPaths)) | |
170 return nPaths,AvrgPthLen | |
171 | |
172 def main(): | |
173 parser = argparse.ArgumentParser(description='Rank pathways based on the change in length and number of paths connecting sources and sinks.') | |
174 parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database') | |
175 parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file') | |
176 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the diference between column 2 and column 3, Column 2 is the pathway average length (between sources and sinks) including the genes in the input list, Column 3 is the pathway average length EXCLUDING the genes in the input list, Column 4 is the rank based on column 1. Column 5 is the diference between column 6 and column 7, Column 6 is the number of paths between sources and sinks, including the genes in the input list, Column 7 is the number of paths between sources and sinks EXCLUDING the genes in the input list, Column 8 is the rank based on column 5. Column 9 I the pathway name' ) | |
177 parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') | |
178 parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') | |
179 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') | |
180 #~ | |
181 #~Open arguments | |
182 class C(object): | |
183 pass | |
184 fulargs=C() | |
185 parser.parse_args(sys.argv[1:],namespace=fulargs) | |
186 #test input vars | |
187 inputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn | |
188 posKEGGclmn-=1#correct pos | |
189 Kgeneposcolmn-=1 | |
190 #~ Get the extra variables | |
191 crDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0] | |
192 sppPrefx,dinput=crDB[1],crDB[2] | |
193 #~ set decimal positions | |
194 getcontext().prec = 3 | |
195 #make a dictionary of valid genes | |
196 dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()]) | |
197 sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) | |
198 while True:#to crrect names with more than one gene | |
199 try: | |
200 mgenes=sdGenes.pop() | |
201 pthwsAssotd=dKEGGcPthws.pop(mgenes) | |
202 mgenes=mgenes.split('.') | |
203 for eachg in mgenes: | |
204 dKEGGcPthws[eachg]=pthwsAssotd | |
205 except: | |
206 break | |
207 #~ | |
208 lPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']] | |
209 nPthws=len(lPthwsF) | |
210 cPthw=0 | |
211 lPthwPthN=[]#the output list for number of paths | |
212 lPthwPthAvr=[]#the output list for the length of paths | |
213 #~ | |
214 while cPthw<nPthws: | |
215 cPthw+=1 | |
216 KEGGpathw=lPthwsF.pop() | |
217 comdKEGGpathw=KEGGpathw.split('.')[0] | |
218 tmpddGenrcgenPresent=set() | |
219 sKEGGc=dKEGGcPthws.keys() | |
220 lsKEGGc=len(sKEGGc) | |
221 ctPthw=0 | |
222 while ctPthw < lsKEGGc:#to save memory | |
223 eachK=sKEGGc.pop() | |
224 alPthws=dKEGGcPthws[eachK] | |
225 if comdKEGGpathw in alPthws: | |
226 tmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK])) | |
227 ctPthw+=1 | |
228 #~ Make graph calculations | |
229 dnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw))) | |
230 startNdsID0,endNdsOD0=returnstartanendnodes(edges) | |
231 startNdsOri=copy(startNdsID0) | |
232 #~ | |
233 nPaths='C'#stands for circuit | |
234 AvrgPthLen='I'#stand for infinite | |
235 if len(startNdsID0)>0 and len(endNdsOD0)>0: | |
236 nPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0) | |
237 #~ work with the genes in the list | |
238 genestodel=set() | |
239 lnodes=len(dnodes) | |
240 sNds=set(dnodes) | |
241 ctPthw=0 | |
242 while ctPthw<lnodes: | |
243 ctPthw+=1 | |
244 cNod=sNds.pop() | |
245 sgenes=dnodes.pop(cNod) | |
246 if len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes): | |
247 genestodel.add(cNod) | |
248 #~ del nodes from graph edges | |
249 wnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen) | |
250 if len(genestodel)>0: | |
251 wedges=set([x for x in edges if len(set(x).intersection(genestodel))==0]) | |
252 wstartNds,wendNds=returnstartanendnodes(wedges) | |
253 if nPaths!='C': | |
254 wstartNds=[x for x in wstartNds if x in startNdsOri] | |
255 wendNds=[x for x in wendNds if x in endNdsOD0] | |
256 if len(wstartNds)>0 and len(wendNds)>0: | |
257 wnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds) | |
258 #~ Calculate the differences | |
259 orNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen | |
260 if nPaths=='C': | |
261 orNP=Decimal('1000') | |
262 oriLen=Decimal('1000') | |
263 if wnPaths=='C': | |
264 mutNP=Decimal('1000') | |
265 mutLen=Decimal('1000') | |
266 lPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen | |
267 lPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen | |
268 doutrnkPthN=rankdN(lPthwPthN) | |
269 doutrnkPthAvr=rankdAvr(lPthwPthAvr) | |
270 #~ | |
271 sall=['\t'.join([doutrnkPthAvr[x[4]],'\t'.join(x)]) for x in doutrnkPthN] | |
272 salef=open(output,'w') | |
273 salef.write('\n'.join(sall)) | |
274 salef.close() | |
275 return 0 | |
276 | |
277 | |
278 if __name__ == '__main__': | |
279 main() | |
280 |