Mercurial > repos > miller-lab > genome_diversity

diff calclenchange.py @ 12:4b6590dd7250
Uploaded
author: miller-lab
date: Wed, 12 Sep 2012 17:10:26 -0400
parents: 2c498d40ecde
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/calclenchange.py	Wed Sep 12 17:10:26 2012 -0400
@@ -0,0 +1,280 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       calclenchange.py
+#       
+#       Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>
+#       
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#       
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#       
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+import argparse,mechanize,os,sys
+from decimal import Decimal,getcontext
+from xml.etree.ElementTree import ElementTree,tostring
+import networkx as nx
+from copy import copy
+
+#method to rank the the pthways by mut. freq.
+def rankdN(ltfreqs):
+	ordvals=sorted(ltfreqs)#sort and reverse freqs.
+	#~ 
+	outrnk=[]
+	tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value
+	if tmpOri=='C':
+		if tmpMut!='C':
+			tmpChng0='C-%s'%tmpMut
+		else:
+			tmpChng0=Decimal('0')
+	crank=1
+	outrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw])
+	totalnvals=len(ordvals)
+	cnt=0
+	while totalnvals>cnt:
+		cnt+=1
+		tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()
+		if tmpOri=='C':
+			if tmpMut!='C':
+				tmpChng='C-%s'%tmpMut
+			else:
+				tmpChng=Decimal('0')
+		if tmpChng!=tmpChng0:
+			crank=len(outrnk)+1
+			tmpChng0=tmpChng
+		outrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw])
+	return outrnk
+
+#method to rank the the pthways by mut. freq.
+def rankdAvr(ltfreqs):
+	ordvals=sorted(ltfreqs)#sort and reverse freqs.
+	#~ 
+	outrnk={}
+	tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value
+	if tmpOri=='I':
+		if tmpMut!='I':
+			tmpChng0='I-%s'%tmpMut
+		else:
+			tmpChng0=Decimal('0')
+	crank=1
+	outrnk[tmpPthw]='\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)])
+	totalnvals=len(ordvals)
+	cnt=0
+	while totalnvals>cnt:
+		cnt+=1
+		tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()
+		if tmpOri=='I':
+			if tmpMut!='I':
+				tmpChng='I-%s'%tmpMut
+			else:
+				tmpChng=Decimal('0')
+		if tmpChng!=tmpChng0:
+			crank=len(outrnk)+1
+			tmpChng0=tmpChng
+		outrnk[tmpPthw]='\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)])
+	return outrnk
+
+#this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0
+def returnstartanendnodes(edges):
+	listID0st=set()#starts
+	listOD0en=set()#end
+	for beginNod,endNod in edges:# O(n)
+		listID0st.add(beginNod)
+		listOD0en.add(endNod)
+	startNdsID0=listID0st.difference(listOD0en)
+	endNdsOD0=listOD0en.difference(listID0st)
+	return startNdsID0,endNdsOD0
+
+#~ Method to return nodes and edges
+def returnNodesNEdgesfKXML(fpthwKGXML):
+	#~ 
+	tree = ElementTree()
+	ptree=tree.parse(fpthwKGXML)
+	#~ 
+	title=ptree.get('title')
+	prots=ptree.findall('entry')
+	reactns=ptree.findall('reaction')
+	#~ 
+	edges,ndstmp=set(),set()
+	nreactns=len(reactns)
+	cr=0#count reacts
+	while nreactns>cr:
+		cr+=1
+		reactn=reactns.pop()
+		mainid=reactn.get('id')
+		ndstmp.add(mainid)#add node
+		reacttyp=reactn.get('type')
+		sbstrts=reactn.findall('substrate')
+		while len(sbstrts)>0:
+			csbstrt=sbstrts.pop()
+			csbtsid=csbstrt.get('id')
+			ndstmp.add(csbtsid)#add node
+			if reacttyp=='irreversible':
+				edges.add((csbtsid,mainid))#add edges
+			elif reacttyp=='reversible':
+				edges.add((mainid,csbtsid))#add edges
+				edges.add((csbtsid,mainid))#add edges
+		#~ 
+		prdcts=reactn.findall('product')
+		while len(prdcts)>0:
+			prdct=prdcts.pop()
+			prodctid=prdct.get('id')
+			ndstmp.add(prodctid)#add node
+			if reacttyp=='irreversible':
+				edges.add((mainid,prodctid))#add edges
+			elif reacttyp=='reversible':
+				edges.add((mainid,prodctid))#add edges
+				edges.add((prodctid,mainid))#add edges
+	#~ Nodes
+	nprots=len(prots)
+	cp=0#count prots
+	dnodes={}
+	while nprots>cp:
+		cp+=1
+		prot=prots.pop()
+		tmpProtnm=prot.get('id')
+		if tmpProtnm in ndstmp:
+			dnodes[prot.get('id')]=set(prot.get('name').split())#each genename for each Id
+	return dnodes,edges,title
+
+#~ make calculation on pathways
+def rtrnAvrgLen(edges,strNds,endNds):
+	wG=nx.DiGraph()#reference graph
+	wG.add_edges_from(edges)
+	dPairsSrcSnks=nx.all_pairs_shortest_path_length(wG)#dictionary between sources and sink and length
+	nstartNdsID0=len(strNds)
+	cstrtNds=0
+	nPaths=0
+	lPathLen=[]
+	while nstartNdsID0>cstrtNds:
+		cStartNd=strNds.pop()#current start node
+		dEndNdsLen=dPairsSrcSnks.pop(cStartNd)
+		for cendNd in dEndNdsLen:
+			if cendNd in endNds:
+				lPathLen.append(dEndNdsLen[cendNd])
+				nPaths+=1
+		cstrtNds+=1
+	AvrgPthLen=0
+	if nPaths!=0:
+		AvrgPthLen=Decimal(sum(lPathLen))/Decimal(str(nPaths))
+	return nPaths,AvrgPthLen
+
+def main():
+	parser = argparse.ArgumentParser(description='Rank pathways based on the change in length and number of paths connecting sources and sinks.')
+	parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database')
+	parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file')
+	parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the diference between column 2 and column 3, Column 2 is the pathway average length (between sources and sinks) including the genes in the input list, Column 3 is the pathway average length EXCLUDING the genes in the input list, Column 4 is the rank based on column 1. Column 5 is the diference between column 6 and column 7, Column 6 is the number of paths between sources and sinks, including the genes in the input list, Column 7 is the number of paths between sources and sinks EXCLUDING the genes in the input list, Column 8 is the rank based on column 5. Column 9 I the pathway name' )
+	parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name')
+	parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code')
+	parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')
+	#~ 
+	#~Open arguments 
+	class C(object):
+		pass
+	fulargs=C()
+	parser.parse_args(sys.argv[1:],namespace=fulargs)
+	#test input vars
+	inputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn
+	posKEGGclmn-=1#correct pos
+	Kgeneposcolmn-=1
+	#~ Get the extra variables
+	crDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0]
+	sppPrefx,dinput=crDB[1],crDB[2]
+	#~ set decimal positions
+	getcontext().prec = 3
+	#make a dictionary of valid genes
+	dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()])
+	sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1])
+	while True:#to crrect names with more than one gene
+		try:
+			mgenes=sdGenes.pop()
+			pthwsAssotd=dKEGGcPthws.pop(mgenes)
+			mgenes=mgenes.split('.')
+			for eachg in mgenes:
+				dKEGGcPthws[eachg]=pthwsAssotd
+		except:
+			break
+	#~ 
+	lPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']]
+	nPthws=len(lPthwsF)
+	cPthw=0
+	lPthwPthN=[]#the output list for number of paths
+	lPthwPthAvr=[]#the output list for the length of paths
+	#~ 
+	while cPthw<nPthws:
+		cPthw+=1
+		KEGGpathw=lPthwsF.pop()
+		comdKEGGpathw=KEGGpathw.split('.')[0]
+		tmpddGenrcgenPresent=set()
+		sKEGGc=dKEGGcPthws.keys()
+		lsKEGGc=len(sKEGGc)
+		ctPthw=0
+		while ctPthw < lsKEGGc:#to save memory
+			eachK=sKEGGc.pop()
+			alPthws=dKEGGcPthws[eachK]
+			if comdKEGGpathw in alPthws:
+				tmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK]))
+			ctPthw+=1
+		#~ Make graph calculations	
+		dnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw)))
+		startNdsID0,endNdsOD0=returnstartanendnodes(edges)
+		startNdsOri=copy(startNdsID0)
+		#~ 
+		nPaths='C'#stands for circuit
+		AvrgPthLen='I'#stand for infinite
+		if len(startNdsID0)>0 and len(endNdsOD0)>0:
+			nPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0)
+		#~ work with the genes in the list
+		genestodel=set()
+		lnodes=len(dnodes)
+		sNds=set(dnodes)
+		ctPthw=0
+		while ctPthw<lnodes:
+			ctPthw+=1
+			cNod=sNds.pop()
+			sgenes=dnodes.pop(cNod)
+			if len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes):
+				genestodel.add(cNod)
+		#~ del nodes from graph edges
+		wnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen)
+		if len(genestodel)>0:
+			wedges=set([x for x in edges if len(set(x).intersection(genestodel))==0])
+			wstartNds,wendNds=returnstartanendnodes(wedges)
+			if nPaths!='C':
+				wstartNds=[x for x in wstartNds if x in startNdsOri]
+				wendNds=[x for x in wendNds if x in endNdsOD0]
+			if len(wstartNds)>0 and len(wendNds)>0:
+				wnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds)
+		#~ Calculate the differences
+		orNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen
+		if nPaths=='C':
+			orNP=Decimal('1000')
+			oriLen=Decimal('1000')
+		if wnPaths=='C':
+			mutNP=Decimal('1000')
+			mutLen=Decimal('1000')
+		lPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen
+		lPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen
+	doutrnkPthN=rankdN(lPthwPthN)
+	doutrnkPthAvr=rankdAvr(lPthwPthAvr)
+	#~ 
+	sall=['\t'.join([doutrnkPthAvr[x[4]],'\t'.join(x)]) for x in doutrnkPthN]
+	salef=open(output,'w')
+	salef.write('\n'.join(sall))
+	salef.close()
+	return 0
+	
+
+if __name__ == '__main__':
+	main()
+
author	miller-lab
date	Wed, 12 Sep 2012 17:10:26 -0400
parents	2c498d40ecde
children