comparison altschulEriksonDinuclShuffle.py @ 1:f9d894fb823c

Uploaded
author xuebing
date Sat, 31 Mar 2012 21:47:59 -0400
parents
children
comparison
equal deleted inserted replaced
0:aad0a5d475ce 1:f9d894fb823c
1 #! /usr/bin/env python
2
3 # altschulEriksonDinuclShuffle.py
4 # P. Clote, Oct 2003
5 # NOTE: One cannot use function "count(s,word)" to count the number
6 # of occurrences of dinucleotide word in string s, since the built-in
7 # function counts only nonoverlapping words, presumably in a left to
8 # right fashion.
9
10
11 import sys,string,random
12
13
14
15 def computeCountAndLists(s):
16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU
17 #since it apparently counts only nonoverlapping words UU
18 #For this reason, we work with the indices.
19
20 #Initialize lists and mono- and dinucleotide dictionaries
21 List = {} #List is a dictionary of lists
22 List['A'] = []; List['C'] = [];
23 List['G'] = []; List['T'] = [];
24 nuclList = ["A","C","G","T"]
25 s = s.upper()
26 s = s.replace("U","T")
27 nuclCnt = {} #empty dictionary
28 dinuclCnt = {} #empty dictionary
29 for x in nuclList:
30 nuclCnt[x]=0
31 dinuclCnt[x]={}
32 for y in nuclList:
33 dinuclCnt[x][y]=0
34
35 #Compute count and lists
36 nuclCnt[s[0]] = 1
37 nuclTotal = 1
38 dinuclTotal = 0
39 for i in range(len(s)-1):
40 x = s[i]; y = s[i+1]
41 List[x].append( y )
42 nuclCnt[y] += 1; nuclTotal += 1
43 dinuclCnt[x][y] += 1; dinuclTotal += 1
44 assert (nuclTotal==len(s))
45 assert (dinuclTotal==len(s)-1)
46 return nuclCnt,dinuclCnt,List
47
48
49 def chooseEdge(x,dinuclCnt):
50 numInList = 0
51 for y in ['A','C','G','T']:
52 numInList += dinuclCnt[x][y]
53 z = random.random()
54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
55 numerator = dinuclCnt[x]['A']
56 if z < float(numerator)/float(denom):
57 dinuclCnt[x]['A'] -= 1
58 return 'A'
59 numerator += dinuclCnt[x]['C']
60 if z < float(numerator)/float(denom):
61 dinuclCnt[x]['C'] -= 1
62 return 'C'
63 numerator += dinuclCnt[x]['G']
64 if z < float(numerator)/float(denom):
65 dinuclCnt[x]['G'] -= 1
66 return 'G'
67 dinuclCnt[x]['T'] -= 1
68 return 'T'
69
70 def connectedToLast(edgeList,nuclList,lastCh):
71 D = {}
72 for x in nuclList: D[x]=0
73 for edge in edgeList:
74 a = edge[0]; b = edge[1]
75 if b==lastCh: D[a]=1
76 for i in range(2):
77 for edge in edgeList:
78 a = edge[0]; b = edge[1]
79 if D[b]==1: D[a]=1
80 ok = 0
81 for x in nuclList:
82 if x!=lastCh and D[x]==0: return 0
83 return 1
84
85
86
87 def eulerian(s):
88 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
89 #compute nucleotides appearing in s
90 nuclList = []
91 for x in ["A","C","G","T"]:
92 if x in s: nuclList.append(x)
93 #compute numInList[x] = number of dinucleotides beginning with x
94 numInList = {}
95 for x in nuclList:
96 numInList[x]=0
97 for y in nuclList:
98 numInList[x] += dinuclCnt[x][y]
99 #create dinucleotide shuffle L
100 firstCh = s[0] #start with first letter of s
101 lastCh = s[-1]
102 edgeList = []
103 for x in nuclList:
104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
105 ok = connectedToLast(edgeList,nuclList,lastCh)
106 return ok,edgeList,nuclList,lastCh
107
108
109 def shuffleEdgeList(L):
110 n = len(L); barrier = n
111 for i in range(n-1):
112 z = int(random.random() * barrier)
113 tmp = L[z]
114 L[z]= L[barrier-1]
115 L[barrier-1] = tmp
116 barrier -= 1
117 return L
118
119 def dinuclShuffle(s):
120 ok = 0
121 while not ok:
122 ok,edgeList,nuclList,lastCh = eulerian(s)
123 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
124
125 #remove last edges from each vertex list, shuffle, then add back
126 #the removed edges at end of vertex lists.
127 for [x,y] in edgeList: List[x].remove(y)
128 for x in nuclList: shuffleEdgeList(List[x])
129 for [x,y] in edgeList: List[x].append(y)
130
131 #construct the eulerian path
132 L = [s[0]]; prevCh = s[0]
133 for i in range(len(s)-2):
134 ch = List[prevCh][0]
135 L.append( ch )
136 del List[prevCh][0]
137 prevCh = ch
138 L.append(s[-1])
139 t = string.join(L,"")
140 return t
141
142
143
144 if __name__ == '__main__':
145 if len(sys.argv)!=3:
146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
147 sys.exit(1)
148 s = sys.argv[1].upper()
149 for i in range(int(sys.argv[2])):
150 print dinuclShuffle(s)