1
|
1 #! /usr/bin/env python
|
|
2
|
|
3 # altschulEriksonDinuclShuffle.py
|
|
4 # P. Clote, Oct 2003
|
|
5 # NOTE: One cannot use function "count(s,word)" to count the number
|
|
6 # of occurrences of dinucleotide word in string s, since the built-in
|
|
7 # function counts only nonoverlapping words, presumably in a left to
|
|
8 # right fashion.
|
|
9
|
|
10
|
|
11 import sys,string,random
|
|
12
|
|
13
|
|
14
|
|
15 def computeCountAndLists(s):
|
|
16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU
|
|
17 #since it apparently counts only nonoverlapping words UU
|
|
18 #For this reason, we work with the indices.
|
|
19
|
|
20 #Initialize lists and mono- and dinucleotide dictionaries
|
|
21 List = {} #List is a dictionary of lists
|
|
22 List['A'] = []; List['C'] = [];
|
|
23 List['G'] = []; List['T'] = [];
|
|
24 nuclList = ["A","C","G","T"]
|
|
25 s = s.upper()
|
|
26 s = s.replace("U","T")
|
|
27 nuclCnt = {} #empty dictionary
|
|
28 dinuclCnt = {} #empty dictionary
|
|
29 for x in nuclList:
|
|
30 nuclCnt[x]=0
|
|
31 dinuclCnt[x]={}
|
|
32 for y in nuclList:
|
|
33 dinuclCnt[x][y]=0
|
|
34
|
|
35 #Compute count and lists
|
|
36 nuclCnt[s[0]] = 1
|
|
37 nuclTotal = 1
|
|
38 dinuclTotal = 0
|
|
39 for i in range(len(s)-1):
|
|
40 x = s[i]; y = s[i+1]
|
|
41 List[x].append( y )
|
|
42 nuclCnt[y] += 1; nuclTotal += 1
|
|
43 dinuclCnt[x][y] += 1; dinuclTotal += 1
|
|
44 assert (nuclTotal==len(s))
|
|
45 assert (dinuclTotal==len(s)-1)
|
|
46 return nuclCnt,dinuclCnt,List
|
|
47
|
|
48
|
|
49 def chooseEdge(x,dinuclCnt):
|
|
50 numInList = 0
|
|
51 for y in ['A','C','G','T']:
|
|
52 numInList += dinuclCnt[x][y]
|
|
53 z = random.random()
|
|
54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
|
|
55 numerator = dinuclCnt[x]['A']
|
|
56 if z < float(numerator)/float(denom):
|
|
57 dinuclCnt[x]['A'] -= 1
|
|
58 return 'A'
|
|
59 numerator += dinuclCnt[x]['C']
|
|
60 if z < float(numerator)/float(denom):
|
|
61 dinuclCnt[x]['C'] -= 1
|
|
62 return 'C'
|
|
63 numerator += dinuclCnt[x]['G']
|
|
64 if z < float(numerator)/float(denom):
|
|
65 dinuclCnt[x]['G'] -= 1
|
|
66 return 'G'
|
|
67 dinuclCnt[x]['T'] -= 1
|
|
68 return 'T'
|
|
69
|
|
70 def connectedToLast(edgeList,nuclList,lastCh):
|
|
71 D = {}
|
|
72 for x in nuclList: D[x]=0
|
|
73 for edge in edgeList:
|
|
74 a = edge[0]; b = edge[1]
|
|
75 if b==lastCh: D[a]=1
|
|
76 for i in range(2):
|
|
77 for edge in edgeList:
|
|
78 a = edge[0]; b = edge[1]
|
|
79 if D[b]==1: D[a]=1
|
|
80 ok = 0
|
|
81 for x in nuclList:
|
|
82 if x!=lastCh and D[x]==0: return 0
|
|
83 return 1
|
|
84
|
|
85
|
|
86
|
|
87 def eulerian(s):
|
|
88 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
|
|
89 #compute nucleotides appearing in s
|
|
90 nuclList = []
|
|
91 for x in ["A","C","G","T"]:
|
|
92 if x in s: nuclList.append(x)
|
|
93 #compute numInList[x] = number of dinucleotides beginning with x
|
|
94 numInList = {}
|
|
95 for x in nuclList:
|
|
96 numInList[x]=0
|
|
97 for y in nuclList:
|
|
98 numInList[x] += dinuclCnt[x][y]
|
|
99 #create dinucleotide shuffle L
|
|
100 firstCh = s[0] #start with first letter of s
|
|
101 lastCh = s[-1]
|
|
102 edgeList = []
|
|
103 for x in nuclList:
|
|
104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
|
|
105 ok = connectedToLast(edgeList,nuclList,lastCh)
|
|
106 return ok,edgeList,nuclList,lastCh
|
|
107
|
|
108
|
|
109 def shuffleEdgeList(L):
|
|
110 n = len(L); barrier = n
|
|
111 for i in range(n-1):
|
|
112 z = int(random.random() * barrier)
|
|
113 tmp = L[z]
|
|
114 L[z]= L[barrier-1]
|
|
115 L[barrier-1] = tmp
|
|
116 barrier -= 1
|
|
117 return L
|
|
118
|
|
119 def dinuclShuffle(s):
|
|
120 ok = 0
|
|
121 while not ok:
|
|
122 ok,edgeList,nuclList,lastCh = eulerian(s)
|
|
123 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
|
|
124
|
|
125 #remove last edges from each vertex list, shuffle, then add back
|
|
126 #the removed edges at end of vertex lists.
|
|
127 for [x,y] in edgeList: List[x].remove(y)
|
|
128 for x in nuclList: shuffleEdgeList(List[x])
|
|
129 for [x,y] in edgeList: List[x].append(y)
|
|
130
|
|
131 #construct the eulerian path
|
|
132 L = [s[0]]; prevCh = s[0]
|
|
133 for i in range(len(s)-2):
|
|
134 ch = List[prevCh][0]
|
|
135 L.append( ch )
|
|
136 del List[prevCh][0]
|
|
137 prevCh = ch
|
|
138 L.append(s[-1])
|
|
139 t = string.join(L,"")
|
|
140 return t
|
|
141
|
|
142
|
|
143
|
|
144 if __name__ == '__main__':
|
|
145 if len(sys.argv)!=3:
|
|
146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
|
|
147 sys.exit(1)
|
|
148 s = sys.argv[1].upper()
|
|
149 for i in range(int(sys.argv[2])):
|
|
150 print dinuclShuffle(s)
|