annotate altschulEriksonDinuclShuffle.py @ 12:2f4ea569f048

Uploaded
author xuebing
date Sat, 10 Mar 2012 08:10:44 -0500
parents b7f1d9f8f3bc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
1 #! /usr/bin/env python
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
2
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
3 # altschulEriksonDinuclShuffle.py
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
4 # P. Clote, Oct 2003
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
5 # NOTE: One cannot use function "count(s,word)" to count the number
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
6 # of occurrences of dinucleotide word in string s, since the built-in
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
7 # function counts only nonoverlapping words, presumably in a left to
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
8 # right fashion.
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
9
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
10
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
11 import sys,string,random
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
12
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
13
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
14
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
15 def computeCountAndLists(s):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
17 #since it apparently counts only nonoverlapping words UU
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
18 #For this reason, we work with the indices.
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
19
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
20 #Initialize lists and mono- and dinucleotide dictionaries
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
21 List = {} #List is a dictionary of lists
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
22 List['A'] = []; List['C'] = [];
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
23 List['G'] = []; List['T'] = [];
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
24 nuclList = ["A","C","G","T"]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
25 s = s.upper()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
26 s = s.replace("U","T")
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
27 nuclCnt = {} #empty dictionary
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
28 dinuclCnt = {} #empty dictionary
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
29 for x in nuclList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
30 nuclCnt[x]=0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
31 dinuclCnt[x]={}
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
32 for y in nuclList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
33 dinuclCnt[x][y]=0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
34
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
35 #Compute count and lists
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
36 nuclCnt[s[0]] = 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
37 nuclTotal = 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
38 dinuclTotal = 0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
39 for i in range(len(s)-1):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
40 x = s[i]; y = s[i+1]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
41 List[x].append( y )
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
42 nuclCnt[y] += 1; nuclTotal += 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
43 dinuclCnt[x][y] += 1; dinuclTotal += 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
44 assert (nuclTotal==len(s))
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
45 assert (dinuclTotal==len(s)-1)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
46 return nuclCnt,dinuclCnt,List
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
47
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
48
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
49 def chooseEdge(x,dinuclCnt):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
50 numInList = 0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
51 for y in ['A','C','G','T']:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
52 numInList += dinuclCnt[x][y]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
53 z = random.random()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
55 numerator = dinuclCnt[x]['A']
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
56 if z < float(numerator)/float(denom):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
57 dinuclCnt[x]['A'] -= 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
58 return 'A'
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
59 numerator += dinuclCnt[x]['C']
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
60 if z < float(numerator)/float(denom):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
61 dinuclCnt[x]['C'] -= 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
62 return 'C'
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
63 numerator += dinuclCnt[x]['G']
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
64 if z < float(numerator)/float(denom):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
65 dinuclCnt[x]['G'] -= 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
66 return 'G'
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
67 dinuclCnt[x]['T'] -= 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
68 return 'T'
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
69
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
70 def connectedToLast(edgeList,nuclList,lastCh):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
71 D = {}
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
72 for x in nuclList: D[x]=0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
73 for edge in edgeList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
74 a = edge[0]; b = edge[1]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
75 if b==lastCh: D[a]=1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
76 for i in range(2):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
77 for edge in edgeList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
78 a = edge[0]; b = edge[1]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
79 if D[b]==1: D[a]=1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
80 ok = 0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
81 for x in nuclList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
82 if x!=lastCh and D[x]==0: return 0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
83 return 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
84
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
85
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
86
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
87 def eulerian(s):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
88 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
89 #compute nucleotides appearing in s
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
90 nuclList = []
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
91 for x in ["A","C","G","T"]:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
92 if x in s: nuclList.append(x)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
93 #compute numInList[x] = number of dinucleotides beginning with x
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
94 numInList = {}
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
95 for x in nuclList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
96 numInList[x]=0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
97 for y in nuclList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
98 numInList[x] += dinuclCnt[x][y]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
99 #create dinucleotide shuffle L
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
100 firstCh = s[0] #start with first letter of s
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
101 lastCh = s[-1]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
102 edgeList = []
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
103 for x in nuclList:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
105 ok = connectedToLast(edgeList,nuclList,lastCh)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
106 return ok,edgeList,nuclList,lastCh
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
107
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
108
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
109 def shuffleEdgeList(L):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
110 n = len(L); barrier = n
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
111 for i in range(n-1):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
112 z = int(random.random() * barrier)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
113 tmp = L[z]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
114 L[z]= L[barrier-1]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
115 L[barrier-1] = tmp
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
116 barrier -= 1
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
117 return L
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
118
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
119 def dinuclShuffle(s):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
120 ok = 0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
121 while not ok:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
122 ok,edgeList,nuclList,lastCh = eulerian(s)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
123 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
124
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
125 #remove last edges from each vertex list, shuffle, then add back
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
126 #the removed edges at end of vertex lists.
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
127 for [x,y] in edgeList: List[x].remove(y)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
128 for x in nuclList: shuffleEdgeList(List[x])
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
129 for [x,y] in edgeList: List[x].append(y)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
130
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
131 #construct the eulerian path
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
132 L = [s[0]]; prevCh = s[0]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
133 for i in range(len(s)-2):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
134 ch = List[prevCh][0]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
135 L.append( ch )
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
136 del List[prevCh][0]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
137 prevCh = ch
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
138 L.append(s[-1])
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
139 t = string.join(L,"")
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
140 return t
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
141
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
142
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
143
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
144 if __name__ == '__main__':
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
145 if len(sys.argv)!=3:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
147 sys.exit(1)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
148 s = sys.argv[1].upper()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
149 for i in range(int(sys.argv[2])):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
150 print dinuclShuffle(s)