Mercurial > repos > xuebing > seq_shuffle_dinucleotide
comparison altschulEriksonDinuclShuffle.py @ 1:f9d894fb823c
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 21:47:59 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:aad0a5d475ce | 1:f9d894fb823c |
---|---|
1 #! /usr/bin/env python | |
2 | |
3 # altschulEriksonDinuclShuffle.py | |
4 # P. Clote, Oct 2003 | |
5 # NOTE: One cannot use function "count(s,word)" to count the number | |
6 # of occurrences of dinucleotide word in string s, since the built-in | |
7 # function counts only nonoverlapping words, presumably in a left to | |
8 # right fashion. | |
9 | |
10 | |
11 import sys,string,random | |
12 | |
13 | |
14 | |
15 def computeCountAndLists(s): | |
16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU | |
17 #since it apparently counts only nonoverlapping words UU | |
18 #For this reason, we work with the indices. | |
19 | |
20 #Initialize lists and mono- and dinucleotide dictionaries | |
21 List = {} #List is a dictionary of lists | |
22 List['A'] = []; List['C'] = []; | |
23 List['G'] = []; List['T'] = []; | |
24 nuclList = ["A","C","G","T"] | |
25 s = s.upper() | |
26 s = s.replace("U","T") | |
27 nuclCnt = {} #empty dictionary | |
28 dinuclCnt = {} #empty dictionary | |
29 for x in nuclList: | |
30 nuclCnt[x]=0 | |
31 dinuclCnt[x]={} | |
32 for y in nuclList: | |
33 dinuclCnt[x][y]=0 | |
34 | |
35 #Compute count and lists | |
36 nuclCnt[s[0]] = 1 | |
37 nuclTotal = 1 | |
38 dinuclTotal = 0 | |
39 for i in range(len(s)-1): | |
40 x = s[i]; y = s[i+1] | |
41 List[x].append( y ) | |
42 nuclCnt[y] += 1; nuclTotal += 1 | |
43 dinuclCnt[x][y] += 1; dinuclTotal += 1 | |
44 assert (nuclTotal==len(s)) | |
45 assert (dinuclTotal==len(s)-1) | |
46 return nuclCnt,dinuclCnt,List | |
47 | |
48 | |
49 def chooseEdge(x,dinuclCnt): | |
50 numInList = 0 | |
51 for y in ['A','C','G','T']: | |
52 numInList += dinuclCnt[x][y] | |
53 z = random.random() | |
54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T'] | |
55 numerator = dinuclCnt[x]['A'] | |
56 if z < float(numerator)/float(denom): | |
57 dinuclCnt[x]['A'] -= 1 | |
58 return 'A' | |
59 numerator += dinuclCnt[x]['C'] | |
60 if z < float(numerator)/float(denom): | |
61 dinuclCnt[x]['C'] -= 1 | |
62 return 'C' | |
63 numerator += dinuclCnt[x]['G'] | |
64 if z < float(numerator)/float(denom): | |
65 dinuclCnt[x]['G'] -= 1 | |
66 return 'G' | |
67 dinuclCnt[x]['T'] -= 1 | |
68 return 'T' | |
69 | |
70 def connectedToLast(edgeList,nuclList,lastCh): | |
71 D = {} | |
72 for x in nuclList: D[x]=0 | |
73 for edge in edgeList: | |
74 a = edge[0]; b = edge[1] | |
75 if b==lastCh: D[a]=1 | |
76 for i in range(2): | |
77 for edge in edgeList: | |
78 a = edge[0]; b = edge[1] | |
79 if D[b]==1: D[a]=1 | |
80 ok = 0 | |
81 for x in nuclList: | |
82 if x!=lastCh and D[x]==0: return 0 | |
83 return 1 | |
84 | |
85 | |
86 | |
87 def eulerian(s): | |
88 nuclCnt,dinuclCnt,List = computeCountAndLists(s) | |
89 #compute nucleotides appearing in s | |
90 nuclList = [] | |
91 for x in ["A","C","G","T"]: | |
92 if x in s: nuclList.append(x) | |
93 #compute numInList[x] = number of dinucleotides beginning with x | |
94 numInList = {} | |
95 for x in nuclList: | |
96 numInList[x]=0 | |
97 for y in nuclList: | |
98 numInList[x] += dinuclCnt[x][y] | |
99 #create dinucleotide shuffle L | |
100 firstCh = s[0] #start with first letter of s | |
101 lastCh = s[-1] | |
102 edgeList = [] | |
103 for x in nuclList: | |
104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) | |
105 ok = connectedToLast(edgeList,nuclList,lastCh) | |
106 return ok,edgeList,nuclList,lastCh | |
107 | |
108 | |
109 def shuffleEdgeList(L): | |
110 n = len(L); barrier = n | |
111 for i in range(n-1): | |
112 z = int(random.random() * barrier) | |
113 tmp = L[z] | |
114 L[z]= L[barrier-1] | |
115 L[barrier-1] = tmp | |
116 barrier -= 1 | |
117 return L | |
118 | |
119 def dinuclShuffle(s): | |
120 ok = 0 | |
121 while not ok: | |
122 ok,edgeList,nuclList,lastCh = eulerian(s) | |
123 nuclCnt,dinuclCnt,List = computeCountAndLists(s) | |
124 | |
125 #remove last edges from each vertex list, shuffle, then add back | |
126 #the removed edges at end of vertex lists. | |
127 for [x,y] in edgeList: List[x].remove(y) | |
128 for x in nuclList: shuffleEdgeList(List[x]) | |
129 for [x,y] in edgeList: List[x].append(y) | |
130 | |
131 #construct the eulerian path | |
132 L = [s[0]]; prevCh = s[0] | |
133 for i in range(len(s)-2): | |
134 ch = List[prevCh][0] | |
135 L.append( ch ) | |
136 del List[prevCh][0] | |
137 prevCh = ch | |
138 L.append(s[-1]) | |
139 t = string.join(L,"") | |
140 return t | |
141 | |
142 | |
143 | |
144 if __name__ == '__main__': | |
145 if len(sys.argv)!=3: | |
146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5" | |
147 sys.exit(1) | |
148 s = sys.argv[1].upper() | |
149 for i in range(int(sys.argv[2])): | |
150 print dinuclShuffle(s) |