Mercurial > repos > xuebing > sharplab_seq_motif
comparison mytools/fasta-dinucleotide-shuffle.py @ 0:39217fa39ff2
Uploaded
author | xuebing |
---|---|
date | Tue, 13 Mar 2012 23:34:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:39217fa39ff2 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import sys, string, random | |
4 import sequence | |
5 | |
6 # | |
7 # turn on psyco to speed up by 3X | |
8 # | |
9 if __name__=='__main__': | |
10 try: | |
11 import psyco | |
12 #psyco.log() | |
13 psyco.full() | |
14 psyco_found = True | |
15 except ImportError: | |
16 # psyco_found = False | |
17 pass | |
18 # print >> sys.stderr, "psyco_found", psyco_found | |
19 | |
20 | |
21 # altschulEriksonDinuclShuffle.py | |
22 # P. Clote, Oct 2003 | |
23 | |
24 def computeCountAndLists(s): | |
25 | |
26 #Initialize lists and mono- and dinucleotide dictionaries | |
27 List = {} #List is a dictionary of lists | |
28 List['A'] = []; List['C'] = []; | |
29 List['G'] = []; List['T'] = []; | |
30 # FIXME: is this ok? | |
31 List['N'] = [] | |
32 nuclList = ["A","C","G","T","N"] | |
33 s = s.upper() | |
34 #s = s.replace("U","T") | |
35 nuclCnt = {} #empty dictionary | |
36 dinuclCnt = {} #empty dictionary | |
37 for x in nuclList: | |
38 nuclCnt[x]=0 | |
39 dinuclCnt[x]={} | |
40 for y in nuclList: | |
41 dinuclCnt[x][y]=0 | |
42 | |
43 #Compute count and lists | |
44 nuclCnt[s[0]] = 1 | |
45 nuclTotal = 1 | |
46 dinuclTotal = 0 | |
47 for i in range(len(s)-1): | |
48 x = s[i]; y = s[i+1] | |
49 List[x].append( y ) | |
50 nuclCnt[y] += 1; nuclTotal += 1 | |
51 dinuclCnt[x][y] += 1; dinuclTotal += 1 | |
52 assert (nuclTotal==len(s)) | |
53 assert (dinuclTotal==len(s)-1) | |
54 return nuclCnt,dinuclCnt,List | |
55 | |
56 | |
57 def chooseEdge(x,dinuclCnt): | |
58 z = random.random() | |
59 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N'] | |
60 numerator = dinuclCnt[x]['A'] | |
61 if z < float(numerator)/float(denom): | |
62 dinuclCnt[x]['A'] -= 1 | |
63 return 'A' | |
64 numerator += dinuclCnt[x]['C'] | |
65 if z < float(numerator)/float(denom): | |
66 dinuclCnt[x]['C'] -= 1 | |
67 return 'C' | |
68 numerator += dinuclCnt[x]['G'] | |
69 if z < float(numerator)/float(denom): | |
70 dinuclCnt[x]['G'] -= 1 | |
71 return 'G' | |
72 numerator += dinuclCnt[x]['T'] | |
73 if z < float(numerator)/float(denom): | |
74 dinuclCnt[x]['T'] -= 1 | |
75 return 'T' | |
76 dinuclCnt[x]['N'] -= 1 | |
77 return 'N' | |
78 | |
79 def connectedToLast(edgeList,nuclList,lastCh): | |
80 D = {} | |
81 for x in nuclList: D[x]=0 | |
82 for edge in edgeList: | |
83 a = edge[0]; b = edge[1] | |
84 if b==lastCh: D[a]=1 | |
85 for i in range(3): | |
86 for edge in edgeList: | |
87 a = edge[0]; b = edge[1] | |
88 if D[b]==1: D[a]=1 | |
89 ok = 0 | |
90 for x in nuclList: | |
91 if x!=lastCh and D[x]==0: return 0 | |
92 return 1 | |
93 | |
94 def eulerian(s): | |
95 nuclCnt,dinuclCnt,List = computeCountAndLists(s) | |
96 #compute nucleotides appearing in s | |
97 nuclList = [] | |
98 for x in ["A","C","G","T","N"]: | |
99 if x in s: nuclList.append(x) | |
100 #create dinucleotide shuffle L | |
101 firstCh = s[0] #start with first letter of s | |
102 lastCh = s[-1] | |
103 edgeList = [] | |
104 for x in nuclList: | |
105 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) | |
106 ok = connectedToLast(edgeList,nuclList,lastCh) | |
107 return ok,edgeList,nuclList,lastCh | |
108 | |
109 | |
110 def shuffleEdgeList(L): | |
111 n = len(L); barrier = n | |
112 for i in range(n-1): | |
113 z = int(random.random() * barrier) | |
114 tmp = L[z] | |
115 L[z]= L[barrier-1] | |
116 L[barrier-1] = tmp | |
117 barrier -= 1 | |
118 return L | |
119 | |
120 def dinuclShuffle(s): | |
121 ok = 0 | |
122 while not ok: | |
123 ok,edgeList,nuclList,lastCh = eulerian(s) | |
124 nuclCnt,dinuclCnt,List = computeCountAndLists(s) | |
125 | |
126 #remove last edges from each vertex list, shuffle, then add back | |
127 #the removed edges at end of vertex lists. | |
128 for [x,y] in edgeList: List[x].remove(y) | |
129 for x in nuclList: shuffleEdgeList(List[x]) | |
130 for [x,y] in edgeList: List[x].append(y) | |
131 | |
132 #construct the eulerian path | |
133 L = [s[0]]; prevCh = s[0] | |
134 for i in range(len(s)-2): | |
135 ch = List[prevCh][0] | |
136 L.append( ch ) | |
137 del List[prevCh][0] | |
138 prevCh = ch | |
139 L.append(s[-1]) | |
140 t = string.join(L,"") | |
141 return t | |
142 | |
143 def main(): | |
144 | |
145 # | |
146 # defaults | |
147 # | |
148 file_name = None | |
149 seed = 1 | |
150 copies = 1 | |
151 | |
152 # | |
153 # get command line arguments | |
154 # | |
155 usage = """USAGE: | |
156 %s [options] | |
157 | |
158 -f <filename> file name (required) | |
159 -t <tag> added to shuffled sequence names | |
160 -s <seed> random seed; default: %d | |
161 -c <n> make <n> shuffled copies of each sequence; default: %d | |
162 -h print this usage message | |
163 """ % (sys.argv[0], seed, copies) | |
164 | |
165 # no arguments: print usage | |
166 if len(sys.argv) == 1: | |
167 print >> sys.stderr, usage; sys.exit(1) | |
168 | |
169 tag = ""; | |
170 | |
171 # parse command line | |
172 i = 1 | |
173 while i < len(sys.argv): | |
174 arg = sys.argv[i] | |
175 if (arg == "-f"): | |
176 i += 1 | |
177 try: file_name = sys.argv[i] | |
178 except: print >> sys.stderr, usage; sys.exit(1) | |
179 elif (arg == "-t"): | |
180 i += 1 | |
181 try: tag = sys.argv[i] | |
182 except: print >> sys.stderr, usage; sys.exit(1) | |
183 elif (arg == "-s"): | |
184 i += 1 | |
185 try: seed = string.atoi(sys.argv[i]) | |
186 except: print >> sys.stderr, usage; sys.exit(1) | |
187 elif (arg == "-c"): | |
188 i += 1 | |
189 try: copies = string.atoi(sys.argv[i]) | |
190 except: print >> sys.stderr, usage; sys.exit(1) | |
191 elif (arg == "-h"): | |
192 print >> sys.stderr, usage; sys.exit(1) | |
193 else: | |
194 print >> sys.stderr, "Unknown command line argument: " + arg | |
195 sys.exit(1) | |
196 i += 1 | |
197 | |
198 # check that required arguments given | |
199 if (file_name == None): | |
200 print >> sys.stderr, usage; sys.exit(1) | |
201 | |
202 random.seed(seed) | |
203 | |
204 # read sequences | |
205 seqs = sequence.readFASTA(file_name,'Extended DNA') | |
206 | |
207 for s in seqs: | |
208 str = s.getString() | |
209 #FIXME altschul can't handle ambigs | |
210 name = s.getName() | |
211 | |
212 #print >> sys.stderr, ">%s" % name | |
213 | |
214 for i in range(copies): | |
215 | |
216 shuffledSeq = dinuclShuffle(str) | |
217 | |
218 if (copies == 1): | |
219 print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq) | |
220 else: | |
221 print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq) | |
222 | |
223 if __name__ == '__main__': main() |