0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import sys, string, random
|
|
4 import sequence
|
|
5
|
|
6 #
|
|
7 # turn on psyco to speed up by 3X
|
|
8 #
|
|
9 if __name__=='__main__':
|
|
10 try:
|
|
11 import psyco
|
|
12 #psyco.log()
|
|
13 psyco.full()
|
|
14 psyco_found = True
|
|
15 except ImportError:
|
|
16 # psyco_found = False
|
|
17 pass
|
|
18 # print >> sys.stderr, "psyco_found", psyco_found
|
|
19
|
|
20
|
|
21 # altschulEriksonDinuclShuffle.py
|
|
22 # P. Clote, Oct 2003
|
|
23
|
|
24 def computeCountAndLists(s):
|
|
25
|
|
26 #Initialize lists and mono- and dinucleotide dictionaries
|
|
27 List = {} #List is a dictionary of lists
|
|
28 List['A'] = []; List['C'] = [];
|
|
29 List['G'] = []; List['T'] = [];
|
|
30 # FIXME: is this ok?
|
|
31 List['N'] = []
|
|
32 nuclList = ["A","C","G","T","N"]
|
|
33 s = s.upper()
|
|
34 #s = s.replace("U","T")
|
|
35 nuclCnt = {} #empty dictionary
|
|
36 dinuclCnt = {} #empty dictionary
|
|
37 for x in nuclList:
|
|
38 nuclCnt[x]=0
|
|
39 dinuclCnt[x]={}
|
|
40 for y in nuclList:
|
|
41 dinuclCnt[x][y]=0
|
|
42
|
|
43 #Compute count and lists
|
|
44 nuclCnt[s[0]] = 1
|
|
45 nuclTotal = 1
|
|
46 dinuclTotal = 0
|
|
47 for i in range(len(s)-1):
|
|
48 x = s[i]; y = s[i+1]
|
|
49 List[x].append( y )
|
|
50 nuclCnt[y] += 1; nuclTotal += 1
|
|
51 dinuclCnt[x][y] += 1; dinuclTotal += 1
|
|
52 assert (nuclTotal==len(s))
|
|
53 assert (dinuclTotal==len(s)-1)
|
|
54 return nuclCnt,dinuclCnt,List
|
|
55
|
|
56
|
|
57 def chooseEdge(x,dinuclCnt):
|
|
58 z = random.random()
|
|
59 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N']
|
|
60 numerator = dinuclCnt[x]['A']
|
|
61 if z < float(numerator)/float(denom):
|
|
62 dinuclCnt[x]['A'] -= 1
|
|
63 return 'A'
|
|
64 numerator += dinuclCnt[x]['C']
|
|
65 if z < float(numerator)/float(denom):
|
|
66 dinuclCnt[x]['C'] -= 1
|
|
67 return 'C'
|
|
68 numerator += dinuclCnt[x]['G']
|
|
69 if z < float(numerator)/float(denom):
|
|
70 dinuclCnt[x]['G'] -= 1
|
|
71 return 'G'
|
|
72 numerator += dinuclCnt[x]['T']
|
|
73 if z < float(numerator)/float(denom):
|
|
74 dinuclCnt[x]['T'] -= 1
|
|
75 return 'T'
|
|
76 dinuclCnt[x]['N'] -= 1
|
|
77 return 'N'
|
|
78
|
|
79 def connectedToLast(edgeList,nuclList,lastCh):
|
|
80 D = {}
|
|
81 for x in nuclList: D[x]=0
|
|
82 for edge in edgeList:
|
|
83 a = edge[0]; b = edge[1]
|
|
84 if b==lastCh: D[a]=1
|
|
85 for i in range(3):
|
|
86 for edge in edgeList:
|
|
87 a = edge[0]; b = edge[1]
|
|
88 if D[b]==1: D[a]=1
|
|
89 ok = 0
|
|
90 for x in nuclList:
|
|
91 if x!=lastCh and D[x]==0: return 0
|
|
92 return 1
|
|
93
|
|
94 def eulerian(s):
|
|
95 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
|
|
96 #compute nucleotides appearing in s
|
|
97 nuclList = []
|
|
98 for x in ["A","C","G","T","N"]:
|
|
99 if x in s: nuclList.append(x)
|
|
100 #create dinucleotide shuffle L
|
|
101 firstCh = s[0] #start with first letter of s
|
|
102 lastCh = s[-1]
|
|
103 edgeList = []
|
|
104 for x in nuclList:
|
|
105 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
|
|
106 ok = connectedToLast(edgeList,nuclList,lastCh)
|
|
107 return ok,edgeList,nuclList,lastCh
|
|
108
|
|
109
|
|
110 def shuffleEdgeList(L):
|
|
111 n = len(L); barrier = n
|
|
112 for i in range(n-1):
|
|
113 z = int(random.random() * barrier)
|
|
114 tmp = L[z]
|
|
115 L[z]= L[barrier-1]
|
|
116 L[barrier-1] = tmp
|
|
117 barrier -= 1
|
|
118 return L
|
|
119
|
|
120 def dinuclShuffle(s):
|
|
121 ok = 0
|
|
122 while not ok:
|
|
123 ok,edgeList,nuclList,lastCh = eulerian(s)
|
|
124 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
|
|
125
|
|
126 #remove last edges from each vertex list, shuffle, then add back
|
|
127 #the removed edges at end of vertex lists.
|
|
128 for [x,y] in edgeList: List[x].remove(y)
|
|
129 for x in nuclList: shuffleEdgeList(List[x])
|
|
130 for [x,y] in edgeList: List[x].append(y)
|
|
131
|
|
132 #construct the eulerian path
|
|
133 L = [s[0]]; prevCh = s[0]
|
|
134 for i in range(len(s)-2):
|
|
135 ch = List[prevCh][0]
|
|
136 L.append( ch )
|
|
137 del List[prevCh][0]
|
|
138 prevCh = ch
|
|
139 L.append(s[-1])
|
|
140 t = string.join(L,"")
|
|
141 return t
|
|
142
|
|
143 def main():
|
|
144
|
|
145 #
|
|
146 # defaults
|
|
147 #
|
|
148 file_name = None
|
|
149 seed = 1
|
|
150 copies = 1
|
|
151
|
|
152 #
|
|
153 # get command line arguments
|
|
154 #
|
|
155 usage = """USAGE:
|
|
156 %s [options]
|
|
157
|
|
158 -f <filename> file name (required)
|
|
159 -t <tag> added to shuffled sequence names
|
|
160 -s <seed> random seed; default: %d
|
|
161 -c <n> make <n> shuffled copies of each sequence; default: %d
|
|
162 -h print this usage message
|
|
163 """ % (sys.argv[0], seed, copies)
|
|
164
|
|
165 # no arguments: print usage
|
|
166 if len(sys.argv) == 1:
|
|
167 print >> sys.stderr, usage; sys.exit(1)
|
|
168
|
|
169 tag = "";
|
|
170
|
|
171 # parse command line
|
|
172 i = 1
|
|
173 while i < len(sys.argv):
|
|
174 arg = sys.argv[i]
|
|
175 if (arg == "-f"):
|
|
176 i += 1
|
|
177 try: file_name = sys.argv[i]
|
|
178 except: print >> sys.stderr, usage; sys.exit(1)
|
|
179 elif (arg == "-t"):
|
|
180 i += 1
|
|
181 try: tag = sys.argv[i]
|
|
182 except: print >> sys.stderr, usage; sys.exit(1)
|
|
183 elif (arg == "-s"):
|
|
184 i += 1
|
|
185 try: seed = string.atoi(sys.argv[i])
|
|
186 except: print >> sys.stderr, usage; sys.exit(1)
|
|
187 elif (arg == "-c"):
|
|
188 i += 1
|
|
189 try: copies = string.atoi(sys.argv[i])
|
|
190 except: print >> sys.stderr, usage; sys.exit(1)
|
|
191 elif (arg == "-h"):
|
|
192 print >> sys.stderr, usage; sys.exit(1)
|
|
193 else:
|
|
194 print >> sys.stderr, "Unknown command line argument: " + arg
|
|
195 sys.exit(1)
|
|
196 i += 1
|
|
197
|
|
198 # check that required arguments given
|
|
199 if (file_name == None):
|
|
200 print >> sys.stderr, usage; sys.exit(1)
|
|
201
|
|
202 random.seed(seed)
|
|
203
|
|
204 # read sequences
|
|
205 seqs = sequence.readFASTA(file_name,'Extended DNA')
|
|
206
|
|
207 for s in seqs:
|
|
208 str = s.getString()
|
|
209 #FIXME altschul can't handle ambigs
|
|
210 name = s.getName()
|
|
211
|
|
212 #print >> sys.stderr, ">%s" % name
|
|
213
|
|
214 for i in range(copies):
|
|
215
|
|
216 shuffledSeq = dinuclShuffle(str)
|
|
217
|
|
218 if (copies == 1):
|
|
219 print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq)
|
|
220 else:
|
|
221 print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq)
|
|
222
|
|
223 if __name__ == '__main__': main()
|