annotate mytools/fasta-dinucleotide-shuffle.py @ 9:8cec2078632a

Uploaded
author xuebing
date Fri, 16 Mar 2012 14:15:10 -0400
parents 39217fa39ff2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/python
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
2
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
3 import sys, string, random
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
4 import sequence
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
5
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
6 #
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
7 # turn on psyco to speed up by 3X
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
8 #
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
if __name__ == '__main__':
    # psyco is an optional JIT accelerator; the script works without it, so a
    # missing module is not an error.  The flag is set on both paths so that
    # any diagnostic that reads it never hits an undefined name.
    try:
        import psyco
        psyco.full()
        psyco_found = True
    except ImportError:
        psyco_found = False
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
19
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
20
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
21 # altschulEriksonDinuclShuffle.py
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
22 # P. Clote, Oct 2003
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
23
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def computeCountAndLists(s):
    """Tabulate nucleotide statistics for the sequence *s*.

    The sequence is upper-cased before counting.  Three tables keyed by the
    symbols A, C, G, T and N are built:

    * ``nuclCnt[x]``      -- number of occurrences of ``x``
    * ``dinuclCnt[x][y]`` -- number of times ``y`` immediately follows ``x``
    * ``List[x]``         -- the symbols that follow ``x``, in sequence order

    Returns the tuple ``(nuclCnt, dinuclCnt, List)``.  An empty sequence
    yields all-zero counts and empty successor lists instead of raising
    IndexError.  In a sequence of two or more characters, any symbol outside
    A/C/G/T/N raises KeyError.
    """
    # FIXME: is it ok to treat N like an ordinary nucleotide?
    nuclList = ["A", "C", "G", "T", "N"]
    s = s.upper()

    List = dict((x, []) for x in nuclList)
    nuclCnt = dict((x, 0) for x in nuclList)
    dinuclCnt = dict((x, dict((y, 0) for y in nuclList)) for x in nuclList)

    if not s:
        return nuclCnt, dinuclCnt, List

    # The first symbol has no predecessor, so it is counted up front; every
    # later symbol is counted as the second member of a dinucleotide.
    nuclCnt[s[0]] = 1
    for x, y in zip(s, s[1:]):
        List[x].append(y)
        nuclCnt[y] += 1
        dinuclCnt[x][y] += 1
    return nuclCnt, dinuclCnt, List
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
55
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
56
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def chooseEdge(x, dinuclCnt):
    """Randomly pick a successor of *x*, weighted by remaining edge counts.

    ``dinuclCnt[x]`` maps each of A, C, G, T, N to the number of unused
    ``x -> symbol`` edges.  One symbol is drawn with probability proportional
    to its count, its count is decremented in place, and the symbol is
    returned.  Exactly one call to ``random.random()`` is made.

    Raises ZeroDivisionError if *x* has no remaining outgoing edges.
    """
    counts = dinuclCnt[x]
    z = random.random()
    denom = (counts['A'] + counts['C'] + counts['G']
             + counts['T'] + counts['N'])
    if denom == 0:
        raise ZeroDivisionError(
            "no outgoing dinucleotide edges left for %r" % (x,))

    # Walk the cumulative distribution in the fixed order A, C, G, T; N takes
    # whatever probability mass is left.
    numerator = 0
    for y in ('A', 'C', 'G', 'T'):
        numerator += counts[y]
        if z < float(numerator) / float(denom):
            counts[y] -= 1
            return y
    counts['N'] -= 1
    return 'N'
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
78
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def connectedToLast(edgeList, nuclList, lastCh):
    """Return 1 if every symbol in *nuclList* can reach *lastCh*, else 0.

    *edgeList* is a sequence of ``[a, b]`` pairs, each a directed edge
    ``a -> b``.  A symbol reaches *lastCh* when following edges from it
    eventually arrives at *lastCh*; *lastCh* itself always qualifies.

    Reachability is propagated until nothing changes, so the answer does not
    depend on the order of *edgeList* or on the alphabet size.
    """
    reaches_last = set([lastCh])
    changed = True
    while changed:
        changed = False
        for edge in edgeList:
            a = edge[0]
            b = edge[1]
            if b in reaches_last and a not in reaches_last:
                reaches_last.add(a)
                changed = True

    for x in nuclList:
        if x not in reaches_last:
            return 0
    return 1
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
93
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def eulerian(s):
    """Draw one random "last edge" per vertex and test the resulting graph.

    For every symbol of A, C, G, T, N that occurs in *s* (other than the
    final symbol of *s*) one outgoing edge is chosen with ``chooseEdge``.
    ``connectedToLast`` then checks whether those edges lead every vertex to
    the final symbol.

    The sequence is upper-cased first so that the membership test and the
    final symbol agree with ``computeCountAndLists``, which also upper-cases.

    Returns ``(ok, edgeList, nuclList, lastCh)`` where ``ok`` is the 0/1
    result of ``connectedToLast``.  Raises IndexError for an empty sequence.
    """
    s = s.upper()
    dinuclCnt = computeCountAndLists(s)[1]

    # vertices of the graph: the nucleotides that actually appear in s
    nuclList = [x for x in ["A", "C", "G", "T", "N"] if x in s]

    lastCh = s[-1]
    edgeList = [[x, chooseEdge(x, dinuclCnt)] for x in nuclList if x != lastCh]
    ok = connectedToLast(edgeList, nuclList, lastCh)
    return ok, edgeList, nuclList, lastCh
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
108
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
109
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def shuffleEdgeList(L):
    """Shuffle the list *L* in place and return it.

    A slot index ``barrier`` moves from the end of the list towards the
    front; at each step a random element from the first ``barrier`` entries
    is swapped into the last of those slots.  ``random.random()`` is called
    ``len(L) - 1`` times, or not at all for lists shorter than two.
    """
    for barrier in range(len(L), 1, -1):
        z = int(random.random() * barrier)
        L[z], L[barrier - 1] = L[barrier - 1], L[z]
    return L
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
119
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def dinuclShuffle(s):
    """Return a random permutation of *s* with the same dinucleotide counts.

    The sequence is upper-cased, so the result is always upper case.  The
    first and last symbols of the result equal those of the input.
    Sequences shorter than two symbols have nothing to shuffle and are
    returned as-is (upper-cased); this also avoids emitting a one-symbol
    sequence twice over.
    """
    s = s.upper()
    if len(s) < 2:
        return s

    # keep drawing "last edge" sets until one connects every vertex to the
    # final symbol
    ok = 0
    while not ok:
        ok, edgeList, nuclList, lastCh = eulerian(s)
    List = computeCountAndLists(s)[2]

    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for x, y in edgeList:
        List[x].remove(y)
    for x in nuclList:
        shuffleEdgeList(List[x])
    for x, y in edgeList:
        List[x].append(y)

    # construct the eulerian path; iterators consume each successor list
    # front to back without the O(n) cost of deleting element 0 every step
    successors = dict((x, iter(List[x])) for x in List)
    path = [s[0]]
    prevCh = s[0]
    for _ in range(len(s) - 2):
        prevCh = next(successors[prevCh])
        path.append(prevCh)
    path.append(s[-1])
    return "".join(path)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
142
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
def main():
    """Command-line entry point: dinucleotide-shuffle every FASTA record.

    Parses ``-f``, ``-t``, ``-s``, ``-c`` and ``-h`` from ``sys.argv``, seeds
    the ``random`` module, reads the sequences with ``sequence.readFASTA``
    and writes each shuffled copy to stdout in FASTA format.  Usage errors,
    and ``-h``, print the usage text to stderr and exit with status 1.
    """
    #
    # defaults
    #
    file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -h              print this usage message
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Print the usage text and terminate with a failure status.
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    argv = sys.argv
    i = 1
    while i < len(argv):
        arg = argv[i]
        if arg in ("-f", "-t", "-s", "-c"):
            # every one of these options consumes the following argument
            i += 1
            if i >= len(argv):
                usage_exit()
            value = argv[i]
            if arg == "-f":
                file_name = value
            elif arg == "-t":
                tag = value
            else:
                try:
                    number = int(value)
                except ValueError:
                    usage_exit()
                if arg == "-s":
                    seed = number
                else:
                    copies = number
        elif arg == "-h":
            usage_exit()
        else:
            sys.stderr.write("Unknown command line argument: " + arg + "\n")
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name, 'Extended DNA')

    for s in seqs:
        seq_str = s.getString()
        # FIXME altschul can't handle ambigs
        name = s.getName()

        for i in range(copies):
            shuffledSeq = dinuclShuffle(seq_str)

            # a copy index is appended to the name only when several copies
            # of each sequence are requested
            if copies == 1:
                sys.stdout.write(">%s\n%s\n" % (name + tag, shuffledSeq))
            else:
                sys.stdout.write(">%s_%d\n%s\n" % (name + tag, i, shuffledSeq))


if __name__ == '__main__':
    main()