annotate tools/mytools/fasta-dinucleotide-shuffle.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import sys, string, random
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import sequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Optional acceleration: when this file is run as a script, switch on the
# psyco specializing compiler if it is installed (the original note claims
# roughly a 3X speed-up).  psyco is not required.
if __name__ == '__main__':
    try:
        import psyco
        psyco.full()
        psyco_found = True
    except ImportError:
        # psyco is not available; continue with the plain interpreter.
        pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 # altschulEriksonDinuclShuffle.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 # P. Clote, Oct 2003
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def computeCountAndLists(s):
    """Tabulate mono-/dinucleotide counts and successor lists of a sequence.

    The sequence is upper-cased first.  Only the symbols A, C, G, T and N
    have table entries; any other symbol is unsupported and raises KeyError
    as soon as it takes part in a dinucleotide.

    Arguments:
      s -- the nucleotide sequence (string); may be empty.

    Returns a tuple (nuclCnt, dinuclCnt, List) where
      nuclCnt[x]      -- number of occurrences of x in s,
      dinuclCnt[x][y] -- number of positions where x is followed by y,
      List[x]         -- the successors of x, in order of appearance.
    """
    nuclList = ["A", "C", "G", "T", "N"]
    s = s.upper()

    # Initialize lists and mono- and dinucleotide dictionaries.
    List = {}        # List is a dictionary of lists
    nuclCnt = {}
    dinuclCnt = {}
    for x in nuclList:
        List[x] = []
        nuclCnt[x] = 0
        dinuclCnt[x] = dict((y, 0) for y in nuclList)

    # An empty sequence has no first symbol to count; return empty tables
    # instead of failing on s[0].
    if not s:
        return nuclCnt, dinuclCnt, List

    # The first symbol is never the second half of a pair, so count it here;
    # every other symbol is counted as the successor of its predecessor.
    nuclCnt[s[0]] = 1
    for x, y in zip(s, s[1:]):
        List[x].append(y)
        nuclCnt[y] += 1
        dinuclCnt[x][y] += 1

    return nuclCnt, dinuclCnt, List
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def chooseEdge(x,dinuclCnt):
    """Draw a random successor of x, weighted by the remaining counts.

    One random number is drawn and compared against the cumulative
    fractions of dinuclCnt[x] taken in the order A, C, G, T; if none of
    them exceeds the draw, 'N' is chosen.  The count of the chosen
    dinucleotide is decremented in place.

    Arguments:
      x         -- the nucleotide whose outgoing edge is being chosen.
      dinuclCnt -- nested dictionary of dinucleotide counts (modified).

    Returns the chosen successor ('A', 'C', 'G', 'T' or 'N').
    """
    z = random.random()
    row = dinuclCnt[x]
    denom = row['A'] + row['C'] + row['G'] + row['T'] + row['N']
    numerator = 0
    for y in ('A', 'C', 'G', 'T'):
        numerator += row[y]
        if z < float(numerator)/float(denom):
            row[y] -= 1
            return y
    # Nothing matched in A, C, G, T: the draw fell into N's share.
    row['N'] -= 1
    return 'N'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def connectedToLast(edgeList,nuclList,lastCh):
    """Check that every nucleotide can reach lastCh along the given edges.

    Arguments:
      edgeList -- sequence of edges; edge[0] is the source nucleotide and
                  edge[1] the target nucleotide.
      nuclList -- the nucleotides that must be connected.
      lastCh   -- the nucleotide every other one has to reach.

    Returns 1 if each nucleotide in nuclList other than lastCh reaches
    lastCh by following edges, otherwise 0.
    """
    D = {}
    for x in nuclList:
        D[x] = 0

    # Nucleotides with a direct edge to lastCh.
    for edge in edgeList:
        if edge[1] == lastCh:
            D[edge[0]] = 1

    # Propagate reachability backwards along the edges until nothing
    # changes.  Iterating to a fixed point (instead of a fixed number of
    # passes) keeps the result correct for any number of nucleotides.
    changed = True
    while changed:
        changed = False
        for edge in edgeList:
            a = edge[0]; b = edge[1]
            if D[b] == 1 and D.get(a) != 1:
                D[a] = 1
                changed = True

    for x in nuclList:
        if x != lastCh and D[x] == 0:
            return 0
    return 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def eulerian(s):
    """Randomly pick one 'last edge' per nucleotide and test connectivity.

    For every nucleotide that occurs in s, except the final character of
    s, one outgoing edge is drawn with chooseEdge().  connectedToLast()
    then reports whether these edges lead every such nucleotide to the
    final character.

    The sequence is upper-cased before use so that the membership test
    agrees with the upper-cased counts from computeCountAndLists().

    Arguments:
      s -- the nucleotide sequence (string); must be non-empty because its
           last character is read.

    Returns a tuple (ok, edgeList, nuclList, lastCh):
      ok       -- 1 if the chosen edges are connected to lastCh, else 0,
      edgeList -- list of [nucleotide, chosen successor] pairs,
      nuclList -- nucleotides appearing in s, in the order A, C, G, T, N,
      lastCh   -- last character of the (upper-cased) sequence.
    """
    s = s.upper()
    dinuclCnt = computeCountAndLists(s)[1]

    # compute nucleotides appearing in s
    nuclList = [x for x in ["A", "C", "G", "T", "N"] if x in s]

    # choose the last edge leaving each vertex other than the final one
    lastCh = s[-1]
    edgeList = []
    for x in nuclList:
        if x != lastCh:
            edgeList.append([x, chooseEdge(x, dinuclCnt)])

    ok = connectedToLast(edgeList, nuclList, lastCh)
    return ok,edgeList,nuclList,lastCh
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def shuffleEdgeList(L):
    """Shuffle the list L in place and return it.

    Works from the end of the list towards the front: at each step a
    random position below the barrier is swapped with the element just
    before the barrier, and the barrier moves down by one.  One random
    number is drawn per step, len(L)-1 steps in total.
    """
    barrier = len(L)
    while barrier > 1:
        z = int(random.random() * barrier)
        last = barrier - 1
        L[z], L[last] = L[last], L[z]
        barrier = last
    return L
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def dinuclShuffle(s):
    """Return a random shuffle of s that keeps its dinucleotide counts.

    The sequence is upper-cased first, matching what
    computeCountAndLists() does internally, so lower- and mixed-case
    input is handled consistently.  Sequences shorter than two
    characters have nothing to shuffle and are returned as they are
    (upper-cased).  The result starts and ends with the same characters
    as the input.

    Arguments:
      s -- the nucleotide sequence (string) over A, C, G, T, N.

    Returns the shuffled sequence as a string of the same length.
    """
    s = s.upper()
    if len(s) < 2:
        # Without this guard a 1-character sequence would come back as two
        # characters (first + last) and an empty one would fail.
        return s

    # Draw candidate sets of last edges until they connect to the final
    # character.
    ok = 0
    while not ok:
        ok,edgeList,nuclList,lastCh = eulerian(s)
    nuclCnt,dinuclCnt,List = computeCountAndLists(s)

    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for x, y in edgeList:
        List[x].remove(y)
    for x in nuclList:
        shuffleEdgeList(List[x])
    for x, y in edgeList:
        List[x].append(y)

    # construct the eulerian path; a per-vertex cursor replaces deleting
    # the head of the list, which would cost time proportional to the
    # list length on every step.
    cursor = dict((x, 0) for x in List)
    L = [s[0]]
    prevCh = s[0]
    for i in range(len(s)-2):
        ch = List[prevCh][cursor[prevCh]]
        cursor[prevCh] += 1
        L.append(ch)
        prevCh = ch
    L.append(s[-1])
    return "".join(L)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def main():
    """Command-line entry point.

    Parses the options -f, -t, -s, -c and -h from sys.argv, seeds the
    random number generator, reads the FASTA file with
    sequence.readFASTA() and writes the requested number of
    dinucleotide-shuffled copies of every sequence to standard output in
    FASTA format.

    On a usage error (no arguments, missing or non-integer option value,
    unknown option, missing -f) and for -h, a message is written to
    standard error and the process exits with status 1.
    """
    #
    # defaults
    #
    file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
%s [options]

-f <filename> file name (required)
-t <tag> added to shuffled sequence names
-s <seed> random seed; default: %d
-c <n> make <n> shuffled copies of each sequence; default: %d
-h print this usage message
""" % (sys.argv[0], seed, copies)

    def usage_exit():
        # Print the usage text and stop with a non-zero status.
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-f", "-t", "-s", "-c"):
            # every one of these options takes exactly one value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-f":
                file_name = value
            elif arg == "-t":
                tag = value
            else:
                try:
                    number = int(value)
                except ValueError:
                    usage_exit()
                if arg == "-s":
                    seed = number
                else:
                    copies = number
        elif arg == "-h":
            usage_exit()
        else:
            sys.stderr.write("Unknown command line argument: " + arg + "\n")
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name,'Extended DNA')

    for s in seqs:
        seq_string = s.getString()
        #FIXME altschul can't handle ambigs
        name = s.getName()

        for i in range(copies):
            shuffledSeq = dinuclShuffle(seq_string)

            # a copy index is appended to the name only when several
            # copies are requested
            if copies == 1:
                sys.stdout.write(">%s\n%s\n" % (name+tag, shuffledSeq))
            else:
                sys.stdout.write(">%s_%d\n%s\n" % (name+tag, i, shuffledSeq))


if __name__ == '__main__':
    main()