comparison mytools/fasta-dinucleotide-shuffle.py @ 7:f0dc65e7f6c0

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:59:07 -0500
parents
children
comparison
equal deleted inserted replaced
6:094f377db4e5 7:f0dc65e7f6c0
1 #!/usr/bin/python
2
3 import sys, string, random
4 import sequence
5
6 #
7 # turn on psyco to speed up by 3X
8 #
9 if __name__=='__main__':
10 try:
11 import psyco
12 #psyco.log()
13 psyco.full()
14 psyco_found = True
15 except ImportError:
16 # psyco_found = False
17 pass
18 # print >> sys.stderr, "psyco_found", psyco_found
19
20
21 # altschulEriksonDinuclShuffle.py
22 # P. Clote, Oct 2003
23
24 def computeCountAndLists(s):
25
26 #Initialize lists and mono- and dinucleotide dictionaries
27 List = {} #List is a dictionary of lists
28 List['A'] = []; List['C'] = [];
29 List['G'] = []; List['T'] = [];
30 # FIXME: is this ok?
31 List['N'] = []
32 nuclList = ["A","C","G","T","N"]
33 s = s.upper()
34 #s = s.replace("U","T")
35 nuclCnt = {} #empty dictionary
36 dinuclCnt = {} #empty dictionary
37 for x in nuclList:
38 nuclCnt[x]=0
39 dinuclCnt[x]={}
40 for y in nuclList:
41 dinuclCnt[x][y]=0
42
43 #Compute count and lists
44 nuclCnt[s[0]] = 1
45 nuclTotal = 1
46 dinuclTotal = 0
47 for i in range(len(s)-1):
48 x = s[i]; y = s[i+1]
49 List[x].append( y )
50 nuclCnt[y] += 1; nuclTotal += 1
51 dinuclCnt[x][y] += 1; dinuclTotal += 1
52 assert (nuclTotal==len(s))
53 assert (dinuclTotal==len(s)-1)
54 return nuclCnt,dinuclCnt,List
55
56
57 def chooseEdge(x,dinuclCnt):
58 z = random.random()
59 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N']
60 numerator = dinuclCnt[x]['A']
61 if z < float(numerator)/float(denom):
62 dinuclCnt[x]['A'] -= 1
63 return 'A'
64 numerator += dinuclCnt[x]['C']
65 if z < float(numerator)/float(denom):
66 dinuclCnt[x]['C'] -= 1
67 return 'C'
68 numerator += dinuclCnt[x]['G']
69 if z < float(numerator)/float(denom):
70 dinuclCnt[x]['G'] -= 1
71 return 'G'
72 numerator += dinuclCnt[x]['T']
73 if z < float(numerator)/float(denom):
74 dinuclCnt[x]['T'] -= 1
75 return 'T'
76 dinuclCnt[x]['N'] -= 1
77 return 'N'
78
79 def connectedToLast(edgeList,nuclList,lastCh):
80 D = {}
81 for x in nuclList: D[x]=0
82 for edge in edgeList:
83 a = edge[0]; b = edge[1]
84 if b==lastCh: D[a]=1
85 for i in range(3):
86 for edge in edgeList:
87 a = edge[0]; b = edge[1]
88 if D[b]==1: D[a]=1
89 ok = 0
90 for x in nuclList:
91 if x!=lastCh and D[x]==0: return 0
92 return 1
93
94 def eulerian(s):
95 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
96 #compute nucleotides appearing in s
97 nuclList = []
98 for x in ["A","C","G","T","N"]:
99 if x in s: nuclList.append(x)
100 #create dinucleotide shuffle L
101 firstCh = s[0] #start with first letter of s
102 lastCh = s[-1]
103 edgeList = []
104 for x in nuclList:
105 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
106 ok = connectedToLast(edgeList,nuclList,lastCh)
107 return ok,edgeList,nuclList,lastCh
108
109
110 def shuffleEdgeList(L):
111 n = len(L); barrier = n
112 for i in range(n-1):
113 z = int(random.random() * barrier)
114 tmp = L[z]
115 L[z]= L[barrier-1]
116 L[barrier-1] = tmp
117 barrier -= 1
118 return L
119
120 def dinuclShuffle(s):
121 ok = 0
122 while not ok:
123 ok,edgeList,nuclList,lastCh = eulerian(s)
124 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
125
126 #remove last edges from each vertex list, shuffle, then add back
127 #the removed edges at end of vertex lists.
128 for [x,y] in edgeList: List[x].remove(y)
129 for x in nuclList: shuffleEdgeList(List[x])
130 for [x,y] in edgeList: List[x].append(y)
131
132 #construct the eulerian path
133 L = [s[0]]; prevCh = s[0]
134 for i in range(len(s)-2):
135 ch = List[prevCh][0]
136 L.append( ch )
137 del List[prevCh][0]
138 prevCh = ch
139 L.append(s[-1])
140 t = string.join(L,"")
141 return t
142
143 def main():
144
145 #
146 # defaults
147 #
148 file_name = None
149 seed = 1
150 copies = 1
151
152 #
153 # get command line arguments
154 #
155 usage = """USAGE:
156 %s [options]
157
158 -f <filename> file name (required)
159 -t <tag> added to shuffled sequence names
160 -s <seed> random seed; default: %d
161 -c <n> make <n> shuffled copies of each sequence; default: %d
162 -h print this usage message
163 """ % (sys.argv[0], seed, copies)
164
165 # no arguments: print usage
166 if len(sys.argv) == 1:
167 print >> sys.stderr, usage; sys.exit(1)
168
169 tag = "";
170
171 # parse command line
172 i = 1
173 while i < len(sys.argv):
174 arg = sys.argv[i]
175 if (arg == "-f"):
176 i += 1
177 try: file_name = sys.argv[i]
178 except: print >> sys.stderr, usage; sys.exit(1)
179 elif (arg == "-t"):
180 i += 1
181 try: tag = sys.argv[i]
182 except: print >> sys.stderr, usage; sys.exit(1)
183 elif (arg == "-s"):
184 i += 1
185 try: seed = string.atoi(sys.argv[i])
186 except: print >> sys.stderr, usage; sys.exit(1)
187 elif (arg == "-c"):
188 i += 1
189 try: copies = string.atoi(sys.argv[i])
190 except: print >> sys.stderr, usage; sys.exit(1)
191 elif (arg == "-h"):
192 print >> sys.stderr, usage; sys.exit(1)
193 else:
194 print >> sys.stderr, "Unknown command line argument: " + arg
195 sys.exit(1)
196 i += 1
197
198 # check that required arguments given
199 if (file_name == None):
200 print >> sys.stderr, usage; sys.exit(1)
201
202 random.seed(seed)
203
204 # read sequences
205 seqs = sequence.readFASTA(file_name,'Extended DNA')
206
207 for s in seqs:
208 str = s.getString()
209 #FIXME altschul can't handle ambigs
210 name = s.getName()
211
212 #print >> sys.stderr, ">%s" % name
213
214 for i in range(copies):
215
216 shuffledSeq = dinuclShuffle(str)
217
218 if (copies == 1):
219 print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq)
220 else:
221 print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq)
222
223 if __name__ == '__main__': main()