Mercurial > repos > xuebing > sharplabtool
comparison altschulEriksonDinuclShuffle.py @ 14:76e1b1b21cce default tip
Deleted selected files
| author | xuebing |
|---|---|
| date | Tue, 13 Mar 2012 19:05:10 -0400 |
| parents | 292186c14b08 |
| children |
comparison
equal
deleted
inserted
replaced
| 13:292186c14b08 | 14:76e1b1b21cce |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 | |
| 3 # altschulEriksonDinuclShuffle.py | |
| 4 # P. Clote, Oct 2003 | |
| 5 # NOTE: One cannot use function "count(s,word)" to count the number | |
| 6 # of occurrences of dinucleotide word in string s, since the built-in | |
| 7 # function counts only nonoverlapping words, presumably in a left to | |
| 8 # right fashion. | |
| 9 | |
| 10 | |
| 11 import sys,string,random | |
| 12 | |
| 13 | |
| 14 | |
| 15 def computeCountAndLists(s): | |
| 16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU | |
| 17 #since it apparently counts only nonoverlapping words UU | |
| 18 #For this reason, we work with the indices. | |
| 19 | |
| 20 #Initialize lists and mono- and dinucleotide dictionaries | |
| 21 List = {} #List is a dictionary of lists | |
| 22 List['A'] = []; List['C'] = []; | |
| 23 List['G'] = []; List['T'] = []; | |
| 24 nuclList = ["A","C","G","T"] | |
| 25 s = s.upper() | |
| 26 s = s.replace("U","T") | |
| 27 nuclCnt = {} #empty dictionary | |
| 28 dinuclCnt = {} #empty dictionary | |
| 29 for x in nuclList: | |
| 30 nuclCnt[x]=0 | |
| 31 dinuclCnt[x]={} | |
| 32 for y in nuclList: | |
| 33 dinuclCnt[x][y]=0 | |
| 34 | |
| 35 #Compute count and lists | |
| 36 nuclCnt[s[0]] = 1 | |
| 37 nuclTotal = 1 | |
| 38 dinuclTotal = 0 | |
| 39 for i in range(len(s)-1): | |
| 40 x = s[i]; y = s[i+1] | |
| 41 List[x].append( y ) | |
| 42 nuclCnt[y] += 1; nuclTotal += 1 | |
| 43 dinuclCnt[x][y] += 1; dinuclTotal += 1 | |
| 44 assert (nuclTotal==len(s)) | |
| 45 assert (dinuclTotal==len(s)-1) | |
| 46 return nuclCnt,dinuclCnt,List | |
| 47 | |
| 48 | |
| 49 def chooseEdge(x,dinuclCnt): | |
| 50 numInList = 0 | |
| 51 for y in ['A','C','G','T']: | |
| 52 numInList += dinuclCnt[x][y] | |
| 53 z = random.random() | |
| 54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T'] | |
| 55 numerator = dinuclCnt[x]['A'] | |
| 56 if z < float(numerator)/float(denom): | |
| 57 dinuclCnt[x]['A'] -= 1 | |
| 58 return 'A' | |
| 59 numerator += dinuclCnt[x]['C'] | |
| 60 if z < float(numerator)/float(denom): | |
| 61 dinuclCnt[x]['C'] -= 1 | |
| 62 return 'C' | |
| 63 numerator += dinuclCnt[x]['G'] | |
| 64 if z < float(numerator)/float(denom): | |
| 65 dinuclCnt[x]['G'] -= 1 | |
| 66 return 'G' | |
| 67 dinuclCnt[x]['T'] -= 1 | |
| 68 return 'T' | |
| 69 | |
| 70 def connectedToLast(edgeList,nuclList,lastCh): | |
| 71 D = {} | |
| 72 for x in nuclList: D[x]=0 | |
| 73 for edge in edgeList: | |
| 74 a = edge[0]; b = edge[1] | |
| 75 if b==lastCh: D[a]=1 | |
| 76 for i in range(2): | |
| 77 for edge in edgeList: | |
| 78 a = edge[0]; b = edge[1] | |
| 79 if D[b]==1: D[a]=1 | |
| 80 ok = 0 | |
| 81 for x in nuclList: | |
| 82 if x!=lastCh and D[x]==0: return 0 | |
| 83 return 1 | |
| 84 | |
| 85 | |
| 86 | |
| 87 def eulerian(s): | |
| 88 nuclCnt,dinuclCnt,List = computeCountAndLists(s) | |
| 89 #compute nucleotides appearing in s | |
| 90 nuclList = [] | |
| 91 for x in ["A","C","G","T"]: | |
| 92 if x in s: nuclList.append(x) | |
| 93 #compute numInList[x] = number of dinucleotides beginning with x | |
| 94 numInList = {} | |
| 95 for x in nuclList: | |
| 96 numInList[x]=0 | |
| 97 for y in nuclList: | |
| 98 numInList[x] += dinuclCnt[x][y] | |
| 99 #create dinucleotide shuffle L | |
| 100 firstCh = s[0] #start with first letter of s | |
| 101 lastCh = s[-1] | |
| 102 edgeList = [] | |
| 103 for x in nuclList: | |
| 104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) | |
| 105 ok = connectedToLast(edgeList,nuclList,lastCh) | |
| 106 return ok,edgeList,nuclList,lastCh | |
| 107 | |
| 108 | |
| 109 def shuffleEdgeList(L): | |
| 110 n = len(L); barrier = n | |
| 111 for i in range(n-1): | |
| 112 z = int(random.random() * barrier) | |
| 113 tmp = L[z] | |
| 114 L[z]= L[barrier-1] | |
| 115 L[barrier-1] = tmp | |
| 116 barrier -= 1 | |
| 117 return L | |
| 118 | |
| 119 def dinuclShuffle(s): | |
| 120 ok = 0 | |
| 121 while not ok: | |
| 122 ok,edgeList,nuclList,lastCh = eulerian(s) | |
| 123 nuclCnt,dinuclCnt,List = computeCountAndLists(s) | |
| 124 | |
| 125 #remove last edges from each vertex list, shuffle, then add back | |
| 126 #the removed edges at end of vertex lists. | |
| 127 for [x,y] in edgeList: List[x].remove(y) | |
| 128 for x in nuclList: shuffleEdgeList(List[x]) | |
| 129 for [x,y] in edgeList: List[x].append(y) | |
| 130 | |
| 131 #construct the eulerian path | |
| 132 L = [s[0]]; prevCh = s[0] | |
| 133 for i in range(len(s)-2): | |
| 134 ch = List[prevCh][0] | |
| 135 L.append( ch ) | |
| 136 del List[prevCh][0] | |
| 137 prevCh = ch | |
| 138 L.append(s[-1]) | |
| 139 t = string.join(L,"") | |
| 140 return t | |
| 141 | |
| 142 | |
| 143 | |
| 144 if __name__ == '__main__': | |
| 145 if len(sys.argv)!=3: | |
| 146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5" | |
| 147 sys.exit(1) | |
| 148 s = sys.argv[1].upper() | |
| 149 for i in range(int(sys.argv[2])): | |
| 150 print dinuclShuffle(s) |
