Mercurial > repos > xuebing > sharplabtool
diff altschulEriksonDinuclShuffle.py @ 14:76e1b1b21cce default tip
Deleted selected files
author | xuebing |
---|---|
date | Tue, 13 Mar 2012 19:05:10 -0400 |
parents | 292186c14b08 |
children |
line wrap: on
line diff
--- a/altschulEriksonDinuclShuffle.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,150 +0,0 @@ -#! /usr/bin/env python - -# altschulEriksonDinuclShuffle.py -# P. Clote, Oct 2003 -# NOTE: One cannot use function "count(s,word)" to count the number -# of occurrences of dinucleotide word in string s, since the built-in -# function counts only nonoverlapping words, presumably in a left to -# right fashion. - - -import sys,string,random - - - -def computeCountAndLists(s): - #WARNING: Use of function count(s,'UU') returns 1 on word UUU - #since it apparently counts only nonoverlapping words UU - #For this reason, we work with the indices. - - #Initialize lists and mono- and dinucleotide dictionaries - List = {} #List is a dictionary of lists - List['A'] = []; List['C'] = []; - List['G'] = []; List['T'] = []; - nuclList = ["A","C","G","T"] - s = s.upper() - s = s.replace("U","T") - nuclCnt = {} #empty dictionary - dinuclCnt = {} #empty dictionary - for x in nuclList: - nuclCnt[x]=0 - dinuclCnt[x]={} - for y in nuclList: - dinuclCnt[x][y]=0 - - #Compute count and lists - nuclCnt[s[0]] = 1 - nuclTotal = 1 - dinuclTotal = 0 - for i in range(len(s)-1): - x = s[i]; y = s[i+1] - List[x].append( y ) - nuclCnt[y] += 1; nuclTotal += 1 - dinuclCnt[x][y] += 1; dinuclTotal += 1 - assert (nuclTotal==len(s)) - assert (dinuclTotal==len(s)-1) - return nuclCnt,dinuclCnt,List - - -def chooseEdge(x,dinuclCnt): - numInList = 0 - for y in ['A','C','G','T']: - numInList += dinuclCnt[x][y] - z = random.random() - denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T'] - numerator = dinuclCnt[x]['A'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['A'] -= 1 - return 'A' - numerator += dinuclCnt[x]['C'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['C'] -= 1 - return 'C' - numerator += dinuclCnt[x]['G'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['G'] -= 1 - return 'G' - dinuclCnt[x]['T'] -= 1 - return 'T' - -def connectedToLast(edgeList,nuclList,lastCh): - D = {} - for x in nuclList: D[x]=0 - for edge in edgeList: - a = edge[0]; b = edge[1] - if b==lastCh: D[a]=1 - for i in range(2): - for edge in edgeList: - a = edge[0]; b = edge[1] - if D[b]==1: D[a]=1 - ok = 0 - for x in nuclList: - if x!=lastCh and D[x]==0: return 0 - return 1 - - - -def eulerian(s): - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - #compute nucleotides appearing in s - nuclList = [] - for x in ["A","C","G","T"]: - if x in s: nuclList.append(x) - #compute numInList[x] = number of dinucleotides beginning with x - numInList = {} - for x in nuclList: - numInList[x]=0 - for y in nuclList: - numInList[x] += dinuclCnt[x][y] - #create dinucleotide shuffle L - firstCh = s[0] #start with first letter of s - lastCh = s[-1] - edgeList = [] - for x in nuclList: - if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) - ok = connectedToLast(edgeList,nuclList,lastCh) - return ok,edgeList,nuclList,lastCh - - -def shuffleEdgeList(L): - n = len(L); barrier = n - for i in range(n-1): - z = int(random.random() * barrier) - tmp = L[z] - L[z]= L[barrier-1] - L[barrier-1] = tmp - barrier -= 1 - return L - -def dinuclShuffle(s): - ok = 0 - while not ok: - ok,edgeList,nuclList,lastCh = eulerian(s) - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - - #remove last edges from each vertex list, shuffle, then add back - #the removed edges at end of vertex lists. - for [x,y] in edgeList: List[x].remove(y) - for x in nuclList: shuffleEdgeList(List[x]) - for [x,y] in edgeList: List[x].append(y) - - #construct the eulerian path - L = [s[0]]; prevCh = s[0] - for i in range(len(s)-2): - ch = List[prevCh][0] - L.append( ch ) - del List[prevCh][0] - prevCh = ch - L.append(s[-1]) - t = string.join(L,"") - return t - - - -if __name__ == '__main__': - if len(sys.argv)!=3: - print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5" - sys.exit(1) - s = sys.argv[1].upper() - for i in range(int(sys.argv[2])): - print dinuclShuffle(s)