Mercurial > repos > xuebing > sharplabtool
diff fasta-dinucleotide-shuffle.py @ 14:76e1b1b21cce default tip
Deleted selected files
author | xuebing |
---|---|
date | Tue, 13 Mar 2012 19:05:10 -0400 |
parents | 292186c14b08 |
children |
line wrap: on
line diff
--- a/fasta-dinucleotide-shuffle.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/python - -import sys, string, random -import sequence - -# -# turn on psyco to speed up by 3X -# -if __name__=='__main__': - try: - import psyco - #psyco.log() - psyco.full() - psyco_found = True - except ImportError: -# psyco_found = False - pass -# print >> sys.stderr, "psyco_found", psyco_found - - -# altschulEriksonDinuclShuffle.py -# P. Clote, Oct 2003 - -def computeCountAndLists(s): - - #Initialize lists and mono- and dinucleotide dictionaries - List = {} #List is a dictionary of lists - List['A'] = []; List['C'] = []; - List['G'] = []; List['T'] = []; - # FIXME: is this ok? - List['N'] = [] - nuclList = ["A","C","G","T","N"] - s = s.upper() - #s = s.replace("U","T") - nuclCnt = {} #empty dictionary - dinuclCnt = {} #empty dictionary - for x in nuclList: - nuclCnt[x]=0 - dinuclCnt[x]={} - for y in nuclList: - dinuclCnt[x][y]=0 - - #Compute count and lists - nuclCnt[s[0]] = 1 - nuclTotal = 1 - dinuclTotal = 0 - for i in range(len(s)-1): - x = s[i]; y = s[i+1] - List[x].append( y ) - nuclCnt[y] += 1; nuclTotal += 1 - dinuclCnt[x][y] += 1; dinuclTotal += 1 - assert (nuclTotal==len(s)) - assert (dinuclTotal==len(s)-1) - return nuclCnt,dinuclCnt,List - - -def chooseEdge(x,dinuclCnt): - z = random.random() - denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N'] - numerator = dinuclCnt[x]['A'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['A'] -= 1 - return 'A' - numerator += dinuclCnt[x]['C'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['C'] -= 1 - return 'C' - numerator += dinuclCnt[x]['G'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['G'] -= 1 - return 'G' - numerator += dinuclCnt[x]['T'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['T'] -= 1 - return 'T' - dinuclCnt[x]['N'] -= 1 - return 'N' - -def connectedToLast(edgeList,nuclList,lastCh): - D = {} - for x in nuclList: D[x]=0 - for edge in edgeList: - a = edge[0]; b = edge[1] - if b==lastCh: D[a]=1 - for i in range(3): - for edge in edgeList: - a = edge[0]; b = edge[1] - if D[b]==1: D[a]=1 - ok = 0 - for x in nuclList: - if x!=lastCh and D[x]==0: return 0 - return 1 - -def eulerian(s): - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - #compute nucleotides appearing in s - nuclList = [] - for x in ["A","C","G","T","N"]: - if x in s: nuclList.append(x) - #create dinucleotide shuffle L - firstCh = s[0] #start with first letter of s - lastCh = s[-1] - edgeList = [] - for x in nuclList: - if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) - ok = connectedToLast(edgeList,nuclList,lastCh) - return ok,edgeList,nuclList,lastCh - - -def shuffleEdgeList(L): - n = len(L); barrier = n - for i in range(n-1): - z = int(random.random() * barrier) - tmp = L[z] - L[z]= L[barrier-1] - L[barrier-1] = tmp - barrier -= 1 - return L - -def dinuclShuffle(s): - ok = 0 - while not ok: - ok,edgeList,nuclList,lastCh = eulerian(s) - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - - #remove last edges from each vertex list, shuffle, then add back - #the removed edges at end of vertex lists. - for [x,y] in edgeList: List[x].remove(y) - for x in nuclList: shuffleEdgeList(List[x]) - for [x,y] in edgeList: List[x].append(y) - - #construct the eulerian path - L = [s[0]]; prevCh = s[0] - for i in range(len(s)-2): - ch = List[prevCh][0] - L.append( ch ) - del List[prevCh][0] - prevCh = ch - L.append(s[-1]) - t = string.join(L,"") - return t - -def main(): - - # - # defaults - # - file_name = None - seed = 1 - copies = 1 - - # - # get command line arguments - # - usage = """USAGE: - %s [options] - - -f <filename> file name (required) - -t <tag> added to shuffled sequence names - -s <seed> random seed; default: %d - -c <n> make <n> shuffled copies of each sequence; default: %d - -h print this usage message - """ % (sys.argv[0], seed, copies) - - # no arguments: print usage - if len(sys.argv) == 1: - print >> sys.stderr, usage; sys.exit(1) - - tag = ""; - - # parse command line - i = 1 - while i < len(sys.argv): - arg = sys.argv[i] - if (arg == "-f"): - i += 1 - try: file_name = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-t"): - i += 1 - try: tag = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-s"): - i += 1 - try: seed = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-c"): - i += 1 - try: copies = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-h"): - print >> sys.stderr, usage; sys.exit(1) - else: - print >> sys.stderr, "Unknown command line argument: " + arg - sys.exit(1) - i += 1 - - # check that required arguments given - if (file_name == None): - print >> sys.stderr, usage; sys.exit(1) - - random.seed(seed) - - # read sequences - seqs = sequence.readFASTA(file_name,'Extended DNA') - - for s in seqs: - str = s.getString() - #FIXME altschul can't handle ambigs - name = s.getName() - - #print >> sys.stderr, ">%s" % name - - for i in range(copies): - - shuffledSeq = dinuclShuffle(str) - - if (copies == 1): - print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq) - else: - print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq) - -if __name__ == '__main__': main()