annotate mytools/seq2meme.py @ 7:f0dc65e7f6c0

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:59:07 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
1 # -*- coding: iso-8859-1 -*-
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
2 import random,sys,math
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
3
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
4 #import pylab
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
5
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
6 def readMotifFile(filename):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
8 try:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
9 f=open(filename)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
10 except IOError:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
11 print "could not open",filename,"Are you sure this file exists?"
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
12 sys.exit(1)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
13
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
14 seqs=[]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
15 maxL = 0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
16 for line in f:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
17 if '>' in line or 'N' in line:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
18 next
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
19 else:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
20 seq = line.strip().upper()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
21 if maxL < len(seq):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
22 maxL = len(seq)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
23 seqs.append(seq)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
24 f.close()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
25
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
26 print len(seqs),'sequences loaded'
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
27 print 'max seq length:',maxL
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
28 for i in range(len(seqs)):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
29 if len(seqs[i]) < maxL:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
30 del seqs[i]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
31 print len(seqs),'sequences with length = ',maxL
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
32 return seqs
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
33
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
34
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
35 def createWeightMatrix(seqs,psuedocont):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
36
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
37 motifWidth = len(seqs[0])
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
38 weightMatrix = []
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
39 for i in range(motifWidth):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
41
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
43 #YOUR CODE HERE
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
44 for seq in seqs:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
45 for pos in range(motifWidth):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
47
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
49 #Remember the added psuedocounts when you normalize!
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
50 for pos in range(motifWidth):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
51 totalCount = sum(weightMatrix[pos].values())
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
52 for letter in weightMatrix[pos].keys():
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
54
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
55 #Return your weight matrix
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
56 return weightMatrix
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
57
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
59 f = open(filename,'w')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
60 f.write('MEME version 4.4\n\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
61
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
62 f.write('ALPHABET= ACGT\n\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
63
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
64 f.write('strands: + -\n\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
65
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
66 f.write('Background letter frequencies (from:\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
67 f.write(background+'\n\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
68
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
69 f.write('MOTIF '+motifName+'\n\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
70
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
71 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
72 for position in range(len(weightMatrix)):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
73 probsThisPosition=weightMatrix[position]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
74 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
75 f.write('\n\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
76 f.close()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
77
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
78 #get a two decimal-place string representation of a float f
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
79 def twoDecimal(f):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
80 return "%.6f" %(f)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
81
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
82 def run():
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
83
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
84 #Get file name from command line
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
85 if len(sys.argv) < 3:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
86 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
87 sys.exit(1)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
88 else:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
89 motifFile=sys.argv[1] #
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
90 outFile=sys.argv[2]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
91 motifName=sys.argv[3]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
92 psuedocont = float(sys.argv[4])
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
93 background=' '.join(sys.argv[5].strip().split(','))
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
94
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
95 motifs=readMotifFile(motifFile)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
96
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
97 #Create weight matrix
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
98 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
99 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
100 run()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
101