annotate seq2meme.py @ 15:0e221dbd17b2 default tip

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:53:06 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
1 # -*- coding: iso-8859-1 -*-
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
2 import random,sys,math
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
3
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
4 #import pylab
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
5
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
6 def readMotifFile(filename):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
7
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
8 try:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
9 f=open(filename)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
10 except IOError:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
11 print "could not open",filename,"Are you sure this file exists?"
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
12 sys.exit(1)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
13
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
14 seqs=[]
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
15 maxL = 0
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
16 for line in f:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
17 if '>' in line or 'N' in line:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
18 next
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
19 else:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
20 seq = line.strip().upper()
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
21 if maxL < len(seq):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
22 maxL = len(seq)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
23 seqs.append(seq)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
24 f.close()
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
25
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
26 print len(seqs),'sequences loaded'
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
27 print 'max seq length:',maxL
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
28 for i in range(len(seqs)):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
29 if len(seqs[i]) < maxL:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
30 del seqs[i]
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
31 print len(seqs),'sequences with length = ',maxL
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
32 return seqs
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
33
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
34
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
35 def createWeightMatrix(seqs,psuedocont):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
36
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
37 motifWidth = len(seqs[0])
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
38 weightMatrix = []
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
39 for i in range(motifWidth):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
41
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
43 #YOUR CODE HERE
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
44 for seq in seqs:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
45 for pos in range(motifWidth):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
47
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
49 #Remember the added psuedocounts when you normalize!
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
50 for pos in range(motifWidth):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
51 totalCount = sum(weightMatrix[pos].values())
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
52 for letter in weightMatrix[pos].keys():
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
54
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
55 #Return your weight matrix
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
56 return weightMatrix
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
57
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
59 f = open(filename,'w')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
60 f.write('MEME version 4.4\n\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
61
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
62 f.write('ALPHABET= ACGT\n\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
63
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
64 f.write('strands: + -\n\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
65
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
66 f.write('Background letter frequencies (from:\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
67 f.write(background+'\n\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
68
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
69 f.write('MOTIF '+motifName+'\n\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
70
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
71 f.write('BL MOTIF '+motifName+' width='+str(len(weightMatrix))+' seqs='+str(nSite)+'\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
72 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
73 for position in range(len(weightMatrix)):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
74 probsThisPosition=weightMatrix[position]
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
75 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
76 f.write('\n\n')
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
77 f.close()
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
78
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
79 #get a two decimal-place string representation of a float f
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
80 def twoDecimal(f):
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
81 return "%.6f" %(f)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
82
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
83 def run():
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
84
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
85 #Get file name from command line
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
86 if len(sys.argv) < 3:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
87 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
88 sys.exit(1)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
89 else:
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
90 motifFile=sys.argv[1] #
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
91 outFile=sys.argv[2]
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
92 motifName=sys.argv[3]
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
93 psuedocont = float(sys.argv[4])
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
94 background=' '.join(sys.argv[5].strip().split(','))
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
95
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
96 motifs=readMotifFile(motifFile)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
97
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
98 #Create weight matrix
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
99 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
100 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
101 run()
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
102