annotate mytools/seq2meme.py @ 9:8cec2078632a

Uploaded
author xuebing
date Fri, 16 Mar 2012 14:15:10 -0400
parents 39217fa39ff2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
1 # -*- coding: iso-8859-1 -*-
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
2 import random,sys,math
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
3
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
4 #import pylab
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
5
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
6 def readMotifFile(filename):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
7
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
8 try:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
9 f=open(filename)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
10 except IOError:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
11 print "could not open",filename,"Are you sure this file exists?"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
12 sys.exit(1)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
13
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
14 seqs=[]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
15 maxL = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
16 for line in f:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
17 if '>' in line or 'N' in line:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
18 next
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
19 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
20 seq = line.strip().upper()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
21 if maxL < len(seq):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
22 maxL = len(seq)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
23 seqs.append(seq)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
24 f.close()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
25
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
26 print len(seqs),'sequences loaded'
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
27 print 'max seq length:',maxL
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
28 for i in range(len(seqs)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
29 if len(seqs[i]) < maxL:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
30 del seqs[i]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
31 print len(seqs),'sequences with length = ',maxL
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
32 return seqs
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
33
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
34
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
35 def createWeightMatrix(seqs,psuedocont):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
36
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
37 motifWidth = len(seqs[0])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
38 weightMatrix = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
39 for i in range(motifWidth):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
41
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
43 #YOUR CODE HERE
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
44 for seq in seqs:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
45 for pos in range(motifWidth):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
47
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
49 #Remember the added psuedocounts when you normalize!
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
50 for pos in range(motifWidth):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
51 totalCount = sum(weightMatrix[pos].values())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
52 for letter in weightMatrix[pos].keys():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
54
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
55 #Return your weight matrix
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
56 return weightMatrix
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
57
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
59 f = open(filename,'w')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
60 f.write('MEME version 4.4\n\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
61
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
62 f.write('ALPHABET= ACGT\n\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
63
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
64 f.write('strands: + -\n\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
65
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
66 f.write('Background letter frequencies (from:\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
67 f.write(background+'\n\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
68
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
69 f.write('MOTIF '+motifName+'\n\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
70
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
71 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
72 for position in range(len(weightMatrix)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
73 probsThisPosition=weightMatrix[position]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
74 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
75 f.write('\n\n')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
76 f.close()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
77
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
78 #get a two decimal-place string representation of a float f
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
79 def twoDecimal(f):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
80 return "%.6f" %(f)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
81
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
82 def run():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
83
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
84 #Get file name from command line
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
85 if len(sys.argv) < 3:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
86 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
87 sys.exit(1)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
88 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
89 motifFile=sys.argv[1] #
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
90 outFile=sys.argv[2]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
91 motifName=sys.argv[3]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
92 psuedocont = float(sys.argv[4])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
93 background=' '.join(sys.argv[5].strip().split(','))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
94
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
95 motifs=readMotifFile(motifFile)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
96
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
97 #Create weight matrix
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
98 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
99 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
100 run()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
101