comparison utilities/plotMutModel.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #!/usr/bin/env python
2
3 #
4 # a quick script for comparing mutation models
5 #
6 # python plotMutModel.py -i model1.p [model2.p] [model3.p]... -l legend_label1 [legend_label2] [legend_label3]... -o path/to/pdf_plot_prefix
7 #
8
9 import sys
10 import pickle
11 import bisect
12 import numpy as np
13 import matplotlib.pyplot as mpl
14 import matplotlib.colors as colors
15 import matplotlib.cm as cmx
16 import argparse
17
18 #mpl.rc('text',usetex=True)
19 #mpl.rcParams['text.latex.preamble']=[r"\usepackage{amsmath}"]
20
21 parser = argparse.ArgumentParser(description='Plot and compare mutation models from genMutModel.py Usage: python plotMutModel.py -i model1.p [model2.p] [model3.p]... -l legend_label1 [legend_label2] [legend_label3]... -o path/to/pdf_plot_prefix')
22 parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* mutation_model_1.p [mutation_model_2.p] [mutation_model_3] ...")
23 parser.add_argument('-l', type=str, required=True, metavar='<str>', nargs='+', help="* legend labels: model1_name [model2_name] [model3_name]...")
24 parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output pdf prefix")
25 args = parser.parse_args()
26
27
28
29 def getColor(i,N,colormap='jet'):
30 cm = mpl.get_cmap(colormap)
31 cNorm = colors.Normalize(vmin=0, vmax=N+1)
32 scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
33 colorVal = scalarMap.to_rgba(i)
34 return colorVal
35
36 def isInBed(track,ind):
37 if ind in track:
38 return True
39 elif bisect.bisect(track,ind)%1 == 1:
40 return True
41 else:
42 return False
43
44 def getBedOverlap(track,ind_s,ind_e):
45 if ind_s in track:
46 myInd = track.index(ind_s)
47 return min([track[myInd+1]-ind_s+1,ind_e-ind_s+1])
48 else:
49 myInd = bisect.bisect(track,ind_s)
50 if myInd%1 and myInd < len(track)-1:
51 return min([track[myInd+1]-ind_s+1,ind_e-ind_s+1])
52 return 0
53
54 # a waaaaaaay slower version of the above function ^^
55 #def getTrackOverlap(track1,track2):
56 # otrack = [0 for n in xrange(max(track1+track2)+1)]
57 # for i in xrange(0,len(track1),2):
58 # for j in xrange(track1[i],track1[i+1]+1):
59 # otrack[j] = 1
60 # ocount = 0
61 # for i in xrange(0,len(track2),2):
62 # for j in xrange(track2[i],track2[i+1]+1):
63 # if otrack[j]:
64 # ocount += 1
65 # return ocount
66
67 OUP = args.o
68 LAB = args.l
69 #print LAB
70 INP = args.i
71
72 N_FILES = len(INP)
73
74 mpl.rcParams.update({'font.size': 13, 'font.weight':'bold', 'lines.linewidth': 3})
75
76 #################################################
77 #
78 # BASIC STATS
79 #
80 #################################################
81 mpl.figure(0,figsize=(12,10))
82
83 mpl.subplot(2,2,1)
84 colorInd = 0
85 for fn in INP:
86 myCol = getColor(colorInd,N_FILES)
87 colorInd += 1
88 DATA_DICT = pickle.load( open( fn, "rb" ) )
89 [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']]
90 mpl.bar([colorInd-1],[AVG_MUT_RATE],1.,color=myCol)
91 mpl.xlim([-1,N_FILES+1])
92 mpl.grid()
93 mpl.xticks([],[])
94 mpl.ylabel('Frequency')
95 mpl.title('Overall mutation rate (1/bp)')
96
97 mpl.subplot(2,2,2)
98 colorInd = 0
99 for fn in INP:
100 myCol = getColor(colorInd,N_FILES)
101 colorInd += 1
102 DATA_DICT = pickle.load( open( fn, "rb" ) )
103 [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']]
104 mpl.bar([colorInd-1],[SNP_FREQ],1.,color=myCol)
105 mpl.bar([colorInd-1],[1.-SNP_FREQ],1.,color=myCol,bottom=[SNP_FREQ],hatch='/')
106 mpl.axis([-1,N_FILES+1,0,1.2])
107 mpl.grid()
108 mpl.xticks([],[])
109 mpl.yticks([0,.2,.4,.6,.8,1.],[0,0.2,0.4,0.6,0.8,1.0])
110 mpl.ylabel('Frequency')
111 mpl.title('SNP freq [ ] & indel freq [//]')
112
113 mpl.subplot(2,1,2)
114 colorInd = 0
115 legText = LAB
116 for fn in INP:
117 myCol = getColor(colorInd,N_FILES)
118 colorInd += 1
119 DATA_DICT = pickle.load( open( fn, "rb" ) )
120 [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']]
121 x = sorted(INDEL_FREQ.keys())
122 y = [INDEL_FREQ[n] for n in x]
123 mpl.plot(x,y,color=myCol)
124 #legText.append(fn)
125 mpl.grid()
126 mpl.xlabel('Indel size (bp)', fontweight='bold')
127 mpl.ylabel('Frequency')
128 mpl.title('Indel frequency by size (- deletion, + insertion)')
129 mpl.legend(legText)
130 #mpl.show()
131 mpl.savefig(OUP+'_plot1_mutRates.pdf')
132
133 #################################################
134 #
135 # TRINUC PRIOR PROB
136 #
137 #################################################
138 mpl.figure(1,figsize=(14,6))
139 colorInd = 0
140 legText = LAB
141 for fn in INP:
142 myCol = getColor(colorInd,N_FILES)
143 colorInd += 1
144 DATA_DICT = pickle.load( open( fn, "rb" ) )
145 TRINUC_MUT_PROB = DATA_DICT['TRINUC_MUT_PROB']
146
147 x = range(colorInd-1,len(TRINUC_MUT_PROB)*N_FILES,N_FILES)
148 xt = sorted(TRINUC_MUT_PROB.keys())
149 y = [TRINUC_MUT_PROB[k] for k in xt]
150 markerline, stemlines, baseline = mpl.stem(x,y,'-.')
151 mpl.setp(markerline, 'markerfacecolor', myCol)
152 mpl.setp(markerline, 'markeredgecolor', myCol)
153 mpl.setp(baseline, 'color', myCol, 'linewidth', 0)
154 mpl.setp(stemlines, 'color', myCol, 'linewidth', 3)
155 if colorInd == 1:
156 mpl.xticks(x,xt,rotation=90)
157 #legText.append(fn)
158 mpl.grid()
159 mpl.ylabel('p(trinucleotide mutates)')
160 mpl.legend(legText)
161 #mpl.show()
162 mpl.savefig(OUP+'_plot2_trinucPriors.pdf')
163
164 #################################################
165 #
166 # TRINUC TRANS PROB
167 #
168 #################################################
169 plotNum = 3
170 for fn in INP:
171 fig = mpl.figure(plotNum,figsize=(12,10))
172 DATA_DICT = pickle.load( open( fn, "rb" ) )
173 TRINUC_TRANS_PROBS = DATA_DICT['TRINUC_TRANS_PROBS']
174
175 xt2 = [m[3] for m in sorted([(n[0],n[2],n[1],n) for n in xt])]
176 reverse_dict = {xt2[i]:i for i in xrange(len(xt2))}
177 Z = np.zeros((64,64))
178 L = [['' for n in xrange(64)] for m in xrange(64)]
179 for k in TRINUC_TRANS_PROBS:
180 i = reverse_dict[k[0]]
181 j = reverse_dict[k[1]]
182 Z[i][j] = TRINUC_TRANS_PROBS[k]
183
184 HARDCODED_LABEL = ['A_A','A_C','A_G','A_T',
185 'C_A','C_C','C_G','C_T',
186 'G_A','G_C','G_G','G_T',
187 'T_A','T_C','T_G','T_T']
188
189 for pi in xrange(16):
190 mpl.subplot(4,4,pi+1)
191 Z2 = Z[pi*4:(pi+1)*4,pi*4:(pi+1)*4]
192 X, Y = np.meshgrid( range(0,len(Z2[0])+1), range(0,len(Z2)+1) )
193 im = mpl.pcolormesh(X,Y,Z2[::-1,:],vmin=0.0,vmax=0.5)
194 mpl.axis([0,4,0,4])
195 mpl.xticks([0.5,1.5,2.5,3.5],['A','C','G','T'])
196 mpl.yticks([0.5,1.5,2.5,3.5],['T','G','C','A'])
197 mpl.text(1.6, 1.8, HARDCODED_LABEL[pi], color='white')
198
199 # colorbar haxx
200 fig.subplots_adjust(right=0.8)
201 cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
202 cb = fig.colorbar(im,cax=cbar_ax)
203 cb.set_label(r"p(X$Y_1$Z->X$Y_2$Z | X_Z mutates)")
204
205 #mpl.tight_layout()
206 #mpl.figtext(0.24,0.94,'Trinucleotide Mutation Frequency',size=20)
207 #mpl.show()
208 mpl.savefig(OUP+'_plot'+str(plotNum)+'_trinucTrans.pdf')
209 plotNum += 1
210
211 #################################################
212 #
213 # HIGH MUT REGIONS
214 #
215 #################################################
216 track_byFile_byChr = [{} for n in INP]
217 bp_total_byFile = [0 for n in INP]
218 colorInd = 0
219 for fn in INP:
220 DATA_DICT = pickle.load( open( fn, "rb" ) )
221 HIGH_MUT_REGIONS = DATA_DICT['HIGH_MUT_REGIONS']
222 for region in HIGH_MUT_REGIONS:
223 if region[0] not in track_byFile_byChr[colorInd]:
224 track_byFile_byChr[colorInd][region[0]] = []
225 track_byFile_byChr[colorInd][region[0]].extend([region[1],region[2]])
226 bp_total_byFile[colorInd] += region[2]-region[1]+1
227 colorInd += 1
228
229 bp_overlap_count = [[0 for m in INP] for n in INP]
230 for i in xrange(N_FILES):
231 bp_overlap_count[i][i] = bp_total_byFile[i]
232 for j in xrange(i+1,N_FILES):
233 for k in track_byFile_byChr[i].keys():
234 if k in track_byFile_byChr[j]:
235 for ii in xrange(len(track_byFile_byChr[i][k][::2])):
236 bp_overlap_count[i][j] += getBedOverlap(track_byFile_byChr[j][k],track_byFile_byChr[i][k][ii*2],track_byFile_byChr[i][k][ii*2+1])
237
238 print ''
239 print 'HIGH_MUT_REGION OVERLAP BETWEEN '+str(N_FILES)+' MODELS...'
240 for i in xrange(N_FILES):
241 for j in xrange(i,N_FILES):
242 nDissimilar = (bp_overlap_count[i][i]-bp_overlap_count[i][j]) + (bp_overlap_count[j][j]-bp_overlap_count[i][j])
243 if bp_overlap_count[i][j] == 0:
244 percentageV = 0.0
245 else:
246 percentageV = bp_overlap_count[i][j]/float(bp_overlap_count[i][j]+nDissimilar)
247 print 'overlap['+str(i)+','+str(j)+'] = '+str(bp_overlap_count[i][j])+' bp ({0:.3f}%)'.format(percentageV*100.)
248 print ''
249
250 #################################################
251 #
252 # COMMON VARIANTS
253 #
254 #################################################
255 setofVars = [set([]) for n in INP]
256 colorInd = 0
257 for fn in INP:
258 DATA_DICT = pickle.load( open( fn, "rb" ) )
259 COMMON_VARIANTS = DATA_DICT['COMMON_VARIANTS']
260 for n in COMMON_VARIANTS:
261 setofVars[colorInd].add(n)
262 colorInd += 1
263
264 print ''
265 print 'COMMON_VARIANTS OVERLAP BETWEEN '+str(N_FILES)+' MODELS...'
266 for i in xrange(N_FILES):
267 for j in xrange(i,N_FILES):
268 overlapCount = len(setofVars[i].intersection(setofVars[j]))
269 nDissimilar = (len(setofVars[i])-overlapCount) + (len(setofVars[j])-overlapCount)
270 if overlapCount == 0:
271 percentageV = 0.0
272 else:
273 percentageV = overlapCount/float(overlapCount+nDissimilar)
274 print 'overlap['+str(i)+','+str(j)+'] = '+str(overlapCount)+' variants ({0:.3f}%)'.format(percentageV*100.)
275 print ''
276
277
278