Mercurial > repos > thondeboer > neat_genreads
comparison utilities/plotMutModel.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # | |
| 4 # a quick script for comparing mutation models | |
| 5 # | |
| 6 # python plotMutModel.py -i model1.p [model2.p] [model3.p]... -l legend_label1 [legend_label2] [legend_label3]... -o path/to/pdf_plot_prefix | |
| 7 # | |
| 8 | |
| 9 import sys | |
| 10 import pickle | |
| 11 import bisect | |
| 12 import numpy as np | |
| 13 import matplotlib.pyplot as mpl | |
| 14 import matplotlib.colors as colors | |
| 15 import matplotlib.cm as cmx | |
| 16 import argparse | |
| 17 | |
| 18 #mpl.rc('text',usetex=True) | |
| 19 #mpl.rcParams['text.latex.preamble']=[r"\usepackage{amsmath}"] | |
| 20 | |
| 21 parser = argparse.ArgumentParser(description='Plot and compare mutation models from genMutModel.py Usage: python plotMutModel.py -i model1.p [model2.p] [model3.p]... -l legend_label1 [legend_label2] [legend_label3]... -o path/to/pdf_plot_prefix') | |
| 22 parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* mutation_model_1.p [mutation_model_2.p] [mutation_model_3] ...") | |
| 23 parser.add_argument('-l', type=str, required=True, metavar='<str>', nargs='+', help="* legend labels: model1_name [model2_name] [model3_name]...") | |
| 24 parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output pdf prefix") | |
| 25 args = parser.parse_args() | |
| 26 | |
| 27 | |
| 28 | |
| 29 def getColor(i,N,colormap='jet'): | |
| 30 cm = mpl.get_cmap(colormap) | |
| 31 cNorm = colors.Normalize(vmin=0, vmax=N+1) | |
| 32 scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm) | |
| 33 colorVal = scalarMap.to_rgba(i) | |
| 34 return colorVal | |
| 35 | |
| 36 def isInBed(track,ind): | |
| 37 if ind in track: | |
| 38 return True | |
| 39 elif bisect.bisect(track,ind)%1 == 1: | |
| 40 return True | |
| 41 else: | |
| 42 return False | |
| 43 | |
| 44 def getBedOverlap(track,ind_s,ind_e): | |
| 45 if ind_s in track: | |
| 46 myInd = track.index(ind_s) | |
| 47 return min([track[myInd+1]-ind_s+1,ind_e-ind_s+1]) | |
| 48 else: | |
| 49 myInd = bisect.bisect(track,ind_s) | |
| 50 if myInd%1 and myInd < len(track)-1: | |
| 51 return min([track[myInd+1]-ind_s+1,ind_e-ind_s+1]) | |
| 52 return 0 | |
| 53 | |
| 54 # a waaaaaaay slower version of the above function ^^ | |
| 55 #def getTrackOverlap(track1,track2): | |
| 56 # otrack = [0 for n in xrange(max(track1+track2)+1)] | |
| 57 # for i in xrange(0,len(track1),2): | |
| 58 # for j in xrange(track1[i],track1[i+1]+1): | |
| 59 # otrack[j] = 1 | |
| 60 # ocount = 0 | |
| 61 # for i in xrange(0,len(track2),2): | |
| 62 # for j in xrange(track2[i],track2[i+1]+1): | |
| 63 # if otrack[j]: | |
| 64 # ocount += 1 | |
| 65 # return ocount | |
| 66 | |
| 67 OUP = args.o | |
| 68 LAB = args.l | |
| 69 #print LAB | |
| 70 INP = args.i | |
| 71 | |
| 72 N_FILES = len(INP) | |
| 73 | |
| 74 mpl.rcParams.update({'font.size': 13, 'font.weight':'bold', 'lines.linewidth': 3}) | |
| 75 | |
| 76 ################################################# | |
| 77 # | |
| 78 # BASIC STATS | |
| 79 # | |
| 80 ################################################# | |
| 81 mpl.figure(0,figsize=(12,10)) | |
| 82 | |
| 83 mpl.subplot(2,2,1) | |
| 84 colorInd = 0 | |
| 85 for fn in INP: | |
| 86 myCol = getColor(colorInd,N_FILES) | |
| 87 colorInd += 1 | |
| 88 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 89 [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']] | |
| 90 mpl.bar([colorInd-1],[AVG_MUT_RATE],1.,color=myCol) | |
| 91 mpl.xlim([-1,N_FILES+1]) | |
| 92 mpl.grid() | |
| 93 mpl.xticks([],[]) | |
| 94 mpl.ylabel('Frequency') | |
| 95 mpl.title('Overall mutation rate (1/bp)') | |
| 96 | |
| 97 mpl.subplot(2,2,2) | |
| 98 colorInd = 0 | |
| 99 for fn in INP: | |
| 100 myCol = getColor(colorInd,N_FILES) | |
| 101 colorInd += 1 | |
| 102 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 103 [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']] | |
| 104 mpl.bar([colorInd-1],[SNP_FREQ],1.,color=myCol) | |
| 105 mpl.bar([colorInd-1],[1.-SNP_FREQ],1.,color=myCol,bottom=[SNP_FREQ],hatch='/') | |
| 106 mpl.axis([-1,N_FILES+1,0,1.2]) | |
| 107 mpl.grid() | |
| 108 mpl.xticks([],[]) | |
| 109 mpl.yticks([0,.2,.4,.6,.8,1.],[0,0.2,0.4,0.6,0.8,1.0]) | |
| 110 mpl.ylabel('Frequency') | |
| 111 mpl.title('SNP freq [ ] & indel freq [//]') | |
| 112 | |
| 113 mpl.subplot(2,1,2) | |
| 114 colorInd = 0 | |
| 115 legText = LAB | |
| 116 for fn in INP: | |
| 117 myCol = getColor(colorInd,N_FILES) | |
| 118 colorInd += 1 | |
| 119 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 120 [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']] | |
| 121 x = sorted(INDEL_FREQ.keys()) | |
| 122 y = [INDEL_FREQ[n] for n in x] | |
| 123 mpl.plot(x,y,color=myCol) | |
| 124 #legText.append(fn) | |
| 125 mpl.grid() | |
| 126 mpl.xlabel('Indel size (bp)', fontweight='bold') | |
| 127 mpl.ylabel('Frequency') | |
| 128 mpl.title('Indel frequency by size (- deletion, + insertion)') | |
| 129 mpl.legend(legText) | |
| 130 #mpl.show() | |
| 131 mpl.savefig(OUP+'_plot1_mutRates.pdf') | |
| 132 | |
| 133 ################################################# | |
| 134 # | |
| 135 # TRINUC PRIOR PROB | |
| 136 # | |
| 137 ################################################# | |
| 138 mpl.figure(1,figsize=(14,6)) | |
| 139 colorInd = 0 | |
| 140 legText = LAB | |
| 141 for fn in INP: | |
| 142 myCol = getColor(colorInd,N_FILES) | |
| 143 colorInd += 1 | |
| 144 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 145 TRINUC_MUT_PROB = DATA_DICT['TRINUC_MUT_PROB'] | |
| 146 | |
| 147 x = range(colorInd-1,len(TRINUC_MUT_PROB)*N_FILES,N_FILES) | |
| 148 xt = sorted(TRINUC_MUT_PROB.keys()) | |
| 149 y = [TRINUC_MUT_PROB[k] for k in xt] | |
| 150 markerline, stemlines, baseline = mpl.stem(x,y,'-.') | |
| 151 mpl.setp(markerline, 'markerfacecolor', myCol) | |
| 152 mpl.setp(markerline, 'markeredgecolor', myCol) | |
| 153 mpl.setp(baseline, 'color', myCol, 'linewidth', 0) | |
| 154 mpl.setp(stemlines, 'color', myCol, 'linewidth', 3) | |
| 155 if colorInd == 1: | |
| 156 mpl.xticks(x,xt,rotation=90) | |
| 157 #legText.append(fn) | |
| 158 mpl.grid() | |
| 159 mpl.ylabel('p(trinucleotide mutates)') | |
| 160 mpl.legend(legText) | |
| 161 #mpl.show() | |
| 162 mpl.savefig(OUP+'_plot2_trinucPriors.pdf') | |
| 163 | |
| 164 ################################################# | |
| 165 # | |
| 166 # TRINUC TRANS PROB | |
| 167 # | |
| 168 ################################################# | |
| 169 plotNum = 3 | |
| 170 for fn in INP: | |
| 171 fig = mpl.figure(plotNum,figsize=(12,10)) | |
| 172 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 173 TRINUC_TRANS_PROBS = DATA_DICT['TRINUC_TRANS_PROBS'] | |
| 174 | |
| 175 xt2 = [m[3] for m in sorted([(n[0],n[2],n[1],n) for n in xt])] | |
| 176 reverse_dict = {xt2[i]:i for i in xrange(len(xt2))} | |
| 177 Z = np.zeros((64,64)) | |
| 178 L = [['' for n in xrange(64)] for m in xrange(64)] | |
| 179 for k in TRINUC_TRANS_PROBS: | |
| 180 i = reverse_dict[k[0]] | |
| 181 j = reverse_dict[k[1]] | |
| 182 Z[i][j] = TRINUC_TRANS_PROBS[k] | |
| 183 | |
| 184 HARDCODED_LABEL = ['A_A','A_C','A_G','A_T', | |
| 185 'C_A','C_C','C_G','C_T', | |
| 186 'G_A','G_C','G_G','G_T', | |
| 187 'T_A','T_C','T_G','T_T'] | |
| 188 | |
| 189 for pi in xrange(16): | |
| 190 mpl.subplot(4,4,pi+1) | |
| 191 Z2 = Z[pi*4:(pi+1)*4,pi*4:(pi+1)*4] | |
| 192 X, Y = np.meshgrid( range(0,len(Z2[0])+1), range(0,len(Z2)+1) ) | |
| 193 im = mpl.pcolormesh(X,Y,Z2[::-1,:],vmin=0.0,vmax=0.5) | |
| 194 mpl.axis([0,4,0,4]) | |
| 195 mpl.xticks([0.5,1.5,2.5,3.5],['A','C','G','T']) | |
| 196 mpl.yticks([0.5,1.5,2.5,3.5],['T','G','C','A']) | |
| 197 mpl.text(1.6, 1.8, HARDCODED_LABEL[pi], color='white') | |
| 198 | |
| 199 # colorbar haxx | |
| 200 fig.subplots_adjust(right=0.8) | |
| 201 cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) | |
| 202 cb = fig.colorbar(im,cax=cbar_ax) | |
| 203 cb.set_label(r"p(X$Y_1$Z->X$Y_2$Z | X_Z mutates)") | |
| 204 | |
| 205 #mpl.tight_layout() | |
| 206 #mpl.figtext(0.24,0.94,'Trinucleotide Mutation Frequency',size=20) | |
| 207 #mpl.show() | |
| 208 mpl.savefig(OUP+'_plot'+str(plotNum)+'_trinucTrans.pdf') | |
| 209 plotNum += 1 | |
| 210 | |
| 211 ################################################# | |
| 212 # | |
| 213 # HIGH MUT REGIONS | |
| 214 # | |
| 215 ################################################# | |
| 216 track_byFile_byChr = [{} for n in INP] | |
| 217 bp_total_byFile = [0 for n in INP] | |
| 218 colorInd = 0 | |
| 219 for fn in INP: | |
| 220 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 221 HIGH_MUT_REGIONS = DATA_DICT['HIGH_MUT_REGIONS'] | |
| 222 for region in HIGH_MUT_REGIONS: | |
| 223 if region[0] not in track_byFile_byChr[colorInd]: | |
| 224 track_byFile_byChr[colorInd][region[0]] = [] | |
| 225 track_byFile_byChr[colorInd][region[0]].extend([region[1],region[2]]) | |
| 226 bp_total_byFile[colorInd] += region[2]-region[1]+1 | |
| 227 colorInd += 1 | |
| 228 | |
| 229 bp_overlap_count = [[0 for m in INP] for n in INP] | |
| 230 for i in xrange(N_FILES): | |
| 231 bp_overlap_count[i][i] = bp_total_byFile[i] | |
| 232 for j in xrange(i+1,N_FILES): | |
| 233 for k in track_byFile_byChr[i].keys(): | |
| 234 if k in track_byFile_byChr[j]: | |
| 235 for ii in xrange(len(track_byFile_byChr[i][k][::2])): | |
| 236 bp_overlap_count[i][j] += getBedOverlap(track_byFile_byChr[j][k],track_byFile_byChr[i][k][ii*2],track_byFile_byChr[i][k][ii*2+1]) | |
| 237 | |
| 238 print '' | |
| 239 print 'HIGH_MUT_REGION OVERLAP BETWEEN '+str(N_FILES)+' MODELS...' | |
| 240 for i in xrange(N_FILES): | |
| 241 for j in xrange(i,N_FILES): | |
| 242 nDissimilar = (bp_overlap_count[i][i]-bp_overlap_count[i][j]) + (bp_overlap_count[j][j]-bp_overlap_count[i][j]) | |
| 243 if bp_overlap_count[i][j] == 0: | |
| 244 percentageV = 0.0 | |
| 245 else: | |
| 246 percentageV = bp_overlap_count[i][j]/float(bp_overlap_count[i][j]+nDissimilar) | |
| 247 print 'overlap['+str(i)+','+str(j)+'] = '+str(bp_overlap_count[i][j])+' bp ({0:.3f}%)'.format(percentageV*100.) | |
| 248 print '' | |
| 249 | |
| 250 ################################################# | |
| 251 # | |
| 252 # COMMON VARIANTS | |
| 253 # | |
| 254 ################################################# | |
| 255 setofVars = [set([]) for n in INP] | |
| 256 colorInd = 0 | |
| 257 for fn in INP: | |
| 258 DATA_DICT = pickle.load( open( fn, "rb" ) ) | |
| 259 COMMON_VARIANTS = DATA_DICT['COMMON_VARIANTS'] | |
| 260 for n in COMMON_VARIANTS: | |
| 261 setofVars[colorInd].add(n) | |
| 262 colorInd += 1 | |
| 263 | |
| 264 print '' | |
| 265 print 'COMMON_VARIANTS OVERLAP BETWEEN '+str(N_FILES)+' MODELS...' | |
| 266 for i in xrange(N_FILES): | |
| 267 for j in xrange(i,N_FILES): | |
| 268 overlapCount = len(setofVars[i].intersection(setofVars[j])) | |
| 269 nDissimilar = (len(setofVars[i])-overlapCount) + (len(setofVars[j])-overlapCount) | |
| 270 if overlapCount == 0: | |
| 271 percentageV = 0.0 | |
| 272 else: | |
| 273 percentageV = overlapCount/float(overlapCount+nDissimilar) | |
| 274 print 'overlap['+str(i)+','+str(j)+'] = '+str(overlapCount)+' variants ({0:.3f}%)'.format(percentageV*100.) | |
| 275 print '' | |
| 276 | |
| 277 | |
| 278 |
