Mercurial > repos > thondeboer > neat_genreads
diff utilities/plotMutModel.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utilities/plotMutModel.py Tue May 15 02:39:53 2018 -0400 @@ -0,0 +1,278 @@ +#!/usr/bin/env python + +# +# a quick script for comparing mutation models +# +# python plotMutModel.py -i model1.p [model2.p] [model3.p]... -l legend_label1 [legend_label2] [legend_label3]... -o path/to/pdf_plot_prefix +# + +import sys +import pickle +import bisect +import numpy as np +import matplotlib.pyplot as mpl +import matplotlib.colors as colors +import matplotlib.cm as cmx +import argparse + +#mpl.rc('text',usetex=True) +#mpl.rcParams['text.latex.preamble']=[r"\usepackage{amsmath}"] + +parser = argparse.ArgumentParser(description='Plot and compare mutation models from genMutModel.py Usage: python plotMutModel.py -i model1.p [model2.p] [model3.p]... -l legend_label1 [legend_label2] [legend_label3]... -o path/to/pdf_plot_prefix') +parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* mutation_model_1.p [mutation_model_2.p] [mutation_model_3] ...") +parser.add_argument('-l', type=str, required=True, metavar='<str>', nargs='+', help="* legend labels: model1_name [model2_name] [model3_name]...") +parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output pdf prefix") +args = parser.parse_args() + + + +def getColor(i,N,colormap='jet'): + cm = mpl.get_cmap(colormap) + cNorm = colors.Normalize(vmin=0, vmax=N+1) + scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm) + colorVal = scalarMap.to_rgba(i) + return colorVal + +def isInBed(track,ind): + if ind in track: + return True + elif bisect.bisect(track,ind)%1 == 1: + return True + else: + return False + +def getBedOverlap(track,ind_s,ind_e): + if ind_s in track: + myInd = track.index(ind_s) + return min([track[myInd+1]-ind_s+1,ind_e-ind_s+1]) + else: + myInd = bisect.bisect(track,ind_s) + if myInd%1 and myInd < len(track)-1: + return min([track[myInd+1]-ind_s+1,ind_e-ind_s+1]) + return 0 + +# a waaaaaaay slower version of the above function ^^ +#def getTrackOverlap(track1,track2): +# otrack = [0 for n in xrange(max(track1+track2)+1)] +# for i in xrange(0,len(track1),2): +# for j in xrange(track1[i],track1[i+1]+1): +# otrack[j] = 1 +# ocount = 0 +# for i in xrange(0,len(track2),2): +# for j in xrange(track2[i],track2[i+1]+1): +# if otrack[j]: +# ocount += 1 +# return ocount + +OUP = args.o +LAB = args.l +#print LAB +INP = args.i + +N_FILES = len(INP) + +mpl.rcParams.update({'font.size': 13, 'font.weight':'bold', 'lines.linewidth': 3}) + +################################################# +# +# BASIC STATS +# +################################################# +mpl.figure(0,figsize=(12,10)) + +mpl.subplot(2,2,1) +colorInd = 0 +for fn in INP: + myCol = getColor(colorInd,N_FILES) + colorInd += 1 + DATA_DICT = pickle.load( open( fn, "rb" ) ) + [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']] + mpl.bar([colorInd-1],[AVG_MUT_RATE],1.,color=myCol) +mpl.xlim([-1,N_FILES+1]) +mpl.grid() +mpl.xticks([],[]) +mpl.ylabel('Frequency') +mpl.title('Overall mutation rate (1/bp)') + +mpl.subplot(2,2,2) +colorInd = 0 +for fn in INP: + myCol = getColor(colorInd,N_FILES) + colorInd += 1 + DATA_DICT = pickle.load( open( fn, "rb" ) ) + [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']] + mpl.bar([colorInd-1],[SNP_FREQ],1.,color=myCol) + mpl.bar([colorInd-1],[1.-SNP_FREQ],1.,color=myCol,bottom=[SNP_FREQ],hatch='/') +mpl.axis([-1,N_FILES+1,0,1.2]) +mpl.grid() +mpl.xticks([],[]) +mpl.yticks([0,.2,.4,.6,.8,1.],[0,0.2,0.4,0.6,0.8,1.0]) +mpl.ylabel('Frequency') +mpl.title('SNP freq [ ] & indel freq [//]') + +mpl.subplot(2,1,2) +colorInd = 0 +legText = LAB +for fn in INP: + myCol = getColor(colorInd,N_FILES) + colorInd += 1 + DATA_DICT = pickle.load( open( fn, "rb" ) ) + [AVG_MUT_RATE, SNP_FREQ, INDEL_FREQ] = [DATA_DICT['AVG_MUT_RATE'], DATA_DICT['SNP_FREQ'], DATA_DICT['INDEL_FREQ']] + x = sorted(INDEL_FREQ.keys()) + y = [INDEL_FREQ[n] for n in x] + mpl.plot(x,y,color=myCol) + #legText.append(fn) +mpl.grid() +mpl.xlabel('Indel size (bp)', fontweight='bold') +mpl.ylabel('Frequency') +mpl.title('Indel frequency by size (- deletion, + insertion)') +mpl.legend(legText) +#mpl.show() +mpl.savefig(OUP+'_plot1_mutRates.pdf') + +################################################# +# +# TRINUC PRIOR PROB +# +################################################# +mpl.figure(1,figsize=(14,6)) +colorInd = 0 +legText = LAB +for fn in INP: + myCol = getColor(colorInd,N_FILES) + colorInd += 1 + DATA_DICT = pickle.load( open( fn, "rb" ) ) + TRINUC_MUT_PROB = DATA_DICT['TRINUC_MUT_PROB'] + + x = range(colorInd-1,len(TRINUC_MUT_PROB)*N_FILES,N_FILES) + xt = sorted(TRINUC_MUT_PROB.keys()) + y = [TRINUC_MUT_PROB[k] for k in xt] + markerline, stemlines, baseline = mpl.stem(x,y,'-.') + mpl.setp(markerline, 'markerfacecolor', myCol) + mpl.setp(markerline, 'markeredgecolor', myCol) + mpl.setp(baseline, 'color', myCol, 'linewidth', 0) + mpl.setp(stemlines, 'color', myCol, 'linewidth', 3) + if colorInd == 1: + mpl.xticks(x,xt,rotation=90) + #legText.append(fn) +mpl.grid() +mpl.ylabel('p(trinucleotide mutates)') +mpl.legend(legText) +#mpl.show() +mpl.savefig(OUP+'_plot2_trinucPriors.pdf') + +################################################# +# +# TRINUC TRANS PROB +# +################################################# +plotNum = 3 +for fn in INP: + fig = mpl.figure(plotNum,figsize=(12,10)) + DATA_DICT = pickle.load( open( fn, "rb" ) ) + TRINUC_TRANS_PROBS = DATA_DICT['TRINUC_TRANS_PROBS'] + + xt2 = [m[3] for m in sorted([(n[0],n[2],n[1],n) for n in xt])] + reverse_dict = {xt2[i]:i for i in xrange(len(xt2))} + Z = np.zeros((64,64)) + L = [['' for n in xrange(64)] for m in xrange(64)] + for k in TRINUC_TRANS_PROBS: + i = reverse_dict[k[0]] + j = reverse_dict[k[1]] + Z[i][j] = TRINUC_TRANS_PROBS[k] + + HARDCODED_LABEL = ['A_A','A_C','A_G','A_T', + 'C_A','C_C','C_G','C_T', + 'G_A','G_C','G_G','G_T', + 'T_A','T_C','T_G','T_T'] + + for pi in xrange(16): + mpl.subplot(4,4,pi+1) + Z2 = Z[pi*4:(pi+1)*4,pi*4:(pi+1)*4] + X, Y = np.meshgrid( range(0,len(Z2[0])+1), range(0,len(Z2)+1) ) + im = mpl.pcolormesh(X,Y,Z2[::-1,:],vmin=0.0,vmax=0.5) + mpl.axis([0,4,0,4]) + mpl.xticks([0.5,1.5,2.5,3.5],['A','C','G','T']) + mpl.yticks([0.5,1.5,2.5,3.5],['T','G','C','A']) + mpl.text(1.6, 1.8, HARDCODED_LABEL[pi], color='white') + + # colorbar haxx + fig.subplots_adjust(right=0.8) + cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) + cb = fig.colorbar(im,cax=cbar_ax) + cb.set_label(r"p(X$Y_1$Z->X$Y_2$Z | X_Z mutates)") + + #mpl.tight_layout() + #mpl.figtext(0.24,0.94,'Trinucleotide Mutation Frequency',size=20) + #mpl.show() + mpl.savefig(OUP+'_plot'+str(plotNum)+'_trinucTrans.pdf') + plotNum += 1 + +################################################# +# +# HIGH MUT REGIONS +# +################################################# +track_byFile_byChr = [{} for n in INP] +bp_total_byFile = [0 for n in INP] +colorInd = 0 +for fn in INP: + DATA_DICT = pickle.load( open( fn, "rb" ) ) + HIGH_MUT_REGIONS = DATA_DICT['HIGH_MUT_REGIONS'] + for region in HIGH_MUT_REGIONS: + if region[0] not in track_byFile_byChr[colorInd]: + track_byFile_byChr[colorInd][region[0]] = [] + track_byFile_byChr[colorInd][region[0]].extend([region[1],region[2]]) + bp_total_byFile[colorInd] += region[2]-region[1]+1 + colorInd += 1 + +bp_overlap_count = [[0 for m in INP] for n in INP] +for i in xrange(N_FILES): + bp_overlap_count[i][i] = bp_total_byFile[i] + for j in xrange(i+1,N_FILES): + for k in track_byFile_byChr[i].keys(): + if k in track_byFile_byChr[j]: + for ii in xrange(len(track_byFile_byChr[i][k][::2])): + bp_overlap_count[i][j] += getBedOverlap(track_byFile_byChr[j][k],track_byFile_byChr[i][k][ii*2],track_byFile_byChr[i][k][ii*2+1]) + +print '' +print 'HIGH_MUT_REGION OVERLAP BETWEEN '+str(N_FILES)+' MODELS...' +for i in xrange(N_FILES): + for j in xrange(i,N_FILES): + nDissimilar = (bp_overlap_count[i][i]-bp_overlap_count[i][j]) + (bp_overlap_count[j][j]-bp_overlap_count[i][j]) + if bp_overlap_count[i][j] == 0: + percentageV = 0.0 + else: + percentageV = bp_overlap_count[i][j]/float(bp_overlap_count[i][j]+nDissimilar) + print 'overlap['+str(i)+','+str(j)+'] = '+str(bp_overlap_count[i][j])+' bp ({0:.3f}%)'.format(percentageV*100.) +print '' + +################################################# +# +# COMMON VARIANTS +# +################################################# +setofVars = [set([]) for n in INP] +colorInd = 0 +for fn in INP: + DATA_DICT = pickle.load( open( fn, "rb" ) ) + COMMON_VARIANTS = DATA_DICT['COMMON_VARIANTS'] + for n in COMMON_VARIANTS: + setofVars[colorInd].add(n) + colorInd += 1 + +print '' +print 'COMMON_VARIANTS OVERLAP BETWEEN '+str(N_FILES)+' MODELS...' +for i in xrange(N_FILES): + for j in xrange(i,N_FILES): + overlapCount = len(setofVars[i].intersection(setofVars[j])) + nDissimilar = (len(setofVars[i])-overlapCount) + (len(setofVars[j])-overlapCount) + if overlapCount == 0: + percentageV = 0.0 + else: + percentageV = overlapCount/float(overlapCount+nDissimilar) + print 'overlap['+str(i)+','+str(j)+'] = '+str(overlapCount)+' variants ({0:.3f}%)'.format(percentageV*100.) +print '' + + +
