# HG changeset patch # User davidvanzessen # Date 1480602726 18000 # Node ID 81453585dfc3012427a11bfebeddd2c6c9084c32 # Parent 0bea8c187a90be2cc5eb472d137a13e46526b304 Uploaded diff -r 0bea8c187a90 -r 81453585dfc3 aa_histogram.r --- a/aa_histogram.r Mon Nov 28 10:27:22 2016 -0500 +++ b/aa_histogram.r Thu Dec 01 09:32:06 2016 -0500 @@ -15,50 +15,49 @@ absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="") for(gene in genes){ + if(gene == ""){ + mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),] + absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),] + } else { + mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),] + absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),] + } + print(paste("nrow", gene, nrow(absent.aa.by.id.gene))) + if(nrow(mutations.by.id.gene) == 0){ + next + } - if(gene == ""){ - mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),] - absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),] - } else { - mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),] - absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),] - } - print(paste("nrow", gene, nrow(absent.aa.by.id.gene))) - if(nrow(mutations.by.id.gene) == 0){ - next - } - - mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)]) - aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)]) + mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)]) + aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)]) - dat_freq = mutations.at.position / aa.at.position - dat_freq[is.na(dat_freq)] = 0 - dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq) + dat_freq = mutations.at.position / aa.at.position + dat_freq[is.na(dat_freq)] = 0 + dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq) - print("---------------- plot ----------------") + print("---------------- plot ----------------") - m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black")) - m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i) - m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1") - m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1") - m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2") - m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2") - m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3") - m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency")) - m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) - m = m + scale_colour_manual(values=c("black")) + m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black")) + m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i) + m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1") + m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1") + m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2") + m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2") + m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3") + m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency")) + m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) + #m = m + scale_colour_manual(values=c("black")) - print("---------------- write/print ----------------") + print("---------------- write/print ----------------") - dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position) + dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position) - write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) - write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) - write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) - write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) - - png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720) - print(m) - dev.off() + write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + + png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720) + print(m) + dev.off() } diff -r 0bea8c187a90 -r 81453585dfc3 pattern_plots.r --- a/pattern_plots.r Mon Nov 28 10:27:22 2016 -0500 +++ b/pattern_plots.r Thu Dec 01 09:32:06 2016 -0500 @@ -18,6 +18,8 @@ plot3.png = paste(plot3.path, ".png", sep="") plot3.txt = paste(plot3.path, ".txt", sep="") +clean.output = args[5] + dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1) @@ -28,6 +30,11 @@ names(dat) = new.names +clean.dat = dat +clean.dat = clean.dat[,c(paste(rep(classes, each=3), xyz, sep="."), paste("all", xyz, sep="."), paste("un", xyz, sep="."))] + +write.table(clean.dat, clean.output, quote=F, sep="\t", na="", row.names=T, col.names=NA) + dat["RGYW.WRCY",] = colSums(dat[c(13,14),], na.rm=T) dat["TW.WA",] = colSums(dat[c(15,16),], na.rm=T) @@ -51,26 +58,24 @@ print(p) dev.off() -data2 = dat[5:8,] - -data2["sum",] = colSums(data2, na.rm=T) +data2 = dat[c(1, 5:8),] data2 = data2[,names(data2)[grepl("\\.x", names(data2))]] names(data2) = gsub(".x", "", names(data2)) -data2["A/T",] = round(colSums(data2[3:4,]) / data2["sum",] * 100, 1) -data2["A/T",is.nan(unlist(data2["A/T",]))] = 0 +data2["A/T",] = dat["Targeting of A T (%)",names(dat)[grepl("\\.z", names(dat))]] -data2["G/C transversions",] = round(data2[2,] / data2["sum",] * 100, 1) -data2["G/C transitions",] = round(data2[1,] / data2["sum",] * 100, 1) +data2["G/C transitions",] = round(data2["Transitions at G C (%)",] / data2["Number of Mutations (%)",] * 100, 1) +data2["mutation.at.gc",] = dat["Transitions at G C (%)",names(dat)[grepl("\\.y", names(dat))]] +data2["G/C transversions",] = round((data2["mutation.at.gc",] - data2["Transitions at G C (%)",]) / data2["Number of Mutations (%)",] * 100, 1) data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0 data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0 data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0 data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0 -data2 = melt(t(data2[6:8,])) +data2 = melt(t(data2[c("A/T","G/C transitions","G/C transversions"),])) names(data2) = c("Class", "Type", "value") @@ -92,11 +97,11 @@ data3[is.na(data3)] = 0 #data3[is.infinite(data3)] = 0 -data3["G/C transitions",] = round(data3[1,] / (data3[5,] + data3[7,]) * 100, 1) +data3["G/C transitions",] = round(data3["Transitions at G C (%)",] / (data3["C",] + data3["G",]) * 100, 1) -data3["G/C transversions",] = round(data3[2,] / (data3[5,] + data3[7,]) * 100, 1) +data3["G/C transversions",] = round((data3["Targeting of G C (%)",] - data3["Transitions at G C (%)",]) / (data3["C",] + data3["G",]) * 100, 1) -data3["A/T",] = round(data3[3,] / (data3[4,] + data3[6,]) * 100, 1) +data3["A/T",] = round(data3["Targeting of A T (%)",] / (data3["A",] + data3["T",]) * 100, 1) data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0 data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0 diff -r 0bea8c187a90 -r 81453585dfc3 shm_csr.r --- a/shm_csr.r Mon Nov 28 10:27:22 2016 -0500 +++ b/shm_csr.r Thu Dec 01 09:32:06 2016 -0500 @@ -15,7 +15,7 @@ if(length(dat$Sequence.ID) == 0){ setwd(outputdir) result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) - row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)") write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) row.names(transitionTable) = c("A", "C", "G", "T") @@ -296,9 +296,9 @@ transition2 = merge(transition2, base.order, by.x="variable", by.y="base") transition2[is.na(transition2$value),]$value = 0 - + if(any(transition2$value != 0)){ #having rows of data but a transition table filled with 0 is bad - print("Plotting stacked transition") + print("Plotting heatmap and transition") png(filename=paste("transitions_stacked_", name, ".png", sep="")) p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) @@ -306,16 +306,13 @@ #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black")) print(p) dev.off() - - print("Plotting heatmap transition") - png(filename=paste("transitions_heatmap_", name, ".png", sep="")) p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value)) + scale_fill_gradient(low="white", high="steelblue") #heatmap p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) print(p) dev.off() } else { - print("No data to plot") + #print("No data to plot") } } @@ -338,22 +335,23 @@ func = funcs[[i]] fname = fnames[[i]] + print(paste("Creating table for", fname)) + rows = 9 if(fname == "sum"){ rows = 11 } matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) for(i in 1:length(genes)){ - print(paste("Creating table for", fname, genes[i])) matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) } matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") result = data.frame(matrx) if(fname == "sum"){ - row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") } else { - row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") } write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) } @@ -368,7 +366,7 @@ new.table[,1] = as.character(new.table[,1]) new.table[2,1] = "Median of Number of Mutations (%)" -#sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] +#sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) @@ -467,7 +465,9 @@ frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")]) -frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class") +frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match")]) + +frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match") frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) diff -r 0bea8c187a90 -r 81453585dfc3 wrapper.sh --- a/wrapper.sh Mon Nov 28 10:27:22 2016 -0500 +++ b/wrapper.sh Thu Dec 01 09:32:06 2016 -0500 @@ -230,7 +230,7 @@ echo "---------------- main tables ----------------
" >> $log echo "
" >> $output -echo "
" >> $output +echo "
" >> $output for func in ${funcs[@]} do @@ -243,7 +243,7 @@ echo "---------------- pattern_plots.r ----------------" echo "---------------- pattern_plots.r ----------------
" >> $log - Rscript $dir/pattern_plots.r $outdir/data_${func}.txt $outdir/plot1 $outdir/plot2 $outdir/plot3 2>&1 + Rscript $dir/pattern_plots.r $outdir/data_${func}.txt $outdir/plot1 $outdir/plot2 $outdir/plot3 $outdir/shm_overview.txt 2>&1 echo "" >> $output echo "" >> $output @@ -301,7 +301,7 @@ echo "---------------- images ----------------" echo "---------------- images ----------------
" >> $log -echo "
" >> $output +echo "
" >> $output if [ -a $outdir/scatter.png ] then @@ -314,7 +314,7 @@ echo "
" >> $output #SHM frequency tab end -echo "
" >> $output +echo "
" >> $output echo "
info
" >> $output @@ -322,8 +322,21 @@ do echo "" >> $output echo "" >> $output - echo "" >> $output - echo "" >> $output + + if [ -e $outdir/transitions_heatmap_${gene}.png ] + then + echo "" >> $output + else + echo "" >> $output + fi + + if [ -e $outdir/transitions_stacked_${gene}.png ] + then + echo "" >> $output + else + echo "" >> $output + fi + echo "

${gene}

" >> $output echo "" >> $output first="true" @@ -367,30 +380,66 @@ echo "
" >> $output -if [ -a $outdir/aa_histogram.png ] +if [ -e $outdir/aa_histogram.png ] then echo "
" >> $output +fi + +if [ -e $outdir/aa_histogram_IGA.png ] +then echo "
" >> $output +fi + +if [ -e $outdir/aa_histogram_IGG.png ] +then echo "
" >> $output +fi + +if [ -e $outdir/aa_histogram_IGM.png ] +then echo "
" >> $output +fi + +if [ -e $outdir/aa_histogram_IGE.png ] +then echo "
" >> $output fi -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output + +if [ -e $outdir/baseline.png ] +then + echo "" >> $output +fi + +if [ -e $outdir/baseline_IGA.png ] +then + echo "" >> $output +fi + +if [ -e $outdir/baseline_IGG.png ] +then + echo "" >> $output +fi + +if [ -e $outdir/baseline_IGM.png ] +then + echo "" >> $output +fi + +if [ -e $outdir/baseline_IGE.png ] +then + echo "" >> $output +fi echo "
" >> $output #antigen selection tab end echo "
" >> $output #CSR tab -if [ -a $outdir/IGA.png ] +if [ -e $outdir/IGA.png ] then echo "
" >> $output fi -if [ -a $outdir/IGG.png ] +if [ -e $outdir/IGG.png ] then echo "
" >> $output fi @@ -492,7 +541,7 @@ clonality_table $outdir/change_o/change-o-defined_clones-summary-IGM.txt $output echo "
" >> $output - echo "
" >> $output + echo "
" >> $output cat "$outdir/sequence_overview/index.html" >> $output echo "
" >> $output @@ -512,7 +561,7 @@ echo "
" >> $output echo "" >> $output -echo "" >> $output +echo "" >> $output echo "" >> $output echo "" >> $output echo "" >> $output @@ -526,17 +575,17 @@ echo "" >> $output echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output -echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output echo "" >> $output echo "" >> $output
To
The alignment info on the unmatched sequencesDownload
SHM Overview
The SHM Overview table as a datasetDownload
The SHM Overview table as a datasetDownload
Motif data per sequence IDDownload
Mutation data per sequence IDDownload
Base count for every sequenceView
The data for frequency by subclassDownload
Transition Tables
The data for the 'all' transition plotDownload
The data for the 'IGA' transition plotDownload
The data for the 'IGA1' transition plotDownload
The data for the 'IGA1' transition plotDownload
The data for the 'IGG' transition plotDownload
The data for the 'IGG1' transition plotDownload
The data for the 'IGG2' transition plotDownload
The data for the 'IGG3' transition plotDownload
The data for the 'IGG4' transition plotDownload
The data for the 'IGM' transition plotDownload
The data for the 'IGE' transition plotDownload
The data for the 'all' transition plotDownload
The data for the 'IGA' transition plotDownload
The data for the 'IGA1' transition plotDownload
The data for the 'IGA1' transition plotDownload
The data for the 'IGG' transition plotDownload
The data for the 'IGG1' transition plotDownload
The data for the 'IGG2' transition plotDownload
The data for the 'IGG3' transition plotDownload
The data for the 'IGG4' transition plotDownload
The data for the 'IGM' transition plotDownload
The data for the 'IGE' transition plotDownload
Antigen Selection
AA mutation data per sequence IDDownload