Mercurial repository: bornea / dotplot_runner
changeset 0:dfa3436beb67 (draft)
Uploaded
| author | bornea |
| --- | --- |
| date | Fri, 29 Jan 2016 09:56:02 -0500 |
| parents | (none) |
| children | 7e98f893e9ae |
| files | Dotplot_Release/BaitCheck.pl Dotplot_Release/Normalization.R Dotplot_Release/Normalization_sigpreys.R Dotplot_Release/R_dotPlot.R Dotplot_Release/R_dotPlot_hc.R Dotplot_Release/R_dotPlot_nc.R Dotplot_Release/SOFD.pl Dotplot_Release/SaintConvert.pl Dotplot_Release/Step1_data_reformating.R Dotplot_Release/Step2_data_filtering.R Dotplot_Release/Step3_nestedcluster Dotplot_Release/Step4_biclustering.R Dotplot_Release/biclust.tar.gz Dotplot_Release/biclust_param.txt Dotplot_Release/dotplot.bash Dotplot_Release/pheatmap_j.R |
| diffstat | 16 files changed, 1900 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/BaitCheck.pl	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# 27/04/2014
+
+if($#ARGV<1){
+  print "This program checks the number of baits in a Saint Output File.\n";
+  print "\nusage:\n $0\n-i [csv saint output file]\n\n";
+  die;
+}
+else{
+  $i=0;
+  $cutoff=0.01;
+  while($i<=$#ARGV){
+    if($ARGV[$i] eq '-i'){
+      $i++;
+      $ifile=$ARGV[$i];
+    }
+    else{
+      die "\nIncorrect program usage\n\n";
+    }
+    $i++;
+  }
+}
+
+$baitn=0; $file='';
+open(IFILE,"<$ifile") || die "$ifile can't be opened: $!";
+{ local $/=undef; $file=<IFILE>; }
+@lines=split /[\r\n]+/, $file;
+foreach $line (@lines) {
+  if($line =~ /^Bait/){
+  }
+  elsif($line =~ /^([^\t]+)/){
+    if($1 ne $bait[$baitn]){
+      $baitn++;
+      $bait[$baitn]=$1;
+    }
+  }
+  else{
+  }
+}
+close(IFILE);
+
+print $baitn;
+
+
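dotplot.bash uses this script to count baits before deciding whether biclustering is possible. A minimal invocation sketch, assuming the scripts sit in a Dotplot_Release/ directory and using a hypothetical input file name:

    # prints the number of distinct baits in the SAINT file to stdout
    bait_n=$(perl Dotplot_Release/BaitCheck.pl -i saint_output.txt)
    echo "Number of baits = $bait_n"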
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/Normalization.R	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,44 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(trailingOnly = TRUE)
+
+# This program normalizes a SAINT output file based on the spectral counts of all preys
+
+d = read.delim(args[1], header=T, sep="\t", as.is=T)
+
+baitn = 1
+curr_bait <- d$Bait[1]
+s <- vector()
+s[1] = 0
+for(i in 1:length(d$Bait)){
+  if(curr_bait != d$Bait[i]){
+    baitn <- baitn + 1
+    curr_bait <- d$Bait[i]
+    s[baitn] <- d$AvgSpec[i]
+  }
+  else{
+    s[baitn] <- s[baitn] + d$AvgSpec[i]
+  }
+}
+
+med.s = median(s)
+s = s / med.s
+
+d_n <- d
+baitn = 1
+curr_bait <- d_n$Bait[1]
+for(i in 1:length(d_n$Bait)){
+  if(curr_bait != d_n$Bait[i]){
+    baitn <- baitn + 1
+    curr_bait <- d_n$Bait[i]
+    d_n$AvgSpec[i] <- d_n$AvgSpec[i]/s[baitn]
+  }
+  else{
+    d_n$AvgSpec[i] <- d_n$AvgSpec[i]/s[baitn]
+  }
+}
+
+# print normalized data to file
+
+write.table(d_n, file = "norm_saint.txt", sep="\t", quote=F, row.names=F)
+
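Normalization.R rescales each bait's AvgSpec values by that bait's total spectral counts relative to the median bait total, then writes norm_saint.txt, which dotplot.bash uses in place of the original file when -N 1 is selected. A sketch with a hypothetical file name:

    # normalize on all preys (dotplot.bash option -N 1); writes ./norm_saint.txt
    Rscript Dotplot_Release/Normalization.R saint_output.txt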
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/Normalization_sigpreys.R	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,46 @@
+#!/usr/bin/env Rscript
+
+# This program normalizes a SAINT output file based on the spectral counts of "significant" preys,
+# that is, preys with a BFDR <= the secondary cutoff supplied to the dotplot script
+
+args <- commandArgs(trailingOnly = TRUE)
+
+d = read.delim(args[1], header=T, as.is=T)
+d <- d[d$BFDR <= as.numeric(args[2]),]
+
+baitn = 1
+curr_bait <- d$Bait[1]
+s <- vector()
+s[1] = 0
+for(i in 1:length(d$Bait)){
+  if(curr_bait != d$Bait[i]){
+    baitn <- baitn + 1
+    curr_bait <- d$Bait[i]
+    s[baitn] <- d$AvgSpec[i]
+  }
+  else{
+    s[baitn] <- s[baitn] + d$AvgSpec[i]
+  }
+}
+
+med.s = median(s)
+s = s / med.s
+
+d_n <- d
+baitn = 1
+curr_bait <- d_n$Bait[1]
+for(i in 1:length(d_n$Bait)){
+  if(curr_bait != d_n$Bait[i]){
+    baitn <- baitn + 1
+    curr_bait <- d_n$Bait[i]
+    d_n$AvgSpec[i] <- d_n$AvgSpec[i]/s[baitn]
+  }
+  else{
+    d_n$AvgSpec[i] <- d_n$AvgSpec[i]/s[baitn]
+  }
+}
+
+# print normalized data to file
+
+write.table(d_n, file = "norm_saint.txt", sep="\t", quote=F, row.names=F)
+
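The _sigpreys variant computes the per-bait scaling factors only from preys whose BFDR passes the cutoff given as the second argument (dotplot.bash passes -C, which defaults to the secondary cutoff -t). A sketch using the recommended secondary cutoff and a hypothetical file name:

    # normalize on significant preys only (dotplot.bash option -N 2); writes ./norm_saint.txt
    Rscript Dotplot_Release/Normalization_sigpreys.R saint_output.txt 0.025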
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/R_dotPlot.R Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,83 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +library('latticeExtra') +library('colorRamps') + +data.file <- read.table("SC_data.txt", sep="\t", header=TRUE, row.names=1) ### import spectral count data +data.file2 <- read.table("FDR_data.txt", sep="\t", header=TRUE, row.names=1) ### import FDR count data +data.file3 <- read.table("clustered_matrix.txt", sep="\t", header=TRUE, row.names=1) ### import clustered matrix +data.file4 <- scan("singletons.txt", what="", sep="\n", strip.white=T) ### import singleton data + +#setting parameters + +Sfirst=as.numeric(args[1]) #first FDR cutoff +Ssecond=as.numeric(args[2]) #second FDR cutoff +maxp=as.integer(args[3]) #maximum value for a spectral count + +#calculate column and row lengths + +#determine bait and prey ordering + +bait_levels=names(data.file3) +prey_levels=c(rownames(data.file3),data.file4) + +x_ord=factor(row.names(data.file),levels=prey_levels) +y_ord=factor(names(data.file),levels=bait_levels) + +df<-data.frame(y=rep(y_ord,nrow(data.file)) + ,x=rep(x_ord, each=ncol(data.file)) + ,z1=as.vector(t(data.file)) # Circle color + ,z2=as.vector(t(data.file/apply(data.file,1,max))) # Circle size + ,z3=as.vector(t(data.file2)) # FDR +) + +df$z1[df$z1>maxp] <- maxp #maximum value for spectral count +df$z2[df$z2==0] <- NA +df$z3[df$z3>Ssecond] <- 0.05*maxp +df$z3[df$z3<=Ssecond & df$z3>Sfirst] <- 0.5*maxp +df$z3[df$z3<=Sfirst] <- 1*maxp +df$z4 <- df$z1 +df$z4[df$z4==0] <- 0 +df$z4[df$z4>0] <- 2.5 + +# The labeling for the colorkey + +labelat = c(0, maxp) +labeltext = c(0, maxp) + +# color scheme to use + +nmb.colors<-maxp +z.colors<-grey(rev(seq(0,0.9,0.9/nmb.colors))) #grayscale color scale + +#plot + +pl <- levelplot(z1~x*y, data=df + ,col.regions =z.colors #terrain.colors(100) + ,scales = list(x = list(rot = 90), y=list(cex=0.8), tck=0) # rotates X,Y labels and changes scale + ,colorkey = FALSE + ,xlab="Prey", ylab="Bait" + ,panel=function(x,y,z,...,col.regions){ + print(x) + z.c<-df$z1[ (df$x %in% as.character(x)) & (df$y %in% y)] + z.2<-df$z2[ (df$x %in% as.character(x)) & (df$y %in% y)] + z.3<-df$z3 + z.4<-df$z4 + panel.xyplot(x,y + ,as.table=TRUE + ,pch=21 # point type to use (circles in this case) + ,cex=((z.2-min(z.2,na.rm=TRUE))/(max(z.2,na.rm=TRUE)-min(z.2,na.rm=TRUE)))*3 #circle size + ,fill=z.colors[floor((z.c-min(z.c,na.rm=TRUE))*nmb.colors/(max(z.c,na.rm=TRUE)-min(z.c,na.rm=TRUE)))+1] # circle colors + ,col=z.colors[1+z.3] # border colors + ,lex=z.4 #border thickness + ) + } + #,main="Fold change" # graph main title + ) +if(ncol(data.file) > 4) ht=3.5+(0.36*((ncol(data.file)-1)-4)) else ht=3.5 +if(nrow(data.file) > 20) wd=8.25+(0.29*(nrow(data.file)-20)) else wd=5+(0.28*(nrow(data.file)-10)) +pdf("dotplot.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +print(pl) +dev.off()
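R_dotPlot.R is the plotting step of the biclustering branch. It expects SC_data.txt and FDR_data.txt (from SOFD.pl) plus clustered_matrix.txt and singletons.txt (from Step4_biclustering.R) in the working directory, and takes the primary FDR cutoff, secondary FDR cutoff and maximum spectral count as arguments. A sketch of the call dotplot.bash makes; the maximum count of 50 is a placeholder value:

    # draws dotplot.pdf from the matrices prepared by SOFD.pl and Step4_biclustering.R
    Rscript Dotplot_Release/R_dotPlot.R 0.01 0.025 50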
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/R_dotPlot_hc.R Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,125 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +pheatmapj_loc <- paste(args[6],"pheatmap_j.R",sep="/") +heatmap2j_loc <- paste(args[6],"heatmap_2j.R",sep="/") + +library('latticeExtra') +library('RColorBrewer') +library('grid') +library(reshape2) +library('gplots') +library('gtools') +source(pheatmapj_loc) +source(heatmap2j_loc) + +data.file <- read.table("SC_data.txt", sep="\t", header=TRUE, row.names=1) ### import spectral count data +data.file2 <- read.table("FDR_data.txt", sep="\t", header=TRUE, row.names=1) ### import FDR count data + +#setting parameters + +Sfirst=as.numeric(args[1]) #first FDR cutoff +Ssecond=as.numeric(args[2]) #second FDR cutoff +maxp=as.integer(args[3]) #maximum value for a spectral count +methd <- args[4] +dist_methd <- args[5] + +#determine bait and prey ordering + +dist_bait <- dist(as.matrix(t(data.file)), method= dist_methd) # "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski" +dist_prey <- dist(as.matrix(data.file), method= dist_methd) + +if(methd == "ward"){ + dist_bait <- dist_bait^2 #comment out this line and the next if not using Ward's method of clustering + dist_prey <- dist_prey^2 +} + +hc_bait <- hclust(dist_bait, method = methd) # method = "average", "single", "complete", "ward", "mcquitty", "median" or "centroid" +hc_prey <- hclust(dist_prey, method = methd) + +data.file = data.file[hc_prey$order, , drop = FALSE] +data.file = data.file[, hc_bait$order, drop = FALSE] +data.file2 = data.file2[hc_prey$order, , drop = FALSE] +data.file2 = data.file2[, hc_bait$order, drop = FALSE] + +x_ord=factor(row.names(data.file), levels=row.names(data.file)) +y_ord=factor(names(data.file[1,]), levels=names(data.file[1,])) + +df<-data.frame(y=rep(y_ord, nrow(data.file)) + ,x=rep(x_ord, each=ncol(data.file)) + ,z1=as.vector(t(data.file)) # Circle color + ,z2=as.vector(t(data.file/apply(data.file,1,max))) # Circle size + ,z3=as.vector(t(data.file2)) # FDR +) + +df$z1[df$z1>maxp] <- maxp #maximum value for spectral count +df$z2[df$z2==0] <- NA +df$z3[df$z3>Ssecond] <- 0.05*maxp +df$z3[df$z3<=Ssecond & df$z3>Sfirst] <- 0.5*maxp +df$z3[df$z3<=Sfirst] <- 1*maxp +df$z4 <- df$z1 +df$z4[df$z4==0] <- 0 +df$z4[df$z4>0] <- 2.5 + +# The labeling for the colorkey + +labelat = c(0, maxp) +labeltext = c(0, maxp) + +# color scheme to use + +nmb.colors<-maxp +z.colors<-grey(rev(seq(0,0.9,0.9/nmb.colors))) #grayscale color scale + +#plot dotplot + +pl <- levelplot(z1~x*y, data=df + ,col.regions =z.colors #terrain.colors(100) + ,scales = list(x = list(rot = 90), y=list(cex=0.8), tck=0) # rotates X,Y labels and changes scale + ,colorkey = FALSE + #,colorkey = list(space="bottom", width=1.5, height=0.3, labels=list(at = labelat, labels = labeltext)) #put colorkey at top with my labeling scheme + ,xlab="Prey", ylab="Bait" + ,panel=function(x,y,z,...,col.regions){ + print(x) + z.c<-df$z1[ (df$x %in% as.character(x)) & (df$y %in% y)] + z.2<-df$z2[ (df$x %in% as.character(x)) & (df$y %in% y)] + z.3<-df$z3 + z.4<-df$z4 + panel.xyplot(x,y + ,as.table=TRUE + ,pch=21 # point type to use (circles in this case) + ,cex=((z.2-min(z.2,na.rm=TRUE))/(max(z.2,na.rm=TRUE)-min(z.2,na.rm=TRUE)))*3 #circle size + ,fill=z.colors[floor((z.c-min(z.c,na.rm=TRUE))*nmb.colors/(max(z.c,na.rm=TRUE)-min(z.c,na.rm=TRUE)))+1] # circle colors + ,col=z.colors[1+z.3] # border colors + ,lex=z.4 #border thickness + ) + } + #,main="Fold change" # 
graph main title + ) +if(ncol(data.file) > 4) ht=3.5+(0.36*((ncol(data.file)-1)-4)) else ht=3.5 +if(nrow(data.file) > 20) wd=8.25+(0.29*(nrow(data.file) -20)) else wd=5.7+(0.28*(nrow(data.file) -10)) +pdf("dotplot.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +print(pl) +dev.off() + +#plot bait vs prey heatmap + +heat_df <- acast(df, y~x, value.var="z1") +heat_df <- apply(heat_df, 2, rev) + +if(ncol(data.file) > 4) ht=3.5+(0.1*((ncol(data.file)-1)-4)) else ht=3.5 +if(nrow(data.file) > 20) wd=8.25+(0.1*(nrow(data.file)-20)) else wd=5+(0.1*(nrow(data.file)-10)) +pdf("heatmap_borders.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +pheatmap_j(heat_df, scale="none", border_color="black", border_width = 0.1, cluster_rows=FALSE, cluster_cols=FALSE, col=colorRampPalette(c("#FFFFFF", brewer.pal(9,"Blues")))(100)) +dev.off() + +pdf("heatmap_no_borders.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +pheatmap_j(heat_df, scale="none", border_color=NA, cluster_rows=FALSE, cluster_cols=FALSE, col=colorRampPalette(c("#FFFFFF", brewer.pal(9,"Blues")))(100)) +dev.off() + +#plot bait vs bait heatmap using dist matrix +dist_bait <- dist_bait/max(dist_bait) +pdf("bait2bait.pdf", onefile = FALSE, paper = "special") +heatmap_2j(as.matrix(dist_bait), trace="none", scale="none", density.info="none", col=rev(colorRampPalette(c("#FFFFFF", brewer.pal(9,"Blues")))(100)), xMin=0, xMax=1, margins=c(1.5*max(nchar(rownames(as.matrix(dist_bait)))),1.5*max(nchar(colnames(as.matrix(dist_bait)))))) +dev.off()
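R_dotPlot_hc.R clusters both baits and preys hierarchically before plotting. In addition to the two cutoffs and the maximum count, it takes the hclust method, the distance metric and the script directory, from which it sources pheatmap_j.R and heatmap_2j.R (the latter is not included in this changeset). A sketch of the call made by dotplot.bash, with placeholder cutoff and maximum values:

    # hierarchical branch: writes dotplot.pdf, heatmap_borders.pdf,
    # heatmap_no_borders.pdf and bait2bait.pdf to the working directory
    Rscript Dotplot_Release/R_dotPlot_hc.R 0.01 0.025 50 ward canberra Dotplot_Release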
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/R_dotPlot_nc.R Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,140 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +pheatmapj_loc <- paste(args[9],"pheatmap_j.R",sep="/") + +library('latticeExtra') +library('RColorBrewer') +library('grid') +library(reshape2) +source(pheatmapj_loc) + +data.file <- read.table("SC_data.txt", sep="\t", header=TRUE, row.names=1) ### import spectral count data +data.file2 <- read.table("FDR_data.txt", sep="\t", header=TRUE, row.names=1) ### import FDR count data +bait_l <- scan(args[4], what="") ### import bait list +if(args[5] == 0) prey_l <- scan(args[6], what="") ### import prey list +methd <- args[7] +dist_methd <- args[8] + +#setting parameters + +Sfirst=as.numeric(args[1]) #first FDR cutoff +Ssecond=as.numeric(args[2]) #second FDR cutoff +maxp=as.integer(args[3]) #maximum value for a spectral count + +#extract only needed data + +if(args[5] == 0){ + remove <- vector() + remove <- prey_l[prey_l %in% row.names(data.file)] + prey_l <- prey_l[prey_l %in% remove] + remove <- bait_l[bait_l %in% names(data.file)] + bait_l <- bait_l[bait_l %in% remove] + data.file <- data.file[prey_l, bait_l] + data.file2 <- data.file2[prey_l, bait_l] +} else{ + remove <- vector() + remove <- bait_l[bait_l %in% names(data.file)] + bait_l <- bait_l[bait_l %in% remove] + data.file <- data.file[, bait_l] + data.file2 <- data.file2[, bait_l] + prey_keep = apply(data.file2, 1, function(x) sum(x<=Sfirst) >= 1) + data.file <- data.file[prey_keep,] + data.file2 <- data.file2[prey_keep,] +} + +#determine bait and prey ordering + +y_ord=factor(names(data.file[1,]),levels=bait_l) + +if(args[5] == 0){ + x_ord=factor(rownames(data.file),levels=prey_l) +} else { + + data.file <- data.file[which(rowSums(data.file) > 0),] + dist_prey <- dist(as.matrix(data.file), method= dist_methd) + + if(methd == "ward"){ + dist_prey <- dist_prey^2 + } + + hc_prey <- hclust(dist_prey, method = methd) + + data.file = data.file[hc_prey$order, , drop = FALSE] + data.file2 = data.file2[hc_prey$order, , drop = FALSE] + + x_ord=factor(row.names(data.file), levels=row.names(data.file)) +} + +df<-data.frame(y=rep(y_ord, nrow(data.file)) + ,x=rep(x_ord, each=ncol(data.file)) + ,z1=as.vector(t(data.file)) # Circle color + ,z2=as.vector(t(data.file/apply(data.file,1,max))) # Circle size + ,z3=as.vector(t(data.file2)) # FDR +) + +df$z1[df$z1>maxp] <- maxp #maximum value for spectral count +df$z2[df$z2==0] <- NA +df$z3[df$z3>Ssecond] <- 0.05*maxp +df$z3[df$z3<=Ssecond & df$z3>Sfirst] <- 0.5*maxp +df$z3[df$z3<=Sfirst] <- 1*maxp +df$z4 <- df$z1 +df$z4[df$z4==0] <- 0 +df$z4[df$z4>0] <- 2.5 + +# The labeling for the colorkey + +labelat = c(0, maxp) +labeltext = c(0, maxp) + +# color scheme to use + +nmb.colors<-maxp +z.colors<-grey(rev(seq(0,0.9,0.9/nmb.colors))) #grayscale color scale + +#plot dotplot + +pl <- levelplot(z1~x*y, data=df + ,col.regions =z.colors #terrain.colors(100) + ,scales = list(x = list(rot = 90), y=list(cex=0.8), tck=0) # rotates X,Y labels and changes scale + ,colorkey = FALSE + #,colorkey = list(space="bottom", width=1.5, height=0.3, labels=list(at = labelat, labels = labeltext)) #put colorkey at top with my labeling scheme + ,xlab="Prey", ylab="Bait" + ,panel=function(x,y,z,...,col.regions){ + print(x) + z.c<-df$z1[ (df$x %in% as.character(x)) & (df$y %in% y)] + z.2<-df$z2[ (df$x %in% as.character(x)) & (df$y %in% y)] + z.3<-df$z3 + z.4<-df$z4 + panel.xyplot(x,y + ,as.table=TRUE + ,pch=21 # point type to use (circles 
in this case) + ,cex=((z.2-min(z.2,na.rm=TRUE))/(max(z.2,na.rm=TRUE)-min(z.2,na.rm=TRUE)))*3 #circle size + ,fill=z.colors[floor((z.c-min(z.c,na.rm=TRUE))*nmb.colors/(max(z.c,na.rm=TRUE)-min(z.c,na.rm=TRUE)))+1] # circle colors + ,col=z.colors[1+z.3] # border colors + ,lex=z.4 #border thickness + ) + } + #,main="Fold change" # graph main title + ) +if(ncol(data.file) > 4) ht=3.5+(0.36*((ncol(data.file)-1)-4)) else ht=3.5 +if(nrow(data.file) > 20) wd=8.25+(0.29*(nrow(data.file)-20)) else wd=5.7+(0.28*(nrow(data.file)-10)) +pdf("dotplot.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +print(pl) +dev.off() + +#plot heatmap + +heat_df <- acast(df, y~x, value.var="z1") +heat_df <- apply(heat_df, 2, rev) + +if(ncol(data.file) > 4) ht=3.5+(0.1*((ncol(data.file)-1)-4)) else ht=3.5 +if(nrow(data.file) > 20) wd=8.25+(0.1*(nrow(data.file)-20)) else wd=5+(0.1*(nrow(data.file)-10)) +pdf("heatmap_borders.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +pheatmap_j(heat_df, scale="none", border_color="black", border_width = 0.1, cluster_rows=FALSE, cluster_cols=FALSE, col=colorRampPalette(c("#FFFFFF", brewer.pal(9,"Blues")))(100)) +dev.off() + +pdf("heatmap_no_borders.pdf", onefile = FALSE, paper = "special", height = ht, width = wd, pointsize = 2) +pheatmap_j(heat_df, scale="none", border_color=NA, cluster_rows=FALSE, cluster_cols=FALSE, col=colorRampPalette(c("#FFFFFF", brewer.pal(9,"Blues")))(100)) +dev.off()
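R_dotPlot_nc.R plots baits in a user-supplied order. Its arguments are the two cutoffs, the maximum count, a bait list file, a flag (0 = a prey list file is supplied, 1 = keep all significant preys and cluster them), the prey list file (or a placeholder when the flag is 1), the clustering method, the distance metric and the script directory. A sketch matching the call in dotplot.bash, with hypothetical list file names:

    # "no clustering" branch with explicit bait and prey ordering
    Rscript Dotplot_Release/R_dotPlot_nc.R 0.01 0.025 50 baits.txt 0 preys.txt ward canberra Dotplot_Release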
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/SOFD.pl Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,112 @@ +#!/usr/bin/perl + +# 17/12/2013 + +if($#ARGV==0){ + print "This program takes the Saint Output File and produces two matrices.\n"; + print "One has average spectral counts and the other FDR scores,\n"; + print "\nusage:\n $0\n-i [csv saint output file]\n-s [FDR cutoff, default=0.01]\n\n"; + die; +} +else{ + $i=0; + $cutoff=0.01; + $spec_cutoff=0; + while($i<=$#ARGV){ + if($ARGV[$i] eq '-i'){ + $i++; + $ifile=$ARGV[$i]; + } + elsif($ARGV[$i] eq '-s'){ + $i++; + if($ARGV[$i]>1 || $ARGV[$i]<0){ + die "\nFDR cutoff must be between 0 and 1 \n\n"; + } + $cutoff=$ARGV[$i]; + } + elsif($ARGV[$i] eq '-x'){ + $i++; + if($ARGV[$i]<0){ + die "\nAvgSpec cutoff must be > 0 \n\n"; + } + $spec_cutoff=$ARGV[$i]; + } + else{ + die "\Incorrect program usage\n\n"; + } + $i++; + } +} + +$baitn=0, $bait[0]=xxxx, $sig_preysn=0; +$file=''; +open(IFILE,"<$ifile") || die "$ifile can't be opened: $!"; +{ local $/=undef; $file=<IFILE>; } +@lines=split /[\r\n]+/, $file; +foreach $line (@lines) { + if($line =~ /^Bait/){ + } + elsif($line =~ /^([^\t]+)\t[^\t]+\t([^\t]+)\t[^\t]+\t[\d]+\t([\d\.]+)\t[\d]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t([^\t]+)\t[^\t]+\t([^\t]+)\t/){ + if($1 ne $bait[$baitn]){ + $baitn++; + $bait[$baitn]=$1; + $preyn[$baitn]=0; + } + $preyn[$baitn]++; + $preys[$baitn][$preyn[$baitn]]=$2; + $avgspec[$baitn][$preyn[$baitn]]=$3; + $saint[$baitn][$preyn[$baitn]]=$4; + $fdr[$baitn][$preyn[$baitn]]=$5; + if($5 <= $cutoff && $3 >= $spec_cutoff){ + $check_prey=0; + for($i=1; $i<=$sig_preysn; $i++){ + if($sig_preys[$i] eq $2){ + $check_prey=1; + } + } + if($check_prey==0){ + $sig_preysn++; + $sig_preys[$sig_preysn]=$2; + } + } + } + else{ + } +} +close(IFILE); + +open(SC_FILE, ">SC_data.txt"); +open(FDR_FILE, ">FDR_data.txt"); + +for($i=1; $i<=$baitn; $i++){ + print SC_FILE "\t$bait[$i]"; + print FDR_FILE "\t$bait[$i]"; +} +print SC_FILE "\n"; +print FDR_FILE "\n"; +for($i=1; $i<=$sig_preysn; $i++){ + print SC_FILE "$sig_preys[$i]"; + print FDR_FILE "$sig_preys[$i]"; + for($j=1; $j<=$baitn; $j++){ + $krem=0; + for($k=1; $k<=$preyn[$j]; $k++){; + if($preys[$j][$k] eq $sig_preys[$i]){ + $krem=$k; + last; + } + } + if($krem != 0){ + print SC_FILE "\t$avgspec[$j][$krem]"; + print FDR_FILE "\t$fdr[$j][$krem]"; + } + else{ + print SC_FILE "\t0"; + print FDR_FILE "\t1"; + } + } + print SC_FILE "\n"; + print FDR_FILE "\n"; +} +close(SC_FILE); +close(FDR_FILE); +
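SOFD.pl converts the SaintExpress table into the two matrices every plotting script reads: SC_data.txt (AvgSpec per prey and bait) and FDR_data.txt (BFDR per prey and bait; pairs absent from the input get 0 and 1 respectively). Only preys passing the -s FDR and -x AvgSpec cutoffs in at least one bait are kept. A sketch of the call made by dotplot.bash, with a hypothetical file name:

    # builds SC_data.txt and FDR_data.txt in the working directory
    perl Dotplot_Release/SOFD.pl -i saint_output.txt -s 0.01 -x 0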
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/SaintConvert.pl	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,52 @@
+#!/usr/bin/perl
+
+# 17/12/2013
+
+if($#ARGV<1){
+  print "This program takes non-SaintExpress formatted data and converts it to a mock SaintExpress format.\n";
+  print "\nusage:\n $0\n-i [tab-delimited input file: Bait, Prey, Spec, FDR]\n\n";
+  die;
+}
+else{
+  $i=0;
+  while($i<=$#ARGV){
+    if($ARGV[$i] eq '-i'){
+      $i++;
+      $ifile=$ARGV[$i];
+    }
+    else{
+      die "\nIncorrect program usage\n\n";
+    }
+    $i++;
+  }
+}
+
+$i=0;
+$file='';
+open(IFILE,"<$ifile") || die "$ifile can't be opened: $!";
+{ local $/=undef; $file=<IFILE>; }
+@lines=split /[\r\n]+/, $file;
+foreach $line (@lines) {
+  if($line =~ /^Bait/){
+  }
+  elsif($line =~ /^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)/){
+    $bait[$i]=$1;
+    $prey[$i]=$2;
+    $spec[$i]=$3;
+    $fdr[$i]=$4;
+    $i++;
+  }
+  else{
+  }
+}
+close(IFILE);
+$line_count=$i;
+
+open(OFILE, ">mockSaintExpress.txt");
+print OFILE "Bait\tPrey\tPreyGene\tSpec\tSpecSum\tAvgSpec\tNumReplicates\tctrlCounts\tAvgP\tMaxP\tTopoAvgP\tTopoMaxP\tSaintScore\tFoldChange\tBFDR\tboosted_by\n";
+
+for($i=0; $i<$line_count; $i++){
+  print OFILE "$bait[$i]\t111\t$prey[$i]\t111\t111\t$spec[$i]\t111\t111\t111\t111\t111\t111\t111\t111\t$fdr[$i]\t111\n";
+}
+close(OFILE);
+
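SaintConvert.pl wraps a minimal four-column table (Bait, Prey, Spec, FDR) in mock SaintExpress columns (placeholders filled with 111) so the rest of the pipeline can run on non-SaintExpress input; dotplot.bash calls it when -i 1 is given. A sketch with a hypothetical input name:

    # converts a generic 4-column tab-delimited file; writes ./mockSaintExpress.txt
    perl Dotplot_Release/SaintConvert.pl -i other_scoring_tool.txt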
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/Step1_data_reformating.R	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,41 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(trailingOnly = TRUE)
+
+d = read.delim(args[1], header=T, sep="\t", as.is=T)
+
+### Select preys where at least one bait interaction passes the BFDR cutoff (args[2])
+
+preylist=unique(c(d$PreyGene[d$BFDR <= as.numeric(args[2])]))
+pid = d$PreyGene %in% preylist
+d = d[pid,]
+
+bb = unique(d$Bait)
+pp = unique(d$PreyGene)
+
+nbait = length(bb)
+nprey = length(pp)
+
+### Reformat the SAINT output data into a prey-by-bait matrix
+mat = matrix(0, nprey, nbait)
+
+n = nrow(d)
+mb = match(d$Bait, bb)
+mp = match(d$PreyGene, pp)
+
+### Using the AvgSpec for the spectral counts
+for(i in 1:n) {
+  mat[mp[i],mb[i]] = d$AvgSpec[i]
+}
+
+rownames(mat) = pp
+colnames(mat) = bb
+
+outfile <- paste(c(args[3]), "matrix.txt", sep="_")
+### The following file is the outcome of running this step.
+write.table(mat, outfile, sep="\t", quote=F)
+
+
+
+
+
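Step1_data_reformating.R turns the SAINT table into a prey-by-bait AvgSpec matrix, keeping only preys that pass the BFDR cutoff for at least one bait. A sketch of the call made by dotplot.bash (the output prefix is the input file name without its extension):

    # writes saint_output_matrix.txt (rows = preys, columns = baits)
    Rscript Dotplot_Release/Step1_data_reformating.R saint_output.txt 0.01 saint_output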
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/Step2_data_filtering.R	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,28 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(trailingOnly = TRUE)
+
+d = read.delim(args[1], header=T, as.is=T)
+
+d2 = d
+d2s = d
+
+ss_cutoff <- as.numeric(args[2])
+### Keep only preys that appear in at least 2 baits with spectral counts > args[2]
+id = apply(d, 1, function(x) sum(x>ss_cutoff) >= 2)
+id2 = apply(d, 1, function(x) sum(x>ss_cutoff) < 2)
+d2 = d2[id, ]
+d2s = d2s[id2, 0]
+max.d2 = max(as.numeric(as.matrix(d2)))
+d2 = d2 / max.d2 * 10
+
+d3 = data.frame(PROT = rownames(d2), d2)
+
+outfile <- paste(c(args[3]), "dat", sep=".")
+
+### The following file is the outcome of running this step.
+write.table(d3, outfile, sep="\t", quote=F, row.names=F)
+### This is the final input file for the nested clustering algorithm
+
+write.table(d2s, "singletons.txt", quote=F)
+
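Step2_data_filtering.R keeps preys seen above the -x spectral-count cutoff in at least two baits, rescales the matrix so its maximum is 10, and writes the .dat input for the nested clustering step; preys failing the filter are listed in singletons.txt. A sketch continuing the hypothetical example above:

    # writes saint_output.dat and singletons.txt
    Rscript Dotplot_Release/Step2_data_filtering.R saint_output_matrix.txt 0 saint_output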
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/Step4_biclustering.R Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,201 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +d = read.delim(args[1], header=T, sep="\t", as.is=T, row.names=1) + +clusters = read.delim("Clusters", header=T, sep="\t", as.is=T)[,-1] +clusters = data.frame(Bait=colnames(clusters), Cluster=as.numeric(clusters[1,])) +nested.clusters = read.delim("NestedClusters", header=F, sep="\t", as.is=T)[1:dim(d)[1],] +nested.phi = read.delim("NestedMu", header=F, sep="\t", as.is=T)[1:dim(d)[1],] +nested.phi2 = read.delim("NestedSigma2", header=F, sep="\t", as.is=T)[1:dim(d)[1],] +mcmc = read.delim("MCMCparameters", header=F, sep="\t", as.is=T) + +### distance between bait using phi (also reorder cluster names) +### report nested clusters with positive counts only +### rearrange rows and columns of the raw data matrix according to the back-tracking algorithm + +recursivePaste = function(x) { + n = length(x) + x = x[order(x)] + y = x[1] + if(n > 1) { + for(i in 2:n) y = paste(y, x[i], sep="-") + } + y +} + +calcDist = function(x, y) { + if(length(x) != length(y)) stop("different length\n") + else res = sum(abs(x-y)) + res +} + + +#clusters, nested.clusters, nested.phi, d + +bcl = clusters +pcl = nested.clusters +phi = nested.phi +phi2 = nested.phi2 +dat = d + + +## bipartite graph +make.graphlet = function(b,p,s) { + g = NULL + g$b = b + g$p = p + g$s = as.numeric(s) + g +} + +make.hub = function(b,p) { + g = NULL + g$b = b + g$p = p + g +} + +jaccard = function(x,y) { + j = length(intersect(x,y)) / length(union(x,y)) + j +} + +merge.graphlets = function(x, y) { + g = NULL + g$b = union(x$b, y$b) + g$p = union(x$p, y$p) + g$s1 = rep(0,length(g$p)) + g$s2 = rep(0,length(g$p)) + g$s1[match(x$p, g$p)] = x$s + g$s2[match(y$p, g$p)] = y$s + g$s = apply(cbind(g$s1, g$s2), 1, max) + g +} + +summarizeDP = function(bcl, pcl, phi, phi2, dat, hub.size=0.5, ...) 
{ + pcl = as.matrix(pcl) + phi = as.matrix(phi) + phi2 = as.matrix(phi2) + dat = as.matrix(dat) + rownames(phi) = rownames(dat) + rownames(phi2) = rownames(dat) + + ubcl = unique(as.numeric(bcl$Cluster)) + n = length(ubcl) + pcl = pcl[,ubcl] + phi = phi[,ubcl] + phi2 = phi2[,ubcl] + phi[phi < 0.05] = 0 + + bcl$Cluster = match(as.numeric(bcl$Cluster), ubcl) + colnames(pcl) = colnames(phi) = colnames(phi2) = paste("CL", 1:n, sep="") + + ## remove non-reproducible mean values + nprey = dim(dat)[1]; nbait = dim(dat)[2] + preys = rownames(dat); baits = colnames(dat) + n = length(unique(bcl$Cluster)) + for(j in 1:n) { + id = c(1:nbait)[bcl$Cluster == j] + for(k in 1:nprey) { + do.it = ifelse(mean(as.numeric(dat[k,id]) > 0) <= 0.5,TRUE,FALSE) + if(do.it) { + phi[k,j] = 0 + } + } + } + + ## create bipartite graphs (graphlets) + gr = NULL + for(j in 1:n) { + id = c(1:nbait)[bcl$Cluster == j] + id2 = c(1:nprey)[phi[,j] > 0] + gr[[j]] = make.graphlet(baits[id], preys[id2], phi[id2,j]) + } + + ## intersecting preys between graphlets + gr2 = NULL + cur = 1 + for(i in 1:n) { + for(j in 1:n) { + if(i != j) { + combine = jaccard(gr[[i]]$p, gr[[j]]$p) >= 0.75 + if(combine) { + gr2[[cur]] = merge.graphlets(gr[[i]], gr[[j]]) + cur = cur + 1 + } + } + } + } + + old.phi = phi + phi = phi[, bcl$Cluster] + phi2 = phi2[, bcl$Cluster] + ## find hub preys + proceed = apply(old.phi, 1, function(x) sum(x>0) >= 2) + h = NULL + cur = 1 + for(k in 1:nprey) { + if(proceed[k]) { + id = as.numeric(phi[k,]) > 0 + if(mean(id) >= hub.size) { + h[[cur]] = make.hub(baits[id], preys[k]) + cur = cur + 1 + } + } + } + nhub = cur - 1 + + res = list(data=dat, baitCL=bcl, phi=phi, phi2=phi2, gr = gr, gr2 = gr2, hub = h) + res +} + +res = summarizeDP(clusters, nested.clusters, nested.phi, nested.phi2, d) + +write.table(res$baitCL[order(res$baitCL$Cluster),], "baitClusters", sep="\t", quote=F, row.names=F) +write.table(res$data, "clusteredData", sep="\t", quote=F) + +##### SOFT +library(gplots) +tmpd = res$data +tmpm = res$phi +colnames(tmpm) = paste(colnames(res$data), colnames(tmpm)) + +pdf("estimated.pdf", height=25, width=8) +my.hclust<-hclust(dist(tmpd)) +my.dend<-as.dendrogram(my.hclust) +tmp.res = heatmap.2(tmpm, Rowv=my.dend, Colv=T, trace="n", col=rev(heat.colors(10)), breaks=seq(0,.5,by=0.05), margins=c(10,10), keysize=0.8, cexRow=0.4) +#tmp.res = heatmap.2(tmpm, Rowv=T, Colv=T, trace="n", col=rev(heat.colors(10)), breaks=seq(0,.5,by=0.05), margins=c(10,10), keysize=0.8, cexRow=0.4) +tmpd = tmpd[rev(tmp.res$rowInd),tmp.res$colInd] +write.table(tmpd, "clustered_matrix.txt", sep="\t", quote=F) +heatmap.2(tmpd, Rowv=F, Colv=F, trace="n", col=rev(heat.colors(10)), breaks=seq(0,.5,by=0.05), margins=c(10,10), keysize=0.8, cexRow=0.4) +dev.off() + + +### Statistical Plots +dd = dist(1-cor((res$phi), method="pearson")) +dend = as.dendrogram(hclust(dd, "ave")) +#plot(dend) + +pdf("bait2bait.pdf") +tmp = res$phi +colnames(tmp) = paste(colnames(res$phi), res$baitCL$Bait, sep="_") + +###dd = cor(tmp[,-26]) ### This line is only for Chris' data (one bait has all zeros in the estimated parameters) +dd = cor(tmp) ### This line is only for Chris' data (one bait has all zeros in the estimated parameters) + +write.table(dd, "bait2bait_matrix.txt", sep="\t", quote=F) +heatmap.2(as.matrix(dd), trace="n", breaks=seq(-1,1,by=0.1), col=(greenred(20)), cexRow=0.7, cexCol=0.7) +dev.off() + +tmp = mcmc[,2] +ymax = max(tmp) +ymin = min(tmp) +pdf("stats.pdf", height=12, width=12) + +plot(mcmc[mcmc[,4]=="G",3], type="s", xlab="Iterations", ylab="Number 
of Clusters", main="") +plot(mcmc[,2], type="l", xlab="Iterations", ylab="Log-Likelihood", main="", ylim=c(ymin,ymax)) + +dev.off() +
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/biclust_param.txt	Fri Jan 29 09:56:02 2016 -0500
@@ -0,0 +1,12 @@
+np 10
+nb 100
+a 1.0
+b 1.0
+lambda 0.0
+nu 25.0
+alpha 1.0
+rho 1.0
+gamma 1.0
+nburn 5000
+niter 10000
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/dotplot.bash Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,252 @@ +#!/bin/bash +#SCRIPT=$(readlink -e $0) +#SCRIPTPATH=`dirname $SCRIPT` +pushd `dirname $0` > /dev/null +SCRIPTPATH=`pwd` +popd > /dev/null + +usage() { printf "Usage: $0 +[-f <saint_file_name.txt>] +[-i <0 for SaintExpress format, 1 for other>] +[-c <clustering to perform. Options: b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering; see options -b and -p)>] +[-n <clustering type to be performed if option -c is set to \"h\">] +[-d <distance metric to use if option -c is set to \"h\">] +[-b <list of bait proteins in display order (see option -c n)>] +[-p <list of prey proteins in display order (see option -c n). Set this to \"all\" if you want to include all preys and cluster them>] +[-s <primary FDR cutoff [0-1, recommended=0.01]>] +[-t <secondary FDR cutoff [must be less than the primary, recommended=0.025]> +[-x <spectral count minimum. Only preys with >= this will be used]> +[-m <maximum spectral count>] +[-N <normalization, 0 for no (default), 1 for yes, 2 for normalization based on significant preys counts (prey FDR <= option -t)>] +[-C <FDR cutoff for normalization if using option -N 2 (deafult is -t)>]\n" +1>&2; exit 1; } + +N=0 +n="ward" +d="canberra" +x=0 +i=0 +while getopts ":f:i:s:t:x:m:c:n:d:b:p:N:C:" o; do + case "${o}" in + f) + f=${OPTARG} + ;; + i) + i=${OPTARG} + ;; + s) + s=${OPTARG} + ;; + t) + t=${OPTARG} + ;; + x) + x=${OPTARG} + ;; + m) + m=${OPTARG} + ;; + c) + c=${OPTARG} + ;; + n) + n=${OPTARG} + ;; + d) + d=${OPTARG} + ;; + b) + b=${OPTARG} + ;; + p) + p=${OPTARG} + ;; + N) + N=${OPTARG} + ;; + C) + C=${OPTARG} + ;; + *) + usage + ;; + esac +done +shift $((OPTIND-1)) + +filename=${f%%.*} +echo "Saint input file = ${f}" +echo "Primary FDR cutoff = ${s}" +echo "Secondary FDR cutoff for dotplot = ${t}" +echo "Minimum spectral count for significant preys = ${x}" +echo "Maximum spectral count for dot plot = ${m}" + +if [ -z "${f}" ] || [ -z "${s}" ] || [ -z "${t}" ] || [ -z "${m}" ] || [ -z "${c}" ]; then + usage +fi + +if [ "${i}" == 1 ]; then + $SCRIPTPATH/SaintConvert.pl -i ${f} + f="mockSaintExpress.txt" +fi + +if [ "${x}" -ge "${m}" ]; then + echo "spectral count minimum (${x}) cannot be greater than or equal to the maximum (${m})" + exit 1; +elif [ "${x}" -lt 0 ]; then + echo "spectral count minimum (${x}) cannot be less than 0. Setting to 0 and continuing" + x=0 +fi + +###Check for normalization + +if [ "${N}" == 1 ]; then + printf "\nNormalization is being performed\n" + $SCRIPTPATH/Normalization.R ${f} + f="norm_saint.txt" +elif [ "${N}" == 2 ]; then + printf "\nNormalization is being performed\n" + if [ -z "${C}" ]; then + C=${t} + fi + $SCRIPTPATH/Normalization_sigpreys.R ${f} ${C} + f="norm_saint.txt" +fi + + +###Check for clustering etc + +if [ "${c}" == "h" ] && [ -z "${n}" ]; then + printf "\nHierarchial clustering was selected (-c = h), but no clustering method (-n) was chosen.\n" + printf "The input parameter -n must be set to one of \"average\", \"centroid\", \"complete\", \"mcquitty\",\n" + printf "\"median\", \"single\" or \"ward\". 
\"ward\" will be selected as default.\n\n" + n="ward" +elif [ "${c}" == "h" ] && [ -n "${n}" ]; then + if [ "${n}" == "average" ] || [ "${n}" == "centroid" ] || [ "${n}" == "complete" ] || [ "${n}" == "mcquitty" ] || [ "${n}" == "median" ] || [ "${n}" == "single" ] || [ "${n}" == "ward" ]; then + printf "\nHierarchical clustering (method = ${n}) will be performed\n\n" + else + printf "\n${n} is not a valid Hierarchical clustering method.\n" + printf "Choose one of \"average\", \"centroid\", \"complete\", \"mcquitty\", \"median\", \"single\" or \"ward\"\n\n" + exit 1 + fi +fi + +p_c=0 +if [ "${c}" == "h" ] && [ -z "${d}" ]; then + printf "\nHierarchial clustering was selected (-c = h), but no distance metric (-d) was chosen.\n" + printf "The input parameter -d must be set to one of \"binary\", \"canberra\", \"euclidean\",\n" + printf "\"manhattan\", \"maximum\" or \"minkowski\". \"canberra\" will be selected as default.\n\n" + d="canberra" +elif [ "${c}" == "h" ] && [ -n "${d}" ]; then + if [ "${d}" == "binary" ] || [ "${d}" == "canberra" ] || [ "${d}" == "euclidean" ] || [ "${d}" == "manhattan" ] || [ "${d}" == "maximum" ] || [ "${d}" == "minkowski" ]; then + printf "\nHierarchical clustering (distance metric = ${d}) will be performed\n\n" + else + printf "\n${d} is not a valid Hierarchical clustering distance metric.\n" + printf "Choose one of \"binary\", \"canberra\", \"euclidean\", \"manhattan\", \"maximum\" or \"minkowski\"\n\n" + exit 1 + fi +fi + +if [ "${c}" == "n" ] && [ -z "${b}" ]; then + printf "\n\"No Clustering\" option was selected (-c = n), but no bait list was included (option -b).\n" + printf "Bait list must be in .txt formart.\n\n" + exit 1 +elif [ "${c}" == "n" ] && [ -z "${p}" ]; then + printf "\n\"No Clustering\" option was selected (-c = n), but no prey list was included (option -p).\n" + printf "Prey list must be in .txt formart.\n\n" + exit 1 +elif [ "${c}" == "n" ] && [ "${p}" == "all" ]; then + printf "\n\"No Clustering\" option was selected (-c = n) for baits, but preys will still be clustered.\n" + printf "using \"ward\" and \"canberra\" as defaults or options as supplied on command line.\n\n" + p="empty" + p_c=1 + n="ward" + d="canberra" +fi + + +###Check number of baits + +bait_n=$(perl $SCRIPTPATH/BaitCheck.pl -i ${f}) +echo "Number of baits = "$bait_n +printf "\n\n" + +if [ "${c}" == "b" ] && [ $bait_n == 2 ]; then + printf "\nWarning only 2 baits are present. 
Biclustering will not performed.\n" + printf "Hierarchical clustering (method = ward) will be performed instead.\n\n" + c="h" + n="ward" +fi + + +###Generate plots + +if [ "${c}" == "b" ]; then + printf "\nBiclustering will be performed\n\n" + $SCRIPTPATH/Step1_data_reformating.R ${f} ${s} ${filename} + $SCRIPTPATH/Step2_data_filtering.R ${filename}_matrix.txt ${x} ${filename} + GSL_RNG_SEED=123 $SCRIPTPATH/Step3_nestedcluster ${filename}.dat $SCRIPTPATH/biclust_param.txt + $SCRIPTPATH/Step4_biclustering.R ${filename}.dat + + $SCRIPTPATH/SOFD.pl -i ${f} -s ${s} -x ${x} + $SCRIPTPATH/R_dotPlot.R ${s} ${t} ${m} + mkdir Output_${filename} + mkdir Output_${filename}/TempData_${filename} + mv bait_lists Output_${filename}/TempData_${filename} + mv Clusters Output_${filename}/TempData_${filename} + mv MCMCparameters Output_${filename}/TempData_${filename} + mv NestedClusters Output_${filename}/TempData_${filename} + mv NestedMu Output_${filename}/TempData_${filename} + mv NestedSigma2 Output_${filename}/TempData_${filename} + mv OPTclusters Output_${filename}/TempData_${filename} + mv ${filename}_matrix.txt Output_${filename}/TempData_${filename} + mv ${filename}.dat Output_${filename}/TempData_${filename} + mv SC_data.txt Output_${filename}/TempData_${filename} + mv FDR_data.txt Output_${filename}/TempData_${filename} + mv clustered_matrix.txt Output_${filename}/TempData_${filename} + mv singletons.txt Output_${filename}/TempData_${filename} + mv bait2bait_matrix.txt Output_${filename}/TempData_${filename} + mv baitClusters Output_${filename}/TempData_${filename} + mv clusteredData Output_${filename}/TempData_${filename} + mv dotplot.pdf Output_${filename} + mv bait2bait.pdf Output_${filename} + mv estimated.pdf Output_${filename} + mv stats.pdf Output_${filename} + cp $SCRIPTPATH/legend.pdf Output_${filename} +elif [ "${c}" == "h" ]; then + + $SCRIPTPATH/SOFD.pl -i ${f} -s ${s} -x ${x} + $SCRIPTPATH/R_dotPlot_hc.R ${s} ${t} ${m} ${n} ${d} $SCRIPTPATH + + mkdir Output_${filename} + mkdir Output_${filename}/TempData_${filename} + mv dotplot.pdf Output_${filename} + mv heatmap_borders.pdf Output_${filename} + mv heatmap_no_borders.pdf Output_${filename} + mv bait2bait.pdf Output_${filename} + mv SC_data.txt Output_${filename}/TempData_${filename} + mv FDR_data.txt Output_${filename}/TempData_${filename} + cp $SCRIPTPATH/legend.pdf Output_${filename} +elif [ "${c}" == "n" ]; then + + $SCRIPTPATH/SOFD.pl -i ${f} -s ${s} -x ${x} + echo "$SCRIPTPATH/R_dotPlot_nc.R ${s} ${t} ${m} ${b} $p_c ${p} ${n} ${d} $SCRIPTPATH" + $SCRIPTPATH/R_dotPlot_nc.R ${s} ${t} ${m} ${b} $p_c ${p} ${n} ${d} $SCRIPTPATH + + mkdir Output_${filename} + mkdir Output_${filename}/TempData_${filename} + mv dotplot.pdf Output_${filename} + mv heatmap_borders.pdf Output_${filename} + mv heatmap_no_borders.pdf Output_${filename} + mv SC_data.txt Output_${filename}/TempData_${filename} + mv FDR_data.txt Output_${filename}/TempData_${filename} + cp $SCRIPTPATH/legend.pdf Output_${filename} +else + printf -- "-c must be one of [b, h, n]: b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering>\n" + exit 1; +fi + +if [ "${N}" == "1" ] || [ "${N}" == "2" ]; then + mv norm_saint.txt Output_${filename}/TempData_${filename} +fi +
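dotplot.bash is the entry point that chains the scripts above. A representative end-to-end invocation using the recommended cutoffs; the input file name and the -m maximum spectral count are placeholder values:

    # hierarchical clustering of a SaintExpress file; results land in Output_saint_output/
    bash Dotplot_Release/dotplot.bash -f saint_output.txt -i 0 -c h -n ward -d canberra \
        -s 0.01 -t 0.025 -x 0 -m 50 -N 0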
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dotplot_Release/pheatmap_j.R Fri Jan 29 09:56:02 2016 -0500 @@ -0,0 +1,719 @@ +lo = function(rown, coln, nrow, ncol, cellheight = NA, cellwidth = NA, treeheight_col, treeheight_row, legend, annotation, annotation_colors, annotation_legend, main, fontsize, fontsize_row, fontsize_col, ...){ + # Get height of colnames and length of rownames + if(!is.null(coln[1])){ + longest_coln = which.max(strwidth(coln, units = 'in')) + gp = list(fontsize = fontsize_col, ...) + coln_height = unit(1, "grobheight", textGrob(coln[longest_coln], rot = 90, gp = do.call(gpar, gp))) + unit(5, "bigpts") + } + else{ + coln_height = unit(5, "bigpts") + } + + if(!is.null(rown[1])){ + longest_rown = which.max(strwidth(rown, units = 'in')) + gp = list(fontsize = fontsize_row, ...) + rown_width = unit(1, "grobwidth", textGrob(rown[longest_rown], gp = do.call(gpar, gp))) + unit(10, "bigpts") + } + else{ + rown_width = unit(5, "bigpts") + } + + gp = list(fontsize = fontsize, ...) + # Legend position + if(!is.na(legend[1])){ + longest_break = which.max(nchar(names(legend))) + longest_break = unit(1.1, "grobwidth", textGrob(as.character(names(legend))[longest_break], gp = do.call(gpar, gp))) + title_length = unit(1.1, "grobwidth", textGrob("Scale", gp = gpar(fontface = "bold", ...))) + legend_width = unit(12, "bigpts") + longest_break * 1.2 + legend_width = max(title_length, legend_width) + } + else{ + legend_width = unit(0, "bigpts") + } + + # Set main title height + if(is.na(main)){ + main_height = unit(0, "npc") + } + else{ + main_height = unit(1.5, "grobheight", textGrob(main, gp = gpar(fontsize = 1.3 * fontsize, ...))) + } + + # Column annotations + if(!is.na(annotation[[1]][1])){ + # Column annotation height + annot_height = unit(ncol(annotation) * (8 + 2) + 2, "bigpts") + # Width of the correponding legend + longest_ann = which.max(nchar(as.matrix(annotation))) + annot_legend_width = unit(1.2, "grobwidth", textGrob(as.matrix(annotation)[longest_ann], gp = gpar(...))) + unit(12, "bigpts") + if(!annotation_legend){ + annot_legend_width = unit(0, "npc") + } + } + else{ + annot_height = unit(0, "bigpts") + annot_legend_width = unit(0, "bigpts") + } + + # Tree height + treeheight_col = unit(treeheight_col, "bigpts") + unit(5, "bigpts") + treeheight_row = unit(treeheight_row, "bigpts") + unit(5, "bigpts") + + # Set cell sizes + if(is.na(cellwidth)){ + matwidth = unit(1, "npc") - rown_width - legend_width - treeheight_row - annot_legend_width + } + else{ + matwidth = unit(cellwidth * ncol, "bigpts") + } + + if(is.na(cellheight)){ + matheight = unit(1, "npc") - main_height - coln_height - treeheight_col - annot_height + } + else{ + matheight = unit(cellheight * nrow, "bigpts") + } + + + # Produce layout() + pushViewport(viewport(layout = grid.layout(nrow = 5, ncol = 5, widths = unit.c(treeheight_row, matwidth, rown_width, legend_width, annot_legend_width), heights = unit.c(main_height, treeheight_col, annot_height, matheight, coln_height)), gp = do.call(gpar, gp))) + + # Get cell dimensions + pushViewport(vplayout(4, 2)) + cellwidth = convertWidth(unit(0:1, "npc"), "bigpts", valueOnly = T)[2] / ncol + cellheight = convertHeight(unit(0:1, "npc"), "bigpts", valueOnly = T)[2] / nrow + upViewport() + + # Return minimal cell dimension in bigpts to decide if borders are drawn + mindim = min(cellwidth, cellheight) + return(mindim) +} + +draw_dendrogram = function(hc, horizontal = T){ + h = hc$height / max(hc$height) / 1.05 + m = hc$merge + o = hc$order + n = length(o) + + m[m > 0] 
= n + m[m > 0] + m[m < 0] = abs(m[m < 0]) + + dist = matrix(0, nrow = 2 * n - 1, ncol = 2, dimnames = list(NULL, c("x", "y"))) + dist[1:n, 1] = 1 / n / 2 + (1 / n) * (match(1:n, o) - 1) + + for(i in 1:nrow(m)){ + dist[n + i, 1] = (dist[m[i, 1], 1] + dist[m[i, 2], 1]) / 2 + dist[n + i, 2] = h[i] + } + + draw_connection = function(x1, x2, y1, y2, y){ + grid.lines(x = c(x1, x1), y = c(y1, y)) + grid.lines(x = c(x2, x2), y = c(y2, y)) + grid.lines(x = c(x1, x2), y = c(y, y)) + } + + if(horizontal){ + for(i in 1:nrow(m)){ + draw_connection(dist[m[i, 1], 1], dist[m[i, 2], 1], dist[m[i, 1], 2], dist[m[i, 2], 2], h[i]) + } + } + + else{ + gr = rectGrob() + pushViewport(viewport(height = unit(1, "grobwidth", gr), width = unit(1, "grobheight", gr), angle = 90)) + dist[, 1] = 1 - dist[, 1] + for(i in 1:nrow(m)){ + draw_connection(dist[m[i, 1], 1], dist[m[i, 2], 1], dist[m[i, 1], 2], dist[m[i, 2], 2], h[i]) + } + upViewport() + } +} + +draw_matrix = function(matrix, border_color, border_width, fmat, fontsize_number){ + n = nrow(matrix) + m = ncol(matrix) + x = (1:m)/m - 1/2/m + y = 1 - ((1:n)/n - 1/2/n) + for(i in 1:m){ + grid.rect(x = x[i], y = y[1:n], width = 1/m, height = 1/n, gp = gpar(fill = matrix[,i], col = border_color, lwd = border_width)) + if(attr(fmat, "draw")){ + grid.text(x = x[i], y = y[1:n], label = fmat[, i], gp = gpar(col = "grey30", fontsize = fontsize_number)) + } + } +} + +draw_colnames = function(coln, ...){ + m = length(coln) + x = (1:m)/m - 1/2/m + grid.text(coln, x = x, y = unit(0.96, "npc"), just="right", rot = 90, gp = gpar(...)) +} + +draw_rownames = function(rown, ...){ + n = length(rown) + y = 1 - ((1:n)/n - 1/2/n) + grid.text(rown, x = unit(0.04, "npc"), y = y, vjust = 0.5, hjust = 0, gp = gpar(...)) +} + +draw_legend = function(color, breaks, legend, ...){ + height = min(unit(1, "npc"), unit(150, "bigpts")) + pushViewport(viewport(x = 0, y = unit(1, "npc"), just = c(0, 1), height = height)) + legend_pos = (legend - min(breaks)) / (max(breaks) - min(breaks)) + breaks = (breaks - min(breaks)) / (max(breaks) - min(breaks)) + h = breaks[-1] - breaks[-length(breaks)] + grid.rect(x = 0, y = breaks[-length(breaks)], width = unit(10, "bigpts"), height = h, hjust = 0, vjust = 0, gp = gpar(fill = color, col = "#FFFFFF00")) + grid.text(names(legend), x = unit(12, "bigpts"), y = legend_pos, hjust = 0, gp = gpar(...)) + upViewport() +} + +convert_annotations = function(annotation, annotation_colors){ + new = annotation + for(i in 1:ncol(annotation)){ + a = annotation[, i] + b = annotation_colors[[colnames(annotation)[i]]] + if(is.character(a) | is.factor(a)){ + a = as.character(a) + if(length(setdiff(a, names(b))) > 0){ + stop(sprintf("Factor levels on variable %s do not match with annotation_colors", colnames(annotation)[i])) + } + new[, i] = b[a] + } + else{ + a = cut(a, breaks = 100) + new[, i] = colorRampPalette(b)(100)[a] + } + } + return(as.matrix(new)) +} + +draw_annotations = function(converted_annotations, border_color, border_width){ + n = ncol(converted_annotations) + m = nrow(converted_annotations) + x = (1:m)/m - 1/2/m + y = cumsum(rep(8, n)) - 4 + cumsum(rep(2, n)) + for(i in 1:m){ + grid.rect(x = x[i], unit(y[1:n], "bigpts"), width = 1/m, height = unit(8, "bigpts"), gp = gpar(fill = converted_annotations[i, ], col = border_color, lwd = border_width)) + } +} + +draw_annotation_legend = function(annotation, annotation_colors, border_color, border_width, ...){ + y = unit(1, "npc") + text_height = unit(1, "grobheight", textGrob("FGH", gp = gpar(...))) + for(i in 
names(annotation_colors)){ + grid.text(i, x = 0, y = y, vjust = 1, hjust = 0, gp = gpar(fontface = "bold", ...)) + y = y - 1.5 * text_height + if(is.character(annotation[, i]) | is.factor(annotation[, i])){ + for(j in 1:length(annotation_colors[[i]])){ + grid.rect(x = unit(0, "npc"), y = y, hjust = 0, vjust = 1, height = text_height, width = text_height, gp = gpar(col = border_color, lwd = border_width, fill = annotation_colors[[i]][j])) + grid.text(names(annotation_colors[[i]])[j], x = text_height * 1.3, y = y, hjust = 0, vjust = 1, gp = gpar(...)) + y = y - 1.5 * text_height + } + } + else{ + yy = y - 4 * text_height + seq(0, 1, 0.02) * 4 * text_height + h = 4 * text_height * 0.02 + grid.rect(x = unit(0, "npc"), y = yy, hjust = 0, vjust = 1, height = h, width = text_height, gp = gpar(col = "#FFFFFF00", fill = colorRampPalette(annotation_colors[[i]])(50))) + txt = rev(range(grid.pretty(range(annotation[, i], na.rm = TRUE)))) + yy = y - c(0, 3) * text_height + grid.text(txt, x = text_height * 1.3, y = yy, hjust = 0, vjust = 1, gp = gpar(...)) + y = y - 4.5 * text_height + } + y = y - 1.5 * text_height + } +} + +draw_main = function(text, ...){ + grid.text(text, gp = gpar(fontface = "bold", ...)) +} + +vplayout = function(x, y){ + return(viewport(layout.pos.row = x, layout.pos.col = y)) +} + +heatmap_motor = function(matrix, border_color, border_width, cellwidth, cellheight, tree_col, tree_row, treeheight_col, treeheight_row, filename, width, height, breaks, color, legend, annotation, annotation_colors, annotation_legend, main, fontsize, fontsize_row, fontsize_col, fmat, fontsize_number, ...){ + grid.newpage() + + # Set layout + mindim = lo(coln = colnames(matrix), rown = rownames(matrix), nrow = nrow(matrix), ncol = ncol(matrix), cellwidth = cellwidth, cellheight = cellheight, treeheight_col = treeheight_col, treeheight_row = treeheight_row, legend = legend, annotation = annotation, annotation_colors = annotation_colors, annotation_legend = annotation_legend, main = main, fontsize = fontsize, fontsize_row = fontsize_row, fontsize_col = fontsize_col, ...) + + if(!is.na(filename)){ + pushViewport(vplayout(1:5, 1:5)) + + if(is.na(height)){ + height = convertHeight(unit(0:1, "npc"), "inches", valueOnly = T)[2] + } + if(is.na(width)){ + width = convertWidth(unit(0:1, "npc"), "inches", valueOnly = T)[2] + } + + # Get file type + r = regexpr("\\.[a-zA-Z]*$", filename) + if(r == -1) stop("Improper filename") + ending = substr(filename, r + 1, r + attr(r, "match.length")) + + f = switch(ending, + pdf = function(x, ...) pdf(x, ...), + png = function(x, ...) png(x, units = "in", res = 300, ...), + jpeg = function(x, ...) jpeg(x, units = "in", res = 300, ...), + jpg = function(x, ...) jpeg(x, units = "in", res = 300, ...), + tiff = function(x, ...) tiff(x, units = "in", res = 300, compression = "lzw", ...), + bmp = function(x, ...) 
bmp(x, units = "in", res = 300, ...), + stop("File type should be: pdf, png, bmp, jpg, tiff") + ) + + # print(sprintf("height:%f width:%f", height, width)) + f(filename, height = height, width = width) + heatmap_motor(matrix, cellwidth = cellwidth, cellheight = cellheight, border_color = border_color, border_width = border_width, tree_col = tree_col, tree_row = tree_row, treeheight_col = treeheight_col, treeheight_row = treeheight_row, breaks = breaks, color = color, legend = legend, annotation = annotation, annotation_colors = annotation_colors, annotation_legend = annotation_legend, filename = NA, main = main, fontsize = fontsize, fontsize_row = fontsize_row, fontsize_col = fontsize_col, fmat = fmat, fontsize_number = fontsize_number, ...) + dev.off() + upViewport() + return() + } + + # Omit border color if cell size is too small + if(mindim < 3) border_color = NA + + # Draw title + if(!is.na(main)){ + pushViewport(vplayout(1, 2)) + draw_main(main, fontsize = 1.3 * fontsize, ...) + upViewport() + } + + # Draw tree for the columns + if(!is.na(tree_col[[1]][1]) & treeheight_col != 0){ + pushViewport(vplayout(2, 2)) + draw_dendrogram(tree_col, horizontal = T) + upViewport() + } + + # Draw tree for the rows + if(!is.na(tree_row[[1]][1]) & treeheight_row != 0){ + pushViewport(vplayout(4, 1)) + draw_dendrogram(tree_row, horizontal = F) + upViewport() + } + + # Draw matrix + pushViewport(vplayout(4, 2)) + draw_matrix(matrix, border_color, border_width, fmat, fontsize_number) + upViewport() + + # Draw colnames + if(length(colnames(matrix)) != 0){ + pushViewport(vplayout(5, 2)) + pars = list(colnames(matrix), fontsize = fontsize_col, ...) + do.call(draw_colnames, pars) + upViewport() + } + + # Draw rownames + if(length(rownames(matrix)) != 0){ + pushViewport(vplayout(4, 3)) + pars = list(rownames(matrix), fontsize = fontsize_row, ...) + do.call(draw_rownames, pars) + upViewport() + } + + # Draw annotation tracks + if(!is.na(annotation[[1]][1])){ + pushViewport(vplayout(3, 2)) + converted_annotation = convert_annotations(annotation, annotation_colors) + draw_annotations(converted_annotation, border_color, border_width) + upViewport() + } + + # Draw annotation legend + if(!is.na(annotation[[1]][1]) & annotation_legend){ + if(length(rownames(matrix)) != 0){ + pushViewport(vplayout(4:5, 5)) + } + else{ + pushViewport(vplayout(3:5, 5)) + } + draw_annotation_legend(annotation, annotation_colors, border_color, border_width, fontsize = fontsize, ...) + upViewport() + } + + # Draw legend + if(!is.na(legend[1])){ + length(colnames(matrix)) + if(length(rownames(matrix)) != 0){ + pushViewport(vplayout(4:5, 4)) + } + else{ + pushViewport(vplayout(3:5, 4)) + } + draw_legend(color, breaks, legend, fontsize = fontsize, ...) 
+ upViewport() + } + + +} + +generate_breaks = function(x, n, center = F){ + if(center){ + m = max(abs(c(min(x, na.rm = T), max(x, na.rm = T)))) + res = seq(-m, m, length.out = n + 1) + } + else{ + res = seq(min(x, na.rm = T), max(x, na.rm = T), length.out = n + 1) + } + + return(res) +} + +scale_vec_colours = function(x, col = rainbow(10), breaks = NA){ + return(col[as.numeric(cut(x, breaks = breaks, include.lowest = T))]) +} + +scale_colours = function(mat, col = rainbow(10), breaks = NA){ + mat = as.matrix(mat) + return(matrix(scale_vec_colours(as.vector(mat), col = col, breaks = breaks), nrow(mat), ncol(mat), dimnames = list(rownames(mat), colnames(mat)))) +} + +cluster_mat = function(mat, distance, method){ + if(!(method %in% c("ward", "single", "complete", "average", "mcquitty", "median", "centroid"))){ + stop("clustering method has to one form the list: 'ward', 'single', 'complete', 'average', 'mcquitty', 'median' or 'centroid'.") + } + if(!(distance[1] %in% c("correlation", "euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")) & class(distance) != "dist"){ + print(!(distance[1] %in% c("correlation", "euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")) | class(distance) != "dist") + stop("distance has to be a dissimilarity structure as produced by dist or one measure form the list: 'correlation', 'euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'minkowski'") + } + if(distance[1] == "correlation"){ + d = as.dist(1 - cor(t(mat))) + } + else{ + if(class(distance) == "dist"){ + d = distance + } + else{ + d = dist(mat, method = distance) + } + } + + return(hclust(d, method = method)) +} + +scale_rows = function(x){ + m = apply(x, 1, mean, na.rm = T) + s = apply(x, 1, sd, na.rm = T) + return((x - m) / s) +} + +scale_mat = function(mat, scale){ + if(!(scale %in% c("none", "row", "column"))){ + stop("scale argument shoud take values: 'none', 'row' or 'column'") + } + mat = switch(scale, none = mat, row = scale_rows(mat), column = t(scale_rows(t(mat)))) + return(mat) +} + +generate_annotation_colours = function(annotation, annotation_colors, drop){ + if(is.na(annotation_colors)[[1]][1]){ + annotation_colors = list() + } + count = 0 + for(i in 1:ncol(annotation)){ + if(is.character(annotation[, i]) | is.factor(annotation[, i])){ + if (is.factor(annotation[, i]) & !drop){ + count = count + length(levels(annotation[, i])) + } + else{ + count = count + length(unique(annotation[, i])) + } + } + } + + factor_colors = hsv((seq(0, 1, length.out = count + 1)[-1] + + 0.2)%%1, 0.7, 0.95) + + set.seed(3453) + + for(i in 1:ncol(annotation)){ + if(!(colnames(annotation)[i] %in% names(annotation_colors))){ + if(is.character(annotation[, i]) | is.factor(annotation[, i])){ + n = length(unique(annotation[, i])) + if (is.factor(annotation[, i]) & !drop){ + n = length(levels(annotation[, i])) + } + ind = sample(1:length(factor_colors), n) + annotation_colors[[colnames(annotation)[i]]] = factor_colors[ind] + l = levels(as.factor(annotation[, i])) + l = l[l %in% unique(annotation[, i])] + if (is.factor(annotation[, i]) & !drop){ + l = levels(annotation[, i]) + } + names(annotation_colors[[colnames(annotation)[i]]]) = l + factor_colors = factor_colors[-ind] + } + else{ + r = runif(1) + annotation_colors[[colnames(annotation)[i]]] = hsv(r, c(0.1, 1), 1) + } + } + } + return(annotation_colors) +} + +kmeans_pheatmap = function(mat, k = min(nrow(mat), 150), sd_limit = NA, ...){ + # Filter data + if(!is.na(sd_limit)){ + s = apply(mat, 1, sd) + mat = mat[s > sd_limit, ] + 
} + + # Cluster data + set.seed(1245678) + km = kmeans(mat, k, iter.max = 100) + mat2 = km$centers + + # Compose rownames + t = table(km$cluster) + rownames(mat2) = sprintf("cl%s_size_%d", names(t), t) + + # Draw heatmap + pheatmap(mat2, ...) +} + +#' A function to draw clustered heatmaps. +#' +#' A function to draw clustered heatmaps where one has better control over some graphical +#' parameters such as cell size, etc. +#' +#' The function also allows to aggregate the rows using kmeans clustering. This is +#' advisable if number of rows is so big that R cannot handle their hierarchical +#' clustering anymore, roughly more than 1000. Instead of showing all the rows +#' separately one can cluster the rows in advance and show only the cluster centers. +#' The number of clusters can be tuned with parameter kmeans_k. +#' +#' @param mat numeric matrix of the values to be plotted. +#' @param color vector of colors used in heatmap. +#' @param kmeans_k the number of kmeans clusters to make, if we want to agggregate the +#' rows before drawing heatmap. If NA then the rows are not aggregated. +#' @param breaks a sequence of numbers that covers the range of values in mat and is one +#' element longer than color vector. Used for mapping values to colors. Useful, if needed +#' to map certain values to certain colors, to certain values. If value is NA then the +#' breaks are calculated automatically. +#' @param border_color color of cell borders on heatmap, use NA if no border should be +#' drawn. +#' @param cellwidth individual cell width in points. If left as NA, then the values +#' depend on the size of plotting window. +#' @param cellheight individual cell height in points. If left as NA, +#' then the values depend on the size of plotting window. +#' @param scale character indicating if the values should be centered and scaled in +#' either the row direction or the column direction, or none. Corresponding values are +#' \code{"row"}, \code{"column"} and \code{"none"} +#' @param cluster_rows boolean values determining if rows should be clustered, +#' @param cluster_cols boolean values determining if columns should be clustered. +#' @param clustering_distance_rows distance measure used in clustering rows. Possible +#' values are \code{"correlation"} for Pearson correlation and all the distances +#' supported by \code{\link{dist}}, such as \code{"euclidean"}, etc. If the value is none +#' of the above it is assumed that a distance matrix is provided. +#' @param clustering_distance_cols distance measure used in clustering columns. Possible +#' values the same as for clustering_distance_rows. +#' @param clustering_method clustering method used. Accepts the same values as +#' \code{\link{hclust}}. +#' @param treeheight_row the height of a tree for rows, if these are clustered. +#' Default value 50 points. +#' @param treeheight_col the height of a tree for columns, if these are clustered. +#' Default value 50 points. +#' @param legend logical to determine if legend should be drawn or not. +#' @param legend_breaks vector of breakpoints for the legend. +#' @param legend_labels vector of labels for the \code{legend_breaks}. +#' @param annotation data frame that specifies the annotations shown on top of the +#' columns. Each row defines the features for a specific column. The columns in the data +#' and rows in the annotation are matched using corresponding row and column names. Note +#' that color schemes takes into account if variable is continuous or discrete. 
+#' @param annotation_colors list for specifying annotation track colors manually. It is 
+#' possible to define the colors for only some of the features. Check examples for 
+#' details.
+#' @param annotation_legend boolean value showing if the legend for annotation tracks 
+#' should be drawn.
+#' @param drop_levels logical to determine if unused levels are also shown in the legend.
+#' @param show_rownames boolean specifying if row names should be shown.
+#' @param show_colnames boolean specifying if column names should be shown.
+#' @param main the title of the plot.
+#' @param fontsize base fontsize for the plot.
+#' @param fontsize_row fontsize for rownames (Default: fontsize).
+#' @param fontsize_col fontsize for colnames (Default: fontsize).
+#' @param display_numbers logical determining if the numeric values are also printed to 
+#' the cells.
+#' @param number_format format strings (C printf style) of the numbers shown in cells. 
+#' For example "\code{\%.2f}" shows 2 decimal places and "\code{\%.1e}" shows exponential 
+#' notation (see more in \code{\link{sprintf}}).
+#' @param fontsize_number fontsize of the numbers displayed in cells.
+#' @param filename file path where to save the picture. The filetype is decided by 
+#' the extension in the path. Currently the following formats are supported: png, pdf, tiff, 
+#' bmp, jpeg. Even if the plot does not fit into the plotting window, the file size is 
+#' calculated so that the plot would fit there, unless specified otherwise.
+#' @param width manual option for determining the output file width in inches.
+#' @param height manual option for determining the output file height in inches.
+#' @param \dots graphical parameters for the text used in the plot. Parameters passed to 
+#' \code{\link{grid.text}}, see \code{\link{gpar}}. 
+#' +#' @return +#' Invisibly a list of components +#' \itemize{ +#' \item \code{tree_row} the clustering of rows as \code{\link{hclust}} object +#' \item \code{tree_col} the clustering of columns as \code{\link{hclust}} object +#' \item \code{kmeans} the kmeans clustering of rows if parameter \code{kmeans_k} was +#' specified +#' } +#' +#' @author Raivo Kolde <rkolde@@gmail.com> +#' @examples +#' # Generate some data +#' test = matrix(rnorm(200), 20, 10) +#' test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3 +#' test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2 +#' test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4 +#' colnames(test) = paste("Test", 1:10, sep = "") +#' rownames(test) = paste("Gene", 1:20, sep = "") +#' +#' # Draw heatmaps +#' pheatmap(test) +#' pheatmap(test, kmeans_k = 2) +#' pheatmap(test, scale = "row", clustering_distance_rows = "correlation") +#' pheatmap(test, color = colorRampPalette(c("navy", "white", "firebrick3"))(50)) +#' pheatmap(test, cluster_row = FALSE) +#' pheatmap(test, legend = FALSE) +#' pheatmap(test, display_numbers = TRUE) +#' pheatmap(test, display_numbers = TRUE, number_format = "%.1e") +#' pheatmap(test, cluster_row = FALSE, legend_breaks = -1:4, legend_labels = c("0", +#' "1e-4", "1e-3", "1e-2", "1e-1", "1")) +#' pheatmap(test, cellwidth = 15, cellheight = 12, main = "Example heatmap") +#' pheatmap(test, cellwidth = 15, cellheight = 12, fontsize = 8, filename = "test.pdf") +#' +#' +#' # Generate column annotations +#' annotation = data.frame(Var1 = factor(1:10 %% 2 == 0, +#' labels = c("Class1", "Class2")), Var2 = 1:10) +#' annotation$Var1 = factor(annotation$Var1, levels = c("Class1", "Class2", "Class3")) +#' rownames(annotation) = paste("Test", 1:10, sep = "") +#' +#' pheatmap(test, annotation = annotation) +#' pheatmap(test, annotation = annotation, annotation_legend = FALSE) +#' pheatmap(test, annotation = annotation, annotation_legend = FALSE, drop_levels = FALSE) +#' +#' # Specify colors +#' Var1 = c("navy", "darkgreen") +#' names(Var1) = c("Class1", "Class2") +#' Var2 = c("lightgreen", "navy") +#' +#' ann_colors = list(Var1 = Var1, Var2 = Var2) +#' +#' pheatmap(test, annotation = annotation, annotation_colors = ann_colors, main = "Example") +#' +#' # Specifying clustering from distance matrix +#' drows = dist(test, method = "minkowski") +#' dcols = dist(t(test), method = "minkowski") +#' pheatmap(test, clustering_distance_rows = drows, clustering_distance_cols = dcols) +#' +#' @export +pheatmap_j = function(mat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), kmeans_k = NA, breaks = NA, border_color = "grey60", border_width = 1, cellwidth = NA, cellheight = NA, scale = "none", cluster_rows = TRUE, cluster_cols = TRUE, clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete", treeheight_row = ifelse(cluster_rows, 50, 0), treeheight_col = ifelse(cluster_cols, 50, 0), legend = TRUE, legend_breaks = NA, legend_labels = NA, annotation = NA, annotation_colors = NA, annotation_legend = TRUE, drop_levels = TRUE, show_rownames = T, show_colnames = T, main = NA, fontsize = 10, fontsize_row = fontsize, fontsize_col = fontsize, display_numbers = F, number_format = "%.2f", fontsize_number = 0.8 * fontsize, filename = NA, width = NA, height = NA, ...){ + + # Preprocess matrix + mat = as.matrix(mat) + if(scale != "none"){ + mat = scale_mat(mat, scale) + if(is.na(breaks)){ + breaks = generate_breaks(mat, length(color), center = T) + } + } + + + # Kmeans 
+ if(!is.na(kmeans_k)){ + # Cluster data + km = kmeans(mat, kmeans_k, iter.max = 100) + mat = km$centers + + # Compose rownames + t = table(km$cluster) + rownames(mat) = sprintf("cl%s_size_%d", names(t), t) + } + else{ + km = NA + } + + # Do clustering + if(cluster_rows){ + tree_row = cluster_mat(mat, distance = clustering_distance_rows, method = clustering_method) + mat = mat[tree_row$order, , drop = FALSE] + } + else{ + tree_row = NA + treeheight_row = 0 + } + + if(cluster_cols){ + tree_col = cluster_mat(t(mat), distance = clustering_distance_cols, method = clustering_method) + mat = mat[, tree_col$order, drop = FALSE] + } + else{ + tree_col = NA + treeheight_col = 0 + } + + # Format numbers to be displayed in cells + if(display_numbers){ + fmat = matrix(sprintf(number_format, mat), nrow = nrow(mat), ncol = ncol(mat)) + attr(fmat, "draw") = TRUE + } + else{ + fmat = matrix(NA, nrow = nrow(mat), ncol = ncol(mat)) + attr(fmat, "draw") = FALSE + } + + + # Colors and scales + if(!is.na(legend_breaks[1]) & !is.na(legend_labels[1])){ + if(length(legend_breaks) != length(legend_labels)){ + stop("Lengths of legend_breaks and legend_labels must be the same") + } + } + + + if(is.na(breaks[1])){ + breaks = generate_breaks(as.vector(mat), length(color)) + } + if (legend & is.na(legend_breaks[1])) { + legend = grid.pretty(range(as.vector(breaks))) + names(legend) = legend + } + else if(legend & !is.na(legend_breaks[1])){ + legend = legend_breaks[legend_breaks >= min(breaks) & legend_breaks <= max(breaks)] + + if(!is.na(legend_labels[1])){ + legend_labels = legend_labels[legend_breaks >= min(breaks) & legend_breaks <= max(breaks)] + names(legend) = legend_labels + } + else{ + names(legend) = legend + } + } + else { + legend = NA + } + mat = scale_colours(mat, col = color, breaks = breaks) + + # Preparing annotation colors + if(!is.na(annotation[[1]][1])){ + annotation = annotation[colnames(mat), , drop = F] + annotation_colors = generate_annotation_colours(annotation, annotation_colors, drop = drop_levels) + } + + if(!show_rownames){ + rownames(mat) = NULL + } + + if(!show_colnames){ + colnames(mat) = NULL + } + + # Draw heatmap + heatmap_motor(mat, border_color = border_color, border_width = border_width, cellwidth = cellwidth, cellheight = cellheight, treeheight_col = treeheight_col, treeheight_row = treeheight_row, tree_col = tree_col, tree_row = tree_row, filename = filename, width = width, height = height, breaks = breaks, color = color, legend = legend, annotation = annotation, annotation_colors = annotation_colors, annotation_legend = annotation_legend, main = main, fontsize = fontsize, fontsize_row = fontsize_row, fontsize_col = fontsize_col, fmat = fmat, fontsize_number = fontsize_number, ...) + + invisible(list(tree_row = tree_row, tree_col = tree_col, kmeans = km)) +} + +
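The chunk above completes the pheatmap_j definition. For orientation only (not part of the changeset), here is a minimal usage sketch. It assumes the full Dotplot_Release/pheatmap_j.R has been sourced, so that heatmap_motor and the other helpers defined earlier in the file are available, and that the grid and RColorBrewer packages are installed. The border_width argument appears to be the local addition relative to stock pheatmap and, as the heatmap_motor call above shows, is simply forwarded on to control the cell border line width; the cell sizes, toy data, and output filename below are illustrative assumptions.

library(grid)          # grid.pretty() and grid.text() are used by the helper functions
library(RColorBrewer)  # brewer.pal() supplies the default palette
source("Dotplot_Release/pheatmap_j.R")

# Toy spectral-count-style matrix: 15 preys (rows) by 4 baits (columns)
set.seed(42)
m = matrix(rpois(60, lambda = 10), nrow = 15, ncol = 4,
           dimnames = list(paste0("Prey", 1:15), paste0("Bait", 1:4)))

# Illustrative call: thinner cell borders via the added border_width argument
pheatmap_j(m,
           cellwidth = 14, cellheight = 10,
           border_color = "grey60", border_width = 0.5,
           clustering_distance_rows = "correlation",
           filename = "example_heatmap.pdf")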