Mercurial > repos > jeremyjliu > region_motif_enrichment
changeset 0:5c044273554d draft
initial commit
author | jeremyjliu |
---|---|
date | Tue, 05 Aug 2014 13:56:22 -0400 |
parents | |
children | 1de6f7f3d513 |
files | region_motif_compare.r region_motif_compare.xml region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData region_motif_db/pwms/mm9.pwms.from.seq.RData region_motif_db/pwms/pouya.pwms.from.seq.RData region_motif_intersect.r region_motif_intersect.xml region_motif_lib/plotting.r region_motif_lib/regions.cpp region_motif_lib/regions.o region_motif_lib/regions.r region_motif_lib/regions.so |
diffstat | 12 files changed, 1595 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_compare.r Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,189 @@ +# Name: region_motif_compare.r +# Description: Reads in two count files and determines enriched and depleted +# motifs (or any location based feature) based on poisson tests and gc +# corrections. All enrichment ratios relative to overall count / gc ratios. +# Author: Jeremy liu +# Email: jeremy.liu@yale.edu +# Date: 14/07/03 +# Note: This script is meant to be invoked with the following command +# R --slave --vanilla -f ./region_motif_compare.r --args <workingdir> <db> <intab1> <intab2> +# <enriched_tab> <depleted_tab> <plots_png> +# <workingdir> is working directory of galaxy installation +# <db> types: "t" test, "p" pouya, "j" jaspar jolma, "m" mouse, "c" combined +# Dependencies: plotting.r + +# Auxiliary function to concatenate multiple strings +concat <- function(...) { + input_list <- list(...) + return(paste(input_list, sep="", collapse="")) +} + +# Supress all warning messages to prevent Galaxy treating warnings as errors +options(warn=-1) + +# Set common and data directories +args <- commandArgs() +workingDir = args[7] +commonDir = concat(workingDir, "/tools/my_tools") +dbCode = args[8] +# dbCode "c" implemented when pwmFile is loaded +if (dbCode == "t" | dbCode == "p") { + pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData") +} else if (dbCode == "j") { + pwmFile = concat(commonDir, "/region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData") +} else if (dbCode == "m") { + pwmFile = concat(commonDir, "/region_motif_db/pwms/mm9.pwms.from.seq.RData") +} else if (dbCode == "c") { # rest of dbCode "c" implemeted when pwmFile loaded + pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData") + pwmFile2 = concat(commonDir, "/region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData") +} else { + pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData") +} + +# Set input and reference files +inTab1 = args[9] +inTab2 = args[10] +enrichTab = args[11] +depleteTab = args[12] +plotsPng = args[13] + +# Load dependencies +source(concat(commonDir, "/region_motif_lib/plotting.r")) + +# Auxiliary function to read in tab file and prepare the data +read_tsv <- function(file) { + data = read.table(file, sep="\t", stringsAsFactors=FALSE) + names(data)[names(data) == "V1"] = "motif" + names(data)[names(data) == "V2"] = "counts" + return(data) +} + +startTime = Sys.time() +cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n") + +# Loading motif position weight matrix (pwm) file and input tab file +#cat("Loading and reading input region motif count files...\n") +load(pwmFile) # pwms data structure +if (dbCode == "c") { # Remaining implementation of dbCode "c" combined + temp = pwms + load(pwmFile2) + pwms = append(temp, pwms) +} +region1DF = read_tsv(inTab1) +region2DF = read_tsv(inTab2) +region1Counts = region1DF$counts +region2Counts = region2DF$counts +names(region1Counts) = region1DF$motif +names(region2Counts) = region2DF$motif + +# Processing count vectors to account for missing 0 count motifs, then sorting +#cat("Performing 0 count correction and sorting...\n") +allNames = union(names(region1Counts), names(region2Counts)) +region1Diff = setdiff(allNames, names(region1Counts)) +region2Diff = setdiff(allNames, names(region2Counts)) +addCounts1 = rep(0, length(region1Diff)) +addCounts2 = rep(0, length(region2Diff)) +names(addCounts1) = region1Diff +names(addCounts2) = region2Diff +newCounts1 = append(region1Counts, addCounts1) +newCounts2 = append(region2Counts, addCounts2) +region1Counts = newCounts1[sort.int(names(newCounts1), index.return=TRUE)$ix] +region2Counts = newCounts2[sort.int(names(newCounts2), index.return=TRUE)$ix] + +# Generate gc content matrix +gc = sapply(pwms, function(i) mean(i[2:3,3:18])) + +# Apply poisson test, calculate p and q values, and filter significant results +#cat("Applying poisson test...\n") +rValue = sum(region2Counts) / sum(region1Counts) +pValue = sapply(seq(along=region1Counts), function(i) { + poisson.test(c(region1Counts[i], region2Counts[i]), r=1/rValue)$p.value +}) +qValue = p.adjust(pValue, "fdr") +indices = which(qValue<0.1 & abs(log2(region1Counts/region2Counts/rValue))>log2(1.5)) + +# Setting up output diagnostic plots, 4 in 1 png image +png(plotsPng, width=800, height=800) +xlab = "region1_count" +ylab = "region2_count" +lim = c(0.5, 5000) +layout(matrix(1:4, ncol=2)) +par(mar=c(5, 5, 5, 1)) + +# Plot all motif counts along the linear correlation coefficient +plot.scatter(region1Counts+0.5, region2Counts+0.5, log="xy", xlab=xlab, ylab=ylab, + cex.lab=2.2, cex.axis=1.8, xlim=lim, ylim=lim*rValue) +abline(0, rValue, untf=T) +abline(0, rValue*2, untf=T, lty=2) +abline(0, rValue/2, untf=T, lty=2) + +# Plot enriched and depleted motifs in red, housed in second plot +plot.scatter(region1Counts+0.5, region2Counts+0.5, log="xy", xlab=xlab, ylab=ylab, + cex.lab=2.2, cex.axis=1.8, xlim=lim, ylim=lim*rValue) +points(region1Counts[indices]+0.5, region2Counts[indices]+0.5, col="red") +abline(0, rValue, untf=T) +abline(0, rValue*2, untf=T, lty=2) +abline(0, rValue/2, untf=T, lty=2) + +# Apply and plot gc correction and loess curve +#cat("Applying gc correction, rerunning poisson test...\n") +ind = which(region1Counts>5) +gc = gc[names(region2Counts)] # Reorder the indices of pwms to match input data +lo = plot.scatter(gc,log2(region2Counts/region1Counts),draw.loess=T, + xlab="gc content of motif",ylab=paste("log2(",ylab,"/",xlab,")"), + cex.lab=2.2,cex.axis=1.8,ind=ind) # This function is in plotting.r +gcCorrection = 2^approx(lo$loess,xout=gc,rule=2)$y +save(gc, file="gc.RData") + +# Recalculate p and q values, and filter for significant entries +pValueGC = sapply(seq(along=region1Counts),function(i) { + poisson.test(c(region1Counts[i],region2Counts[i]),r=1/gcCorrection[i])$p.value +}) +qValueGC=p.adjust(pValueGC,"fdr") +indicesGC = which(qValueGC<0.1 & abs(log2(region1Counts/region2Counts*gcCorrection))>log2(1.5)) + +# Plot gc corrected motif counts +plot.scatter(region1Counts+0.5, (region2Counts+0.5)/gcCorrection, log="xy", + xlab=xlab, ylab=paste(ylab,"(normalized)"), cex.lab=2.2, cex.axis=1.8, + xlim=lim, ylim=lim) +points(region1Counts[indicesGC]+0.5, + (region2Counts[indicesGC]+0.5)/gcCorrection[indicesGC], col="red") +abline(0,1) +abline(0,1*2,untf=T,lty=2) +abline(0,1/2,untf=T,lty=2) + +# Trim results, compile statistics and output to file +# Only does so if significant results are computed +if(length(indicesGC) > 0) { + # Calculate expected counts and enrichment ratios + #cat("Calculating statistics...\n") + nullExpect = region1Counts * gcCorrection + enrichment = region2Counts / nullExpect + + # Reorder selected indices in ascending pvalue + #cat("Reordering by ascending pvalue...\n") + indicesReorder = indicesGC[order(pValueGC[indicesGC])] + + # Combine data into one data frame and output to two files + #cat("Splitting and outputting data...\n") + outDF = data.frame(motif=names(pValueGC), p=as.numeric(pValueGC), q=qValueGC, + stringsAsFactors=F, region_1_count=region1Counts, + null_expectation=round(nullExpect,2), region_2_count=region2Counts, + enrichment=enrichment)[indicesReorder,] + names(outDF)[which(names(outDF)=="region_1_count")]=xlab + names(outDF)[which(names(outDF)=="region_2_count")]=ylab + indicesEnrich = which(outDF$enrichment>1) + indicesDeplete = which(outDF$enrichment<1) + outDF$enrichment = ifelse(outDF$enrichment>1, + round(outDF$enrichment,3), + paste("1/",round(1/outDF$enrichment,3))) + write.table(outDF[indicesEnrich,], file=enrichTab, quote=FALSE, + sep="\t", append=FALSE, row.names=FALSE, col.names=TRUE) + write.table(outDF[indicesDeplete,], file=depleteTab, quote=FALSE, + sep="\t", append=FALSE, row.names=FALSE, col.names=TRUE) +} + +# Catch display messages and output timing information +catchMessage = dev.off() +cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."), + "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_compare.xml Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,29 @@ +<tool id="region_motif_compare" name="Region Motif Count Compare"> + <description>for comparing the motif counts in different region sets</description> + <command interpreter="bash"> + /usr/bin/R --slave --vanilla -f $GALAXY_ROOT_DIR/tools/my_tools/region_motif_compare.r --args $GALAXY_ROOT_DIR $db_type $in_tab_1 $in_tab_2 $out_enriched $out_depleted $out_plots + </command> + <inputs> + <param name="in_tab_1" type="data" format="tabular" label="Region Set 1 Motif Count File"/> + <param name="in_tab_2" type="data" format="tabular" label="Region Set 2 Motif Count File"/> + <param name="db_type" type="select" label="Select Motif Database" > + <option value="t">Test Pouya Subset (hg19)</option> + <option value="p">Pouya Encode Motifs (hg19)</option> + <option value="j">Jaspar and Jolma Motifs (hg19)</option> + <option value="m">Mouse Motifs (mm9)</option> + <option value="c">Pouya, Jaspar, and Jolma Combined (hg19)</option> + </param> + </inputs> + <outputs> + <data name="out_enriched" format="tabular" label="Enriched Motifs"/> + <data name="out_depleted" format="tabular" label="Depleted Motifs"/> + <data name="out_plots" format="png" label="Motif Count Comparison Plots"/> + </outputs> + + <help> + This tools reads in two counts file and determines enriched and depleted + motifs in two different region sets based on poisson calculation with + gc correction. + </help> + +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_intersect.r Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,102 @@ +# Name: region_motif_intersect.r +# Description: Takes a bed file of target regions and counts intersections +# of each motif (built in rdata database) and target regions. +# Author: Jeremy liu +# Email: jeremy.liu@yale.edu +# Date: 14/07/02 +# Note: This script is meant to be invoked with the following command +# R --slave --vanilla -f ./region_motif_intersect.r --args <workingdir> <db> <inbed> <outtab> +# <workingdir> is working directory of galaxy installation +# <db> types: "t" test, "p" pouya, "j" jaspar jolma, "m" mouse +# Dependencies: none + +# Auxiliary function to concatenate multiple strings +concat <- function(...) { + input_list <- list(...) + return(paste(input_list, sep="", collapse="")) +} + +# Set common and data directories +args <- commandArgs() +workingDir = args[7] +commonDir = concat(workingDir, "/tools/my_tools") +dbCode = args[8] +if (dbCode == "t") { + motifDB = concat(commonDir, "/region_motif_db/pouya_test_motifs.bed.bgz") +} else if (dbCode == "p") { + motifDB = concat(commonDir, "/region_motif_db/pouya_motifs.bed.bgz") +} else if (dbCode == "j") { + motifDB = concat(commonDir, "/region_motif_db/jaspar_jolma_motifs.bed.bgz") +} else if (dbCode == "m") { + motifDB = concat(commonDir, "/region_motif_db/mm9_motifs_split.bed.bgz") +} else { + motifDB = concat(commonDir, "/region_motif_db/pouya_motifs.bed.bgz") +} + +# Set input and reference files, comment to toggle commmand line arguments +inBed = args[9] +outTab = args[10] + +# Auxiliary function to read in BED file +read_bed <- function(file) { + return(read.table(file, sep="\t", stringsAsFactors=FALSE)) +} + +startTime = Sys.time() +cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n") + +# Load dependencies +#cat("Loading dependencies...\n") +suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE)) + +# Initializing hash table (as env) with motif names and loading tabix file +#cat("Loading motif database and initializing hash table...\n") +motifTable = new.env() +motifTbx <- TabixFile(motifDB) + +# Loading input bed file, convert integer columns to numeric, name columns +#cat("Loading region file...\n") +regionsDF = read_bed(inBed) +dfTemp = sapply(regionsDF, is.integer) +regionsDF[dfTemp] = lapply(regionsDF[dfTemp], as.numeric) +names(regionsDF)[names(regionsDF) == "V1"] = "chr" +names(regionsDF)[names(regionsDF) == "V2"] = "start" +names(regionsDF)[names(regionsDF) == "V3"] = "end" + +# Filtering regions to exclude chromosomes not in motif database +#cat("Determining intersection counts...\n") +motifTbxChrs = seqnamesTabix(motifTbx) +regionsDFFilter = subset(regionsDF, chr %in% motifTbxChrs) + +# Loading regions into GRanges object and scanning motif tabix database +# Region end is incremented by 1 since scanTabix querying is inclusive for +# position start but exclusive for position end. +param = GRanges(regionsDFFilter$chr, IRanges(regionsDFFilter$start, + end=regionsDFFilter$end + 1)) +regionsIntersects = scanTabix(motifTbx, param=param) + +# Parsing result list and updating motif count hash table +#cat("Parsing result list...\n") +for(regionIntersects in regionsIntersects) { + for(regionIntersect in strsplit(regionIntersects, " ")) { + intersectMotif = strsplit(regionIntersect, "\t")[[1]][4] + if(is.null(motifTable[[intersectMotif]])) { + motifTable[[intersectMotif]] = 1 + } else { + motifTable[[intersectMotif]] = motifTable[[intersectMotif]] + 1 + } + } +} + +# Converting motif count hash table to an integer vector for output +counts = integer(length = length(ls(motifTable))) +names(counts) = ls(motifTable) +for(motifName in ls(motifTable)) { + counts[motifName] = as.integer(motifTable[[motifName]]) +} + +# Outputting intersection counts to tab delineated file +#cat("Outputting to file...\n") +write.table(counts, outTab, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) +cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."), + "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_intersect.xml Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,24 @@ +<tool id="region_motif_intersect" name="Region Motif Intersect"> + <description>for computing the motifs that lie inside a region set</description> + <command interpreter="bash"> + /usr/bin/R --slave --vanilla -f $GALAXY_ROOT_DIR/tools/my_tools/region_motif_intersect.r --args $GALAXY_ROOT_DIR $db_type $in_bed $out_tab + </command> + <inputs> + <param name="in_bed" type="data" format="bed" label="Input BED File" /> + <param name="db_type" type="select" label="Select Motif Database" > + <option value="t">Test Pouya Subset (hg19)</option> + <option value="p">Pouya Encode Motifs (hg19)</option> + <option value="j">Jaspar and Jolma Motifs (hg19)</option> + <option value="m">Mouse Motifs (mm9)</option> + </param> + </inputs> + <outputs> + <data name="out_tab" format="tabular" /> + </outputs> + + <help> + This tool computes the motifs and the number of motifs that intersect + any region in a input set of regions. + </help> + +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_lib/plotting.r Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,815 @@ +library(graphics, quietly=TRUE) + +plot.verbose=F +name.cleaner<-function(...,sep="",replace="_") { + plot.name=gsub(" ",replace,paste(...,sep=sep)) + plot.name=gsub("/",replace,plot.name) + plot.name=gsub(",",replace,plot.name) + plot.name=gsub("'",replace,plot.name) + plot.name=gsub("\\+","plus",plot.name) + plot.name=gsub("\\(","",plot.name) + plot.name=gsub("\\)","",plot.name) + return(plot.name) +} +plot.namer <- function(..., date=0, fig.dir=0, format="png",sep="") { + plot.name=name.cleaner(...,sep=sep) + if(date==0) date=gsub("-","",as.character(Sys.Date())) + if(fig.dir==0) fig.dir="/Users/alver/allplots" + plot.name=paste(fig.dir,"/",date,plot.name,".",format,sep="") + if(plot.verbose) cat(" saving figure: ",plot.name,"\n") + return(plot.name) +} + +plot.scatter <- function(x,y=NULL,f=0.1,same=FALSE,n.points=-1,draw.lowess=FALSE,write.r=TRUE,cex.r=1,col=NULL,col.line=NULL,lwd.line=1, + draw.loess=FALSE,span=0.5,bandwidth=bandwidth,draw.prof=FALSE,xlog=FALSE,ylog=FALSE,cor.method="pearson",log="",ind=NULL, + draw.spread=FALSE,...) { + + ## if col is the same length as x, use col for each point matching x. + ## if col is the same length as ind, use col for each point matching x[ind]. + ## else use densCols function based on col. + ## if col is null, densCols is used with bluetone for first plot and redtone for same=T. + + #print(length(x)) + #print(length(y)) + + xy <- xy.coords(x, y) + x=xy$x + y=xy$y + + output=list() + col.use = col + + if(!is.null(ind)) { + if(length(col.use)==length(x)) { + col.use=col.use[ind] + } + x=x[ind] + y=y[ind] + } + + if(length(col.use)!=length(x)) { + col.use=rep(NA,length(x)) + } + + + take=which(is.finite(x) & is.finite(y)) + x=x[take] + y=y[take] + col.use=col.use[take] + + if(grepl("x",log)) xlog=TRUE + if(grepl("y",log)) ylog=TRUE + if(xlog) log="x" + if(ylog) log=paste(log,"y",sep="") + + if(draw.lowess) { + lo = lowess(x,y,f) + output$lowess=lo + } + if(draw.loess | draw.spread) { + px=x;py=y + if(xlog) px=log(x) + if(ylog) py=log(y) + ind = which(is.finite(px+py)) + px=px[ind] + py=py[ind] + lo = loess(py ~ px,span=span,iterations=5) + lo.y=as.numeric(lo$fitted) + lo.x=as.numeric(lo$x) + if(draw.spread) lo.sd = loess((lo.y-py)^2 ~ lo.x,span=span*1.5,iterations=5) + if(xlog) lo.x=exp(lo.x) + if(ylog) lo.y=exp(lo.y) + lo =data.frame(x=lo.x,y=lo.y) + if(draw.spread) { + lo.sd=lo.sd$fitted + if(ylog) lo.sd=lo.sd*lo.y*lo.y + lo$sd=sqrt(pmax(0,lo.sd)) + } + lo=unique(lo) + lo = lo[order(lo$x),] + output$loess=lo + } + + if(draw.prof) { + px=x;py=y + if(xlog) px=log(x) + p=prof(px,py,50) + if(xlog) p$x=exp(p$x) + output$prof=p + } + + r=cor(x,y,method=cor.method) + output$cor=r + output$cor.method=cor.method + + len=length(x) + if(n.points>0 & n.points<len) { + take=sample(1:len,n.points) + x=x[take] + y=y[take] + col.use=col.use[take] + } + + if(xlog) { + ind = which(x>0) + x=x[ind] + y=y[ind] + col.use=col.use[ind] + } + xcol=x + if(xlog) xcol=log(xcol) + if(ylog) { + ind = which(y>0) + x=x[ind] + xcol=xcol[ind] + y=y[ind] + col.use=col.use[ind] + } + ycol=y + if(ylog) ycol=log(ycol) + + if(is.null(col)) { + if(!same) { + col=colorRampPalette(blues9[-(1:3)]) + } else { + col=colorRampPalette(c("lightpink","red","darkred")) + } + } + if(!is.na(col.use[1])) { + col=col.use + } else { + col= suppressPackageStartupMessages(densCols(xcol,ycol,col =col,bandwidth=bandwidth,nbin=500)) + } + if(!same) { + plot(x,y,col=col,log=log,...) + } else { + points(x,y,col=col,...) + } + + if(is.null(col.line)) { + col.line="darkblue" + if(same) col.line="darkred" + } + if(draw.lowess | draw.loess) lines(lo,col=col.line,lwd=lwd.line) + if(draw.spread) { + lines(lo$x,lo$y+lo$sd,col=col.line,lwd=lwd.line) + lines(lo$x,lo$y-lo$sd,col=col.line,lwd=lwd.line) + } + if(draw.prof) { + points(p) + plot.prof(p) + } + if(write.r & !same) mtext(paste("r=",round(r,3),sep=""),cex=cex.r) + return(invisible(output)) +} + +#color.int=c(144,586,465,257,490,100,74,24) +#coli=1 +#cols = integer() +colramp.bwr = vector() +colramp.byr = vector() +colramp.bw = vector() +colramp.bw2 = vector() + +plot.save=F + +setup.plotting <- function() { + pdf.options(useDingbats = FALSE) +# cols<<-colors()[color.int] +# cols<<-rep(cols,100) + colramp.bwr <<- colorRampPalette(c("blue","white","red"),space="Lab")(100); + colramp.byr <<- colorRampPalette(c("blue","yellow","red"),space="Lab")(100); + colramp.bw <<- colorRampPalette(c("white","black"),space="Lab")(100) + colramp.bw2 <<- colorRampPalette(c("grey92","grey5"),space="Lab")(100) +} + + +plot.cluster <- function(x,k, max.points.cl=-1, image.sep=-1,col=NULL, reorder=FALSE) { + x[which(is.na(x))]=0 + if(reorder) { + o=hclust(dist(t(x)))$order + x=x[,o] + } + if(image.sep<0) { + if(max.points.cl>0) { + image.sep=ceiling(0.2*max.points.cl) + } else { + image.sep=ceiling(0.2 * nrow(x) / nrow(k$centers)) + } + } + + distances<-dist(k$centers) + hcl=hclust(distances) + + adjust.branch.sep <-function(ddr,lengths) { + assign.branch.sep <- function(d,i.leaf) { + if(is.leaf(d)) { + attr(d,"members")<-lengths[i.leaf] + i.leaf=i.leaf+1 + output=list(d=d,i.leaf=i.leaf) + return(output) + } + else{ + input=assign.branch.sep(d[[1]],i.leaf) + i.leaf=input$i.leaf + d[[1]]=input$d + + input=assign.branch.sep(d[[2]],i.leaf) + i.leaf=input$i.leaf + d[[2]]=input$d + + attr(d,"members")<-attr(d[[1]],"members")+attr(d[[2]],"members") + output=list(d=d,i.leaf=i.leaf) + return(output) + } + } + ddr<-as.dendrogram(ddr) + ddr=assign.branch.sep(ddr,1)$d + return(ddr) + } + + n.points.actual=k$size + if(max.points.cl>0) { + k$size[which(k$size>max.points.cl)] = max.points.cl +} + + ddr<-adjust.branch.sep(hcl,k$size[hcl$order]+image.sep) + centers=length(hcl$order) + + n.points=sum(k$size) + n.dims=ncol(x) + z=matrix(numeric((n.points+(centers-1)*image.sep)*n.dims),ncol=n.dims) + + + last.row=0 + cluster.y.pos=numeric(centers) + for(i.c in hcl$order) { + n.p=k$size[i.c] + z[last.row+1:n.p,] = x[which(k$cluster==i.c)[1:n.p],] + cluster.y.pos[i.c]=last.row+n.p/2 + last.row=last.row+n.p+image.sep + } + + zlim=c(0,max(z)) + if(min(z)<0) { + m=max(c(z,-z)) + zlim=c(-m,m) + } + if(is.null(col)) { + if(min(z)>=0) { + col= colramp.bw + } else { + col= colorRampPalette(c("blue","yellow","red"),space="Lab")(100); + } + } + x.pl=seq1(n.dims+1)-0.5 + y.pl=seq1(nrow(z)+1)-0.5 + l <- layout(matrix(1:2,ncol=2),widths=c(1,5)) + par(mar = c(6,0.5,6,0)) + my.plot.dendrogram(ddr,horiz=T,axes=F,yaxs="i",xaxs="i",leaflab="none",center=T,lwd=10) + par(mar = c(6,0.1,6,2.1)) + image(x=x.pl,y=y.pl,z=t(z),zlim=zlim,axes=FALSE,xlab="",col=col) + mtext("cluster",side=4,adj=1.1) + mtext("points",side=4,adj=1.1,line=1) + mtext(seq1(centers),side=4,at=cluster.y.pos) + mtext(n.points.actual,side=4,at=cluster.y.pos,line=1) + + if(!is.null(dimnames(x)[[2]])) { + mtext(dimnames(x)[[2]],side=1,at=seq1(n.dims),las=2) + } +} + +plot.cluster2 <- function(k, n.clusters=-1, n.clusters.per.panel=4, cols=c("black","red","blue","darkgreen","orange"),f=0,xshift=0,...) { + if(n.clusters<=0) n.clusters=nrow(k$centers) + + n.elements=as.numeric(unlist(lapply(seq1(n.clusters), function(cl) length(which(abs(k$cluster)==cl))))) + + distances<-dist(k$centers) + n.panels = ceiling(n.clusters/n.clusters.per.panel) + n.rows=ceiling(sqrt(n.panels)) + n.cols=ceiling(n.panels/n.rows) + n.panels.layout=n.rows*n.cols + + layout(matrix(seq1(n.panels.layout),nrow=n.rows,byrow=TRUE)) + + min=min(k$centers) + max=max(k$centers) + + if(f>0) { + for(i.cluster in seq1(n.clusters)) { + k$centers[i.cluster,]=lowess(k$centers[i.cluster,],f=f)$y + } + } + + ## hcl=hclust(distances) + hcl=list() + hcl$order=1:n.clusters + + for(i.cluster in seq1(n.clusters)) { + if(i.cluster %% n.clusters.per.panel == 1 ) { + clusters.of.panel=i.cluster:(i.cluster+n.clusters.per.panel-1) + clusters.of.panel=clusters.of.panel[which(clusters.of.panel<=n.clusters)] + clusters.of.panel=hcl$order[clusters.of.panel] + plot(c(0,length(k$centers[1,]))+xshift,c(min,max),type="n",...) + mtext(paste(clusters.of.panel," (",n.elements[clusters.of.panel],")",sep=""),line=length(clusters.of.panel)-seq1(length(clusters.of.panel)),col=cols[seq1(length(clusters.of.panel)) %% n.clusters.per.panel+1] ) + } + # lines(k$centers[hcl$order[i.cluster],],col=cols[i.cluster %% n.clusters.per.panel+1]) + lines(seq1(length(k$centers[1,]))+xshift,k$centers[hcl$order[i.cluster],],col=cols[i.cluster %% n.clusters.per.panel+1]) + } +} + +my.colors <- function(n) { + few.colors=c("black","red","blue","green3","mediumorchid3","gold2","darkcyan","sienna2") + if(n<=length(few.colors)) return(few.colors [seq1(n)]) + col=integer(n) + n.families=7 + n.members=ceiling(n/n.families) + for(i in seq1(n)) { + member=ceiling(i/n.families) + ratio=(member-1)/(n.members-1) + c2=0+0.8*ratio + if(member %% 2 == 1) ratio=-ratio + c1=0.8-0.2*ratio + c3=0.75-0.2*ratio + if(i %% n.families == 1) {col[i]=rgb(c2,c2,c2)} + if(i %% n.families == 2) {col[i]=rgb(c1,c1/2,c1/2)} + if(i %% n.families == 3) {col[i]=rgb(c1/2,0.9*c1,c1/2)} + if(i %% n.families == 4) {col[i]=rgb(c1/2,c1/2,c1)} + if(i %% n.families == 5) {col[i]=rgb(c3,c3,c3/2)} + if(i %% n.families == 6) {col[i]=rgb(c3,c3/2,c3)} + if(i %% n.families == 0) {col[i]=rgb(c3/2,c3,c3)} + } + return(col) +} + +plot.my.colors <-function(n) { + x11() + col=my.colors(n) + plot(x=c(0,n),y=c(0,1),type="n") + segments(seq1(n)-1,runif(n),seq1(n),runif(n),col=col) +} + + +plot.colors <-function() { + x11(width=10,height=10) + plot(c(0,26),c(0,26),type="n") + c=colors() + n=length(c) + i=1:n + x=i%%26 + y=floor(i/26) + rect(x,y,x+1,y+1,col=c[i],border=c[i]) + text(x+0.5,y+0.5,i) +} + + +adjust.branch.sep <-function(ddr,lengths) { + assign.branch.sep <- function(d,i.leaf) { + if(is.leaf(d)) { + attr(d,"members")<-lengths[i.leaf] + i.leaf=i.leaf+1 + output=list(d=d,i.leaf=i.leaf) + return(output) + } + else{ + input=assign.branch.sep(d[[1]],i.leaf) + i.leaf=input$i.leaf + d[[1]]=input$d + + input=assign.branch.sep(d[[2]],i.leaf) + i.leaf=input$i.leaf + d[[2]]=input$d + + attr(d,"members")<-attr(d[[1]],"members")+attr(d[[2]],"members") + output=list(d=d,i.leaf=i.leaf) + return(output) + } + } + ddr<-as.dendrogram(ddr) + ddr=assign.branch.sep(ddr,1)$d + return(ddr) +} +t.dhcol <- function(dr,h,cols=c(1)) { + # check child heights + if(attr(dr[[1]],"height")<h) { + # color + ecol <- cols[coli]; + coli <<- coli+1; + dr[[1]] <- dendrapply(dr[[1]],function(e) { attr(e,"edgePar") <- list(col=ecol); e}); + attr(dr[[1]],"edgePar") <- list(col=ecol,p.border=NA,p.col=NA,t.col=1,t.cex=1.3); + } else { + dr[[1]] <- t.dhcol(dr[[1]],h,cols); + } + + if(attr(dr[[2]],"height")<h) { + # color + ecol <- cols[coli]; + coli <<- coli+1; + dr[[2]] <- dendrapply(dr[[2]],function(e) { attr(e,"edgePar") <- list(col=ecol); e}); + attr(dr[[2]],"edgePar") <- list(col=ecol,p.border=NA,p.col=NA,t.col=1,t.cex=1.3); + } else { + dr[[2]] <- t.dhcol(dr[[2]],h,cols); + } + return(dr); +} + + + +### The rest is PeterK's my.plot.dendogram + +## FIXME: need larger par("mar")[1] or [4] for longish labels ! +## {probably don't change, just print a warning ..} +my.plot.dendrogram <- + function (x, type = c("rectangle", "triangle"), center = FALSE, + edge.root = is.leaf(x) || !is.null(attr(x, "edgetext")), + nodePar = NULL, edgePar = list(), + leaflab = c("perpendicular", "textlike", "none"), dLeaf = NULL, + xlab = "", ylab = "", xaxt="n", yaxt="s", + horiz = FALSE, frame.plot = FALSE, ...) +{ + type <- match.arg(type) + leaflab <- match.arg(leaflab) + hgt <- attr(x, "height") + if (edge.root && is.logical(edge.root)) + edge.root <- 0.0625 * if(is.leaf(x)) 1 else hgt + mem.x <- .my.memberDend(x) + yTop <- hgt + edge.root + if(center) { x1 <- 0.5 ; x2 <- mem.x + 0.5 } + else { x1 <- 1 ; x2 <- mem.x } + xlim <- c(x1 - 1/2, x2 + 1/2) + ylim <- c(0, yTop) + if (horiz) {## swap and reverse direction on `x': + xl <- xlim; xlim <- rev(ylim); ylim <- xl + tmp <- xaxt; xaxt <- yaxt; yaxt <- tmp + } + plot(0, xlim = xlim, ylim = ylim, type = "n", xlab = xlab, ylab = ylab, + xaxt = xaxt, yaxt = yaxt, frame.plot = frame.plot, ...) + if(is.null(dLeaf)) + dLeaf <- .75*(if(horiz) strwidth("w") else strheight("x")) + + if (edge.root) { +### FIXME: the first edge + edgetext is drawn here, all others in plotNode() +### ----- maybe use trick with adding a single parent node to the top ? + x0 <- my.plotNodeLimit(x1, x2, x, center)$x + if (horiz) + segments(hgt, x0, yTop, x0) + else segments(x0, hgt, x0, yTop) + if (!is.null(et <- attr(x, "edgetext"))) { + my <- mean(hgt, yTop) + if (horiz) + text(my, x0, et) + else text(x0, my, et) + } + } + my.plotNode(x1, x2, x, type = type, center = center, leaflab = leaflab, + dLeaf = dLeaf, nodePar = nodePar, edgePar = edgePar, horiz = horiz) +} + +### the work horse: plot node (if pch) and lines to all children +my.plotNode <- + function(x1, x2, subtree, type, center, leaflab, dLeaf, + nodePar, edgePar, horiz = FALSE) +{ + inner <- !is.leaf(subtree) && x1 != x2 + yTop <- attr(subtree, "height") + bx <- my.plotNodeLimit(x1, x2, subtree, center) + xTop <- bx$x + usrpar <- par("usr"); + + ## handle node specific parameters in "nodePar": + hasP <- !is.null(nPar <- attr(subtree, "nodePar")) + if(!hasP) nPar <- nodePar + + if(getOption("verbose")) { + cat(if(inner)"inner node" else "leaf", ":") + if(!is.null(nPar)) { cat(" with node pars\n"); str(nPar) } + cat(if(inner)paste(" height", formatC(yTop),"; "), + "(x1,x2)= (",formatC(x1,wid=4),",",formatC(x2,wid=4),")", + "--> xTop=", formatC(xTop, wid=8),"\n", sep="") + } + + Xtract <- function(nam, L, default, indx) + rep(if(nam %in% names(L)) L[[nam]] else default, + length.out = indx)[indx] + asTxt <- function(x) # to allow 'plotmath' labels: + if(is.character(x) || is.expression(x) || is.null(x)) x else as.character(x) + + i <- if(inner || hasP) 1 else 2 # only 1 node specific par + + if(!is.null(nPar)) { ## draw this node + pch <- Xtract("pch", nPar, default = 1:2, i) + cex <- Xtract("cex", nPar, default = c(1,1), i) + col <- Xtract("col", nPar, default = par("col"), i) + bg <- Xtract("bg", nPar, default = par("bg"), i) + points(if (horiz) cbind(yTop, xTop) else cbind(xTop, yTop), + pch = pch, bg = bg, col = col, cex = cex) + } + + if(leaflab == "textlike") + p.col <- Xtract("p.col", nPar, default = "white", i) + lab.col <- Xtract("lab.col", nPar, default = par("col"), i) + lab.cex <- Xtract("lab.cex", nPar, default = c(1,1), i) + lab.font <- Xtract("lab.font", nPar, default = par("font"), i) + if (is.leaf(subtree)) { + ## label leaf + if (leaflab == "perpendicular") { # somewhat like plot.hclust + if(horiz) { + X <- yTop + dLeaf * lab.cex + Y <- xTop; srt <- 0; adj <- c(0, 0.5) + } + else { + Y <- yTop - dLeaf * lab.cex + X <- xTop; srt <- 90; adj <- 1 + } + nodeText <- asTxt(attr(subtree,"label")) + text(X, Y, nodeText, xpd = TRUE, srt = srt, adj = adj, + cex = lab.cex, col = lab.col, font = lab.font) + } + } + else if (inner) { + segmentsHV <- function(x0, y0, x1, y1) { + if (horiz) + segments(y0, x0, y1, x1, col = col, lty = lty, lwd = lwd) + else segments(x0, y0, x1, y1, col = col, lty = lty, lwd = lwd) + } + for (k in 1:length(subtree)) { + child <- subtree[[k]] + ## draw lines to the children and draw them recursively + yBot <- attr(child, "height") + if (getOption("verbose")) cat("ch.", k, "@ h=", yBot, "; ") + if (is.null(yBot)) + yBot <- 0 + xBot <- + if (center) mean(bx$limit[k:(k + 1)]) + else bx$limit[k] + .my.midDend(child) + + hasE <- !is.null(ePar <- attr(child, "edgePar")) + if (!hasE) + ePar <- edgePar + i <- if (!is.leaf(child) || hasE) 1 else 2 + ## define line attributes for segmentsHV(): + col <- Xtract("col", ePar, default = par("col"), i) + lty <- Xtract("lty", ePar, default = par("lty"), i) + lwd <- Xtract("lwd", ePar, default = par("lwd"), i) + if (type == "triangle") { + segmentsHV(xTop, yTop, xBot, yBot) + } + else { # rectangle + segmentsHV(xTop,yTop, xBot,yTop)# h + segmentsHV(xBot,yTop, xBot,yBot)# v + } + vln <- NULL + if (is.leaf(child) && leaflab == "textlike") { + nodeText <- asTxt(attr(child,"label")) + if(getOption("verbose")) + cat('-- with "label"',format(nodeText)) + hln <- 0.6 * strwidth(nodeText, cex = lab.cex)/2 + vln <- 1.5 * strheight(nodeText, cex = lab.cex)/2 + rect(xBot - hln, yBot, + xBot + hln, yBot + 2 * vln, col = p.col) + text(xBot, yBot + vln, nodeText, xpd = TRUE, + cex = lab.cex, col = lab.col, font = lab.font) + } + if (!is.null(attr(child, "edgetext"))) { + edgeText <- asTxt(attr(child, "edgetext")) + if(getOption("verbose")) + cat('-- with "edgetext"',format(edgeText)) + if (!is.null(vln)) { + mx <- + if(type == "triangle") + (xTop+ xBot+ ((xTop - xBot)/(yTop - yBot)) * vln)/2 + else xBot + my <- (yTop + yBot + 2 * vln)/2 + } + else { + mx <- if(type == "triangle") (xTop + xBot)/2 else xBot + my <- (yTop + yBot)/2 + } + ## Both for "triangle" and "rectangle" : Diamond + Text + + p.col <- Xtract("p.col", ePar, default = "white", i) + p.border <- Xtract("p.border", ePar, default = par("fg"), i) + ## edge label pars: defaults from the segments pars + p.lwd <- Xtract("p.lwd", ePar, default = lwd, i) + p.lty <- Xtract("p.lty", ePar, default = lty, i) + t.col <- Xtract("t.col", ePar, default = col, i) + t.cex <- Xtract("t.cex", ePar, default = 1, i) + t.font<- Xtract("t.font",ePar, default= par("font"), i) + t.shift <- Xtract("t.shift", ePar, default = 0.01, i) + + vlm <- strheight(c(edgeText,"h"), cex = t.cex)/2 + hlm <- strwidth (c(edgeText,"m"), cex = t.cex)/2 + hl3 <- c(hlm[1], hlm[1] + hlm[2], hlm[1]) + #polygon(mx+ c(-hl3, hl3), my + sum(vlm)*c(-1:1,1:-1), + # col = p.col, border= p.border, lty = p.lty, lwd = p.lwd) + #text(mx, my, edgeText, cex = t.cex, col = t.col, font = t.font) + if(horiz) { + text(my, mx+t.shift*abs(usrpar[3]-usrpar[4]), edgeText, cex = t.cex, col = t.col, font = t.font) + } else { + text(mx+t.shift*abs(usrpar[2]-usrpar[1]), my, edgeText, cex = t.cex, col = t.col, font = t.font) + } + } + my.plotNode(bx$limit[k], bx$limit[k + 1], subtree = child, + type, center, leaflab, dLeaf, nodePar, edgePar, horiz) + } + } +} + +my.plotNodeLimit <- function(x1, x2, subtree, center) +{ + ## get the left borders limit[k] of all children k=1..K, and + ## the handle point `x' for the edge connecting to the parent. + inner <- !is.leaf(subtree) && x1 != x2 + if(inner) { + K <- length(subtree) + mTop <- .my.memberDend(subtree) + limit <- integer(K) + xx1 <- x1 + for(k in 1:K) { + m <- .my.memberDend(subtree[[k]]) + ##if(is.null(m)) m <- 1 + xx1 <- xx1 + (if(center) (x2-x1) * m/mTop else m) + limit[k] <- xx1 + } + limit <- c(x1, limit) + } else { ## leaf + limit <- c(x1, x2) + } + mid <- attr(subtree, "midpoint") + center <- center || (inner && !is.numeric(mid)) + x <- if(center) mean(c(x1,x2)) else x1 + (if(inner) mid else 0) + list(x = x, limit = limit) +} + +.my.memberDend <- function(x) { + r <- attr(x,"x.member") + if(is.null(r)) { + r <- attr(x,"members") + if(is.null(r)) r <- 1:1 + } + r +} + +.my.midDend <- function(x) + if(is.null(mp <- attr(x, "midpoint"))) 0 else mp + + +## original Andy Liaw; modified RG, MM : +my.heatmap <- function (x, Rowv=NULL, Colv=if(symm)"Rowv" else NULL, + distfun = dist, hclustfun = hclust, + reorderfun = function(d,w) reorder(d,w), + add.expr, symm = FALSE, revC = identical(Colv, "Rowv"), + scale = c("row", "column", "none"), na.rm=TRUE, + margins = c(5, 5), ColSideColors, RowSideColors, + cexRow = 0.2 + 1/log10(nr), cexCol = 0.2 + 1/log10(nc), + labRow = NULL, labCol = NULL, main = NULL, xlab = NULL, ylab = NULL, + keep.dendro = FALSE, + verbose = getOption("verbose"), imageSize=4, imageVSize=imageSize,imageHSize=imageSize,lasCol=2, lasRow=2, respect=F, ...) +{ + scale <- if(symm && missing(scale)) "none" else match.arg(scale) + if(length(di <- dim(x)) != 2 || !is.numeric(x)) + stop("'x' must be a numeric matrix") + nr <- di[1] + nc <- di[2] + if(nr <= 1 || nc <= 1) + stop("'x' must have at least 2 rows and 2 columns") + if(!is.numeric(margins) || length(margins) != 2) + stop("'margins' must be a numeric vector of length 2") + + doRdend <- !identical(Rowv,NA) + doCdend <- !identical(Colv,NA) + ## by default order by row/col means + if(is.null(Rowv)) Rowv <- rowMeans(x, na.rm = na.rm) + if(is.null(Colv)) Colv <- colMeans(x, na.rm = na.rm) + + ## get the dendrograms and reordering indices + + if(doRdend) { + if(inherits(Rowv, "dendrogram")) + ddr <- Rowv + else { + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(hcr) + if(!is.logical(Rowv) || Rowv) + ddr <- reorderfun(ddr, Rowv) + } + if(nr != length(rowInd <- order.dendrogram(ddr))) + stop("row dendrogram ordering gave index of wrong length") + } + else rowInd <- 1:nr + + if(doCdend) { + if(inherits(Colv, "dendrogram")) + ddc <- Colv + else if(identical(Colv, "Rowv")) { + if(nr != nc) + stop('Colv = "Rowv" but nrow(x) != ncol(x)') + ddc <- ddr + } + else { + hcc <- hclustfun(distfun(if(symm)x else t(x))) + ddc <- as.dendrogram(hcc) + if(!is.logical(Colv) || Colv) + ddc <- reorderfun(ddc, Colv) + } + if(nc != length(colInd <- order.dendrogram(ddc))) + stop("column dendrogram ordering gave index of wrong length") + } + else colInd <- 1:nc + + ## reorder x + x <- x[rowInd, colInd] + + labRow <- + if(is.null(labRow)) + if(is.null(rownames(x))) (1:nr)[rowInd] else rownames(x) + else labRow[rowInd] + labCol <- + if(is.null(labCol)) + if(is.null(colnames(x))) (1:nc)[colInd] else colnames(x) + else labCol[colInd] + + if(scale == "row") { + x <- sweep(x, 1, rowMeans(x, na.rm = na.rm)) + sx <- apply(x, 1, sd, na.rm = na.rm) + x <- sweep(x, 1, sx, "/") + } + else if(scale == "column") { + x <- sweep(x, 2, colMeans(x, na.rm = na.rm)) + sx <- apply(x, 2, sd, na.rm = na.rm) + x <- sweep(x, 2, sx, "/") + } + + ## Calculate the plot layout + lmat <- rbind(c(NA, 3), 2:1) + lwid <- c(if(doRdend) 1 else 0.05, imageHSize) + lhei <- c((if(doCdend) 1 else 0.05) + if(!is.null(main)) 0.2 else 0, imageVSize) + if(!missing(ColSideColors)) { ## add middle row to layout + if(!is.character(ColSideColors) || length(ColSideColors) != nc) + stop("'ColSideColors' must be a character vector of length ncol(x)") + lmat <- rbind(lmat[1,]+1, c(NA,1), lmat[2,]+1) + lhei <- c(lhei[1], 0.2, lhei[2]) + } + if(!missing(RowSideColors)) { ## add middle column to layout + if(!is.character(RowSideColors) || length(RowSideColors) != nr) + stop("'RowSideColors' must be a character vector of length nrow(x)") + lmat <- cbind(lmat[,1]+1, c(rep(NA, nrow(lmat)-1), 1), lmat[,2]+1) + lwid <- c(lwid[1], 0.2, lwid[2]) + } + lmat[is.na(lmat)] <- 0 + if(verbose) { + cat("layout: widths = ", lwid, ", heights = ", lhei,"; lmat=\n") + print(lmat) + } + + ## Graphics `output' ----------------------- + + op <- par(no.readonly = TRUE) + on.exit(par(op)) + layout(lmat, widths = lwid, heights = lhei, respect = respect) + ## draw the side bars + if(!missing(RowSideColors)) { + par(mar = c(margins[1],0, 0,0.5)) + image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) + } + if(!missing(ColSideColors)) { + par(mar = c(0.5,0, 0,margins[2])) + image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) + } + ## draw the main carpet + par(mar = c(margins[1], 0, 0, margins[2])) + if(!symm || scale != "none") + x <- t(x) + if(revC) { # x columns reversed + iy <- nr:1 + ddr <- rev(ddr) + x <- x[,iy] + } else iy <- 1:nr + + image(1:nc, 1:nr, x, xlim = 0.5+ c(0, nc), ylim = 0.5+ c(0, nr), + axes = FALSE, xlab = "", ylab = "", ...) + axis(1, 1:nc, labels= labCol, las= lasCol, line= -0.5, tick= 0, cex.axis= cexCol) + if(!is.null(xlab)) mtext(xlab, side = 1, line = margins[1] - 1.25) + axis(4, iy, labels= labRow, las= lasRow, line= -0.5, tick= 0, cex.axis= cexRow) + if(!is.null(ylab)) mtext(ylab, side = 4, line = margins[2] - 1.25,las=lasRow) + if (!missing(add.expr)) + eval(substitute(add.expr)) + + ## the two dendrograms : + par(mar = c(margins[1], 0, 0, 0)) + if(doRdend) + my.plot.dendrogram(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") + else frame() + + par(mar = c(0, 0, if(!is.null(main)) 1 else 0, margins[2])) + if(doCdend) + my.plot.dendrogram(ddc, axes = FALSE, xaxs = "i", leaflab = "none") + else if(!is.null(main)) frame() + + ## title + if(!is.null(main)) title(main, cex.main = 1.5*op[["cex.main"]]) + + invisible(list(rowInd = rowInd, colInd = colInd, + Rowv = if(keep.dendro && doRdend) ddr, + Colv = if(keep.dendro && doCdend) ddc )) +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_lib/regions.cpp Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,123 @@ +#include <iostream> +#include <vector> +#include <algorithm> +using namespace std; + +extern "C" { + typedef pair <int,int> Se_t; + bool se_lt (const Se_t &l,const Se_t &r) { return l.first < r.first; } + + void merge_regions(int *regions, int*nregionsR,int *merge_sepR) { + int nregs=nregionsR[0]; + if(nregs==0) return; + int sep=merge_sepR[0]; + if(sep<1) sep=1; + vector<Se_t> reg(nregs); + for(int ireg=0;ireg<nregs;ireg++) { + reg[ireg]=make_pair(regions[ireg*2],regions[ireg*2+1]); + } + sort(reg.begin(),reg.end(),se_lt); + int *reg_index = new int[nregs]; + for(int ireg=1;ireg<nregs;ireg++) reg_index[ireg]=-1; + reg_index[0]=0; + int last_ireg=0; + int counter=1; + for(int ireg=1;ireg<nregs;ireg++) { + if(reg[ireg].first<=reg[last_ireg].second+sep) { + if(reg[ireg].second>reg[last_ireg].second) reg[last_ireg].second=reg[ireg].second; + } else { + last_ireg=ireg; + reg_index[counter]=ireg; + counter++; + } + } + for(int ireg=0;ireg<counter;ireg++) { + regions[ireg*2] = reg[reg_index[ireg]].first; + regions[ireg*2+1] = reg[reg_index[ireg]].second; + } + nregionsR[0]=counter; + delete [] reg_index; + } + void region_minus_region(int *regions, int*nregionsR,int *region2s, int*nregion2sR,int *updatedregions) { + int sep=1; + merge_regions(regions,nregionsR,&sep); + merge_regions(region2s,nregion2sR,&sep); + int nregs=nregionsR[0]; + int nreg2s=nregion2sR[0]; + for(int i=0;i<2*(nregs+nreg2s);i++) updatedregions[i]=-1; + if(nregs==0) return; + int ireg = 0; + int iregout = 0; + for(int ireg2=0; ireg2<nreg2s;ireg2++) { + if(ireg==nregs) break; + if(region2s[ireg2*2+1] < regions[2*ireg]) continue; + if(region2s[ireg2*2] > regions[2*ireg+1]) { + updatedregions[2*iregout] = regions[2*ireg]; + updatedregions[2*iregout+1] = regions[2*ireg+1]; + ireg++; + ireg2--; + iregout++; + continue; + } + int s = regions[ireg*2]; + int e = regions[ireg*2+1]; + int s2 = region2s[ireg2*2]; + int e2 = region2s[ireg2*2+1]; + if(s2<=s && e2>=e) { + ireg++; + ireg2--; + } + else if(s2<=s) { + regions[ireg*2] = e2+1; + continue; + } else if(e2>=e) { + updatedregions[2*iregout] = s; + updatedregions[2*iregout+1] = s2-1; + ireg2--; + iregout++; + ireg++; + } else { + updatedregions[2*iregout] = s; + updatedregions[2*iregout+1] = s2-1; + regions[ireg*2] = e2+1; + iregout++; + ireg2--; + } + } + while(ireg<nregs) { + updatedregions[2*iregout] = regions[2*ireg]; + updatedregions[2*iregout+1] = regions[2*ireg+1]; + ireg++; + iregout++; + } + } + void intersection_of_regions(int *regions, int*nregionsR,int *region2s, int*nregion2sR,int *updatedregions) { + int sep=1; + merge_regions(regions,nregionsR,&sep); + merge_regions(region2s,nregion2sR,&sep); + int nregs=nregionsR[0]; + int nreg2s=nregion2sR[0]; + for(int i=0;i<2*(nregs+nreg2s);i++) updatedregions[i]=-1; + if(nregs==0) return; + if(nreg2s==0) return; + int ireg2 = 0; + int iregout = 0; + for(int ireg=0; ireg<nregs;ireg++) { + if(ireg2==nreg2s) return; + if(regions[ireg*2+1] < region2s[2*ireg2]) continue; + if(regions[ireg*2] > region2s[2*ireg2+1]) {ireg2++; ireg--; continue;} + + int s = regions[ireg*2]; + if(s<region2s[ireg2*2]) s = region2s[ireg2*2]; + int e = regions[ireg*2+1]; + if(e>region2s[ireg2*2+1]) { + e = region2s[ireg2*2+1]; + ireg2++; + ireg--; + } + updatedregions[2*iregout] = s; + updatedregions[2*iregout+1] = e; + iregout++; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/region_motif_lib/regions.r Tue Aug 05 13:56:22 2014 -0400 @@ -0,0 +1,313 @@ + +# SHOULD ONLY OCCUR IN ONE FILE +#common.dir = "/Users/jeremyliu1/galaxy-dist/tools/my_tools" + +# commonDir from region_motif_intersect.r sourcing file +dyn.load(paste(commonDir, "/region_motif_lib/regions.so",sep="")) + +##reg = matrix(cbind(from,to)) from<to +##region[[chr]] = reg +##pos = unique(integer()) +##poslist = list(chr,pos, optional(strand=c(-1,0,+1))) + +# USED +merge.reg <- function(...,sep=1) { + ##This function returns union of regs. + reg = rbind(...) + x=.C("merge_regions",as.integer(t(reg)),as.integer(nrow(reg)),as.integer(sep)) + reg=matrix(x[[1]][1:(x[[2]]*2)],ncol=2,byrow=TRUE) + reg = matrix(reg[which(reg[,2]>reg[,1]),],ncol=2) + reg[which(reg==0)]=1 + return(reg) +} + +merge.regions<-function(...,sep=1) { + ##This function returns union of regions. + regions=list(...) + chrs = unique(unlist(lapply(regions,names),use.names=F)) + region = list() + for(chr in chrs) { + region[[chr]] = do.call("merge.reg",c(lapply(regions,function(i) i[[chr]]),sep=sep)) + } + return(region) +} + +plot.reg<-function(reg,xlim=NULL,y=NULL,vertical=FALSE,...) { + ##This function does not stack if reg is overlapping. + ##new plot is made unless y is specified. + if(nrow(reg)==0) return() + if(is.null(xlim)) xlim=range(reg) + if(is.null(y)) { + plot(xlim,c(0,1),type="n",axes=FALSE,xlab=" ",ylab=" ") + y=0.5 + } + segments(reg[,1],y,reg[,2],...) + if(vertical) abline(v=reg) +} + +distance.to.closest.reg.of.reg <- function(reg,reg2) { + ##for each element of reg, what is the closest distance to any element of reg2? + reg2 = merge.reg(reg2) + reg2 = c(-Inf,t(reg2),Inf) + s=reg[,1] + e=reg[,2] + sbin = as.integer(cut(s,reg2)) + ebin = as.integer(cut(e,reg2)) + d = pmin(s-reg2[sbin], reg2[sbin+1]-s, e-reg2[ebin], reg2[ebin+1]-e) + d[which(sbin!=ebin | sbin%%2==0)] = 0 + return(d) +} + +# USED +distance.to.closest.reg.of.pos <- function(pos,reg) { + ##for each element of pos, what is the closest distance to any element of reg? + reg = merge.reg(reg) + reg = c(-Inf,t(reg),Inf) + pbin = as.integer(cut(pos,reg)) + d = pmin(pos-reg[pbin], reg[pbin+1]-pos) + d[which(pbin%%2==0)] = 0 + return(d) +} + +distance.to.closest.pos.of.reg <- function(reg,pos,pos.strand=NULL,index.return=FALSE) { + ##for each element of reg, what is the closest distance to any element of pos? + ##if strand is provided, distance is along strand + o = order(pos) + pos = c(-Inf,pos[o],Inf) + o = c(o[1],o,o[length(o)]) + + s=reg[,1] + e=reg[,2] + sbin = as.integer(cut(s,pos)) + ebin = as.integer(cut(e,pos)) + + d=integer(nrow(reg)) + s.is.closer = s-pos[sbin] < pos[sbin+1]-e + if(index.return) { + return(ifelse(s.is.closer,o[sbin],o[sbin+1])) + } + d = ifelse(s.is.closer, s-pos[sbin], e-pos[sbin+1]) + d[which(sbin!=ebin)] = 0 + if(!is.null(pos.strand)) { + reg.strand = ifelse(s.is.closer,pos.strand[o][sbin],pos.strand[o][sbin+1]) + d = d * reg.strand + } + return(d) +} + +if(F) { + pos = sample(seq(0,1000,200)) + pos2 = sample(seq(10,1010,100)) + pos.strand = sample(c(1,-1),6,replace = T) + pos2.strand = sample(c(1,-1),11,replace = T) +} + +distance.to.closest.pos.of.pos <- function(pos,pos2,pos.strand=NULL,pos2.strand=NULL, ignore.pos.strand=TRUE,index.return=FALSE) { + ##for each element of pos, what is the closest distance to any element of pos2? + ##if index.return==TRUE, index of pos2 closest to pos is returned + ##else if strand2 is provided, distance is along strand2 + ##if strand and strand2 are both provided and !ignore.pos.strand + ## then output is a list giving plus.up, plus.down, minus.up, minus.down + ## plus.up: distance to closest upstream on the same same strand etc. etc. + o = order(pos2) + pos2 = c(-Inf,pos2[o],Inf) + if(!is.null(pos2.strand)) pos2.strand = c(-Inf,pos2.strand[o],Inf) + + if(is.null(pos2.strand) | is.null(pos.strand) | ignore.pos.strand) { + pbin = as.integer(cut(pos,pos2)) + + pbin = ifelse(pos-pos2[pbin] < pos2[pbin+1]-pos,pbin,pbin+1) + d = pos-pos2[pbin] + if(!is.null(pos2.strand)) d = d * pos2.strand[pbin] + + if(index.return) return(o[pbin-1]) + return(d) + } + strands = list(plus=1,minus=-1) + relcoords = list(up=0,down=1) + ind = lapply(strands,function(strand) { + ind.p = c(1,which(pos2.strand==strand),length(pos2)) + pbin.p = cut(pos,pos2[ind.p],labels=FALSE) + as.data.frame(lapply(relcoords,function(i) ind.p[pbin.p+i])) + }) + ind.temp = ind + ind.minus = which(pos.strand==-1) + if(length(ind.minus)>0) { + ind[[1]][ind.minus,]=ind.temp[[2]][ind.minus,2:1] + ind[[2]][ind.minus,]=ind.temp[[1]][ind.minus,2:1] + } + ind = unlist(ind,recursive=FALSE) + if(index.return) { + return( lapply(ind,function(i) { + i[which(i==1)]=NA + i[which(i==length(pos2))]=NA + o[i-1] + }) ) + } + return(lapply(ind,function(i) pos.strand*(pos2[i]-pos))) +} + +distance.to.closest.region.of.region <- function(region,region2) { + ##for each element of region[[chr]], what is the closest distance to any element of region2[[chr]]? + ##returns d[[chr]] + chrs = names(region) + d=list() + for(chr in chrs) { + if(is.null(region2[[chr]])) { + d[[chr]] = rep(Inf,nrow(region[[chr]])) + } else { + d[[chr]] = distance.to.closest.reg.of.reg(region[[chr]],region2[[chr]]) + } + } + return(d) +} + +# USED +distance.to.closest.region.of.poslist <- function(poslist,region) { + ##for each element of poslist, what is the closest distance to any element of region? + chrs = names(table(poslist$chr)) + d=integer() + for(chr in chrs) { + ind = which(poslist$chr==chr) + pos=poslist$pos[ind] + if(is.null(region[[chr]])) { + d[ind] = Inf + } else { + d[ind] = distance.to.closest.reg.of.pos(pos,region[[chr]]) + } + } + return(d) +} +distance.to.closest.poslist.of.region <- function(region,poslist,index.return=FALSE) { + ##for each element of region, what is the closest distance to any element of poslist? + chrs = names(region) + d=list() + for(chr in chrs) { + ind = which(poslist$chr==chr) + pos=poslist$pos[ind] + pos.strand=poslist$strand[ind] + d[[chr]] = distance.to.closest.pos.of.reg(region[[chr]],pos,pos.strand,index.return=index.return) + if(index.return) d[[chr]] = ind[d[[chr]]] + } + return(d) +} + +distance.to.closest.poslist.of.poslist <- function(poslist,poslist2,ignore.poslist.strand=TRUE,index.return=FALSE) { + ##for each element of poslist, what is the closest distance to any element of poslist2? + ##if poslist2$strand is provided, distance is along strand2 + ##if strand and strand2 are provided and no ignore.poslist.strand + ## then output is a list giving plus.up, plus.down, minus.up, minus.down + ## plus.up: distance to closest upstream on the same same strand etc. etc. + ##if index.return==TRUE, index of pos2 closest to pos is returned + + chrs = names(table(poslist$chr)) + + d=integer() + stranded = !(is.null(poslist2$strand) | is.null(poslist$strand) | ignore.poslist.strand) + if(stranded) { + brs = c("plus.up","plus.down","minus.up","minus.down") + d=list() + for(br in brs) d[[br]]=integer() + } + + for(chr in chrs) { + ind = which(poslist$chr==chr) + ind2 = which(poslist2$chr==chr) + pos=poslist$pos[ind] + pos2=poslist2$pos[ind2] + pos.strand=poslist$strand[ind] + pos2.strand=poslist2$strand[ind2] + if(!stranded) { + d[ind] = distance.to.closest.pos.of.pos(pos,pos2,pos.strand,pos2.strand,ignore.poslist.strand,index.return=index.return) + if(index.return) d[ind] = ind2[d[ind]] + } else { + x = distance.to.closest.pos.of.pos(pos,pos2,pos.strand,pos2.strand,ignore.poslist.strand) + for(br in brs) { + d[[br]][ind] = x[[br]] + if(index.return) d[[br]][ind] = ind2[d[[br]][ind]] + } + } + } + return(d) +} + + +reg.minus.reg <- function(reg,reg2) { + x = .C("region_minus_region",as.integer(t(reg)),as.integer(nrow(reg)),as.integer(t(reg2)),as.integer(nrow(reg2)),integer((nrow(reg)+nrow(reg2))*2))[[5]] + x=x[which(x>=0)] + return(matrix(x,ncol=2,byrow=TRUE)) +} + +intersection.of.regs <- function(reg,reg2) { + x = .C("intersection_of_regions",as.integer(t(reg)),as.integer(nrow(reg)),as.integer(t(reg2)),as.integer(nrow(reg2)),integer((nrow(reg)+nrow(reg2))*2))[[5]] + x=x[which(x>=0)] + return(matrix(x,ncol=2,byrow=TRUE)) +} + +region.minus.region<-function(region,region2) { + chrs = names(region) + for(chr in chrs) { + if(is.null(region[[chr]])) next + if(!is.null(region2[[chr]])) { + region[[chr]] = reg.minus.reg(region[[chr]],region2[[chr]]) + } + } + return(region) +} + +intersection.of.regions<-function(region,region2) { + chrs = names(region) + for(chr in chrs) { + if(is.null(region2[[chr]])) { + region[[chr]]<-NULL + } else { + region[[chr]] = intersection.of.regs(region[[chr]],region2[[chr]]) + } + } + return(region) +} + +reg.around.pos <-function(pos,range=500,strand=NULL) { + if(length(range)==1) range=c(range,range) + if(is.null(strand)) strand = 1; + reg = cbind(pos-range[1]*strand,pos+range[2]*strand); + ind = which(reg[,2]<reg[,1]) + reg[ind,] = reg[ind,2:1] + ind = which(reg<=0) + reg[ind] = 1 + return(reg) +} + + +region.around.poslist <-function(poslist,range=500) { + chrs = names(table(poslist$chr)) + region=list() + for(chr in chrs) { + ind = which(poslist$chr==chr) + pos=poslist$pos[ind] + strand = 1 + if(!is.null(poslist$strand)) { + strand = poslist$strand[ind] + } + region[[chr]] = reg.around.pos(pos,range,strand) + } + return(region) +} + + +poslist.of.region.centers <-function(region) { + chrs = names(region) + n=sapply(region,nrow) + return(data.frame(chr=rep(chrs,n),pos=unlist(lapply(region,function(chr)(chr[,1]+chr[,2])/2),use.names = FALSE))) +} + +write.gff.region<-function(region,outfname) { + region = lapply(region,function(chr) list(s=chr[,1],e=chr[,2])) + out=unlist.chr(region) + out$chr=rep(names(region),sapply(region,function(i) length(i$s))) + empty=rep(".",length(out$chr)) + write.table(data.frame(out$chr,empty,empty,out$s,out$e,empty,empty,empty,empty),quote=FALSE,sep="\t",file=outfname,col.names=FALSE,row.names=FALSE) +} + +number.of.regions = function(region)sum(sapply(region,nrow)) +size.of.regions = function(region) sum(sapply(merge.regions(region),function(reg) sum(reg[,2]-reg[,1])))