# HG changeset patch
# User vandelj
# Date 1593178606 14400
# Node ID 3022feec50fed36d999b18f9cc352db1ef6b1901
"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
diff -r 000000000000 -r 3022feec50fe galaxy/wrappers/FormatForGSEA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy/wrappers/FormatForGSEA.xml Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,228 @@
+
+ Format input files for GSEA software
+
+
+
+
+
+ 1{nlines++} ARGIND==2 && FNR==1{print "\#1.2\n"nlines"\t"NF-1"\n";print "NAME\tDESCRIPTION"; for(i=2;i<=NF;i++)print"\t"\$i;print "\n"} ARGIND==2 && FNR>1{print \$1"\tna";for(i=2;i<=NF;i++)print "\t"\$i;print "\n"}' $mainCondition.expressionData $mainCondition.expressionData > $outExpression;
+
+ if [ ! -e $outExpression ]; then
+ printf "[ERROR]Expression file failed" >> $log;
+ exit 10;
+ fi
+ ;
+ awk -v factor="$mainCondition.factorToInclude" 'BEGIN{FS="\t";OFS="";ORS="";nameCond="";line="";cpt=0;lgt=0} ARGIND==1 && FNR==1{for(iCond=2;iCond<=NF;iCond++){conditionOrder[iCond-1]=\$iCond};cpt=NF-1} ARGIND==2 && FNR==1{for(i=1;i<=NF;i++)if(\$i==factor)factorInd=i} ARGIND==2 && FNR>1{valueFact[\$1]=\$factorInd} END{for(i=1;i<=cpt;i++){ line=line""valueFact[conditionOrder[i]]" "; if(dico[valueFact[conditionOrder[i]]]!=1){lgt++; nameCond=nameCond""valueFact[conditionOrder[i]]" ";dico[valueFact[conditionOrder[i]]]=1}};print cpt" "lgt" 1\n";print "\# "nameCond"\n";print line}' $mainCondition.expressionData $mainCondition.conditionInformation > $outPhenotypes;
+
+ if [ ! -e $outPhenotypes ]; then
+ printf "[ERROR]Phenotype file failed" >> $log;
+ exit 10;
+ fi
+ ;
+#else:
+
+ if [ \$(awk 'NR==1{print (NF-2)%5}' $mainCondition.differentialAnalysis ) -ne 0 ]; then
+ printf "[ERROR] please check that differential analysis file respect requested input format especially concerning the column number" >> $log;
+ exit 10;
+ fi
+ ;
+
+ awk -v comparison="$mainCondition.comparisonsToUse" -v rkIndice="$mainCondition.rankingIndice" -v pvalTh="$mainCondition.pvalThreshold" 'BEGIN{FS="\t"} NR==1{if(rkIndice=="Tstat" || rkIndice=="absTstat"){start=7}else{start=6};for(i=start;i<=NF;i=i+5){if(\$i==comparison)refCol=i};} NR>2{if(rkIndice=="Tstat" || rkIndice=="absTstat"){if(\$(refCol-3)0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}else{if(\$(refCol-2)0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}}' $mainCondition.differentialAnalysis > ./temp.txt;
+ printf "#NAME\tSCORE\n" > $outRankedGenes;
+ LC_ALL=C sort -t$'\t' -k2,2 -gr ./temp.txt >> $outRankedGenes;
+ rm ./temp.txt;
+
+ if [ ! -e $outRankedGenes ]; then
+ printf "[ERROR]Rank file failed" >> $log;
+ exit 10;
+ fi
+ ;
+#end if
+
+ printf "[INFO]End of tool script" >> $log;
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mainCondition['selection']=="classicGSEA"
+
+
+
+ mainCondition['selection']=="classicGSEA"
+
+
+
+ mainCondition['selection']=="rankedGSEA"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870}
+ }
+
+ @article {Subramanian15545,
+ author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.},
+ title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles},
+ volume = {102},
+ number = {43},
+ pages = {15545--15550},
+ year = {2005},
+ publisher = {National Academy of Sciences},
+ issn = {0027-8424},
+ journal = {Proceedings of the National Academy of Sciences}
+ }
+
+
+
diff -r 000000000000 -r 3022feec50fe src/ExprPlotsScript.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ExprPlotsScript.R Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,465 @@
+# A command-line interface to basic plots for use with Galaxy
+# written by Jimmy Vandel
+# one of these arguments is required:
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+#get options
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs()
+
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "dataFile", "i", 1, "character",
+ "factorInfo","t", 1, "character",
+ "dataFileFormat","j",1,"character",
+ "conditionNames","c",1,"character",
+ "format", "f", 1, "character",
+ "quiet", "q", 0, "logical",
+ "log", "l", 1, "character",
+ "histo" , "h", 1, "character",
+ "maPlot" , "a", 1, "character",
+ "boxplot" , "b", 1, "character",
+ "microarray" , "m", 1, "character",
+ "acp" , "p" , 1, "character",
+ "screePlot" , "s" , 1, "character"),
+ byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$dataFile) || is.null(opt$dataFileFormat)) {
+ addComment("[ERROR]'dataFile' and it format are required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$histo) & is.null(opt$maPlot) & is.null(opt$boxplot) & is.null(opt$microarray) & is.null(opt$acp)){
+ addComment("[ERROR]Select at least one plot to draw",T,opt$log)
+ q( "no", 1, F )
+}
+
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE}
+
+addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#silent package loading
+suppressPackageStartupMessages({
+ library("oligo")
+ library("ff")
+ library("ggplot2")
+ library("plotly")
+})
+
+
+#chargement des fichiers en entrée
+#fichier de type CEL
+dataAreFromCel=FALSE
+if(toupper(opt$dataFileFormat)=="CEL"){
+ dataAreFromCel=TRUE
+ celData=read.celfiles(unlist(strsplit(opt$dataFile,",")))
+ #load all expressions
+ dataMatrix=exprs(celData)
+ #select "pm" probes
+ probeInfo=getProbeInfo(celData,probeType = c("pm"),target="probeset")
+ #reduce dataMatrix to log expression matrix for a randomly probe selection
+ dataMatrix=log2(dataMatrix[sample(unique(probeInfo[,1]),min(100000,length(unique(probeInfo[,1])))),])
+ addComment("[INFO]Raw data are log2 transformed",TRUE,opt$log,display=FALSE)
+ remove(probeInfo)
+}else{
+ #fichier deja tabule
+ dataMatrix=read.csv(file=opt$dataFile,header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames (to avoid X before colnames with header=T)
+ colNamesData=dataMatrix[1,-1]
+ dataMatrix=dataMatrix[-1,]
+ #remove first colum to convert it as rownames
+ rowNamesData=dataMatrix[,1]
+ dataMatrix=dataMatrix[,-1]
+ if(is.data.frame(dataMatrix)){
+ dataMatrix=data.matrix(dataMatrix)
+ }else{
+ dataMatrix=data.matrix(as.numeric(dataMatrix))
+ }
+ dimnames(dataMatrix)=list(rowNamesData,colNamesData)
+ if(any(duplicated(rowNamesData)))addComment("[WARNING] several rows share the same probe/gene name, you should merge or rename them to avoid further analysis mistakes",TRUE,opt$log,display=FALSE)
+}
+
+addComment("[INFO]Input data loaded",TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Dim of data matrix:",dim(dataMatrix)),T,opt$log,display=FALSE)
+
+#get number of conditions
+nbConditions=ncol(dataMatrix)
+
+#get condition names if they are specified
+if(!is.null(opt$conditionNames) && length(opt$conditionNames)==nbConditions){
+ nameConditions=opt$conditionNames
+ colnames(dataMatrix)=nameConditions
+ #rownames(phenoData(celData)@data)=nameConditions
+ #rownames(protocolData(celData)@data)=nameConditions
+}else{
+ nameConditions=colnames(dataMatrix)
+}
+
+#create a correspondance table between plot file names and name displayed in figure legend and html items
+correspondanceNameTable=matrix("",ncol=2,nrow=nbConditions)
+correspondanceNameTable[,1]=paste("Condition",1:nbConditions,sep="")
+correspondanceNameTable[,2]=nameConditions
+rownames(correspondanceNameTable)=correspondanceNameTable[,2]
+
+addComment("[INFO]Retreive condition names",TRUE,opt$log,display=FALSE)
+
+if(!is.null(opt$factorInfo)){
+ #chargement du fichier des facteurs
+ factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames
+ colnames(factorInfoMatrix)=factorInfoMatrix[1,]
+ factorInfoMatrix=factorInfoMatrix[-1,]
+ #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+ rownames(factorInfoMatrix)=factorInfoMatrix[,1]
+
+
+ if(length(setdiff(colnames(dataMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in factor file",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #order sample as in expression matrix and remove spurious sample
+ factorInfoMatrix=factorInfoMatrix[colnames(dataMatrix),]
+
+ addComment("[INFO]Factors OK",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE)
+
+}
+
+addComment("[INFO]Ready to plot",T,opt$log,display=FALSE)
+
+
+##----------------------
+
+###plot histograms###
+histogramPerFigure=50
+if (!is.null(opt$histo)) {
+ for(iToPlot in 1:(((nbConditions-1)%/%histogramPerFigure)+1)){
+ firstPlot=1+histogramPerFigure*(iToPlot-1)
+ lastPlot=min(nbConditions,histogramPerFigure*iToPlot)
+ dataToPlot=data.frame(x=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix)))
+ p <- ggplot(data=dataToPlot, aes(x = x, color=Experiment)) + stat_density(geom="line", size=1, position="identity") +
+ ggtitle("Intensity densities") + theme_bw() + ylab(label="Density") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5))
+ if(dataAreFromCel){
+ #original ploting function
+ #hist(celData[,firstPlot:lastPlot],lty=rep(1,nbConditions)[firstPlot:lastPlot],lwd=2,which='pm',target="probeset",transfo=log2,col=rainbow(nbConditions)[firstPlot:lastPlot])
+ p <- p + xlab(label="Log2 intensities")
+ }else{
+ p <- p + xlab(label="Intensities")
+ }
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse=""))
+ }
+ print(p)
+ dev.off()
+ #save plotly files
+ pp <- ggplotly(p)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,iToPlot,".html"),collapse=""),selfcontained = F)
+ }
+ remove(p,dataToPlot)
+ addComment("[INFO]Histograms drawn",T,opt$log,display=FALSE)
+}
+
+##----------------------
+
+###plot MAplots###
+MAplotPerPage=4
+if (!is.null(opt$maPlot)) {
+ iToPlot=1
+ plotVector=list()
+ toTake=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix)))
+ refMedianColumn=rowMedians(as.matrix(dataMatrix[toTake,]))
+ if(length(toTake)>100000)addComment(c("[INFO]high number of input data rows ",length(toTake),"; the generation of MA plot can take a while, please be patient"),TRUE,opt$log,display=FALSE)
+ for (iCondition in 1:nbConditions){
+ #MAplot(celData,which=i,what=pm,transfo=log2)
+ #smoothScatter(x=xToPlot,y=yToPlot,main=nameConditions[iCondition])
+ dataA=dataMatrix[toTake,iCondition]
+ dataB=refMedianColumn####ATTENTION PAR DEFAUT
+ xToPlot=0.5*(dataA+dataB)
+ yToPlot=dataA-dataB
+ tempX=seq(min(xToPlot),max(xToPlot),0.1)
+ tempY=unlist(lapply(tempX,function(x){median(yToPlot[intersect(which(xToPlot>=(x-0.1/2)),which(xToPlot<(x+0.1/2)))])}))
+
+ dataToPlot=data.frame(x=xToPlot,y=yToPlot)
+ dataMedianToPlot=data.frame(x=tempX,y=tempY)
+ p <- ggplot(data=dataToPlot, aes(x,y)) + stat_density2d(aes(fill = ..density..^0.25), geom = "tile", contour = FALSE, n = 100) +
+ scale_fill_continuous(low = "white", high = "dodgerblue4") + geom_smooth(data=dataMedianToPlot,colour="red", size=0.5, se=FALSE) +
+ ggtitle(correspondanceNameTable[iCondition,2]) + theme_bw() + xlab(label="") + ylab(label="") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none")
+ plotVector[[length(plotVector)+1]]=p
+
+ #save plotly files
+ pp <- ggplotly(p)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$maPlot,"_",correspondanceNameTable[iCondition,1],".html"),collapse=""),selfcontained = F)
+
+ if(iCondition==nbConditions || length(plotVector)==MAplotPerPage){
+ #define a new plotting file
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$maPlot,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$maPlot,iToPlot,".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=2)
+ dev.off()
+ if(iCondition0)c(sample(c(x),min(length(x),1000)),max(c(x)),min(c(x))))
+ dataOutliers=data.frame(yVal=unlist(boxplotsOutliers),xVal=unlist(lapply(seq_along(boxplotsOutliers),function(x)rep(names(boxplotsOutliers)[x],length(boxplotsOutliers[[x]])))))
+ #plot boxplots without outliers
+ p <- ggplot(data=dataToPlot, aes(y = intensities, x=Experiment ,color=Experiment)) + geom_boxplot(outlier.colour=NA,outlier.shape =NA) +
+ ggtitle("Intensities") + theme_bw() + xlab(label="") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 45, hjust = 1),plot.margin=unit(c(10,10,max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x))))),15+max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x)))))),"mm"))
+ #add to plot sampled outliers
+ p <- p + geom_point(data=dataOutliers,aes(x=xVal,y=yVal,color=xVal),inherit.aes = F)
+ if(dataAreFromCel){
+ #original plotting function
+ #boxplot(celData[,firstPlot:lastPlot],which='pm',col=rainbow(nbConditions)[firstPlot:lastPlot],target="probeset",transfo=log2,names=nameConditions[firstPlot:lastPlot],main="Intensities")
+ p <- p + ylab(label="Log2 intensities")
+ }else{
+ p <- p + ylab(label="Intensities")
+ }
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$boxplot,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$boxplot,iToPlot,".png"),collapse=""))
+ }
+ print(p)
+ dev.off()
+ #save plotly files
+ pp <- ggplotly(p)
+
+ #modify plotly object to get HTML file not too heavy for loading
+ for(iData in 1:length(pp$x$data)){
+ ##get kept outliers y values
+ #yPointsToKeep=dataOutliers$yVal[which(dataOutliers$xVal==pp$x$data[[iData]]$name)]
+ if(pp$x$data[[iData]]$type=="scatter"){
+ ##scatter plot represent outliers points added to boxplot through geom_point
+ ##nothing to do as outliers have been sampled allready, just have to modify hover text
+ #if(length(yPointsToKeep)>0){
+ #pointsToKeep=which(pp$x$data[[iData]]$y %in% yPointsToKeep)
+ #pp$x$data[[iData]]$x=pp$x$data[[iData]]$x[pointsToKeep]
+ #pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[pointsToKeep]
+ #pp$x$data[[iData]]$text=pp$x$data[[iData]]$text[pointsToKeep]
+ #}else{
+ #pp$x$data[[iData]]$x=NULL
+ #pp$x$data[[iData]]$y=NULL
+ #pp$x$data[[iData]]$marker$opacity=0
+ #pp$x$data[[iData]]$hoverinfo=NULL
+ #pp$x$data[[iData]]$text=NULL
+ #}
+ #modify text to display
+ if(dataAreFromCel){
+ pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("log2(intensity) ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = ""))))
+ }else{
+ pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("intensity ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = ""))))
+ }
+ }else{
+ ##disable marker plotting to keep only box and whiskers plot (outliers are displayed through scatter plot)
+ pp$x$data[[iData]]$marker$opacity=0
+
+ #sample 50000 points amongst all data to get a lighter html file, sampling size should not be too low to avoid modifying limit of boxplots
+ pp$x$data[[iData]]$y=c(sample(dataMatrix[,pp$x$data[[iData]]$name],min(length(dataMatrix[,pp$x$data[[iData]]$name]),50000)),min(dataMatrix[,pp$x$data[[iData]]$name]),max(dataMatrix[,pp$x$data[[iData]]$name]))
+ pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y))
+
+ ##first remove outliers info
+ #downUpValues=boxplot.stats(dataMatrix[,pp$x$data[[iData]]$name])$stats
+ #if(verbose)addComment(c("filter values for boxplot",pp$x$data[[iData]]$name,"between",min(downUpValues),"and",max(downUpValues)),T,opt$log)
+ #pointsToRemove=which(pp$x$data[[iData]]$y0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove]
+ #pointsToRemove=which(pp$x$data[[iData]]$y>max(downUpValues))
+ #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove]
+ #then add sampled outliers info
+ #pp$x$data[[iData]]$y=c(yPointsToKeep,pp$x$data[[iData]]$y)
+ #pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y))
+ }
+ }
+
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$boxplot,iToPlot,".html"),collapse=""),selfcontained = F)
+ }
+ remove(p,dataToPlot)
+ addComment("[INFO]Boxplots drawn",T,opt$log,display=FALSE)
+
+}
+
+##----------------------
+
+###plot microarrays (only for .CEL files)###
+if (!is.null(opt$microarray) && dataAreFromCel) {
+ for (iCondition in 1:nbConditions){
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".pdf"),collapse=""),onefile = F,width = 5,height = 5)}else{
+ png(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".png"),collapse=""))
+ }
+ image(celData[,iCondition],main=correspondanceNameTable[iCondition,2])
+ dev.off()
+ }
+ addComment("[INFO]Microarray drawn",T,opt$log,display=FALSE)
+}
+
+##----------------------
+
+###plot PCA plot###
+if (!is.null(opt$acp)){
+ ##to avoid error when nrow is too large, results quite stable with 200k random selected rows
+ randomSelection=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix)))
+ #remove constant variables
+
+ dataFiltered=dataMatrix[randomSelection,]
+ toRemove=which(unlist(apply(dataFiltered,1,var))==0)
+ if(length(toRemove)>0){
+ dataFiltered=dataFiltered[-toRemove,]
+ }
+ ##geom_text(aes(label=Experiments,hjust=1, vjust=1.3), y = PC2+0.01)
+ PACres = prcomp(t(dataFiltered),scale.=TRUE)
+
+ if(!is.null(opt$screePlot)){
+ #scree plot
+ #p <- fviz_eig(PACres)
+ dataToPlot=data.frame(compo=seq(1,length(PACres$sdev)),var=(PACres$sdev^2/sum(PACres$sdev^2))*100)
+ p<-ggplot(data=dataToPlot, aes(x=compo, y=var)) + geom_bar(stat="identity", fill="steelblue") + geom_line() + geom_point() +
+ ggtitle("Scree plot") + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) +
+ xlab(label="Dimensions") + ylab(label="% explained variances") + scale_x_discrete(limits=dataToPlot$compo)
+ pp <- ggplotly(p)
+
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$screePlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$screePlot,".png"),collapse=""))
+ }
+ plot(p)
+ dev.off()
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$screePlot,".html"),collapse=""),selfcontained = F)
+ }
+
+ #now plot pca plots
+
+ if(!is.null(opt$factorInfo)){
+ fileIdent=""
+ symbolset = c("circle","cross","square","diamond","circle-open","square-open","diamond-open","x")
+
+ #save equivalence between real factor names and generic ones in correspondanceNameTable
+ correspondanceNameTable=rbind(correspondanceNameTable,matrix(c(paste("Factor",1:(ncol(factorInfoMatrix)-1),sep=""),colnames(factorInfoMatrix)[-1]),ncol=2,nrow=ncol(factorInfoMatrix)-1))
+ rownames(correspondanceNameTable)=correspondanceNameTable[,2]
+
+ #first order factors from decreasing groups number
+ orderedFactors=colnames(factorInfoMatrix)[-1][order(unlist(lapply(colnames(factorInfoMatrix)[-1],function(x)length(table(factorInfoMatrix[,x])))),decreasing = T)]
+ allFactorsBigger=length(table(factorInfoMatrix[,orderedFactors[length(orderedFactors)]]))>length(symbolset)
+ if(allFactorsBigger)addComment("[WARNING]All factors are composed of too many groups to display two factors at same time, each PCA plot will display only one factor groups",T,opt$log,display=FALSE)
+ for(iFactor in 1:length(orderedFactors)){
+ #if it is the last factor of the list or if all factor
+ if(iFactor==length(orderedFactors) || allFactorsBigger){
+ if(length(orderedFactors)==1 || allFactorsBigger){
+ dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=","))))
+ p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>%
+ layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")),
+ legend=list(font = list(family = "sans-serif",size = 15,color = "#000")))
+ fileIdent=correspondanceNameTable[orderedFactors[iFactor],1]
+ #add text label to plot
+ ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black'))
+ #save the plotly plot
+ htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F)
+ }
+ }else{
+ for(iFactorBis in (iFactor+1):length(orderedFactors)){
+ if(length(table(factorInfoMatrix[,orderedFactors[iFactorBis]]))<=length(symbolset)){
+ dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], Attribute2=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactorBis]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=","))))
+ p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),symbol=~Attribute2,symbols = symbolset,hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>%
+ layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")),
+ legend=list(font = list(family = "sans-serif",size = 15,color = "#000")))
+ fileIdent=paste(correspondanceNameTable[orderedFactors[c(iFactor,iFactorBis)],1],collapse="_AND_")
+ #add text label to plot
+ ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black'))
+ #save the plotly plot
+ htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F)
+ }else{
+ addComment(c("[WARNING]PCA with",orderedFactors[iFactor],"and",orderedFactors[iFactorBis],"groups cannot be displayed, too many groups (max",length(symbolset),")"),T,opt$log,display=FALSE)
+ }
+ }
+ }
+ }
+ }else{
+ dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x))
+ p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers",marker=list(size=5,color="salmon"),hoverinfo = 'text',text = ~paste(Experiments))%>%
+ layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")),
+ legend=list(font = list(family = "sans-serif",size = 15,color = "#000")))
+ ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black'))
+
+ #save plotly files
+ htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_plot.html"),collapse=""),selfcontained = F)
+ }
+ remove(p,dataToPlot,dataFiltered)
+ addComment("[INFO]ACP plot drawn",T,opt$log,display=FALSE)
+}
+
+#write correspondances between plot file names and displayed names in figure legends, usefull to define html items in xml file
+write.table(correspondanceNameTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+end.time <- Sys.time()
+addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE)
+
+addComment("[INFO]End of R script",T,opt$log,display=FALSE)
+
+printSessionInfo(opt$log)
+#sessionInfo()
diff -r 000000000000 -r 3022feec50fe src/General_functions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/General_functions.py Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,206 @@
+import re
+import numpy as np
+
+def get_column_names( file_path, toNotConsider=-1, each=1):
+ options=[]
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ cpt=0
+ for i, field_component in enumerate( firstLine ):
+ if i!=toNotConsider:#to squeeze the first column
+ if cpt==0:
+ options.append( ( field_component, field_component, False ) )
+ cpt+=1
+ if cpt==each:
+ cpt=0
+ inputfile.close()
+ return options
+
+def get_column_names_filteredList( file_path, toNotConsider=[], each=1):
+ options=[]
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ cpt=0
+ for i, field_component in enumerate( firstLine ):
+ if i not in toNotConsider:#to squeeze the first columns
+ if cpt==0:
+ options.append( ( field_component, field_component, False ) )
+ cpt+=1
+ if cpt==each:
+ cpt=0
+ inputfile.close()
+ return options
+
+def get_column_names_mergeNumber(file_path, numberToMerge=1, toNotConsider=[]):
+ options=[]
+ inputfile = open(file_path)
+ if int(numberToMerge)>0:
+ iHeader=0
+ for iCurrentLine in inputfile:
+ iHeader=iHeader+1
+ if iHeader>int(numberToMerge):
+ break
+ currentLine=iCurrentLine.strip().split("\t")
+ iOption=-1
+ for i, field_component in enumerate( currentLine ):
+ if i not in toNotConsider:#to squeeze specified columns
+ iOption=iOption+1
+ if iHeader==1:
+ options.append( ( str(field_component), str(field_component), False ) )
+ else:
+ options[iOption]=(options[iOption][0]+"_"+str(field_component),options[iOption][1]+"_"+str(field_component),False)
+ else:
+ currentLine = next(inputfile).strip().split("\t")
+ for i, field_component in enumerate( currentLine ):
+ if i not in toNotConsider:#to squeeze specified columns
+ options.append( ( "Column_"+str(i), "Column_"+str(i), False ) )
+ inputfile.close()
+ return options
+
+def get_row_names( file_path, factorName ):
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ iColumn=-1
+ for i, field_component in enumerate( firstLine ):
+ if field_component==factorName:#to test
+ iColumn=i
+ options=[]
+ if iColumn!=-1:
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ if (nextLine[iColumn], nextLine[iColumn], False) not in options:
+ options.append( (nextLine[iColumn], nextLine[iColumn], False) )
+ inputfile.close()
+ return options
+
+def get_condition_file_names( file_list, toNotConsider=-1, each=1):
+ options=[]
+ if not isinstance(file_list,list):#if input file is a tabular file, act as get_column_names
+ inputfile = open(file_list.file_name)
+ firstLine = next(inputfile).strip().split("\t")
+ cpt=0
+ for i, field_component in enumerate( firstLine ):
+ if i!=toNotConsider:#to squeeze the first column
+ if cpt==0:
+ options.append( ( field_component, field_component, False ) )
+ cpt+=1
+ if cpt==each:
+ cpt=0
+ inputfile.close()
+ else:#if input file is a .cel file list or a collection
+ if not hasattr(file_list[0],'collection'):#if it is not a collection, get name easily
+ for i, field_component in enumerate( file_list ):
+ options.append( ( field_component.name, field_component.name, False ) )
+ else:#if the file is a collection, have to get deeper in the corresponding HistoryDatasetCollectionAssociation object
+ for i, field_component in enumerate( file_list[0].collection.elements ):
+ options.append( ( field_component.element_identifier, field_component.element_identifier, False ) )
+ return options
+
+def generateFactorFile( file_list, factor_list, outputFileName, logFile):
+ forbidenCharacters={"*",":",",","|"}
+ outputfile = open(outputFileName, 'w')
+ outputLog = open(logFile, 'w')
+ sampleList=[]
+ if not isinstance(file_list,list):
+ conditionNames=get_condition_file_names(file_list,0) #unique expression file, remove the first column (index=0)
+ else :
+ conditionNames=get_condition_file_names(file_list) #.CEL files
+ for iSample, sample_component in enumerate (conditionNames):
+ sampleList.append(str(sample_component[1]))
+ outputLog.write("[INFO] "+str(len(sampleList))+" sample are detected as input\n")
+ globalDict=dict()
+ factorNameList=[]
+ firstLine="Conditions"
+ if len(factor_list)==0:#check if there is at least one factor available
+ outputLog.write("[ERROR] no factor was defined !\n")
+ return 1
+ else:
+ for iFactor, factor_component in enumerate( factor_list ):
+ currentSampleList=list(sampleList)
+ currentFactor=str(factor_component['factorName'])
+ #check if factor name contains forbidden characters
+ for specialCharacter in forbidenCharacters:
+ if currentFactor.find(specialCharacter)!=-1:
+ outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in factor name : '"+currentFactor+"'\n")
+ return 4
+ #check if factor allready named like that
+ if not globalDict.get(currentFactor) is None:
+ outputLog.write("[ERROR] '"+currentFactor+"' is used several times as factor name\n")
+ return 3
+ globalDict[currentFactor]=dict()
+ firstLine=firstLine+"\t"+currentFactor
+ factorNameList.append(currentFactor)
+ if len(factor_component['valueList'])<=1:#check if there is at least two value available
+ outputLog.write("[ERROR] at least two different values are necessary for '"+currentFactor+"' factor\n")
+ return 1
+ else:
+ for iValue, value_component in enumerate( factor_component['valueList'] ):
+ currentValue=str(value_component['valueName'])
+ #check if factor name contains forbidden characters
+ for specialCharacter in forbidenCharacters:
+ if currentValue.find(specialCharacter)!=-1:
+ outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in value name : '"+currentValue+"'\n")
+ return 4
+ currentSample=str(value_component['valueConditions']).split(",")
+ for iSample, sample_component in enumerate (currentSample):
+ if not sample_component in currentSampleList:
+ outputLog.write("[ERROR] sample "+sample_component+" was assigned several times for factor '"+currentFactor+"'\n")
+ return 2
+ currentSampleList.remove(sample_component)
+ globalDict[currentFactor][sample_component]=currentValue
+ if(len(currentSampleList)>0):
+ outputLog.write("[ERROR] for factor '"+currentFactor+"'' sample "+str(currentSampleList)+" are not assigned to any value\n")
+ return 2
+ outputLog.write("[INFO] "+str(len(globalDict))+" factors are detected\n")
+ #start writing the factor file
+ outputfile.write(firstLine+"\n")
+ for iSample, sample_component in enumerate(sampleList):
+ newLine=sample_component
+ for iFactor, factor_component in enumerate(factorNameList):
+ newLine=newLine+"\t"+globalDict[factor_component][sample_component]
+ outputfile.write(newLine+"\n")
+ outputfile.close()
+ outputLog.close()
+ return 0
+
+def selectSubSetTable(file_path,headerLine_number,columnsToAdd,columnNamesToKeep,outputFileName,logFile):
+ outputLog = open(logFile, 'w')
+ outputLog.write("[INFO] header line number : "+ headerLine_number+" lines\n")
+ availableColumnsTuple=get_column_names_mergeNumber(file_path, headerLine_number)
+ #convert tuple list as a simple array
+ availableColumns=[]
+ for iTuple, tuple_content in enumerate (availableColumnsTuple):
+ availableColumns.append(str(tuple_content[0]))
+ if len(availableColumns)==0:
+ outputLog.write("[ERROR] No detected columns in input file\n")
+ return 1
+ selectedColumns=list(columnsToAdd)
+ for iVolcano, volcano_content in enumerate(columnNamesToKeep):
+ selectedColumns.append(availableColumns.index(volcano_content['pvalColumn']))
+ if volcano_content['fdrColumn'] in availableColumns:
+ selectedColumns.append(availableColumns.index(volcano_content['fdrColumn']))
+ else:
+ selectedColumns.append(0)
+ selectedColumns.append(availableColumns.index(volcano_content['fcColumn']))
+ if len(selectedColumns)!=(3*len(columnNamesToKeep)+len(columnsToAdd)):
+ outputLog.write("[ERROR] matching between input file colnames and requested column names failed\n")
+ return 1
+ outputLog.write("[INFO] columns kept : "+str(selectedColumns)+"\n")
+ #start writting formatted file
+ inputfile = open(file_path)
+ outputfile = open(outputFileName, 'w')
+ iLineCpt=-1
+ for iCurrentLine in inputfile:
+ iLineCpt=iLineCpt+1
+ if iLineCpt>=int(headerLine_number):
+ currentLineFields=np.array(iCurrentLine.strip().split("\t"))
+ newLine="\t".join(currentLineFields[selectedColumns])
+ outputfile.write(newLine+"\n")
+ if iLineCpt1:
+ if (nextLine[iColumn], nextLine[iColumn], False) not in options:
+ options.append( (nextLine[iColumn], nextLine[iColumn], False) )
+ inputfile.close()
+ return options
+
+def get_row_names_interaction( file_path, factorNameA, factorNameB ):
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ iColumnA=-1
+ iColumnB=-1
+ for i, field_component in enumerate( firstLine ):
+ if field_component==factorNameA:#to test
+ iColumnA=i
+ if field_component==factorNameB:#to test
+ iColumnB=i
+ possibleValuesA=[]
+ possibleValuesB=[]
+ if iColumnA!=-1 and iColumnB!=-1:
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ if nextLine[iColumnA] not in possibleValuesA:
+ possibleValuesA.append(nextLine[iColumnA])
+ if nextLine[iColumnB] not in possibleValuesB:
+ possibleValuesB.append(nextLine[iColumnB])
+ inputfile.close()
+ options=[]
+ if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None":
+ for counterA in range(len(possibleValuesA)):
+ for counterB in range(len(possibleValuesB)):
+ options.append( (possibleValuesA[counterA]+"*"+possibleValuesB[counterB], possibleValuesA[counterA]+"*"+possibleValuesB[counterB], False) )
+ return options
+
+def get_comparisonsA( factorA, valuesA ):
+ options=[]
+ formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA))
+ possibleValues=formatValuesA.split("', u'")
+ if len(possibleValues)>=2:
+ for counter in range(len(possibleValues)-1):
+ for innerCounter in range(counter+1,len(possibleValues)):
+ options.append( (possibleValues[counter]+" - "+possibleValues[innerCounter], possibleValues[counter]+" - "+possibleValues[innerCounter], False) )
+ options.append( (possibleValues[innerCounter]+" - "+possibleValues[counter], possibleValues[innerCounter]+" - "+possibleValues[counter], False) )
+ return options
+
+def get_comparisonsAB(factorA, valuesA, factorB, valuesB, interaction):
+ options=[]
+ formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA))
+ possibleValuesA=formatValuesA.split("', u'")
+ formatValuesB=re.sub("(^\[u')|('\]$)","", str(valuesB))
+ possibleValuesB=formatValuesB.split("', u'")
+ if str(interaction)=="False":
+ if len(possibleValuesA)>=2:
+ for counter in range(len(possibleValuesA)-1):
+ for innerCounter in range(counter+1,len(possibleValuesA)):
+ options.append( (possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], False) )
+ options.append( (possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], False) )
+ if len(possibleValuesB)>=2:
+ for counter in range(len(possibleValuesB)-1):
+ for innerCounter in range(counter+1,len(possibleValuesB)):
+ options.append( (possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], False) )
+ options.append( (possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], False) )
+ else:
+ if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None":
+ for counterA in range(len(possibleValuesA)):
+ for innerCounterA in range(len(possibleValuesA)):
+ for counterB in range(len(possibleValuesB)):
+ for innerCounterB in range(len(possibleValuesB)):
+ if not(counterA==innerCounterA and counterB==innerCounterB):
+ options.append( ("("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")","("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")", False) )
+ return options
+
+def get_row_names_allInteractions( file_path, factorSelected):
+ formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected))
+ factorsList=formatFactors.split("', u'")
+ iColumn=[None] * len(factorsList)
+ valuesList=[None] * len(factorsList)
+
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ for iField, fieldComponent in enumerate( firstLine ):
+ for iFactor, factorComponent in enumerate(factorsList):
+ if fieldComponent==factorComponent:
+ iColumn[iFactor]=iField
+ valuesList[iFactor]=[]
+
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ for iFactor, factorComponent in enumerate(factorsList):
+ if nextLine[iColumn[iFactor]] not in valuesList[iFactor]:
+ valuesList[iFactor].append(nextLine[iColumn[iFactor]])
+ inputfile.close()
+
+ allCombinations=[]
+ for iFactor, factorComponent in enumerate(factorsList):
+ if iFactor==0:
+ allCombinations=valuesList[iFactor]
+ else:
+ currentCombinations=allCombinations
+ allCombinations=[]
+ for iValue, valueComponent in enumerate(valuesList[iFactor]):
+ for iCombination, combination in enumerate(currentCombinations):
+ allCombinations.append(combination+"*"+valueComponent)
+
+ options=[]
+ for iCombination, combination in enumerate(allCombinations):
+ options.append((combination,combination,False))
+
+ return options
+
+def get_allrow_names( file_path, factorSelected ):
+ formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected))
+ factorsList=formatFactors.split("', u'")
+ iColumn=[None] * len(factorsList)
+ valuesList=[None] * len(factorsList)
+
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ for iField, fieldComponent in enumerate( firstLine ):
+ for iFactor, factorComponent in enumerate(factorsList):
+ if fieldComponent==factorComponent:
+ iColumn[iFactor]=iField
+ valuesList[iFactor]=[]
+
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ for iFactor, factorComponent in enumerate(factorsList):
+ if nextLine[iColumn[iFactor]] not in valuesList[iFactor]:
+ valuesList[iFactor].append(nextLine[iColumn[iFactor]])
+ inputfile.close()
+
+ allValues=[]
+ for iFactor, factorComponent in enumerate(factorsList):
+ for iValue, valueComponent in enumerate(valuesList[iFactor]):
+ allValues.append(factorComponent+":"+valueComponent)
+
+ options=[]
+ for iValue, valueComponent in enumerate(allValues):
+ options.append((valueComponent,valueComponent,False))
+
+ return options
+
+def replaceNamesInFiles(expressionFile_name,conditionFile_name,outputExpressionFile,outputConditionFile,ouputDictionnary):
+ dico={}
+ forbidenCharacters={"*",":",",","|"}
+ ##start with expression file, read only the first line
+ inputfile = open(expressionFile_name)
+ outputfile = open(outputExpressionFile, 'w')
+ firstLine = next(inputfile).rstrip().split("\t")
+ iCondition=1
+ newFirstLine=""
+ for i, field_component in enumerate( firstLine ):
+ if (i>0):
+ #conditions names should not be redundant with other conditions
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newFirstLine+="\t"+"Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ raise NameError('condition name allready exists!')
+ else:
+ newFirstLine+=field_component
+ outputfile.write(newFirstLine+"\n")
+ for line in inputfile:
+ outputfile.write(line)
+ outputfile.close()
+ inputfile.close()
+ #then parse condition file, read all lines in this case
+ inputfile = open(conditionFile_name)
+ outputfile = open(outputConditionFile, 'w')
+ firstLine=1
+ iFactor=1
+ iValue=1
+ for line in inputfile:
+ currentLine = line.rstrip().split("\t")
+ newCurrentLine=""
+ for i, field_component in enumerate( currentLine ):
+ #special treatment for the first line
+ if (firstLine==1):
+ if (i==0):
+ newCurrentLine=field_component
+ else:
+ #factor names should not be redundant with other factors or conditions
+ if(field_component not in dico):
+ dico[field_component]="Factor"+str(iFactor)
+ newCurrentLine+="\t"+"Factor"+str(iFactor)
+ iFactor+=1
+ else:
+ raise NameError('factor name allready exists!')
+ else:
+ if (i==0):
+ #check if condition name allready exist and used it if it is, or create a new one if not
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newCurrentLine="Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ newCurrentLine=dico[field_component]
+ else:
+ if(field_component not in dico):
+ dico[field_component]="Value"+str(iValue)
+ newCurrentLine+="\tValue"+str(iValue)
+ iValue+=1
+ else:
+ newCurrentLine+="\t"+dico[field_component]
+ outputfile.write(newCurrentLine+"\n")
+ firstLine=0
+ outputfile.close()
+ inputfile.close()
+ ##check if any entries in dictionnary contains forbiden character
+ for key, value in dico.items():
+ for specialCharacter in forbidenCharacters:
+ if value.startswith("Condition")==False and key.find(specialCharacter)!=-1:
+ return 1
+ ##then write dictionnary in a additional file
+ outputfile = open(ouputDictionnary, 'w')
+ for key, value in dico.items():
+ outputfile.write(key+"\t"+value+"\n")
+ outputfile.close()
+ return 0
+
+
+def replaceNamesBlockInFiles(expressionFile_name,conditionFile_name,blockingFile_name,outputExpressionFile,outputConditionFile,outputBlockingFile,ouputDictionnary):
+ dico={}
+ forbidenCharacters={"*",":",",","|"}
+ ##start with expression file, read only the first line
+ inputfile = open(expressionFile_name)
+ outputfile = open(outputExpressionFile, 'w')
+ firstLine = next(inputfile).rstrip().split("\t")
+ iCondition=1
+ newFirstLine=""
+ for i, field_component in enumerate( firstLine ):
+ if (i>0):
+ #conditions names should not be redundant with other conditions
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newFirstLine+="\t"+"Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ raise NameError('condition name allready exists!')
+ else:
+ newFirstLine+=field_component
+ outputfile.write(newFirstLine+"\n")
+ for line in inputfile:
+ outputfile.write(line)
+ outputfile.close()
+ inputfile.close()
+ #then parse condition file, read all lines in this case
+ iFactor=1
+ iValue=1
+ for fileNum in range(2):
+ if fileNum==0:
+ inputfile = open(conditionFile_name)
+ outputfile = open(outputConditionFile, 'w')
+ else:
+ inputfile = open(blockingFile_name)
+ outputfile = open(outputBlockingFile, 'w')
+ firstLine=1
+ for line in inputfile:
+ currentLine = line.rstrip().split("\t")
+ newCurrentLine=""
+ for i, field_component in enumerate( currentLine ):
+ #special treatment for the first line
+ if (firstLine==1):
+ if (i==0):
+ newCurrentLine=field_component
+ else:
+ #factor names should not be redundant with other factors or conditions
+ if(field_component not in dico):
+ dico[field_component]="Factor"+str(iFactor)
+ newCurrentLine+="\t"+"Factor"+str(iFactor)
+ iFactor+=1
+ else:
+ raise NameError('factor name allready exists!')
+ else:
+ if (i==0):
+ #check if condition name allready exist and used it if it is, or create a new one if not
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newCurrentLine="Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ newCurrentLine=dico[field_component]
+ else:
+ if(field_component not in dico):
+ dico[field_component]="Value"+str(iValue)
+ newCurrentLine+="\tValue"+str(iValue)
+ iValue+=1
+ else:
+ newCurrentLine+="\t"+dico[field_component]
+ outputfile.write(newCurrentLine+"\n")
+ firstLine=0
+ outputfile.close()
+ inputfile.close()
+ ##check if any entries in dictionnary contains forbiden character
+ for key, value in dico.items():
+ for specialCharacter in forbidenCharacters:
+ if value.startswith("Condition")==False and key.find(specialCharacter)!=-1:
+ return 1
+ ##then write dictionnary in a additional file
+ outputfile = open(ouputDictionnary, 'w')
+ for key, value in dico.items():
+ outputfile.write(key+"\t"+value+"\n")
+ outputfile.close()
+ return 0
diff -r 000000000000 -r 3022feec50fe src/LIMMAscriptV4.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/LIMMAscriptV4.R Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,1002 @@
+# A command-line interface for LIMMA to use with Galaxy
+# written by Jimmy Vandel
+# one of these arguments is required:
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs()
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "dataFile", "i", 1, "character",
+ "factorInfo","a", 1, "character",
+ "blockingInfo","b", 1, "character",
+ "dicoRenaming","g",1,"character",
+ "blockingPolicy","u", 1, "character",
+ "fdrThreshold","t", 1, "double",
+ "thresholdFC","d", 1, "double",
+ "format", "f", 1, "character",
+ "histo","h", 1, "character",
+ "volcano","v", 1, "character",
+ "factorsContrast","r", 1, "character",
+ "contrastNames","p", 1, "character",
+ "firstGroupContrast","m", 1, "character",
+ "secondGroupContrast","n", 1, "character",
+ "controlGroups","c", 1, "character",
+ "fratioFile","s",1,"character",
+ "organismID","x",1,"character",
+ "rowNameType","y",1,"character",
+ "quiet", "q", 0, "logical",
+ "log", "l", 1, "character",
+ "outputFile" , "o", 1, "character",
+ "outputDfFile" , "z", 1, "character"),
+ byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required\n")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$dataFile)) {
+ addComment("[ERROR]'dataFile' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (!is.null(opt$blockingInfo) && is.null(opt$blockingPolicy) ) {
+ addComment("[ERROR]blocking policy is missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$dicoRenaming)) {
+ addComment("[ERROR]renaming dictionnary is missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$factorsContrast)) {
+ addComment("[ERROR]factor informations are missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (length(opt$firstGroupContrast)!=length(opt$secondGroupContrast)) {
+ addComment("[ERROR]some contrast groups seems to be empty",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$factorInfo)) {
+ addComment("[ERROR]factors info is missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fdrThreshold)) {
+ addComment("[ERROR]'FDR threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$outputFile) || is.null(opt$outputDfFile)){
+ addComment("[ERROR]'output files' are required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$thresholdFC)){
+ addComment("[ERROR]'FC threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fratioFile)) {
+ addComment("[ERROR]F-ratio parameter is missing",T,opt$log)
+ q( "no", 1, F )
+}
+
+#demande si le script sera bavard
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE
+}
+
+#paramètres internes
+#pour savoir si on remplace les FC calculés par LIMMA par un calcul du LS-MEAN (ie moyenne de moyennes de chaque groupe dans chaque terme du contraste plutôt qu'une moyenne globale dans chaque terme)
+useLSmean=FALSE
+
+addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#charge des packages silencieusement
+suppressPackageStartupMessages({
+ library("methods")
+ library("limma")
+ library("biomaRt")
+ library("ggplot2")
+ library("plotly")
+ library("stringr")
+ library("RColorBrewer")
+})
+
+
+#chargement du fichier dictionnaire de renommage
+renamingDico=read.csv(file=file.path(getwd(), opt$dicoRenaming),header=F,sep="\t",colClasses="character")
+rownames(renamingDico)=renamingDico[,2]
+
+
+#chargement des fichiers en entrée
+expDataMatrix=read.csv(file=file.path(getwd(), opt$dataFile),header=F,sep="\t",colClasses="character")
+#remove first row to convert it as colnames (to avoid X before colnames with header=T)
+colNamesData=expDataMatrix[1,-1]
+expDataMatrix=expDataMatrix[-1,]
+#remove first colum to convert it as rownames
+rowNamesData=expDataMatrix[,1]
+expDataMatrix=expDataMatrix[,-1]
+if(is.data.frame(expDataMatrix)){
+ expDataMatrix=data.matrix(expDataMatrix)
+}else{
+ expDataMatrix=data.matrix(as.numeric(expDataMatrix))
+}
+dimnames(expDataMatrix)=list(rowNamesData,colNamesData)
+
+#test the number of rows that are constant in dataMatrix
+nbConstantRows=length(which(unlist(apply(expDataMatrix,1,var))==0))
+if(nbConstantRows>0){
+ addComment(c("[WARNING]",nbConstantRows,"rows are constant across conditions in input data file"),T,opt$log,display=FALSE)
+}
+
+#test if all condition names are present in dico
+if(!all(colnames(expDataMatrix) %in% rownames(renamingDico))){
+ addComment("[ERROR]Missing condition names in renaming dictionary",T,opt$log)
+ q( "no", 1, F )
+}
+
+addComment("[INFO]Expression data loaded and checked",T,opt$log,display=FALSE)
+addComment(c("[INFO]Dim of expression matrix:",dim(expDataMatrix)),T,opt$log,display=FALSE)
+
+#chargement du fichier des facteurs
+factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character")
+#remove first row to convert it as colnames
+colnames(factorInfoMatrix)=factorInfoMatrix[1,]
+factorInfoMatrix=factorInfoMatrix[-1,]
+#use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+rownames(factorInfoMatrix)=factorInfoMatrix[,1]
+
+if(length(setdiff(colnames(expDataMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in factor file",T,opt$log)
+ q( "no", 1, F )
+}
+
+#order sample as in expression matrix and remove spurious sample
+factorInfoMatrix=factorInfoMatrix[colnames(expDataMatrix),]
+
+#test if all values names are present in dico
+if(!all(unlist(factorInfoMatrix) %in% rownames(renamingDico))){
+ addComment("[ERROR]Missing factor names in renaming dictionary",T,opt$log)
+ q( "no", 1, F )
+}
+
+addComment("[INFO]Factors OK",T,opt$log,display=FALSE)
+addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE)
+
+##manage blocking factor
+blockingFactor=NULL
+blockinFactorsList=NULL
+if(!is.null(opt$blockingInfo)){
+
+ #chargement du fichier des blocking factors
+ blockingInfoMatrix=read.csv(file=file.path(getwd(), opt$blockingInfo),header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames
+ colnames(blockingInfoMatrix)=blockingInfoMatrix[1,]
+ blockingInfoMatrix=blockingInfoMatrix[-1,]
+ #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+ rownames(blockingInfoMatrix)=blockingInfoMatrix[,1]
+
+
+ if(length(setdiff(colnames(expDataMatrix),rownames(blockingInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in blocking factor file",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #order sample as in expression matrix
+ blockingInfoMatrix=blockingInfoMatrix[colnames(expDataMatrix),]
+
+ #test if all blocking names are present in dico
+ if(!all(unlist(blockingInfoMatrix) %in% rownames(renamingDico))){
+ addComment("[ERROR]Missing blocking names in renaming dictionary",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #remove blocking factors allready present as real factors
+ blockingNotInMainFactors=setdiff(colnames(blockingInfoMatrix)[-1],colnames(factorInfoMatrix)[-1])
+
+ if(length(blockingNotInMainFactors)<(ncol(blockingInfoMatrix)-1))addComment("[WARNING]Blocking factors cannot be principal factors",T,opt$log,display=FALSE)
+
+ if(length(blockingNotInMainFactors)>0){
+
+ blockingInfoMatrix=blockingInfoMatrix[,c(colnames(blockingInfoMatrix)[1],blockingNotInMainFactors)]
+
+ groupBlocking=rep("c",ncol(expDataMatrix))
+ #for each blocking factor
+ for(blockingFact in blockingNotInMainFactors){
+ if(opt$blockingPolicy=="correlated"){
+ indNewFact=as.numeric(factor(blockingInfoMatrix[,blockingFact]))
+ groupBlocking=paste(groupBlocking,indNewFact,sep="_")
+ }else{
+ if(is.null(blockinFactorsList))blockinFactorsList=list()
+ blockinFactorsList[[blockingFact]]=factor(unlist(lapply(blockingInfoMatrix[,blockingFact],function(x)paste(c(blockingFact,"_",x),collapse=""))))
+ }
+ }
+ if(opt$blockingPolicy=="correlated"){
+ blockingFactor=factor(groupBlocking)
+ if(length(levels(blockingFactor))==1){
+ addComment("[ERROR]Selected blocking factors seems to be constant",T,opt$log)
+ q( "no", 1, F )
+ }
+ }
+ addComment("[INFO]Blocking info OK",T,opt$log,display=FALSE)
+ }else{
+ addComment("[WARNING]No blocking factors will be considered",T,opt$log,display=FALSE)
+ }
+}
+
+
+##rename different input parameters using renamingDictionary
+opt$factorsContrast=renamingDico[unlist(lapply(unlist(strsplit(opt$factorsContrast,",")),function(x)which(renamingDico[,1]==x))),2]
+
+userDefinedContrasts=FALSE
+if(!is.null(opt$firstGroupContrast) && !is.null(opt$secondGroupContrast)){
+ userDefinedContrasts=TRUE
+ for(iContrast in 1:length(opt$firstGroupContrast)){
+ opt$firstGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",")
+ opt$secondGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",")
+ }
+}
+
+if(!is.null(opt$controlGroups)){
+ renamedGroups=c()
+ for(iGroup in unlist(strsplit(opt$controlGroups,","))){
+ renamedControlGroup=paste(renamingDico[unlist(lapply(unlist(strsplit(iGroup,":")),function(x)which(renamingDico[,1]==x))),2],collapse=":")
+ if(length(renamedControlGroup)==0 || any(which(unlist(gregexpr(text = renamedControlGroup,pattern = ":"))==-1))){
+ addComment("[ERROR]Control groups for interaction seem to mismatch, please check them.",T,opt$log)
+ q( "no", 1, F )
+ }
+ renamedGroups=c(renamedGroups,renamedControlGroup)
+ }
+ opt$controlGroups=renamedGroups
+}
+addComment("[INFO]Contrast variables are renamed to avoid confusion",T,opt$log,display=FALSE)
+##renaming done
+
+#to convert factor as numeric value --> useless now ?
+#expDataMatrix=apply(expDataMatrix,c(1,2),function(x)as.numeric(paste(x)))
+
+#get factors info for LIMMA
+factorsList=list()
+for(iFactor in opt$factorsContrast){
+ if(!(iFactor %in% colnames(factorInfoMatrix))){
+ addComment("[ERROR]Required factors are missing in input file",T,opt$log)
+ q( "no", 1, F )
+ }
+ factorsList[[iFactor]]=factor(unlist(lapply(factorInfoMatrix[,iFactor],function(x)paste(c(iFactor,"_",x),collapse=""))))
+ if(length(levels(factorsList[[iFactor]]))==1){
+ addComment("[ERROR]One selected factor seems to be constant",T,opt$log)
+ q( "no", 1, F )
+ }
+}
+
+#check if there is at least 2 factors to allow interaction computation
+if(!is.null(opt$controlGroups) && length(factorsList)<2){
+ addComment("[ERROR]You cannot ask for interaction with less than 2 factors",T,opt$log)
+ q( "no", 1, F )
+}
+
+#merge all factors as a single one
+factorsMerged=as.character(factorsList[[opt$factorsContrast[1]]])
+for(iFactor in opt$factorsContrast[-1]){
+ factorsMerged=paste(factorsMerged,as.character(factorsList[[iFactor]]),sep=".")
+}
+factorsMerged=factor(factorsMerged)
+
+#checked that coefficient number (ie. factorsMerged levels) is strictly smaller than sample size
+if(length(levels(factorsMerged))>=length(factorsMerged)){
+ addComment(c("[ERROR]No enough samples (",length(factorsMerged),") to estimate ",length(levels(factorsMerged))," coefficients"),T,opt$log)
+ q( "no", 1, F )
+}
+
+#get the sample size of each factor values
+sampleSizeFactor=table(factorsMerged)
+
+
+if(!is.null(blockinFactorsList)){
+ factorString=c("blockinFactorsList[['", names(blockinFactorsList)[1],"']]")
+ for(blockingFact in names(blockinFactorsList)[-1]){
+ factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]")
+ }
+ design = model.matrix(as.formula(paste(c("~ factorsMerged +",factorString," + 0"),collapse="")))
+
+ #rename design columns
+ coeffMeaning = levels(factorsMerged)
+ for(blockingFact in blockinFactorsList){
+ coeffMeaning=c(coeffMeaning,levels(blockingFact)[-1])
+ }
+ colnames(design) = coeffMeaning
+}else{
+ design = model.matrix(as.formula( ~ factorsMerged + 0))
+
+ #rename degin columns
+ coeffMeaning = levels(factorsMerged)
+ colnames(design) = coeffMeaning
+}
+
+addComment(c("[INFO]Available coefficients: ",coeffMeaning),T,opt$log,display=F)
+
+estimableCoeff=which(colSums(design)!=0)
+
+addComment("[INFO]Design done",T,opt$log,display=F)
+
+ #use blocking factor if exists
+if(!is.null(blockingFactor)){
+ corfit <- duplicateCorrelation(expDataMatrix, design, block=blockingFactor)
+
+ addComment(c("[INFO]Correlation within groups: ",corfit$consensus.correlation),T,opt$log,display=F)
+
+ #run linear model fit
+ data.fit = lmFit(expDataMatrix,design,block = blockingFactor, correlation=corfit$consensus.correlation)
+}else{
+ #run linear model fit
+ data.fit = lmFit(expDataMatrix,design)
+}
+
+estimatedCoeff=which(!is.na(data.fit$coefficients[1,]))
+
+addComment("[INFO]Lmfit done",T,opt$log,display=F)
+
+#catch situation where some coefficients cannot be estimated, probably due to dependances between design columns
+#if(length(setdiff(estimableCoeff,estimatedCoeff))>0){
+# addComment("[ERROR]Error in design matrix, check your group definitions",T,opt$log)
+# q( "no", 1, F )
+#}
+#to strong condition, should return ERROR only when coefficients relative to principal factors cannot be estimated, otherwise, return a simple WARNING
+
+#define requested contrasts
+requiredContrasts=c()
+humanReadingContrasts=c()
+persoContrastName=c()
+if(userDefinedContrasts){
+ for(iContrast in 1:length(opt$firstGroupContrast)){
+ posGroup=unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse=".")))
+ negGroup=unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse=".")))
+ #clear posGroup and negGroup from empty groups
+ emptyPosGroups=which(!(posGroup%in%coeffMeaning))
+ if(length(emptyPosGroups)>0){
+ addComment(c("[WARNING]The group(s)",posGroup[emptyPosGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE)
+ posGroup=posGroup[-emptyPosGroups]
+ currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],","))[-emptyPosGroups],collapse="+")
+ }else{
+ currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),collapse="+")
+ }
+ emptyNegGroups=which(!(negGroup%in%coeffMeaning))
+ if(length(emptyNegGroups)>0){
+ addComment(c("[WARNING]The group(s)",negGroup[emptyNegGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE)
+ negGroup=negGroup[-emptyNegGroups]
+ currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))[-emptyNegGroups]),collapse="-")
+ }else{
+ currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))),collapse="-")
+ }
+ if(length(posGroup)==0 || length(negGroup)==0 ){
+ addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to empty group"),T,opt$log,display=FALSE)
+ }else{
+ if(all(posGroup%in%negGroup) && all(negGroup%in%posGroup)){
+ addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to null contrast"),T,opt$log,display=FALSE)
+ }else{
+ #get coefficients required for first group added as positive
+ positiveCoeffWeights=sampleSizeFactor[posGroup]/sum(sampleSizeFactor[posGroup])
+ #positiveCoeffWeights=rep(1,length(posGroup))
+ #names(positiveCoeffWeights)=names(sampleSizeFactor[posGroup])
+ #get coefficients required for second group added as negative
+ negativeCoeffWeights=sampleSizeFactor[negGroup]/sum(sampleSizeFactor[negGroup])
+ #negativeCoeffWeights=rep(1,length(negGroup))
+ #names(negativeCoeffWeights)=names(sampleSizeFactor[negGroup])
+ #build the resulting contrast
+ currentContrast=paste(paste(positiveCoeffWeights[posGroup],posGroup,sep="*"),collapse="+")
+ currentContrast=paste(c(currentContrast,paste(paste(negativeCoeffWeights[negGroup],negGroup,sep="*"),collapse="-")),collapse="-")
+ requiredContrasts=c(requiredContrasts,currentContrast)
+
+ #build the human reading contrast
+ humanReadingContrasts=c(humanReadingContrasts,currentHumanContrast)
+ if(!is.null(opt$contrastNames) && nchar(opt$contrastNames[iContrast])>0){
+ persoContrastName=c(persoContrastName,opt$contrastNames[iContrast])
+ }else{
+ persoContrastName=c(persoContrastName,"")
+ }
+
+ addComment(c("[INFO]Contrast added : ",currentHumanContrast),T,opt$log,display=F)
+ addComment(c("with complete formula ",currentContrast),T,opt$log,display=F)
+ }
+ }
+ }
+}
+
+
+ #define the true formula with interactions to get interaction coefficients
+ factorString=c("factorsList[['", names(factorsList)[1],"']]")
+ for(iFactor in names(factorsList)[-1]){
+ factorString=c(factorString," * factorsList[['",iFactor,"']]")
+ }
+
+ if(!is.null(blockinFactorsList)){
+ for(blockingFact in names(blockinFactorsList)){
+ factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]")
+ }
+ }
+
+ #should not be null at the end
+ allFtestMeanSquare=NULL
+ #to get the F-test values
+ estimatedInteractions=rownames(anova(lm(as.formula(paste(c("expDataMatrix[1,] ~ ",factorString),collapse="")))))
+ estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x){temp=unlist(strsplit(x,"[ \" | : ]"));paste(temp[seq(2,length(temp),3)],collapse=":")})),estimatedInteractions[length(estimatedInteractions)])
+ #rename estimated interaction terms using renamingDico
+ estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x)paste(renamingDico[unlist(strsplit(x,":")),1],collapse=":"))),estimatedInteractions[length(estimatedInteractions)])
+ t <- unlist(apply(expDataMatrix,1,function(x){temp=anova(lm(as.formula(paste(c("x ~ ",factorString),collapse=""))))$`Mean Sq`;temp/temp[length(temp)]}))
+ allFtestMeanSquare <- t(matrix(t,nrow=length(estimatedInteractions)))
+ #remove from allFtest rows containing NA
+ if(length(which(is.na(allFtestMeanSquare[,1])))>0)allFtestMeanSquare=allFtestMeanSquare[-(which(is.na(allFtestMeanSquare[,1]))),]
+ colnames(allFtestMeanSquare)=estimatedInteractions
+
+ #add contrasts corresponding to interaction terms
+ if(!is.null(opt$controlGroups)){
+ #first load user defined control group for each factor
+ controlGroup=rep(NA,length(factorsList))
+ names(controlGroup)=names(factorsList)
+ for(iGroup in opt$controlGroups){
+ splitGroup=unlist(strsplit(iGroup,":"))
+ splitGroup[2]=paste(splitGroup[1],splitGroup[2],sep = "_")
+ #check if defined control group is really a level of the corresponding factor
+ if(!splitGroup[1]%in%names(controlGroup) || !splitGroup[2]%in%factorsList[[splitGroup[1]]]){
+ addComment(c("[ERROR]The factor name",splitGroup[1],"does not exist or group name",splitGroup[2]),T,opt$log)
+ q( "no", 1, F )
+ }
+ if(!is.na(controlGroup[splitGroup[1]])){
+ addComment("[ERROR]Several control groups are defined for the same factor, please select only one control group for each factor if you want to compute interaction contrasts",T,opt$log)
+ q( "no", 1, F )
+ }
+ controlGroup[splitGroup[1]]=splitGroup[2]
+ }
+
+ #check if all factor have a defined control group
+ if(any(is.na(controlGroup))){
+ addComment("[ERROR]Missing control group for some factors, please check them if you want to compute interaction contrasts",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ nbFactors=length(factorsList)
+ interactionContrasts=c()
+ contrastClass=c()
+ #initialize list for the first level
+ newPreviousLoopContrast=list()
+ for(iFactorA in 1:(nbFactors-1)){
+ nameFactorA=names(factorsList)[iFactorA]
+ compA=c()
+ for(levelA in setdiff(levels(factorsList[[iFactorA]]),controlGroup[nameFactorA])){
+ compA=c(compA,paste(levelA,controlGroup[nameFactorA],sep="-"))
+ }
+ newPreviousLoopContrast[[as.character(iFactorA)]]=compA
+ }
+ #make a loop for growing interaction set
+ for(globalIfactor in 1:(nbFactors-1)){
+ previousLoopContrast=newPreviousLoopContrast
+ newPreviousLoopContrast=list()
+ #factor A reuse contrasts made at previsous loop
+ for(iFactorA in names(previousLoopContrast)){
+ compA=previousLoopContrast[[iFactorA]]
+
+ if(max(as.integer(unlist(strsplit(iFactorA,"\\."))))1 && iFactor0){
+ requiredContrasts=c(requiredContrasts,interactionContrasts[-contrastToIgnore])
+ humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction[-contrastToIgnore])
+ persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction[-contrastToIgnore])))
+ }else{
+ requiredContrasts=c(requiredContrasts,interactionContrasts)
+ humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction)
+ persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction)))
+ }
+ }#end of intreaction contrasts
+
+
+ #remove from requiredContrasts contrasts that cannot be estimated
+ toRemove=unique(unlist(lapply(setdiff(coeffMeaning,names(estimatedCoeff)),function(x)grep(x,requiredContrasts))))
+ if(length(toRemove)>0){
+ addComment(c("[WARNING]",length(toRemove)," contrasts are removed, due to missing coefficients"),T,opt$log,display=FALSE)
+ requiredContrasts=requiredContrasts[-toRemove]
+ humanReadingContrasts=humanReadingContrasts[-toRemove]
+ persoContrastName=persoContrastName[-toRemove]
+ }
+
+ if(length(requiredContrasts)==0){
+ addComment("[ERROR]No contrast to compute, please check your contrast definition.",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #compute for each contrast mean of coefficients in posGroup and negGroup for FC computation of log(FC) with LSmean as in Partek
+ meanPosGroup=list()
+ meanNegGroup=list()
+ for(iContrast in 1:length(requiredContrasts)){
+ #define posGroup and negGroup
+ #first split contrast
+ temp=unlist(strsplit(requiredContrasts[iContrast],"[ + ]"))
+ splitContrast=temp[1]
+ for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp)
+ splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB}))
+ #and then put each term in good group
+ posGroup=c()
+ negGroup=c()
+ nextIsPos=TRUE
+ for(iSplit in splitContrast){
+ if(iSplit=="+")nextIsPos=TRUE
+ if(iSplit=="-")nextIsPos=FALSE
+ if(iSplit!="-" && iSplit!="+"){
+ #remove weights of contrast terms
+ iSplitBis=unlist(strsplit(iSplit,"[*]"))
+ iSplitBis=iSplitBis[length(iSplitBis)]
+ if(nextIsPos)posGroup=c(posGroup,iSplitBis)
+ else negGroup=c(negGroup,iSplitBis)
+ }
+ }
+ #compute means for each group
+ meanPosGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,posGroup],ncol=length(posGroup)),1,mean)
+ meanNegGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,negGroup],ncol=length(negGroup)),1,mean)
+ }
+
+
+ contrast.matrix = makeContrasts(contrasts=requiredContrasts,levels=design)
+ data.fit.con = contrasts.fit(data.fit,contrast.matrix)
+
+ addComment("[INFO]Contrast definition done",T,opt$log,T,display=FALSE)
+
+ #compute LIMMA statistics
+ data.fit.eb = eBayes(data.fit.con)
+
+ addComment("[INFO]Estimation done",T,opt$log,T,display=FALSE)
+
+ #adjust p.value through FDR
+ data.fit.eb$adj_p.value=data.fit.eb$p.value
+ for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){
+ data.fit.eb$adj_p.value[,iComparison]=p.adjust(data.fit.eb$p.value[,iComparison],"fdr")
+ }
+
+ #add a new field based on LS-means for each contrast instead of global mean like they were calculated in coefficients field
+ data.fit.eb$coefficientsLS=data.fit.eb$coefficients
+ if(ncol(data.fit.eb$coefficients)!=length(meanPosGroup)){
+ addComment("[ERROR]Estimated contrasts number unexpected",T,opt$log)
+ q( "no", 1, F )
+ }
+ for(iContrast in 1:length(meanPosGroup)){
+ data.fit.eb$coefficientsLS[,iContrast]=meanPosGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]-meanNegGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]
+ }
+
+ #if requested replace coefficient computed as global mean by LS-means values
+ if(useLSmean)data.fit.eb$coefficients=data.fit.eb$coefficientsLS
+
+addComment("[INFO]Core treatment done",T,opt$log,T,display=FALSE)
+
+
+##convert humanReadingContrasts with namingDictionary to create humanReadingContrastsRenamed and keep original humanReadingContrasts names for file names
+humanReadingContrastsRenamed=rep("",length(humanReadingContrasts))
+for(iContrast in 1:length(humanReadingContrasts)){
+ if(persoContrastName[iContrast]==""){
+ #if(verbose)addComment(humanReadingContrasts[iContrast])
+ specialCharacters=str_extract_all(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]]
+ #if(verbose)addComment(specialCharacters)
+ nameConverted=unlist(lapply(strsplit(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]],function(x)renamingDico[x,1]))
+ #if(verbose)addComment(nameConverted)
+ humanReadingContrastsRenamed[iContrast]=paste(nameConverted,specialCharacters,collapse="",sep="")
+ #if(verbose)addComment(humanReadingContrastsRenamed[iContrast])
+ humanReadingContrastsRenamed[iContrast]=substr(humanReadingContrastsRenamed[iContrast],1,nchar(humanReadingContrastsRenamed[iContrast])-1)
+ }else{
+ humanReadingContrastsRenamed[iContrast]=persoContrastName[iContrast]
+ }
+}
+
+#write correspondances between plot file names (humanReadingContrasts) and displayed names in figure legends (humanReadingContrastsRenamed), usefull to define html items in xml file
+correspondanceTable=matrix("",ncol=2,nrow=ncol(data.fit.eb$p.value))
+correspondanceTable[,1]=unlist(lapply(humanReadingContrasts,function(x)gsub(":","_INT_",gsub("\\+","_PLUS_",gsub("\\*","_AND_",x)))))
+correspondanceTable[,2]=humanReadingContrastsRenamed
+rownames(correspondanceTable)=correspondanceTable[,2]
+write.table(correspondanceTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+#plot nominal p-val histograms for selected comparisons
+histogramPerPage=6
+if (!is.null(opt$histo)) {
+ iToPlot=1
+ plotVector=list()
+ nbComparisons=ncol(data.fit.eb$p.value)
+ for (iComparison in 1:nbComparisons){
+ dataToPlot=data.frame(pval=data.fit.eb$p.value[,iComparison],id=rownames(data.fit.eb$p.value))
+ p <- ggplot(data=dataToPlot, aes(x=pval)) + geom_histogram(colour="red", fill="salmon") +
+ theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="Frequencies") + xlab(label="Nominal p-val") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5))
+ plotVector[[length(plotVector)+1]]=p
+
+ pp <- ggplotly(p)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F)
+
+ if(iComparison==nbComparisons || length(plotVector)==histogramPerPage){
+ #plot and close the actual plot
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=2)
+ dev.off()
+ if(iComparison0)pvalThresholdFDR=min(probeWithLowFDR)
+
+ #get significant points over FC and FDR thresholds
+ significativePoints=intersect(which(abs(data.fit.eb$coefficients[,iComparison])>=logFCthreshold),which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold))
+
+ #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC)))
+ htmlPointsToRemove=intersect(which(abs(data.fit.eb$coefficients[,iComparison])<=quantile(abs(data.fit.eb$coefficients[,iComparison]),c(0.66))),which(data.fit.eb$p.value[,iComparison]>=quantile(abs(data.fit.eb$p.value[,iComparison]),c(0.33))))
+ if(length(htmlPointsToRemove)>20000){
+ htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000))
+ }else{
+ htmlPointsToRemove=c()
+ }
+
+ xMinLimPlot=min(data.fit.eb$coefficients[,iComparison])-0.2
+ xMaxLimPlot=max(data.fit.eb$coefficients[,iComparison])+0.2
+ yMaxLimPlot= max(-log10(data.fit.eb$p.value[,iComparison]))+0.2
+
+ if(length(significativePoints)>0){
+ dataSignifToPlot=data.frame(pval=-log10(data.fit.eb$p.value[significativePoints,iComparison]),FC=data.fit.eb$coefficients[significativePoints,iComparison],description=paste(names(data.fit.eb$coefficients[significativePoints,iComparison]),"\n","FC: " , round(2^data.fit.eb$coefficients[significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[significativePoints,iComparison],digits=4), sep=""))
+ #to test if remains any normal points to draw
+ if(length(significativePoints)0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description))
+
+ ##interactive plot
+ if(length(htmlPointsToRemove)>0){
+ pointToRemove=union(htmlPointsToRemove,significativePoints)
+ #to test if remains any normal points to draw
+ if(length(pointToRemove)40000)addComment(c("[WARNING]For",humanReadingContrastsRenamed[iComparison],"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log)
+
+ phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>%
+ layout(title = humanReadingContrastsRenamed[iComparison],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-log10(p-val)", showgrid=TRUE, zeroline=FALSE))
+ if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar()
+ if(logFCthreshold!=0){
+ phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ }
+ if(!is.null(pvalThresholdFDR)){
+ phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue"))
+ }
+ plotVector[[length(plotVector)+1]]=p
+
+ #save plotly files
+ pp <- ggplotly(phtml)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F)
+
+
+ if(iComparison==nbComparisons || length(plotVector)==volcanoPerPage){
+ #plot and close the actual plot
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=1)
+ dev.off()
+ if(iComparison0)))
+#filter out genes with lower FC for all comparisons
+genesToKeep=intersect(genesToKeep,names(which(apply(data.fit.eb$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0))))
+
+if(length(genesToKeep)>0){
+ data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[genesToKeep,],ncol=ncol(data.fit.eb$adj_p.value))
+ rownames(data.fit.eb$adj_p.value)=genesToKeep
+ colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value)
+
+ data.fit.eb$p.value=matrix(data.fit.eb$p.value[genesToKeep,],ncol=ncol(data.fit.eb$p.value))
+ rownames(data.fit.eb$p.value)=genesToKeep
+ colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[genesToKeep,],ncol=ncol(data.fit.eb$coefficients))
+ rownames(data.fit.eb$coefficients)=genesToKeep
+ colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$t=matrix(data.fit.eb$t[genesToKeep,],ncol=ncol(data.fit.eb$t))
+ rownames(data.fit.eb$t)=genesToKeep
+ colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value)
+
+ dfMatrix=dfMatrix[genesToKeep,,drop=FALSE]
+
+}else{
+ addComment(c("[WARNING]No significative genes considering the given FDR threshold : ",opt$fdrThreshold),T,opt$log,display=FALSE)
+}
+
+addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE)
+
+
+#plot VennDiagramm for genes below threshold between comparisons
+#t=apply(data.fit.eb$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold)))
+#get.venn.partitions(t)
+#vennCounts(data.fit.eb$adj_p.value[,1:4]<=opt$threshold)
+
+#make a simple sort genes based only on the first comparison
+#newOrder=order(data.fit.eb$adj_p.value[,1])
+#data.fit.eb$adj_p.value=data.fit.eb$adj_p.value[newOrder,]
+
+#alternative sorting strategy based on the mean gene rank over all comparisons
+if(length(genesToKeep)>1){
+ currentRank=rep(0,nrow(data.fit.eb$adj_p.value))
+ for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){
+ currentRank=currentRank+rank(data.fit.eb$adj_p.value[,iComparison])
+ }
+ currentRank=currentRank/ncol(data.fit.eb$adj_p.value)
+ newOrder=order(currentRank)
+
+ data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[newOrder,],ncol=ncol(data.fit.eb$adj_p.value))
+ rownames(data.fit.eb$adj_p.value)=rownames(data.fit.eb$p.value)[newOrder]
+ colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value)
+
+ data.fit.eb$p.value=matrix(data.fit.eb$p.value[newOrder,],ncol=ncol(data.fit.eb$p.value))
+ rownames(data.fit.eb$p.value)=rownames(data.fit.eb$adj_p.value)
+ colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[newOrder,],ncol=ncol(data.fit.eb$coefficients))
+ rownames(data.fit.eb$coefficients)=rownames(data.fit.eb$adj_p.value)
+ colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$t=matrix(data.fit.eb$t[newOrder,],ncol=ncol(data.fit.eb$t))
+ rownames(data.fit.eb$t)=rownames(data.fit.eb$adj_p.value)
+ colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value)
+
+ dfMatrix=dfMatrix[newOrder,,drop=FALSE]
+}
+
+
+#formating output matrices depending on genes to keep
+if(length(genesToKeep)==0){
+ outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3)
+ outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5))
+ outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value)))
+ outputData[,1]=c("LIMMA","Gene","noGene")
+ outputData[,2]=c("Comparison","Info","noInfo")
+
+ outputDfData=matrix(0,ncol=3+1,nrow=2)
+ outputDfData[1,]=c("X","df.residual","df.prior","df.total")
+ outputDfData[,1]=c("Statistics","noGene")
+}else{
+ if(length(genesToKeep)==1){
+ outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3)
+ outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5))
+ outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value)))
+ outputData[,1]=c("LIMMA","Gene",genesToKeep)
+ outputData[,2]=c("Comparison","Info","na")
+ if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep]
+ outputData[3,seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4)
+ outputData[3,seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4)
+ outputData[3,seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4)
+ outputData[3,seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4)
+ outputData[3,seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4)
+
+ outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix))
+ outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total")
+ outputDfData[2,]=c(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual","df.prior","df.total")],digits=4))
+ }else{
+ #format matrix to be correctly read by galaxy (move headers in first column and row)
+ outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=nrow(data.fit.eb$adj_p.value)+2)
+ outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5))
+ outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value)))
+ outputData[,1]=c("LIMMA","Gene",rownames(data.fit.eb$adj_p.value))
+ outputData[,2]=c("Comparison","Info",rep("na",nrow(data.fit.eb$adj_p.value)))
+ if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(data.fit.eb$adj_p.value)]
+ outputData[3:nrow(outputData),seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4)
+ outputData[3:nrow(outputData),seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4)
+ outputData[3:nrow(outputData),seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4)
+ outputData[3:nrow(outputData),seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4)
+ outputData[3:nrow(outputData),seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4)
+
+ outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix))
+ outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total")
+ outputDfData[2:(1+nrow(dfMatrix)),]=cbind(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual")],digits=4),prettyNum(dfMatrix[,c("df.prior")],digits=4),prettyNum(dfMatrix[,c("df.total")],digits=4))
+ }
+}
+addComment("[INFO]Formated output",T,opt$log,display=FALSE)
+
+#write output results
+write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+#write df info file
+write.table(outputDfData,file=opt$outputDfFile,quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+end.time <- Sys.time()
+addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE)
+
+addComment("[INFO]End of R script",T,opt$log,display=FALSE)
+
+printSessionInfo(opt$log)
+#sessionInfo()
+
+
+
diff -r 000000000000 -r 3022feec50fe src/VolcanoPlotsScript.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/VolcanoPlotsScript.R Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,426 @@
+# R script to plot volcanos through Galaxy based GIANT tool
+# written by Jimmy Vandel
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs()
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "statisticsFile", "i", 1, "character",
+ "volcanoName" , "n", 1, "character",
+ "pvalColumnName" , "p", 1, "character",
+ "fdrColumnName" , "m", 1, "character",
+ "fcColumnName" , "c", 1, "character",
+ "fcKind","d", 1, "character",
+ "fdrThreshold","s", 1, "double",
+ "fcThreshold","e", 1, "double",
+ "organismID","x",1,"character",
+ "rowNameType","y",1,"character",
+ "log", "l", 1, "character",
+ "outputFile" , "o", 1, "character",
+ "format", "f", 1, "character",
+ "quiet", "q", 0, "logical"),
+ byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required\n")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$statisticsFile)) {
+ addComment("[ERROR]'statisticsFile' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (length(opt$pvalColumnName)==0 || length(opt$fdrColumnName)==0 || length(opt$fcColumnName)==0) {
+ addComment("[ERROR]no selected columns",T,opt$log)
+ q( "no", 1, F )
+}
+if (length(opt$pvalColumnName)!=length(opt$fcColumnName) || length(opt$pvalColumnName)!=length(opt$fdrColumnName)) {
+ addComment("[ERROR]different number of selected columns between p.val, adj-p.val and FC ",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fcKind)) {
+ addComment("[ERROR]'fcKind' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fdrThreshold)) {
+ addComment("[ERROR]'FDR threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fcThreshold)) {
+ addComment("[ERROR]'FC threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$outputFile)) {
+ addComment("[ERROR]'output file' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+#demande si le script sera bavard
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE
+}
+
+#paramètres internes
+addComment("[INFO]Parameters checked test mode !",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#charge des packages silencieusement
+suppressPackageStartupMessages({
+ library("methods")
+ library("biomaRt")
+ library("ggplot2")
+ library("plotly")
+ library("stringr")
+})
+
+#define some usefull variable
+nbVolcanosToPlot=length(opt$pvalColumnName)
+
+#load input file
+statDataMatrix=read.csv(file=file.path(getwd(), opt$statisticsFile),header=F,sep="\t",colClasses="character")
+#remove first colum to convert it as rownames
+rownames(statDataMatrix)=statDataMatrix[,1]
+statDataMatrix=statDataMatrix[,-1]
+
+#identify lines without adjusted p-value info (should contain the same content as rownames) and replace them with NA values
+FDRinfo=rep(TRUE,nbVolcanosToPlot)
+for(iVolcano in 1:nbVolcanosToPlot){
+ #input parameter should be None when adjusted p-val are not available
+ if(opt$fdrColumnName[iVolcano]=="None"){
+ #content of the corresponding column should also be the same as rownames
+ if(!all(statDataMatrix[,(iVolcano-1)*3+2]==rownames(statDataMatrix))){
+ addComment(c("[ERROR]It seems that input stat matrix contains adjusted p-values for volcano",iVolcano,"whereas input parameter indicates that not."),T,opt$log)
+ q( "no", 1, F )
+ }
+ FDRinfo[iVolcano]=FALSE
+ statDataMatrix[,(iVolcano-1)*3+2]=NA
+ }
+}
+
+if(is.data.frame(statDataMatrix)){
+ statDataMatrix=data.matrix(statDataMatrix)
+}else{
+ statDataMatrix=data.matrix(as.numeric(statDataMatrix))
+}
+
+#check if available column number match with volcano requested number
+if(ncol(statDataMatrix)!=3*nbVolcanosToPlot){
+ addComment("[ERROR]Input file column number is different from requested volcano number",T,opt$log)
+ q( "no", 1, F )
+}
+
+#build global dataFrame with data and fill with p.val and log2(FC) and FDR
+dataFrame=data.frame(row.names = rownames(statDataMatrix))
+#start with p-value
+dataFrame$p.value=statDataMatrix[,seq(1,nbVolcanosToPlot*3,3),drop=FALSE]
+#compute FDR if needed or just get available info
+dataFrame$adj_p.value=dataFrame$p.value
+for(iVolcano in 1:nbVolcanosToPlot){
+ #adjusted p-value are already computed
+ if(FDRinfo[iVolcano]){
+ dataFrame$adj_p.value[,iVolcano]=statDataMatrix[,(iVolcano-1)*3+2,drop=FALSE]
+ }else{
+ #adjusted p-value should be computed based on p-val using FDR
+ dataFrame$adj_p.value[,iVolcano]=p.adjust(dataFrame$p.value[,iVolcano,drop=FALSE],"fdr")
+ addComment(c("[INFO]Adjusted p-values are not available in input for volcano",iVolcano,", FDR approach will be used on available raw p-values"),T,opt$log)
+ }
+}
+if(opt$fcKind=="FC"){
+ #we should transform as Log2FC
+ dataFrame$coefficients=log2(statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE])
+ addComment(c("[INFO]FC are converted in log2(FC) for plotting"),T,opt$log)
+}else{
+ dataFrame$coefficients=statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE]
+}
+
+addComment(c("[INFO]Input data available for",nbVolcanosToPlot,"volcano(s) with",nrow(statDataMatrix),"rows"),T,opt$log)
+
+
+#plot VOLCANOs
+volcanoPerPage=1
+logFCthreshold=log2(opt$fcThreshold)
+iToPlot=1
+plotVector=list()
+volcanoNameList=c()
+for (iVolcano in 1:nbVolcanosToPlot){
+
+ if(nchar(opt$volcanoName[iVolcano])>0){
+ curentVolcanoName=opt$volcanoName[iVolcano]
+ }else{
+ curentVolcanoName=paste(iVolcano,opt$pvalColumnName[iVolcano],sep="_")
+ }
+
+ #keep only rows without NA for p-val, adjusted p-val and coeff
+ pValToPlot=dataFrame$p.value[,iVolcano]
+ fdrToPlot=dataFrame$adj_p.value[,iVolcano]
+ coeffToPlot=dataFrame$coefficients[,iVolcano]
+
+ rowToRemove=unique(c(which(is.na(pValToPlot)),which(is.na(fdrToPlot)),which(is.na(coeffToPlot))))
+ if(length(rowToRemove)>0){
+ pValToPlot=pValToPlot[-rowToRemove]
+ fdrToPlot=fdrToPlot[-rowToRemove]
+ coeffToPlot=coeffToPlot[-rowToRemove]
+ }
+ addComment(c("[INFO]For",curentVolcanoName,"volcano,",length(rowToRemove),"rows are discarded due to NA values,",length(pValToPlot),"remaining rows."),T,opt$log)
+
+ #save volcano name
+ volcanoNameList=c(volcanoNameList,curentVolcanoName)
+
+ #remove characters possibly troubling
+ volcanoFileName=iVolcano
+
+ #define the log10(p-val) threshold corresponding to FDR threshold fixed by user
+ probeWithLowFDR=-log10(pValToPlot[which(fdrToPlot<=opt$fdrThreshold)])
+ pvalThresholdFDR=NULL
+ if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR)
+
+ #get significant points over FC and FDR thresholds
+ significativePoints=intersect(which(abs(coeffToPlot)>=logFCthreshold),which(fdrToPlot<=opt$fdrThreshold))
+
+ #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC)))
+ htmlPointsToRemove=intersect(which(abs(coeffToPlot)<=quantile(abs(coeffToPlot),c(0.66))),which(pValToPlot>=quantile(abs(pValToPlot),c(0.33))))
+ if(length(htmlPointsToRemove)>20000){
+ htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000))
+ }else{
+ htmlPointsToRemove=c()
+ }
+
+ xMinLimPlot=min(coeffToPlot)-0.2
+ xMaxLimPlot=max(coeffToPlot)+0.2
+ yMaxLimPlot= max(-log10(pValToPlot))+0.2
+
+ if(length(significativePoints)>0){
+ dataSignifToPlot=data.frame(pval=-log10(pValToPlot[significativePoints]),FC=coeffToPlot[significativePoints],description=paste(names(coeffToPlot[significativePoints]),"\n","FC: " , round(2^coeffToPlot[significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[significativePoints],digits=4), sep=""))
+ #to test if remains any normal points to draw
+ if(length(significativePoints)0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description))
+
+ ##interactive plot
+
+ if(length(htmlPointsToRemove)>0){
+ pointToRemove=union(htmlPointsToRemove,significativePoints)
+ #to test if it remains any normal points to draw
+ if(length(pointToRemove)40000)addComment(c("[WARNING]For",curentVolcanoName,"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log)
+
+ phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>%
+ layout(title = curentVolcanoName[iVolcano],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-Log10(p-val)", showgrid=TRUE, zeroline=FALSE))
+ if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar()
+ if(logFCthreshold!=0){
+ phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ }
+ if(!is.null(pvalThresholdFDR)){
+ phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue"))
+ }
+ plotVector[[length(plotVector)+1]]=p
+
+ #save plotly files
+ pp <- ggplotly(phtml)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Volcanos_",volcanoFileName,".html"),collapse=""),selfcontained = F)
+
+
+ if(iVolcano==nbVolcanosToPlot || length(plotVector)==volcanoPerPage){
+ #plot and close the actual plot
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/Volcanos_",volcanoFileName,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/Volcanos_",volcanoFileName,".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=1)
+ dev.off()
+ if(iVolcano0)))
+#filter out genes with lower FC for all comparisons
+genesToKeep=intersect(genesToKeep,names(which(apply(dataFrame$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0))))
+
+if(length(genesToKeep)>0){
+ dataFrameNew=data.frame(row.names=genesToKeep)
+
+ dataFrameNew$adj_p.value=matrix(dataFrame$adj_p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$adj_p.value))
+ rownames(dataFrameNew$adj_p.value)=genesToKeep
+ colnames(dataFrameNew$adj_p.value)=colnames(dataFrame$p.value)
+
+ dataFrameNew$p.value=matrix(dataFrame$p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$p.value))
+ rownames(dataFrameNew$p.value)=genesToKeep
+ colnames(dataFrameNew$p.value)=colnames(dataFrame$adj_p.value)
+
+ dataFrameNew$coefficients=matrix(dataFrame$coefficients[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$coefficients))
+ rownames(dataFrameNew$coefficients)=genesToKeep
+ colnames(dataFrameNew$coefficients)=colnames(dataFrame$adj_p.value)
+
+ dataFrame=dataFrameNew
+ rm(dataFrameNew)
+}else{
+ addComment("[WARNING]No significative genes",T,opt$log,display=FALSE)
+}
+
+addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE)
+
+
+#plot VennDiagramm for genes below threshold between comparisons
+#t=apply(dataFrame$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold)))
+#get.venn.partitions(t)
+#vennCounts(dataFrame$adj_p.value[,1:4]<=opt$threshold)
+
+#make a simple sort genes based only on the first comparison
+#newOrder=order(dataFrame$adj_p.value[,1])
+#dataFrame$adj_p.value=dataFrame$adj_p.value[newOrder,]
+
+#alternative sorting strategy based on the mean gene rank over all comparisons
+if(length(genesToKeep)>1){
+ currentRank=rep(0,nrow(dataFrame$adj_p.value))
+ for(iVolcano in 1:ncol(dataFrame$adj_p.value)){
+ currentRank=currentRank+rank(dataFrame$adj_p.value[,iVolcano])
+ }
+ currentRank=currentRank/ncol(dataFrame$adj_p.value)
+ newOrder=order(currentRank)
+ rownames(dataFrame)=rownames(dataFrame)[newOrder]
+
+ dataFrame$adj_p.value=matrix(dataFrame$adj_p.value[newOrder,],ncol=ncol(dataFrame$adj_p.value))
+ rownames(dataFrame$adj_p.value)=rownames(dataFrame$p.value)[newOrder]
+ colnames(dataFrame$adj_p.value)=colnames(dataFrame$p.value)
+
+ dataFrame$p.value=matrix(dataFrame$p.value[newOrder,],ncol=ncol(dataFrame$p.value))
+ rownames(dataFrame$p.value)=rownames(dataFrame$adj_p.value)
+ colnames(dataFrame$p.value)=colnames(dataFrame$adj_p.value)
+
+ dataFrame$coefficients=matrix(dataFrame$coefficients[newOrder,],ncol=ncol(dataFrame$coefficients))
+ rownames(dataFrame$coefficients)=rownames(dataFrame$adj_p.value)
+ colnames(dataFrame$coefficients)=colnames(dataFrame$adj_p.value)
+}
+
+#formating output matrix depending on genes to keep
+if(length(genesToKeep)==0){
+ outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3)
+ outputData[1,]=c("X","X",rep(volcanoNameList,each=4))
+ outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value)))
+ outputData[,1]=c("Volcano","Gene","noGene")
+ outputData[,2]=c("Comparison","Info","noInfo")
+}else{
+ if(length(genesToKeep)==1){
+ outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3)
+ outputData[1,]=c("X","X",rep(volcanoNameList,each=4))
+ outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value)))
+ outputData[,1]=c("Volcano","Gene",genesToKeep)
+ outputData[,2]=c("Comparison","Info","na")
+ if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep]
+ outputData[3,seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4)
+ outputData[3,seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4)
+ outputData[3,seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4)
+ outputData[3,seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4)
+ }else{
+ #format matrix to be correctly read by galaxy (move headers in first column and row)
+ outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=nrow(dataFrame$adj_p.value)+2)
+ outputData[1,]=c("X","X",rep(volcanoNameList,each=4))
+ outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value)))
+ outputData[,1]=c("Volcano","Gene",rownames(dataFrame$adj_p.value))
+ outputData[,2]=c("Comparison","Info",rep("na",nrow(dataFrame$adj_p.value)))
+ if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(dataFrame$adj_p.value)]
+ outputData[3:nrow(outputData),seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4)
+ outputData[3:nrow(outputData),seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4)
+ outputData[3:nrow(outputData),seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4)
+ outputData[3:nrow(outputData),seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4)
+ }
+}
+addComment("[INFO]Formated output",T,opt$log,display=FALSE)
+
+#write output results
+write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+
+end.time <- Sys.time()
+addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE)
+
+addComment("[INFO]End of R script",T,opt$log,display=FALSE)
+
+printSessionInfo(opt$log)
+
+#sessionInfo()
\ No newline at end of file
diff -r 000000000000 -r 3022feec50fe src/getopt.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/getopt.R Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,773 @@
+# Copyright (c) 2008-2010 Allen Day
+# Copyright (c) 2011-2013 Trevor L. Davis
+#
+# Modified by J.Vandel 2017 to consider situation of multiple identical flag
+# and concatenate as a vector the set of parameter for the same flag instead of
+# keeping only the last value as done by the previous version.
+#
+# This file is free software: you may copy, redistribute and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 2 of the License, or (at your
+# option) any later version.
+#
+# This file is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+#' C-like getopt behavior
+#'
+#' getopt is primarily intended to be used with ``\link{Rscript}''. It
+#' facilitates writing ``\#!'' shebang scripts that accept short and long
+#' flags/options. It can also be used from ``R'' directly, but is probably less
+#' useful in this context.
+#'
+#' getopt() returns a \link{list} data structure containing \link{names} of the
+#' flags that were present in the \link{character} \link{vector} passed in under
+#' the \emph{opt} argument. Each value of the \link{list} is coerced to the
+#' data type specified according to the value of the \emph{spec} argument. See
+#' below for details.
+#'
+#' Notes on naming convention:
+#'
+#' 1. An \emph{option} is one of the shell-split input strings.
+#'
+#' 2. A \emph{flag} is a type of \emph{option}. a \emph{flag} can be defined as
+#' having no \emph{argument} (defined below), a required \emph{argument}, or an
+#' optional \emph{argument}.
+#'
+#' 3. An \emph{argument} is a type of \emph{option}, and is the value associated
+#' with a flag.
+#'
+#' 4. A \emph{long flag} is a type of \emph{flag}, and begins with the string
+#' ``--''. If the \emph{long flag} has an associated \emph{argument}, it may be
+#' delimited from the \emph{long flag} by either a trailing \emph{=}, or may be
+#' the subsequent \emph{option}.
+#'
+#' 5. A \emph{short flag} is a type of \emph{flag}, and begins with the string
+#' ``-''. If a \emph{short flag} has an associated \emph{argument}, it is the
+#' subsequent \emph{option}. \emph{short flags} may be bundled together,
+#' sharing a single leading ``-'', but only the final \emph{short flag} is able
+#' to have a corresponding \emph{argument}.
+#'
+#' Many users wonder whether they should use the getopt package, optparse package,
+#' or argparse package.
+#' Here is some of the major differences:
+#'
+#' Features available in \code{getopt} unavailable in \code{optparse}
+#'
+#' 1. As well as allowing one to specify options that take either
+#' no argument or a required argument like \code{optparse},
+#' \code{getopt} also allows one to specify option with an optional argument.
+#'
+#' Some features implemented in \code{optparse} package unavailable in \code{getopt}
+#'
+#' 1. Limited support for capturing positional arguments after the optional arguments
+#' when \code{positional_arguments} set to TRUE in \code{parse_args}
+#'
+#' 2. Automatic generation of an help option and printing of help text when encounters an "-h"
+#'
+#' 3. Option to specify default arguments for options as well the
+#' variable name to store option values
+#'
+#' There is also new package \code{argparse} introduced in 2012 which contains
+#' all the features of both getopt and optparse but which has a dependency on
+#' Python 2.7 or 3.2+ and has not been used in production since 2008 or 2009
+#' like the getopt and optparse packages.
+#'
+#' Some Features unlikely to be implemented in \code{getopt}:
+#'
+#' 1. Support for multiple, identical flags, e.g. for "-m 3 -v 5 -v", the
+#' trailing "-v" overrides the preceding "-v 5", result is v=TRUE (or equivalent
+#' typecast).
+#'
+#' 2. Support for multi-valued flags, e.g. "--libpath=/usr/local/lib
+#' --libpath=/tmp/foo".
+#'
+#' 3. Support for lists, e.g. "--define os=linux --define os=redhat" would
+#' set result$os$linux=TRUE and result$os$redhat=TRUE.
+#'
+#' 4. Support for incremental, argument-less flags, e.g. "/path/to/script
+#' -vvv" should set v=3.
+#'
+#' 5. Support partial-but-unique string match on options, e.g. "--verb" and
+#' "--verbose" both match long flag "--verbose".
+#'
+#' 6. No support for mixing in positional arguments or extra arguments that
+#' don't match any options. For example, you can't do "my.R --arg1 1 foo bar
+#' baz" and recover "foo", "bar", "baz" as a list. Likewise for "my.R foo
+#' --arg1 1 bar baz".
+#'
+#' @aliases getopt getopt-package
+#' @param spec The getopt specification, or spec of what options are considered
+#' valid. The specification must be either a 4-5 column \link{matrix}, or a
+#' \link{character} \link{vector} coercible into a 4 column \link{matrix} using
+#' \link{matrix}(x,ncol=4,byrow=TRUE) command. The \link{matrix}/\link{vector}
+#' contains:
+#'
+#' Column 1: the \emph{long flag} name. A multi-\link{character} string.
+#'
+#' Column 2: \emph{short flag} alias of Column 1. A single-\link{character}
+#' string.
+#'
+#' Column 3: \emph{Argument} mask of the \emph{flag}. An \link{integer}.
+#' Possible values: 0=no argument, 1=required argument, 2=optional argument.
+#'
+#' Column 4: Data type to which the \emph{flag}'s argument shall be cast using
+#' \link{storage.mode}. A multi-\link{character} string. This only considered
+#' for same-row Column 3 values of 1,2. Possible values: \link{logical},
+#' \link{integer}, \link{double}, \link{complex}, \link{character}.
+#' If \link{numeric} is encountered then it will be converted to double.
+#'
+#' Column 5 (optional): A brief description of the purpose of the option.
+#'
+#' The terms \emph{option}, \emph{flag}, \emph{long flag}, \emph{short flag},
+#' and \emph{argument} have very specific meanings in the context of this
+#' document. Read the ``Description'' section for definitions.
+#' @param opt This defaults to the return value of \link{commandArgs}(TRUE).
+#'
+#' If R was invoked directly via the ``R'' command, this corresponds to all
+#' arguments passed to R after the ``--args'' flag.
+#'
+#' If R was invoked via the ``\link{Rscript}'' command, this corresponds to all
+#' arguments after the name of the R script file.
+#'
+#' Read about \link{commandArgs} and \link{Rscript} to learn more.
+#' @param command The string to use in the usage message as the name of the
+#' script. See argument \emph{usage}.
+#' @param usage If TRUE, argument \emph{opt} will be ignored and a usage
+#' statement (character string) will be generated and returned from \emph{spec}.
+#' @param debug This is used internally to debug the getopt() function itself.
+#' @author Allen Day
+#' @seealso \code{\link{getopt}}
+#' @keywords data
+#' @export
+#' @examples
+#'
+#' #!/path/to/Rscript
+#' library('getopt');
+#' #get options, using the spec as defined by the enclosed list.
+#' #we read the options from the default: commandArgs(TRUE).
+#' spec = matrix(c(
+#' 'verbose', 'v', 2, "integer",
+#' 'help' , 'h', 0, "logical",
+#' 'count' , 'c', 1, "integer",
+#' 'mean' , 'm', 1, "double",
+#' 'sd' , 's', 1, "double"
+#' ), byrow=TRUE, ncol=4);
+#' opt = getopt(spec);
+#'
+#' # if help was asked for print a friendly message
+#' # and exit with a non-zero error code
+#' if ( !is.null(opt$help) ) {
+#' cat(getopt(spec, usage=TRUE));
+#' q(status=1);
+#' }
+#'
+#' #set some reasonable defaults for the options that are needed,
+#' #but were not specified.
+#' if ( is.null(opt$mean ) ) { opt$mean = 0 }
+#' if ( is.null(opt$sd ) ) { opt$sd = 1 }
+#' if ( is.null(opt$count ) ) { opt$count = 10 }
+#' if ( is.null(opt$verbose ) ) { opt$verbose = FALSE }
+#'
+#' #print some progress messages to stderr, if requested.
+#' if ( opt$verbose ) { write("writing...",stderr()); }
+#'
+#' #do some operation based on user input.
+#' cat(paste(rnorm(opt$count,mean=opt$mean,sd=opt$sd),collapse="\n"));
+#' cat("\n");
+#'
+#' #signal success and exit.
+#' #q(status=0);
+getopt = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) {
+
+ # littler compatibility - map argv vector to opt
+ if (exists("argv", where = .GlobalEnv, inherits = FALSE)) {
+ opt = get("argv", envir = .GlobalEnv);
+ }
+
+ ncol=4;
+ maxcol=6;
+ col.long.name = 1;
+ col.short.name = 2;
+ col.has.argument = 3;
+ col.mode = 4;
+ col.description = 5;
+
+ flag.no.argument = 0;
+ flag.required.argument = 1;
+ flag.optional.argument = 2;
+
+ result = list();
+ result$ARGS = vector(mode="character");
+
+ #no spec. fail.
+ if ( is.null(spec) ) {
+ stop('argument "spec" must be non-null.');
+
+ #spec is not a matrix. attempt to coerce, if possible. issue a warning.
+ } else if ( !is.matrix(spec) ) {
+ if ( length(spec)/4 == as.integer(length(spec)/4) ) {
+ warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion');
+ spec = matrix( spec, ncol=ncol, byrow=TRUE );
+ } else {
+ stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.');
+ }
+
+ #spec is a matrix, but it has too few columns.
+ } else if ( dim(spec)[2] < ncol ) {
+ stop(paste('"spec" should have at least ",ncol," columns.',sep=''));
+
+ #spec is a matrix, but it has too many columns.
+ } else if ( dim(spec)[2] > maxcol ) {
+ stop(paste('"spec" should have no more than ",maxcol," columns.',sep=''));
+
+ #spec is a matrix, and it has some optional columns.
+ } else if ( dim(spec)[2] != ncol ) {
+ ncol = dim(spec)[2];
+ }
+
+ #sanity check. make sure long names are unique, and short names are unique.
+ if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) {
+ stop(paste('redundant long names for flags (column ',col.long.name,').',sep=''));
+ }
+ if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) {
+ stop(paste('redundant short names for flags (column ',col.short.name,').',sep=''));
+ }
+ # convert numeric type to double type
+ spec[,4] <- gsub("numeric", "double", spec[,4])
+
+ # if usage=TRUE, don't process opt, but generate a usage string from the data in spec
+ if ( usage ) {
+ ret = '';
+ ret = paste(ret,"Usage: ",command,sep='');
+ for ( j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,' [-[-',spec[j,col.long.name],'|',spec[j,col.short.name],']',sep='');
+ if (spec[j,col.has.argument] == flag.no.argument) {
+ ret = paste(ret,']',sep='');
+ } else if (spec[j,col.has.argument] == flag.required.argument) {
+ ret = paste(ret,' <',spec[j,col.mode],'>]',sep='');
+ } else if (spec[j,col.has.argument] == flag.optional.argument) {
+ ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep='');
+ }
+ }
+ # include usage strings
+ if ( ncol >= 5 ) {
+ max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]])));
+ ret = paste(ret,"\n",sep='');
+ for (j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,sprintf(paste(" -%s|--%-",max.long,"s %s\n",sep=''),
+ spec[j,col.short.name],spec[j,col.long.name],spec[j,col.description]
+ ),sep='');
+ }
+ }
+ else {
+ ret = paste(ret,"\n",sep='');
+ }
+ return(ret);
+ }
+
+ #XXX check spec validity here. e.g. column three should be convertible to integer
+
+ i = 1;
+
+ while ( i <= length(opt) ) {
+ if ( debug ) print(paste("processing",opt[i]));
+
+ current.flag = 0; #XXX use NA
+ optstring = opt[i];
+
+
+ #long flag
+ if ( substr(optstring, 1, 2) == '--' ) {
+ if ( debug ) print(paste(" long option:",opt[i]));
+
+ optstring = substring(optstring,3);
+
+ this.flag = NA;
+ this.argument = NA;
+ kv = strsplit(optstring, '=')[[1]];
+ if ( !is.na(kv[2]) ) {
+ this.flag = kv[1];
+ this.argument = paste(kv[-1], collapse="=");
+ } else {
+ this.flag = optstring;
+ }
+
+ rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE );
+
+ #long flag is invalid, matches no options
+ if ( length(rowmatch) == 0 ) {
+ stop(paste('long flag "', this.flag, '" is invalid', sep=''));
+
+ #long flag is ambiguous, matches too many options
+ } else if ( length(rowmatch) > 1 ) {
+ # check if there is an exact match and use that
+ rowmatch = which(this.flag == spec[,col.long.name])
+ if(length(rowmatch) == 0) {
+ stop(paste('long flag "', this.flag, '" is ambiguous', sep=''));
+ }
+ }
+
+ #if we have an argument
+ if ( !is.na(this.argument) ) {
+ #if we can't accept the argument, bail out
+ if ( spec[rowmatch, col.has.argument] == flag.no.argument ) {
+ stop(paste('long flag "', this.flag, '" accepts no arguments', sep=''));
+
+ #otherwise assign the argument to the flag
+ } else {
+ storage.mode(this.argument) = spec[rowmatch, col.mode];
+ #don't need here to remove the last value of the vector as argument is in the same string as
+ #the flag name "--flag=argument" so no spurious TRUE was added
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument);
+ i = i + 1;
+ next;
+ }
+
+ #otherwise, we don't have an argument
+ } else {
+ #if we require an argument, bail out
+ ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) {
+ ### stop(paste('long flag "', this.flag, '" requires an argument', sep=''));
+
+ #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there
+ ###} else {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ current.flag = rowmatch;
+ ###}
+ }
+
+ #short flag(s)
+ } else if ( substr(optstring, 1, 1) == '-' ) {
+ if ( debug ) print(paste(" short option:",opt[i]));
+
+ these.flags = strsplit(optstring,'')[[1]];
+
+ done = FALSE;
+ for ( j in 2:length(these.flags) ) {
+ this.flag = these.flags[j];
+ rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE );
+
+ #short flag is invalid, matches no options
+ if ( length(rowmatch) == 0 ) {
+ stop(paste('short flag "', this.flag, '" is invalid', sep=''));
+
+ #short flag is ambiguous, matches too many options
+ } else if ( length(rowmatch) > 1 ) {
+ stop(paste('short flag "', this.flag, '" is ambiguous', sep=''));
+
+ #short flag has an argument, but is not the last in a compound flag string
+ } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) {
+ stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep=''));
+
+ #short flag has no argument, flag it as present
+ } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ done = TRUE;
+
+ #can't definitively process this flag yet, need to see if next option is an argument or not
+ } else {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ current.flag = rowmatch;
+ done = FALSE;
+ }
+ }
+ if ( done ) {
+ i = i + 1;
+ next;
+ }
+ }
+
+ #invalid opt
+ if ( current.flag == 0 ) {
+ stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep=''));
+ #TBD support for positional args
+ #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep=''));
+ #result$ARGS = append(result$ARGS, optstring);
+
+ # some dangling flag, handle it
+ } else if ( current.flag > 0 ) {
+ if ( debug ) print(' dangling flag');
+ if ( length(opt) > i ) {
+ peek.optstring = opt[i + 1];
+ if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep=''));
+
+ #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS
+ #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals).
+ if ( substr(peek.optstring, 1, 1) != '-' |
+ #match negative double
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'double'
+ ) |
+ #match negative integer
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'integer'
+ )
+ ) {
+ if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep=''));
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #a lone dash
+ } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) {
+ if ( debug ) print(' consuming "lone dash" argument');
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #no argument
+ } else {
+ if ( debug ) print(' no argument!');
+
+ #if we require an argument, bail out
+ if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #otherwise set flag as present.
+ } else if (
+ spec[current.flag, col.has.argument] == flag.optional.argument |
+ spec[current.flag, col.has.argument] == flag.no.argument
+ ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop(paste("This should never happen.",
+ "Is your spec argument correct? Maybe you forgot to set",
+ "ncol=4, byrow=TRUE in your matrix call?"));
+ }
+ }
+ #trailing flag without required argument
+ } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #trailing flag without optional argument
+ } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+
+ #trailing flag without argument
+ } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop("this should never happen (2). please inform the author.");
+ }
+ #no dangling flag, nothing to do.
+ } else {
+ }
+
+ i = i+1;
+ }
+ return(result);
+}
+
+
+
+#########################
+#set a modified version using only long named parameters
+
+getoptLong = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) {
+
+ # littler compatibility - map argv vector to opt
+ if (exists("argv", where = .GlobalEnv, inherits = FALSE)) {
+ opt = get("argv", envir = .GlobalEnv);
+ }
+
+ ncol=4;
+ maxcol=6;
+ col.long.name = 1;
+ #col.short.name = 2;
+ col.has.argument = 3;
+ col.mode = 4;
+ col.description = 5;
+
+ flag.no.argument = 0;
+ flag.required.argument = 1;
+ flag.optional.argument = 2;
+
+ result = list();
+ result$ARGS = vector(mode="character");
+
+ #no spec. fail.
+ if ( is.null(spec) ) {
+ stop('argument "spec" must be non-null.');
+
+ #spec is not a matrix. attempt to coerce, if possible. issue a warning.
+ } else if ( !is.matrix(spec) ) {
+ if ( length(spec)/4 == as.integer(length(spec)/4) ) {
+ warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion');
+ spec = matrix( spec, ncol=ncol, byrow=TRUE );
+ } else {
+ stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.');
+ }
+
+ #spec is a matrix, but it has too few columns.
+ } else if ( dim(spec)[2] < ncol ) {
+ stop(paste('"spec" should have at least ",ncol," columns.',sep=''));
+
+ #spec is a matrix, but it has too many columns.
+ } else if ( dim(spec)[2] > maxcol ) {
+ stop(paste('"spec" should have no more than ",maxcol," columns.',sep=''));
+
+ #spec is a matrix, and it has some optional columns.
+ } else if ( dim(spec)[2] != ncol ) {
+ ncol = dim(spec)[2];
+ }
+
+ #sanity check. make sure long names are unique, and short names are unique.
+ if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) {
+ stop(paste('redundant long names for flags (column ',col.long.name,').',sep=''));
+ }
+ # if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) {
+ # stop(paste('redundant short names for flags (column ',col.short.name,').',sep=''));
+ # }
+ # convert numeric type to double type
+ spec[,4] <- gsub("numeric", "double", spec[,4])
+
+ # if usage=TRUE, don't process opt, but generate a usage string from the data in spec
+ if ( usage ) {
+ ret = '';
+ ret = paste(ret,"Usage: ",command,sep='');
+ for ( j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,' [-[-',spec[j,col.long.name],']',sep='');
+ if (spec[j,col.has.argument] == flag.no.argument) {
+ ret = paste(ret,']',sep='');
+ } else if (spec[j,col.has.argument] == flag.required.argument) {
+ ret = paste(ret,' <',spec[j,col.mode],'>]',sep='');
+ } else if (spec[j,col.has.argument] == flag.optional.argument) {
+ ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep='');
+ }
+ }
+ # include usage strings
+ if ( ncol >= 5 ) {
+ max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]])));
+ ret = paste(ret,"\n",sep='');
+ for (j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,sprintf(paste("--%-",max.long,"s %s\n",sep='')
+ ,spec[j,col.long.name],spec[j,col.description]
+ ),sep='');
+ }
+ }
+ else {
+ ret = paste(ret,"\n",sep='');
+ }
+ return(ret);
+ }
+
+ #XXX check spec validity here. e.g. column three should be convertible to integer
+
+ i = 1;
+
+ while ( i <= length(opt) ) {
+ if ( debug ) print(paste("processing",opt[i]));
+
+ current.flag = 0; #XXX use NA
+ optstring = opt[i];
+
+
+ #long flag
+ if ( substr(optstring, 1, 2) == '--' ) {
+ if ( debug ) print(paste(" long option:",opt[i]));
+
+ optstring = substring(optstring,3);
+
+ this.flag = NA;
+ this.argument = NA;
+ kv = strsplit(optstring, '=')[[1]];
+ if ( !is.na(kv[2]) ) {
+ this.flag = kv[1];
+ this.argument = paste(kv[-1], collapse="=");
+ } else {
+ this.flag = optstring;
+ }
+
+ rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE );
+
+ #long flag is invalid, matches no options
+ if ( length(rowmatch) == 0 ) {
+ stop(paste('long flag "', this.flag, '" is invalid', sep=''));
+
+ #long flag is ambiguous, matches too many options
+ } else if ( length(rowmatch) > 1 ) {
+ # check if there is an exact match and use that
+ rowmatch = which(this.flag == spec[,col.long.name])
+ if(length(rowmatch) == 0) {
+ stop(paste('long flag "', this.flag, '" is ambiguous', sep=''));
+ }
+ }
+
+ #if we have an argument
+ if ( !is.na(this.argument) ) {
+ #if we can't accept the argument, bail out
+ if ( spec[rowmatch, col.has.argument] == flag.no.argument ) {
+ stop(paste('long flag "', this.flag, '" accepts no arguments', sep=''));
+
+ #otherwise assign the argument to the flag
+ } else {
+ storage.mode(this.argument) = spec[rowmatch, col.mode];
+ #don't need here to remove the last value of the vector as argument is in the same string as
+ #the flag name "--flag=argument" so no spurious TRUE was added
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument);
+ i = i + 1;
+ next;
+ }
+
+ #otherwise, we don't have an argument
+ } else {
+ #if we require an argument, bail out
+ ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) {
+ ### stop(paste('long flag "', this.flag, '" requires an argument', sep=''));
+
+ #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there
+ ###} else {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ current.flag = rowmatch;
+ ###}
+ }
+
+ #short flag(s)
+ }
+ #else if ( substr(optstring, 1, 1) == '-' ) {
+ # if ( debug ) print(paste(" short option:",opt[i]));
+ #
+ # these.flags = strsplit(optstring,'')[[1]];
+ #
+ # done = FALSE;
+ # for ( j in 2:length(these.flags) ) {
+ # this.flag = these.flags[j];
+ # rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE );
+ #
+ # #short flag is invalid, matches no options
+ # if ( length(rowmatch) == 0 ) {
+ # stop(paste('short flag "', this.flag, '" is invalid', sep=''));
+ #
+ # #short flag is ambiguous, matches too many options
+ # } else if ( length(rowmatch) > 1 ) {
+ # stop(paste('short flag "', this.flag, '" is ambiguous', sep=''));
+ #
+ # #short flag has an argument, but is not the last in a compound flag string
+ # } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) {
+ # stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep=''));
+ #
+ # #short flag has no argument, flag it as present
+ # } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) {
+ # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ # done = TRUE;
+ #
+ # #can't definitively process this flag yet, need to see if next option is an argument or not
+ # } else {
+ # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ # current.flag = rowmatch;
+ # done = FALSE;
+ # }
+ # }
+ # if ( done ) {
+ # i = i + 1;
+ # next;
+ # }
+ # }
+
+ #invalid opt
+ if ( current.flag == 0 ) {
+ stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep=''));
+ #TBD support for positional args
+ #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep=''));
+ #result$ARGS = append(result$ARGS, optstring);
+
+ # some dangling flag, handle it
+ } else if ( current.flag > 0 ) {
+ if ( debug ) print(' dangling flag');
+ if ( length(opt) > i ) {
+ peek.optstring = opt[i + 1];
+ if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep=''));
+
+ #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS
+ #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals).
+ if ( substr(peek.optstring, 1, 1) != '-' |
+ #match negative double
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'double'
+ ) |
+ #match negative integer
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'integer'
+ )
+ ) {
+ if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep=''));
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #a lone dash
+ } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) {
+ if ( debug ) print(' consuming "lone dash" argument');
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #no argument
+ } else {
+ if ( debug ) print(' no argument!');
+
+ #if we require an argument, bail out
+ if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #otherwise set flag as present.
+ } else if (
+ spec[current.flag, col.has.argument] == flag.optional.argument |
+ spec[current.flag, col.has.argument] == flag.no.argument
+ ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop(paste("This should never happen.",
+ "Is your spec argument correct? Maybe you forgot to set",
+ "ncol=4, byrow=TRUE in your matrix call?"));
+ }
+ }
+ #trailing flag without required argument
+ } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #trailing flag without optional argument
+ } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+
+ #trailing flag without argument
+ } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop("this should never happen (2). please inform the author.");
+ }
+ #no dangling flag, nothing to do.
+ } else {
+ }
+
+ i = i+1;
+ }
+ return(result);
+}
+
diff -r 000000000000 -r 3022feec50fe src/heatMapClustering.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/heatMapClustering.R Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,896 @@
+# A command-line interface to plot heatmap based on expression or diff. exp. analysis
+# written by Jimmy Vandel
+# one of these arguments is required:
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE, OutDec=".")
+
+#get options
+args <- commandArgs()
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "expressionFile", "x", 1, "character",
+ "diffAnalyseFile", "x", 1, "character",
+ "factorInfo","x", 1, "character",
+ "genericData","x", 0, "logical",
+ "comparisonName","x",1,"character",
+ "comparisonNameLow","x",1,"character",
+ "comparisonNameHigh","x",1,"character",
+ "filterInputOutput","x", 1, "character",
+ "FCthreshold","x", 1, "double",
+ "pvalThreshold","x", 1, "double",
+ "geneListFiltering","x",1,"character",
+ "clusterNumber","x",1,"integer",
+ "maxRows","x",1,"integer",
+ "sampleClusterNumber","x",1,"integer",
+ "dataTransformation","x",1,"character",
+ "distanceMeasure","x",1,"character",
+ "aggloMethod","x",1,"character",
+ "personalColors","x",1,"character",
+ "sideBarColorPalette","x",1,"character",
+ "format", "x", 1, "character",
+ "quiet", "x", 0, "logical",
+ "log", "x", 1, "character",
+ "outputFile" , "x", 1, "character"),
+ byrow=TRUE, ncol=4)
+opt <- getoptLong(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$outputFile)) {
+ addComment("[ERROR]'output file' is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if(is.null(opt$expressionFile) && !is.null(opt$genericData)){
+ addComment("[ERROR]generic data clustering is based on expression clustering",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$clusterNumber) || opt$clusterNumber<2) {
+ addComment("[ERROR]valid genes clusters number is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$sampleClusterNumber) || opt$sampleClusterNumber<1) {
+ addComment("[ERROR]valid samples clusters number is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$dataTransformation)) {
+ addComment("[ERROR]data transformation option is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$distanceMeasure)) {
+ addComment("[ERROR]distance measure option is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$aggloMethod)) {
+ addComment("[ERROR]agglomeration method option is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$maxRows) || opt$maxRows<2) {
+ addComment("[ERROR]valid plotted row number is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt[["comparisonName"]]) && nchar(opt[["comparisonName"]])==0){
+ addComment("[ERROR]you have to specify comparison",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$comparisonNameLow) && nchar(opt$comparisonNameLow)==0){
+ addComment("[ERROR]you have to specify comparisonLow",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$comparisonNameHigh) && nchar(opt$comparisonNameHigh)==0){
+ addComment("[ERROR]you have to specify comparisonHigh",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$genericData) && (!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh))){
+ addComment("[ERROR]comparisonLow and comparisonHigh can be specified only with generic data",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$genericData) && !is.null(opt[["comparisonName"]])){
+ addComment("[ERROR]basic comparison cannot be specified for generic data",T,opt$log)
+ q( "no", 1, F )
+}
+
+if ((!is.null(opt[["comparisonName"]]) || !is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)) && is.null(opt$diffAnalyseFile)) {
+ addComment("[ERROR]'diff. exp. analysis file' is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$genericData) && !is.null(opt$diffAnalyseFile) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh)){
+ addComment("[ERROR]Missing comparison information for filtering",T,opt$log)
+ q( "no", 1, F )
+}
+
+if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && (is.null(opt[["comparisonName"]]) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh))) {
+ addComment("[ERROR]'comparisons' are missing for filtering",T,opt$log)
+ q( "no", 1, F )
+}
+
+if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && !is.null(opt$geneListFiltering)) {
+ addComment("[ERROR]Cannot have two filtering strategies",T,opt$log)
+ q( "no", 1, F )
+}
+
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE}
+
+addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots and HTML
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#silent package loading
+suppressPackageStartupMessages({
+ library("plotly")
+ library("dendextend")
+ #library("ggdendro")
+ #library("plyr")
+ library("ggplot2")
+ library("heatmaply")
+ library("circlize")
+ #library("RColorBrewer")
+ #source("https://bioconductor.org/biocLite.R")
+ #biocLite("ComplexHeatmap")
+ library("ComplexHeatmap")
+ #library("processx")
+})
+
+expressionToCluster=!is.null(opt$expressionFile)
+
+#load input data files
+if(expressionToCluster){
+ #first expression data
+ expressionMatrix=read.csv(file=opt$expressionFile,header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames (to avoid X before colnames with header=T)
+ colNamesData=expressionMatrix[1,-1]
+ expressionMatrix=expressionMatrix[-1,]
+ #remove first colum to convert it as rownames
+ rowNamesData=expressionMatrix[,1]
+ expressionMatrix=expressionMatrix[,-1]
+ if(is.data.frame(expressionMatrix)){
+ expressionMatrix=data.matrix(expressionMatrix)
+ }else{
+ expressionMatrix=data.matrix(as.numeric(expressionMatrix))
+ }
+ dimnames(expressionMatrix)=list(rowNamesData,colNamesData)
+
+ #check input files
+ if (!is.numeric(expressionMatrix)) {
+ addComment("[ERROR]Expression data is not fully numeric!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ addComment("[INFO]Expression data loaded and checked")
+ addComment(c("[INFO]Dim of expression matrix:",dim(expressionMatrix)),T,opt$log,display=FALSE)
+}
+
+nbComparisons=0
+nbColPerContrast=5
+comparisonMatrix=NULL
+comparisonMatrixInfoGene=NULL
+#if available comparisons
+if(!is.null(opt[["comparisonName"]])){
+ #load results from differential expression analysis
+ #consider first row contains column names
+ comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t")
+ colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,]))
+ #remove the second line also as it's information line (p-val,FDR.p-val,FC,logFC)
+ comparisonMatrix=comparisonMatrix[-c(1,2),]
+ #remove first and second colums, convert the first one as rownames
+ rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1]))
+ #and save second column content that contain geneInfo
+ comparisonMatrixInfoGene=as.character(unlist(comparisonMatrix[,2]))
+ names(comparisonMatrixInfoGene)=as.character(unlist(comparisonMatrix[,1]))
+ comparisonMatrix=comparisonMatrix[,-c(1,2)]
+
+ comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix))
+
+ if (ncol(comparisonMatrix)%%nbColPerContrast != 0) {
+ addComment("[ERROR]Diff. exp. data does not contain good number of columns per contrast, should contains in this order:p-val,FDR.p-val,FC,log2(FC) and t-stat",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if(max(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])>1 || min(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])<0){
+ addComment("[ERROR]Seem that diff. exp. data does not contain correct values for p-val and FDR.p-val columns, should be including in [0,1] interval",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if (!is.numeric(comparisonMatrix)) {
+ addComment("[ERROR]Diff. exp. data is not fully numeric!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){
+ addComment("[WARNING]All genes from diff. exp. file are not included in expression file",T,opt$log,display=FALSE)
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){
+ addComment("[WARNING]All genes from expression file are not included in diff. exp. file",T,opt$log,display=FALSE)
+ }
+
+ addComment("[INFO]Diff. exp. analysis loaded and checked",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of original comparison matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+
+ #restrict to user specified comparisons
+ restrictedComparisons=unlist(strsplit(opt[["comparisonName"]],","))
+ #should be improved to avoid selection of column names starting too similarly
+ colToKeep=which(unlist(lapply(colnames(comparisonMatrix),function(x)any(startsWith(x,restrictedComparisons)))))
+ comparisonMatrix=matrix(comparisonMatrix[,colToKeep],ncol=length(colToKeep),dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[colToKeep]))
+
+ #get number of required comparisons
+ nbComparisons=ncol(comparisonMatrix)/nbColPerContrast
+
+ addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+}
+
+#should be only the case with generic data
+if(!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)){
+ #load generic data used for filtering
+ nbColPerContrast=1
+ #consider first row contains column names
+ comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t")
+ colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,]))
+ #remove first colum, convert the first one as rownames
+ rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1]))
+ comparisonMatrix=comparisonMatrix[-1,-1]
+
+ comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix))
+
+ if (!is.numeric(comparisonMatrix)) {
+ addComment("[ERROR]Filtering matrix is not fully numeric!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){
+ addComment("[WARNING]All genes from filtering file are not included in expression file",T,opt$log,display=FALSE)
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){
+ addComment("[WARNING]All genes from expression file are not included in filtering file",T,opt$log,display=FALSE)
+ }
+
+ addComment("[INFO]Filtering file loaded and checked",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of original filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+
+ #restrict to user specified comparisons
+ restrictedComparisons=c()
+ if(!is.null(opt$comparisonNameLow))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameLow,","))))
+ if(!is.null(opt$comparisonNameHigh))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameHigh,","))))
+
+ if (!all(restrictedComparisons%in%colnames(comparisonMatrix))){
+ addComment("[ERROR]Selected columns in filtering file are not present in filtering matrix!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+ comparisonMatrix=matrix(comparisonMatrix[,restrictedComparisons],ncol=length(restrictedComparisons),dimnames = list(rownames(comparisonMatrix),restrictedComparisons))
+
+ #get number of required comparisons
+ nbComparisons=ncol(comparisonMatrix)
+
+ addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+}
+
+
+
+factorInfoMatrix=NULL
+if(!is.null(opt$factorInfo)){
+ #get group information
+ #load factors file
+ factorInfoMatrix=read.csv(file=opt$factorInfo,header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames
+ colnames(factorInfoMatrix)=factorInfoMatrix[1,]
+ factorInfoMatrix=factorInfoMatrix[-1,]
+ #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+ rownames(factorInfoMatrix)=factorInfoMatrix[,1]
+
+ factorBarColor=colnames(factorInfoMatrix)[2]
+
+ if(ncol(factorInfoMatrix)>2){
+ addComment("[ERROR]Factors file should not contain more than 2 columns",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ #factor file is used for color band on heatmap, so all expression matrix column should be in the factor file
+ if(expressionToCluster && length(setdiff(colnames(expressionMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in factor file",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ #factor file is used for color band on heatmap, so all comparison matrix column should be in the factor file
+ if(!expressionToCluster && length(setdiff(colnames(comparisonMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing differential contrasts in factor file",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ addComment("[INFO]Factors OK",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE)
+}
+
+if(!is.null(opt$personalColors)){
+ ##parse personal colors
+ personalColors=unlist(strsplit(opt$personalColors,","))
+ if(length(personalColors)==2){
+ ##add medium color between two to get three colors
+ personalColors=c(personalColors[1],paste(c("#",as.character(as.hexmode(floor(apply(col2rgb(personalColors),1,mean))))),collapse=""),personalColors[2])
+ }
+ if(length(personalColors)!=3){
+ addComment("[ERROR]Personalized colors doesn't contain enough colors",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+}
+
+
+if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="input"){
+ #filter input data
+
+ if(is.null(opt$geneListFiltering)){
+ #filtering using stat thresholds
+ #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold)))
+ if(is.null(opt$genericData)){
+ #diff. expression matrix
+ rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]log2(opt$FCthreshold))))!=0))))
+ }else{
+ #generic filtering matrix
+ rowToKeep=rownames(comparisonMatrix)
+ if(!is.null(opt$comparisonNameLow)){
+ restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0)))))
+ }
+ if(!is.null(opt$comparisonNameHigh)){
+ restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]nrow(dataToHeatMap)){
+ #correct number of clusters if needed
+ nbClusters=nrow(dataToHeatMap)
+ addComment(c("[WARNING]Not enough rows to reach required clusters number, it is reduced to number of rows:",nbClusters),T,opt$log,display=FALSE)
+}
+
+nbSampleClusters=opt$sampleClusterNumber
+if(nbSampleClusters>ncol(dataToHeatMap)){
+ #correct number of clusters if needed
+ nbSampleClusters=ncol(dataToHeatMap)
+ addComment(c("[WARNING]Not enough columns to reach required conditions clusters number, it is reduced to number of columns:",nbSampleClusters),T,opt$log,display=FALSE)
+}
+
+colClust=FALSE
+rowClust=FALSE
+effectiveRowClust=FALSE
+
+#make appropriate clustering if needed
+if(nrow(dataToHeatMap)>1 && nbClusters>1)rowClust=hclust(distExtended(dataToHeatMap,method = opt$distanceMeasure),method = opt$aggloMethod)
+if(ncol(dataToHeatMap)>1 && nbSampleClusters>1)colClust=hclust(distExtended(t(dataToHeatMap),method = opt$distanceMeasure),method = opt$aggloMethod)
+
+if(nrow(dataToHeatMap)>maxRowsToDisplay){
+ #make subsampling based on preliminary global clustering
+ #clusteringResults=cutree(rowClust,nbClusters)
+ #heatMapGenesToKeep=unlist(lapply(seq(1,nbClusters),function(x)sample(which(clusteringResults==x),min(length(which(clusteringResults==x)),round(maxRowsToDisplay/nbClusters)))))
+ ##OR
+ #basic subsampling
+ heatMapGenesToKeep=sample(rownames(dataToHeatMap),maxRowsToDisplay)
+ effectiveDataToHeatMap=matrix(dataToHeatMap[heatMapGenesToKeep,],ncol=ncol(dataToHeatMap),dimnames=list(heatMapGenesToKeep,colnames(dataToHeatMap)))
+ effectiveNbClusters=min(nbClusters,maxRowsToDisplay)
+ if(nrow(effectiveDataToHeatMap)>1 && effectiveNbClusters>1)effectiveRowClust=hclust(distExtended(effectiveDataToHeatMap, method = opt$distanceMeasure),method = opt$aggloMethod)
+ addComment(c("[WARNING]Too many rows for efficient heatmap drawing",maxRowsToDisplay,"subsampling is done for vizualization only"),T,opt$log,display=FALSE)
+ rm(heatMapGenesToKeep)
+}else{
+ effectiveDataToHeatMap=dataToHeatMap
+ effectiveRowClust=rowClust
+ effectiveNbClusters=nbClusters
+}
+
+addComment(c("[INFO]Dim of plotted heatmap matrix:",dim(effectiveDataToHeatMap)),T,opt$log,display=FALSE)
+
+personalized_hoverinfo=matrix("",ncol = ncol(effectiveDataToHeatMap),nrow = nrow(effectiveDataToHeatMap),dimnames = dimnames(effectiveDataToHeatMap))
+if(expressionToCluster){
+ for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\n",valueMeaning,": ",effectiveDataToHeatMap[iRow,iCol]),collapse="")}}
+}else{
+ for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\nFC: ",round(2^effectiveDataToHeatMap[iRow,iCol],2)),collapse="")}}
+}
+
+#trying to overcome limitation of heatmaply package to modify xtick and ytick label, using directly plotly functions, but for now plotly do not permit to have personalized color for each x/y tick separately
+test=FALSE
+if(test==TRUE){
+
+ #define dendogram shapes
+ dd.row <- as.dendrogram(effectiveRowClust)
+ dd.col <- as.dendrogram(colClust)
+
+ #and color them
+ dd.row=color_branches(dd.row, k = effectiveNbClusters, groupLabels = T)
+ dd.col=color_branches(dd.col, k = nbSampleClusters, groupLabels = T)
+
+ #generating function for dendogram from segment list
+ ggdend <- function(df) {
+ ggplot() +
+ geom_segment(data = df, aes(x=x, y=y, xend=xend, yend=yend)) +
+ labs(x = "", y = "") + theme_minimal() +
+ theme(axis.text = element_blank(), axis.ticks = element_blank(),
+ panel.grid = element_blank())
+ }
+
+ # generate x/y dendogram plots
+ px <- ggdend(dendro_data(dd.col)$segments)
+ py <- ggdend(dendro_data(dd.row)$segments) + coord_flip()
+
+ # reshape data matrix
+ col.ord <- order.dendrogram(dd.col)
+ row.ord <- order.dendrogram(dd.row)
+ xx <- effectiveDataToHeatMap[row.ord, col.ord]
+ # and also personalized_hoverinfo
+ personalized_hoverinfo=personalized_hoverinfo[row.ord, col.ord]
+
+ # hide axis ticks and grid lines
+ eaxis <- list(
+ showticklabels = FALSE,
+ showgrid = FALSE,
+ zeroline = FALSE
+ )
+
+ #make the empty plot
+ p_empty <- plot_ly() %>%
+ layout(margin = list(l = 200),
+ xaxis = eaxis,
+ yaxis = eaxis)
+
+ heatmap.plotly <- plot_ly(
+ z = xx, x = 1:ncol(xx), y = 1:nrow(xx), colors = viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno"),
+ type = "heatmap", showlegend = FALSE, text = personalized_hoverinfo, hoverinfo = "text",
+ colorbar = list(
+ # Capitalise first letter
+ title = valueMeaning,
+ tickmode = "array",
+ len = 0.3
+ )
+ ) %>%
+ layout(
+ xaxis = list(
+ tickfont = list(size = 10,color=get_leaves_branches_col(dd.row)),
+ tickangle = 45,
+ tickvals = 1:ncol(xx), ticktext = colnames(xx),
+ linecolor = "#ffffff",
+ range = c(0.5, ncol(xx) + 0.5),
+ showticklabels = TRUE
+ ),
+ yaxis = list(
+ tickfont = list(size = 10, color=get_leaves_branches_col(dd.col)),
+ tickangle = 0,
+ tickvals = 1:nrow(xx), ticktext = rownames(xx),
+ linecolor = "#ffffff",
+ range = c(0.5, nrow(xx) + 0.5),
+ showticklabels = TRUE
+ )
+ )
+
+ #generate plotly
+ pp <- subplot(px, p_empty, heatmap.plotly, py, nrows = 2, margin = 0,widths = c(0.8,0.2),heights = c(0.2,0.8), shareX = TRUE,
+ shareY = TRUE)
+
+ #save image file
+ export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+ #rise a bug due to token stuf
+ #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+
+
+ #save plotLy file
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F)
+
+ #htmlwidgets::saveWidget(as_widget(pp),"~/Bureau/test.html",selfcontained = F)
+
+}else{ #test
+ label_names=c("Probe","Condition",valueMeaning)
+
+ # #color hclust objects
+ # dd.row=color_branches(effectiveRowClust, k = effectiveNbClusters)
+ # #rowColors=get_leaves_branches_col(dd.row)
+ # #rowColors[order.dendrogram(dd.row)]=rowColors
+ # rowGroup=cutree(effectiveRowClust, k = effectiveNbClusters)
+ #
+ # #get order of class as they will be displayed on the dendogram
+ # rowGroupRenamed=data.frame(cluster=mapvalues(rowGroup, unique(rowGroup[order.dendrogram(dd.row)[nleaves(dd.row):1]]), 1:effectiveNbClusters))
+ #
+ # dd.col=color_branches(colClust, k = nbSampleClusters)
+ # #colColors=get_leaves_branches_col(dd.col)
+ # #colColors[order.dendrogram(dd.col)]=colColors
+ # colGroup=cutree(colClust, k = nbSampleClusters)
+ #
+ # # #get order of class as they will be displayed on the dendogram
+ # colGroupRenamed=data.frame(sampleCluster=mapvalues(colGroup, unique(colGroup[order.dendrogram(dd.col)[nleaves(dd.col):1]]), 1:nbSampleClusters))
+
+
+ #while option is not correctly managed by heatmap apply, put personalized_hoverinfo to NULL
+ personalized_hoverinfo=NULL
+
+ if(is.null(opt$personalColors)){
+ heatmapColors=viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno")
+ }else{
+ heatmapColors=personalColors
+ }
+
+ colGroupRenamed=NULL
+ if(!is.null(factorInfoMatrix)){
+ colGroupRenamed=eval(parse(text=(paste("data.frame(",factorBarColor,"=factorInfoMatrix[colnames(effectiveDataToHeatMap),2])",sep=""))))
+ sideBarGroupNb=length(table(factorInfoMatrix[colnames(effectiveDataToHeatMap),2]))
+ sideBarColorPaletteName="Spectral"
+ if(!is.null(opt$sideBarColorPalette) && opt$sideBarColorPalette%in%rownames(RColorBrewer::brewer.pal.info)){
+ sideBarColorPaletteName=opt$sideBarColorPalette
+ }
+ sideBarColorPalette=setNames(colorRampPalette(RColorBrewer::brewer.pal(RColorBrewer::brewer.pal.info[sideBarColorPaletteName,"maxcolors"], sideBarColorPaletteName))(sideBarGroupNb),unique(factorInfoMatrix[colnames(effectiveDataToHeatMap),2]))
+ }
+
+ if(!is.null(colGroupRenamed)){
+ pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,col_side_colors=colGroupRenamed,col_side_palette=sideBarColorPalette,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors)
+ }else{
+ pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors)
+ }
+
+
+ #save image file
+ export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+ #rise a bug due to token stuf
+ #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+
+
+ #save plotLy file
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F)
+
+}
+addComment("[INFO]Heatmap drawn",T,opt$log,display=FALSE)
+
+
+#plot circular heatmap
+if(!class(effectiveRowClust)=="logical"){
+ dendo=as.dendrogram(effectiveRowClust)
+
+ if(is.null(opt$personalColors)){
+ col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.01)), viridis(101,option = "inferno"))
+ }else{
+ col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.5)), personalColors)
+ }
+
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/circularPlot.pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/circularPlot.png"),collapse=""))
+ }
+
+ circos.par(cell.padding = c(0, 0, 0, 0), gap.degree = 5)
+ circos.initialize(c(rep("a",nrow(effectiveDataToHeatMap)),"b"),xlim=cbind(c(0,0),c(nrow(effectiveDataToHeatMap),5)))
+ circos.track(ylim = c(0, 1), bg.border = NA, panel.fun = function(x, y) {
+ if(CELL_META$sector.index=="a"){
+ nr = ncol(effectiveDataToHeatMap)
+ nc = nrow(effectiveDataToHeatMap)
+ circos.text(1:nc- 0.5, rep(0,nc), adj = c(0, 0),
+ rownames(effectiveDataToHeatMap)[order.dendrogram(dendo)], facing = "clockwise", niceFacing = TRUE, cex = 0.3)
+ }
+ })
+
+ circos.track(ylim = c(0, ncol(effectiveDataToHeatMap)), bg.border = NA, panel.fun = function(x, y) {
+
+ m = t(matrix(effectiveDataToHeatMap[order.dendrogram(dendo),],ncol=ncol(effectiveDataToHeatMap)))
+ col_mat = col_fun(m)
+ nr = nrow(m)
+ nc = ncol(m)
+ if(CELL_META$sector.index=="a"){
+ for(i in 1:nr) {
+ circos.rect(1:nc - 1, rep(nr - i, nc),
+ 1:nc, rep(nr - i + 1, nc),
+ border = col_mat[i, ], col = col_mat[i, ])
+ }
+ }else{
+ circos.text(rep(1,nr), seq(nr,1,-1) , colnames(effectiveDataToHeatMap),cex = 0.3)
+ }
+ })
+
+ #dendo = color_branches(dendo, k = effectiveNbClusters, col = colorRampPalette(brewer.pal(12,"Set3"))(effectiveNbClusters))
+ dendo = color_branches(dendo, k = effectiveNbClusters, col = rev(colorspace::rainbow_hcl(effectiveNbClusters)))
+
+
+ circos.track(ylim = c(0, attributes(dendo)$height), bg.border = NA, track.height = 0.25,
+ panel.fun = function(x, y) {
+ if(CELL_META$sector.index=="a")circos.dendrogram(dendo)} )
+
+ circos.clear()
+ ##add legend
+ lgd_links = Legend(at = seq(ceiling(min(effectiveDataToHeatMap)),floor(max(effectiveDataToHeatMap)),ceiling((floor(max(effectiveDataToHeatMap))-ceiling(min(effectiveDataToHeatMap)))/4)), col_fun = col_fun,
+ title_position = "topleft", grid_width = unit(5, "mm") ,title = valueMeaning)
+
+ pushViewport(viewport(x = 0.85, y = 0.80,
+ width = 0.1,
+ height = 0.1,
+ just = c("left", "bottom")))
+ grid.draw(lgd_links)
+ upViewport()
+
+
+ dev.off()
+
+ addComment("[INFO]Circular heatmap drawn",T,opt$log,display=FALSE)
+ loc <- Sys.setlocale("LC_NUMERIC","C")
+}else{
+ addComment(c("[WARNING]Circular plot will not be plotted considering row or cluster number < 2"),T,opt$log,display=FALSE)
+}
+rm(effectiveDataToHeatMap,effectiveRowClust,effectiveNbClusters)
+
+#plot screeplot
+if(class(rowClust)!="logical" && nrow(dataToHeatMap)>2){
+ screePlotData=c()
+ for(iNbClusters in 2:(nbClusters+min(10,max(0,nrow(dataToHeatMap)-nbClusters)))){
+ clusteringResults=cutree(rowClust,iNbClusters)
+ #clusteringResults=kmeans(dataToHeatMap,iNbClusters)$cluster
+
+ #compute variance between each intra-class points amongst themselves (need at least 3 points by cluster)
+ #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>2){var(dist(dataToHeatMap[temp,]))}else{0}}))) )
+ #compute variance between each intra-class points and fictive mean point (need at least 2 points by cluster)
+ #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ var(dist(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]))[1:length(temp)]) }else{0}}))) )
+ if(ncol(dataToHeatMap)>1)screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((distExtended(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]),method = opt$distanceMeasure)[1:length(temp)])^2) }else{0}}))) )
+ else screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((dataToHeatMap[temp,]-mean(dataToHeatMap[temp,]))^2) }else{0}}))) )
+ }
+
+ dataToPlot=data.frame(clusterNb=seq(2,length(screePlotData)+1),wcss=screePlotData)
+ p <- ggplot(data=dataToPlot, aes(clusterNb,wcss)) + geom_point(colour="#EE4444") + geom_line(colour="#DD9999") +
+ ggtitle("Scree plot") + theme_bw() + xlab(label="Cluster number") + ylab(label="Within cluster sum of squares") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") +
+ scale_x_continuous(breaks=seq(min(dataToPlot$clusterNb), max(dataToPlot$clusterNb), 1))
+
+ #save plotly files
+ pp <- ggplotly(p)
+
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/screePlot.pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/screePlot.png"),collapse=""))
+ }
+ plot(p)
+ dev.off()
+
+ #save plotly files
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/screePlot.html"),collapse=""),selfcontained = F)
+
+ addComment("[INFO]Scree plot drawn",T,opt$log,display=FALSE)
+}else{
+ addComment(c("[WARNING]Scree plot will not be plotted considering row number <= 2"),T,opt$log,display=FALSE)
+}
+
+##----------------------
+
+#filter output based on parameters
+
+rowToKeep=rownames(dataToHeatMap)
+if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="output"){
+ #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold)))
+ if(is.null(opt$geneListFiltering)){
+ if(is.null(opt$genericData)){
+ #diff. expression matrix
+ rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<=opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>=log2(opt$FCthreshold))))!=0))))
+ }else{
+ #generic filtering matrix
+ rowToKeep=rownames(comparisonMatrix)
+ if(!is.null(opt$comparisonNameLow)){
+ restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0)))))
+ }
+ if(!is.null(opt$comparisonNameHigh)){
+ restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]
+#
+# This file is free software: you may copy, redistribute and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 2 of the License, or (at your
+# option) any later version.
+#
+# This file is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+
+#extendedDist function to correlation measure
+distExtended <- function(x,method) {
+ if(method %in% c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"))return(dist(x,method = method))
+ if(method %in% c("pearson", "spearman", "kendall"))return(as.dist(1-cor(t(x),method=method))/2)
+ if(method %in% c("absPearson", "absSpearman", "absKendall"))return(as.dist(1-abs(cor(t(x),method=method))))
+ return(NULL)
+}
+
+##comment function to display message and optionnaly add it to log file
+
+addComment <- function(text,addToFile=FALSE,fileName=NULL,append=TRUE,display=TRUE){
+ if(display)cat(paste(c(text,"\n"),collapse = " "))
+ if(addToFile)write(paste(text,collapse = " "),fileName,append=append)
+}
+
+printSessionInfo <- function(fileName=NULL,append=TRUE){
+ addComment("[INFO]R session info :",T,fileName,display=FALSE)
+ tempInfo=sessionInfo()
+ write(paste(tempInfo$R.version$version.string),fileName,append=append)
+ write(paste("Platform",tempInfo$platform,sep = " : "),fileName,append=append)
+ write(paste("Running under",tempInfo$running,sep = " : "),fileName,append=append)
+ write(paste("Local variables",tempInfo$locale,sep = " : "),fileName,append=append)
+ write(paste("Attached base packages",paste(tempInfo$basePkgs,collapse = "; "),sep = " : "),fileName,append=append)
+ if(length(tempInfo$otherPkgs)>0){
+ lineToPrint=""
+ for(iPack in tempInfo$otherPkgs){
+ lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "")
+ }
+ write(paste("Other attached packages",lineToPrint,sep = " : "),fileName,append=append)
+ }
+ if(length(tempInfo$loadedOnly)>0){
+ lineToPrint=""
+ for(iPack in tempInfo$loadedOnly){
+ lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "")
+ }
+ write(paste("Loaded packages",lineToPrint,sep = " : "),fileName,append=append)
+ }
+}
+
+##negative of a mathematical expression
+negativeExpression <- function(expression){
+ expression=gsub("\\+","_toMinus_",expression)
+ expression=gsub("\\-","+",expression)
+ expression=gsub("_toMinus_","-",expression)
+ if(substr(expression,1,1)!="-" && substr(expression,1,1)!="+"){
+ expression=paste(c("-",expression),collapse="")
+ }
+
+ return(expression)
+}
+
+#' Returns file name of calling Rscript
+#'
+#' \code{get_Rscript_filename} returns the file name of calling Rscript
+#' @return A string with the filename of the calling script.
+#' If not found (i.e. you are in a interactive session) returns NA.
+#'
+#' @export
+get_Rscript_filename <- function() {
+ prog <- sub("--file=", "", grep("--file=", commandArgs(), value=TRUE)[1])
+ if( .Platform$OS.type == "windows") {
+ prog <- gsub("\\\\", "\\\\\\\\", prog)
+ }
+ prog
+}
+
+#' Recursively sorts a list
+#'
+#' \code{sort_list} returns a sorted list
+#' @param unsorted_list A list.
+#' @return A sorted list.
+#' @export
+sort_list <- function(unsorted_list) {
+ for(ii in seq(along=unsorted_list)) {
+ if(is.list(unsorted_list[[ii]])) {
+ unsorted_list[[ii]] <- sort_list(unsorted_list[[ii]])
+ }
+ }
+ unsorted_list[sort(names(unsorted_list))]
+}
+
+
+# Multiple plot function
+#
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols: Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+#
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+ library(grid)
+
+ # Make a list from the ... arguments and plotlist
+ plots <- c(list(...), plotlist)
+
+ numPlots = length(plots)
+
+ # If layout is NULL, then use 'cols' to determine layout
+ if (is.null(layout)) {
+ # Make the panel
+ # ncol: Number of columns of plots
+ # nrow: Number of rows needed, calculated from # of cols
+ layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+ ncol = cols, nrow = ceiling(numPlots/cols))
+ }
+
+ if (numPlots==1) {
+ print(plots[[1]])
+
+ } else {
+ # Set up the page
+ grid.newpage()
+ pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+
+ # Make each plot, in the correct location
+ for (i in 1:numPlots) {
+ # Get the i,j matrix positions of the regions that contain this subplot
+ matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+
+ print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+ layout.pos.col = matchidx$col))
+ }
+ }
+}