# HG changeset patch
# User vandelj
# Date 1593178511 14400
# Node ID 708f43bda2b659a79bb326db395cb5d3e4ddce3e
"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
diff -r 000000000000 -r 708f43bda2b6 galaxy/wrappers/ArrayNormalization.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy/wrappers/ArrayNormalization.xml Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,361 @@
+
+ Apply Affymetrix Power Tool summarize function to .CEL collection
+
+ apt-probeset-summarize
+
+
+
+
+
+
+
+
+
+ > $log 2>&1;
+
+ret_code=\$?;
+
+printf "[INFO]Normalized files: "$inputData >> $log;
+
+if [ \$ret_code != 0 ]; then
+ exit \$ret_code;
+else
+
+ ##launch post-treatment scripts
+
+ bash $scriptFormat;
+ #if $annotationCondition.addAnnotation=="true":
+ echo "[INFO]Annotation file used:" >> $log;
+ #if $mainCondition.arraySelection=="other":
+ bash $scriptAnnotation $mainCondition.annotationFile;
+ echo $mainCondition.annotationFile.name >> $log;
+ #else:
+ #if $normLevel=="core":
+ bash $scriptAnnotation $mainCondition.arrayID.fields.pathAnnotTrans;
+ #else:
+ bash $scriptAnnotation $mainCondition.arrayID.fields.pathAnnotProbe;
+ #end if
+ cat $mainCondition.arrayID.fields.versionInfo >> $log;
+ #end if
+ #end if
+ fi;
+ printf "[INFO]End of tool script" >> $log;
+ ]]>
+
+
+
+
+
+
+> ./tempColumnName.txt
+#end for
+awk ' BEGIN{firstLine=1;OFS="";ORS=""} ARGIND==1{tab[FNR]=$1;nbCol=FNR} ARGIND==2 && !($1 ~ /\\#/){if(firstLine==0){print "\n"\$0}else{print "Conditions";for(i=1;i<=nbCol;i++){print "\t"tab[i]};firstLine=0}}' ./tempColumnName.txt ./apt_output/*summary.txt > $outputData
+
+if [ ! -e $outputData ]; then
+ printf "[ERROR]Formating results failed" >> $log;
+ exit 10
+fi
+]]>
+
+
+
+1{if(\$1 in dico){\$1=dico[\$1]"_("\$1")";print \$0}}' \$1 $outputData > ./tempData
+ #else:
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0} ARGIND==2 && FNR>1{if(\$1 in dico){\$1=dico[\$1]"_("\$1")"};print \$0}' \$1 $outputData > ./tempData
+ #end if
+
+#end if
+#if $annotationCondition.mergingMethod=="mean":
+
+ #if $annotationCondition.keepAnnotated=="true":
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){cpt[dico[\$1]]++;for(i=2;i<=nbCol;i++)sum[dico[\$1]][i]+=\$i}} END{for(iName in cpt){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"sum[iName][iCol]/cpt[iName]};print line}}' \$1 $outputData > ./tempData
+ #else:
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){cpt[dico[\$1]]++;for(i=2;i<=nbCol;i++)sum[dico[\$1]][i]+=\$i}else{print \$0}} END{for(iName in cpt){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"sum[iName][iCol]/cpt[iName]};print line}}' \$1 $outputData > ./tempData
+ #end if
+
+#end if
+
+#if $annotationCondition.mergingMethod=="higherVar":
+
+ #if $annotationCondition.keepAnnotated=="true":
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(curVar>var[dico[\$1]]){var[dico[\$1]]=curVar;for(i=2;i<=nbCol;i++){bestVar[dico[\$1]][i]=\$i}}}} END{for(iName in bestVar){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"bestVar[iName][iCol]};print line}}' \$1 $outputData > ./tempData
+ #else:
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(curVar>var[dico[\$1]]){var[dico[\$1]]=curVar;for(i=2;i<=nbCol;i++){bestVar[dico[\$1]][i]=\$i}}}else{print \$0}} END{for(iName in bestVar){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"bestVar[iName][iCol]};print line}}' \$1 $outputData > ./tempData
+ #end if
+
+#end if
+
+#if $annotationCondition.mergingMethod=="lowerVar":
+
+ #if $annotationCondition.keepAnnotated=="true":
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(var[dico[\$1]]==0 || curVar ./tempData
+ #else:
+ awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(var[dico[\$1]]==0 || curVar ./tempData
+ #end if
+
+#end if
+
+if [ -e ./tempData ]; then
+ mv -f ./tempData $outputData
+else
+ printf "[ERROR]Annotation merge failed" >> $log;
+ exit 10
+fi
+
+#end if
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870}
+ }
+
+ @online{apt, author = {Thermo Fisher Scientific}, title = {Affymetrix Power Tools}, publisher = {Life Technologies Corp.}, address = {USA, MA}, version = {2.10.2}, year = {2018}, url = {https://www.affymetrix.com/support/developer/powertools/changelog/index.html}
+ }
+
+
diff -r 000000000000 -r 708f43bda2b6 galaxy/wrappers/tool-data/aptTool.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy/wrappers/tool-data/aptTool.loc.sample Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,17 @@
+#This file lists the locations of all files required by apt tool
+#under the "apt" directory (a directory that contains a directory
+#for each chip). This file has the TAB delimited format :
+#
+#
+#
+#
+HTA2.0 HTA 2.0 arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/HTA2.0.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/HTA2.0.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/HTA2.0.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/HTA2.0.bgp /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/HTA2.0.cdf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/formatedAnnotation36.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/formatedAnnotation36.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HTA2.0/versionInfo.txt
+Mouse430A1.0 Mouse Genome 430A 1.0 arrays null null null null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430A_1.0/Mouse430A_1.0.cdf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430A_1.0/formatedAnnotation.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430A_1.0/formatedAnnotation.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430A_1.0/versionInfo.txt
+Mouse430B1.0 Mouse Genome 430B 1.0 arrays null null null null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430B_1.0/Mouse430B_1.0.cdf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430B_1.0/formatedAnnotation.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430B_1.0/formatedAnnotation.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430B_1.0/versionInfo.txt
+Mouse4302.0 Mouse Genome 430 2.0 arrays null null null null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430_2.0/Mouse430_2.0.cdf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430_2.0/formatedAnnotation.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430_2.0/formatedAnnotation.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MouseGenome430_2.0/versionInfo.txt
+MoGene1.0ST Mouse Gene 1.0 ST arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/MoGene1.0ST.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/MoGene1.0ST.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/MoGene1.0ST.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/MoGene1.0ST.bgp null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/formatedAnnotation.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/formatedAnnotation.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.0ST/versionInfo.txt
+MoGene1.1ST Mouse Gene 1.1 ST arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/MoGene1.1ST.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/MoGene1.1ST.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/MoGene1.1ST.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/MoGene1.1ST.bgp null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/formatedAnnotation.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/formatedAnnotation.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene1.1ST/versionInfo.txt
+MoGene2.0ST Mouse Gene 2.0 ST arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/MoGene2.0ST.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/MoGene2.0ST.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/MoGene2.0ST.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/MoGene2.0ST.bgp null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/formatedAnnotation.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/formatedAnnotation.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MoGene2.0ST/versionInfo.txt
+MTA1.0 Mouse Transcriptome 1.0 arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/MTA1.0.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/MTA1.0.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/MTA1.0.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/MTA1.0.bgp null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/formatedAnnotation.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/formatedAnnotation.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/MTA1.0/versionInfo.txt
+HuGene1.0ST Human Gene 1.0 ST arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/HuGene1.0ST.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/HuGene1.0ST.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/HuGene1.0ST.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/HuGene1.0ST.bgp null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/formatedAnnotation.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/formatedAnnotation.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene1.0ST/versionInfo.txt
+HuGene2.0ST Human Gene 2.0 ST arrays /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/HuGene2.0ST.pgf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/HuGene2.0ST.clf /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/HuGene2.0ST.mps /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/HuGene2.0ST.bgp null /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/formatedAnnotation.transcript.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/formatedAnnotation.probeset.csv /home/galaxy/Data_Storage/Galaxy_tool-data/affyChipConfig/HuGene2.0ST/versionInfo.txt
diff -r 000000000000 -r 708f43bda2b6 galaxy/wrappers/tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy/wrappers/tool_data_table_conf.xml.sample Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,13 @@
+
+
+
+
+ value, name, pathPGF, pathCLF, pathMPS, pathBGP, pathCDF, pathAnnotTrans, pathAnnotProbe, versionInfo
+
+
+
+
+ value, name, rowTypeID, rowTypeName
+
+
+
diff -r 000000000000 -r 708f43bda2b6 src/ExprPlotsScript.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ExprPlotsScript.R Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,465 @@
+# A command-line interface to basic plots for use with Galaxy
+# written by Jimmy Vandel
+# one of these arguments is required:
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+#get options
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs()
+
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "dataFile", "i", 1, "character",
+ "factorInfo","t", 1, "character",
+ "dataFileFormat","j",1,"character",
+ "conditionNames","c",1,"character",
+ "format", "f", 1, "character",
+ "quiet", "q", 0, "logical",
+ "log", "l", 1, "character",
+ "histo" , "h", 1, "character",
+ "maPlot" , "a", 1, "character",
+ "boxplot" , "b", 1, "character",
+ "microarray" , "m", 1, "character",
+ "acp" , "p" , 1, "character",
+ "screePlot" , "s" , 1, "character"),
+ byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$dataFile) || is.null(opt$dataFileFormat)) {
+ addComment("[ERROR]'dataFile' and it format are required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$histo) & is.null(opt$maPlot) & is.null(opt$boxplot) & is.null(opt$microarray) & is.null(opt$acp)){
+ addComment("[ERROR]Select at least one plot to draw",T,opt$log)
+ q( "no", 1, F )
+}
+
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE}
+
+addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#silent package loading
+suppressPackageStartupMessages({
+ library("oligo")
+ library("ff")
+ library("ggplot2")
+ library("plotly")
+})
+
+
+#chargement des fichiers en entrée
+#fichier de type CEL
+dataAreFromCel=FALSE
+if(toupper(opt$dataFileFormat)=="CEL"){
+ dataAreFromCel=TRUE
+ celData=read.celfiles(unlist(strsplit(opt$dataFile,",")))
+ #load all expressions
+ dataMatrix=exprs(celData)
+ #select "pm" probes
+ probeInfo=getProbeInfo(celData,probeType = c("pm"),target="probeset")
+ #reduce dataMatrix to log expression matrix for a randomly probe selection
+ dataMatrix=log2(dataMatrix[sample(unique(probeInfo[,1]),min(100000,length(unique(probeInfo[,1])))),])
+ addComment("[INFO]Raw data are log2 transformed",TRUE,opt$log,display=FALSE)
+ remove(probeInfo)
+}else{
+ #fichier deja tabule
+ dataMatrix=read.csv(file=opt$dataFile,header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames (to avoid X before colnames with header=T)
+ colNamesData=dataMatrix[1,-1]
+ dataMatrix=dataMatrix[-1,]
+ #remove first colum to convert it as rownames
+ rowNamesData=dataMatrix[,1]
+ dataMatrix=dataMatrix[,-1]
+ if(is.data.frame(dataMatrix)){
+ dataMatrix=data.matrix(dataMatrix)
+ }else{
+ dataMatrix=data.matrix(as.numeric(dataMatrix))
+ }
+ dimnames(dataMatrix)=list(rowNamesData,colNamesData)
+ if(any(duplicated(rowNamesData)))addComment("[WARNING] several rows share the same probe/gene name, you should merge or rename them to avoid further analysis mistakes",TRUE,opt$log,display=FALSE)
+}
+
+addComment("[INFO]Input data loaded",TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Dim of data matrix:",dim(dataMatrix)),T,opt$log,display=FALSE)
+
+#get number of conditions
+nbConditions=ncol(dataMatrix)
+
+#get condition names if they are specified
+if(!is.null(opt$conditionNames) && length(opt$conditionNames)==nbConditions){
+ nameConditions=opt$conditionNames
+ colnames(dataMatrix)=nameConditions
+ #rownames(phenoData(celData)@data)=nameConditions
+ #rownames(protocolData(celData)@data)=nameConditions
+}else{
+ nameConditions=colnames(dataMatrix)
+}
+
+#create a correspondance table between plot file names and name displayed in figure legend and html items
+correspondanceNameTable=matrix("",ncol=2,nrow=nbConditions)
+correspondanceNameTable[,1]=paste("Condition",1:nbConditions,sep="")
+correspondanceNameTable[,2]=nameConditions
+rownames(correspondanceNameTable)=correspondanceNameTable[,2]
+
+addComment("[INFO]Retreive condition names",TRUE,opt$log,display=FALSE)
+
+if(!is.null(opt$factorInfo)){
+ #chargement du fichier des facteurs
+ factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames
+ colnames(factorInfoMatrix)=factorInfoMatrix[1,]
+ factorInfoMatrix=factorInfoMatrix[-1,]
+ #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+ rownames(factorInfoMatrix)=factorInfoMatrix[,1]
+
+
+ if(length(setdiff(colnames(dataMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in factor file",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #order sample as in expression matrix and remove spurious sample
+ factorInfoMatrix=factorInfoMatrix[colnames(dataMatrix),]
+
+ addComment("[INFO]Factors OK",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE)
+
+}
+
+addComment("[INFO]Ready to plot",T,opt$log,display=FALSE)
+
+
+##----------------------
+
+###plot histograms###
+histogramPerFigure=50
+if (!is.null(opt$histo)) {
+ for(iToPlot in 1:(((nbConditions-1)%/%histogramPerFigure)+1)){
+ firstPlot=1+histogramPerFigure*(iToPlot-1)
+ lastPlot=min(nbConditions,histogramPerFigure*iToPlot)
+ dataToPlot=data.frame(x=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix)))
+ p <- ggplot(data=dataToPlot, aes(x = x, color=Experiment)) + stat_density(geom="line", size=1, position="identity") +
+ ggtitle("Intensity densities") + theme_bw() + ylab(label="Density") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5))
+ if(dataAreFromCel){
+ #original ploting function
+ #hist(celData[,firstPlot:lastPlot],lty=rep(1,nbConditions)[firstPlot:lastPlot],lwd=2,which='pm',target="probeset",transfo=log2,col=rainbow(nbConditions)[firstPlot:lastPlot])
+ p <- p + xlab(label="Log2 intensities")
+ }else{
+ p <- p + xlab(label="Intensities")
+ }
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse=""))
+ }
+ print(p)
+ dev.off()
+ #save plotly files
+ pp <- ggplotly(p)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,iToPlot,".html"),collapse=""),selfcontained = F)
+ }
+ remove(p,dataToPlot)
+ addComment("[INFO]Histograms drawn",T,opt$log,display=FALSE)
+}
+
+##----------------------
+
+###plot MAplots###
+MAplotPerPage=4
+if (!is.null(opt$maPlot)) {
+ iToPlot=1
+ plotVector=list()
+ toTake=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix)))
+ refMedianColumn=rowMedians(as.matrix(dataMatrix[toTake,]))
+ if(length(toTake)>100000)addComment(c("[INFO]high number of input data rows ",length(toTake),"; the generation of MA plot can take a while, please be patient"),TRUE,opt$log,display=FALSE)
+ for (iCondition in 1:nbConditions){
+ #MAplot(celData,which=i,what=pm,transfo=log2)
+ #smoothScatter(x=xToPlot,y=yToPlot,main=nameConditions[iCondition])
+ dataA=dataMatrix[toTake,iCondition]
+ dataB=refMedianColumn####ATTENTION PAR DEFAUT
+ xToPlot=0.5*(dataA+dataB)
+ yToPlot=dataA-dataB
+ tempX=seq(min(xToPlot),max(xToPlot),0.1)
+ tempY=unlist(lapply(tempX,function(x){median(yToPlot[intersect(which(xToPlot>=(x-0.1/2)),which(xToPlot<(x+0.1/2)))])}))
+
+ dataToPlot=data.frame(x=xToPlot,y=yToPlot)
+ dataMedianToPlot=data.frame(x=tempX,y=tempY)
+ p <- ggplot(data=dataToPlot, aes(x,y)) + stat_density2d(aes(fill = ..density..^0.25), geom = "tile", contour = FALSE, n = 100) +
+ scale_fill_continuous(low = "white", high = "dodgerblue4") + geom_smooth(data=dataMedianToPlot,colour="red", size=0.5, se=FALSE) +
+ ggtitle(correspondanceNameTable[iCondition,2]) + theme_bw() + xlab(label="") + ylab(label="") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none")
+ plotVector[[length(plotVector)+1]]=p
+
+ #save plotly files
+ pp <- ggplotly(p)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$maPlot,"_",correspondanceNameTable[iCondition,1],".html"),collapse=""),selfcontained = F)
+
+ if(iCondition==nbConditions || length(plotVector)==MAplotPerPage){
+ #define a new plotting file
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$maPlot,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$maPlot,iToPlot,".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=2)
+ dev.off()
+ if(iCondition0)c(sample(c(x),min(length(x),1000)),max(c(x)),min(c(x))))
+ dataOutliers=data.frame(yVal=unlist(boxplotsOutliers),xVal=unlist(lapply(seq_along(boxplotsOutliers),function(x)rep(names(boxplotsOutliers)[x],length(boxplotsOutliers[[x]])))))
+ #plot boxplots without outliers
+ p <- ggplot(data=dataToPlot, aes(y = intensities, x=Experiment ,color=Experiment)) + geom_boxplot(outlier.colour=NA,outlier.shape =NA) +
+ ggtitle("Intensities") + theme_bw() + xlab(label="") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 45, hjust = 1),plot.margin=unit(c(10,10,max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x))))),15+max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x)))))),"mm"))
+ #add to plot sampled outliers
+ p <- p + geom_point(data=dataOutliers,aes(x=xVal,y=yVal,color=xVal),inherit.aes = F)
+ if(dataAreFromCel){
+ #original plotting function
+ #boxplot(celData[,firstPlot:lastPlot],which='pm',col=rainbow(nbConditions)[firstPlot:lastPlot],target="probeset",transfo=log2,names=nameConditions[firstPlot:lastPlot],main="Intensities")
+ p <- p + ylab(label="Log2 intensities")
+ }else{
+ p <- p + ylab(label="Intensities")
+ }
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$boxplot,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$boxplot,iToPlot,".png"),collapse=""))
+ }
+ print(p)
+ dev.off()
+ #save plotly files
+ pp <- ggplotly(p)
+
+ #modify plotly object to get HTML file not too heavy for loading
+ for(iData in 1:length(pp$x$data)){
+ ##get kept outliers y values
+ #yPointsToKeep=dataOutliers$yVal[which(dataOutliers$xVal==pp$x$data[[iData]]$name)]
+ if(pp$x$data[[iData]]$type=="scatter"){
+ ##scatter plot represent outliers points added to boxplot through geom_point
+ ##nothing to do as outliers have been sampled allready, just have to modify hover text
+ #if(length(yPointsToKeep)>0){
+ #pointsToKeep=which(pp$x$data[[iData]]$y %in% yPointsToKeep)
+ #pp$x$data[[iData]]$x=pp$x$data[[iData]]$x[pointsToKeep]
+ #pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[pointsToKeep]
+ #pp$x$data[[iData]]$text=pp$x$data[[iData]]$text[pointsToKeep]
+ #}else{
+ #pp$x$data[[iData]]$x=NULL
+ #pp$x$data[[iData]]$y=NULL
+ #pp$x$data[[iData]]$marker$opacity=0
+ #pp$x$data[[iData]]$hoverinfo=NULL
+ #pp$x$data[[iData]]$text=NULL
+ #}
+ #modify text to display
+ if(dataAreFromCel){
+ pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("log2(intensity) ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = ""))))
+ }else{
+ pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("intensity ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = ""))))
+ }
+ }else{
+ ##disable marker plotting to keep only box and whiskers plot (outliers are displayed through scatter plot)
+ pp$x$data[[iData]]$marker$opacity=0
+
+ #sample 50000 points amongst all data to get a lighter html file, sampling size should not be too low to avoid modifying limit of boxplots
+ pp$x$data[[iData]]$y=c(sample(dataMatrix[,pp$x$data[[iData]]$name],min(length(dataMatrix[,pp$x$data[[iData]]$name]),50000)),min(dataMatrix[,pp$x$data[[iData]]$name]),max(dataMatrix[,pp$x$data[[iData]]$name]))
+ pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y))
+
+ ##first remove outliers info
+ #downUpValues=boxplot.stats(dataMatrix[,pp$x$data[[iData]]$name])$stats
+ #if(verbose)addComment(c("filter values for boxplot",pp$x$data[[iData]]$name,"between",min(downUpValues),"and",max(downUpValues)),T,opt$log)
+ #pointsToRemove=which(pp$x$data[[iData]]$y0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove]
+ #pointsToRemove=which(pp$x$data[[iData]]$y>max(downUpValues))
+ #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove]
+ #then add sampled outliers info
+ #pp$x$data[[iData]]$y=c(yPointsToKeep,pp$x$data[[iData]]$y)
+ #pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y))
+ }
+ }
+
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$boxplot,iToPlot,".html"),collapse=""),selfcontained = F)
+ }
+ remove(p,dataToPlot)
+ addComment("[INFO]Boxplots drawn",T,opt$log,display=FALSE)
+
+}
+
+##----------------------
+
+###plot microarrays (only for .CEL files)###
+if (!is.null(opt$microarray) && dataAreFromCel) {
+ for (iCondition in 1:nbConditions){
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".pdf"),collapse=""),onefile = F,width = 5,height = 5)}else{
+ png(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".png"),collapse=""))
+ }
+ image(celData[,iCondition],main=correspondanceNameTable[iCondition,2])
+ dev.off()
+ }
+ addComment("[INFO]Microarray drawn",T,opt$log,display=FALSE)
+}
+
+##----------------------
+
+###plot PCA plot###
+if (!is.null(opt$acp)){
+ ##to avoid error when nrow is too large, results quite stable with 200k random selected rows
+ randomSelection=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix)))
+ #remove constant variables
+
+ dataFiltered=dataMatrix[randomSelection,]
+ toRemove=which(unlist(apply(dataFiltered,1,var))==0)
+ if(length(toRemove)>0){
+ dataFiltered=dataFiltered[-toRemove,]
+ }
+ ##geom_text(aes(label=Experiments,hjust=1, vjust=1.3), y = PC2+0.01)
+ PACres = prcomp(t(dataFiltered),scale.=TRUE)
+
+ if(!is.null(opt$screePlot)){
+ #scree plot
+ #p <- fviz_eig(PACres)
+ dataToPlot=data.frame(compo=seq(1,length(PACres$sdev)),var=(PACres$sdev^2/sum(PACres$sdev^2))*100)
+ p<-ggplot(data=dataToPlot, aes(x=compo, y=var)) + geom_bar(stat="identity", fill="steelblue") + geom_line() + geom_point() +
+ ggtitle("Scree plot") + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) +
+ xlab(label="Dimensions") + ylab(label="% explained variances") + scale_x_discrete(limits=dataToPlot$compo)
+ pp <- ggplotly(p)
+
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$screePlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$screePlot,".png"),collapse=""))
+ }
+ plot(p)
+ dev.off()
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$screePlot,".html"),collapse=""),selfcontained = F)
+ }
+
+ #now plot pca plots
+
+ if(!is.null(opt$factorInfo)){
+ fileIdent=""
+ symbolset = c("circle","cross","square","diamond","circle-open","square-open","diamond-open","x")
+
+ #save equivalence between real factor names and generic ones in correspondanceNameTable
+ correspondanceNameTable=rbind(correspondanceNameTable,matrix(c(paste("Factor",1:(ncol(factorInfoMatrix)-1),sep=""),colnames(factorInfoMatrix)[-1]),ncol=2,nrow=ncol(factorInfoMatrix)-1))
+ rownames(correspondanceNameTable)=correspondanceNameTable[,2]
+
+ #first order factors from decreasing groups number
+ orderedFactors=colnames(factorInfoMatrix)[-1][order(unlist(lapply(colnames(factorInfoMatrix)[-1],function(x)length(table(factorInfoMatrix[,x])))),decreasing = T)]
+ allFactorsBigger=length(table(factorInfoMatrix[,orderedFactors[length(orderedFactors)]]))>length(symbolset)
+ if(allFactorsBigger)addComment("[WARNING]All factors are composed of too many groups to display two factors at same time, each PCA plot will display only one factor groups",T,opt$log,display=FALSE)
+ for(iFactor in 1:length(orderedFactors)){
+ #if it is the last factor of the list or if all factor
+ if(iFactor==length(orderedFactors) || allFactorsBigger){
+ if(length(orderedFactors)==1 || allFactorsBigger){
+ dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=","))))
+ p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>%
+ layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")),
+ legend=list(font = list(family = "sans-serif",size = 15,color = "#000")))
+ fileIdent=correspondanceNameTable[orderedFactors[iFactor],1]
+ #add text label to plot
+ ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black'))
+ #save the plotly plot
+ htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F)
+ }
+ }else{
+ for(iFactorBis in (iFactor+1):length(orderedFactors)){
+ if(length(table(factorInfoMatrix[,orderedFactors[iFactorBis]]))<=length(symbolset)){
+ dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], Attribute2=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactorBis]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=","))))
+ p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),symbol=~Attribute2,symbols = symbolset,hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>%
+ layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")),
+ legend=list(font = list(family = "sans-serif",size = 15,color = "#000")))
+ fileIdent=paste(correspondanceNameTable[orderedFactors[c(iFactor,iFactorBis)],1],collapse="_AND_")
+ #add text label to plot
+ ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black'))
+ #save the plotly plot
+ htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F)
+ }else{
+ addComment(c("[WARNING]PCA with",orderedFactors[iFactor],"and",orderedFactors[iFactorBis],"groups cannot be displayed, too many groups (max",length(symbolset),")"),T,opt$log,display=FALSE)
+ }
+ }
+ }
+ }
+ }else{
+ dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x))
+ p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers",marker=list(size=5,color="salmon"),hoverinfo = 'text',text = ~paste(Experiments))%>%
+ layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")),
+ legend=list(font = list(family = "sans-serif",size = 15,color = "#000")))
+ ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black'))
+
+ #save plotly files
+ htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_plot.html"),collapse=""),selfcontained = F)
+ }
+ remove(p,dataToPlot,dataFiltered)
+ addComment("[INFO]ACP plot drawn",T,opt$log,display=FALSE)
+}
+
+#write correspondances between plot file names and displayed names in figure legends, usefull to define html items in xml file
+write.table(correspondanceNameTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+end.time <- Sys.time()
+addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE)
+
+addComment("[INFO]End of R script",T,opt$log,display=FALSE)
+
+printSessionInfo(opt$log)
+#sessionInfo()
diff -r 000000000000 -r 708f43bda2b6 src/General_functions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/General_functions.py Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,206 @@
+import re
+import numpy as np
+
+def get_column_names( file_path, toNotConsider=-1, each=1):
+ options=[]
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ cpt=0
+ for i, field_component in enumerate( firstLine ):
+ if i!=toNotConsider:#to squeeze the first column
+ if cpt==0:
+ options.append( ( field_component, field_component, False ) )
+ cpt+=1
+ if cpt==each:
+ cpt=0
+ inputfile.close()
+ return options
+
+def get_column_names_filteredList( file_path, toNotConsider=[], each=1):
+ options=[]
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ cpt=0
+ for i, field_component in enumerate( firstLine ):
+ if i not in toNotConsider:#to squeeze the first columns
+ if cpt==0:
+ options.append( ( field_component, field_component, False ) )
+ cpt+=1
+ if cpt==each:
+ cpt=0
+ inputfile.close()
+ return options
+
+def get_column_names_mergeNumber(file_path, numberToMerge=1, toNotConsider=[]):
+ options=[]
+ inputfile = open(file_path)
+ if int(numberToMerge)>0:
+ iHeader=0
+ for iCurrentLine in inputfile:
+ iHeader=iHeader+1
+ if iHeader>int(numberToMerge):
+ break
+ currentLine=iCurrentLine.strip().split("\t")
+ iOption=-1
+ for i, field_component in enumerate( currentLine ):
+ if i not in toNotConsider:#to squeeze specified columns
+ iOption=iOption+1
+ if iHeader==1:
+ options.append( ( str(field_component), str(field_component), False ) )
+ else:
+ options[iOption]=(options[iOption][0]+"_"+str(field_component),options[iOption][1]+"_"+str(field_component),False)
+ else:
+ currentLine = next(inputfile).strip().split("\t")
+ for i, field_component in enumerate( currentLine ):
+ if i not in toNotConsider:#to squeeze specified columns
+ options.append( ( "Column_"+str(i), "Column_"+str(i), False ) )
+ inputfile.close()
+ return options
+
+def get_row_names( file_path, factorName ):
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ iColumn=-1
+ for i, field_component in enumerate( firstLine ):
+ if field_component==factorName:#to test
+ iColumn=i
+ options=[]
+ if iColumn!=-1:
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ if (nextLine[iColumn], nextLine[iColumn], False) not in options:
+ options.append( (nextLine[iColumn], nextLine[iColumn], False) )
+ inputfile.close()
+ return options
+
+def get_condition_file_names( file_list, toNotConsider=-1, each=1):
+ options=[]
+ if not isinstance(file_list,list):#if input file is a tabular file, act as get_column_names
+ inputfile = open(file_list.file_name)
+ firstLine = next(inputfile).strip().split("\t")
+ cpt=0
+ for i, field_component in enumerate( firstLine ):
+ if i!=toNotConsider:#to squeeze the first column
+ if cpt==0:
+ options.append( ( field_component, field_component, False ) )
+ cpt+=1
+ if cpt==each:
+ cpt=0
+ inputfile.close()
+ else:#if input file is a .cel file list or a collection
+ if not hasattr(file_list[0],'collection'):#if it is not a collection, get name easily
+ for i, field_component in enumerate( file_list ):
+ options.append( ( field_component.name, field_component.name, False ) )
+ else:#if the file is a collection, have to get deeper in the corresponding HistoryDatasetCollectionAssociation object
+ for i, field_component in enumerate( file_list[0].collection.elements ):
+ options.append( ( field_component.element_identifier, field_component.element_identifier, False ) )
+ return options
+
+def generateFactorFile( file_list, factor_list, outputFileName, logFile):
+ forbidenCharacters={"*",":",",","|"}
+ outputfile = open(outputFileName, 'w')
+ outputLog = open(logFile, 'w')
+ sampleList=[]
+ if not isinstance(file_list,list):
+ conditionNames=get_condition_file_names(file_list,0) #unique expression file, remove the first column (index=0)
+ else :
+ conditionNames=get_condition_file_names(file_list) #.CEL files
+ for iSample, sample_component in enumerate (conditionNames):
+ sampleList.append(str(sample_component[1]))
+ outputLog.write("[INFO] "+str(len(sampleList))+" sample are detected as input\n")
+ globalDict=dict()
+ factorNameList=[]
+ firstLine="Conditions"
+ if len(factor_list)==0:#check if there is at least one factor available
+ outputLog.write("[ERROR] no factor was defined !\n")
+ return 1
+ else:
+ for iFactor, factor_component in enumerate( factor_list ):
+ currentSampleList=list(sampleList)
+ currentFactor=str(factor_component['factorName'])
+ #check if factor name contains forbidden characters
+ for specialCharacter in forbidenCharacters:
+ if currentFactor.find(specialCharacter)!=-1:
+ outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in factor name : '"+currentFactor+"'\n")
+ return 4
+ #check if factor allready named like that
+ if not globalDict.get(currentFactor) is None:
+ outputLog.write("[ERROR] '"+currentFactor+"' is used several times as factor name\n")
+ return 3
+ globalDict[currentFactor]=dict()
+ firstLine=firstLine+"\t"+currentFactor
+ factorNameList.append(currentFactor)
+ if len(factor_component['valueList'])<=1:#check if there is at least two value available
+ outputLog.write("[ERROR] at least two different values are necessary for '"+currentFactor+"' factor\n")
+ return 1
+ else:
+ for iValue, value_component in enumerate( factor_component['valueList'] ):
+ currentValue=str(value_component['valueName'])
+ #check if factor name contains forbidden characters
+ for specialCharacter in forbidenCharacters:
+ if currentValue.find(specialCharacter)!=-1:
+ outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in value name : '"+currentValue+"'\n")
+ return 4
+ currentSample=str(value_component['valueConditions']).split(",")
+ for iSample, sample_component in enumerate (currentSample):
+ if not sample_component in currentSampleList:
+ outputLog.write("[ERROR] sample "+sample_component+" was assigned several times for factor '"+currentFactor+"'\n")
+ return 2
+ currentSampleList.remove(sample_component)
+ globalDict[currentFactor][sample_component]=currentValue
+ if(len(currentSampleList)>0):
+ outputLog.write("[ERROR] for factor '"+currentFactor+"'' sample "+str(currentSampleList)+" are not assigned to any value\n")
+ return 2
+ outputLog.write("[INFO] "+str(len(globalDict))+" factors are detected\n")
+ #start writing the factor file
+ outputfile.write(firstLine+"\n")
+ for iSample, sample_component in enumerate(sampleList):
+ newLine=sample_component
+ for iFactor, factor_component in enumerate(factorNameList):
+ newLine=newLine+"\t"+globalDict[factor_component][sample_component]
+ outputfile.write(newLine+"\n")
+ outputfile.close()
+ outputLog.close()
+ return 0
+
+def selectSubSetTable(file_path,headerLine_number,columnsToAdd,columnNamesToKeep,outputFileName,logFile):
+ outputLog = open(logFile, 'w')
+ outputLog.write("[INFO] header line number : "+ headerLine_number+" lines\n")
+ availableColumnsTuple=get_column_names_mergeNumber(file_path, headerLine_number)
+ #convert tuple list as a simple array
+ availableColumns=[]
+ for iTuple, tuple_content in enumerate (availableColumnsTuple):
+ availableColumns.append(str(tuple_content[0]))
+ if len(availableColumns)==0:
+ outputLog.write("[ERROR] No detected columns in input file\n")
+ return 1
+ selectedColumns=list(columnsToAdd)
+ for iVolcano, volcano_content in enumerate(columnNamesToKeep):
+ selectedColumns.append(availableColumns.index(volcano_content['pvalColumn']))
+ if volcano_content['fdrColumn'] in availableColumns:
+ selectedColumns.append(availableColumns.index(volcano_content['fdrColumn']))
+ else:
+ selectedColumns.append(0)
+ selectedColumns.append(availableColumns.index(volcano_content['fcColumn']))
+ if len(selectedColumns)!=(3*len(columnNamesToKeep)+len(columnsToAdd)):
+ outputLog.write("[ERROR] matching between input file colnames and requested column names failed\n")
+ return 1
+ outputLog.write("[INFO] columns kept : "+str(selectedColumns)+"\n")
+ #start writting formatted file
+ inputfile = open(file_path)
+ outputfile = open(outputFileName, 'w')
+ iLineCpt=-1
+ for iCurrentLine in inputfile:
+ iLineCpt=iLineCpt+1
+ if iLineCpt>=int(headerLine_number):
+ currentLineFields=np.array(iCurrentLine.strip().split("\t"))
+ newLine="\t".join(currentLineFields[selectedColumns])
+ outputfile.write(newLine+"\n")
+ if iLineCpt1:
+ if (nextLine[iColumn], nextLine[iColumn], False) not in options:
+ options.append( (nextLine[iColumn], nextLine[iColumn], False) )
+ inputfile.close()
+ return options
+
+def get_row_names_interaction( file_path, factorNameA, factorNameB ):
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ iColumnA=-1
+ iColumnB=-1
+ for i, field_component in enumerate( firstLine ):
+ if field_component==factorNameA:#to test
+ iColumnA=i
+ if field_component==factorNameB:#to test
+ iColumnB=i
+ possibleValuesA=[]
+ possibleValuesB=[]
+ if iColumnA!=-1 and iColumnB!=-1:
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ if nextLine[iColumnA] not in possibleValuesA:
+ possibleValuesA.append(nextLine[iColumnA])
+ if nextLine[iColumnB] not in possibleValuesB:
+ possibleValuesB.append(nextLine[iColumnB])
+ inputfile.close()
+ options=[]
+ if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None":
+ for counterA in range(len(possibleValuesA)):
+ for counterB in range(len(possibleValuesB)):
+ options.append( (possibleValuesA[counterA]+"*"+possibleValuesB[counterB], possibleValuesA[counterA]+"*"+possibleValuesB[counterB], False) )
+ return options
+
+def get_comparisonsA( factorA, valuesA ):
+ options=[]
+ formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA))
+ possibleValues=formatValuesA.split("', u'")
+ if len(possibleValues)>=2:
+ for counter in range(len(possibleValues)-1):
+ for innerCounter in range(counter+1,len(possibleValues)):
+ options.append( (possibleValues[counter]+" - "+possibleValues[innerCounter], possibleValues[counter]+" - "+possibleValues[innerCounter], False) )
+ options.append( (possibleValues[innerCounter]+" - "+possibleValues[counter], possibleValues[innerCounter]+" - "+possibleValues[counter], False) )
+ return options
+
+def get_comparisonsAB(factorA, valuesA, factorB, valuesB, interaction):
+ options=[]
+ formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA))
+ possibleValuesA=formatValuesA.split("', u'")
+ formatValuesB=re.sub("(^\[u')|('\]$)","", str(valuesB))
+ possibleValuesB=formatValuesB.split("', u'")
+ if str(interaction)=="False":
+ if len(possibleValuesA)>=2:
+ for counter in range(len(possibleValuesA)-1):
+ for innerCounter in range(counter+1,len(possibleValuesA)):
+ options.append( (possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], False) )
+ options.append( (possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], False) )
+ if len(possibleValuesB)>=2:
+ for counter in range(len(possibleValuesB)-1):
+ for innerCounter in range(counter+1,len(possibleValuesB)):
+ options.append( (possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], False) )
+ options.append( (possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], False) )
+ else:
+ if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None":
+ for counterA in range(len(possibleValuesA)):
+ for innerCounterA in range(len(possibleValuesA)):
+ for counterB in range(len(possibleValuesB)):
+ for innerCounterB in range(len(possibleValuesB)):
+ if not(counterA==innerCounterA and counterB==innerCounterB):
+ options.append( ("("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")","("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")", False) )
+ return options
+
+def get_row_names_allInteractions( file_path, factorSelected):
+ formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected))
+ factorsList=formatFactors.split("', u'")
+ iColumn=[None] * len(factorsList)
+ valuesList=[None] * len(factorsList)
+
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ for iField, fieldComponent in enumerate( firstLine ):
+ for iFactor, factorComponent in enumerate(factorsList):
+ if fieldComponent==factorComponent:
+ iColumn[iFactor]=iField
+ valuesList[iFactor]=[]
+
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ for iFactor, factorComponent in enumerate(factorsList):
+ if nextLine[iColumn[iFactor]] not in valuesList[iFactor]:
+ valuesList[iFactor].append(nextLine[iColumn[iFactor]])
+ inputfile.close()
+
+ allCombinations=[]
+ for iFactor, factorComponent in enumerate(factorsList):
+ if iFactor==0:
+ allCombinations=valuesList[iFactor]
+ else:
+ currentCombinations=allCombinations
+ allCombinations=[]
+ for iValue, valueComponent in enumerate(valuesList[iFactor]):
+ for iCombination, combination in enumerate(currentCombinations):
+ allCombinations.append(combination+"*"+valueComponent)
+
+ options=[]
+ for iCombination, combination in enumerate(allCombinations):
+ options.append((combination,combination,False))
+
+ return options
+
+def get_allrow_names( file_path, factorSelected ):
+ formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected))
+ factorsList=formatFactors.split("', u'")
+ iColumn=[None] * len(factorsList)
+ valuesList=[None] * len(factorsList)
+
+ inputfile = open(file_path)
+ firstLine = next(inputfile).strip().split("\t")
+ for iField, fieldComponent in enumerate( firstLine ):
+ for iFactor, factorComponent in enumerate(factorsList):
+ if fieldComponent==factorComponent:
+ iColumn[iFactor]=iField
+ valuesList[iFactor]=[]
+
+ for nextLine in inputfile:
+ nextLine=nextLine.strip().split("\t")
+ if len(nextLine)>1:
+ for iFactor, factorComponent in enumerate(factorsList):
+ if nextLine[iColumn[iFactor]] not in valuesList[iFactor]:
+ valuesList[iFactor].append(nextLine[iColumn[iFactor]])
+ inputfile.close()
+
+ allValues=[]
+ for iFactor, factorComponent in enumerate(factorsList):
+ for iValue, valueComponent in enumerate(valuesList[iFactor]):
+ allValues.append(factorComponent+":"+valueComponent)
+
+ options=[]
+ for iValue, valueComponent in enumerate(allValues):
+ options.append((valueComponent,valueComponent,False))
+
+ return options
+
+def replaceNamesInFiles(expressionFile_name,conditionFile_name,outputExpressionFile,outputConditionFile,ouputDictionnary):
+ dico={}
+ forbidenCharacters={"*",":",",","|"}
+ ##start with expression file, read only the first line
+ inputfile = open(expressionFile_name)
+ outputfile = open(outputExpressionFile, 'w')
+ firstLine = next(inputfile).rstrip().split("\t")
+ iCondition=1
+ newFirstLine=""
+ for i, field_component in enumerate( firstLine ):
+ if (i>0):
+ #conditions names should not be redundant with other conditions
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newFirstLine+="\t"+"Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ raise NameError('condition name allready exists!')
+ else:
+ newFirstLine+=field_component
+ outputfile.write(newFirstLine+"\n")
+ for line in inputfile:
+ outputfile.write(line)
+ outputfile.close()
+ inputfile.close()
+ #then parse condition file, read all lines in this case
+ inputfile = open(conditionFile_name)
+ outputfile = open(outputConditionFile, 'w')
+ firstLine=1
+ iFactor=1
+ iValue=1
+ for line in inputfile:
+ currentLine = line.rstrip().split("\t")
+ newCurrentLine=""
+ for i, field_component in enumerate( currentLine ):
+ #special treatment for the first line
+ if (firstLine==1):
+ if (i==0):
+ newCurrentLine=field_component
+ else:
+ #factor names should not be redundant with other factors or conditions
+ if(field_component not in dico):
+ dico[field_component]="Factor"+str(iFactor)
+ newCurrentLine+="\t"+"Factor"+str(iFactor)
+ iFactor+=1
+ else:
+ raise NameError('factor name allready exists!')
+ else:
+ if (i==0):
+ #check if condition name allready exist and used it if it is, or create a new one if not
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newCurrentLine="Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ newCurrentLine=dico[field_component]
+ else:
+ if(field_component not in dico):
+ dico[field_component]="Value"+str(iValue)
+ newCurrentLine+="\tValue"+str(iValue)
+ iValue+=1
+ else:
+ newCurrentLine+="\t"+dico[field_component]
+ outputfile.write(newCurrentLine+"\n")
+ firstLine=0
+ outputfile.close()
+ inputfile.close()
+ ##check if any entries in dictionnary contains forbiden character
+ for key, value in dico.items():
+ for specialCharacter in forbidenCharacters:
+ if value.startswith("Condition")==False and key.find(specialCharacter)!=-1:
+ return 1
+ ##then write dictionnary in a additional file
+ outputfile = open(ouputDictionnary, 'w')
+ for key, value in dico.items():
+ outputfile.write(key+"\t"+value+"\n")
+ outputfile.close()
+ return 0
+
+
+def replaceNamesBlockInFiles(expressionFile_name,conditionFile_name,blockingFile_name,outputExpressionFile,outputConditionFile,outputBlockingFile,ouputDictionnary):
+ dico={}
+ forbidenCharacters={"*",":",",","|"}
+ ##start with expression file, read only the first line
+ inputfile = open(expressionFile_name)
+ outputfile = open(outputExpressionFile, 'w')
+ firstLine = next(inputfile).rstrip().split("\t")
+ iCondition=1
+ newFirstLine=""
+ for i, field_component in enumerate( firstLine ):
+ if (i>0):
+ #conditions names should not be redundant with other conditions
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newFirstLine+="\t"+"Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ raise NameError('condition name allready exists!')
+ else:
+ newFirstLine+=field_component
+ outputfile.write(newFirstLine+"\n")
+ for line in inputfile:
+ outputfile.write(line)
+ outputfile.close()
+ inputfile.close()
+ #then parse condition file, read all lines in this case
+ iFactor=1
+ iValue=1
+ for fileNum in range(2):
+ if fileNum==0:
+ inputfile = open(conditionFile_name)
+ outputfile = open(outputConditionFile, 'w')
+ else:
+ inputfile = open(blockingFile_name)
+ outputfile = open(outputBlockingFile, 'w')
+ firstLine=1
+ for line in inputfile:
+ currentLine = line.rstrip().split("\t")
+ newCurrentLine=""
+ for i, field_component in enumerate( currentLine ):
+ #special treatment for the first line
+ if (firstLine==1):
+ if (i==0):
+ newCurrentLine=field_component
+ else:
+ #factor names should not be redundant with other factors or conditions
+ if(field_component not in dico):
+ dico[field_component]="Factor"+str(iFactor)
+ newCurrentLine+="\t"+"Factor"+str(iFactor)
+ iFactor+=1
+ else:
+ raise NameError('factor name allready exists!')
+ else:
+ if (i==0):
+ #check if condition name allready exist and used it if it is, or create a new one if not
+ if(field_component not in dico):
+ dico[field_component]="Condition"+str(iCondition)
+ newCurrentLine="Condition"+str(iCondition)
+ iCondition+=1
+ else:
+ newCurrentLine=dico[field_component]
+ else:
+ if(field_component not in dico):
+ dico[field_component]="Value"+str(iValue)
+ newCurrentLine+="\tValue"+str(iValue)
+ iValue+=1
+ else:
+ newCurrentLine+="\t"+dico[field_component]
+ outputfile.write(newCurrentLine+"\n")
+ firstLine=0
+ outputfile.close()
+ inputfile.close()
+ ##check if any entries in dictionnary contains forbiden character
+ for key, value in dico.items():
+ for specialCharacter in forbidenCharacters:
+ if value.startswith("Condition")==False and key.find(specialCharacter)!=-1:
+ return 1
+ ##then write dictionnary in a additional file
+ outputfile = open(ouputDictionnary, 'w')
+ for key, value in dico.items():
+ outputfile.write(key+"\t"+value+"\n")
+ outputfile.close()
+ return 0
diff -r 000000000000 -r 708f43bda2b6 src/LIMMAscriptV4.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/LIMMAscriptV4.R Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,1002 @@
+# A command-line interface for LIMMA to use with Galaxy
+# written by Jimmy Vandel
+# one of these arguments is required:
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs()
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "dataFile", "i", 1, "character",
+ "factorInfo","a", 1, "character",
+ "blockingInfo","b", 1, "character",
+ "dicoRenaming","g",1,"character",
+ "blockingPolicy","u", 1, "character",
+ "fdrThreshold","t", 1, "double",
+ "thresholdFC","d", 1, "double",
+ "format", "f", 1, "character",
+ "histo","h", 1, "character",
+ "volcano","v", 1, "character",
+ "factorsContrast","r", 1, "character",
+ "contrastNames","p", 1, "character",
+ "firstGroupContrast","m", 1, "character",
+ "secondGroupContrast","n", 1, "character",
+ "controlGroups","c", 1, "character",
+ "fratioFile","s",1,"character",
+ "organismID","x",1,"character",
+ "rowNameType","y",1,"character",
+ "quiet", "q", 0, "logical",
+ "log", "l", 1, "character",
+ "outputFile" , "o", 1, "character",
+ "outputDfFile" , "z", 1, "character"),
+ byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required\n")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$dataFile)) {
+ addComment("[ERROR]'dataFile' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (!is.null(opt$blockingInfo) && is.null(opt$blockingPolicy) ) {
+ addComment("[ERROR]blocking policy is missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$dicoRenaming)) {
+ addComment("[ERROR]renaming dictionnary is missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$factorsContrast)) {
+ addComment("[ERROR]factor informations are missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (length(opt$firstGroupContrast)!=length(opt$secondGroupContrast)) {
+ addComment("[ERROR]some contrast groups seems to be empty",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$factorInfo)) {
+ addComment("[ERROR]factors info is missing",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fdrThreshold)) {
+ addComment("[ERROR]'FDR threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$outputFile) || is.null(opt$outputDfFile)){
+ addComment("[ERROR]'output files' are required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$thresholdFC)){
+ addComment("[ERROR]'FC threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fratioFile)) {
+ addComment("[ERROR]F-ratio parameter is missing",T,opt$log)
+ q( "no", 1, F )
+}
+
+#demande si le script sera bavard
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE
+}
+
+#paramètres internes
+#pour savoir si on remplace les FC calculés par LIMMA par un calcul du LS-MEAN (ie moyenne de moyennes de chaque groupe dans chaque terme du contraste plutôt qu'une moyenne globale dans chaque terme)
+useLSmean=FALSE
+
+addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#charge des packages silencieusement
+suppressPackageStartupMessages({
+ library("methods")
+ library("limma")
+ library("biomaRt")
+ library("ggplot2")
+ library("plotly")
+ library("stringr")
+ library("RColorBrewer")
+})
+
+
+#chargement du fichier dictionnaire de renommage
+renamingDico=read.csv(file=file.path(getwd(), opt$dicoRenaming),header=F,sep="\t",colClasses="character")
+rownames(renamingDico)=renamingDico[,2]
+
+
+#chargement des fichiers en entrée
+expDataMatrix=read.csv(file=file.path(getwd(), opt$dataFile),header=F,sep="\t",colClasses="character")
+#remove first row to convert it as colnames (to avoid X before colnames with header=T)
+colNamesData=expDataMatrix[1,-1]
+expDataMatrix=expDataMatrix[-1,]
+#remove first colum to convert it as rownames
+rowNamesData=expDataMatrix[,1]
+expDataMatrix=expDataMatrix[,-1]
+if(is.data.frame(expDataMatrix)){
+ expDataMatrix=data.matrix(expDataMatrix)
+}else{
+ expDataMatrix=data.matrix(as.numeric(expDataMatrix))
+}
+dimnames(expDataMatrix)=list(rowNamesData,colNamesData)
+
+#test the number of rows that are constant in dataMatrix
+nbConstantRows=length(which(unlist(apply(expDataMatrix,1,var))==0))
+if(nbConstantRows>0){
+ addComment(c("[WARNING]",nbConstantRows,"rows are constant across conditions in input data file"),T,opt$log,display=FALSE)
+}
+
+#test if all condition names are present in dico
+if(!all(colnames(expDataMatrix) %in% rownames(renamingDico))){
+ addComment("[ERROR]Missing condition names in renaming dictionary",T,opt$log)
+ q( "no", 1, F )
+}
+
+addComment("[INFO]Expression data loaded and checked",T,opt$log,display=FALSE)
+addComment(c("[INFO]Dim of expression matrix:",dim(expDataMatrix)),T,opt$log,display=FALSE)
+
+#chargement du fichier des facteurs
+factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character")
+#remove first row to convert it as colnames
+colnames(factorInfoMatrix)=factorInfoMatrix[1,]
+factorInfoMatrix=factorInfoMatrix[-1,]
+#use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+rownames(factorInfoMatrix)=factorInfoMatrix[,1]
+
+if(length(setdiff(colnames(expDataMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in factor file",T,opt$log)
+ q( "no", 1, F )
+}
+
+#order sample as in expression matrix and remove spurious sample
+factorInfoMatrix=factorInfoMatrix[colnames(expDataMatrix),]
+
+#test if all values names are present in dico
+if(!all(unlist(factorInfoMatrix) %in% rownames(renamingDico))){
+ addComment("[ERROR]Missing factor names in renaming dictionary",T,opt$log)
+ q( "no", 1, F )
+}
+
+addComment("[INFO]Factors OK",T,opt$log,display=FALSE)
+addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE)
+
+##manage blocking factor
+blockingFactor=NULL
+blockinFactorsList=NULL
+if(!is.null(opt$blockingInfo)){
+
+ #chargement du fichier des blocking factors
+ blockingInfoMatrix=read.csv(file=file.path(getwd(), opt$blockingInfo),header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames
+ colnames(blockingInfoMatrix)=blockingInfoMatrix[1,]
+ blockingInfoMatrix=blockingInfoMatrix[-1,]
+ #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+ rownames(blockingInfoMatrix)=blockingInfoMatrix[,1]
+
+
+ if(length(setdiff(colnames(expDataMatrix),rownames(blockingInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in blocking factor file",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #order sample as in expression matrix
+ blockingInfoMatrix=blockingInfoMatrix[colnames(expDataMatrix),]
+
+ #test if all blocking names are present in dico
+ if(!all(unlist(blockingInfoMatrix) %in% rownames(renamingDico))){
+ addComment("[ERROR]Missing blocking names in renaming dictionary",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #remove blocking factors allready present as real factors
+ blockingNotInMainFactors=setdiff(colnames(blockingInfoMatrix)[-1],colnames(factorInfoMatrix)[-1])
+
+ if(length(blockingNotInMainFactors)<(ncol(blockingInfoMatrix)-1))addComment("[WARNING]Blocking factors cannot be principal factors",T,opt$log,display=FALSE)
+
+ if(length(blockingNotInMainFactors)>0){
+
+ blockingInfoMatrix=blockingInfoMatrix[,c(colnames(blockingInfoMatrix)[1],blockingNotInMainFactors)]
+
+ groupBlocking=rep("c",ncol(expDataMatrix))
+ #for each blocking factor
+ for(blockingFact in blockingNotInMainFactors){
+ if(opt$blockingPolicy=="correlated"){
+ indNewFact=as.numeric(factor(blockingInfoMatrix[,blockingFact]))
+ groupBlocking=paste(groupBlocking,indNewFact,sep="_")
+ }else{
+ if(is.null(blockinFactorsList))blockinFactorsList=list()
+ blockinFactorsList[[blockingFact]]=factor(unlist(lapply(blockingInfoMatrix[,blockingFact],function(x)paste(c(blockingFact,"_",x),collapse=""))))
+ }
+ }
+ if(opt$blockingPolicy=="correlated"){
+ blockingFactor=factor(groupBlocking)
+ if(length(levels(blockingFactor))==1){
+ addComment("[ERROR]Selected blocking factors seems to be constant",T,opt$log)
+ q( "no", 1, F )
+ }
+ }
+ addComment("[INFO]Blocking info OK",T,opt$log,display=FALSE)
+ }else{
+ addComment("[WARNING]No blocking factors will be considered",T,opt$log,display=FALSE)
+ }
+}
+
+
+##rename different input parameters using renamingDictionary
+opt$factorsContrast=renamingDico[unlist(lapply(unlist(strsplit(opt$factorsContrast,",")),function(x)which(renamingDico[,1]==x))),2]
+
+userDefinedContrasts=FALSE
+if(!is.null(opt$firstGroupContrast) && !is.null(opt$secondGroupContrast)){
+ userDefinedContrasts=TRUE
+ for(iContrast in 1:length(opt$firstGroupContrast)){
+ opt$firstGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",")
+ opt$secondGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",")
+ }
+}
+
+if(!is.null(opt$controlGroups)){
+ renamedGroups=c()
+ for(iGroup in unlist(strsplit(opt$controlGroups,","))){
+ renamedControlGroup=paste(renamingDico[unlist(lapply(unlist(strsplit(iGroup,":")),function(x)which(renamingDico[,1]==x))),2],collapse=":")
+ if(length(renamedControlGroup)==0 || any(which(unlist(gregexpr(text = renamedControlGroup,pattern = ":"))==-1))){
+ addComment("[ERROR]Control groups for interaction seem to mismatch, please check them.",T,opt$log)
+ q( "no", 1, F )
+ }
+ renamedGroups=c(renamedGroups,renamedControlGroup)
+ }
+ opt$controlGroups=renamedGroups
+}
+addComment("[INFO]Contrast variables are renamed to avoid confusion",T,opt$log,display=FALSE)
+##renaming done
+
+#to convert factor as numeric value --> useless now ?
+#expDataMatrix=apply(expDataMatrix,c(1,2),function(x)as.numeric(paste(x)))
+
+#get factors info for LIMMA
+factorsList=list()
+for(iFactor in opt$factorsContrast){
+ if(!(iFactor %in% colnames(factorInfoMatrix))){
+ addComment("[ERROR]Required factors are missing in input file",T,opt$log)
+ q( "no", 1, F )
+ }
+ factorsList[[iFactor]]=factor(unlist(lapply(factorInfoMatrix[,iFactor],function(x)paste(c(iFactor,"_",x),collapse=""))))
+ if(length(levels(factorsList[[iFactor]]))==1){
+ addComment("[ERROR]One selected factor seems to be constant",T,opt$log)
+ q( "no", 1, F )
+ }
+}
+
+#check if there is at least 2 factors to allow interaction computation
+if(!is.null(opt$controlGroups) && length(factorsList)<2){
+ addComment("[ERROR]You cannot ask for interaction with less than 2 factors",T,opt$log)
+ q( "no", 1, F )
+}
+
+#merge all factors as a single one
+factorsMerged=as.character(factorsList[[opt$factorsContrast[1]]])
+for(iFactor in opt$factorsContrast[-1]){
+ factorsMerged=paste(factorsMerged,as.character(factorsList[[iFactor]]),sep=".")
+}
+factorsMerged=factor(factorsMerged)
+
+#checked that coefficient number (ie. factorsMerged levels) is strictly smaller than sample size
+if(length(levels(factorsMerged))>=length(factorsMerged)){
+ addComment(c("[ERROR]No enough samples (",length(factorsMerged),") to estimate ",length(levels(factorsMerged))," coefficients"),T,opt$log)
+ q( "no", 1, F )
+}
+
+#get the sample size of each factor values
+sampleSizeFactor=table(factorsMerged)
+
+
+if(!is.null(blockinFactorsList)){
+ factorString=c("blockinFactorsList[['", names(blockinFactorsList)[1],"']]")
+ for(blockingFact in names(blockinFactorsList)[-1]){
+ factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]")
+ }
+ design = model.matrix(as.formula(paste(c("~ factorsMerged +",factorString," + 0"),collapse="")))
+
+ #rename design columns
+ coeffMeaning = levels(factorsMerged)
+ for(blockingFact in blockinFactorsList){
+ coeffMeaning=c(coeffMeaning,levels(blockingFact)[-1])
+ }
+ colnames(design) = coeffMeaning
+}else{
+ design = model.matrix(as.formula( ~ factorsMerged + 0))
+
+ #rename degin columns
+ coeffMeaning = levels(factorsMerged)
+ colnames(design) = coeffMeaning
+}
+
+addComment(c("[INFO]Available coefficients: ",coeffMeaning),T,opt$log,display=F)
+
+estimableCoeff=which(colSums(design)!=0)
+
+addComment("[INFO]Design done",T,opt$log,display=F)
+
+ #use blocking factor if exists
+if(!is.null(blockingFactor)){
+ corfit <- duplicateCorrelation(expDataMatrix, design, block=blockingFactor)
+
+ addComment(c("[INFO]Correlation within groups: ",corfit$consensus.correlation),T,opt$log,display=F)
+
+ #run linear model fit
+ data.fit = lmFit(expDataMatrix,design,block = blockingFactor, correlation=corfit$consensus.correlation)
+}else{
+ #run linear model fit
+ data.fit = lmFit(expDataMatrix,design)
+}
+
+estimatedCoeff=which(!is.na(data.fit$coefficients[1,]))
+
+addComment("[INFO]Lmfit done",T,opt$log,display=F)
+
+#catch situation where some coefficients cannot be estimated, probably due to dependances between design columns
+#if(length(setdiff(estimableCoeff,estimatedCoeff))>0){
+# addComment("[ERROR]Error in design matrix, check your group definitions",T,opt$log)
+# q( "no", 1, F )
+#}
+#to strong condition, should return ERROR only when coefficients relative to principal factors cannot be estimated, otherwise, return a simple WARNING
+
+#define requested contrasts
+requiredContrasts=c()
+humanReadingContrasts=c()
+persoContrastName=c()
+if(userDefinedContrasts){
+ for(iContrast in 1:length(opt$firstGroupContrast)){
+ posGroup=unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse=".")))
+ negGroup=unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse=".")))
+ #clear posGroup and negGroup from empty groups
+ emptyPosGroups=which(!(posGroup%in%coeffMeaning))
+ if(length(emptyPosGroups)>0){
+ addComment(c("[WARNING]The group(s)",posGroup[emptyPosGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE)
+ posGroup=posGroup[-emptyPosGroups]
+ currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],","))[-emptyPosGroups],collapse="+")
+ }else{
+ currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),collapse="+")
+ }
+ emptyNegGroups=which(!(negGroup%in%coeffMeaning))
+ if(length(emptyNegGroups)>0){
+ addComment(c("[WARNING]The group(s)",negGroup[emptyNegGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE)
+ negGroup=negGroup[-emptyNegGroups]
+ currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))[-emptyNegGroups]),collapse="-")
+ }else{
+ currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))),collapse="-")
+ }
+ if(length(posGroup)==0 || length(negGroup)==0 ){
+ addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to empty group"),T,opt$log,display=FALSE)
+ }else{
+ if(all(posGroup%in%negGroup) && all(negGroup%in%posGroup)){
+ addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to null contrast"),T,opt$log,display=FALSE)
+ }else{
+ #get coefficients required for first group added as positive
+ positiveCoeffWeights=sampleSizeFactor[posGroup]/sum(sampleSizeFactor[posGroup])
+ #positiveCoeffWeights=rep(1,length(posGroup))
+ #names(positiveCoeffWeights)=names(sampleSizeFactor[posGroup])
+ #get coefficients required for second group added as negative
+ negativeCoeffWeights=sampleSizeFactor[negGroup]/sum(sampleSizeFactor[negGroup])
+ #negativeCoeffWeights=rep(1,length(negGroup))
+ #names(negativeCoeffWeights)=names(sampleSizeFactor[negGroup])
+ #build the resulting contrast
+ currentContrast=paste(paste(positiveCoeffWeights[posGroup],posGroup,sep="*"),collapse="+")
+ currentContrast=paste(c(currentContrast,paste(paste(negativeCoeffWeights[negGroup],negGroup,sep="*"),collapse="-")),collapse="-")
+ requiredContrasts=c(requiredContrasts,currentContrast)
+
+ #build the human reading contrast
+ humanReadingContrasts=c(humanReadingContrasts,currentHumanContrast)
+ if(!is.null(opt$contrastNames) && nchar(opt$contrastNames[iContrast])>0){
+ persoContrastName=c(persoContrastName,opt$contrastNames[iContrast])
+ }else{
+ persoContrastName=c(persoContrastName,"")
+ }
+
+ addComment(c("[INFO]Contrast added : ",currentHumanContrast),T,opt$log,display=F)
+ addComment(c("with complete formula ",currentContrast),T,opt$log,display=F)
+ }
+ }
+ }
+}
+
+
+ #define the true formula with interactions to get interaction coefficients
+ factorString=c("factorsList[['", names(factorsList)[1],"']]")
+ for(iFactor in names(factorsList)[-1]){
+ factorString=c(factorString," * factorsList[['",iFactor,"']]")
+ }
+
+ if(!is.null(blockinFactorsList)){
+ for(blockingFact in names(blockinFactorsList)){
+ factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]")
+ }
+ }
+
+ #should not be null at the end
+ allFtestMeanSquare=NULL
+ #to get the F-test values
+ estimatedInteractions=rownames(anova(lm(as.formula(paste(c("expDataMatrix[1,] ~ ",factorString),collapse="")))))
+ estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x){temp=unlist(strsplit(x,"[ \" | : ]"));paste(temp[seq(2,length(temp),3)],collapse=":")})),estimatedInteractions[length(estimatedInteractions)])
+ #rename estimated interaction terms using renamingDico
+ estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x)paste(renamingDico[unlist(strsplit(x,":")),1],collapse=":"))),estimatedInteractions[length(estimatedInteractions)])
+ t <- unlist(apply(expDataMatrix,1,function(x){temp=anova(lm(as.formula(paste(c("x ~ ",factorString),collapse=""))))$`Mean Sq`;temp/temp[length(temp)]}))
+ allFtestMeanSquare <- t(matrix(t,nrow=length(estimatedInteractions)))
+ #remove from allFtest rows containing NA
+ if(length(which(is.na(allFtestMeanSquare[,1])))>0)allFtestMeanSquare=allFtestMeanSquare[-(which(is.na(allFtestMeanSquare[,1]))),]
+ colnames(allFtestMeanSquare)=estimatedInteractions
+
+ #add contrasts corresponding to interaction terms
+ if(!is.null(opt$controlGroups)){
+ #first load user defined control group for each factor
+ controlGroup=rep(NA,length(factorsList))
+ names(controlGroup)=names(factorsList)
+ for(iGroup in opt$controlGroups){
+ splitGroup=unlist(strsplit(iGroup,":"))
+ splitGroup[2]=paste(splitGroup[1],splitGroup[2],sep = "_")
+ #check if defined control group is really a level of the corresponding factor
+ if(!splitGroup[1]%in%names(controlGroup) || !splitGroup[2]%in%factorsList[[splitGroup[1]]]){
+ addComment(c("[ERROR]The factor name",splitGroup[1],"does not exist or group name",splitGroup[2]),T,opt$log)
+ q( "no", 1, F )
+ }
+ if(!is.na(controlGroup[splitGroup[1]])){
+ addComment("[ERROR]Several control groups are defined for the same factor, please select only one control group for each factor if you want to compute interaction contrasts",T,opt$log)
+ q( "no", 1, F )
+ }
+ controlGroup[splitGroup[1]]=splitGroup[2]
+ }
+
+ #check if all factor have a defined control group
+ if(any(is.na(controlGroup))){
+ addComment("[ERROR]Missing control group for some factors, please check them if you want to compute interaction contrasts",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ nbFactors=length(factorsList)
+ interactionContrasts=c()
+ contrastClass=c()
+ #initialize list for the first level
+ newPreviousLoopContrast=list()
+ for(iFactorA in 1:(nbFactors-1)){
+ nameFactorA=names(factorsList)[iFactorA]
+ compA=c()
+ for(levelA in setdiff(levels(factorsList[[iFactorA]]),controlGroup[nameFactorA])){
+ compA=c(compA,paste(levelA,controlGroup[nameFactorA],sep="-"))
+ }
+ newPreviousLoopContrast[[as.character(iFactorA)]]=compA
+ }
+ #make a loop for growing interaction set
+ for(globalIfactor in 1:(nbFactors-1)){
+ previousLoopContrast=newPreviousLoopContrast
+ newPreviousLoopContrast=list()
+ #factor A reuse contrasts made at previsous loop
+ for(iFactorA in names(previousLoopContrast)){
+ compA=previousLoopContrast[[iFactorA]]
+
+ if(max(as.integer(unlist(strsplit(iFactorA,"\\."))))1 && iFactor0){
+ requiredContrasts=c(requiredContrasts,interactionContrasts[-contrastToIgnore])
+ humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction[-contrastToIgnore])
+ persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction[-contrastToIgnore])))
+ }else{
+ requiredContrasts=c(requiredContrasts,interactionContrasts)
+ humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction)
+ persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction)))
+ }
+ }#end of intreaction contrasts
+
+
+ #remove from requiredContrasts contrasts that cannot be estimated
+ toRemove=unique(unlist(lapply(setdiff(coeffMeaning,names(estimatedCoeff)),function(x)grep(x,requiredContrasts))))
+ if(length(toRemove)>0){
+ addComment(c("[WARNING]",length(toRemove)," contrasts are removed, due to missing coefficients"),T,opt$log,display=FALSE)
+ requiredContrasts=requiredContrasts[-toRemove]
+ humanReadingContrasts=humanReadingContrasts[-toRemove]
+ persoContrastName=persoContrastName[-toRemove]
+ }
+
+ if(length(requiredContrasts)==0){
+ addComment("[ERROR]No contrast to compute, please check your contrast definition.",T,opt$log)
+ q( "no", 1, F )
+ }
+
+ #compute for each contrast mean of coefficients in posGroup and negGroup for FC computation of log(FC) with LSmean as in Partek
+ meanPosGroup=list()
+ meanNegGroup=list()
+ for(iContrast in 1:length(requiredContrasts)){
+ #define posGroup and negGroup
+ #first split contrast
+ temp=unlist(strsplit(requiredContrasts[iContrast],"[ + ]"))
+ splitContrast=temp[1]
+ for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp)
+ splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB}))
+ #and then put each term in good group
+ posGroup=c()
+ negGroup=c()
+ nextIsPos=TRUE
+ for(iSplit in splitContrast){
+ if(iSplit=="+")nextIsPos=TRUE
+ if(iSplit=="-")nextIsPos=FALSE
+ if(iSplit!="-" && iSplit!="+"){
+ #remove weights of contrast terms
+ iSplitBis=unlist(strsplit(iSplit,"[*]"))
+ iSplitBis=iSplitBis[length(iSplitBis)]
+ if(nextIsPos)posGroup=c(posGroup,iSplitBis)
+ else negGroup=c(negGroup,iSplitBis)
+ }
+ }
+ #compute means for each group
+ meanPosGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,posGroup],ncol=length(posGroup)),1,mean)
+ meanNegGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,negGroup],ncol=length(negGroup)),1,mean)
+ }
+
+
+ contrast.matrix = makeContrasts(contrasts=requiredContrasts,levels=design)
+ data.fit.con = contrasts.fit(data.fit,contrast.matrix)
+
+ addComment("[INFO]Contrast definition done",T,opt$log,T,display=FALSE)
+
+ #compute LIMMA statistics
+ data.fit.eb = eBayes(data.fit.con)
+
+ addComment("[INFO]Estimation done",T,opt$log,T,display=FALSE)
+
+ #adjust p.value through FDR
+ data.fit.eb$adj_p.value=data.fit.eb$p.value
+ for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){
+ data.fit.eb$adj_p.value[,iComparison]=p.adjust(data.fit.eb$p.value[,iComparison],"fdr")
+ }
+
+ #add a new field based on LS-means for each contrast instead of global mean like they were calculated in coefficients field
+ data.fit.eb$coefficientsLS=data.fit.eb$coefficients
+ if(ncol(data.fit.eb$coefficients)!=length(meanPosGroup)){
+ addComment("[ERROR]Estimated contrasts number unexpected",T,opt$log)
+ q( "no", 1, F )
+ }
+ for(iContrast in 1:length(meanPosGroup)){
+ data.fit.eb$coefficientsLS[,iContrast]=meanPosGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]-meanNegGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]
+ }
+
+ #if requested replace coefficient computed as global mean by LS-means values
+ if(useLSmean)data.fit.eb$coefficients=data.fit.eb$coefficientsLS
+
+addComment("[INFO]Core treatment done",T,opt$log,T,display=FALSE)
+
+
+##convert humanReadingContrasts with namingDictionary to create humanReadingContrastsRenamed and keep original humanReadingContrasts names for file names
+humanReadingContrastsRenamed=rep("",length(humanReadingContrasts))
+for(iContrast in 1:length(humanReadingContrasts)){
+ if(persoContrastName[iContrast]==""){
+ #if(verbose)addComment(humanReadingContrasts[iContrast])
+ specialCharacters=str_extract_all(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]]
+ #if(verbose)addComment(specialCharacters)
+ nameConverted=unlist(lapply(strsplit(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]],function(x)renamingDico[x,1]))
+ #if(verbose)addComment(nameConverted)
+ humanReadingContrastsRenamed[iContrast]=paste(nameConverted,specialCharacters,collapse="",sep="")
+ #if(verbose)addComment(humanReadingContrastsRenamed[iContrast])
+ humanReadingContrastsRenamed[iContrast]=substr(humanReadingContrastsRenamed[iContrast],1,nchar(humanReadingContrastsRenamed[iContrast])-1)
+ }else{
+ humanReadingContrastsRenamed[iContrast]=persoContrastName[iContrast]
+ }
+}
+
+#write correspondances between plot file names (humanReadingContrasts) and displayed names in figure legends (humanReadingContrastsRenamed), usefull to define html items in xml file
+correspondanceTable=matrix("",ncol=2,nrow=ncol(data.fit.eb$p.value))
+correspondanceTable[,1]=unlist(lapply(humanReadingContrasts,function(x)gsub(":","_INT_",gsub("\\+","_PLUS_",gsub("\\*","_AND_",x)))))
+correspondanceTable[,2]=humanReadingContrastsRenamed
+rownames(correspondanceTable)=correspondanceTable[,2]
+write.table(correspondanceTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+#plot nominal p-val histograms for selected comparisons
+histogramPerPage=6
+if (!is.null(opt$histo)) {
+ iToPlot=1
+ plotVector=list()
+ nbComparisons=ncol(data.fit.eb$p.value)
+ for (iComparison in 1:nbComparisons){
+ dataToPlot=data.frame(pval=data.fit.eb$p.value[,iComparison],id=rownames(data.fit.eb$p.value))
+ p <- ggplot(data=dataToPlot, aes(x=pval)) + geom_histogram(colour="red", fill="salmon") +
+ theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="Frequencies") + xlab(label="Nominal p-val") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5))
+ plotVector[[length(plotVector)+1]]=p
+
+ pp <- ggplotly(p)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F)
+
+ if(iComparison==nbComparisons || length(plotVector)==histogramPerPage){
+ #plot and close the actual plot
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=2)
+ dev.off()
+ if(iComparison0)pvalThresholdFDR=min(probeWithLowFDR)
+
+ #get significant points over FC and FDR thresholds
+ significativePoints=intersect(which(abs(data.fit.eb$coefficients[,iComparison])>=logFCthreshold),which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold))
+
+ #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC)))
+ htmlPointsToRemove=intersect(which(abs(data.fit.eb$coefficients[,iComparison])<=quantile(abs(data.fit.eb$coefficients[,iComparison]),c(0.66))),which(data.fit.eb$p.value[,iComparison]>=quantile(abs(data.fit.eb$p.value[,iComparison]),c(0.33))))
+ if(length(htmlPointsToRemove)>20000){
+ htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000))
+ }else{
+ htmlPointsToRemove=c()
+ }
+
+ xMinLimPlot=min(data.fit.eb$coefficients[,iComparison])-0.2
+ xMaxLimPlot=max(data.fit.eb$coefficients[,iComparison])+0.2
+ yMaxLimPlot= max(-log10(data.fit.eb$p.value[,iComparison]))+0.2
+
+ if(length(significativePoints)>0){
+ dataSignifToPlot=data.frame(pval=-log10(data.fit.eb$p.value[significativePoints,iComparison]),FC=data.fit.eb$coefficients[significativePoints,iComparison],description=paste(names(data.fit.eb$coefficients[significativePoints,iComparison]),"\n","FC: " , round(2^data.fit.eb$coefficients[significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[significativePoints,iComparison],digits=4), sep=""))
+ #to test if remains any normal points to draw
+ if(length(significativePoints)0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description))
+
+ ##interactive plot
+ if(length(htmlPointsToRemove)>0){
+ pointToRemove=union(htmlPointsToRemove,significativePoints)
+ #to test if remains any normal points to draw
+ if(length(pointToRemove)40000)addComment(c("[WARNING]For",humanReadingContrastsRenamed[iComparison],"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log)
+
+ phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>%
+ layout(title = humanReadingContrastsRenamed[iComparison],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-log10(p-val)", showgrid=TRUE, zeroline=FALSE))
+ if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar()
+ if(logFCthreshold!=0){
+ phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ }
+ if(!is.null(pvalThresholdFDR)){
+ phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue"))
+ }
+ plotVector[[length(plotVector)+1]]=p
+
+ #save plotly files
+ pp <- ggplotly(phtml)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F)
+
+
+ if(iComparison==nbComparisons || length(plotVector)==volcanoPerPage){
+ #plot and close the actual plot
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=1)
+ dev.off()
+ if(iComparison0)))
+#filter out genes with lower FC for all comparisons
+genesToKeep=intersect(genesToKeep,names(which(apply(data.fit.eb$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0))))
+
+if(length(genesToKeep)>0){
+ data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[genesToKeep,],ncol=ncol(data.fit.eb$adj_p.value))
+ rownames(data.fit.eb$adj_p.value)=genesToKeep
+ colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value)
+
+ data.fit.eb$p.value=matrix(data.fit.eb$p.value[genesToKeep,],ncol=ncol(data.fit.eb$p.value))
+ rownames(data.fit.eb$p.value)=genesToKeep
+ colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[genesToKeep,],ncol=ncol(data.fit.eb$coefficients))
+ rownames(data.fit.eb$coefficients)=genesToKeep
+ colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$t=matrix(data.fit.eb$t[genesToKeep,],ncol=ncol(data.fit.eb$t))
+ rownames(data.fit.eb$t)=genesToKeep
+ colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value)
+
+ dfMatrix=dfMatrix[genesToKeep,,drop=FALSE]
+
+}else{
+ addComment(c("[WARNING]No significative genes considering the given FDR threshold : ",opt$fdrThreshold),T,opt$log,display=FALSE)
+}
+
+addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE)
+
+
+#plot VennDiagramm for genes below threshold between comparisons
+#t=apply(data.fit.eb$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold)))
+#get.venn.partitions(t)
+#vennCounts(data.fit.eb$adj_p.value[,1:4]<=opt$threshold)
+
+#make a simple sort genes based only on the first comparison
+#newOrder=order(data.fit.eb$adj_p.value[,1])
+#data.fit.eb$adj_p.value=data.fit.eb$adj_p.value[newOrder,]
+
+#alternative sorting strategy based on the mean gene rank over all comparisons
+if(length(genesToKeep)>1){
+ currentRank=rep(0,nrow(data.fit.eb$adj_p.value))
+ for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){
+ currentRank=currentRank+rank(data.fit.eb$adj_p.value[,iComparison])
+ }
+ currentRank=currentRank/ncol(data.fit.eb$adj_p.value)
+ newOrder=order(currentRank)
+
+ data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[newOrder,],ncol=ncol(data.fit.eb$adj_p.value))
+ rownames(data.fit.eb$adj_p.value)=rownames(data.fit.eb$p.value)[newOrder]
+ colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value)
+
+ data.fit.eb$p.value=matrix(data.fit.eb$p.value[newOrder,],ncol=ncol(data.fit.eb$p.value))
+ rownames(data.fit.eb$p.value)=rownames(data.fit.eb$adj_p.value)
+ colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[newOrder,],ncol=ncol(data.fit.eb$coefficients))
+ rownames(data.fit.eb$coefficients)=rownames(data.fit.eb$adj_p.value)
+ colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value)
+
+ data.fit.eb$t=matrix(data.fit.eb$t[newOrder,],ncol=ncol(data.fit.eb$t))
+ rownames(data.fit.eb$t)=rownames(data.fit.eb$adj_p.value)
+ colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value)
+
+ dfMatrix=dfMatrix[newOrder,,drop=FALSE]
+}
+
+
+#formating output matrices depending on genes to keep
+if(length(genesToKeep)==0){
+ outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3)
+ outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5))
+ outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value)))
+ outputData[,1]=c("LIMMA","Gene","noGene")
+ outputData[,2]=c("Comparison","Info","noInfo")
+
+ outputDfData=matrix(0,ncol=3+1,nrow=2)
+ outputDfData[1,]=c("X","df.residual","df.prior","df.total")
+ outputDfData[,1]=c("Statistics","noGene")
+}else{
+ if(length(genesToKeep)==1){
+ outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3)
+ outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5))
+ outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value)))
+ outputData[,1]=c("LIMMA","Gene",genesToKeep)
+ outputData[,2]=c("Comparison","Info","na")
+ if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep]
+ outputData[3,seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4)
+ outputData[3,seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4)
+ outputData[3,seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4)
+ outputData[3,seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4)
+ outputData[3,seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4)
+
+ outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix))
+ outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total")
+ outputDfData[2,]=c(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual","df.prior","df.total")],digits=4))
+ }else{
+ #format matrix to be correctly read by galaxy (move headers in first column and row)
+ outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=nrow(data.fit.eb$adj_p.value)+2)
+ outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5))
+ outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value)))
+ outputData[,1]=c("LIMMA","Gene",rownames(data.fit.eb$adj_p.value))
+ outputData[,2]=c("Comparison","Info",rep("na",nrow(data.fit.eb$adj_p.value)))
+ if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(data.fit.eb$adj_p.value)]
+ outputData[3:nrow(outputData),seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4)
+ outputData[3:nrow(outputData),seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4)
+ outputData[3:nrow(outputData),seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4)
+ outputData[3:nrow(outputData),seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4)
+ outputData[3:nrow(outputData),seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4)
+
+ outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix))
+ outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total")
+ outputDfData[2:(1+nrow(dfMatrix)),]=cbind(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual")],digits=4),prettyNum(dfMatrix[,c("df.prior")],digits=4),prettyNum(dfMatrix[,c("df.total")],digits=4))
+ }
+}
+addComment("[INFO]Formated output",T,opt$log,display=FALSE)
+
+#write output results
+write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+#write df info file
+write.table(outputDfData,file=opt$outputDfFile,quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+end.time <- Sys.time()
+addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE)
+
+addComment("[INFO]End of R script",T,opt$log,display=FALSE)
+
+printSessionInfo(opt$log)
+#sessionInfo()
+
+
+
diff -r 000000000000 -r 708f43bda2b6 src/VolcanoPlotsScript.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/VolcanoPlotsScript.R Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,426 @@
+# R script to plot volcanos through Galaxy based GIANT tool
+# written by Jimmy Vandel
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs()
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "statisticsFile", "i", 1, "character",
+ "volcanoName" , "n", 1, "character",
+ "pvalColumnName" , "p", 1, "character",
+ "fdrColumnName" , "m", 1, "character",
+ "fcColumnName" , "c", 1, "character",
+ "fcKind","d", 1, "character",
+ "fdrThreshold","s", 1, "double",
+ "fcThreshold","e", 1, "double",
+ "organismID","x",1,"character",
+ "rowNameType","y",1,"character",
+ "log", "l", 1, "character",
+ "outputFile" , "o", 1, "character",
+ "format", "f", 1, "character",
+ "quiet", "q", 0, "logical"),
+ byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required\n")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$statisticsFile)) {
+ addComment("[ERROR]'statisticsFile' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (length(opt$pvalColumnName)==0 || length(opt$fdrColumnName)==0 || length(opt$fcColumnName)==0) {
+ addComment("[ERROR]no selected columns",T,opt$log)
+ q( "no", 1, F )
+}
+if (length(opt$pvalColumnName)!=length(opt$fcColumnName) || length(opt$pvalColumnName)!=length(opt$fdrColumnName)) {
+ addComment("[ERROR]different number of selected columns between p.val, adj-p.val and FC ",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fcKind)) {
+ addComment("[ERROR]'fcKind' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fdrThreshold)) {
+ addComment("[ERROR]'FDR threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$fcThreshold)) {
+ addComment("[ERROR]'FC threshold' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$outputFile)) {
+ addComment("[ERROR]'output file' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+#demande si le script sera bavard
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE
+}
+
+#paramètres internes
+addComment("[INFO]Parameters checked test mode !",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#charge des packages silencieusement
+suppressPackageStartupMessages({
+ library("methods")
+ library("biomaRt")
+ library("ggplot2")
+ library("plotly")
+ library("stringr")
+})
+
+#define some usefull variable
+nbVolcanosToPlot=length(opt$pvalColumnName)
+
+#load input file
+statDataMatrix=read.csv(file=file.path(getwd(), opt$statisticsFile),header=F,sep="\t",colClasses="character")
+#remove first colum to convert it as rownames
+rownames(statDataMatrix)=statDataMatrix[,1]
+statDataMatrix=statDataMatrix[,-1]
+
+#identify lines without adjusted p-value info (should contain the same content as rownames) and replace them with NA values
+FDRinfo=rep(TRUE,nbVolcanosToPlot)
+for(iVolcano in 1:nbVolcanosToPlot){
+ #input parameter should be None when adjusted p-val are not available
+ if(opt$fdrColumnName[iVolcano]=="None"){
+ #content of the corresponding column should also be the same as rownames
+ if(!all(statDataMatrix[,(iVolcano-1)*3+2]==rownames(statDataMatrix))){
+ addComment(c("[ERROR]It seems that input stat matrix contains adjusted p-values for volcano",iVolcano,"whereas input parameter indicates that not."),T,opt$log)
+ q( "no", 1, F )
+ }
+ FDRinfo[iVolcano]=FALSE
+ statDataMatrix[,(iVolcano-1)*3+2]=NA
+ }
+}
+
+if(is.data.frame(statDataMatrix)){
+ statDataMatrix=data.matrix(statDataMatrix)
+}else{
+ statDataMatrix=data.matrix(as.numeric(statDataMatrix))
+}
+
+#check if available column number match with volcano requested number
+if(ncol(statDataMatrix)!=3*nbVolcanosToPlot){
+ addComment("[ERROR]Input file column number is different from requested volcano number",T,opt$log)
+ q( "no", 1, F )
+}
+
+#build global dataFrame with data and fill with p.val and log2(FC) and FDR
+dataFrame=data.frame(row.names = rownames(statDataMatrix))
+#start with p-value
+dataFrame$p.value=statDataMatrix[,seq(1,nbVolcanosToPlot*3,3),drop=FALSE]
+#compute FDR if needed or just get available info
+dataFrame$adj_p.value=dataFrame$p.value
+for(iVolcano in 1:nbVolcanosToPlot){
+ #adjusted p-value are already computed
+ if(FDRinfo[iVolcano]){
+ dataFrame$adj_p.value[,iVolcano]=statDataMatrix[,(iVolcano-1)*3+2,drop=FALSE]
+ }else{
+ #adjusted p-value should be computed based on p-val using FDR
+ dataFrame$adj_p.value[,iVolcano]=p.adjust(dataFrame$p.value[,iVolcano,drop=FALSE],"fdr")
+ addComment(c("[INFO]Adjusted p-values are not available in input for volcano",iVolcano,", FDR approach will be used on available raw p-values"),T,opt$log)
+ }
+}
+if(opt$fcKind=="FC"){
+ #we should transform as Log2FC
+ dataFrame$coefficients=log2(statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE])
+ addComment(c("[INFO]FC are converted in log2(FC) for plotting"),T,opt$log)
+}else{
+ dataFrame$coefficients=statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE]
+}
+
+addComment(c("[INFO]Input data available for",nbVolcanosToPlot,"volcano(s) with",nrow(statDataMatrix),"rows"),T,opt$log)
+
+
+#plot VOLCANOs
+volcanoPerPage=1
+logFCthreshold=log2(opt$fcThreshold)
+iToPlot=1
+plotVector=list()
+volcanoNameList=c()
+for (iVolcano in 1:nbVolcanosToPlot){
+
+ if(nchar(opt$volcanoName[iVolcano])>0){
+ curentVolcanoName=opt$volcanoName[iVolcano]
+ }else{
+ curentVolcanoName=paste(iVolcano,opt$pvalColumnName[iVolcano],sep="_")
+ }
+
+ #keep only rows without NA for p-val, adjusted p-val and coeff
+ pValToPlot=dataFrame$p.value[,iVolcano]
+ fdrToPlot=dataFrame$adj_p.value[,iVolcano]
+ coeffToPlot=dataFrame$coefficients[,iVolcano]
+
+ rowToRemove=unique(c(which(is.na(pValToPlot)),which(is.na(fdrToPlot)),which(is.na(coeffToPlot))))
+ if(length(rowToRemove)>0){
+ pValToPlot=pValToPlot[-rowToRemove]
+ fdrToPlot=fdrToPlot[-rowToRemove]
+ coeffToPlot=coeffToPlot[-rowToRemove]
+ }
+ addComment(c("[INFO]For",curentVolcanoName,"volcano,",length(rowToRemove),"rows are discarded due to NA values,",length(pValToPlot),"remaining rows."),T,opt$log)
+
+ #save volcano name
+ volcanoNameList=c(volcanoNameList,curentVolcanoName)
+
+ #remove characters possibly troubling
+ volcanoFileName=iVolcano
+
+ #define the log10(p-val) threshold corresponding to FDR threshold fixed by user
+ probeWithLowFDR=-log10(pValToPlot[which(fdrToPlot<=opt$fdrThreshold)])
+ pvalThresholdFDR=NULL
+ if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR)
+
+ #get significant points over FC and FDR thresholds
+ significativePoints=intersect(which(abs(coeffToPlot)>=logFCthreshold),which(fdrToPlot<=opt$fdrThreshold))
+
+ #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC)))
+ htmlPointsToRemove=intersect(which(abs(coeffToPlot)<=quantile(abs(coeffToPlot),c(0.66))),which(pValToPlot>=quantile(abs(pValToPlot),c(0.33))))
+ if(length(htmlPointsToRemove)>20000){
+ htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000))
+ }else{
+ htmlPointsToRemove=c()
+ }
+
+ xMinLimPlot=min(coeffToPlot)-0.2
+ xMaxLimPlot=max(coeffToPlot)+0.2
+ yMaxLimPlot= max(-log10(pValToPlot))+0.2
+
+ if(length(significativePoints)>0){
+ dataSignifToPlot=data.frame(pval=-log10(pValToPlot[significativePoints]),FC=coeffToPlot[significativePoints],description=paste(names(coeffToPlot[significativePoints]),"\n","FC: " , round(2^coeffToPlot[significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[significativePoints],digits=4), sep=""))
+ #to test if remains any normal points to draw
+ if(length(significativePoints)0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description))
+
+ ##interactive plot
+
+ if(length(htmlPointsToRemove)>0){
+ pointToRemove=union(htmlPointsToRemove,significativePoints)
+ #to test if it remains any normal points to draw
+ if(length(pointToRemove)40000)addComment(c("[WARNING]For",curentVolcanoName,"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log)
+
+ phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>%
+ layout(title = curentVolcanoName[iVolcano],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-Log10(p-val)", showgrid=TRUE, zeroline=FALSE))
+ if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar()
+ if(logFCthreshold!=0){
+ phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral"))
+ }
+ if(!is.null(pvalThresholdFDR)){
+ phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE)
+ phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue"))
+ }
+ plotVector[[length(plotVector)+1]]=p
+
+ #save plotly files
+ pp <- ggplotly(phtml)
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Volcanos_",volcanoFileName,".html"),collapse=""),selfcontained = F)
+
+
+ if(iVolcano==nbVolcanosToPlot || length(plotVector)==volcanoPerPage){
+ #plot and close the actual plot
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/Volcanos_",volcanoFileName,".pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/Volcanos_",volcanoFileName,".png"),collapse=""))
+ }
+ multiplot(plotlist=plotVector,cols=1)
+ dev.off()
+ if(iVolcano0)))
+#filter out genes with lower FC for all comparisons
+genesToKeep=intersect(genesToKeep,names(which(apply(dataFrame$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0))))
+
+if(length(genesToKeep)>0){
+ dataFrameNew=data.frame(row.names=genesToKeep)
+
+ dataFrameNew$adj_p.value=matrix(dataFrame$adj_p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$adj_p.value))
+ rownames(dataFrameNew$adj_p.value)=genesToKeep
+ colnames(dataFrameNew$adj_p.value)=colnames(dataFrame$p.value)
+
+ dataFrameNew$p.value=matrix(dataFrame$p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$p.value))
+ rownames(dataFrameNew$p.value)=genesToKeep
+ colnames(dataFrameNew$p.value)=colnames(dataFrame$adj_p.value)
+
+ dataFrameNew$coefficients=matrix(dataFrame$coefficients[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$coefficients))
+ rownames(dataFrameNew$coefficients)=genesToKeep
+ colnames(dataFrameNew$coefficients)=colnames(dataFrame$adj_p.value)
+
+ dataFrame=dataFrameNew
+ rm(dataFrameNew)
+}else{
+ addComment("[WARNING]No significative genes",T,opt$log,display=FALSE)
+}
+
+addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE)
+
+
+#plot VennDiagramm for genes below threshold between comparisons
+#t=apply(dataFrame$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold)))
+#get.venn.partitions(t)
+#vennCounts(dataFrame$adj_p.value[,1:4]<=opt$threshold)
+
+#make a simple sort genes based only on the first comparison
+#newOrder=order(dataFrame$adj_p.value[,1])
+#dataFrame$adj_p.value=dataFrame$adj_p.value[newOrder,]
+
+#alternative sorting strategy based on the mean gene rank over all comparisons
+if(length(genesToKeep)>1){
+ currentRank=rep(0,nrow(dataFrame$adj_p.value))
+ for(iVolcano in 1:ncol(dataFrame$adj_p.value)){
+ currentRank=currentRank+rank(dataFrame$adj_p.value[,iVolcano])
+ }
+ currentRank=currentRank/ncol(dataFrame$adj_p.value)
+ newOrder=order(currentRank)
+ rownames(dataFrame)=rownames(dataFrame)[newOrder]
+
+ dataFrame$adj_p.value=matrix(dataFrame$adj_p.value[newOrder,],ncol=ncol(dataFrame$adj_p.value))
+ rownames(dataFrame$adj_p.value)=rownames(dataFrame$p.value)[newOrder]
+ colnames(dataFrame$adj_p.value)=colnames(dataFrame$p.value)
+
+ dataFrame$p.value=matrix(dataFrame$p.value[newOrder,],ncol=ncol(dataFrame$p.value))
+ rownames(dataFrame$p.value)=rownames(dataFrame$adj_p.value)
+ colnames(dataFrame$p.value)=colnames(dataFrame$adj_p.value)
+
+ dataFrame$coefficients=matrix(dataFrame$coefficients[newOrder,],ncol=ncol(dataFrame$coefficients))
+ rownames(dataFrame$coefficients)=rownames(dataFrame$adj_p.value)
+ colnames(dataFrame$coefficients)=colnames(dataFrame$adj_p.value)
+}
+
+#formating output matrix depending on genes to keep
+if(length(genesToKeep)==0){
+ outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3)
+ outputData[1,]=c("X","X",rep(volcanoNameList,each=4))
+ outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value)))
+ outputData[,1]=c("Volcano","Gene","noGene")
+ outputData[,2]=c("Comparison","Info","noInfo")
+}else{
+ if(length(genesToKeep)==1){
+ outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3)
+ outputData[1,]=c("X","X",rep(volcanoNameList,each=4))
+ outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value)))
+ outputData[,1]=c("Volcano","Gene",genesToKeep)
+ outputData[,2]=c("Comparison","Info","na")
+ if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep]
+ outputData[3,seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4)
+ outputData[3,seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4)
+ outputData[3,seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4)
+ outputData[3,seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4)
+ }else{
+ #format matrix to be correctly read by galaxy (move headers in first column and row)
+ outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=nrow(dataFrame$adj_p.value)+2)
+ outputData[1,]=c("X","X",rep(volcanoNameList,each=4))
+ outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value)))
+ outputData[,1]=c("Volcano","Gene",rownames(dataFrame$adj_p.value))
+ outputData[,2]=c("Comparison","Info",rep("na",nrow(dataFrame$adj_p.value)))
+ if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(dataFrame$adj_p.value)]
+ outputData[3:nrow(outputData),seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4)
+ outputData[3:nrow(outputData),seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4)
+ outputData[3:nrow(outputData),seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4)
+ outputData[3:nrow(outputData),seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4)
+ }
+}
+addComment("[INFO]Formated output",T,opt$log,display=FALSE)
+
+#write output results
+write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F)
+
+
+end.time <- Sys.time()
+addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE)
+
+addComment("[INFO]End of R script",T,opt$log,display=FALSE)
+
+printSessionInfo(opt$log)
+
+#sessionInfo()
\ No newline at end of file
diff -r 000000000000 -r 708f43bda2b6 src/getopt.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/getopt.R Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,773 @@
+# Copyright (c) 2008-2010 Allen Day
+# Copyright (c) 2011-2013 Trevor L. Davis
+#
+# Modified by J.Vandel 2017 to consider situation of multiple identical flag
+# and concatenate as a vector the set of parameter for the same flag instead of
+# keeping only the last value as done by the previous version.
+#
+# This file is free software: you may copy, redistribute and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 2 of the License, or (at your
+# option) any later version.
+#
+# This file is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+#' C-like getopt behavior
+#'
+#' getopt is primarily intended to be used with ``\link{Rscript}''. It
+#' facilitates writing ``\#!'' shebang scripts that accept short and long
+#' flags/options. It can also be used from ``R'' directly, but is probably less
+#' useful in this context.
+#'
+#' getopt() returns a \link{list} data structure containing \link{names} of the
+#' flags that were present in the \link{character} \link{vector} passed in under
+#' the \emph{opt} argument. Each value of the \link{list} is coerced to the
+#' data type specified according to the value of the \emph{spec} argument. See
+#' below for details.
+#'
+#' Notes on naming convention:
+#'
+#' 1. An \emph{option} is one of the shell-split input strings.
+#'
+#' 2. A \emph{flag} is a type of \emph{option}. a \emph{flag} can be defined as
+#' having no \emph{argument} (defined below), a required \emph{argument}, or an
+#' optional \emph{argument}.
+#'
+#' 3. An \emph{argument} is a type of \emph{option}, and is the value associated
+#' with a flag.
+#'
+#' 4. A \emph{long flag} is a type of \emph{flag}, and begins with the string
+#' ``--''. If the \emph{long flag} has an associated \emph{argument}, it may be
+#' delimited from the \emph{long flag} by either a trailing \emph{=}, or may be
+#' the subsequent \emph{option}.
+#'
+#' 5. A \emph{short flag} is a type of \emph{flag}, and begins with the string
+#' ``-''. If a \emph{short flag} has an associated \emph{argument}, it is the
+#' subsequent \emph{option}. \emph{short flags} may be bundled together,
+#' sharing a single leading ``-'', but only the final \emph{short flag} is able
+#' to have a corresponding \emph{argument}.
+#'
+#' Many users wonder whether they should use the getopt package, optparse package,
+#' or argparse package.
+#' Here is some of the major differences:
+#'
+#' Features available in \code{getopt} unavailable in \code{optparse}
+#'
+#' 1. As well as allowing one to specify options that take either
+#' no argument or a required argument like \code{optparse},
+#' \code{getopt} also allows one to specify option with an optional argument.
+#'
+#' Some features implemented in \code{optparse} package unavailable in \code{getopt}
+#'
+#' 1. Limited support for capturing positional arguments after the optional arguments
+#' when \code{positional_arguments} set to TRUE in \code{parse_args}
+#'
+#' 2. Automatic generation of an help option and printing of help text when encounters an "-h"
+#'
+#' 3. Option to specify default arguments for options as well the
+#' variable name to store option values
+#'
+#' There is also new package \code{argparse} introduced in 2012 which contains
+#' all the features of both getopt and optparse but which has a dependency on
+#' Python 2.7 or 3.2+ and has not been used in production since 2008 or 2009
+#' like the getopt and optparse packages.
+#'
+#' Some Features unlikely to be implemented in \code{getopt}:
+#'
+#' 1. Support for multiple, identical flags, e.g. for "-m 3 -v 5 -v", the
+#' trailing "-v" overrides the preceding "-v 5", result is v=TRUE (or equivalent
+#' typecast).
+#'
+#' 2. Support for multi-valued flags, e.g. "--libpath=/usr/local/lib
+#' --libpath=/tmp/foo".
+#'
+#' 3. Support for lists, e.g. "--define os=linux --define os=redhat" would
+#' set result$os$linux=TRUE and result$os$redhat=TRUE.
+#'
+#' 4. Support for incremental, argument-less flags, e.g. "/path/to/script
+#' -vvv" should set v=3.
+#'
+#' 5. Support partial-but-unique string match on options, e.g. "--verb" and
+#' "--verbose" both match long flag "--verbose".
+#'
+#' 6. No support for mixing in positional arguments or extra arguments that
+#' don't match any options. For example, you can't do "my.R --arg1 1 foo bar
+#' baz" and recover "foo", "bar", "baz" as a list. Likewise for "my.R foo
+#' --arg1 1 bar baz".
+#'
+#' @aliases getopt getopt-package
+#' @param spec The getopt specification, or spec of what options are considered
+#' valid. The specification must be either a 4-5 column \link{matrix}, or a
+#' \link{character} \link{vector} coercible into a 4 column \link{matrix} using
+#' \link{matrix}(x,ncol=4,byrow=TRUE) command. The \link{matrix}/\link{vector}
+#' contains:
+#'
+#' Column 1: the \emph{long flag} name. A multi-\link{character} string.
+#'
+#' Column 2: \emph{short flag} alias of Column 1. A single-\link{character}
+#' string.
+#'
+#' Column 3: \emph{Argument} mask of the \emph{flag}. An \link{integer}.
+#' Possible values: 0=no argument, 1=required argument, 2=optional argument.
+#'
+#' Column 4: Data type to which the \emph{flag}'s argument shall be cast using
+#' \link{storage.mode}. A multi-\link{character} string. This only considered
+#' for same-row Column 3 values of 1,2. Possible values: \link{logical},
+#' \link{integer}, \link{double}, \link{complex}, \link{character}.
+#' If \link{numeric} is encountered then it will be converted to double.
+#'
+#' Column 5 (optional): A brief description of the purpose of the option.
+#'
+#' The terms \emph{option}, \emph{flag}, \emph{long flag}, \emph{short flag},
+#' and \emph{argument} have very specific meanings in the context of this
+#' document. Read the ``Description'' section for definitions.
+#' @param opt This defaults to the return value of \link{commandArgs}(TRUE).
+#'
+#' If R was invoked directly via the ``R'' command, this corresponds to all
+#' arguments passed to R after the ``--args'' flag.
+#'
+#' If R was invoked via the ``\link{Rscript}'' command, this corresponds to all
+#' arguments after the name of the R script file.
+#'
+#' Read about \link{commandArgs} and \link{Rscript} to learn more.
+#' @param command The string to use in the usage message as the name of the
+#' script. See argument \emph{usage}.
+#' @param usage If TRUE, argument \emph{opt} will be ignored and a usage
+#' statement (character string) will be generated and returned from \emph{spec}.
+#' @param debug This is used internally to debug the getopt() function itself.
+#' @author Allen Day
+#' @seealso \code{\link{getopt}}
+#' @keywords data
+#' @export
+#' @examples
+#'
+#' #!/path/to/Rscript
+#' library('getopt');
+#' #get options, using the spec as defined by the enclosed list.
+#' #we read the options from the default: commandArgs(TRUE).
+#' spec = matrix(c(
+#' 'verbose', 'v', 2, "integer",
+#' 'help' , 'h', 0, "logical",
+#' 'count' , 'c', 1, "integer",
+#' 'mean' , 'm', 1, "double",
+#' 'sd' , 's', 1, "double"
+#' ), byrow=TRUE, ncol=4);
+#' opt = getopt(spec);
+#'
+#' # if help was asked for print a friendly message
+#' # and exit with a non-zero error code
+#' if ( !is.null(opt$help) ) {
+#' cat(getopt(spec, usage=TRUE));
+#' q(status=1);
+#' }
+#'
+#' #set some reasonable defaults for the options that are needed,
+#' #but were not specified.
+#' if ( is.null(opt$mean ) ) { opt$mean = 0 }
+#' if ( is.null(opt$sd ) ) { opt$sd = 1 }
+#' if ( is.null(opt$count ) ) { opt$count = 10 }
+#' if ( is.null(opt$verbose ) ) { opt$verbose = FALSE }
+#'
+#' #print some progress messages to stderr, if requested.
+#' if ( opt$verbose ) { write("writing...",stderr()); }
+#'
+#' #do some operation based on user input.
+#' cat(paste(rnorm(opt$count,mean=opt$mean,sd=opt$sd),collapse="\n"));
+#' cat("\n");
+#'
+#' #signal success and exit.
+#' #q(status=0);
+getopt = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) {
+
+ # littler compatibility - map argv vector to opt
+ if (exists("argv", where = .GlobalEnv, inherits = FALSE)) {
+ opt = get("argv", envir = .GlobalEnv);
+ }
+
+ ncol=4;
+ maxcol=6;
+ col.long.name = 1;
+ col.short.name = 2;
+ col.has.argument = 3;
+ col.mode = 4;
+ col.description = 5;
+
+ flag.no.argument = 0;
+ flag.required.argument = 1;
+ flag.optional.argument = 2;
+
+ result = list();
+ result$ARGS = vector(mode="character");
+
+ #no spec. fail.
+ if ( is.null(spec) ) {
+ stop('argument "spec" must be non-null.');
+
+ #spec is not a matrix. attempt to coerce, if possible. issue a warning.
+ } else if ( !is.matrix(spec) ) {
+ if ( length(spec)/4 == as.integer(length(spec)/4) ) {
+ warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion');
+ spec = matrix( spec, ncol=ncol, byrow=TRUE );
+ } else {
+ stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.');
+ }
+
+ #spec is a matrix, but it has too few columns.
+ } else if ( dim(spec)[2] < ncol ) {
+ stop(paste('"spec" should have at least ",ncol," columns.',sep=''));
+
+ #spec is a matrix, but it has too many columns.
+ } else if ( dim(spec)[2] > maxcol ) {
+ stop(paste('"spec" should have no more than ",maxcol," columns.',sep=''));
+
+ #spec is a matrix, and it has some optional columns.
+ } else if ( dim(spec)[2] != ncol ) {
+ ncol = dim(spec)[2];
+ }
+
+ #sanity check. make sure long names are unique, and short names are unique.
+ if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) {
+ stop(paste('redundant long names for flags (column ',col.long.name,').',sep=''));
+ }
+ if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) {
+ stop(paste('redundant short names for flags (column ',col.short.name,').',sep=''));
+ }
+ # convert numeric type to double type
+ spec[,4] <- gsub("numeric", "double", spec[,4])
+
+ # if usage=TRUE, don't process opt, but generate a usage string from the data in spec
+ if ( usage ) {
+ ret = '';
+ ret = paste(ret,"Usage: ",command,sep='');
+ for ( j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,' [-[-',spec[j,col.long.name],'|',spec[j,col.short.name],']',sep='');
+ if (spec[j,col.has.argument] == flag.no.argument) {
+ ret = paste(ret,']',sep='');
+ } else if (spec[j,col.has.argument] == flag.required.argument) {
+ ret = paste(ret,' <',spec[j,col.mode],'>]',sep='');
+ } else if (spec[j,col.has.argument] == flag.optional.argument) {
+ ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep='');
+ }
+ }
+ # include usage strings
+ if ( ncol >= 5 ) {
+ max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]])));
+ ret = paste(ret,"\n",sep='');
+ for (j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,sprintf(paste(" -%s|--%-",max.long,"s %s\n",sep=''),
+ spec[j,col.short.name],spec[j,col.long.name],spec[j,col.description]
+ ),sep='');
+ }
+ }
+ else {
+ ret = paste(ret,"\n",sep='');
+ }
+ return(ret);
+ }
+
+ #XXX check spec validity here. e.g. column three should be convertible to integer
+
+ i = 1;
+
+ while ( i <= length(opt) ) {
+ if ( debug ) print(paste("processing",opt[i]));
+
+ current.flag = 0; #XXX use NA
+ optstring = opt[i];
+
+
+ #long flag
+ if ( substr(optstring, 1, 2) == '--' ) {
+ if ( debug ) print(paste(" long option:",opt[i]));
+
+ optstring = substring(optstring,3);
+
+ this.flag = NA;
+ this.argument = NA;
+ kv = strsplit(optstring, '=')[[1]];
+ if ( !is.na(kv[2]) ) {
+ this.flag = kv[1];
+ this.argument = paste(kv[-1], collapse="=");
+ } else {
+ this.flag = optstring;
+ }
+
+ rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE );
+
+ #long flag is invalid, matches no options
+ if ( length(rowmatch) == 0 ) {
+ stop(paste('long flag "', this.flag, '" is invalid', sep=''));
+
+ #long flag is ambiguous, matches too many options
+ } else if ( length(rowmatch) > 1 ) {
+ # check if there is an exact match and use that
+ rowmatch = which(this.flag == spec[,col.long.name])
+ if(length(rowmatch) == 0) {
+ stop(paste('long flag "', this.flag, '" is ambiguous', sep=''));
+ }
+ }
+
+ #if we have an argument
+ if ( !is.na(this.argument) ) {
+ #if we can't accept the argument, bail out
+ if ( spec[rowmatch, col.has.argument] == flag.no.argument ) {
+ stop(paste('long flag "', this.flag, '" accepts no arguments', sep=''));
+
+ #otherwise assign the argument to the flag
+ } else {
+ storage.mode(this.argument) = spec[rowmatch, col.mode];
+ #don't need here to remove the last value of the vector as argument is in the same string as
+ #the flag name "--flag=argument" so no spurious TRUE was added
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument);
+ i = i + 1;
+ next;
+ }
+
+ #otherwise, we don't have an argument
+ } else {
+ #if we require an argument, bail out
+ ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) {
+ ### stop(paste('long flag "', this.flag, '" requires an argument', sep=''));
+
+ #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there
+ ###} else {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ current.flag = rowmatch;
+ ###}
+ }
+
+ #short flag(s)
+ } else if ( substr(optstring, 1, 1) == '-' ) {
+ if ( debug ) print(paste(" short option:",opt[i]));
+
+ these.flags = strsplit(optstring,'')[[1]];
+
+ done = FALSE;
+ for ( j in 2:length(these.flags) ) {
+ this.flag = these.flags[j];
+ rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE );
+
+ #short flag is invalid, matches no options
+ if ( length(rowmatch) == 0 ) {
+ stop(paste('short flag "', this.flag, '" is invalid', sep=''));
+
+ #short flag is ambiguous, matches too many options
+ } else if ( length(rowmatch) > 1 ) {
+ stop(paste('short flag "', this.flag, '" is ambiguous', sep=''));
+
+ #short flag has an argument, but is not the last in a compound flag string
+ } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) {
+ stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep=''));
+
+ #short flag has no argument, flag it as present
+ } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ done = TRUE;
+
+ #can't definitively process this flag yet, need to see if next option is an argument or not
+ } else {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ current.flag = rowmatch;
+ done = FALSE;
+ }
+ }
+ if ( done ) {
+ i = i + 1;
+ next;
+ }
+ }
+
+ #invalid opt
+ if ( current.flag == 0 ) {
+ stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep=''));
+ #TBD support for positional args
+ #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep=''));
+ #result$ARGS = append(result$ARGS, optstring);
+
+ # some dangling flag, handle it
+ } else if ( current.flag > 0 ) {
+ if ( debug ) print(' dangling flag');
+ if ( length(opt) > i ) {
+ peek.optstring = opt[i + 1];
+ if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep=''));
+
+ #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS
+ #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals).
+ if ( substr(peek.optstring, 1, 1) != '-' |
+ #match negative double
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'double'
+ ) |
+ #match negative integer
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'integer'
+ )
+ ) {
+ if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep=''));
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #a lone dash
+ } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) {
+ if ( debug ) print(' consuming "lone dash" argument');
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #no argument
+ } else {
+ if ( debug ) print(' no argument!');
+
+ #if we require an argument, bail out
+ if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #otherwise set flag as present.
+ } else if (
+ spec[current.flag, col.has.argument] == flag.optional.argument |
+ spec[current.flag, col.has.argument] == flag.no.argument
+ ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop(paste("This should never happen.",
+ "Is your spec argument correct? Maybe you forgot to set",
+ "ncol=4, byrow=TRUE in your matrix call?"));
+ }
+ }
+ #trailing flag without required argument
+ } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #trailing flag without optional argument
+ } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+
+ #trailing flag without argument
+ } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop("this should never happen (2). please inform the author.");
+ }
+ #no dangling flag, nothing to do.
+ } else {
+ }
+
+ i = i+1;
+ }
+ return(result);
+}
+
+
+
+#########################
+#set a modified version using only long named parameters
+
+getoptLong = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) {
+
+ # littler compatibility - map argv vector to opt
+ if (exists("argv", where = .GlobalEnv, inherits = FALSE)) {
+ opt = get("argv", envir = .GlobalEnv);
+ }
+
+ ncol=4;
+ maxcol=6;
+ col.long.name = 1;
+ #col.short.name = 2;
+ col.has.argument = 3;
+ col.mode = 4;
+ col.description = 5;
+
+ flag.no.argument = 0;
+ flag.required.argument = 1;
+ flag.optional.argument = 2;
+
+ result = list();
+ result$ARGS = vector(mode="character");
+
+ #no spec. fail.
+ if ( is.null(spec) ) {
+ stop('argument "spec" must be non-null.');
+
+ #spec is not a matrix. attempt to coerce, if possible. issue a warning.
+ } else if ( !is.matrix(spec) ) {
+ if ( length(spec)/4 == as.integer(length(spec)/4) ) {
+ warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion');
+ spec = matrix( spec, ncol=ncol, byrow=TRUE );
+ } else {
+ stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.');
+ }
+
+ #spec is a matrix, but it has too few columns.
+ } else if ( dim(spec)[2] < ncol ) {
+ stop(paste('"spec" should have at least ",ncol," columns.',sep=''));
+
+ #spec is a matrix, but it has too many columns.
+ } else if ( dim(spec)[2] > maxcol ) {
+ stop(paste('"spec" should have no more than ",maxcol," columns.',sep=''));
+
+ #spec is a matrix, and it has some optional columns.
+ } else if ( dim(spec)[2] != ncol ) {
+ ncol = dim(spec)[2];
+ }
+
+ #sanity check. make sure long names are unique, and short names are unique.
+ if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) {
+ stop(paste('redundant long names for flags (column ',col.long.name,').',sep=''));
+ }
+ # if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) {
+ # stop(paste('redundant short names for flags (column ',col.short.name,').',sep=''));
+ # }
+ # convert numeric type to double type
+ spec[,4] <- gsub("numeric", "double", spec[,4])
+
+ # if usage=TRUE, don't process opt, but generate a usage string from the data in spec
+ if ( usage ) {
+ ret = '';
+ ret = paste(ret,"Usage: ",command,sep='');
+ for ( j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,' [-[-',spec[j,col.long.name],']',sep='');
+ if (spec[j,col.has.argument] == flag.no.argument) {
+ ret = paste(ret,']',sep='');
+ } else if (spec[j,col.has.argument] == flag.required.argument) {
+ ret = paste(ret,' <',spec[j,col.mode],'>]',sep='');
+ } else if (spec[j,col.has.argument] == flag.optional.argument) {
+ ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep='');
+ }
+ }
+ # include usage strings
+ if ( ncol >= 5 ) {
+ max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]])));
+ ret = paste(ret,"\n",sep='');
+ for (j in 1:(dim(spec))[1] ) {
+ ret = paste(ret,sprintf(paste("--%-",max.long,"s %s\n",sep='')
+ ,spec[j,col.long.name],spec[j,col.description]
+ ),sep='');
+ }
+ }
+ else {
+ ret = paste(ret,"\n",sep='');
+ }
+ return(ret);
+ }
+
+ #XXX check spec validity here. e.g. column three should be convertible to integer
+
+ i = 1;
+
+ while ( i <= length(opt) ) {
+ if ( debug ) print(paste("processing",opt[i]));
+
+ current.flag = 0; #XXX use NA
+ optstring = opt[i];
+
+
+ #long flag
+ if ( substr(optstring, 1, 2) == '--' ) {
+ if ( debug ) print(paste(" long option:",opt[i]));
+
+ optstring = substring(optstring,3);
+
+ this.flag = NA;
+ this.argument = NA;
+ kv = strsplit(optstring, '=')[[1]];
+ if ( !is.na(kv[2]) ) {
+ this.flag = kv[1];
+ this.argument = paste(kv[-1], collapse="=");
+ } else {
+ this.flag = optstring;
+ }
+
+ rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE );
+
+ #long flag is invalid, matches no options
+ if ( length(rowmatch) == 0 ) {
+ stop(paste('long flag "', this.flag, '" is invalid', sep=''));
+
+ #long flag is ambiguous, matches too many options
+ } else if ( length(rowmatch) > 1 ) {
+ # check if there is an exact match and use that
+ rowmatch = which(this.flag == spec[,col.long.name])
+ if(length(rowmatch) == 0) {
+ stop(paste('long flag "', this.flag, '" is ambiguous', sep=''));
+ }
+ }
+
+ #if we have an argument
+ if ( !is.na(this.argument) ) {
+ #if we can't accept the argument, bail out
+ if ( spec[rowmatch, col.has.argument] == flag.no.argument ) {
+ stop(paste('long flag "', this.flag, '" accepts no arguments', sep=''));
+
+ #otherwise assign the argument to the flag
+ } else {
+ storage.mode(this.argument) = spec[rowmatch, col.mode];
+ #don't need here to remove the last value of the vector as argument is in the same string as
+ #the flag name "--flag=argument" so no spurious TRUE was added
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument);
+ i = i + 1;
+ next;
+ }
+
+ #otherwise, we don't have an argument
+ } else {
+ #if we require an argument, bail out
+ ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) {
+ ### stop(paste('long flag "', this.flag, '" requires an argument', sep=''));
+
+ #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there
+ ###} else {
+ result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ current.flag = rowmatch;
+ ###}
+ }
+
+ #short flag(s)
+ }
+ #else if ( substr(optstring, 1, 1) == '-' ) {
+ # if ( debug ) print(paste(" short option:",opt[i]));
+ #
+ # these.flags = strsplit(optstring,'')[[1]];
+ #
+ # done = FALSE;
+ # for ( j in 2:length(these.flags) ) {
+ # this.flag = these.flags[j];
+ # rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE );
+ #
+ # #short flag is invalid, matches no options
+ # if ( length(rowmatch) == 0 ) {
+ # stop(paste('short flag "', this.flag, '" is invalid', sep=''));
+ #
+ # #short flag is ambiguous, matches too many options
+ # } else if ( length(rowmatch) > 1 ) {
+ # stop(paste('short flag "', this.flag, '" is ambiguous', sep=''));
+ #
+ # #short flag has an argument, but is not the last in a compound flag string
+ # } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) {
+ # stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep=''));
+ #
+ # #short flag has no argument, flag it as present
+ # } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) {
+ # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ # done = TRUE;
+ #
+ # #can't definitively process this flag yet, need to see if next option is an argument or not
+ # } else {
+ # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE);
+ # current.flag = rowmatch;
+ # done = FALSE;
+ # }
+ # }
+ # if ( done ) {
+ # i = i + 1;
+ # next;
+ # }
+ # }
+
+ #invalid opt
+ if ( current.flag == 0 ) {
+ stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep=''));
+ #TBD support for positional args
+ #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep=''));
+ #result$ARGS = append(result$ARGS, optstring);
+
+ # some dangling flag, handle it
+ } else if ( current.flag > 0 ) {
+ if ( debug ) print(' dangling flag');
+ if ( length(opt) > i ) {
+ peek.optstring = opt[i + 1];
+ if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep=''));
+
+ #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS
+ #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals).
+ if ( substr(peek.optstring, 1, 1) != '-' |
+ #match negative double
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'double'
+ ) |
+ #match negative integer
+ ( substr(peek.optstring, 1, 1) == '-'
+ & regexpr('^-[0123456789]+$',peek.optstring) > 0
+ & spec[current.flag, col.mode]== 'integer'
+ )
+ ) {
+ if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep=''));
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #a lone dash
+ } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) {
+ if ( debug ) print(' consuming "lone dash" argument');
+ storage.mode(peek.optstring) = spec[current.flag, col.mode];
+ #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones
+ result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring);
+ i = i + 1;
+
+ #no argument
+ } else {
+ if ( debug ) print(' no argument!');
+
+ #if we require an argument, bail out
+ if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #otherwise set flag as present.
+ } else if (
+ spec[current.flag, col.has.argument] == flag.optional.argument |
+ spec[current.flag, col.has.argument] == flag.no.argument
+ ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop(paste("This should never happen.",
+ "Is your spec argument correct? Maybe you forgot to set",
+ "ncol=4, byrow=TRUE in your matrix call?"));
+ }
+ }
+ #trailing flag without required argument
+ } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) {
+ stop(paste('flag "', this.flag, '" requires an argument', sep=''));
+
+ #trailing flag without optional argument
+ } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+
+ #trailing flag without argument
+ } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) {
+ x = TRUE;
+ storage.mode(x) = spec[current.flag, col.mode];
+ result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x);
+ } else {
+ stop("this should never happen (2). please inform the author.");
+ }
+ #no dangling flag, nothing to do.
+ } else {
+ }
+
+ i = i+1;
+ }
+ return(result);
+}
+
diff -r 000000000000 -r 708f43bda2b6 src/heatMapClustering.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/heatMapClustering.R Fri Jun 26 09:35:11 2020 -0400
@@ -0,0 +1,896 @@
+# A command-line interface to plot heatmap based on expression or diff. exp. analysis
+# written by Jimmy Vandel
+# one of these arguments is required:
+#
+#
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.basename <- dirname(script.name)
+source(file.path(script.basename, "utils.R"))
+source(file.path(script.basename, "getopt.R"))
+
+#addComment("Welcome R!")
+
+# setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+loc <- Sys.setlocale("LC_NUMERIC", "C")
+
+#get starting time
+start.time <- Sys.time()
+
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE, OutDec=".")
+
+#get options
+args <- commandArgs()
+
+# get options, using the spec as defined by the enclosed list.
+# we read the options from the default: commandArgs(TRUE).
+spec <- matrix(c(
+ "expressionFile", "x", 1, "character",
+ "diffAnalyseFile", "x", 1, "character",
+ "factorInfo","x", 1, "character",
+ "genericData","x", 0, "logical",
+ "comparisonName","x",1,"character",
+ "comparisonNameLow","x",1,"character",
+ "comparisonNameHigh","x",1,"character",
+ "filterInputOutput","x", 1, "character",
+ "FCthreshold","x", 1, "double",
+ "pvalThreshold","x", 1, "double",
+ "geneListFiltering","x",1,"character",
+ "clusterNumber","x",1,"integer",
+ "maxRows","x",1,"integer",
+ "sampleClusterNumber","x",1,"integer",
+ "dataTransformation","x",1,"character",
+ "distanceMeasure","x",1,"character",
+ "aggloMethod","x",1,"character",
+ "personalColors","x",1,"character",
+ "sideBarColorPalette","x",1,"character",
+ "format", "x", 1, "character",
+ "quiet", "x", 0, "logical",
+ "log", "x", 1, "character",
+ "outputFile" , "x", 1, "character"),
+ byrow=TRUE, ncol=4)
+opt <- getoptLong(spec)
+
+# enforce the following required arguments
+if (is.null(opt$log)) {
+ addComment("[ERROR]'log file' is required")
+ q( "no", 1, F )
+}
+addComment("[INFO]Start of R script",T,opt$log,display=FALSE)
+if (is.null(opt$format)) {
+ addComment("[ERROR]'output format' is required",T,opt$log)
+ q( "no", 1, F )
+}
+if (is.null(opt$outputFile)) {
+ addComment("[ERROR]'output file' is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if(is.null(opt$expressionFile) && !is.null(opt$genericData)){
+ addComment("[ERROR]generic data clustering is based on expression clustering",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$clusterNumber) || opt$clusterNumber<2) {
+ addComment("[ERROR]valid genes clusters number is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$sampleClusterNumber) || opt$sampleClusterNumber<1) {
+ addComment("[ERROR]valid samples clusters number is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$dataTransformation)) {
+ addComment("[ERROR]data transformation option is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$distanceMeasure)) {
+ addComment("[ERROR]distance measure option is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$aggloMethod)) {
+ addComment("[ERROR]agglomeration method option is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$maxRows) || opt$maxRows<2) {
+ addComment("[ERROR]valid plotted row number is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt[["comparisonName"]]) && nchar(opt[["comparisonName"]])==0){
+ addComment("[ERROR]you have to specify comparison",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$comparisonNameLow) && nchar(opt$comparisonNameLow)==0){
+ addComment("[ERROR]you have to specify comparisonLow",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$comparisonNameHigh) && nchar(opt$comparisonNameHigh)==0){
+ addComment("[ERROR]you have to specify comparisonHigh",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (is.null(opt$genericData) && (!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh))){
+ addComment("[ERROR]comparisonLow and comparisonHigh can be specified only with generic data",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$genericData) && !is.null(opt[["comparisonName"]])){
+ addComment("[ERROR]basic comparison cannot be specified for generic data",T,opt$log)
+ q( "no", 1, F )
+}
+
+if ((!is.null(opt[["comparisonName"]]) || !is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)) && is.null(opt$diffAnalyseFile)) {
+ addComment("[ERROR]'diff. exp. analysis file' is required",T,opt$log)
+ q( "no", 1, F )
+}
+
+if (!is.null(opt$genericData) && !is.null(opt$diffAnalyseFile) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh)){
+ addComment("[ERROR]Missing comparison information for filtering",T,opt$log)
+ q( "no", 1, F )
+}
+
+if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && (is.null(opt[["comparisonName"]]) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh))) {
+ addComment("[ERROR]'comparisons' are missing for filtering",T,opt$log)
+ q( "no", 1, F )
+}
+
+if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && !is.null(opt$geneListFiltering)) {
+ addComment("[ERROR]Cannot have two filtering strategies",T,opt$log)
+ q( "no", 1, F )
+}
+
+verbose <- if (is.null(opt$quiet)) {
+ TRUE
+}else{
+ FALSE}
+
+addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE)
+
+addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE)
+addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE)
+
+#directory for plots and HTML
+dir.create(file.path(getwd(), "plotDir"))
+dir.create(file.path(getwd(), "plotLyDir"))
+
+#silent package loading
+suppressPackageStartupMessages({
+ library("plotly")
+ library("dendextend")
+ #library("ggdendro")
+ #library("plyr")
+ library("ggplot2")
+ library("heatmaply")
+ library("circlize")
+ #library("RColorBrewer")
+ #source("https://bioconductor.org/biocLite.R")
+ #biocLite("ComplexHeatmap")
+ library("ComplexHeatmap")
+ #library("processx")
+})
+
+expressionToCluster=!is.null(opt$expressionFile)
+
+#load input data files
+if(expressionToCluster){
+ #first expression data
+ expressionMatrix=read.csv(file=opt$expressionFile,header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames (to avoid X before colnames with header=T)
+ colNamesData=expressionMatrix[1,-1]
+ expressionMatrix=expressionMatrix[-1,]
+ #remove first colum to convert it as rownames
+ rowNamesData=expressionMatrix[,1]
+ expressionMatrix=expressionMatrix[,-1]
+ if(is.data.frame(expressionMatrix)){
+ expressionMatrix=data.matrix(expressionMatrix)
+ }else{
+ expressionMatrix=data.matrix(as.numeric(expressionMatrix))
+ }
+ dimnames(expressionMatrix)=list(rowNamesData,colNamesData)
+
+ #check input files
+ if (!is.numeric(expressionMatrix)) {
+ addComment("[ERROR]Expression data is not fully numeric!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ addComment("[INFO]Expression data loaded and checked")
+ addComment(c("[INFO]Dim of expression matrix:",dim(expressionMatrix)),T,opt$log,display=FALSE)
+}
+
+nbComparisons=0
+nbColPerContrast=5
+comparisonMatrix=NULL
+comparisonMatrixInfoGene=NULL
+#if available comparisons
+if(!is.null(opt[["comparisonName"]])){
+ #load results from differential expression analysis
+ #consider first row contains column names
+ comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t")
+ colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,]))
+ #remove the second line also as it's information line (p-val,FDR.p-val,FC,logFC)
+ comparisonMatrix=comparisonMatrix[-c(1,2),]
+ #remove first and second colums, convert the first one as rownames
+ rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1]))
+ #and save second column content that contain geneInfo
+ comparisonMatrixInfoGene=as.character(unlist(comparisonMatrix[,2]))
+ names(comparisonMatrixInfoGene)=as.character(unlist(comparisonMatrix[,1]))
+ comparisonMatrix=comparisonMatrix[,-c(1,2)]
+
+ comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix))
+
+ if (ncol(comparisonMatrix)%%nbColPerContrast != 0) {
+ addComment("[ERROR]Diff. exp. data does not contain good number of columns per contrast, should contains in this order:p-val,FDR.p-val,FC,log2(FC) and t-stat",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if(max(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])>1 || min(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])<0){
+ addComment("[ERROR]Seem that diff. exp. data does not contain correct values for p-val and FDR.p-val columns, should be including in [0,1] interval",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if (!is.numeric(comparisonMatrix)) {
+ addComment("[ERROR]Diff. exp. data is not fully numeric!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){
+ addComment("[WARNING]All genes from diff. exp. file are not included in expression file",T,opt$log,display=FALSE)
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){
+ addComment("[WARNING]All genes from expression file are not included in diff. exp. file",T,opt$log,display=FALSE)
+ }
+
+ addComment("[INFO]Diff. exp. analysis loaded and checked",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of original comparison matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+
+ #restrict to user specified comparisons
+ restrictedComparisons=unlist(strsplit(opt[["comparisonName"]],","))
+ #should be improved to avoid selection of column names starting too similarly
+ colToKeep=which(unlist(lapply(colnames(comparisonMatrix),function(x)any(startsWith(x,restrictedComparisons)))))
+ comparisonMatrix=matrix(comparisonMatrix[,colToKeep],ncol=length(colToKeep),dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[colToKeep]))
+
+ #get number of required comparisons
+ nbComparisons=ncol(comparisonMatrix)/nbColPerContrast
+
+ addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+}
+
+#should be only the case with generic data
+if(!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)){
+ #load generic data used for filtering
+ nbColPerContrast=1
+ #consider first row contains column names
+ comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t")
+ colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,]))
+ #remove first colum, convert the first one as rownames
+ rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1]))
+ comparisonMatrix=comparisonMatrix[-1,-1]
+
+ comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix))
+
+ if (!is.numeric(comparisonMatrix)) {
+ addComment("[ERROR]Filtering matrix is not fully numeric!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){
+ addComment("[WARNING]All genes from filtering file are not included in expression file",T,opt$log,display=FALSE)
+ }
+
+ if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){
+ addComment("[WARNING]All genes from expression file are not included in filtering file",T,opt$log,display=FALSE)
+ }
+
+ addComment("[INFO]Filtering file loaded and checked",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of original filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+
+ #restrict to user specified comparisons
+ restrictedComparisons=c()
+ if(!is.null(opt$comparisonNameLow))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameLow,","))))
+ if(!is.null(opt$comparisonNameHigh))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameHigh,","))))
+
+ if (!all(restrictedComparisons%in%colnames(comparisonMatrix))){
+ addComment("[ERROR]Selected columns in filtering file are not present in filtering matrix!",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+ comparisonMatrix=matrix(comparisonMatrix[,restrictedComparisons],ncol=length(restrictedComparisons),dimnames = list(rownames(comparisonMatrix),restrictedComparisons))
+
+ #get number of required comparisons
+ nbComparisons=ncol(comparisonMatrix)
+
+ addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE)
+}
+
+
+
+factorInfoMatrix=NULL
+if(!is.null(opt$factorInfo)){
+ #get group information
+ #load factors file
+ factorInfoMatrix=read.csv(file=opt$factorInfo,header=F,sep="\t",colClasses="character")
+ #remove first row to convert it as colnames
+ colnames(factorInfoMatrix)=factorInfoMatrix[1,]
+ factorInfoMatrix=factorInfoMatrix[-1,]
+ #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case
+ rownames(factorInfoMatrix)=factorInfoMatrix[,1]
+
+ factorBarColor=colnames(factorInfoMatrix)[2]
+
+ if(ncol(factorInfoMatrix)>2){
+ addComment("[ERROR]Factors file should not contain more than 2 columns",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ #factor file is used for color band on heatmap, so all expression matrix column should be in the factor file
+ if(expressionToCluster && length(setdiff(colnames(expressionMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing samples in factor file",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ #factor file is used for color band on heatmap, so all comparison matrix column should be in the factor file
+ if(!expressionToCluster && length(setdiff(colnames(comparisonMatrix),rownames(factorInfoMatrix)))!=0){
+ addComment("[ERROR]Missing differential contrasts in factor file",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+ addComment("[INFO]Factors OK",T,opt$log,display=FALSE)
+ addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE)
+}
+
+if(!is.null(opt$personalColors)){
+ ##parse personal colors
+ personalColors=unlist(strsplit(opt$personalColors,","))
+ if(length(personalColors)==2){
+ ##add medium color between two to get three colors
+ personalColors=c(personalColors[1],paste(c("#",as.character(as.hexmode(floor(apply(col2rgb(personalColors),1,mean))))),collapse=""),personalColors[2])
+ }
+ if(length(personalColors)!=3){
+ addComment("[ERROR]Personalized colors doesn't contain enough colors",T,opt$log,display=FALSE)
+ q( "no", 1, F )
+ }
+
+}
+
+
+if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="input"){
+ #filter input data
+
+ if(is.null(opt$geneListFiltering)){
+ #filtering using stat thresholds
+ #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold)))
+ if(is.null(opt$genericData)){
+ #diff. expression matrix
+ rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]log2(opt$FCthreshold))))!=0))))
+ }else{
+ #generic filtering matrix
+ rowToKeep=rownames(comparisonMatrix)
+ if(!is.null(opt$comparisonNameLow)){
+ restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0)))))
+ }
+ if(!is.null(opt$comparisonNameHigh)){
+ restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]nrow(dataToHeatMap)){
+ #correct number of clusters if needed
+ nbClusters=nrow(dataToHeatMap)
+ addComment(c("[WARNING]Not enough rows to reach required clusters number, it is reduced to number of rows:",nbClusters),T,opt$log,display=FALSE)
+}
+
+nbSampleClusters=opt$sampleClusterNumber
+if(nbSampleClusters>ncol(dataToHeatMap)){
+ #correct number of clusters if needed
+ nbSampleClusters=ncol(dataToHeatMap)
+ addComment(c("[WARNING]Not enough columns to reach required conditions clusters number, it is reduced to number of columns:",nbSampleClusters),T,opt$log,display=FALSE)
+}
+
+colClust=FALSE
+rowClust=FALSE
+effectiveRowClust=FALSE
+
+#make appropriate clustering if needed
+if(nrow(dataToHeatMap)>1 && nbClusters>1)rowClust=hclust(distExtended(dataToHeatMap,method = opt$distanceMeasure),method = opt$aggloMethod)
+if(ncol(dataToHeatMap)>1 && nbSampleClusters>1)colClust=hclust(distExtended(t(dataToHeatMap),method = opt$distanceMeasure),method = opt$aggloMethod)
+
+if(nrow(dataToHeatMap)>maxRowsToDisplay){
+ #make subsampling based on preliminary global clustering
+ #clusteringResults=cutree(rowClust,nbClusters)
+ #heatMapGenesToKeep=unlist(lapply(seq(1,nbClusters),function(x)sample(which(clusteringResults==x),min(length(which(clusteringResults==x)),round(maxRowsToDisplay/nbClusters)))))
+ ##OR
+ #basic subsampling
+ heatMapGenesToKeep=sample(rownames(dataToHeatMap),maxRowsToDisplay)
+ effectiveDataToHeatMap=matrix(dataToHeatMap[heatMapGenesToKeep,],ncol=ncol(dataToHeatMap),dimnames=list(heatMapGenesToKeep,colnames(dataToHeatMap)))
+ effectiveNbClusters=min(nbClusters,maxRowsToDisplay)
+ if(nrow(effectiveDataToHeatMap)>1 && effectiveNbClusters>1)effectiveRowClust=hclust(distExtended(effectiveDataToHeatMap, method = opt$distanceMeasure),method = opt$aggloMethod)
+ addComment(c("[WARNING]Too many rows for efficient heatmap drawing",maxRowsToDisplay,"subsampling is done for vizualization only"),T,opt$log,display=FALSE)
+ rm(heatMapGenesToKeep)
+}else{
+ effectiveDataToHeatMap=dataToHeatMap
+ effectiveRowClust=rowClust
+ effectiveNbClusters=nbClusters
+}
+
+addComment(c("[INFO]Dim of plotted heatmap matrix:",dim(effectiveDataToHeatMap)),T,opt$log,display=FALSE)
+
+personalized_hoverinfo=matrix("",ncol = ncol(effectiveDataToHeatMap),nrow = nrow(effectiveDataToHeatMap),dimnames = dimnames(effectiveDataToHeatMap))
+if(expressionToCluster){
+ for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\n",valueMeaning,": ",effectiveDataToHeatMap[iRow,iCol]),collapse="")}}
+}else{
+ for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\nFC: ",round(2^effectiveDataToHeatMap[iRow,iCol],2)),collapse="")}}
+}
+
+#trying to overcome limitation of heatmaply package to modify xtick and ytick label, using directly plotly functions, but for now plotly do not permit to have personalized color for each x/y tick separately
+test=FALSE
+if(test==TRUE){
+
+ #define dendogram shapes
+ dd.row <- as.dendrogram(effectiveRowClust)
+ dd.col <- as.dendrogram(colClust)
+
+ #and color them
+ dd.row=color_branches(dd.row, k = effectiveNbClusters, groupLabels = T)
+ dd.col=color_branches(dd.col, k = nbSampleClusters, groupLabels = T)
+
+ #generating function for dendogram from segment list
+ ggdend <- function(df) {
+ ggplot() +
+ geom_segment(data = df, aes(x=x, y=y, xend=xend, yend=yend)) +
+ labs(x = "", y = "") + theme_minimal() +
+ theme(axis.text = element_blank(), axis.ticks = element_blank(),
+ panel.grid = element_blank())
+ }
+
+ # generate x/y dendogram plots
+ px <- ggdend(dendro_data(dd.col)$segments)
+ py <- ggdend(dendro_data(dd.row)$segments) + coord_flip()
+
+ # reshape data matrix
+ col.ord <- order.dendrogram(dd.col)
+ row.ord <- order.dendrogram(dd.row)
+ xx <- effectiveDataToHeatMap[row.ord, col.ord]
+ # and also personalized_hoverinfo
+ personalized_hoverinfo=personalized_hoverinfo[row.ord, col.ord]
+
+ # hide axis ticks and grid lines
+ eaxis <- list(
+ showticklabels = FALSE,
+ showgrid = FALSE,
+ zeroline = FALSE
+ )
+
+ #make the empty plot
+ p_empty <- plot_ly() %>%
+ layout(margin = list(l = 200),
+ xaxis = eaxis,
+ yaxis = eaxis)
+
+ heatmap.plotly <- plot_ly(
+ z = xx, x = 1:ncol(xx), y = 1:nrow(xx), colors = viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno"),
+ type = "heatmap", showlegend = FALSE, text = personalized_hoverinfo, hoverinfo = "text",
+ colorbar = list(
+ # Capitalise first letter
+ title = valueMeaning,
+ tickmode = "array",
+ len = 0.3
+ )
+ ) %>%
+ layout(
+ xaxis = list(
+ tickfont = list(size = 10,color=get_leaves_branches_col(dd.row)),
+ tickangle = 45,
+ tickvals = 1:ncol(xx), ticktext = colnames(xx),
+ linecolor = "#ffffff",
+ range = c(0.5, ncol(xx) + 0.5),
+ showticklabels = TRUE
+ ),
+ yaxis = list(
+ tickfont = list(size = 10, color=get_leaves_branches_col(dd.col)),
+ tickangle = 0,
+ tickvals = 1:nrow(xx), ticktext = rownames(xx),
+ linecolor = "#ffffff",
+ range = c(0.5, nrow(xx) + 0.5),
+ showticklabels = TRUE
+ )
+ )
+
+ #generate plotly
+ pp <- subplot(px, p_empty, heatmap.plotly, py, nrows = 2, margin = 0,widths = c(0.8,0.2),heights = c(0.2,0.8), shareX = TRUE,
+ shareY = TRUE)
+
+ #save image file
+ export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+ #rise a bug due to token stuf
+ #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+
+
+ #save plotLy file
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F)
+
+ #htmlwidgets::saveWidget(as_widget(pp),"~/Bureau/test.html",selfcontained = F)
+
+}else{ #test
+ label_names=c("Probe","Condition",valueMeaning)
+
+ # #color hclust objects
+ # dd.row=color_branches(effectiveRowClust, k = effectiveNbClusters)
+ # #rowColors=get_leaves_branches_col(dd.row)
+ # #rowColors[order.dendrogram(dd.row)]=rowColors
+ # rowGroup=cutree(effectiveRowClust, k = effectiveNbClusters)
+ #
+ # #get order of class as they will be displayed on the dendogram
+ # rowGroupRenamed=data.frame(cluster=mapvalues(rowGroup, unique(rowGroup[order.dendrogram(dd.row)[nleaves(dd.row):1]]), 1:effectiveNbClusters))
+ #
+ # dd.col=color_branches(colClust, k = nbSampleClusters)
+ # #colColors=get_leaves_branches_col(dd.col)
+ # #colColors[order.dendrogram(dd.col)]=colColors
+ # colGroup=cutree(colClust, k = nbSampleClusters)
+ #
+ # # #get order of class as they will be displayed on the dendogram
+ # colGroupRenamed=data.frame(sampleCluster=mapvalues(colGroup, unique(colGroup[order.dendrogram(dd.col)[nleaves(dd.col):1]]), 1:nbSampleClusters))
+
+
+ #while option is not correctly managed by heatmap apply, put personalized_hoverinfo to NULL
+ personalized_hoverinfo=NULL
+
+ if(is.null(opt$personalColors)){
+ heatmapColors=viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno")
+ }else{
+ heatmapColors=personalColors
+ }
+
+ colGroupRenamed=NULL
+ if(!is.null(factorInfoMatrix)){
+ colGroupRenamed=eval(parse(text=(paste("data.frame(",factorBarColor,"=factorInfoMatrix[colnames(effectiveDataToHeatMap),2])",sep=""))))
+ sideBarGroupNb=length(table(factorInfoMatrix[colnames(effectiveDataToHeatMap),2]))
+ sideBarColorPaletteName="Spectral"
+ if(!is.null(opt$sideBarColorPalette) && opt$sideBarColorPalette%in%rownames(RColorBrewer::brewer.pal.info)){
+ sideBarColorPaletteName=opt$sideBarColorPalette
+ }
+ sideBarColorPalette=setNames(colorRampPalette(RColorBrewer::brewer.pal(RColorBrewer::brewer.pal.info[sideBarColorPaletteName,"maxcolors"], sideBarColorPaletteName))(sideBarGroupNb),unique(factorInfoMatrix[colnames(effectiveDataToHeatMap),2]))
+ }
+
+ if(!is.null(colGroupRenamed)){
+ pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,col_side_colors=colGroupRenamed,col_side_palette=sideBarColorPalette,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors)
+ }else{
+ pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors)
+ }
+
+
+ #save image file
+ export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+ #rise a bug due to token stuf
+ #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse=""))
+
+
+ #save plotLy file
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F)
+
+}
+addComment("[INFO]Heatmap drawn",T,opt$log,display=FALSE)
+
+
+#plot circular heatmap
+if(!class(effectiveRowClust)=="logical"){
+ dendo=as.dendrogram(effectiveRowClust)
+
+ if(is.null(opt$personalColors)){
+ col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.01)), viridis(101,option = "inferno"))
+ }else{
+ col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.5)), personalColors)
+ }
+
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/circularPlot.pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/circularPlot.png"),collapse=""))
+ }
+
+ circos.par(cell.padding = c(0, 0, 0, 0), gap.degree = 5)
+ circos.initialize(c(rep("a",nrow(effectiveDataToHeatMap)),"b"),xlim=cbind(c(0,0),c(nrow(effectiveDataToHeatMap),5)))
+ circos.track(ylim = c(0, 1), bg.border = NA, panel.fun = function(x, y) {
+ if(CELL_META$sector.index=="a"){
+ nr = ncol(effectiveDataToHeatMap)
+ nc = nrow(effectiveDataToHeatMap)
+ circos.text(1:nc- 0.5, rep(0,nc), adj = c(0, 0),
+ rownames(effectiveDataToHeatMap)[order.dendrogram(dendo)], facing = "clockwise", niceFacing = TRUE, cex = 0.3)
+ }
+ })
+
+ circos.track(ylim = c(0, ncol(effectiveDataToHeatMap)), bg.border = NA, panel.fun = function(x, y) {
+
+ m = t(matrix(effectiveDataToHeatMap[order.dendrogram(dendo),],ncol=ncol(effectiveDataToHeatMap)))
+ col_mat = col_fun(m)
+ nr = nrow(m)
+ nc = ncol(m)
+ if(CELL_META$sector.index=="a"){
+ for(i in 1:nr) {
+ circos.rect(1:nc - 1, rep(nr - i, nc),
+ 1:nc, rep(nr - i + 1, nc),
+ border = col_mat[i, ], col = col_mat[i, ])
+ }
+ }else{
+ circos.text(rep(1,nr), seq(nr,1,-1) , colnames(effectiveDataToHeatMap),cex = 0.3)
+ }
+ })
+
+ #dendo = color_branches(dendo, k = effectiveNbClusters, col = colorRampPalette(brewer.pal(12,"Set3"))(effectiveNbClusters))
+ dendo = color_branches(dendo, k = effectiveNbClusters, col = rev(colorspace::rainbow_hcl(effectiveNbClusters)))
+
+
+ circos.track(ylim = c(0, attributes(dendo)$height), bg.border = NA, track.height = 0.25,
+ panel.fun = function(x, y) {
+ if(CELL_META$sector.index=="a")circos.dendrogram(dendo)} )
+
+ circos.clear()
+ ##add legend
+ lgd_links = Legend(at = seq(ceiling(min(effectiveDataToHeatMap)),floor(max(effectiveDataToHeatMap)),ceiling((floor(max(effectiveDataToHeatMap))-ceiling(min(effectiveDataToHeatMap)))/4)), col_fun = col_fun,
+ title_position = "topleft", grid_width = unit(5, "mm") ,title = valueMeaning)
+
+ pushViewport(viewport(x = 0.85, y = 0.80,
+ width = 0.1,
+ height = 0.1,
+ just = c("left", "bottom")))
+ grid.draw(lgd_links)
+ upViewport()
+
+
+ dev.off()
+
+ addComment("[INFO]Circular heatmap drawn",T,opt$log,display=FALSE)
+ loc <- Sys.setlocale("LC_NUMERIC","C")
+}else{
+ addComment(c("[WARNING]Circular plot will not be plotted considering row or cluster number < 2"),T,opt$log,display=FALSE)
+}
+rm(effectiveDataToHeatMap,effectiveRowClust,effectiveNbClusters)
+
+#plot screeplot
+if(class(rowClust)!="logical" && nrow(dataToHeatMap)>2){
+ screePlotData=c()
+ for(iNbClusters in 2:(nbClusters+min(10,max(0,nrow(dataToHeatMap)-nbClusters)))){
+ clusteringResults=cutree(rowClust,iNbClusters)
+ #clusteringResults=kmeans(dataToHeatMap,iNbClusters)$cluster
+
+ #compute variance between each intra-class points amongst themselves (need at least 3 points by cluster)
+ #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>2){var(dist(dataToHeatMap[temp,]))}else{0}}))) )
+ #compute variance between each intra-class points and fictive mean point (need at least 2 points by cluster)
+ #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ var(dist(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]))[1:length(temp)]) }else{0}}))) )
+ if(ncol(dataToHeatMap)>1)screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((distExtended(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]),method = opt$distanceMeasure)[1:length(temp)])^2) }else{0}}))) )
+ else screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((dataToHeatMap[temp,]-mean(dataToHeatMap[temp,]))^2) }else{0}}))) )
+ }
+
+ dataToPlot=data.frame(clusterNb=seq(2,length(screePlotData)+1),wcss=screePlotData)
+ p <- ggplot(data=dataToPlot, aes(clusterNb,wcss)) + geom_point(colour="#EE4444") + geom_line(colour="#DD9999") +
+ ggtitle("Scree plot") + theme_bw() + xlab(label="Cluster number") + ylab(label="Within cluster sum of squares") +
+ theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") +
+ scale_x_continuous(breaks=seq(min(dataToPlot$clusterNb), max(dataToPlot$clusterNb), 1))
+
+ #save plotly files
+ pp <- ggplotly(p)
+
+ if(opt$format=="pdf"){
+ pdf(paste(c("./plotDir/screePlot.pdf"),collapse=""))}else{
+ png(paste(c("./plotDir/screePlot.png"),collapse=""))
+ }
+ plot(p)
+ dev.off()
+
+ #save plotly files
+ htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/screePlot.html"),collapse=""),selfcontained = F)
+
+ addComment("[INFO]Scree plot drawn",T,opt$log,display=FALSE)
+}else{
+ addComment(c("[WARNING]Scree plot will not be plotted considering row number <= 2"),T,opt$log,display=FALSE)
+}
+
+##----------------------
+
+#filter output based on parameters
+
+rowToKeep=rownames(dataToHeatMap)
+if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="output"){
+ #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold)))
+ if(is.null(opt$geneListFiltering)){
+ if(is.null(opt$genericData)){
+ #diff. expression matrix
+ rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<=opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>=log2(opt$FCthreshold))))!=0))))
+ }else{
+ #generic filtering matrix
+ rowToKeep=rownames(comparisonMatrix)
+ if(!is.null(opt$comparisonNameLow)){
+ restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0)))))
+ }
+ if(!is.null(opt$comparisonNameHigh)){
+ restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,","))
+ rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]
+#
+# This file is free software: you may copy, redistribute and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 2 of the License, or (at your
+# option) any later version.
+#
+# This file is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+
+#extendedDist function to correlation measure
+distExtended <- function(x,method) {
+ if(method %in% c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"))return(dist(x,method = method))
+ if(method %in% c("pearson", "spearman", "kendall"))return(as.dist(1-cor(t(x),method=method))/2)
+ if(method %in% c("absPearson", "absSpearman", "absKendall"))return(as.dist(1-abs(cor(t(x),method=method))))
+ return(NULL)
+}
+
+##comment function to display message and optionnaly add it to log file
+
+addComment <- function(text,addToFile=FALSE,fileName=NULL,append=TRUE,display=TRUE){
+ if(display)cat(paste(c(text,"\n"),collapse = " "))
+ if(addToFile)write(paste(text,collapse = " "),fileName,append=append)
+}
+
+printSessionInfo <- function(fileName=NULL,append=TRUE){
+ addComment("[INFO]R session info :",T,fileName,display=FALSE)
+ tempInfo=sessionInfo()
+ write(paste(tempInfo$R.version$version.string),fileName,append=append)
+ write(paste("Platform",tempInfo$platform,sep = " : "),fileName,append=append)
+ write(paste("Running under",tempInfo$running,sep = " : "),fileName,append=append)
+ write(paste("Local variables",tempInfo$locale,sep = " : "),fileName,append=append)
+ write(paste("Attached base packages",paste(tempInfo$basePkgs,collapse = "; "),sep = " : "),fileName,append=append)
+ if(length(tempInfo$otherPkgs)>0){
+ lineToPrint=""
+ for(iPack in tempInfo$otherPkgs){
+ lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "")
+ }
+ write(paste("Other attached packages",lineToPrint,sep = " : "),fileName,append=append)
+ }
+ if(length(tempInfo$loadedOnly)>0){
+ lineToPrint=""
+ for(iPack in tempInfo$loadedOnly){
+ lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "")
+ }
+ write(paste("Loaded packages",lineToPrint,sep = " : "),fileName,append=append)
+ }
+}
+
+##negative of a mathematical expression
+negativeExpression <- function(expression){
+ expression=gsub("\\+","_toMinus_",expression)
+ expression=gsub("\\-","+",expression)
+ expression=gsub("_toMinus_","-",expression)
+ if(substr(expression,1,1)!="-" && substr(expression,1,1)!="+"){
+ expression=paste(c("-",expression),collapse="")
+ }
+
+ return(expression)
+}
+
+#' Returns file name of calling Rscript
+#'
+#' \code{get_Rscript_filename} returns the file name of calling Rscript
+#' @return A string with the filename of the calling script.
+#' If not found (i.e. you are in a interactive session) returns NA.
+#'
+#' @export
+get_Rscript_filename <- function() {
+ prog <- sub("--file=", "", grep("--file=", commandArgs(), value=TRUE)[1])
+ if( .Platform$OS.type == "windows") {
+ prog <- gsub("\\\\", "\\\\\\\\", prog)
+ }
+ prog
+}
+
+#' Recursively sorts a list
+#'
+#' \code{sort_list} returns a sorted list
+#' @param unsorted_list A list.
+#' @return A sorted list.
+#' @export
+sort_list <- function(unsorted_list) {
+ for(ii in seq(along=unsorted_list)) {
+ if(is.list(unsorted_list[[ii]])) {
+ unsorted_list[[ii]] <- sort_list(unsorted_list[[ii]])
+ }
+ }
+ unsorted_list[sort(names(unsorted_list))]
+}
+
+
+# Multiple plot function
+#
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols: Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+#
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+ library(grid)
+
+ # Make a list from the ... arguments and plotlist
+ plots <- c(list(...), plotlist)
+
+ numPlots = length(plots)
+
+ # If layout is NULL, then use 'cols' to determine layout
+ if (is.null(layout)) {
+ # Make the panel
+ # ncol: Number of columns of plots
+ # nrow: Number of rows needed, calculated from # of cols
+ layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+ ncol = cols, nrow = ceiling(numPlots/cols))
+ }
+
+ if (numPlots==1) {
+ print(plots[[1]])
+
+ } else {
+ # Set up the page
+ grid.newpage()
+ pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+
+ # Make each plot, in the correct location
+ for (i in 1:numPlots) {
+ # Get the i,j matrix positions of the regions that contain this subplot
+ matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+
+ print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+ layout.pos.col = matchidx$col))
+ }
+ }
+}