# HG changeset patch # User vandelj # Date 1593178703 14400 # Node ID 14045c80a22203e6fc2b0867918842581d2d1bc7 "planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df" diff -r 000000000000 -r 14045c80a222 galaxy/wrappers/ExprHeatmapClustering.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/wrappers/ExprHeatmapClustering.xml Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,928 @@ + + Run hierarchical clustering and plot heatmap from expression data and/or differential expression analysis + + r-plotly + r-dendextend + r-ggdendro + r-ggplot2 + r-heatmaply + r-circlize + bioconductor-complexheatmap + pandoc + + + + + + + + + ./selectedExpressionData; + #else + cp $dataToCluster.expressionData ./selectedExpressionData; + #end if + + ##reorder columns of input data based on factors file + #if $dataToCluster.reorder_sample.reordering_selector=="factorFile" and $dataToCluster.reorder_sample.factorFileData and $dataToCluster.reorder_sample.factorToUse: + awk -v factors="$dataToCluster.reorder_sample.factorToUse" 'BEGIN{FS="\t";OFS="";ORS="";split(factors,factorsTab,",")} FNR==1{for(iFactor=1;iFactor<=length(factorsTab);iFactor++)for(iField=2;iField<=NF;iField++){if(\$iField==factorsTab[iFactor])colsToSelect[iFactor]=iField}} FNR>1{line=\$1;for(iFactor=1;iFactor<=length(factorsTab);iFactor++)line=line"\t"\$colsToSelect[iFactor];print line"\n";}' $dataToCluster.reorder_sample.factorFileData > ./orderingFactor; + + sort -V -k2 ./orderingFactor > ./orderingSample; + + awk 'BEGIN{FS="\t";OFS="";ORS="";factorNumber=0} ARGIND==1{sampleOrdered[FNR]=\$1;factorNumber=FNR} ARGIND==2 && FNR==1{for(iElemt=1;iElemt<=factorNumber;iElemt++)for(iPosit=2;iPosit<=NF;iPosit++)if(\$iPosit==sampleOrdered[iElemt])positOrdered[iElemt]=iPosit} ARGIND==2{line=\$1;for(iElemt=1;iElemt<=factorNumber;iElemt++)if(iElemt in positOrdered)line=line"\t"\$positOrdered[iElemt];print line"\n"}' ./orderingSample ./selectedExpressionData > ./orderedExpressionData; + + ##check if some input data columns were lost during the process + awk 'ARGIND==1 && FNR==1{colNumbA=NF} ARGIND==2 && FNR==1{colNumbB=NF} END{if(colNumbA!=colNumbB) print "[WARNING] "colNumbA-colNumbB" input data columns was removed during reordering due to missing information in factor file!\n"}' ./selectedExpressionData ./orderedExpressionData >> $log; + + #if $advSection.conditionClusterNumber!="1": + printf "[WARNING]Sample clustering option is selected, sample reordering will not be preserved!\n" >> $log; + #end if + #else: + cp ./selectedExpressionData ./orderedExpressionData; + #end if + #end if + + + ##generate common file name for differential analysis results depending on input data nature + #if ($dataToCluster.dataToCluster_selector=="expression" or $dataToCluster.dataToCluster_selector=="genericData") and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam" and $dataToCluster.filtering_step.select_filtering.differentialAnalysis: + cp ${dataToCluster.filtering_step.select_filtering.differentialAnalysis} ./filteredDifferentialAnalysis; + #end if + #if $dataToCluster.dataToCluster_selector=="foldChange" and $dataToCluster.differentialAnalysis: + cp $dataToCluster.differentialAnalysis ./filteredDifferentialAnalysis; + #end if + + + ##generate factor information to use for barplot + #if $advSection.conditionBarColor.conditionBarColor_selector=="yes" and 
$advSection.conditionBarColor.factorFileDataBarPlot and $advSection.conditionBarColor.factorToUse: + awk -v factor="$advSection.conditionBarColor.factorToUse" 'BEGIN{FS="\t";OFS="";ORS=""} NR==1{for(i=2;i<=NF;i++)if(\$i==factor)colToKeep=i} {print \$1"\t"\$colToKeep"\n"}' $advSection.conditionBarColor.factorFileDataBarPlot > ./barPlotFactor; + #end if + + Rscript '$__tool_directory__/../../src/heatMapClustering.R' --log '$log' --outputFile '$outputData' --format '$advSection.imageFormat' --clusterNumber '$advSection.clusterNumber' --maxRows '$advSection.maxSampleToPlot' --sampleClusterNumber '$advSection.conditionClusterNumber' --dataTransformation '$advSection.dataTransformation' --distanceMeasure '$advSection.distanceMeasure' --aggloMethod '$advSection.aggloMethod' + #if $advSection.select_color.specifyColors=="true": + --personalColors '$advSection.select_color.featureMin_color,$advSection.select_color.featureMedium_color,$advSection.select_color.featureMax_color' + #end if + #if $advSection.conditionBarColor.conditionBarColor_selector=="yes" and $advSection.conditionBarColor.factorFileDataBarPlot and $advSection.conditionBarColor.factorToUse: + --factorInfo './barPlotFactor' + --sideBarColorPalette '$advSection.conditionBarColor.sideBarPalette' + #end if + #if $dataToCluster.dataToCluster_selector=="genericData": + --genericData + #end if + #if $dataToCluster.dataToCluster_selector=="expression" or $dataToCluster.dataToCluster_selector=="genericData": + --expressionFile './orderedExpressionData' + #if $dataToCluster.filtering_step.filtering_step_selector!="no": + --filterInputOutput '$dataToCluster.filtering_step.filtering_step_selector' + #if $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam": + --diffAnalyseFile './filteredDifferentialAnalysis' + #if $dataToCluster.dataToCluster_selector=="expression": + --comparisonName '$dataToCluster.filtering_step.select_filtering.comparisonsToInclude' + --FCthreshold '$dataToCluster.filtering_step.select_filtering.FCthreshold' + --pvalThreshold '$dataToCluster.filtering_step.select_filtering.pvalThreshold' + #else: + #if $dataToCluster.filtering_step.select_filtering.comparisonsToIncludeLow and $dataToCluster.filtering_step.select_filtering.valThresholdLow: + --comparisonNameLow '$dataToCluster.filtering_step.select_filtering.comparisonsToIncludeLow' + --FCthreshold '$dataToCluster.filtering_step.select_filtering.valThresholdLow' + #end if + #if $dataToCluster.filtering_step.select_filtering.comparisonsToIncludeHigh and $dataToCluster.filtering_step.select_filtering.valThresholdHigh: + --comparisonNameHigh '$dataToCluster.filtering_step.select_filtering.comparisonsToIncludeHigh' + --pvalThreshold '$dataToCluster.filtering_step.select_filtering.valThresholdHigh' + #end if + #end if + #else: + --geneListFiltering '$dataToCluster.filtering_step.select_filtering.geneListFile' + #end if + #end if + #else + --diffAnalyseFile './filteredDifferentialAnalysis' + --comparisonName '$dataToCluster.comparisonsToInclude' + #if $dataToCluster.filtering_step.filtering_step_selector!="no": + --filterInputOutput '$dataToCluster.filtering_step.filtering_step_selector' + #if $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam": + --FCthreshold '$dataToCluster.filtering_step.select_filtering.FCthreshold' + --pvalThreshold '$dataToCluster.filtering_step.select_filtering.pvalThreshold' + #else: + --geneListFiltering '$dataToCluster.filtering_step.select_filtering.geneListFile' + #end if + #end 
if + #end if + ; + ret_code=\$?; + if [ \$ret_code != 0 ]; then + exit \$ret_code; + else + bash $scriptTransfer; + ret_code=\$?; + if [ \$ret_code != 0 ]; then + exit \$ret_code; + fi + fi; + printf "[INFO]End of tool script" >> $log; + ]]> + + + + + + + + + + + + + + + + + +" > ${html_file.extra_files_path}/outputClustering.html + +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{odd=1;FS="\t"} NR==1{for(i=4;i<=NF;i=i+5){if(odd==1){odd=0;printf "\n"}else{odd=1;printf "\n"}}}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{odd=1;FS="\t"} NR==1{for(i=3;i<=NF;i++){if(odd==1){odd=0;printf "\n"}else{odd=1;printf "\n"}}}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n\n" >> ${html_file.extra_files_path}/outputClustering.html + +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{FS="\t"} NR==1{for(i=4;i<=NF;i=i+5)printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{FS="\t"} NR==2{for(i=4;i<=NF;i++)printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{FS="\t"} NR==1{for(i=3;i<=NF;i++)printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n\n" >> ${html_file.extra_files_path}/outputClustering.html + +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +awk 'BEGIN{FS="\t"} NR==2{for(i=1;i<=NF;i++)printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +awk 'BEGIN{FS="\t"} NR==1{for(i=1;i<=NF;i++)printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "\n\n">> ${html_file.extra_files_path}/outputClustering.html +printf "\n" >> 
${html_file.extra_files_path}/outputClustering.html + +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +awk 'BEGIN{FS="\t"} NR>2{printf "\n";for(i=1;i<=NF;i++){printf "\n"};printf "\n";printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +awk 'BEGIN{FS="\t"} NR>1{printf "\n";for(i=1;i<=NF;i++){printf "\n"};printf "\n";printf "\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "\n" >> ${html_file.extra_files_path}/outputClustering.html + +printf "
GeneInfoCluster"\$i"
"\$i"GeneCluster"\$i"
"\$i""\$i"
"\$i"
"\$i"
+ +" >> ${html_file.extra_files_path}/outputClustering.html + +]]> +
+ + +> $log; + exit 10 +fi + +#write header of html file +printf "\n\n" > $html_file + + +##first add reference of the clustering output table +printf "

Clustering tabular

\n" >> $html_file +printf "Clustering results\n" >> $html_file + + +##manage heatmap file + + +if [ -e ./plotLyDir/Heatmap.html ]; then + +printf "

Heatmap plot

\n" >> $html_file + +##modify HTML to point to the first script folder +sed -i "s/Heatmap_files/PlotLy_Heatmap_scripts/g" ./plotLyDir/Heatmap.html + +##copy HTML files in both folders +cp ./plotLyDir/Heatmap.html ${html_file.extra_files_path}/Heatmap.html + +##add HTML link +printf "Heatmap\n" >> $html_file + +#if $advSection.imagePlotlyFormat=="svg": +##before copying scripts folder modify them to replace png snapshot with svg (not proud of solution but seems to work) +cd ./plotLyDir/Heatmap_files/plotly-main-*/ +awk '{gsub("\"png\"","\"svg\"",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +awk '{gsub("Download plot as a png","Download plot as a svg",\$0);print \$0}' ./plotly-latest.minTemp.js > ./plotly-latest.min.js +rm ./plotly-latest.minTemp.js +cd ../../../ +#end if + +#if $advSection.scaleSnapshot!="1.0": +##before copying scripts folder modify scale parameter (not proud of solution but seems to work) +cd ./plotLyDir/Heatmap_files/plotly-main-*/ +awk '{gsub("h=t\\.scale\\|\\|1","h=$advSection.scaleSnapshot",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +mv -f ./plotly-latest.minTemp.js ./plotly-latest.min.js +cd ../../../ +#end if + +##now copy scripts folder +cp -r ./plotLyDir/Heatmap_files $html_file.extra_files_path +mv ${html_file.extra_files_path}/Heatmap_files ${html_file.extra_files_path}/PlotLy_Heatmap_scripts + +else + printf "[ERROR]Heatmap.html is missing.\n" >> $log; + exit 10 +fi + + + + +##manage screePlot files + + +if [ -e ./plotLyDir/screePlot.html ]; then + +printf "

Scree plot

\n" >> $html_file + +##modify HTML to point to the first script folder +sed -i "s/screePlot_files/PlotLy_screePlot_scripts/g" ./plotLyDir/screePlot.html + +##copy HTML files in both folders +cp ./plotLyDir/screePlot.html ${html_file.extra_files_path}/screePlot.html + +##add HTML link +printf "Scree plot\n" >> $html_file + +#if $advSection.imagePlotlyFormat=="svg": +##before copying scripts folder modify them to replace png snapshot with svg (not proud of solution but seems to work) +cd ./plotLyDir/screePlot_files/plotly-main-*/ +awk '{gsub("\"png\"","\"svg\"",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +awk '{gsub("Download plot as a png","Download plot as a svg",\$0);print \$0}' ./plotly-latest.minTemp.js > ./plotly-latest.min.js +rm ./plotly-latest.minTemp.js +cd ../../../ +#end if + +#if $advSection.scaleSnapshot!="1.0": +##before copying scripts folder modify scale parameter (not proud of solution but seems to work) +cd ./plotLyDir/screePlot_files/plotly-main-*/ +awk '{gsub("h=t\\.scale\\|\\|1","h=$advSection.scaleSnapshot",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +mv -f ./plotly-latest.minTemp.js ./plotly-latest.min.js +cd ../../../ +#end if + +##now copy scripts folder +cp -r ./plotLyDir/screePlot_files $html_file.extra_files_path +mv ${html_file.extra_files_path}/screePlot_files ${html_file.extra_files_path}/PlotLy_screePlot_scripts + +else + printf "[WARNING]screeplot.html is missing, probably due to limited number of genes.\n" >> $log; +fi + + +##manage circular files + + +if [ -e ./plotDir/circularPlot.${advSection.imageFormat} ]; then + +cp ./plotDir/circularPlot.${advSection.imageFormat} ${html_file.extra_files_path}/circularPlot.${advSection.imageFormat} + +printf "

Circular plot

\n" >> $html_file + +##add HTML link +printf "Circular plot\n" >> $html_file + +else + printf "[WARNING]circularPlot file is missing, probably due to limited number of genes.\n" >> $log; +fi + + + +##create footer of HTML file +printf "\n" >> $html_file + +]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \- **FC threshold** to use for filtering, genes with fold change >= threshold or fold change <= 1/threshold will be kept (set this threshold to 1 if you do not want to filter on fold change). + + \- **FDR p-val threshold** to use for filtering, genes with FDR p-val <= threshold will be kept (set this threshold to 1 if you do not want to filter on FDR p-val). + + +- **From generic tabular file** to filter genes based on selected column values. + + \- **Generic tabular file** containing gene identifiers in the first column and, in the following columns, the various values used for filtering (same format as the clustered generic tabular file). + + \- **Low filtering columns** used to discard rows with values below a given threshold (typically for fold-change filtering). If several columns are selected, rows satisfying the threshold condition in at least one of these columns are kept. + + \- **Low filtering threshold** below which rows are discarded; the same threshold is applied to all selected columns. + + \- **High filtering columns** used to discard rows with values above a given threshold (typically for p-value filtering). If several columns are selected, rows satisfying the threshold condition in at least one of these columns are kept. + + \- **High filtering threshold** above which rows are discarded; the same threshold is applied to all selected columns. + +- **From list of genes** to focus on pre-identified genes. + + \- **Gene list file**: a one-column file of gene identifiers, without header. + + :: + + DDX11L2 + VWA1 + TTTY10 + ARSD + +----- + +**Advanced parameters** + +\- **Genes cluster number** used by hierarchical clustering (minimum is 2). See the generated scree plot to adjust this number before re-running a clustering. + +\- **Samples/comparisons clusters number** used by hierarchical clustering applied to columns/conditions. Set it to 1 (i.e. no clustering) if you need to preserve the input column order for visualization purposes. Column cluster information is not (yet) included in the output tabular file. + +\- **Mathematical transformation** applied to the clustered data before clustering and visualization. Data used for the filtering step are not modified by this transformation. + +\- **Distance measure** used to cluster rows and columns. + +\- **Agglomeration method** used to cluster rows and columns. + +\- **Add side bar** to visualize factor values for the displayed columns/conditions, represented as a colored side bar in the heatmap. + +- **Factor file** that contains the factor information used for coloring (same format as the factor file used for reordering the input data columns). + +- **Factor to use** to color the side bar according to its values for the displayed columns/conditions. + +- **Color palette used** for coloring factor values (see the RColorBrewer R package documentation for more information on the proposed palettes). + +\- **Maximum gene number**: for readability and running-time reasons only, the number of rows (genes) displayed in the heatmap/circular plot can be limited. Clustering information in the generated tabular file and scree plot is computed from a global clustering of all genes (except those filtered out before clustering). The heatmap and circular plot are drawn for a random selection of genes; to avoid this random selection, we advise using the input filtering options before clustering so that the gene number stays below this limit. + +\- **Personalized heatmap colors** to build the color gradient by choosing start, middle and end colors. + +\- **Output format** for circular plots only. 
+ +\- **Html snapshot format** for the interactive Plotly plots. + +\- **Scale html snapshots** to increase the resolution of snapshots taken from the interactive Plotly plots. + +----- + +**Outputs** + +\- **Tabular clustering file** containing cluster information for each gene satisfying the filtering steps. If expression or generic data were clustered, a two-column file is generated with gene identifiers and cluster numbers, possibly with additional columns containing the values used for filtering. If differential analysis results were clustered, a similar file is returned with an additional column containing cluster numbers and the differential statistics corresponding to the comparisons used for filtering. + +\- **HTML file** to access the interactive versions of the heatmap and scree plot through Plotly HTML pages, the circular plot image and the tabulated clustering results. As a reminder, when the number of genes to display in the heatmap/circular plot exceeds the maximum gene number parameter, a random sampling is performed for plotting efficiency. Thus, the clustering displayed on the heatmap/circular plot may slightly differ from the clustering information contained in the tabular file, as the heatmap/circular plot clustering is done on a subset of genes whereas the tabular file contains clustering results computed on all genes. + +\- **LOG file** containing information about the execution. Especially useful if the tool execution fails. Please attach this log file to any bug report. + +]]> + + + @misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870} + } + + @article{galili2017heatmaply, + author = {Galili, Tal and O'Callaghan, Alan and + Sidi, Jonathan and Sievert, Carson}, + title = {heatmaply: an R package for creating interactive cluster + heatmaps for online publishing}, + journal = {Bioinformatics}, + year = {2017}, + doi = {10.1093/bioinformatics/btx657}, + url = {http://dx.doi.org/10.1093/bioinformatics/btx657}, + eprint = + {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btx657/21358327/btx657.pdf} + } + + @article{doi:10.1093/bioinformatics/btu393, + author = {Gu, Zuguang and Gu, Lei and Eils, Roland and Schlesner, Matthias and Brors, Benedikt}, + title = {circlize implements and enhances circular visualization in R}, + journal = {Bioinformatics}, + volume = {30}, + number = {19}, + pages = {2811-2812}, + year = {2014}, + doi = {10.1093/bioinformatics/btu393}, + URL = {http://dx.doi.org/10.1093/bioinformatics/btu393}, + eprint = {/oup/backfile/content_public/journal/bioinformatics/30/19/10.1093_bioinformatics_btu393/2/btu393.pdf} + } + + @online{plotly, author = {Plotly Technologies Inc.}, title = {Collaborative data science}, publisher = {Plotly Technologies Inc.}, address = {Montreal, QC}, year = {2015}, url = {https://plot.ly} + } + + + + 
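The wrapper above delegates the actual computation to src/heatMapClustering.R (not included in this changeset), passing --distanceMeasure, --aggloMethod and --clusterNumber among other options. As a rough illustration of that kind of row clustering in base R (parameter values, file names and the geneExpr object are placeholders, not the tool's actual code)::

    # Illustrative sketch only: hierarchical clustering of genes (rows).
    geneExpr <- as.matrix(read.delim("orderedExpressionData", row.names = 1, check.names = FALSE))
    d        <- dist(geneExpr, method = "euclidean")   # e.g. value of --distanceMeasure
    hc       <- hclust(d, method = "ward.D2")          # e.g. value of --aggloMethod
    clusters <- cutree(hc, k = 4)                      # e.g. value of --clusterNumber
    write.table(data.frame(gene = names(clusters), cluster = clusters),
                "clusteringResults.tsv", sep = "\t", quote = FALSE, row.names = FALSE)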
diff -r 000000000000 -r 14045c80a222 src/ExprPlotsScript.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ExprPlotsScript.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,465 @@ +# A command-line interface to basic plots for use with Galaxy +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +#get options +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "dataFile", "i", 1, "character", + "factorInfo","t", 1, "character", + "dataFileFormat","j",1,"character", + "conditionNames","c",1,"character", + "format", "f", 1, "character", + "quiet", "q", 0, "logical", + "log", "l", 1, "character", + "histo" , "h", 1, "character", + "maPlot" , "a", 1, "character", + "boxplot" , "b", 1, "character", + "microarray" , "m", 1, "character", + "acp" , "p" , 1, "character", + "screePlot" , "s" , 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$dataFile) || is.null(opt$dataFileFormat)) { + addComment("[ERROR]'dataFile' and it format are required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$histo) & is.null(opt$maPlot) & is.null(opt$boxplot) & is.null(opt$microarray) & is.null(opt$acp)){ + addComment("[ERROR]Select at least one plot to draw",T,opt$log) + q( "no", 1, F ) +} + +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE} + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#silent package loading +suppressPackageStartupMessages({ + library("oligo") + library("ff") + library("ggplot2") + library("plotly") +}) + + +#chargement des fichiers en entrée +#fichier de type CEL +dataAreFromCel=FALSE +if(toupper(opt$dataFileFormat)=="CEL"){ + dataAreFromCel=TRUE + celData=read.celfiles(unlist(strsplit(opt$dataFile,","))) + #load all expressions + dataMatrix=exprs(celData) + #select "pm" probes + probeInfo=getProbeInfo(celData,probeType = c("pm"),target="probeset") + #reduce dataMatrix to log expression matrix for a randomly probe selection + dataMatrix=log2(dataMatrix[sample(unique(probeInfo[,1]),min(100000,length(unique(probeInfo[,1])))),]) + addComment("[INFO]Raw data are log2 
transformed",TRUE,opt$log,display=FALSE) + remove(probeInfo) +}else{ + #fichier deja tabule + dataMatrix=read.csv(file=opt$dataFile,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames (to avoid X before colnames with header=T) + colNamesData=dataMatrix[1,-1] + dataMatrix=dataMatrix[-1,] + #remove first colum to convert it as rownames + rowNamesData=dataMatrix[,1] + dataMatrix=dataMatrix[,-1] + if(is.data.frame(dataMatrix)){ + dataMatrix=data.matrix(dataMatrix) + }else{ + dataMatrix=data.matrix(as.numeric(dataMatrix)) + } + dimnames(dataMatrix)=list(rowNamesData,colNamesData) + if(any(duplicated(rowNamesData)))addComment("[WARNING] several rows share the same probe/gene name, you should merge or rename them to avoid further analysis mistakes",TRUE,opt$log,display=FALSE) +} + +addComment("[INFO]Input data loaded",TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Dim of data matrix:",dim(dataMatrix)),T,opt$log,display=FALSE) + +#get number of conditions +nbConditions=ncol(dataMatrix) + +#get condition names if they are specified +if(!is.null(opt$conditionNames) && length(opt$conditionNames)==nbConditions){ + nameConditions=opt$conditionNames + colnames(dataMatrix)=nameConditions + #rownames(phenoData(celData)@data)=nameConditions + #rownames(protocolData(celData)@data)=nameConditions +}else{ + nameConditions=colnames(dataMatrix) +} + +#create a correspondance table between plot file names and name displayed in figure legend and html items +correspondanceNameTable=matrix("",ncol=2,nrow=nbConditions) +correspondanceNameTable[,1]=paste("Condition",1:nbConditions,sep="") +correspondanceNameTable[,2]=nameConditions +rownames(correspondanceNameTable)=correspondanceNameTable[,2] + +addComment("[INFO]Retreive condition names",TRUE,opt$log,display=FALSE) + +if(!is.null(opt$factorInfo)){ + #chargement du fichier des facteurs + factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(factorInfoMatrix)=factorInfoMatrix[1,] + factorInfoMatrix=factorInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(factorInfoMatrix)=factorInfoMatrix[,1] + + + if(length(setdiff(colnames(dataMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log) + q( "no", 1, F ) + } + + #order sample as in expression matrix and remove spurious sample + factorInfoMatrix=factorInfoMatrix[colnames(dataMatrix),] + + addComment("[INFO]Factors OK",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) + +} + +addComment("[INFO]Ready to plot",T,opt$log,display=FALSE) + + +##---------------------- + +###plot histograms### +histogramPerFigure=50 +if (!is.null(opt$histo)) { + for(iToPlot in 1:(((nbConditions-1)%/%histogramPerFigure)+1)){ + firstPlot=1+histogramPerFigure*(iToPlot-1) + lastPlot=min(nbConditions,histogramPerFigure*iToPlot) + dataToPlot=data.frame(x=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix))) + p <- ggplot(data=dataToPlot, aes(x = x, color=Experiment)) + stat_density(geom="line", size=1, position="identity") + + ggtitle("Intensity densities") + theme_bw() + ylab(label="Density") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + if(dataAreFromCel){ + #original ploting function + 
#hist(celData[,firstPlot:lastPlot],lty=rep(1,nbConditions)[firstPlot:lastPlot],lwd=2,which='pm',target="probeset",transfo=log2,col=rainbow(nbConditions)[firstPlot:lastPlot]) + p <- p + xlab(label="Log2 intensities") + }else{ + p <- p + xlab(label="Intensities") + } + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse="")) + } + print(p) + dev.off() + #save plotly files + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,iToPlot,".html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot) + addComment("[INFO]Histograms drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot MAplots### +MAplotPerPage=4 +if (!is.null(opt$maPlot)) { + iToPlot=1 + plotVector=list() + toTake=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix))) + refMedianColumn=rowMedians(as.matrix(dataMatrix[toTake,])) + if(length(toTake)>100000)addComment(c("[INFO]high number of input data rows ",length(toTake),"; the generation of MA plot can take a while, please be patient"),TRUE,opt$log,display=FALSE) + for (iCondition in 1:nbConditions){ + #MAplot(celData,which=i,what=pm,transfo=log2) + #smoothScatter(x=xToPlot,y=yToPlot,main=nameConditions[iCondition]) + dataA=dataMatrix[toTake,iCondition] + dataB=refMedianColumn####ATTENTION PAR DEFAUT + xToPlot=0.5*(dataA+dataB) + yToPlot=dataA-dataB + tempX=seq(min(xToPlot),max(xToPlot),0.1) + tempY=unlist(lapply(tempX,function(x){median(yToPlot[intersect(which(xToPlot>=(x-0.1/2)),which(xToPlot<(x+0.1/2)))])})) + + dataToPlot=data.frame(x=xToPlot,y=yToPlot) + dataMedianToPlot=data.frame(x=tempX,y=tempY) + p <- ggplot(data=dataToPlot, aes(x,y)) + stat_density2d(aes(fill = ..density..^0.25), geom = "tile", contour = FALSE, n = 100) + + scale_fill_continuous(low = "white", high = "dodgerblue4") + geom_smooth(data=dataMedianToPlot,colour="red", size=0.5, se=FALSE) + + ggtitle(correspondanceNameTable[iCondition,2]) + theme_bw() + xlab(label="") + ylab(label="") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$maPlot,"_",correspondanceNameTable[iCondition,1],".html"),collapse=""),selfcontained = F) + + if(iCondition==nbConditions || length(plotVector)==MAplotPerPage){ + #define a new plotting file + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$maPlot,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$maPlot,iToPlot,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=2) + dev.off() + if(iCondition0)c(sample(c(x),min(length(x),1000)),max(c(x)),min(c(x)))) + dataOutliers=data.frame(yVal=unlist(boxplotsOutliers),xVal=unlist(lapply(seq_along(boxplotsOutliers),function(x)rep(names(boxplotsOutliers)[x],length(boxplotsOutliers[[x]]))))) + #plot boxplots without outliers + p <- ggplot(data=dataToPlot, aes(y = intensities, x=Experiment ,color=Experiment)) + geom_boxplot(outlier.colour=NA,outlier.shape =NA) + + ggtitle("Intensities") + theme_bw() + xlab(label="") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 45, hjust = 
1),plot.margin=unit(c(10,10,max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x))))),15+max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x)))))),"mm")) + #add to plot sampled outliers + p <- p + geom_point(data=dataOutliers,aes(x=xVal,y=yVal,color=xVal),inherit.aes = F) + if(dataAreFromCel){ + #original plotting function + #boxplot(celData[,firstPlot:lastPlot],which='pm',col=rainbow(nbConditions)[firstPlot:lastPlot],target="probeset",transfo=log2,names=nameConditions[firstPlot:lastPlot],main="Intensities") + p <- p + ylab(label="Log2 intensities") + }else{ + p <- p + ylab(label="Intensities") + } + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$boxplot,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$boxplot,iToPlot,".png"),collapse="")) + } + print(p) + dev.off() + #save plotly files + pp <- ggplotly(p) + + #modify plotly object to get HTML file not too heavy for loading + for(iData in 1:length(pp$x$data)){ + ##get kept outliers y values + #yPointsToKeep=dataOutliers$yVal[which(dataOutliers$xVal==pp$x$data[[iData]]$name)] + if(pp$x$data[[iData]]$type=="scatter"){ + ##scatter plot represent outliers points added to boxplot through geom_point + ##nothing to do as outliers have been sampled allready, just have to modify hover text + #if(length(yPointsToKeep)>0){ + #pointsToKeep=which(pp$x$data[[iData]]$y %in% yPointsToKeep) + #pp$x$data[[iData]]$x=pp$x$data[[iData]]$x[pointsToKeep] + #pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[pointsToKeep] + #pp$x$data[[iData]]$text=pp$x$data[[iData]]$text[pointsToKeep] + #}else{ + #pp$x$data[[iData]]$x=NULL + #pp$x$data[[iData]]$y=NULL + #pp$x$data[[iData]]$marker$opacity=0 + #pp$x$data[[iData]]$hoverinfo=NULL + #pp$x$data[[iData]]$text=NULL + #} + #modify text to display + if(dataAreFromCel){ + pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("log2(intensity) ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = "")))) + }else{ + pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("intensity ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = "")))) + } + }else{ + ##disable marker plotting to keep only box and whiskers plot (outliers are displayed through scatter plot) + pp$x$data[[iData]]$marker$opacity=0 + + #sample 50000 points amongst all data to get a lighter html file, sampling size should not be too low to avoid modifying limit of boxplots + pp$x$data[[iData]]$y=c(sample(dataMatrix[,pp$x$data[[iData]]$name],min(length(dataMatrix[,pp$x$data[[iData]]$name]),50000)),min(dataMatrix[,pp$x$data[[iData]]$name]),max(dataMatrix[,pp$x$data[[iData]]$name])) + pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y)) + + ##first remove outliers info + #downUpValues=boxplot.stats(dataMatrix[,pp$x$data[[iData]]$name])$stats + #if(verbose)addComment(c("filter values for boxplot",pp$x$data[[iData]]$name,"between",min(downUpValues),"and",max(downUpValues)),T,opt$log) + #pointsToRemove=which(pp$x$data[[iData]]$y0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove] + #pointsToRemove=which(pp$x$data[[iData]]$y>max(downUpValues)) + #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove] + #then add sampled outliers info + #pp$x$data[[iData]]$y=c(yPointsToKeep,pp$x$data[[iData]]$y) + #pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y)) + } + } + + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), 
"plotLyDir"),"/",opt$boxplot,iToPlot,".html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot) + addComment("[INFO]Boxplots drawn",T,opt$log,display=FALSE) + +} + +##---------------------- + +###plot microarrays (only for .CEL files)### +if (!is.null(opt$microarray) && dataAreFromCel) { + for (iCondition in 1:nbConditions){ + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".pdf"),collapse=""),onefile = F,width = 5,height = 5)}else{ + png(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".png"),collapse="")) + } + image(celData[,iCondition],main=correspondanceNameTable[iCondition,2]) + dev.off() + } + addComment("[INFO]Microarray drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot PCA plot### +if (!is.null(opt$acp)){ + ##to avoid error when nrow is too large, results quite stable with 200k random selected rows + randomSelection=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix))) + #remove constant variables + + dataFiltered=dataMatrix[randomSelection,] + toRemove=which(unlist(apply(dataFiltered,1,var))==0) + if(length(toRemove)>0){ + dataFiltered=dataFiltered[-toRemove,] + } + ##geom_text(aes(label=Experiments,hjust=1, vjust=1.3), y = PC2+0.01) + PACres = prcomp(t(dataFiltered),scale.=TRUE) + + if(!is.null(opt$screePlot)){ + #scree plot + #p <- fviz_eig(PACres) + dataToPlot=data.frame(compo=seq(1,length(PACres$sdev)),var=(PACres$sdev^2/sum(PACres$sdev^2))*100) + p<-ggplot(data=dataToPlot, aes(x=compo, y=var)) + geom_bar(stat="identity", fill="steelblue") + geom_line() + geom_point() + + ggtitle("Scree plot") + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + + xlab(label="Dimensions") + ylab(label="% explained variances") + scale_x_discrete(limits=dataToPlot$compo) + pp <- ggplotly(p) + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$screePlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$screePlot,".png"),collapse="")) + } + plot(p) + dev.off() + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$screePlot,".html"),collapse=""),selfcontained = F) + } + + #now plot pca plots + + if(!is.null(opt$factorInfo)){ + fileIdent="" + symbolset = c("circle","cross","square","diamond","circle-open","square-open","diamond-open","x") + + #save equivalence between real factor names and generic ones in correspondanceNameTable + correspondanceNameTable=rbind(correspondanceNameTable,matrix(c(paste("Factor",1:(ncol(factorInfoMatrix)-1),sep=""),colnames(factorInfoMatrix)[-1]),ncol=2,nrow=ncol(factorInfoMatrix)-1)) + rownames(correspondanceNameTable)=correspondanceNameTable[,2] + + #first order factors from decreasing groups number + orderedFactors=colnames(factorInfoMatrix)[-1][order(unlist(lapply(colnames(factorInfoMatrix)[-1],function(x)length(table(factorInfoMatrix[,x])))),decreasing = T)] + allFactorsBigger=length(table(factorInfoMatrix[,orderedFactors[length(orderedFactors)]]))>length(symbolset) + if(allFactorsBigger)addComment("[WARNING]All factors are composed of too many groups to display two factors at same time, each PCA plot will display only one factor groups",T,opt$log,display=FALSE) + for(iFactor in 1:length(orderedFactors)){ + #if it is the last factor of the list or if all factor + if(iFactor==length(orderedFactors) || allFactorsBigger){ + if(length(orderedFactors)==1 || allFactorsBigger){ + 
dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=",")))) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + fileIdent=correspondanceNameTable[orderedFactors[iFactor],1] + #add text label to plot + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + #save the plotly plot + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F) + } + }else{ + for(iFactorBis in (iFactor+1):length(orderedFactors)){ + if(length(table(factorInfoMatrix[,orderedFactors[iFactorBis]]))<=length(symbolset)){ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], Attribute2=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactorBis]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=",")))) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),symbol=~Attribute2,symbols = symbolset,hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + fileIdent=paste(correspondanceNameTable[orderedFactors[c(iFactor,iFactorBis)],1],collapse="_AND_") + #add text label to plot + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + #save the plotly plot + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F) + }else{ + addComment(c("[WARNING]PCA with",orderedFactors[iFactor],"and",orderedFactors[iFactorBis],"groups cannot be displayed, too many groups (max",length(symbolset),")"),T,opt$log,display=FALSE) + } + } + } + } + }else{ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x)) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers",marker=list(size=5,color="salmon"),hoverinfo = 'text',text = ~paste(Experiments))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + 
legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + + #save plotly files + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_plot.html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot,dataFiltered) + addComment("[INFO]ACP plot drawn",T,opt$log,display=FALSE) +} + +#write correspondances between plot file names and displayed names in figure legends, usefull to define html items in xml file +write.table(correspondanceNameTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F) + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) +#sessionInfo() diff -r 000000000000 -r 14045c80a222 src/General_functions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/General_functions.py Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,206 @@ +import re +import numpy as np + +def get_column_names( file_path, toNotConsider=-1, each=1): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i!=toNotConsider:#to squeeze the first column + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + return options + +def get_column_names_filteredList( file_path, toNotConsider=[], each=1): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i not in toNotConsider:#to squeeze the first columns + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + return options + +def get_column_names_mergeNumber(file_path, numberToMerge=1, toNotConsider=[]): + options=[] + inputfile = open(file_path) + if int(numberToMerge)>0: + iHeader=0 + for iCurrentLine in inputfile: + iHeader=iHeader+1 + if iHeader>int(numberToMerge): + break + currentLine=iCurrentLine.strip().split("\t") + iOption=-1 + for i, field_component in enumerate( currentLine ): + if i not in toNotConsider:#to squeeze specified columns + iOption=iOption+1 + if iHeader==1: + options.append( ( str(field_component), str(field_component), False ) ) + else: + options[iOption]=(options[iOption][0]+"_"+str(field_component),options[iOption][1]+"_"+str(field_component),False) + else: + currentLine = next(inputfile).strip().split("\t") + for i, field_component in enumerate( currentLine ): + if i not in toNotConsider:#to squeeze specified columns + options.append( ( "Column_"+str(i), "Column_"+str(i), False ) ) + inputfile.close() + return options + +def get_row_names( file_path, factorName ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumn=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorName:#to test + iColumn=i + options=[] + if iColumn!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if (nextLine[iColumn], nextLine[iColumn], False) not in 
options: + options.append( (nextLine[iColumn], nextLine[iColumn], False) ) + inputfile.close() + return options + +def get_condition_file_names( file_list, toNotConsider=-1, each=1): + options=[] + if not isinstance(file_list,list):#if input file is a tabular file, act as get_column_names + inputfile = open(file_list.file_name) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i!=toNotConsider:#to squeeze the first column + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + else:#if input file is a .cel file list or a collection + if not hasattr(file_list[0],'collection'):#if it is not a collection, get name easily + for i, field_component in enumerate( file_list ): + options.append( ( field_component.name, field_component.name, False ) ) + else:#if the file is a collection, have to get deeper in the corresponding HistoryDatasetCollectionAssociation object + for i, field_component in enumerate( file_list[0].collection.elements ): + options.append( ( field_component.element_identifier, field_component.element_identifier, False ) ) + return options + +def generateFactorFile( file_list, factor_list, outputFileName, logFile): + forbidenCharacters={"*",":",",","|"} + outputfile = open(outputFileName, 'w') + outputLog = open(logFile, 'w') + sampleList=[] + if not isinstance(file_list,list): + conditionNames=get_condition_file_names(file_list,0) #unique expression file, remove the first column (index=0) + else : + conditionNames=get_condition_file_names(file_list) #.CEL files + for iSample, sample_component in enumerate (conditionNames): + sampleList.append(str(sample_component[1])) + outputLog.write("[INFO] "+str(len(sampleList))+" sample are detected as input\n") + globalDict=dict() + factorNameList=[] + firstLine="Conditions" + if len(factor_list)==0:#check if there is at least one factor available + outputLog.write("[ERROR] no factor was defined !\n") + return 1 + else: + for iFactor, factor_component in enumerate( factor_list ): + currentSampleList=list(sampleList) + currentFactor=str(factor_component['factorName']) + #check if factor name contains forbidden characters + for specialCharacter in forbidenCharacters: + if currentFactor.find(specialCharacter)!=-1: + outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in factor name : '"+currentFactor+"'\n") + return 4 + #check if factor allready named like that + if not globalDict.get(currentFactor) is None: + outputLog.write("[ERROR] '"+currentFactor+"' is used several times as factor name\n") + return 3 + globalDict[currentFactor]=dict() + firstLine=firstLine+"\t"+currentFactor + factorNameList.append(currentFactor) + if len(factor_component['valueList'])<=1:#check if there is at least two value available + outputLog.write("[ERROR] at least two different values are necessary for '"+currentFactor+"' factor\n") + return 1 + else: + for iValue, value_component in enumerate( factor_component['valueList'] ): + currentValue=str(value_component['valueName']) + #check if factor name contains forbidden characters + for specialCharacter in forbidenCharacters: + if currentValue.find(specialCharacter)!=-1: + outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in value name : '"+currentValue+"'\n") + return 4 + currentSample=str(value_component['valueConditions']).split(",") + for iSample, sample_component in enumerate (currentSample): + if not sample_component in 
currentSampleList: + outputLog.write("[ERROR] sample "+sample_component+" was assigned several times for factor '"+currentFactor+"'\n") + return 2 + currentSampleList.remove(sample_component) + globalDict[currentFactor][sample_component]=currentValue + if(len(currentSampleList)>0): + outputLog.write("[ERROR] for factor '"+currentFactor+"'' sample "+str(currentSampleList)+" are not assigned to any value\n") + return 2 + outputLog.write("[INFO] "+str(len(globalDict))+" factors are detected\n") + #start writing the factor file + outputfile.write(firstLine+"\n") + for iSample, sample_component in enumerate(sampleList): + newLine=sample_component + for iFactor, factor_component in enumerate(factorNameList): + newLine=newLine+"\t"+globalDict[factor_component][sample_component] + outputfile.write(newLine+"\n") + outputfile.close() + outputLog.close() + return 0 + +def selectSubSetTable(file_path,headerLine_number,columnsToAdd,columnNamesToKeep,outputFileName,logFile): + outputLog = open(logFile, 'w') + outputLog.write("[INFO] header line number : "+ headerLine_number+" lines\n") + availableColumnsTuple=get_column_names_mergeNumber(file_path, headerLine_number) + #convert tuple list as a simple array + availableColumns=[] + for iTuple, tuple_content in enumerate (availableColumnsTuple): + availableColumns.append(str(tuple_content[0])) + if len(availableColumns)==0: + outputLog.write("[ERROR] No detected columns in input file\n") + return 1 + selectedColumns=list(columnsToAdd) + for iVolcano, volcano_content in enumerate(columnNamesToKeep): + selectedColumns.append(availableColumns.index(volcano_content['pvalColumn'])) + if volcano_content['fdrColumn'] in availableColumns: + selectedColumns.append(availableColumns.index(volcano_content['fdrColumn'])) + else: + selectedColumns.append(0) + selectedColumns.append(availableColumns.index(volcano_content['fcColumn'])) + if len(selectedColumns)!=(3*len(columnNamesToKeep)+len(columnsToAdd)): + outputLog.write("[ERROR] matching between input file colnames and requested column names failed\n") + return 1 + outputLog.write("[INFO] columns kept : "+str(selectedColumns)+"\n") + #start writting formatted file + inputfile = open(file_path) + outputfile = open(outputFileName, 'w') + iLineCpt=-1 + for iCurrentLine in inputfile: + iLineCpt=iLineCpt+1 + if iLineCpt>=int(headerLine_number): + currentLineFields=np.array(iCurrentLine.strip().split("\t")) + newLine="\t".join(currentLineFields[selectedColumns]) + outputfile.write(newLine+"\n") + if iLineCpt1: + if (nextLine[iColumn], nextLine[iColumn], False) not in options: + options.append( (nextLine[iColumn], nextLine[iColumn], False) ) + inputfile.close() + return options + +def get_row_names_interaction( file_path, factorNameA, factorNameB ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumnA=-1 + iColumnB=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorNameA:#to test + iColumnA=i + if field_component==factorNameB:#to test + iColumnB=i + possibleValuesA=[] + possibleValuesB=[] + if iColumnA!=-1 and iColumnB!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if nextLine[iColumnA] not in possibleValuesA: + possibleValuesA.append(nextLine[iColumnA]) + if nextLine[iColumnB] not in possibleValuesB: + possibleValuesB.append(nextLine[iColumnB]) + inputfile.close() + options=[] + if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None": + for 
counterA in range(len(possibleValuesA)): + for counterB in range(len(possibleValuesB)): + options.append( (possibleValuesA[counterA]+"*"+possibleValuesB[counterB], possibleValuesA[counterA]+"*"+possibleValuesB[counterB], False) ) + return options + +def get_comparisonsA( factorA, valuesA ): + options=[] + formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA)) + possibleValues=formatValuesA.split("', u'") + if len(possibleValues)>=2: + for counter in range(len(possibleValues)-1): + for innerCounter in range(counter+1,len(possibleValues)): + options.append( (possibleValues[counter]+" - "+possibleValues[innerCounter], possibleValues[counter]+" - "+possibleValues[innerCounter], False) ) + options.append( (possibleValues[innerCounter]+" - "+possibleValues[counter], possibleValues[innerCounter]+" - "+possibleValues[counter], False) ) + return options + +def get_comparisonsAB(factorA, valuesA, factorB, valuesB, interaction): + options=[] + formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA)) + possibleValuesA=formatValuesA.split("', u'") + formatValuesB=re.sub("(^\[u')|('\]$)","", str(valuesB)) + possibleValuesB=formatValuesB.split("', u'") + if str(interaction)=="False": + if len(possibleValuesA)>=2: + for counter in range(len(possibleValuesA)-1): + for innerCounter in range(counter+1,len(possibleValuesA)): + options.append( (possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], False) ) + options.append( (possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], False) ) + if len(possibleValuesB)>=2: + for counter in range(len(possibleValuesB)-1): + for innerCounter in range(counter+1,len(possibleValuesB)): + options.append( (possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], False) ) + options.append( (possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], False) ) + else: + if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None": + for counterA in range(len(possibleValuesA)): + for innerCounterA in range(len(possibleValuesA)): + for counterB in range(len(possibleValuesB)): + for innerCounterB in range(len(possibleValuesB)): + if not(counterA==innerCounterA and counterB==innerCounterB): + options.append( ("("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")","("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")", False) ) + return options + +def get_row_names_allInteractions( file_path, factorSelected): + formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected)) + factorsList=formatFactors.split("', u'") + iColumn=[None] * len(factorsList) + valuesList=[None] * len(factorsList) + + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for iField, fieldComponent in enumerate( firstLine ): + for iFactor, factorComponent in enumerate(factorsList): + if fieldComponent==factorComponent: + iColumn[iFactor]=iField + valuesList[iFactor]=[] + + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + for iFactor, factorComponent in enumerate(factorsList): + if nextLine[iColumn[iFactor]] not in valuesList[iFactor]: + 
valuesList[iFactor].append(nextLine[iColumn[iFactor]]) + inputfile.close() + + allCombinations=[] + for iFactor, factorComponent in enumerate(factorsList): + if iFactor==0: + allCombinations=valuesList[iFactor] + else: + currentCombinations=allCombinations + allCombinations=[] + for iValue, valueComponent in enumerate(valuesList[iFactor]): + for iCombination, combination in enumerate(currentCombinations): + allCombinations.append(combination+"*"+valueComponent) + + options=[] + for iCombination, combination in enumerate(allCombinations): + options.append((combination,combination,False)) + + return options + +def get_allrow_names( file_path, factorSelected ): + formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected)) + factorsList=formatFactors.split("', u'") + iColumn=[None] * len(factorsList) + valuesList=[None] * len(factorsList) + + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for iField, fieldComponent in enumerate( firstLine ): + for iFactor, factorComponent in enumerate(factorsList): + if fieldComponent==factorComponent: + iColumn[iFactor]=iField + valuesList[iFactor]=[] + + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + for iFactor, factorComponent in enumerate(factorsList): + if nextLine[iColumn[iFactor]] not in valuesList[iFactor]: + valuesList[iFactor].append(nextLine[iColumn[iFactor]]) + inputfile.close() + + allValues=[] + for iFactor, factorComponent in enumerate(factorsList): + for iValue, valueComponent in enumerate(valuesList[iFactor]): + allValues.append(factorComponent+":"+valueComponent) + + options=[] + for iValue, valueComponent in enumerate(allValues): + options.append((valueComponent,valueComponent,False)) + + return options + +def replaceNamesInFiles(expressionFile_name,conditionFile_name,outputExpressionFile,outputConditionFile,ouputDictionnary): + dico={} + forbidenCharacters={"*",":",",","|"} + ##start with expression file, read only the first line + inputfile = open(expressionFile_name) + outputfile = open(outputExpressionFile, 'w') + firstLine = next(inputfile).rstrip().split("\t") + iCondition=1 + newFirstLine="" + for i, field_component in enumerate( firstLine ): + if (i>0): + #conditions names should not be redundant with other conditions + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newFirstLine+="\t"+"Condition"+str(iCondition) + iCondition+=1 + else: + raise NameError('condition name allready exists!') + else: + newFirstLine+=field_component + outputfile.write(newFirstLine+"\n") + for line in inputfile: + outputfile.write(line) + outputfile.close() + inputfile.close() + #then parse condition file, read all lines in this case + inputfile = open(conditionFile_name) + outputfile = open(outputConditionFile, 'w') + firstLine=1 + iFactor=1 + iValue=1 + for line in inputfile: + currentLine = line.rstrip().split("\t") + newCurrentLine="" + for i, field_component in enumerate( currentLine ): + #special treatment for the first line + if (firstLine==1): + if (i==0): + newCurrentLine=field_component + else: + #factor names should not be redundant with other factors or conditions + if(field_component not in dico): + dico[field_component]="Factor"+str(iFactor) + newCurrentLine+="\t"+"Factor"+str(iFactor) + iFactor+=1 + else: + raise NameError('factor name allready exists!') + else: + if (i==0): + #check if condition name allready exist and used it if it is, or create a new one if not + if(field_component not in dico): + 
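+                        #Illustrative sketch of the aliasing scheme built here (hypothetical labels):
+                        #  dico = {"sampleA": "Condition1", "treatment": "Factor1", "treated": "Value1"}
+                        #Condition labels are always aliased, whereas factor or value labels containing
+                        #one of the forbidden characters (* : , |) make the function return 1 further down.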
dico[field_component]="Condition"+str(iCondition) + newCurrentLine="Condition"+str(iCondition) + iCondition+=1 + else: + newCurrentLine=dico[field_component] + else: + if(field_component not in dico): + dico[field_component]="Value"+str(iValue) + newCurrentLine+="\tValue"+str(iValue) + iValue+=1 + else: + newCurrentLine+="\t"+dico[field_component] + outputfile.write(newCurrentLine+"\n") + firstLine=0 + outputfile.close() + inputfile.close() + ##check if any entries in dictionnary contains forbiden character + for key, value in dico.items(): + for specialCharacter in forbidenCharacters: + if value.startswith("Condition")==False and key.find(specialCharacter)!=-1: + return 1 + ##then write dictionnary in a additional file + outputfile = open(ouputDictionnary, 'w') + for key, value in dico.items(): + outputfile.write(key+"\t"+value+"\n") + outputfile.close() + return 0 + + +def replaceNamesBlockInFiles(expressionFile_name,conditionFile_name,blockingFile_name,outputExpressionFile,outputConditionFile,outputBlockingFile,ouputDictionnary): + dico={} + forbidenCharacters={"*",":",",","|"} + ##start with expression file, read only the first line + inputfile = open(expressionFile_name) + outputfile = open(outputExpressionFile, 'w') + firstLine = next(inputfile).rstrip().split("\t") + iCondition=1 + newFirstLine="" + for i, field_component in enumerate( firstLine ): + if (i>0): + #conditions names should not be redundant with other conditions + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newFirstLine+="\t"+"Condition"+str(iCondition) + iCondition+=1 + else: + raise NameError('condition name allready exists!') + else: + newFirstLine+=field_component + outputfile.write(newFirstLine+"\n") + for line in inputfile: + outputfile.write(line) + outputfile.close() + inputfile.close() + #then parse condition file, read all lines in this case + iFactor=1 + iValue=1 + for fileNum in range(2): + if fileNum==0: + inputfile = open(conditionFile_name) + outputfile = open(outputConditionFile, 'w') + else: + inputfile = open(blockingFile_name) + outputfile = open(outputBlockingFile, 'w') + firstLine=1 + for line in inputfile: + currentLine = line.rstrip().split("\t") + newCurrentLine="" + for i, field_component in enumerate( currentLine ): + #special treatment for the first line + if (firstLine==1): + if (i==0): + newCurrentLine=field_component + else: + #factor names should not be redundant with other factors or conditions + if(field_component not in dico): + dico[field_component]="Factor"+str(iFactor) + newCurrentLine+="\t"+"Factor"+str(iFactor) + iFactor+=1 + else: + raise NameError('factor name allready exists!') + else: + if (i==0): + #check if condition name allready exist and used it if it is, or create a new one if not + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newCurrentLine="Condition"+str(iCondition) + iCondition+=1 + else: + newCurrentLine=dico[field_component] + else: + if(field_component not in dico): + dico[field_component]="Value"+str(iValue) + newCurrentLine+="\tValue"+str(iValue) + iValue+=1 + else: + newCurrentLine+="\t"+dico[field_component] + outputfile.write(newCurrentLine+"\n") + firstLine=0 + outputfile.close() + inputfile.close() + ##check if any entries in dictionnary contains forbiden character + for key, value in dico.items(): + for specialCharacter in forbidenCharacters: + if value.startswith("Condition")==False and key.find(specialCharacter)!=-1: + return 1 + ##then write dictionnary in a additional 
file + outputfile = open(ouputDictionnary, 'w') + for key, value in dico.items(): + outputfile.write(key+"\t"+value+"\n") + outputfile.close() + return 0 diff -r 000000000000 -r 14045c80a222 src/LIMMAscriptV4.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/LIMMAscriptV4.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,1002 @@ +# A command-line interface for LIMMA to use with Galaxy +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "dataFile", "i", 1, "character", + "factorInfo","a", 1, "character", + "blockingInfo","b", 1, "character", + "dicoRenaming","g",1,"character", + "blockingPolicy","u", 1, "character", + "fdrThreshold","t", 1, "double", + "thresholdFC","d", 1, "double", + "format", "f", 1, "character", + "histo","h", 1, "character", + "volcano","v", 1, "character", + "factorsContrast","r", 1, "character", + "contrastNames","p", 1, "character", + "firstGroupContrast","m", 1, "character", + "secondGroupContrast","n", 1, "character", + "controlGroups","c", 1, "character", + "fratioFile","s",1,"character", + "organismID","x",1,"character", + "rowNameType","y",1,"character", + "quiet", "q", 0, "logical", + "log", "l", 1, "character", + "outputFile" , "o", 1, "character", + "outputDfFile" , "z", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required\n") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$dataFile)) { + addComment("[ERROR]'dataFile' is required",T,opt$log) + q( "no", 1, F ) +} +if (!is.null(opt$blockingInfo) && is.null(opt$blockingPolicy) ) { + addComment("[ERROR]blocking policy is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$dicoRenaming)) { + addComment("[ERROR]renaming dictionnary is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$factorsContrast)) { + addComment("[ERROR]factor informations are missing",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$firstGroupContrast)!=length(opt$secondGroupContrast)) { + addComment("[ERROR]some contrast groups seems to be empty",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$factorInfo)) { + addComment("[ERROR]factors info is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fdrThreshold)) { + addComment("[ERROR]'FDR threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile) || is.null(opt$outputDfFile)){ + addComment("[ERROR]'output files' are 
required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$thresholdFC)){ + addComment("[ERROR]'FC threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fratioFile)) { + addComment("[ERROR]F-ratio parameter is missing",T,opt$log) + q( "no", 1, F ) +} + +#demande si le script sera bavard +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE +} + +#paramètres internes +#pour savoir si on remplace les FC calculés par LIMMA par un calcul du LS-MEAN (ie moyenne de moyennes de chaque groupe dans chaque terme du contraste plutôt qu'une moyenne globale dans chaque terme) +useLSmean=FALSE + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#charge des packages silencieusement +suppressPackageStartupMessages({ + library("methods") + library("limma") + library("biomaRt") + library("ggplot2") + library("plotly") + library("stringr") + library("RColorBrewer") +}) + + +#chargement du fichier dictionnaire de renommage +renamingDico=read.csv(file=file.path(getwd(), opt$dicoRenaming),header=F,sep="\t",colClasses="character") +rownames(renamingDico)=renamingDico[,2] + + +#chargement des fichiers en entrée +expDataMatrix=read.csv(file=file.path(getwd(), opt$dataFile),header=F,sep="\t",colClasses="character") +#remove first row to convert it as colnames (to avoid X before colnames with header=T) +colNamesData=expDataMatrix[1,-1] +expDataMatrix=expDataMatrix[-1,] +#remove first colum to convert it as rownames +rowNamesData=expDataMatrix[,1] +expDataMatrix=expDataMatrix[,-1] +if(is.data.frame(expDataMatrix)){ + expDataMatrix=data.matrix(expDataMatrix) +}else{ + expDataMatrix=data.matrix(as.numeric(expDataMatrix)) +} +dimnames(expDataMatrix)=list(rowNamesData,colNamesData) + +#test the number of rows that are constant in dataMatrix +nbConstantRows=length(which(unlist(apply(expDataMatrix,1,var))==0)) +if(nbConstantRows>0){ + addComment(c("[WARNING]",nbConstantRows,"rows are constant across conditions in input data file"),T,opt$log,display=FALSE) +} + +#test if all condition names are present in dico +if(!all(colnames(expDataMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing condition names in renaming dictionary",T,opt$log) + q( "no", 1, F ) +} + +addComment("[INFO]Expression data loaded and checked",T,opt$log,display=FALSE) +addComment(c("[INFO]Dim of expression matrix:",dim(expDataMatrix)),T,opt$log,display=FALSE) + +#chargement du fichier des facteurs +factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character") +#remove first row to convert it as colnames +colnames(factorInfoMatrix)=factorInfoMatrix[1,] +factorInfoMatrix=factorInfoMatrix[-1,] +#use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case +rownames(factorInfoMatrix)=factorInfoMatrix[,1] + +if(length(setdiff(colnames(expDataMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log) + q( "no", 1, F ) +} + +#order sample as in expression matrix and remove spurious sample +factorInfoMatrix=factorInfoMatrix[colnames(expDataMatrix),] + +#test if all values names are present in dico +if(!all(unlist(factorInfoMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing factor names 
in renaming dictionary",T,opt$log) + q( "no", 1, F ) +} + +addComment("[INFO]Factors OK",T,opt$log,display=FALSE) +addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) + +##manage blocking factor +blockingFactor=NULL +blockinFactorsList=NULL +if(!is.null(opt$blockingInfo)){ + + #chargement du fichier des blocking factors + blockingInfoMatrix=read.csv(file=file.path(getwd(), opt$blockingInfo),header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(blockingInfoMatrix)=blockingInfoMatrix[1,] + blockingInfoMatrix=blockingInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(blockingInfoMatrix)=blockingInfoMatrix[,1] + + + if(length(setdiff(colnames(expDataMatrix),rownames(blockingInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in blocking factor file",T,opt$log) + q( "no", 1, F ) + } + + #order sample as in expression matrix + blockingInfoMatrix=blockingInfoMatrix[colnames(expDataMatrix),] + + #test if all blocking names are present in dico + if(!all(unlist(blockingInfoMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing blocking names in renaming dictionary",T,opt$log) + q( "no", 1, F ) + } + + #remove blocking factors allready present as real factors + blockingNotInMainFactors=setdiff(colnames(blockingInfoMatrix)[-1],colnames(factorInfoMatrix)[-1]) + + if(length(blockingNotInMainFactors)<(ncol(blockingInfoMatrix)-1))addComment("[WARNING]Blocking factors cannot be principal factors",T,opt$log,display=FALSE) + + if(length(blockingNotInMainFactors)>0){ + + blockingInfoMatrix=blockingInfoMatrix[,c(colnames(blockingInfoMatrix)[1],blockingNotInMainFactors)] + + groupBlocking=rep("c",ncol(expDataMatrix)) + #for each blocking factor + for(blockingFact in blockingNotInMainFactors){ + if(opt$blockingPolicy=="correlated"){ + indNewFact=as.numeric(factor(blockingInfoMatrix[,blockingFact])) + groupBlocking=paste(groupBlocking,indNewFact,sep="_") + }else{ + if(is.null(blockinFactorsList))blockinFactorsList=list() + blockinFactorsList[[blockingFact]]=factor(unlist(lapply(blockingInfoMatrix[,blockingFact],function(x)paste(c(blockingFact,"_",x),collapse="")))) + } + } + if(opt$blockingPolicy=="correlated"){ + blockingFactor=factor(groupBlocking) + if(length(levels(blockingFactor))==1){ + addComment("[ERROR]Selected blocking factors seems to be constant",T,opt$log) + q( "no", 1, F ) + } + } + addComment("[INFO]Blocking info OK",T,opt$log,display=FALSE) + }else{ + addComment("[WARNING]No blocking factors will be considered",T,opt$log,display=FALSE) + } +} + + +##rename different input parameters using renamingDictionary +opt$factorsContrast=renamingDico[unlist(lapply(unlist(strsplit(opt$factorsContrast,",")),function(x)which(renamingDico[,1]==x))),2] + +userDefinedContrasts=FALSE +if(!is.null(opt$firstGroupContrast) && !is.null(opt$secondGroupContrast)){ + userDefinedContrasts=TRUE + for(iContrast in 1:length(opt$firstGroupContrast)){ + opt$firstGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",") + 
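+    #The lookup above is done term by term: a user group such as "WT*treated,KO*treated"
+    #(hypothetical labels) is split on "," then on "*", each label is matched against column 1
+    #of renamingDico and replaced by its alias from column 2, roughly
+    #  paste(renamingDico[match(c("WT","treated"), renamingDico[,1]), 2], collapse="*")
+    #before the pieces are glued back with ","; the second contrast group is renamed the same way below.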
opt$secondGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",") + } +} + +if(!is.null(opt$controlGroups)){ + renamedGroups=c() + for(iGroup in unlist(strsplit(opt$controlGroups,","))){ + renamedControlGroup=paste(renamingDico[unlist(lapply(unlist(strsplit(iGroup,":")),function(x)which(renamingDico[,1]==x))),2],collapse=":") + if(length(renamedControlGroup)==0 || any(which(unlist(gregexpr(text = renamedControlGroup,pattern = ":"))==-1))){ + addComment("[ERROR]Control groups for interaction seem to mismatch, please check them.",T,opt$log) + q( "no", 1, F ) + } + renamedGroups=c(renamedGroups,renamedControlGroup) + } + opt$controlGroups=renamedGroups +} +addComment("[INFO]Contrast variables are renamed to avoid confusion",T,opt$log,display=FALSE) +##renaming done + +#to convert factor as numeric value --> useless now ? +#expDataMatrix=apply(expDataMatrix,c(1,2),function(x)as.numeric(paste(x))) + +#get factors info for LIMMA +factorsList=list() +for(iFactor in opt$factorsContrast){ + if(!(iFactor %in% colnames(factorInfoMatrix))){ + addComment("[ERROR]Required factors are missing in input file",T,opt$log) + q( "no", 1, F ) + } + factorsList[[iFactor]]=factor(unlist(lapply(factorInfoMatrix[,iFactor],function(x)paste(c(iFactor,"_",x),collapse="")))) + if(length(levels(factorsList[[iFactor]]))==1){ + addComment("[ERROR]One selected factor seems to be constant",T,opt$log) + q( "no", 1, F ) + } +} + +#check if there is at least 2 factors to allow interaction computation +if(!is.null(opt$controlGroups) && length(factorsList)<2){ + addComment("[ERROR]You cannot ask for interaction with less than 2 factors",T,opt$log) + q( "no", 1, F ) +} + +#merge all factors as a single one +factorsMerged=as.character(factorsList[[opt$factorsContrast[1]]]) +for(iFactor in opt$factorsContrast[-1]){ + factorsMerged=paste(factorsMerged,as.character(factorsList[[iFactor]]),sep=".") +} +factorsMerged=factor(factorsMerged) + +#checked that coefficient number (ie. 
factorsMerged levels) is strictly smaller than sample size +if(length(levels(factorsMerged))>=length(factorsMerged)){ + addComment(c("[ERROR]No enough samples (",length(factorsMerged),") to estimate ",length(levels(factorsMerged))," coefficients"),T,opt$log) + q( "no", 1, F ) +} + +#get the sample size of each factor values +sampleSizeFactor=table(factorsMerged) + + +if(!is.null(blockinFactorsList)){ + factorString=c("blockinFactorsList[['", names(blockinFactorsList)[1],"']]") + for(blockingFact in names(blockinFactorsList)[-1]){ + factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]") + } + design = model.matrix(as.formula(paste(c("~ factorsMerged +",factorString," + 0"),collapse=""))) + + #rename design columns + coeffMeaning = levels(factorsMerged) + for(blockingFact in blockinFactorsList){ + coeffMeaning=c(coeffMeaning,levels(blockingFact)[-1]) + } + colnames(design) = coeffMeaning +}else{ + design = model.matrix(as.formula( ~ factorsMerged + 0)) + + #rename degin columns + coeffMeaning = levels(factorsMerged) + colnames(design) = coeffMeaning +} + +addComment(c("[INFO]Available coefficients: ",coeffMeaning),T,opt$log,display=F) + +estimableCoeff=which(colSums(design)!=0) + +addComment("[INFO]Design done",T,opt$log,display=F) + + #use blocking factor if exists +if(!is.null(blockingFactor)){ + corfit <- duplicateCorrelation(expDataMatrix, design, block=blockingFactor) + + addComment(c("[INFO]Correlation within groups: ",corfit$consensus.correlation),T,opt$log,display=F) + + #run linear model fit + data.fit = lmFit(expDataMatrix,design,block = blockingFactor, correlation=corfit$consensus.correlation) +}else{ + #run linear model fit + data.fit = lmFit(expDataMatrix,design) +} + +estimatedCoeff=which(!is.na(data.fit$coefficients[1,])) + +addComment("[INFO]Lmfit done",T,opt$log,display=F) + +#catch situation where some coefficients cannot be estimated, probably due to dependances between design columns +#if(length(setdiff(estimableCoeff,estimatedCoeff))>0){ +# addComment("[ERROR]Error in design matrix, check your group definitions",T,opt$log) +# q( "no", 1, F ) +#} +#to strong condition, should return ERROR only when coefficients relative to principal factors cannot be estimated, otherwise, return a simple WARNING + +#define requested contrasts +requiredContrasts=c() +humanReadingContrasts=c() +persoContrastName=c() +if(userDefinedContrasts){ + for(iContrast in 1:length(opt$firstGroupContrast)){ + posGroup=unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse="."))) + negGroup=unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse="."))) + #clear posGroup and negGroup from empty groups + emptyPosGroups=which(!(posGroup%in%coeffMeaning)) + if(length(emptyPosGroups)>0){ + addComment(c("[WARNING]The group(s)",posGroup[emptyPosGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE) + posGroup=posGroup[-emptyPosGroups] + currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],","))[-emptyPosGroups],collapse="+") + }else{ + currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),collapse="+") + } + emptyNegGroups=which(!(negGroup%in%coeffMeaning)) + if(length(emptyNegGroups)>0){ + addComment(c("[WARNING]The group(s)",negGroup[emptyNegGroups],"is/are removed from contrast as it/they is/are 
empty"),T,opt$log,display=FALSE) + negGroup=negGroup[-emptyNegGroups] + currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))[-emptyNegGroups]),collapse="-") + }else{ + currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))),collapse="-") + } + if(length(posGroup)==0 || length(negGroup)==0 ){ + addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to empty group"),T,opt$log,display=FALSE) + }else{ + if(all(posGroup%in%negGroup) && all(negGroup%in%posGroup)){ + addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to null contrast"),T,opt$log,display=FALSE) + }else{ + #get coefficients required for first group added as positive + positiveCoeffWeights=sampleSizeFactor[posGroup]/sum(sampleSizeFactor[posGroup]) + #positiveCoeffWeights=rep(1,length(posGroup)) + #names(positiveCoeffWeights)=names(sampleSizeFactor[posGroup]) + #get coefficients required for second group added as negative + negativeCoeffWeights=sampleSizeFactor[negGroup]/sum(sampleSizeFactor[negGroup]) + #negativeCoeffWeights=rep(1,length(negGroup)) + #names(negativeCoeffWeights)=names(sampleSizeFactor[negGroup]) + #build the resulting contrast + currentContrast=paste(paste(positiveCoeffWeights[posGroup],posGroup,sep="*"),collapse="+") + currentContrast=paste(c(currentContrast,paste(paste(negativeCoeffWeights[negGroup],negGroup,sep="*"),collapse="-")),collapse="-") + requiredContrasts=c(requiredContrasts,currentContrast) + + #build the human reading contrast + humanReadingContrasts=c(humanReadingContrasts,currentHumanContrast) + if(!is.null(opt$contrastNames) && nchar(opt$contrastNames[iContrast])>0){ + persoContrastName=c(persoContrastName,opt$contrastNames[iContrast]) + }else{ + persoContrastName=c(persoContrastName,"") + } + + addComment(c("[INFO]Contrast added : ",currentHumanContrast),T,opt$log,display=F) + addComment(c("with complete formula ",currentContrast),T,opt$log,display=F) + } + } + } +} + + + #define the true formula with interactions to get interaction coefficients + factorString=c("factorsList[['", names(factorsList)[1],"']]") + for(iFactor in names(factorsList)[-1]){ + factorString=c(factorString," * factorsList[['",iFactor,"']]") + } + + if(!is.null(blockinFactorsList)){ + for(blockingFact in names(blockinFactorsList)){ + factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]") + } + } + + #should not be null at the end + allFtestMeanSquare=NULL + #to get the F-test values + estimatedInteractions=rownames(anova(lm(as.formula(paste(c("expDataMatrix[1,] ~ ",factorString),collapse=""))))) + estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x){temp=unlist(strsplit(x,"[ \" | : ]"));paste(temp[seq(2,length(temp),3)],collapse=":")})),estimatedInteractions[length(estimatedInteractions)]) + #rename estimated interaction terms using renamingDico + estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x)paste(renamingDico[unlist(strsplit(x,":")),1],collapse=":"))),estimatedInteractions[length(estimatedInteractions)]) + t <- unlist(apply(expDataMatrix,1,function(x){temp=anova(lm(as.formula(paste(c("x ~ ",factorString),collapse=""))))$`Mean Sq`;temp/temp[length(temp)]})) + allFtestMeanSquare <- t(matrix(t,nrow=length(estimatedInteractions))) + #remove from allFtest rows containing NA + 
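+  #Each value stored in allFtestMeanSquare above is a term's Mean Sq divided by the residual
+  #Mean Sq (the last value returned by anova), i.e. one F-ratio per model term; rows containing
+  #NA are discarded just below. A commented sketch for one hypothetical gene and two factors:
+  #  fit <- lm(expDataMatrix[1,] ~ factorsList[["Factor1"]] * factorsList[["Factor2"]])
+  #  ms  <- anova(fit)$`Mean Sq`
+  #  ms / ms[length(ms)]   #the residual term itself gives 1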
if(length(which(is.na(allFtestMeanSquare[,1])))>0)allFtestMeanSquare=allFtestMeanSquare[-(which(is.na(allFtestMeanSquare[,1]))),] + colnames(allFtestMeanSquare)=estimatedInteractions + + #add contrasts corresponding to interaction terms + if(!is.null(opt$controlGroups)){ + #first load user defined control group for each factor + controlGroup=rep(NA,length(factorsList)) + names(controlGroup)=names(factorsList) + for(iGroup in opt$controlGroups){ + splitGroup=unlist(strsplit(iGroup,":")) + splitGroup[2]=paste(splitGroup[1],splitGroup[2],sep = "_") + #check if defined control group is really a level of the corresponding factor + if(!splitGroup[1]%in%names(controlGroup) || !splitGroup[2]%in%factorsList[[splitGroup[1]]]){ + addComment(c("[ERROR]The factor name",splitGroup[1],"does not exist or group name",splitGroup[2]),T,opt$log) + q( "no", 1, F ) + } + if(!is.na(controlGroup[splitGroup[1]])){ + addComment("[ERROR]Several control groups are defined for the same factor, please select only one control group for each factor if you want to compute interaction contrasts",T,opt$log) + q( "no", 1, F ) + } + controlGroup[splitGroup[1]]=splitGroup[2] + } + + #check if all factor have a defined control group + if(any(is.na(controlGroup))){ + addComment("[ERROR]Missing control group for some factors, please check them if you want to compute interaction contrasts",T,opt$log) + q( "no", 1, F ) + } + + nbFactors=length(factorsList) + interactionContrasts=c() + contrastClass=c() + #initialize list for the first level + newPreviousLoopContrast=list() + for(iFactorA in 1:(nbFactors-1)){ + nameFactorA=names(factorsList)[iFactorA] + compA=c() + for(levelA in setdiff(levels(factorsList[[iFactorA]]),controlGroup[nameFactorA])){ + compA=c(compA,paste(levelA,controlGroup[nameFactorA],sep="-")) + } + newPreviousLoopContrast[[as.character(iFactorA)]]=compA + } + #make a loop for growing interaction set + for(globalIfactor in 1:(nbFactors-1)){ + previousLoopContrast=newPreviousLoopContrast + newPreviousLoopContrast=list() + #factor A reuse contrasts made at previsous loop + for(iFactorA in names(previousLoopContrast)){ + compA=previousLoopContrast[[iFactorA]] + + if(max(as.integer(unlist(strsplit(iFactorA,"\\."))))1 && iFactor0){ + requiredContrasts=c(requiredContrasts,interactionContrasts[-contrastToIgnore]) + humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction[-contrastToIgnore]) + persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction[-contrastToIgnore]))) + }else{ + requiredContrasts=c(requiredContrasts,interactionContrasts) + humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction) + persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction))) + } + }#end of intreaction contrasts + + + #remove from requiredContrasts contrasts that cannot be estimated + toRemove=unique(unlist(lapply(setdiff(coeffMeaning,names(estimatedCoeff)),function(x)grep(x,requiredContrasts)))) + if(length(toRemove)>0){ + addComment(c("[WARNING]",length(toRemove)," contrasts are removed, due to missing coefficients"),T,opt$log,display=FALSE) + requiredContrasts=requiredContrasts[-toRemove] + humanReadingContrasts=humanReadingContrasts[-toRemove] + persoContrastName=persoContrastName[-toRemove] + } + + if(length(requiredContrasts)==0){ + addComment("[ERROR]No contrast to compute, please check your contrast definition.",T,opt$log) + q( "no", 1, F ) + } + + #compute for each contrast mean of coefficients in posGroup and negGroup for FC computation of log(FC) with LSmean as in 
Partek + meanPosGroup=list() + meanNegGroup=list() + for(iContrast in 1:length(requiredContrasts)){ + #define posGroup and negGroup + #first split contrast + temp=unlist(strsplit(requiredContrasts[iContrast],"[ + ]")) + splitContrast=temp[1] + for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp) + splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + #and then put each term in good group + posGroup=c() + negGroup=c() + nextIsPos=TRUE + for(iSplit in splitContrast){ + if(iSplit=="+")nextIsPos=TRUE + if(iSplit=="-")nextIsPos=FALSE + if(iSplit!="-" && iSplit!="+"){ + #remove weights of contrast terms + iSplitBis=unlist(strsplit(iSplit,"[*]")) + iSplitBis=iSplitBis[length(iSplitBis)] + if(nextIsPos)posGroup=c(posGroup,iSplitBis) + else negGroup=c(negGroup,iSplitBis) + } + } + #compute means for each group + meanPosGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,posGroup],ncol=length(posGroup)),1,mean) + meanNegGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,negGroup],ncol=length(negGroup)),1,mean) + } + + + contrast.matrix = makeContrasts(contrasts=requiredContrasts,levels=design) + data.fit.con = contrasts.fit(data.fit,contrast.matrix) + + addComment("[INFO]Contrast definition done",T,opt$log,T,display=FALSE) + + #compute LIMMA statistics + data.fit.eb = eBayes(data.fit.con) + + addComment("[INFO]Estimation done",T,opt$log,T,display=FALSE) + + #adjust p.value through FDR + data.fit.eb$adj_p.value=data.fit.eb$p.value + for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){ + data.fit.eb$adj_p.value[,iComparison]=p.adjust(data.fit.eb$p.value[,iComparison],"fdr") + } + + #add a new field based on LS-means for each contrast instead of global mean like they were calculated in coefficients field + data.fit.eb$coefficientsLS=data.fit.eb$coefficients + if(ncol(data.fit.eb$coefficients)!=length(meanPosGroup)){ + addComment("[ERROR]Estimated contrasts number unexpected",T,opt$log) + q( "no", 1, F ) + } + for(iContrast in 1:length(meanPosGroup)){ + data.fit.eb$coefficientsLS[,iContrast]=meanPosGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]-meanNegGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)] + } + + #if requested replace coefficient computed as global mean by LS-means values + if(useLSmean)data.fit.eb$coefficients=data.fit.eb$coefficientsLS + +addComment("[INFO]Core treatment done",T,opt$log,T,display=FALSE) + + +##convert humanReadingContrasts with namingDictionary to create humanReadingContrastsRenamed and keep original humanReadingContrasts names for file names +humanReadingContrastsRenamed=rep("",length(humanReadingContrasts)) +for(iContrast in 1:length(humanReadingContrasts)){ + if(persoContrastName[iContrast]==""){ + #if(verbose)addComment(humanReadingContrasts[iContrast]) + specialCharacters=str_extract_all(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]] + #if(verbose)addComment(specialCharacters) + nameConverted=unlist(lapply(strsplit(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]],function(x)renamingDico[x,1])) + #if(verbose)addComment(nameConverted) + humanReadingContrastsRenamed[iContrast]=paste(nameConverted,specialCharacters,collapse="",sep="") + #if(verbose)addComment(humanReadingContrastsRenamed[iContrast]) + humanReadingContrastsRenamed[iContrast]=substr(humanReadingContrastsRenamed[iContrast],1,nchar(humanReadingContrastsRenamed[iContrast])-1) + }else{ + 
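+    #This branch keeps the user-supplied contrast name as is; the branch above translated the
+    #internal aliases back to the original labels through renamingDico, re-inserting between the
+    #translated pieces the separators (+ * _ : -) captured by str_extract_all.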
humanReadingContrastsRenamed[iContrast]=persoContrastName[iContrast] + } +} + +#write correspondances between plot file names (humanReadingContrasts) and displayed names in figure legends (humanReadingContrastsRenamed), usefull to define html items in xml file +correspondanceTable=matrix("",ncol=2,nrow=ncol(data.fit.eb$p.value)) +correspondanceTable[,1]=unlist(lapply(humanReadingContrasts,function(x)gsub(":","_INT_",gsub("\\+","_PLUS_",gsub("\\*","_AND_",x))))) +correspondanceTable[,2]=humanReadingContrastsRenamed +rownames(correspondanceTable)=correspondanceTable[,2] +write.table(correspondanceTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F) + +#plot nominal p-val histograms for selected comparisons +histogramPerPage=6 +if (!is.null(opt$histo)) { + iToPlot=1 + plotVector=list() + nbComparisons=ncol(data.fit.eb$p.value) + for (iComparison in 1:nbComparisons){ + dataToPlot=data.frame(pval=data.fit.eb$p.value[,iComparison],id=rownames(data.fit.eb$p.value)) + p <- ggplot(data=dataToPlot, aes(x=pval)) + geom_histogram(colour="red", fill="salmon") + + theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="Frequencies") + xlab(label="Nominal p-val") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + plotVector[[length(plotVector)+1]]=p + + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F) + + if(iComparison==nbComparisons || length(plotVector)==histogramPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=2) + dev.off() + if(iComparison0)pvalThresholdFDR=min(probeWithLowFDR) + + #get significant points over FC and FDR thresholds + significativePoints=intersect(which(abs(data.fit.eb$coefficients[,iComparison])>=logFCthreshold),which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold)) + + #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC))) + htmlPointsToRemove=intersect(which(abs(data.fit.eb$coefficients[,iComparison])<=quantile(abs(data.fit.eb$coefficients[,iComparison]),c(0.66))),which(data.fit.eb$p.value[,iComparison]>=quantile(abs(data.fit.eb$p.value[,iComparison]),c(0.33)))) + if(length(htmlPointsToRemove)>20000){ + htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000)) + }else{ + htmlPointsToRemove=c() + } + + xMinLimPlot=min(data.fit.eb$coefficients[,iComparison])-0.2 + xMaxLimPlot=max(data.fit.eb$coefficients[,iComparison])+0.2 + yMaxLimPlot= max(-log10(data.fit.eb$p.value[,iComparison]))+0.2 + + if(length(significativePoints)>0){ + dataSignifToPlot=data.frame(pval=-log10(data.fit.eb$p.value[significativePoints,iComparison]),FC=data.fit.eb$coefficients[significativePoints,iComparison],description=paste(names(data.fit.eb$coefficients[significativePoints,iComparison]),"\n","FC: " , round(2^data.fit.eb$coefficients[significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[significativePoints,iComparison],digits=4), sep="")) + #to test if remains any normal points to draw + if(length(significativePoints)0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description)) 
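+    ##The interactive volcano below is thinned to keep the HTML light: among the least
+    ##interesting points (|log2(FC)| below its 66% quantile and p-value above its 33% quantile)
+    ##at most 20000 randomly sampled points are kept, and significant points are never dropped.
+    ##A commented sketch of the same idea on hypothetical vectors fc and pval:
+    ##  dense  <- intersect(which(abs(fc) <= quantile(abs(fc), 0.66)),
+    ##                      which(pval   >= quantile(pval, 0.33)))
+    ##  toDrop <- if(length(dense) > 20000) setdiff(dense, sample(dense, 20000)) else c()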
+ + ##interactive plot + if(length(htmlPointsToRemove)>0){ + pointToRemove=union(htmlPointsToRemove,significativePoints) + #to test if remains any normal points to draw + if(length(pointToRemove)40000)addComment(c("[WARNING]For",humanReadingContrastsRenamed[iComparison],"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log) + + phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>% + layout(title = humanReadingContrastsRenamed[iComparison],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-log10(p-val)", showgrid=TRUE, zeroline=FALSE)) + if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar() + if(logFCthreshold!=0){ + phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + } + if(!is.null(pvalThresholdFDR)){ + phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue")) + } + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(phtml) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F) + + + if(iComparison==nbComparisons || length(plotVector)==volcanoPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=1) + dev.off() + if(iComparison0))) +#filter out genes with lower FC for all comparisons +genesToKeep=intersect(genesToKeep,names(which(apply(data.fit.eb$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0)))) + +if(length(genesToKeep)>0){ + data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[genesToKeep,],ncol=ncol(data.fit.eb$adj_p.value)) + 
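+  #Subsetting through matrix() drops the dimnames, hence the explicit rownames/colnames
+  #reassignments below; genesToKeep holds the rows passing the FDR threshold in at least one
+  #contrast and the |log2(FC)| threshold in at least one contrast.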
rownames(data.fit.eb$adj_p.value)=genesToKeep + colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value) + + data.fit.eb$p.value=matrix(data.fit.eb$p.value[genesToKeep,],ncol=ncol(data.fit.eb$p.value)) + rownames(data.fit.eb$p.value)=genesToKeep + colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[genesToKeep,],ncol=ncol(data.fit.eb$coefficients)) + rownames(data.fit.eb$coefficients)=genesToKeep + colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$t=matrix(data.fit.eb$t[genesToKeep,],ncol=ncol(data.fit.eb$t)) + rownames(data.fit.eb$t)=genesToKeep + colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value) + + dfMatrix=dfMatrix[genesToKeep,,drop=FALSE] + +}else{ + addComment(c("[WARNING]No significative genes considering the given FDR threshold : ",opt$fdrThreshold),T,opt$log,display=FALSE) +} + +addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE) + + +#plot VennDiagramm for genes below threshold between comparisons +#t=apply(data.fit.eb$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold))) +#get.venn.partitions(t) +#vennCounts(data.fit.eb$adj_p.value[,1:4]<=opt$threshold) + +#make a simple sort genes based only on the first comparison +#newOrder=order(data.fit.eb$adj_p.value[,1]) +#data.fit.eb$adj_p.value=data.fit.eb$adj_p.value[newOrder,] + +#alternative sorting strategy based on the mean gene rank over all comparisons +if(length(genesToKeep)>1){ + currentRank=rep(0,nrow(data.fit.eb$adj_p.value)) + for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){ + currentRank=currentRank+rank(data.fit.eb$adj_p.value[,iComparison]) + } + currentRank=currentRank/ncol(data.fit.eb$adj_p.value) + newOrder=order(currentRank) + + data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[newOrder,],ncol=ncol(data.fit.eb$adj_p.value)) + rownames(data.fit.eb$adj_p.value)=rownames(data.fit.eb$p.value)[newOrder] + colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value) + + data.fit.eb$p.value=matrix(data.fit.eb$p.value[newOrder,],ncol=ncol(data.fit.eb$p.value)) + rownames(data.fit.eb$p.value)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[newOrder,],ncol=ncol(data.fit.eb$coefficients)) + rownames(data.fit.eb$coefficients)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$t=matrix(data.fit.eb$t[newOrder,],ncol=ncol(data.fit.eb$t)) + rownames(data.fit.eb$t)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value) + + dfMatrix=dfMatrix[newOrder,,drop=FALSE] +} + + +#formating output matrices depending on genes to keep +if(length(genesToKeep)==0){ + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene","noGene") + outputData[,2]=c("Comparison","Info","noInfo") + + outputDfData=matrix(0,ncol=3+1,nrow=2) + outputDfData[1,]=c("X","df.residual","df.prior","df.total") + outputDfData[,1]=c("Statistics","noGene") +}else{ + if(length(genesToKeep)==1){ + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + 
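+    #Resulting layout written for Galaxy (hypothetical contrast name and values): two header
+    #rows, two annotation columns, then five statistics columns per contrast, e.g.
+    #  LIMMA  Comparison  Contrast1  Contrast1   Contrast1  Contrast1  Contrast1
+    #  Gene   Info        p-val      FDR.p-val   FC         log2(FC)   t-stat
+    #  geneA  na          0.0123     0.0456      1.80       0.85       2.70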
outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene",genesToKeep) + outputData[,2]=c("Comparison","Info","na") + if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep] + outputData[3,seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4) + outputData[3,seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4) + outputData[3,seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4) + outputData[3,seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4) + outputData[3,seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4) + + outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix)) + outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total") + outputDfData[2,]=c(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual","df.prior","df.total")],digits=4)) + }else{ + #format matrix to be correctly read by galaxy (move headers in first column and row) + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=nrow(data.fit.eb$adj_p.value)+2) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene",rownames(data.fit.eb$adj_p.value)) + outputData[,2]=c("Comparison","Info",rep("na",nrow(data.fit.eb$adj_p.value))) + if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(data.fit.eb$adj_p.value)] + outputData[3:nrow(outputData),seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4) + outputData[3:nrow(outputData),seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4) + outputData[3:nrow(outputData),seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4) + outputData[3:nrow(outputData),seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4) + outputData[3:nrow(outputData),seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4) + + outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix)) + outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total") + outputDfData[2:(1+nrow(dfMatrix)),]=cbind(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual")],digits=4),prettyNum(dfMatrix[,c("df.prior")],digits=4),prettyNum(dfMatrix[,c("df.total")],digits=4)) + } +} +addComment("[INFO]Formated output",T,opt$log,display=FALSE) + +#write output results +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +#write df info file +write.table(outputDfData,file=opt$outputDfFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) +#sessionInfo() + + + diff -r 000000000000 -r 14045c80a222 src/VolcanoPlotsScript.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/VolcanoPlotsScript.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,426 @@ +# R script to plot volcanos through Galaxy based GIANT tool +# written by Jimmy Vandel +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) 
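+#utils.R sourced above provides the logging and plotting helpers used throughout (addComment,
+#multiplot, printSessionInfo); getopt.R, sourced next, provides getopt(), which reads a
+#4-column spec (long name, short flag, 1 = argument required, R type). A minimal sketch:
+#  spec <- matrix(c("log", "l", 1, "character"), byrow=TRUE, ncol=4)
+#  opt  <- getopt(spec)   #opt$log then holds the value passed via -l / --log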
+source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "statisticsFile", "i", 1, "character", + "volcanoName" , "n", 1, "character", + "pvalColumnName" , "p", 1, "character", + "fdrColumnName" , "m", 1, "character", + "fcColumnName" , "c", 1, "character", + "fcKind","d", 1, "character", + "fdrThreshold","s", 1, "double", + "fcThreshold","e", 1, "double", + "organismID","x",1,"character", + "rowNameType","y",1,"character", + "log", "l", 1, "character", + "outputFile" , "o", 1, "character", + "format", "f", 1, "character", + "quiet", "q", 0, "logical"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required\n") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$statisticsFile)) { + addComment("[ERROR]'statisticsFile' is required",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$pvalColumnName)==0 || length(opt$fdrColumnName)==0 || length(opt$fcColumnName)==0) { + addComment("[ERROR]no selected columns",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$pvalColumnName)!=length(opt$fcColumnName) || length(opt$pvalColumnName)!=length(opt$fdrColumnName)) { + addComment("[ERROR]different number of selected columns between p.val, adj-p.val and FC ",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fcKind)) { + addComment("[ERROR]'fcKind' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fdrThreshold)) { + addComment("[ERROR]'FDR threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fcThreshold)) { + addComment("[ERROR]'FC threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile)) { + addComment("[ERROR]'output file' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} + +#demande si le script sera bavard +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE +} + +#paramètres internes +addComment("[INFO]Parameters checked test mode !",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#charge des packages silencieusement +suppressPackageStartupMessages({ + library("methods") + library("biomaRt") + library("ggplot2") + library("plotly") + library("stringr") +}) + +#define some usefull variable +nbVolcanosToPlot=length(opt$pvalColumnName) + +#load input file +statDataMatrix=read.csv(file=file.path(getwd(), opt$statisticsFile),header=F,sep="\t",colClasses="character") +#remove first colum to convert it as rownames +rownames(statDataMatrix)=statDataMatrix[,1] +statDataMatrix=statDataMatrix[,-1] + +#identify lines without adjusted p-value 
info (should contain the same content as rownames) and replace them with NA values +FDRinfo=rep(TRUE,nbVolcanosToPlot) +for(iVolcano in 1:nbVolcanosToPlot){ + #input parameter should be None when adjusted p-val are not available + if(opt$fdrColumnName[iVolcano]=="None"){ + #content of the corresponding column should also be the same as rownames + if(!all(statDataMatrix[,(iVolcano-1)*3+2]==rownames(statDataMatrix))){ + addComment(c("[ERROR]It seems that input stat matrix contains adjusted p-values for volcano",iVolcano,"whereas input parameter indicates that not."),T,opt$log) + q( "no", 1, F ) + } + FDRinfo[iVolcano]=FALSE + statDataMatrix[,(iVolcano-1)*3+2]=NA + } +} + +if(is.data.frame(statDataMatrix)){ + statDataMatrix=data.matrix(statDataMatrix) +}else{ + statDataMatrix=data.matrix(as.numeric(statDataMatrix)) +} + +#check if available column number match with volcano requested number +if(ncol(statDataMatrix)!=3*nbVolcanosToPlot){ + addComment("[ERROR]Input file column number is different from requested volcano number",T,opt$log) + q( "no", 1, F ) +} + +#build global dataFrame with data and fill with p.val and log2(FC) and FDR +dataFrame=data.frame(row.names = rownames(statDataMatrix)) +#start with p-value +dataFrame$p.value=statDataMatrix[,seq(1,nbVolcanosToPlot*3,3),drop=FALSE] +#compute FDR if needed or just get available info +dataFrame$adj_p.value=dataFrame$p.value +for(iVolcano in 1:nbVolcanosToPlot){ + #adjusted p-value are already computed + if(FDRinfo[iVolcano]){ + dataFrame$adj_p.value[,iVolcano]=statDataMatrix[,(iVolcano-1)*3+2,drop=FALSE] + }else{ + #adjusted p-value should be computed based on p-val using FDR + dataFrame$adj_p.value[,iVolcano]=p.adjust(dataFrame$p.value[,iVolcano,drop=FALSE],"fdr") + addComment(c("[INFO]Adjusted p-values are not available in input for volcano",iVolcano,", FDR approach will be used on available raw p-values"),T,opt$log) + } +} +if(opt$fcKind=="FC"){ + #we should transform as Log2FC + dataFrame$coefficients=log2(statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE]) + addComment(c("[INFO]FC are converted in log2(FC) for plotting"),T,opt$log) +}else{ + dataFrame$coefficients=statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE] +} + +addComment(c("[INFO]Input data available for",nbVolcanosToPlot,"volcano(s) with",nrow(statDataMatrix),"rows"),T,opt$log) + + +#plot VOLCANOs +volcanoPerPage=1 +logFCthreshold=log2(opt$fcThreshold) +iToPlot=1 +plotVector=list() +volcanoNameList=c() +for (iVolcano in 1:nbVolcanosToPlot){ + + if(nchar(opt$volcanoName[iVolcano])>0){ + curentVolcanoName=opt$volcanoName[iVolcano] + }else{ + curentVolcanoName=paste(iVolcano,opt$pvalColumnName[iVolcano],sep="_") + } + + #keep only rows without NA for p-val, adjusted p-val and coeff + pValToPlot=dataFrame$p.value[,iVolcano] + fdrToPlot=dataFrame$adj_p.value[,iVolcano] + coeffToPlot=dataFrame$coefficients[,iVolcano] + + rowToRemove=unique(c(which(is.na(pValToPlot)),which(is.na(fdrToPlot)),which(is.na(coeffToPlot)))) + if(length(rowToRemove)>0){ + pValToPlot=pValToPlot[-rowToRemove] + fdrToPlot=fdrToPlot[-rowToRemove] + coeffToPlot=coeffToPlot[-rowToRemove] + } + addComment(c("[INFO]For",curentVolcanoName,"volcano,",length(rowToRemove),"rows are discarded due to NA values,",length(pValToPlot),"remaining rows."),T,opt$log) + + #save volcano name + volcanoNameList=c(volcanoNameList,curentVolcanoName) + + #remove characters possibly troubling + volcanoFileName=iVolcano + + #define the log10(p-val) threshold corresponding to FDR threshold fixed by user + 
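+  #i.e. the dashed horizontal guide of the volcano is drawn at the smallest -log10(raw p-value)
+  #among the rows whose adjusted p-value passes opt$fdrThreshold, and omitted (pvalThresholdFDR
+  #stays NULL) when no row passes.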
probeWithLowFDR=-log10(pValToPlot[which(fdrToPlot<=opt$fdrThreshold)]) + pvalThresholdFDR=NULL + if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR) + + #get significant points over FC and FDR thresholds + significativePoints=intersect(which(abs(coeffToPlot)>=logFCthreshold),which(fdrToPlot<=opt$fdrThreshold)) + + #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC))) + htmlPointsToRemove=intersect(which(abs(coeffToPlot)<=quantile(abs(coeffToPlot),c(0.66))),which(pValToPlot>=quantile(abs(pValToPlot),c(0.33)))) + if(length(htmlPointsToRemove)>20000){ + htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000)) + }else{ + htmlPointsToRemove=c() + } + + xMinLimPlot=min(coeffToPlot)-0.2 + xMaxLimPlot=max(coeffToPlot)+0.2 + yMaxLimPlot= max(-log10(pValToPlot))+0.2 + + if(length(significativePoints)>0){ + dataSignifToPlot=data.frame(pval=-log10(pValToPlot[significativePoints]),FC=coeffToPlot[significativePoints],description=paste(names(coeffToPlot[significativePoints]),"\n","FC: " , round(2^coeffToPlot[significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[significativePoints],digits=4), sep="")) + #to test if remains any normal points to draw + if(length(significativePoints)0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description)) + + ##interactive plot + + if(length(htmlPointsToRemove)>0){ + pointToRemove=union(htmlPointsToRemove,significativePoints) + #to test if it remains any normal points to draw + if(length(pointToRemove)40000)addComment(c("[WARNING]For",curentVolcanoName,"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log) + + phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>% + layout(title = curentVolcanoName[iVolcano],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-Log10(p-val)", showgrid=TRUE, zeroline=FALSE)) + if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar() + if(logFCthreshold!=0){ + phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + } + if(!is.null(pvalThresholdFDR)){ + phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', 
showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue")) + } + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(phtml) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Volcanos_",volcanoFileName,".html"),collapse=""),selfcontained = F) + + + if(iVolcano==nbVolcanosToPlot || length(plotVector)==volcanoPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/Volcanos_",volcanoFileName,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/Volcanos_",volcanoFileName,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=1) + dev.off() + if(iVolcano0))) +#filter out genes with lower FC for all comparisons +genesToKeep=intersect(genesToKeep,names(which(apply(dataFrame$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0)))) + +if(length(genesToKeep)>0){ + dataFrameNew=data.frame(row.names=genesToKeep) + + dataFrameNew$adj_p.value=matrix(dataFrame$adj_p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$adj_p.value)) + rownames(dataFrameNew$adj_p.value)=genesToKeep + colnames(dataFrameNew$adj_p.value)=colnames(dataFrame$p.value) + + dataFrameNew$p.value=matrix(dataFrame$p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$p.value)) + rownames(dataFrameNew$p.value)=genesToKeep + colnames(dataFrameNew$p.value)=colnames(dataFrame$adj_p.value) + + dataFrameNew$coefficients=matrix(dataFrame$coefficients[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$coefficients)) + rownames(dataFrameNew$coefficients)=genesToKeep + colnames(dataFrameNew$coefficients)=colnames(dataFrame$adj_p.value) + + dataFrame=dataFrameNew + rm(dataFrameNew) +}else{ + addComment("[WARNING]No significative genes",T,opt$log,display=FALSE) +} + +addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE) + + +#plot VennDiagramm for genes below threshold between comparisons +#t=apply(dataFrame$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold))) +#get.venn.partitions(t) +#vennCounts(dataFrame$adj_p.value[,1:4]<=opt$threshold) + +#make a simple sort genes based only on the first comparison +#newOrder=order(dataFrame$adj_p.value[,1]) +#dataFrame$adj_p.value=dataFrame$adj_p.value[newOrder,] + +#alternative sorting strategy based on the mean gene rank over all comparisons +if(length(genesToKeep)>1){ + currentRank=rep(0,nrow(dataFrame$adj_p.value)) + for(iVolcano in 1:ncol(dataFrame$adj_p.value)){ + currentRank=currentRank+rank(dataFrame$adj_p.value[,iVolcano]) + } + currentRank=currentRank/ncol(dataFrame$adj_p.value) + newOrder=order(currentRank) + rownames(dataFrame)=rownames(dataFrame)[newOrder] + + dataFrame$adj_p.value=matrix(dataFrame$adj_p.value[newOrder,],ncol=ncol(dataFrame$adj_p.value)) + rownames(dataFrame$adj_p.value)=rownames(dataFrame$p.value)[newOrder] + colnames(dataFrame$adj_p.value)=colnames(dataFrame$p.value) + + dataFrame$p.value=matrix(dataFrame$p.value[newOrder,],ncol=ncol(dataFrame$p.value)) + rownames(dataFrame$p.value)=rownames(dataFrame$adj_p.value) + colnames(dataFrame$p.value)=colnames(dataFrame$adj_p.value) + + dataFrame$coefficients=matrix(dataFrame$coefficients[newOrder,],ncol=ncol(dataFrame$coefficients)) + rownames(dataFrame$coefficients)=rownames(dataFrame$adj_p.value) + colnames(dataFrame$coefficients)=colnames(dataFrame$adj_p.value) +} + +#formating output 
matrix depending on genes to keep +if(length(genesToKeep)==0){ + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene","noGene") + outputData[,2]=c("Comparison","Info","noInfo") +}else{ + if(length(genesToKeep)==1){ + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene",genesToKeep) + outputData[,2]=c("Comparison","Info","na") + if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep] + outputData[3,seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4) + outputData[3,seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4) + outputData[3,seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4) + outputData[3,seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4) + }else{ + #format matrix to be correctly read by galaxy (move headers in first column and row) + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=nrow(dataFrame$adj_p.value)+2) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene",rownames(dataFrame$adj_p.value)) + outputData[,2]=c("Comparison","Info",rep("na",nrow(dataFrame$adj_p.value))) + if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(dataFrame$adj_p.value)] + outputData[3:nrow(outputData),seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4) + outputData[3:nrow(outputData),seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4) + outputData[3:nrow(outputData),seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4) + outputData[3:nrow(outputData),seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4) + } +} +addComment("[INFO]Formated output",T,opt$log,display=FALSE) + +#write output results +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) + +#sessionInfo() \ No newline at end of file diff -r 000000000000 -r 14045c80a222 src/getopt.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/getopt.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,773 @@ +# Copyright (c) 2008-2010 Allen Day +# Copyright (c) 2011-2013 Trevor L. Davis +# +# Modified by J.Vandel 2017 to consider situation of multiple identical flag +# and concatenate as a vector the set of parameter for the same flag instead of +# keeping only the last value as done by the previous version. +# +# This file is free software: you may copy, redistribute and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 2 of the License, or (at your +# option) any later version. 
+# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +#' C-like getopt behavior +#' +#' getopt is primarily intended to be used with ``\link{Rscript}''. It +#' facilitates writing ``\#!'' shebang scripts that accept short and long +#' flags/options. It can also be used from ``R'' directly, but is probably less +#' useful in this context. +#' +#' getopt() returns a \link{list} data structure containing \link{names} of the +#' flags that were present in the \link{character} \link{vector} passed in under +#' the \emph{opt} argument. Each value of the \link{list} is coerced to the +#' data type specified according to the value of the \emph{spec} argument. See +#' below for details. +#' +#' Notes on naming convention: +#' +#' 1. An \emph{option} is one of the shell-split input strings. +#' +#' 2. A \emph{flag} is a type of \emph{option}. a \emph{flag} can be defined as +#' having no \emph{argument} (defined below), a required \emph{argument}, or an +#' optional \emph{argument}. +#' +#' 3. An \emph{argument} is a type of \emph{option}, and is the value associated +#' with a flag. +#' +#' 4. A \emph{long flag} is a type of \emph{flag}, and begins with the string +#' ``--''. If the \emph{long flag} has an associated \emph{argument}, it may be +#' delimited from the \emph{long flag} by either a trailing \emph{=}, or may be +#' the subsequent \emph{option}. +#' +#' 5. A \emph{short flag} is a type of \emph{flag}, and begins with the string +#' ``-''. If a \emph{short flag} has an associated \emph{argument}, it is the +#' subsequent \emph{option}. \emph{short flags} may be bundled together, +#' sharing a single leading ``-'', but only the final \emph{short flag} is able +#' to have a corresponding \emph{argument}. +#' +#' Many users wonder whether they should use the getopt package, optparse package, +#' or argparse package. +#' Here is some of the major differences: +#' +#' Features available in \code{getopt} unavailable in \code{optparse} +#' +#' 1. As well as allowing one to specify options that take either +#' no argument or a required argument like \code{optparse}, +#' \code{getopt} also allows one to specify option with an optional argument. +#' +#' Some features implemented in \code{optparse} package unavailable in \code{getopt} +#' +#' 1. Limited support for capturing positional arguments after the optional arguments +#' when \code{positional_arguments} set to TRUE in \code{parse_args} +#' +#' 2. Automatic generation of an help option and printing of help text when encounters an "-h" +#' +#' 3. Option to specify default arguments for options as well the +#' variable name to store option values +#' +#' There is also new package \code{argparse} introduced in 2012 which contains +#' all the features of both getopt and optparse but which has a dependency on +#' Python 2.7 or 3.2+ and has not been used in production since 2008 or 2009 +#' like the getopt and optparse packages. +#' +#' Some Features unlikely to be implemented in \code{getopt}: +#' +#' 1. Support for multiple, identical flags, e.g. for "-m 3 -v 5 -v", the +#' trailing "-v" overrides the preceding "-v 5", result is v=TRUE (or equivalent +#' typecast). +#' +#' 2. Support for multi-valued flags, e.g. 
"--libpath=/usr/local/lib +#' --libpath=/tmp/foo". +#' +#' 3. Support for lists, e.g. "--define os=linux --define os=redhat" would +#' set result$os$linux=TRUE and result$os$redhat=TRUE. +#' +#' 4. Support for incremental, argument-less flags, e.g. "/path/to/script +#' -vvv" should set v=3. +#' +#' 5. Support partial-but-unique string match on options, e.g. "--verb" and +#' "--verbose" both match long flag "--verbose". +#' +#' 6. No support for mixing in positional arguments or extra arguments that +#' don't match any options. For example, you can't do "my.R --arg1 1 foo bar +#' baz" and recover "foo", "bar", "baz" as a list. Likewise for "my.R foo +#' --arg1 1 bar baz". +#' +#' @aliases getopt getopt-package +#' @param spec The getopt specification, or spec of what options are considered +#' valid. The specification must be either a 4-5 column \link{matrix}, or a +#' \link{character} \link{vector} coercible into a 4 column \link{matrix} using +#' \link{matrix}(x,ncol=4,byrow=TRUE) command. The \link{matrix}/\link{vector} +#' contains: +#' +#' Column 1: the \emph{long flag} name. A multi-\link{character} string. +#' +#' Column 2: \emph{short flag} alias of Column 1. A single-\link{character} +#' string. +#' +#' Column 3: \emph{Argument} mask of the \emph{flag}. An \link{integer}. +#' Possible values: 0=no argument, 1=required argument, 2=optional argument. +#' +#' Column 4: Data type to which the \emph{flag}'s argument shall be cast using +#' \link{storage.mode}. A multi-\link{character} string. This only considered +#' for same-row Column 3 values of 1,2. Possible values: \link{logical}, +#' \link{integer}, \link{double}, \link{complex}, \link{character}. +#' If \link{numeric} is encountered then it will be converted to double. +#' +#' Column 5 (optional): A brief description of the purpose of the option. +#' +#' The terms \emph{option}, \emph{flag}, \emph{long flag}, \emph{short flag}, +#' and \emph{argument} have very specific meanings in the context of this +#' document. Read the ``Description'' section for definitions. +#' @param opt This defaults to the return value of \link{commandArgs}(TRUE). +#' +#' If R was invoked directly via the ``R'' command, this corresponds to all +#' arguments passed to R after the ``--args'' flag. +#' +#' If R was invoked via the ``\link{Rscript}'' command, this corresponds to all +#' arguments after the name of the R script file. +#' +#' Read about \link{commandArgs} and \link{Rscript} to learn more. +#' @param command The string to use in the usage message as the name of the +#' script. See argument \emph{usage}. +#' @param usage If TRUE, argument \emph{opt} will be ignored and a usage +#' statement (character string) will be generated and returned from \emph{spec}. +#' @param debug This is used internally to debug the getopt() function itself. +#' @author Allen Day +#' @seealso \code{\link{getopt}} +#' @keywords data +#' @export +#' @examples +#' +#' #!/path/to/Rscript +#' library('getopt'); +#' #get options, using the spec as defined by the enclosed list. +#' #we read the options from the default: commandArgs(TRUE). 
+#' spec = matrix(c( +#' 'verbose', 'v', 2, "integer", +#' 'help' , 'h', 0, "logical", +#' 'count' , 'c', 1, "integer", +#' 'mean' , 'm', 1, "double", +#' 'sd' , 's', 1, "double" +#' ), byrow=TRUE, ncol=4); +#' opt = getopt(spec); +#' +#' # if help was asked for print a friendly message +#' # and exit with a non-zero error code +#' if ( !is.null(opt$help) ) { +#' cat(getopt(spec, usage=TRUE)); +#' q(status=1); +#' } +#' +#' #set some reasonable defaults for the options that are needed, +#' #but were not specified. +#' if ( is.null(opt$mean ) ) { opt$mean = 0 } +#' if ( is.null(opt$sd ) ) { opt$sd = 1 } +#' if ( is.null(opt$count ) ) { opt$count = 10 } +#' if ( is.null(opt$verbose ) ) { opt$verbose = FALSE } +#' +#' #print some progress messages to stderr, if requested. +#' if ( opt$verbose ) { write("writing...",stderr()); } +#' +#' #do some operation based on user input. +#' cat(paste(rnorm(opt$count,mean=opt$mean,sd=opt$sd),collapse="\n")); +#' cat("\n"); +#' +#' #signal success and exit. +#' #q(status=0); +getopt = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) { + + # littler compatibility - map argv vector to opt + if (exists("argv", where = .GlobalEnv, inherits = FALSE)) { + opt = get("argv", envir = .GlobalEnv); + } + + ncol=4; + maxcol=6; + col.long.name = 1; + col.short.name = 2; + col.has.argument = 3; + col.mode = 4; + col.description = 5; + + flag.no.argument = 0; + flag.required.argument = 1; + flag.optional.argument = 2; + + result = list(); + result$ARGS = vector(mode="character"); + + #no spec. fail. + if ( is.null(spec) ) { + stop('argument "spec" must be non-null.'); + + #spec is not a matrix. attempt to coerce, if possible. issue a warning. + } else if ( !is.matrix(spec) ) { + if ( length(spec)/4 == as.integer(length(spec)/4) ) { + warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion'); + spec = matrix( spec, ncol=ncol, byrow=TRUE ); + } else { + stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.'); + } + + #spec is a matrix, but it has too few columns. + } else if ( dim(spec)[2] < ncol ) { + stop(paste('"spec" should have at least ",ncol," columns.',sep='')); + + #spec is a matrix, but it has too many columns. + } else if ( dim(spec)[2] > maxcol ) { + stop(paste('"spec" should have no more than ",maxcol," columns.',sep='')); + + #spec is a matrix, and it has some optional columns. + } else if ( dim(spec)[2] != ncol ) { + ncol = dim(spec)[2]; + } + + #sanity check. make sure long names are unique, and short names are unique. 
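  #(Illustrative, hypothetical spec -- not from this repository -- showing what the two checks below
  # reject: a long name used twice fails the first test, a duplicated non-NA short name fails the
  # second, while repeated NA short names are tolerated because of na.omit().)
  # badSpec = matrix(c(
  #   'log', 'l', 1, 'character',
  #   'log', 'g', 1, 'character'   # long name 'log' duplicated -> "redundant long names" error
  #   ), byrow=TRUE, ncol=4);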
+ if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) { + stop(paste('redundant long names for flags (column ',col.long.name,').',sep='')); + } + if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) { + stop(paste('redundant short names for flags (column ',col.short.name,').',sep='')); + } + # convert numeric type to double type + spec[,4] <- gsub("numeric", "double", spec[,4]) + + # if usage=TRUE, don't process opt, but generate a usage string from the data in spec + if ( usage ) { + ret = ''; + ret = paste(ret,"Usage: ",command,sep=''); + for ( j in 1:(dim(spec))[1] ) { + ret = paste(ret,' [-[-',spec[j,col.long.name],'|',spec[j,col.short.name],']',sep=''); + if (spec[j,col.has.argument] == flag.no.argument) { + ret = paste(ret,']',sep=''); + } else if (spec[j,col.has.argument] == flag.required.argument) { + ret = paste(ret,' <',spec[j,col.mode],'>]',sep=''); + } else if (spec[j,col.has.argument] == flag.optional.argument) { + ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep=''); + } + } + # include usage strings + if ( ncol >= 5 ) { + max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]]))); + ret = paste(ret,"\n",sep=''); + for (j in 1:(dim(spec))[1] ) { + ret = paste(ret,sprintf(paste(" -%s|--%-",max.long,"s %s\n",sep=''), + spec[j,col.short.name],spec[j,col.long.name],spec[j,col.description] + ),sep=''); + } + } + else { + ret = paste(ret,"\n",sep=''); + } + return(ret); + } + + #XXX check spec validity here. e.g. column three should be convertible to integer + + i = 1; + + while ( i <= length(opt) ) { + if ( debug ) print(paste("processing",opt[i])); + + current.flag = 0; #XXX use NA + optstring = opt[i]; + + + #long flag + if ( substr(optstring, 1, 2) == '--' ) { + if ( debug ) print(paste(" long option:",opt[i])); + + optstring = substring(optstring,3); + + this.flag = NA; + this.argument = NA; + kv = strsplit(optstring, '=')[[1]]; + if ( !is.na(kv[2]) ) { + this.flag = kv[1]; + this.argument = paste(kv[-1], collapse="="); + } else { + this.flag = optstring; + } + + rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE ); + + #long flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('long flag "', this.flag, '" is invalid', sep='')); + + #long flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + # check if there is an exact match and use that + rowmatch = which(this.flag == spec[,col.long.name]) + if(length(rowmatch) == 0) { + stop(paste('long flag "', this.flag, '" is ambiguous', sep='')); + } + } + + #if we have an argument + if ( !is.na(this.argument) ) { + #if we can't accept the argument, bail out + if ( spec[rowmatch, col.has.argument] == flag.no.argument ) { + stop(paste('long flag "', this.flag, '" accepts no arguments', sep='')); + + #otherwise assign the argument to the flag + } else { + storage.mode(this.argument) = spec[rowmatch, col.mode]; + #don't need here to remove the last value of the vector as argument is in the same string as + #the flag name "--flag=argument" so no spurious TRUE was added + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument); + i = i + 1; + next; + } + + #otherwise, we don't have an argument + } else { + #if we require an argument, bail out + ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) { + ### stop(paste('long flag "', this.flag, '" requires an argument', sep='')); + + #long flag has no attached argument. 
set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there + ###} else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + ###} + } + + #short flag(s) + } else if ( substr(optstring, 1, 1) == '-' ) { + if ( debug ) print(paste(" short option:",opt[i])); + + these.flags = strsplit(optstring,'')[[1]]; + + done = FALSE; + for ( j in 2:length(these.flags) ) { + this.flag = these.flags[j]; + rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE ); + + #short flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('short flag "', this.flag, '" is invalid', sep='')); + + #short flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + stop(paste('short flag "', this.flag, '" is ambiguous', sep='')); + + #short flag has an argument, but is not the last in a compound flag string + } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) { + stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep='')); + + #short flag has no argument, flag it as present + } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + done = TRUE; + + #can't definitively process this flag yet, need to see if next option is an argument or not + } else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + done = FALSE; + } + } + if ( done ) { + i = i + 1; + next; + } + } + + #invalid opt + if ( current.flag == 0 ) { + stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep='')); + #TBD support for positional args + #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep='')); + #result$ARGS = append(result$ARGS, optstring); + + # some dangling flag, handle it + } else if ( current.flag > 0 ) { + if ( debug ) print(' dangling flag'); + if ( length(opt) > i ) { + peek.optstring = opt[i + 1]; + if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep='')); + + #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS + #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals). 
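      #(Illustrative, hypothetical calls -- not part of the original code -- showing how the two
      # patterns used just below classify the peeked-at string; flag names and values are made up:
      #   regexpr('^-[0123456789]*\\.?[0123456789]+$', "-0.05") > 0   # TRUE  -> accepted as negative double
      #   regexpr('^-[0123456789]+$', "-3") > 0                       # TRUE  -> accepted as negative integer
      #   regexpr('^-[0123456789]+$', "-v") > 0                       # FALSE -> "-v" is treated as a new flag
      # so a double-typed flag such as "--shift -0.05" gets "-0.05" consumed as its argument.)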
+ if ( substr(peek.optstring, 1, 1) != '-' | + #match negative double + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'double' + ) | + #match negative integer + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'integer' + ) + ) { + if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep='')); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #a lone dash + } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) { + if ( debug ) print(' consuming "lone dash" argument'); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #no argument + } else { + if ( debug ) print(' no argument!'); + + #if we require an argument, bail out + if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #otherwise set flag as present. + } else if ( + spec[current.flag, col.has.argument] == flag.optional.argument | + spec[current.flag, col.has.argument] == flag.no.argument + ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop(paste("This should never happen.", + "Is your spec argument correct? Maybe you forgot to set", + "ncol=4, byrow=TRUE in your matrix call?")); + } + } + #trailing flag without required argument + } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #trailing flag without optional argument + } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + + #trailing flag without argument + } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop("this should never happen (2). please inform the author."); + } + #no dangling flag, nothing to do. 
+ } else { + } + + i = i+1; + } + return(result); +} + + + +######################### +#set a modified version using only long named parameters + +getoptLong = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) { + + # littler compatibility - map argv vector to opt + if (exists("argv", where = .GlobalEnv, inherits = FALSE)) { + opt = get("argv", envir = .GlobalEnv); + } + + ncol=4; + maxcol=6; + col.long.name = 1; + #col.short.name = 2; + col.has.argument = 3; + col.mode = 4; + col.description = 5; + + flag.no.argument = 0; + flag.required.argument = 1; + flag.optional.argument = 2; + + result = list(); + result$ARGS = vector(mode="character"); + + #no spec. fail. + if ( is.null(spec) ) { + stop('argument "spec" must be non-null.'); + + #spec is not a matrix. attempt to coerce, if possible. issue a warning. + } else if ( !is.matrix(spec) ) { + if ( length(spec)/4 == as.integer(length(spec)/4) ) { + warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion'); + spec = matrix( spec, ncol=ncol, byrow=TRUE ); + } else { + stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.'); + } + + #spec is a matrix, but it has too few columns. + } else if ( dim(spec)[2] < ncol ) { + stop(paste('"spec" should have at least ",ncol," columns.',sep='')); + + #spec is a matrix, but it has too many columns. + } else if ( dim(spec)[2] > maxcol ) { + stop(paste('"spec" should have no more than ",maxcol," columns.',sep='')); + + #spec is a matrix, and it has some optional columns. + } else if ( dim(spec)[2] != ncol ) { + ncol = dim(spec)[2]; + } + + #sanity check. make sure long names are unique, and short names are unique. + if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) { + stop(paste('redundant long names for flags (column ',col.long.name,').',sep='')); + } + # if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) { + # stop(paste('redundant short names for flags (column ',col.short.name,').',sep='')); + # } + # convert numeric type to double type + spec[,4] <- gsub("numeric", "double", spec[,4]) + + # if usage=TRUE, don't process opt, but generate a usage string from the data in spec + if ( usage ) { + ret = ''; + ret = paste(ret,"Usage: ",command,sep=''); + for ( j in 1:(dim(spec))[1] ) { + ret = paste(ret,' [-[-',spec[j,col.long.name],']',sep=''); + if (spec[j,col.has.argument] == flag.no.argument) { + ret = paste(ret,']',sep=''); + } else if (spec[j,col.has.argument] == flag.required.argument) { + ret = paste(ret,' <',spec[j,col.mode],'>]',sep=''); + } else if (spec[j,col.has.argument] == flag.optional.argument) { + ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep=''); + } + } + # include usage strings + if ( ncol >= 5 ) { + max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]]))); + ret = paste(ret,"\n",sep=''); + for (j in 1:(dim(spec))[1] ) { + ret = paste(ret,sprintf(paste("--%-",max.long,"s %s\n",sep='') + ,spec[j,col.long.name],spec[j,col.description] + ),sep=''); + } + } + else { + ret = paste(ret,"\n",sep=''); + } + return(ret); + } + + #XXX check spec validity here. e.g. 
column three should be convertible to integer + + i = 1; + + while ( i <= length(opt) ) { + if ( debug ) print(paste("processing",opt[i])); + + current.flag = 0; #XXX use NA + optstring = opt[i]; + + + #long flag + if ( substr(optstring, 1, 2) == '--' ) { + if ( debug ) print(paste(" long option:",opt[i])); + + optstring = substring(optstring,3); + + this.flag = NA; + this.argument = NA; + kv = strsplit(optstring, '=')[[1]]; + if ( !is.na(kv[2]) ) { + this.flag = kv[1]; + this.argument = paste(kv[-1], collapse="="); + } else { + this.flag = optstring; + } + + rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE ); + + #long flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('long flag "', this.flag, '" is invalid', sep='')); + + #long flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + # check if there is an exact match and use that + rowmatch = which(this.flag == spec[,col.long.name]) + if(length(rowmatch) == 0) { + stop(paste('long flag "', this.flag, '" is ambiguous', sep='')); + } + } + + #if we have an argument + if ( !is.na(this.argument) ) { + #if we can't accept the argument, bail out + if ( spec[rowmatch, col.has.argument] == flag.no.argument ) { + stop(paste('long flag "', this.flag, '" accepts no arguments', sep='')); + + #otherwise assign the argument to the flag + } else { + storage.mode(this.argument) = spec[rowmatch, col.mode]; + #don't need here to remove the last value of the vector as argument is in the same string as + #the flag name "--flag=argument" so no spurious TRUE was added + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument); + i = i + 1; + next; + } + + #otherwise, we don't have an argument + } else { + #if we require an argument, bail out + ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) { + ### stop(paste('long flag "', this.flag, '" requires an argument', sep='')); + + #long flag has no attached argument. set flag as present. 
set current.flag so we can peek ahead later and consume the argument if it's there + ###} else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + ###} + } + + #short flag(s) + } + #else if ( substr(optstring, 1, 1) == '-' ) { + # if ( debug ) print(paste(" short option:",opt[i])); + # + # these.flags = strsplit(optstring,'')[[1]]; + # + # done = FALSE; + # for ( j in 2:length(these.flags) ) { + # this.flag = these.flags[j]; + # rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE ); + # + # #short flag is invalid, matches no options + # if ( length(rowmatch) == 0 ) { + # stop(paste('short flag "', this.flag, '" is invalid', sep='')); + # + # #short flag is ambiguous, matches too many options + # } else if ( length(rowmatch) > 1 ) { + # stop(paste('short flag "', this.flag, '" is ambiguous', sep='')); + # + # #short flag has an argument, but is not the last in a compound flag string + # } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) { + # stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep='')); + # + # #short flag has no argument, flag it as present + # } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) { + # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + # done = TRUE; + # + # #can't definitively process this flag yet, need to see if next option is an argument or not + # } else { + # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + # current.flag = rowmatch; + # done = FALSE; + # } + # } + # if ( done ) { + # i = i + 1; + # next; + # } + # } + + #invalid opt + if ( current.flag == 0 ) { + stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep='')); + #TBD support for positional args + #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep='')); + #result$ARGS = append(result$ARGS, optstring); + + # some dangling flag, handle it + } else if ( current.flag > 0 ) { + if ( debug ) print(' dangling flag'); + if ( length(opt) > i ) { + peek.optstring = opt[i + 1]; + if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep='')); + + #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS + #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals). 
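      #(Illustrative, hypothetical invocation -- not part of the original code -- showing the behaviour
      # the 2017 modification targets: the concatenation a few lines below,
      #   result[[...]] = c(result[[...]][-length(result[[...]])], peek.optstring)
      # drops the placeholder TRUE and appends the consumed value, so repeating a long flag accumulates
      # a vector instead of keeping only the last value. For a spec declaring a 'volcanoName' character
      # flag, something like
      #   Rscript someWrapper.R --volcanoName KOvsWT --volcanoName TreatedVsControl
      # would give opt$volcanoName == c("KOvsWT","TreatedVsControl"), which calling scripts index as
      # opt$volcanoName[iVolcano].)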
+ if ( substr(peek.optstring, 1, 1) != '-' | + #match negative double + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'double' + ) | + #match negative integer + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'integer' + ) + ) { + if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep='')); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #a lone dash + } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) { + if ( debug ) print(' consuming "lone dash" argument'); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #no argument + } else { + if ( debug ) print(' no argument!'); + + #if we require an argument, bail out + if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #otherwise set flag as present. + } else if ( + spec[current.flag, col.has.argument] == flag.optional.argument | + spec[current.flag, col.has.argument] == flag.no.argument + ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop(paste("This should never happen.", + "Is your spec argument correct? Maybe you forgot to set", + "ncol=4, byrow=TRUE in your matrix call?")); + } + } + #trailing flag without required argument + } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #trailing flag without optional argument + } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + + #trailing flag without argument + } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop("this should never happen (2). please inform the author."); + } + #no dangling flag, nothing to do. + } else { + } + + i = i+1; + } + return(result); +} + diff -r 000000000000 -r 14045c80a222 src/heatMapClustering.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/heatMapClustering.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,896 @@ +# A command-line interface to plot heatmap based on expression or diff. exp. 
analysis +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE, OutDec=".") + +#get options +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "expressionFile", "x", 1, "character", + "diffAnalyseFile", "x", 1, "character", + "factorInfo","x", 1, "character", + "genericData","x", 0, "logical", + "comparisonName","x",1,"character", + "comparisonNameLow","x",1,"character", + "comparisonNameHigh","x",1,"character", + "filterInputOutput","x", 1, "character", + "FCthreshold","x", 1, "double", + "pvalThreshold","x", 1, "double", + "geneListFiltering","x",1,"character", + "clusterNumber","x",1,"integer", + "maxRows","x",1,"integer", + "sampleClusterNumber","x",1,"integer", + "dataTransformation","x",1,"character", + "distanceMeasure","x",1,"character", + "aggloMethod","x",1,"character", + "personalColors","x",1,"character", + "sideBarColorPalette","x",1,"character", + "format", "x", 1, "character", + "quiet", "x", 0, "logical", + "log", "x", 1, "character", + "outputFile" , "x", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getoptLong(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile)) { + addComment("[ERROR]'output file' is required",T,opt$log) + q( "no", 1, F ) +} + +if(is.null(opt$expressionFile) && !is.null(opt$genericData)){ + addComment("[ERROR]generic data clustering is based on expression clustering",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$clusterNumber) || opt$clusterNumber<2) { + addComment("[ERROR]valid genes clusters number is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$sampleClusterNumber) || opt$sampleClusterNumber<1) { + addComment("[ERROR]valid samples clusters number is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$dataTransformation)) { + addComment("[ERROR]data transformation option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$distanceMeasure)) { + addComment("[ERROR]distance measure option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$aggloMethod)) { + addComment("[ERROR]agglomeration method option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$maxRows) || opt$maxRows<2) { + addComment("[ERROR]valid plotted row number is required",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt[["comparisonName"]]) && nchar(opt[["comparisonName"]])==0){ + addComment("[ERROR]you have to specify 
comparison",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$comparisonNameLow) && nchar(opt$comparisonNameLow)==0){ + addComment("[ERROR]you have to specify comparisonLow",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$comparisonNameHigh) && nchar(opt$comparisonNameHigh)==0){ + addComment("[ERROR]you have to specify comparisonHigh",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$genericData) && (!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh))){ + addComment("[ERROR]comparisonLow and comparisonHigh can be specified only with generic data",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$genericData) && !is.null(opt[["comparisonName"]])){ + addComment("[ERROR]basic comparison cannot be specified for generic data",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt[["comparisonName"]]) || !is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)) && is.null(opt$diffAnalyseFile)) { + addComment("[ERROR]'diff. exp. analysis file' is required",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$genericData) && !is.null(opt$diffAnalyseFile) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh)){ + addComment("[ERROR]Missing comparison information for filtering",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && (is.null(opt[["comparisonName"]]) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh))) { + addComment("[ERROR]'comparisons' are missing for filtering",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && !is.null(opt$geneListFiltering)) { + addComment("[ERROR]Cannot have two filtering strategies",T,opt$log) + q( "no", 1, F ) +} + +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE} + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots and HTML +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#silent package loading +suppressPackageStartupMessages({ + library("plotly") + library("dendextend") + #library("ggdendro") + #library("plyr") + library("ggplot2") + library("heatmaply") + library("circlize") + #library("RColorBrewer") + #source("https://bioconductor.org/biocLite.R") + #biocLite("ComplexHeatmap") + library("ComplexHeatmap") + #library("processx") +}) + +expressionToCluster=!is.null(opt$expressionFile) + +#load input data files +if(expressionToCluster){ + #first expression data + expressionMatrix=read.csv(file=opt$expressionFile,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames (to avoid X before colnames with header=T) + colNamesData=expressionMatrix[1,-1] + expressionMatrix=expressionMatrix[-1,] + #remove first colum to convert it as rownames + rowNamesData=expressionMatrix[,1] + expressionMatrix=expressionMatrix[,-1] + if(is.data.frame(expressionMatrix)){ + expressionMatrix=data.matrix(expressionMatrix) + }else{ + expressionMatrix=data.matrix(as.numeric(expressionMatrix)) + } + dimnames(expressionMatrix)=list(rowNamesData,colNamesData) + + #check input files + if (!is.numeric(expressionMatrix)) { + addComment("[ERROR]Expression data is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Expression data loaded and checked") + addComment(c("[INFO]Dim of expression 
matrix:",dim(expressionMatrix)),T,opt$log,display=FALSE) +} + +nbComparisons=0 +nbColPerContrast=5 +comparisonMatrix=NULL +comparisonMatrixInfoGene=NULL +#if available comparisons +if(!is.null(opt[["comparisonName"]])){ + #load results from differential expression analysis + #consider first row contains column names + comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t") + colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,])) + #remove the second line also as it's information line (p-val,FDR.p-val,FC,logFC) + comparisonMatrix=comparisonMatrix[-c(1,2),] + #remove first and second colums, convert the first one as rownames + rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1])) + #and save second column content that contain geneInfo + comparisonMatrixInfoGene=as.character(unlist(comparisonMatrix[,2])) + names(comparisonMatrixInfoGene)=as.character(unlist(comparisonMatrix[,1])) + comparisonMatrix=comparisonMatrix[,-c(1,2)] + + comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix)) + + if (ncol(comparisonMatrix)%%nbColPerContrast != 0) { + addComment("[ERROR]Diff. exp. data does not contain good number of columns per contrast, should contains in this order:p-val,FDR.p-val,FC,log2(FC) and t-stat",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(max(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])>1 || min(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])<0){ + addComment("[ERROR]Seem that diff. exp. data does not contain correct values for p-val and FDR.p-val columns, should be including in [0,1] interval",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if (!is.numeric(comparisonMatrix)) { + addComment("[ERROR]Diff. exp. data is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){ + addComment("[WARNING]All genes from diff. exp. file are not included in expression file",T,opt$log,display=FALSE) + } + + if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){ + addComment("[WARNING]All genes from expression file are not included in diff. exp. file",T,opt$log,display=FALSE) + } + + addComment("[INFO]Diff. exp. 
analysis loaded and checked",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of original comparison matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) + + #restrict to user specified comparisons + restrictedComparisons=unlist(strsplit(opt[["comparisonName"]],",")) + #should be improved to avoid selection of column names starting too similarly + colToKeep=which(unlist(lapply(colnames(comparisonMatrix),function(x)any(startsWith(x,restrictedComparisons))))) + comparisonMatrix=matrix(comparisonMatrix[,colToKeep],ncol=length(colToKeep),dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[colToKeep])) + + #get number of required comparisons + nbComparisons=ncol(comparisonMatrix)/nbColPerContrast + + addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) +} + +#should be only the case with generic data +if(!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)){ + #load generic data used for filtering + nbColPerContrast=1 + #consider first row contains column names + comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t") + colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,])) + #remove first colum, convert the first one as rownames + rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1])) + comparisonMatrix=comparisonMatrix[-1,-1] + + comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix)) + + if (!is.numeric(comparisonMatrix)) { + addComment("[ERROR]Filtering matrix is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){ + addComment("[WARNING]All genes from filtering file are not included in expression file",T,opt$log,display=FALSE) + } + + if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){ + addComment("[WARNING]All genes from expression file are not included in filtering file",T,opt$log,display=FALSE) + } + + addComment("[INFO]Filtering file loaded and checked",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of original filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) + + #restrict to user specified comparisons + restrictedComparisons=c() + if(!is.null(opt$comparisonNameLow))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameLow,",")))) + if(!is.null(opt$comparisonNameHigh))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameHigh,",")))) + + if (!all(restrictedComparisons%in%colnames(comparisonMatrix))){ + addComment("[ERROR]Selected columns in filtering file are not present in filtering matrix!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + comparisonMatrix=matrix(comparisonMatrix[,restrictedComparisons],ncol=length(restrictedComparisons),dimnames = list(rownames(comparisonMatrix),restrictedComparisons)) + + #get number of required comparisons + nbComparisons=ncol(comparisonMatrix) + + addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) +} + + + +factorInfoMatrix=NULL +if(!is.null(opt$factorInfo)){ + #get group information + #load factors file + factorInfoMatrix=read.csv(file=opt$factorInfo,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(factorInfoMatrix)=factorInfoMatrix[1,] + factorInfoMatrix=factorInfoMatrix[-1,] + #use 
first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(factorInfoMatrix)=factorInfoMatrix[,1] + + factorBarColor=colnames(factorInfoMatrix)[2] + + if(ncol(factorInfoMatrix)>2){ + addComment("[ERROR]Factors file should not contain more than 2 columns",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + #factor file is used for color band on heatmap, so all expression matrix column should be in the factor file + if(expressionToCluster && length(setdiff(colnames(expressionMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + #factor file is used for color band on heatmap, so all comparison matrix column should be in the factor file + if(!expressionToCluster && length(setdiff(colnames(comparisonMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing differential contrasts in factor file",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Factors OK",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) +} + +if(!is.null(opt$personalColors)){ + ##parse personal colors + personalColors=unlist(strsplit(opt$personalColors,",")) + if(length(personalColors)==2){ + ##add medium color between two to get three colors + personalColors=c(personalColors[1],paste(c("#",as.character(as.hexmode(floor(apply(col2rgb(personalColors),1,mean))))),collapse=""),personalColors[2]) + } + if(length(personalColors)!=3){ + addComment("[ERROR]Personalized colors doesn't contain enough colors",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + +} + + +if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="input"){ + #filter input data + + if(is.null(opt$geneListFiltering)){ + #filtering using stat thresholds + #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold))) + if(is.null(opt$genericData)){ + #diff. 
expression matrix + rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]log2(opt$FCthreshold))))!=0)))) + }else{ + #generic filtering matrix + rowToKeep=rownames(comparisonMatrix) + if(!is.null(opt$comparisonNameLow)){ + restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0))))) + } + if(!is.null(opt$comparisonNameHigh)){ + restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]nrow(dataToHeatMap)){ + #correct number of clusters if needed + nbClusters=nrow(dataToHeatMap) + addComment(c("[WARNING]Not enough rows to reach required clusters number, it is reduced to number of rows:",nbClusters),T,opt$log,display=FALSE) +} + +nbSampleClusters=opt$sampleClusterNumber +if(nbSampleClusters>ncol(dataToHeatMap)){ + #correct number of clusters if needed + nbSampleClusters=ncol(dataToHeatMap) + addComment(c("[WARNING]Not enough columns to reach required conditions clusters number, it is reduced to number of columns:",nbSampleClusters),T,opt$log,display=FALSE) +} + +colClust=FALSE +rowClust=FALSE +effectiveRowClust=FALSE + +#make appropriate clustering if needed +if(nrow(dataToHeatMap)>1 && nbClusters>1)rowClust=hclust(distExtended(dataToHeatMap,method = opt$distanceMeasure),method = opt$aggloMethod) +if(ncol(dataToHeatMap)>1 && nbSampleClusters>1)colClust=hclust(distExtended(t(dataToHeatMap),method = opt$distanceMeasure),method = opt$aggloMethod) + +if(nrow(dataToHeatMap)>maxRowsToDisplay){ + #make subsampling based on preliminary global clustering + #clusteringResults=cutree(rowClust,nbClusters) + #heatMapGenesToKeep=unlist(lapply(seq(1,nbClusters),function(x)sample(which(clusteringResults==x),min(length(which(clusteringResults==x)),round(maxRowsToDisplay/nbClusters))))) + ##OR + #basic subsampling + heatMapGenesToKeep=sample(rownames(dataToHeatMap),maxRowsToDisplay) + effectiveDataToHeatMap=matrix(dataToHeatMap[heatMapGenesToKeep,],ncol=ncol(dataToHeatMap),dimnames=list(heatMapGenesToKeep,colnames(dataToHeatMap))) + effectiveNbClusters=min(nbClusters,maxRowsToDisplay) + if(nrow(effectiveDataToHeatMap)>1 && effectiveNbClusters>1)effectiveRowClust=hclust(distExtended(effectiveDataToHeatMap, method = opt$distanceMeasure),method = opt$aggloMethod) + addComment(c("[WARNING]Too many rows for efficient heatmap drawing",maxRowsToDisplay,"subsampling is done for vizualization only"),T,opt$log,display=FALSE) + rm(heatMapGenesToKeep) +}else{ + effectiveDataToHeatMap=dataToHeatMap + effectiveRowClust=rowClust + effectiveNbClusters=nbClusters +} + +addComment(c("[INFO]Dim of plotted heatmap matrix:",dim(effectiveDataToHeatMap)),T,opt$log,display=FALSE) + +personalized_hoverinfo=matrix("",ncol = ncol(effectiveDataToHeatMap),nrow = nrow(effectiveDataToHeatMap),dimnames = dimnames(effectiveDataToHeatMap)) +if(expressionToCluster){ + for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\n",valueMeaning,": ",effectiveDataToHeatMap[iRow,iCol]),collapse="")}} +}else{ + for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: 
",iCol,"\nFC: ",round(2^effectiveDataToHeatMap[iRow,iCol],2)),collapse="")}} +} + +#trying to overcome limitation of heatmaply package to modify xtick and ytick label, using directly plotly functions, but for now plotly do not permit to have personalized color for each x/y tick separately +test=FALSE +if(test==TRUE){ + + #define dendogram shapes + dd.row <- as.dendrogram(effectiveRowClust) + dd.col <- as.dendrogram(colClust) + + #and color them + dd.row=color_branches(dd.row, k = effectiveNbClusters, groupLabels = T) + dd.col=color_branches(dd.col, k = nbSampleClusters, groupLabels = T) + + #generating function for dendogram from segment list + ggdend <- function(df) { + ggplot() + + geom_segment(data = df, aes(x=x, y=y, xend=xend, yend=yend)) + + labs(x = "", y = "") + theme_minimal() + + theme(axis.text = element_blank(), axis.ticks = element_blank(), + panel.grid = element_blank()) + } + + # generate x/y dendogram plots + px <- ggdend(dendro_data(dd.col)$segments) + py <- ggdend(dendro_data(dd.row)$segments) + coord_flip() + + # reshape data matrix + col.ord <- order.dendrogram(dd.col) + row.ord <- order.dendrogram(dd.row) + xx <- effectiveDataToHeatMap[row.ord, col.ord] + # and also personalized_hoverinfo + personalized_hoverinfo=personalized_hoverinfo[row.ord, col.ord] + + # hide axis ticks and grid lines + eaxis <- list( + showticklabels = FALSE, + showgrid = FALSE, + zeroline = FALSE + ) + + #make the empty plot + p_empty <- plot_ly() %>% + layout(margin = list(l = 200), + xaxis = eaxis, + yaxis = eaxis) + + heatmap.plotly <- plot_ly( + z = xx, x = 1:ncol(xx), y = 1:nrow(xx), colors = viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno"), + type = "heatmap", showlegend = FALSE, text = personalized_hoverinfo, hoverinfo = "text", + colorbar = list( + # Capitalise first letter + title = valueMeaning, + tickmode = "array", + len = 0.3 + ) + ) %>% + layout( + xaxis = list( + tickfont = list(size = 10,color=get_leaves_branches_col(dd.row)), + tickangle = 45, + tickvals = 1:ncol(xx), ticktext = colnames(xx), + linecolor = "#ffffff", + range = c(0.5, ncol(xx) + 0.5), + showticklabels = TRUE + ), + yaxis = list( + tickfont = list(size = 10, color=get_leaves_branches_col(dd.col)), + tickangle = 0, + tickvals = 1:nrow(xx), ticktext = rownames(xx), + linecolor = "#ffffff", + range = c(0.5, nrow(xx) + 0.5), + showticklabels = TRUE + ) + ) + + #generate plotly + pp <- subplot(px, p_empty, heatmap.plotly, py, nrows = 2, margin = 0,widths = c(0.8,0.2),heights = c(0.2,0.8), shareX = TRUE, + shareY = TRUE) + + #save image file + export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + #rise a bug due to token stuf + #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + + + #save plotLy file + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F) + + #htmlwidgets::saveWidget(as_widget(pp),"~/Bureau/test.html",selfcontained = F) + +}else{ #test + label_names=c("Probe","Condition",valueMeaning) + + # #color hclust objects + # dd.row=color_branches(effectiveRowClust, k = effectiveNbClusters) + # #rowColors=get_leaves_branches_col(dd.row) + # #rowColors[order.dendrogram(dd.row)]=rowColors + # rowGroup=cutree(effectiveRowClust, k = effectiveNbClusters) + # + # #get order of class as they will be displayed on the dendogram + # rowGroupRenamed=data.frame(cluster=mapvalues(rowGroup, unique(rowGroup[order.dendrogram(dd.row)[nleaves(dd.row):1]]), 
1:effectiveNbClusters)) + # + # dd.col=color_branches(colClust, k = nbSampleClusters) + # #colColors=get_leaves_branches_col(dd.col) + # #colColors[order.dendrogram(dd.col)]=colColors + # colGroup=cutree(colClust, k = nbSampleClusters) + # + # # #get order of class as they will be displayed on the dendogram + # colGroupRenamed=data.frame(sampleCluster=mapvalues(colGroup, unique(colGroup[order.dendrogram(dd.col)[nleaves(dd.col):1]]), 1:nbSampleClusters)) + + + #while option is not correctly managed by heatmap apply, put personalized_hoverinfo to NULL + personalized_hoverinfo=NULL + + if(is.null(opt$personalColors)){ + heatmapColors=viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno") + }else{ + heatmapColors=personalColors + } + + colGroupRenamed=NULL + if(!is.null(factorInfoMatrix)){ + colGroupRenamed=eval(parse(text=(paste("data.frame(",factorBarColor,"=factorInfoMatrix[colnames(effectiveDataToHeatMap),2])",sep="")))) + sideBarGroupNb=length(table(factorInfoMatrix[colnames(effectiveDataToHeatMap),2])) + sideBarColorPaletteName="Spectral" + if(!is.null(opt$sideBarColorPalette) && opt$sideBarColorPalette%in%rownames(RColorBrewer::brewer.pal.info)){ + sideBarColorPaletteName=opt$sideBarColorPalette + } + sideBarColorPalette=setNames(colorRampPalette(RColorBrewer::brewer.pal(RColorBrewer::brewer.pal.info[sideBarColorPaletteName,"maxcolors"], sideBarColorPaletteName))(sideBarGroupNb),unique(factorInfoMatrix[colnames(effectiveDataToHeatMap),2])) + } + + if(!is.null(colGroupRenamed)){ + pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,col_side_colors=colGroupRenamed,col_side_palette=sideBarColorPalette,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors) + }else{ + pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors) + } + + + #save image file + export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + #rise a bug due to token stuf + #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + + + #save plotLy file + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F) + +} +addComment("[INFO]Heatmap drawn",T,opt$log,display=FALSE) + + +#plot circular heatmap +if(!class(effectiveRowClust)=="logical"){ + dendo=as.dendrogram(effectiveRowClust) + + if(is.null(opt$personalColors)){ + col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.01)), viridis(101,option = "inferno")) + }else{ + col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.5)), personalColors) + } + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/circularPlot.pdf"),collapse=""))}else{ + png(paste(c("./plotDir/circularPlot.png"),collapse="")) + } + + circos.par(cell.padding = c(0, 0, 0, 0), gap.degree = 5) + circos.initialize(c(rep("a",nrow(effectiveDataToHeatMap)),"b"),xlim=cbind(c(0,0),c(nrow(effectiveDataToHeatMap),5))) + circos.track(ylim = c(0, 1), bg.border = NA, panel.fun = function(x, y) { + if(CELL_META$sector.index=="a"){ + nr = ncol(effectiveDataToHeatMap) + nc = nrow(effectiveDataToHeatMap) + circos.text(1:nc- 0.5, rep(0,nc), adj = c(0, 0), + 
rownames(effectiveDataToHeatMap)[order.dendrogram(dendo)], facing = "clockwise", niceFacing = TRUE, cex = 0.3) + } + }) + + circos.track(ylim = c(0, ncol(effectiveDataToHeatMap)), bg.border = NA, panel.fun = function(x, y) { + + m = t(matrix(effectiveDataToHeatMap[order.dendrogram(dendo),],ncol=ncol(effectiveDataToHeatMap))) + col_mat = col_fun(m) + nr = nrow(m) + nc = ncol(m) + if(CELL_META$sector.index=="a"){ + for(i in 1:nr) { + circos.rect(1:nc - 1, rep(nr - i, nc), + 1:nc, rep(nr - i + 1, nc), + border = col_mat[i, ], col = col_mat[i, ]) + } + }else{ + circos.text(rep(1,nr), seq(nr,1,-1) , colnames(effectiveDataToHeatMap),cex = 0.3) + } + }) + + #dendo = color_branches(dendo, k = effectiveNbClusters, col = colorRampPalette(brewer.pal(12,"Set3"))(effectiveNbClusters)) + dendo = color_branches(dendo, k = effectiveNbClusters, col = rev(colorspace::rainbow_hcl(effectiveNbClusters))) + + + circos.track(ylim = c(0, attributes(dendo)$height), bg.border = NA, track.height = 0.25, + panel.fun = function(x, y) { + if(CELL_META$sector.index=="a")circos.dendrogram(dendo)} ) + + circos.clear() + ##add legend + lgd_links = Legend(at = seq(ceiling(min(effectiveDataToHeatMap)),floor(max(effectiveDataToHeatMap)),ceiling((floor(max(effectiveDataToHeatMap))-ceiling(min(effectiveDataToHeatMap)))/4)), col_fun = col_fun, + title_position = "topleft", grid_width = unit(5, "mm") ,title = valueMeaning) + + pushViewport(viewport(x = 0.85, y = 0.80, + width = 0.1, + height = 0.1, + just = c("left", "bottom"))) + grid.draw(lgd_links) + upViewport() + + + dev.off() + + addComment("[INFO]Circular heatmap drawn",T,opt$log,display=FALSE) + loc <- Sys.setlocale("LC_NUMERIC","C") +}else{ + addComment(c("[WARNING]Circular plot will not be plotted considering row or cluster number < 2"),T,opt$log,display=FALSE) +} +rm(effectiveDataToHeatMap,effectiveRowClust,effectiveNbClusters) + +#plot screeplot +if(class(rowClust)!="logical" && nrow(dataToHeatMap)>2){ + screePlotData=c() + for(iNbClusters in 2:(nbClusters+min(10,max(0,nrow(dataToHeatMap)-nbClusters)))){ + clusteringResults=cutree(rowClust,iNbClusters) + #clusteringResults=kmeans(dataToHeatMap,iNbClusters)$cluster + + #compute variance between each intra-class points amongst themselves (need at least 3 points by cluster) + #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>2){var(dist(dataToHeatMap[temp,]))}else{0}}))) ) + #compute variance between each intra-class points and fictive mean point (need at least 2 points by cluster) + #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ var(dist(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]))[1:length(temp)]) }else{0}}))) ) + if(ncol(dataToHeatMap)>1)screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((distExtended(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]),method = opt$distanceMeasure)[1:length(temp)])^2) }else{0}}))) ) + else screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((dataToHeatMap[temp,]-mean(dataToHeatMap[temp,]))^2) }else{0}}))) ) + } + + dataToPlot=data.frame(clusterNb=seq(2,length(screePlotData)+1),wcss=screePlotData) + p <- ggplot(data=dataToPlot, aes(clusterNb,wcss)) + geom_point(colour="#EE4444") + geom_line(colour="#DD9999") + + ggtitle("Scree 
plot") + theme_bw() + xlab(label="Cluster number") + ylab(label="Within cluster sum of squares") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") + + scale_x_continuous(breaks=seq(min(dataToPlot$clusterNb), max(dataToPlot$clusterNb), 1)) + + #save plotly files + pp <- ggplotly(p) + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/screePlot.pdf"),collapse=""))}else{ + png(paste(c("./plotDir/screePlot.png"),collapse="")) + } + plot(p) + dev.off() + + #save plotly files + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/screePlot.html"),collapse=""),selfcontained = F) + + addComment("[INFO]Scree plot drawn",T,opt$log,display=FALSE) +}else{ + addComment(c("[WARNING]Scree plot will not be plotted considering row number <= 2"),T,opt$log,display=FALSE) +} + +##---------------------- + +#filter output based on parameters + +rowToKeep=rownames(dataToHeatMap) +if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="output"){ + #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold))) + if(is.null(opt$geneListFiltering)){ + if(is.null(opt$genericData)){ + #diff. expression matrix + rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<=opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>=log2(opt$FCthreshold))))!=0)))) + }else{ + #generic filtering matrix + rowToKeep=rownames(comparisonMatrix) + if(!is.null(opt$comparisonNameLow)){ + restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0))))) + } + if(!is.null(opt$comparisonNameHigh)){ + restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons] +# +# This file is free software: you may copy, redistribute and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 2 of the License, or (at your +# option) any later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+
+
+#distExtended function: extends dist() to correlation-based measures
+distExtended <- function(x,method) {
+  if(method %in% c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"))return(dist(x,method = method))
+  if(method %in% c("pearson", "spearman", "kendall"))return(as.dist(1-cor(t(x),method=method))/2)
+  if(method %in% c("absPearson", "absSpearman", "absKendall"))return(as.dist(1-abs(cor(t(x),method=method))))
+  return(NULL)
+}
+
+##comment function to display a message and optionally append it to the log file
+
+addComment <- function(text,addToFile=FALSE,fileName=NULL,append=TRUE,display=TRUE){
+  if(display)cat(paste(c(text,"\n"),collapse = " "))
+  if(addToFile)write(paste(text,collapse = " "),fileName,append=append)
+}
+
+printSessionInfo <- function(fileName=NULL,append=TRUE){
+  addComment("[INFO]R session info :",T,fileName,display=FALSE)
+  tempInfo=sessionInfo()
+  write(paste(tempInfo$R.version$version.string),fileName,append=append)
+  write(paste("Platform",tempInfo$platform,sep = " : "),fileName,append=append)
+  write(paste("Running under",tempInfo$running,sep = " : "),fileName,append=append)
+  write(paste("Local variables",tempInfo$locale,sep = " : "),fileName,append=append)
+  write(paste("Attached base packages",paste(tempInfo$basePkgs,collapse = "; "),sep = " : "),fileName,append=append)
+  if(length(tempInfo$otherPkgs)>0){
+    lineToPrint=""
+    for(iPack in tempInfo$otherPkgs){
+      lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "")
+    }
+    write(paste("Other attached packages",lineToPrint,sep = " : "),fileName,append=append)
+  }
+  if(length(tempInfo$loadedOnly)>0){
+    lineToPrint=""
+    for(iPack in tempInfo$loadedOnly){
+      lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "")
+    }
+    write(paste("Loaded packages",lineToPrint,sep = " : "),fileName,append=append)
+  }
+}
+
+##negative of a mathematical expression
+negativeExpression <- function(expression){
+  expression=gsub("\\+","_toMinus_",expression)
+  expression=gsub("\\-","+",expression)
+  expression=gsub("_toMinus_","-",expression)
+  if(substr(expression,1,1)!="-" && substr(expression,1,1)!="+"){
+    expression=paste(c("-",expression),collapse="")
+  }
+
+  return(expression)
+}
+
+#' Returns the file name of the calling Rscript
+#'
+#' \code{get_Rscript_filename} returns the file name of the calling Rscript
+#' @return A string with the filename of the calling script.
+#' If not found (i.e. you are in an interactive session) returns NA.
+#'
+#' @export
+get_Rscript_filename <- function() {
+  prog <- sub("--file=", "", grep("--file=", commandArgs(), value=TRUE)[1])
+  if( .Platform$OS.type == "windows") {
+    prog <- gsub("\\\\", "\\\\\\\\", prog)
+  }
+  prog
+}
+
+#' Recursively sorts a list
+#'
+#' \code{sort_list} returns a sorted list
+#' @param unsorted_list A list.
+#' @return A sorted list.
+#' @export
+sort_list <- function(unsorted_list) {
+  for(ii in seq(along=unsorted_list)) {
+    if(is.list(unsorted_list[[ii]])) {
+      unsorted_list[[ii]] <- sort_list(unsorted_list[[ii]])
+    }
+  }
+  unsorted_list[sort(names(unsorted_list))]
+}
+
+
+# Multiple plot function
+#
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols: Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+#
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+  library(grid)
+
+  # Make a list from the ... arguments and plotlist
+  plots <- c(list(...), plotlist)
+
+  numPlots = length(plots)
+
+  # If layout is NULL, then use 'cols' to determine layout
+  if (is.null(layout)) {
+    # Make the panel
+    # ncol: Number of columns of plots
+    # nrow: Number of rows needed, calculated from # of cols
+    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+                     ncol = cols, nrow = ceiling(numPlots/cols))
+  }
+
+  if (numPlots==1) {
+    print(plots[[1]])
+
+  } else {
+    # Set up the page
+    grid.newpage()
+    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+
+    # Make each plot, in the correct location
+    for (i in 1:numPlots) {
+      # Get the i,j matrix positions of the regions that contain this subplot
+      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+
+      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+                                      layout.pos.col = matchidx$col))
+    }
+  }
+}
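+
+## ----------------------------------------------------------------------------
+## Illustrative usage sketches for the helpers above (hypothetical data and a
+## temporary log file, for the example only; nothing here is called by the tool).
+demoAddComment <- function() {
+  logFile <- tempfile(fileext = ".log")
+  # printed to the console and appended to the log file in one call
+  addComment("[INFO]Something happened", addToFile = TRUE, fileName = logFile)
+  readLines(logFile)
+}
+
+demoMultiplot <- function() {
+  library(ggplot2)
+  df <- data.frame(x = 1:10, y = (1:10)^2)
+  p1 <- ggplot(df, aes(x, y)) + geom_point()
+  p2 <- ggplot(df, aes(x, y)) + geom_line()
+  # two panels side by side; alternatively pass a layout matrix via 'layout='
+  multiplot(p1, p2, cols = 2)
+}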