Mercurial > repos > vandelj > giant_hierarchical_clustering
changeset 0:14045c80a222 draft
"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
author | vandelj |
---|---|
date | Fri, 26 Jun 2020 09:38:23 -0400 |
parents | |
children | 0b09345fa632 |
files | galaxy/wrappers/ExprHeatmapClustering.xml src/ExprPlotsScript.R src/General_functions.py src/LIMMA_options.py src/LIMMAscriptV4.R src/VolcanoPlotsScript.R src/getopt.R src/heatMapClustering.R src/utils.R |
diffstat | 9 files changed, 5169 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/wrappers/ExprHeatmapClustering.xml Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,928 @@ +<tool name="GIANT-Heatmap and Hierarchical clustering" id="giant_hierarchical_clustering" version="0.5.1"> + <description>Run hierarchical clustering and plot heatmap from expression data and/or differential expression analysis</description> + <requirements> + <requirement type="package" version="4.8.0">r-plotly</requirement> + <requirement type="package" version="1.12.0">r-dendextend</requirement> + <requirement type="package" version="0.1_20">r-ggdendro</requirement> + <requirement type="package" version="3.2.1">r-ggplot2</requirement> + <requirement type="package" version="0.16.0">r-heatmaply</requirement> + <requirement type="package" version="0.4.8">r-circlize</requirement> + <requirement type="package" version="1.18.1">bioconductor-complexheatmap</requirement> + <requirement type="package" version="2.2.2">pandoc</requirement> + </requirements> + <code file='../../src/General_functions.py'/> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted, please contact tool developer or administrators." /> + <regex match="Error in" + source="both" + level="fatal" + description="An error occured during R execution, please contact tool developer." /> + <exit_code range="10" level="fatal" description="Missing file during html report, see log file for more information." /> + <exit_code range="1:9" level="fatal" description="Error in R execution, see log file for more information." /> + </stdio> + <command> <![CDATA[ + + #if ($dataToCluster.dataToCluster_selector=="expression" or $dataToCluster.dataToCluster_selector=="genericData") and $dataToCluster.expressionData: + + ##start by selecting specific input data columns depending on user request + #if $dataToCluster.dataToCluster_selector=="genericData" and $dataToCluster.columnToKeep: + awk -v columns="$dataToCluster.columnToKeep" 'BEGIN{FS="\t";OFS="";ORS="";split(columns,columnsTab,",")} FNR==1{for(iColumn=1;iColumn<=length(columnsTab);iColumn++)for(iField=2;iField<=NF;iField++){if(\$iField==columnsTab[iColumn])colsToSelect[iColumn]=iField}} {line=\$1;for(iColumn=1;iColumn<=length(columnsTab);iColumn++)line=line"\t"\$colsToSelect[iColumn];print line"\n";}' $dataToCluster.expressionData > ./selectedExpressionData; + #else + cp $dataToCluster.expressionData ./selectedExpressionData; + #end if + + ##reorder columns of input data based on factors file + #if $dataToCluster.reorder_sample.reordering_selector=="factorFile" and $dataToCluster.reorder_sample.factorFileData and $dataToCluster.reorder_sample.factorToUse: + awk -v factors="$dataToCluster.reorder_sample.factorToUse" 'BEGIN{FS="\t";OFS="";ORS="";split(factors,factorsTab,",")} FNR==1{for(iFactor=1;iFactor<=length(factorsTab);iFactor++)for(iField=2;iField<=NF;iField++){if(\$iField==factorsTab[iFactor])colsToSelect[iFactor]=iField}} FNR>1{line=\$1;for(iFactor=1;iFactor<=length(factorsTab);iFactor++)line=line"\t"\$colsToSelect[iFactor];print line"\n";}' $dataToCluster.reorder_sample.factorFileData > ./orderingFactor; + + sort -V -k2 ./orderingFactor > ./orderingSample; + + awk 'BEGIN{FS="\t";OFS="";ORS="";factorNumber=0} ARGIND==1{sampleOrdered[FNR]=\$1;factorNumber=FNR} ARGIND==2 && FNR==1{for(iElemt=1;iElemt<=factorNumber;iElemt++)for(iPosit=2;iPosit<=NF;iPosit++)if(\$iPosit==sampleOrdered[iElemt])positOrdered[iElemt]=iPosit} ARGIND==2{line=\$1;for(iElemt=1;iElemt<=factorNumber;iElemt++)if(iElemt in positOrdered)line=line"\t"\$positOrdered[iElemt];print line"\n"}' ./orderingSample ./selectedExpressionData > ./orderedExpressionData; + + ##check if some input data columns were lost during the process + awk 'ARGIND==1 && FNR==1{colNumbA=NF} ARGIND==2 && FNR==1{colNumbB=NF} END{if(colNumbA!=colNumbB) print "[WARNING] "colNumbA-colNumbB" input data columns was removed during reordering due to missing information in factor file!\n"}' ./selectedExpressionData ./orderedExpressionData >> $log; + + #if $advSection.conditionClusterNumber!="1": + printf "[WARNING]Sample clustering option is selected, sample reordering will not be preserved!\n" >> $log; + #end if + #else: + cp ./selectedExpressionData ./orderedExpressionData; + #end if + #end if + + + ##generate common file name for differential analysis results depending on input data nature + #if ($dataToCluster.dataToCluster_selector=="expression" or $dataToCluster.dataToCluster_selector=="genericData") and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam" and $dataToCluster.filtering_step.select_filtering.differentialAnalysis: + cp ${dataToCluster.filtering_step.select_filtering.differentialAnalysis} ./filteredDifferentialAnalysis; + #end if + #if $dataToCluster.dataToCluster_selector=="foldChange" and $dataToCluster.differentialAnalysis: + cp $dataToCluster.differentialAnalysis ./filteredDifferentialAnalysis; + #end if + + + ##generate factor information to use for barplot + #if $advSection.conditionBarColor.conditionBarColor_selector=="yes" and $advSection.conditionBarColor.factorFileDataBarPlot and $advSection.conditionBarColor.factorToUse: + awk -v factor="$advSection.conditionBarColor.factorToUse" 'BEGIN{FS="\t";OFS="";ORS=""} NR==1{for(i=2;i<=NF;i++)if(\$i==factor)colToKeep=i} {print \$1"\t"\$colToKeep"\n"}' $advSection.conditionBarColor.factorFileDataBarPlot > ./barPlotFactor; + #end if + + Rscript '$__tool_directory__/../../src/heatMapClustering.R' --log '$log' --outputFile '$outputData' --format '$advSection.imageFormat' --clusterNumber '$advSection.clusterNumber' --maxRows '$advSection.maxSampleToPlot' --sampleClusterNumber '$advSection.conditionClusterNumber' --dataTransformation '$advSection.dataTransformation' --distanceMeasure '$advSection.distanceMeasure' --aggloMethod '$advSection.aggloMethod' + #if $advSection.select_color.specifyColors=="true": + --personalColors '$advSection.select_color.featureMin_color,$advSection.select_color.featureMedium_color,$advSection.select_color.featureMax_color' + #end if + #if $advSection.conditionBarColor.conditionBarColor_selector=="yes" and $advSection.conditionBarColor.factorFileDataBarPlot and $advSection.conditionBarColor.factorToUse: + --factorInfo './barPlotFactor' + --sideBarColorPalette '$advSection.conditionBarColor.sideBarPalette' + #end if + #if $dataToCluster.dataToCluster_selector=="genericData": + --genericData + #end if + #if $dataToCluster.dataToCluster_selector=="expression" or $dataToCluster.dataToCluster_selector=="genericData": + --expressionFile './orderedExpressionData' + #if $dataToCluster.filtering_step.filtering_step_selector!="no": + --filterInputOutput '$dataToCluster.filtering_step.filtering_step_selector' + #if $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam": + --diffAnalyseFile './filteredDifferentialAnalysis' + #if $dataToCluster.dataToCluster_selector=="expression": + --comparisonName '$dataToCluster.filtering_step.select_filtering.comparisonsToInclude' + --FCthreshold '$dataToCluster.filtering_step.select_filtering.FCthreshold' + --pvalThreshold '$dataToCluster.filtering_step.select_filtering.pvalThreshold' + #else: + #if $dataToCluster.filtering_step.select_filtering.comparisonsToIncludeLow and $dataToCluster.filtering_step.select_filtering.valThresholdLow: + --comparisonNameLow '$dataToCluster.filtering_step.select_filtering.comparisonsToIncludeLow' + --FCthreshold '$dataToCluster.filtering_step.select_filtering.valThresholdLow' + #end if + #if $dataToCluster.filtering_step.select_filtering.comparisonsToIncludeHigh and $dataToCluster.filtering_step.select_filtering.valThresholdHigh: + --comparisonNameHigh '$dataToCluster.filtering_step.select_filtering.comparisonsToIncludeHigh' + --pvalThreshold '$dataToCluster.filtering_step.select_filtering.valThresholdHigh' + #end if + #end if + #else: + --geneListFiltering '$dataToCluster.filtering_step.select_filtering.geneListFile' + #end if + #end if + #else + --diffAnalyseFile './filteredDifferentialAnalysis' + --comparisonName '$dataToCluster.comparisonsToInclude' + #if $dataToCluster.filtering_step.filtering_step_selector!="no": + --filterInputOutput '$dataToCluster.filtering_step.filtering_step_selector' + #if $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam": + --FCthreshold '$dataToCluster.filtering_step.select_filtering.FCthreshold' + --pvalThreshold '$dataToCluster.filtering_step.select_filtering.pvalThreshold' + #else: + --geneListFiltering '$dataToCluster.filtering_step.select_filtering.geneListFile' + #end if + #end if + #end if + ; + ret_code=\$?; + if [ \$ret_code != 0 ]; then + exit \$ret_code; + else + bash $scriptTransfer; + ret_code=\$?; + if [ \$ret_code != 0 ]; then + exit \$ret_code; + fi + fi; + printf "[INFO]End of tool script" >> $log; + ]]> + </command> + + + + <configfiles> + <configfile name="scriptTableToHtml"> +<![CDATA[ +printf "<!DOCTYPE html> +<html> +<head> +<meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> +<link rel=\"stylesheet\" type=\"text/css\" href=\"https://cdn.datatables.net/1.10.16/css/jquery.dataTables.min.css\"> +<script type=\"text/javascript\" language=\"javascript\" src=\"https://code.jquery.com/jquery-1.12.4.js\"> +</script> +<script type=\"text/javascript\" language=\"javascript\" src=\"https://cdn.datatables.net/1.10.16/js/jquery.dataTables.min.js\"> +</script> +<script type=\"text/javascript\" class=\"init\"> +\\$(document).ready(function() { + \\$(\'\#example\').DataTable( { + \"columnDefs\": [ { + \"visible\": false, + \"targets\": -1 + } ] + } ); +} ); +</script> +</head> +<body style=\"background-color:white;\"> +<table id=\"example\" class=\"display\" cellspacing=\"0\"> +" > ${html_file.extra_files_path}/outputClustering.html + +printf "<colgroup>\n" >> ${html_file.extra_files_path}/outputClustering.html +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +printf "<col span=\"2\" style=\"background-color:rgb(224,235,235)\">\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<col span=\"1\" style=\"background-color:rgb(250,235,235)\">\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{odd=1;FS="\t"} NR==1{for(i=4;i<=NF;i=i+5){if(odd==1){odd=0;printf "<col span=\"5\" style=\"background-color:rgb(224,238,255)\">\n"}else{odd=1;printf "<col span=\"5\" style=\"background-color:rgb(255,221,224)\">\n"}}}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +printf "<col span=\"1\" style=\"background-color:rgb(224,235,235)\">\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<col span=\"1\" style=\"background-color:rgb(250,235,235)\">\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{odd=1;FS="\t"} NR==1{for(i=3;i<=NF;i++){if(odd==1){odd=0;printf "<col span=\"1\" style=\"background-color:rgb(224,238,255)\">\n"}else{odd=1;printf "<col span=\"1\" style=\"background-color:rgb(255,221,224)\">\n"}}}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "</colgroup>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<thead>\n<tr>\n" >> ${html_file.extra_files_path}/outputClustering.html + +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +printf "<th rowspan=\"2\">Gene</th>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<th rowspan=\"2\">Info</th>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<th rowspan=\"2\">Cluster</th>\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{FS="\t"} NR==1{for(i=4;i<=NF;i=i+5)printf "<th colspan=\"5\">"\$i"</th>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +printf "<th></th>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "</tr>\n<tr>\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{FS="\t"} NR==2{for(i=4;i<=NF;i++)printf "<th>"\$i"</th>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +printf "<th rowspan=\"1\">Gene</th>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<th rowspan=\"1\">Cluster</th>\n" >> ${html_file.extra_files_path}/outputClustering.html +awk 'BEGIN{FS="\t"} NR==1{for(i=3;i<=NF;i++)printf "<th colspan=\"1\">"\$i"</th>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "<th></th>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "</tr>\n</thead>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "<tfoot>\n<tr>\n" >> ${html_file.extra_files_path}/outputClustering.html + +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +awk 'BEGIN{FS="\t"} NR==2{for(i=1;i<=NF;i++)printf "<th>"\$i"</th>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +awk 'BEGIN{FS="\t"} NR==1{for(i=1;i<=NF;i++)printf "<th>"\$i"</th>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "<th></th>\n" >> ${html_file.extra_files_path}/outputClustering.html +printf "</tr>\n</tfoot>\n">> ${html_file.extra_files_path}/outputClustering.html +printf "<tbody>\n" >> ${html_file.extra_files_path}/outputClustering.html + +#if $dataToCluster.dataToCluster_selector=="foldChange" or ($dataToCluster.dataToCluster_selector=="expression" and $dataToCluster.filtering_step.filtering_step_selector!="no" and $dataToCluster.filtering_step.select_filtering.filtering_stepBis_selector=="diffExpParam"): +awk 'BEGIN{FS="\t"} NR>2{printf "<tr>\n";for(i=1;i<=NF;i++){printf "<th>"\$i"</th>\n"};printf "<th></th>\n";printf "</tr>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#else +awk 'BEGIN{FS="\t"} NR>1{printf "<tr>\n";for(i=1;i<=NF;i++){printf "<th>"\$i"</th>\n"};printf "<th></th>\n";printf "</tr>\n"}' $outputData >> ${html_file.extra_files_path}/outputClustering.html +#end if + +printf "</tbody>\n" >> ${html_file.extra_files_path}/outputClustering.html + +printf "</table> +</body> +</html>" >> ${html_file.extra_files_path}/outputClustering.html + +]]> + </configfile> + + <configfile name="scriptTransfer"> +<![CDATA[ + +mkdir -p $html_file.extra_files_path + + +##create HTML file for clustering output table +source $scriptTableToHtml + +##check outputClustering.html is here +if ! [ -e ${html_file.extra_files_path}/outputClustering.html ]; then + printf "[ERROR]outputClustering.html is missing.\n" >> $log; + exit 10 +fi + +#write header of html file +printf "<!DOCTYPE html>\n<html>\n<body>" > $html_file + + +##first add reference of the clustering output table +printf "<h3>Clustering tabular</h3>\n" >> $html_file +printf "<a href=\"outputClustering.html\">Clustering results</a>\n" >> $html_file + + +##manage heatmap file + + +if [ -e ./plotLyDir/Heatmap.html ]; then + +printf "<h3>Heatmap plot</h3>\n" >> $html_file + +##modify HTML to point to the first script folder +sed -i "s/Heatmap_files/PlotLy_Heatmap_scripts/g" ./plotLyDir/Heatmap.html + +##copy HTML files in both folders +cp ./plotLyDir/Heatmap.html ${html_file.extra_files_path}/Heatmap.html + +##add HTML link +printf "<a href=\"Heatmap.html\">Heatmap</a>\n" >> $html_file + +#if $advSection.imagePlotlyFormat=="svg": +##before copying scripts folder modify them to replace png snapshot with svg (not proud of solution but seems to work) +cd ./plotLyDir/Heatmap_files/plotly-main-*/ +awk '{gsub("\"png\"","\"svg\"",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +awk '{gsub("Download plot as a png","Download plot as a svg",\$0);print \$0}' ./plotly-latest.minTemp.js > ./plotly-latest.min.js +rm ./plotly-latest.minTemp.js +cd ../../../ +#end if + +#if $advSection.scaleSnapshot!="1.0": +##before copying scripts folder modify scale parameter (not proud of solution but seems to work) +cd ./plotLyDir/Heatmap_files/plotly-main-*/ +awk '{gsub("h=t\\.scale\\|\\|1","h=$advSection.scaleSnapshot",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +mv -f ./plotly-latest.minTemp.js ./plotly-latest.min.js +cd ../../../ +#end if + +##now copy scripts folder +cp -r ./plotLyDir/Heatmap_files $html_file.extra_files_path +mv ${html_file.extra_files_path}/Heatmap_files ${html_file.extra_files_path}/PlotLy_Heatmap_scripts + +else + printf "[ERROR]Heatmap.html is missing.\n" >> $log; + exit 10 +fi + + + + +##manage screePlot files + + +if [ -e ./plotLyDir/screePlot.html ]; then + +printf "<h3>Scree plot</h3>\n" >> $html_file + +##modify HTML to point to the first script folder +sed -i "s/screePlot_files/PlotLy_screePlot_scripts/g" ./plotLyDir/screePlot.html + +##copy HTML files in both folders +cp ./plotLyDir/screePlot.html ${html_file.extra_files_path}/screePlot.html + +##add HTML link +printf "<a href=\"screePlot.html\">Scree plot</a>\n" >> $html_file + +#if $advSection.imagePlotlyFormat=="svg": +##before copying scripts folder modify them to replace png snapshot with svg (not proud of solution but seems to work) +cd ./plotLyDir/screePlot_files/plotly-main-*/ +awk '{gsub("\"png\"","\"svg\"",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +awk '{gsub("Download plot as a png","Download plot as a svg",\$0);print \$0}' ./plotly-latest.minTemp.js > ./plotly-latest.min.js +rm ./plotly-latest.minTemp.js +cd ../../../ +#end if + +#if $advSection.scaleSnapshot!="1.0": +##before copying scripts folder modify scale parameter (not proud of solution but seems to work) +cd ./plotLyDir/screePlot_files/plotly-main-*/ +awk '{gsub("h=t\\.scale\\|\\|1","h=$advSection.scaleSnapshot",\$0);print \$0}' ./plotly-latest.min.js > ./plotly-latest.minTemp.js +mv -f ./plotly-latest.minTemp.js ./plotly-latest.min.js +cd ../../../ +#end if + +##now copy scripts folder +cp -r ./plotLyDir/screePlot_files $html_file.extra_files_path +mv ${html_file.extra_files_path}/screePlot_files ${html_file.extra_files_path}/PlotLy_screePlot_scripts + +else + printf "[WARNING]screeplot.html is missing, probably due to limited number of genes.\n" >> $log; +fi + + +##manage circular files + + +if [ -e ./plotDir/circularPlot.${advSection.imageFormat} ]; then + +cp ./plotDir/circularPlot.${advSection.imageFormat} ${html_file.extra_files_path}/circularPlot.${advSection.imageFormat} + +printf "<h3>Circular plot</h3>\n" >> $html_file + +##add HTML link +printf "<a href=\"circularPlot.${advSection.imageFormat}\">Circular plot</a>\n" >> $html_file + +else + printf "[WARNING]circularPlot file is missing, probably due to limited number of genes.\n" >> $log; +fi + + + +##create footer of HTML file +printf "</body>\n</html>" >> $html_file + +]]> + </configfile> + </configfiles> + + + +<inputs> + <param type="text" name="title" value="Heatmap_toPersonalize" label="Title for output"/> + + <conditional name="dataToCluster"> + <param name="dataToCluster_selector" type="select" label="Data to cluster"> + <option value="expression" selected="true">Expression data</option> + <option value="foldChange">Differential expression analysis results</option> + <option value="genericData">Generic data table</option> + </param> + <when value="expression"> + + <param type="data" name="expressionData" format="tabular" label="Normalized expression tabular file" multiple="false"/> + + <conditional name="reorder_sample"> + <param name="reordering_selector" type="select" label="Reorder samples"> + <option value="no" selected="true">No reordering</option> + <option value="factorFile">Reorder sample based on a factors file</option> + </param> + <when value="factorFile"> + <param type="data" name="factorFileData" format="tabular" label="Factors file" multiple="false" help="Be sure the conditions clusters number is set to 1 in advanced parameters."/> + <param name="factorToUse" type="select" optional="false" multiple="true" label="Select factor(s) to use for reordering" refresh_on_change="true" dynamic_options="get_column_names(reorder_sample['factorFileData'].file_name,0)"> + <validator type="empty_field" message="You should specify at least one factor"></validator> + </param> + </when> + <when value="no"> + </when> + </conditional> + + <conditional name="filtering_step"> + <param name="filtering_step_selector" type="select" label="Probes/genes filtering"> + <option value="no" selected="true">No filtering</option> + <option value="input">Filter input probes/genes before clustering</option> + <option value="output">Filter probes/genes after clustering (for tabular output)</option> + </param> + <when value="input"> + <conditional name="select_filtering"> + <param name="filtering_stepBis_selector" type="select" label="Filter"> + <option value="diffExpParam" selected="true">Based on differential expression results (FC and p-val)</option> + <option value="geneList">From list of genes</option> + </param> + <when value="diffExpParam"> + <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" optional="false" multiple="false"> + </param> + + <param name="comparisonsToInclude" type="select" optional="false" multiple="true" label="Select comparisons to use for filtering" refresh_on_change="true" dynamic_options="get_column_names_filteredList(select_filtering['differentialAnalysis'].file_name,[0,1],5)"> + <validator type="empty_field" message="You should specify one factor"></validator> + </param> + + <param name="FCthreshold" type="float" value="2" label="Fold change threshold for input (both 'threshold' and '1/threshold' values will be used)" help="Minimum value is 1 (ie. all probes/genes are kept)" > + <validator type="in_range" min="1" exclude_min="false" message="Threshold should be greater than 1"/> + </param> + <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold for input" help="When several comparisons are selected a conservative rule is applied (see details below)" > + <validator type="in_range" min="0" max="1" message="Threshold should be between 0 and 1"/> + </param> + </when> + <when value="geneList"> + <param type="data" format="tabular" name="geneListFile" label="List of genes to keep" multiple="false" help="Gene names should be the same as written in expression file"/> + </when> + </conditional> + </when> + + <when value="output"> + <conditional name="select_filtering"> + <param name="filtering_stepBis_selector" type="select" label="Filter"> + <option value="diffExpParam" selected="true">Based on differential expression results (FC and p-val)</option> + <option value="geneList">From list of genes</option> + </param> + <when value="diffExpParam"> + <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" optional="false" multiple="false"> + </param> + + <param name="comparisonsToInclude" type="select" optional="false" multiple="true" label="Select comparisons to use for filtering" refresh_on_change="true" dynamic_options="get_column_names_filteredList(select_filtering['differentialAnalysis'].file_name,[0,1],5)"> + <validator type="empty_field" message="You should specify one factor"></validator> + </param> + + <param name="FCthreshold" type="float" value="2" label="Fold change threshold for output (both 'threshold' and '1/threshold' values will be used)" help="Minimum value is 1 (ie. all probes/genes are kept)" > + <validator type="in_range" min="1" exclude_min="false" message="Threshold should be greater than 1"/> + </param> + <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold for output" help="When several comparisons are selected a conservative rule is applied (see details below)"> + <validator type="in_range" min="0" max="1" message="Threshold should be between 0 and 1"/> + </param> + </when> + <when value="geneList"> + <param type="data" format="tabular" name="geneListFile" label="List of genes to keep" multiple="false" help="Gene names should be the same as written in expression file"/> + </when> + </conditional> + </when> + <when value="no"> + </when> + </conditional> + + </when> + + <when value="foldChange"> + + <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" optional="false" multiple="false"> + </param> + + <param name="comparisonsToInclude" type="select" optional="false" multiple="true" label="Select comparisons to cluster" refresh_on_change="true" dynamic_options="get_column_names_filteredList(dataToCluster['differentialAnalysis'].file_name,[0,1],5)"> + <validator type="empty_field" message="You should specify one factor"></validator> + </param> + + <conditional name="filtering_step"> + <param name="filtering_step_selector" type="select" label="Probes/genes filtering"> + <option value="no" selected="true">No filtering</option> + <option value="input">Filter input probes/genes before clustering</option> + <option value="output">Filter probes/genes only in tabular output file</option> + </param> + <when value="input"> + <conditional name="select_filtering"> + <param name="filtering_stepBis_selector" type="select" label="Filter"> + <option value="diffExpParam" selected="true">Based on differential expression results (FC and p-val)</option> + <option value="geneList">From list of genes</option> + </param> + <when value="diffExpParam"> + <param name="FCthreshold" type="float" value="2" label="Fold change threshold for input (both 'threshold' and '1/threshold' values will be used)" help="Minimum value is 1 (ie. all probes/genes are kept)" > + <validator type="in_range" min="1" exclude_min="false" message="FC threshold should be greater than 1"/> + </param> + <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold for input" help="When several comparisons are selected a conservative rule is applied (see details below)" > + <validator type="in_range" min="0" max="1" message="Threshold should be between 0 and 1"/> + </param> + </when> + <when value="geneList"> + <param type="data" format="tabular" name="geneListFile" label="List of genes to keep" multiple="false" help="Gene names should be the same as written in diff. exp. analysis file"/> + </when> + </conditional> + </when> + + <when value="output"> + <conditional name="select_filtering"> + <param name="filtering_stepBis_selector" type="select" label="Filter"> + <option value="diffExpParam" selected="true">Based on diff. exp. parameters (FC and p-val)</option> + <option value="geneList">From list of genes</option> + </param> + <when value="diffExpParam"> + <param name="FCthreshold" type="float" value="2" label="Fold change threshold for output (both 'threshold' and '1/threshold' values will be used)" help="Minimum value is 1 (ie. all probes/genes are kept)"> + <validator type="in_range" min="1" exclude_min="false" message="Threshold should be greater than 1"/> + </param> + <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold for output" help="When several comparisons are selected a conservative rule is applied (see details below)"> + <validator type="in_range" min="0" max="1" message="Threshold should be between 0 and 1"/> + </param> + </when> + <when value="geneList"> + <param type="data" format="tabular" name="geneListFile" label="List of genes to keep" multiple="false" help="Gene names should be the same as written in diff. exp. analysis file"/> + </when> + </conditional> + </when> + <when value="no"> + </when> + </conditional> + + </when> + + <when value="genericData"> + + <param type="data" name="expressionData" format="tabular" label="Generic tabular file" multiple="false"/> + + <param name="columnToKeep" type="select" optional="false" multiple="true" label="Select column to cluster" refresh_on_change="true" dynamic_options="get_column_names_filteredList(dataToCluster['expressionData'].file_name,[0])"> + <validator type="empty_field" message="You should select at least on column"></validator> + </param> + + <conditional name="reorder_sample"> + <param name="reordering_selector" type="select" label="Reorder columns"> + <option value="no" selected="true">No reordering</option> + <option value="factorFile">Reorder comlumns based on a factors file</option> + </param> + <when value="factorFile"> + <param type="data" name="factorFileData" format="tabular" label="Factors file" multiple="false" help="Be sure the conditions clusters number is set to 1 in advanced parameters."/> + <param name="factorToUse" type="select" optional="false" multiple="true" label="Select factor(s) to use for reordering" refresh_on_change="true" dynamic_options="get_column_names(reorder_sample['factorFileData'].file_name,0)"> + <validator type="empty_field" message="You should specify at least one factor"></validator> + </param> + </when> + <when value="no"> + </when> + </conditional> + + <conditional name="filtering_step"> + <param name="filtering_step_selector" type="select" label="Probes/genes filtering"> + <option value="no" selected="true">No filtering</option> + <option value="input">Filter input probes/genes before clustering</option> + <option value="output">Filter probes/genes after clustering (for tabular output)</option> + </param> + <when value="input"> + <conditional name="select_filtering"> + <param name="filtering_stepBis_selector" type="select" label="Filter"> + <option value="diffExpParam" selected="true">Based on tabular file content</option> + <option value="geneList">From list of genes</option> + </param> + <when value="diffExpParam"> + <param type="data" name="differentialAnalysis" format="tabular" label="Tabular file containing filtering information" optional="false" multiple="false"> + </param> + + <param name="comparisonsToIncludeLow" type="select" optional="true" multiple="true" label="Select columns to consider for low filtering (keeping rows with higher value than a low threshold, ae. FC)" refresh_on_change="true" dynamic_options="get_column_names_filteredList(select_filtering['differentialAnalysis'].file_name,[0])"> + </param> + + <param name="valThresholdLow" type="float" value="0.0" optional="true" label="Low filtering threshold" help="When several comparisons are selected a conservative rule is applied (see details below)"> + </param> + + <param name="comparisonsToIncludeHigh" type="select" optional="true" multiple="true" label="Select columns to consider for high filtering (keeping rows with lower value than a high threshold, ae. p-value)" refresh_on_change="true" dynamic_options="get_column_names_filteredList(select_filtering['differentialAnalysis'].file_name,[0])"> + </param> + + <param name="valThresholdHigh" type="float" value="0.0" optional="true" label="High filtering threshold" help="When several columns are selected a conservative rule is applied (see details below)" > + </param> + </when> + <when value="geneList"> + <param type="data" format="tabular" name="geneListFile" label="List of genes to keep" multiple="false" help="Gene names should be the same as written in input file"/> + </when> + </conditional> + </when> + + <when value="output"> + <conditional name="select_filtering"> + <param name="filtering_stepBis_selector" type="select" label="Filter"> + <option value="diffExpParam" selected="true">Based on tabular file content</option> + <option value="geneList">From list of genes</option> + </param> + <when value="diffExpParam"> + <param type="data" name="differentialAnalysis" format="tabular" label="Tabular file containing filtering information" optional="false" multiple="false"> + </param> + + <param name="comparisonsToIncludeLow" type="select" optional="true" multiple="true" label="Select columns to consider for low filtering (keeping rows with higher value than a low threshold, ae. FC)" refresh_on_change="true" dynamic_options="get_column_names_filteredList(select_filtering['differentialAnalysis'].file_name,[0])"> + </param> + + <param name="valThresholdLow" type="float" value="0.0" optional="true" label="Low filtering threshold" help="When several comparisons are selected a conservative rule is applied (see details below)"> + </param> + + <param name="comparisonsToIncludeHigh" type="select" optional="true" multiple="true" label="Select columns to consider for high filtering (keeping rows with lower value than a high threshold, ae. p-value)" refresh_on_change="true" dynamic_options="get_column_names_filteredList(select_filtering['differentialAnalysis'].file_name,[0])"> + </param> + + <param name="valThresholdHigh" type="float" value="0.0" optional="true" label="High filtering threshold" help="When several columns are selected a conservative rule is applied (see details below)" > + </param> + </when> + <when value="geneList"> + <param type="data" format="tabular" name="geneListFile" label="List of genes to keep" multiple="false" help="Gene names should be the same as written in input file"/> + </when> + </conditional> + </when> + <when value="no"> + </when> + </conditional> + + </when> + </conditional> + + <section name="advSection" title="Advanced parameters" expanded="false"> + + <param name="clusterNumber" type="integer" value="5" label="Requested number of genes clusters" help="Use scree plot to adjust the number of genes clusters"> + <validator type="in_range" min="2" message="Cluster number should be greater than 1"/> + </param> + + <param name="conditionClusterNumber" type="integer" value="1" label="Requested number of conditions clusters (1 = no clustering)"> + <validator type="in_range" min="1" message="Cluster number should be greater than 0"/> + </param> + + <param name="dataTransformation" type="select" label="Apply mathematical transformation to data before clustering"> + <option value="no" selected="true">No</option> + <option value="log">Natural Logarithm</option> + <option value="log2">Base 2 Logarithm</option> + </param> + + <param name="distanceMeasure" type="select" label="Distance measure used for clustering" help="See documentation of 'Dist' R package for more information"> + <option value="euclidean" selected="true">euclidean</option> + <option value="manhattan">manhattan</option> + <option value="binary">binary</option> + <option value="pearson">pearson</option> + <option value="spearman">spearman</option> + <option value="kendall">kendall</option> + </param> + + <param name="aggloMethod" type="select" label="Agglomeration method used for clustering" help="See documentation of 'hclust' R method for more information"> + <option value="complete">complete</option> + <option value="median">median</option> + <option value="centroid">centroid</option> + <option value="average">average</option> + <option value="single">single</option> + <option value="mcquitty">mcquitty</option> + <option value="ward.D">ward1</option> + <option value="ward.D2" selected="true">ward2</option> + </param> + + <conditional name="conditionBarColor"> + <param name="conditionBarColor_selector" type="select" label="Add side bar color for samples/comparisons"> + <option value="no" selected="true">No</option> + <option value="yes">Yes please</option> + </param> + <when value="yes"> + <param type="data" name="factorFileDataBarPlot" format="tabular" label="Factors file" multiple="false" help="Available only for expression data clustering"/> + <param name="factorToUse" type="select" optional="false" multiple="false" label="Select factor to use for coloring side bar" refresh_on_change="true" dynamic_options="get_column_names(conditionBarColor['factorFileDataBarPlot'].file_name,0)"> + <validator type="empty_field" message="You should specify one factor"></validator> + </param> + <param name="sideBarPalette" type="select" label="Side bar color palette"> + <option value="Spectral" selected="true">Spectral</option> + <option value="Set1">Set1</option> + <option value="Set2">Set2</option> + <option value="Set3">Set3</option> + <option value="RdYlBu">RdYlBu</option> + <option value="RdYlGn">RdYlGn</option> + <option value="PiYG">PiYG</option> + </param> + </when> + <when value="no"> + </when> + </conditional> + + <param name="maxSampleToPlot" type="integer" value="1000" label="Maximum gene number to plot"> + <validator type="in_range" min="2" message="The number should be greater than 1"/> + </param> + + <conditional name="select_color"> + <param type="boolean" name="specifyColors" checked="false" label="Personalized heatmap colors"> + </param> + <when value="true"> + <param name="featureMin_color" type="color" label="Min value color" value="#ff00ff"> + </param> + + <param name="featureMedium_color" type="color" label="Medium value color" value="#4455ff"> + </param> + + <param name="featureMax_color" type="color" label="Max value color" value="#00ffff"> + </param> + </when> + <when value="false"> + </when> + </conditional> + + <param type="select" name="imageFormat" display="radio" label="Output format"> + <option value="png">PNG format</option> + <option value="pdf">PDF format</option> + </param> + <param type="select" name="imagePlotlyFormat" display="radio" label="Html snapshot format"> + <option value="png">PNG format</option> + <option value="svg">SVG format</option> + </param> + <param name="scaleSnapshot" type="float" value="1.0" label="Scale html snapshots to increase resolution" help="Minimum value is 1.0 (default resolution)" > + <validator type="in_range" min="1.0" exclude_min="false" message="Scale should be greater than 1"/> + </param> + </section> + + </inputs> + + + + <outputs> + <data format="tabular" name="outputData" label="${title}_ClusteringResults"/> + + <data format="html" name="html_file" label="${title}_HTML.html"/> + <!-- + <collection name="outputHeatmap" label="${title}_Heatmap" type="list"> + <discover_datasets pattern="(?P<designation>Heatmap.*)\.(?P<ext>[^\._]+)?" directory="plotDir" visible="false"/> + <discover_datasets pattern="(?P<designation>screePlot.*)\.(?P<ext>[^\._]+)?" directory="plotDir" visible="false"/> + <discover_datasets pattern="(?P<designation>circularPlot.*)\.(?P<ext>[^\._]+)?" directory="plotDir" visible="false"/> + </collection> + --> + <data format="txt" name="log" label="${title}_Log" /> + </outputs> + + + + <tests> + <test maxseconds="7200"> + <param name="dataToCluster_selector" value="expression" /> + <param name="expressionData" value="./NormalizedData.tabular" /> + <param name="filtering_step_selector" value="input" /> + <param name="filtering_stepBis_selector" value="diffExpParam" /> + <param name="differentialAnalysis" value="./LIMMAstatistics.tabular" /> + <param name="comparisonsToInclude" value="WT*WY14643-KO*WY14643" /> + <param name="FCthreshold" value="1.2" /> + <param name="pvalThreshold" value="0.05" /> + <output name="log" file="./HierarchicalClustering/ExpressionClustering.log" lines_diff="6" /> + </test> + <test maxseconds="7200"> + <param name="dataToCluster_selector" value="foldChange" /> + <param name="differentialAnalysis" value="./LIMMAstatistics.tabular" /> + <param name="comparisonsToInclude" value="WT*WY14643+KO*WY14643-WT*Control-KO*Control,WT*WY14643+WT*Control-KO*WY14643-KO*Control" /> + <param name="filtering_step_selector" value="output" /> + <param name="filtering_stepBis_selector" value="diffExpParam" /> + <param name="FCthreshold" value="1.2" /> + <param name="pvalThreshold" value="0.05" /> + <output name="outputData" file="./HierarchicalClustering/foldChangeClustering.tabular" /> + <output name="log" file="./HierarchicalClustering/foldChangeClustering.log" lines_diff="6" /> + </test> +</tests> + + + + <help> + <![CDATA[ +**What it does** + +Run hierarchical clustering on gene expression data or differential expression analysis (from arrays and RNA-seq studies) and diplay correponding heatmap. + +----- + +**Parameters** + +\- **Title** to personalize output file names (please avoid special characters). + +\- **Data to cluster**, genes can be clustered based on : expression data, results from differential analysis tool or any tabular file content. + + +- **Expression data** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers). + + :: + + Conditions 157_(HuGene-2_0-st).CEL 156_(HuGene-2_0-st).CEL 155_(HuGene-2_0-st).CEL 154_(HuGene-2_0-st).CEL + DDX11L2 4.500872 4.429759 4.780281 4.996189 + MIR1302-2 3.415065 3.520472 3.471503 3.567988 + OR4F5 3.737956 3.011586 3.424494 3.497545 + VWA1 5.189621 5.129595 4.806793 5.227014 + + +- **Differential expression analysis results** with contrasts statistics (p-val, FDR p-val, FC, log2(FC) and t-statistic) as columns and genes as rows (first and second rows contain comparison definition and first and second columns contain gene identifiers and functional informations). Please respect the GIANT-Differential Expression Analysis tool output format. + + :: + + LIMMA comparison WT*Treat WT*Treat WT*Treat WT*Treat WT*Treat + Gene Info p-val FDR.p-val FC log2(FC) t-stat + ARSD na 0.0057 0.41 0.8389 -0.2534 -5.175 + TTTY10 na 1.6e-07 0.0074 0.6403 -0.6432 -6.122 + MIR548AL na 0.072 0.2914 1.711 0.775 10.43 + + \- **Comparisons to cluster** when clustering is performed on differential results, log2(FC) values of selected comparisons will be used. + +- **Generic tabular data** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers). + + :: + + Conditions SampleA SampleB SampleC SampleD + DDX11L2 4.500872 4.429759 4.780281 4.996189 + MIR1302-2 3.415065 3.520472 3.471503 3.567988 + OR4F5 3.737956 3.011586 3.424494 3.497545 + VWA1 5.189621 5.129595 4.806793 5.227014 + + \- **Samples to cluster** when clustering is performed on generic data, user have to select the columns to consider in clustering (first column, containing gene identifiers, will be automatically selected). + + +\- **Reorder samples** (only available for expression and generic data clustering). + +- **Based on a factors file**, samples will be sorted in an alphabetical/numerical order for the selected factors. Names in the 1st column of the factors file have to match with the columns names of the data to cluster. + + :: + + Conditions Sex Treatment Reaction + 154_(HuGene-2_0-st).CEL 1 TreatA Pos + 156_(HuGene-2_0-st).CEL 0 NoTreat Pos + 157_(HuGene-2_0-st).CEL 0 TreatB Neg + 155_(HuGene-2_0-st).CEL 0 NoTreat Neg + +\- **Genes filtering** can be applied before or after clustering step. + +- **Filtering before clustering** allows to restrict clustering to differentially expressed genes using differential analysis results (available for expression data and differential results clustering) or any generic file (available for generic data clustering). As an alternative, a specific gene list file can be directly used for filtering. + +- **Filtering after clustering** will have no effect on clustering or generated heatmaps. This filter is only applied to generated tabular files to keep differentially expressed genes (using differential analysis file or any generic file) or specific user defined genes (using gene list file). + +\- **Filter approaches** : three filtering strategies can be applied before/after clustering depending on the nature of clustered data. These strategies use : differential analysis results (available for expression data and differential results clustering), generic file content (available for generic data clustering) or a gene list file (available for any input data). + +- **From differential analysis results** to filter genes based on fold change and FDR p-val for selected comparisons. + + \- **Differential expression results file** is requested only for expression data clustering. For differential results clustering, the same differential results file selected as "data to cluster" is used. (see "Data to cluster section" for requested format) + + \- **Comparisons to use** are requested only for expression data clustering. For differential results clustering, the same comparisons selected in "data to cluster" section will be used. If several comparisons are selected, genes that satisfy both fold change and FDR p-val thresholds in at least one of these comparisons are kept. + + \- **Fold change threshold** to use for filtering, genes with fold change >= threshold or fold change <= 1/threshold will be kept (set this threshold to 1 if you do not want to filter on fold change). + + \- **FDR p-val threshold** to use for filtering, genes with FDR p-val <= threshold will be kept (set this threshold to 1 if you do not want to filter on FDR p-val). + + +- **From generic tabular file** to filter genes based on selected columns values. + + \- **Generic tabular file** contains gene in the first column and various informations used for filtering in the following (same format as clustered generic tabular file). + + \- **Low filtering columns** used to discard rows with values below a given threshold (typically for Fold Change filtering). If several columns are selected, rows satisfying threshold condition in at least one of these columns are kept. + + \- **Low filtering threshold** below which the rows are discarded, the same threshold is applied for all selected columns. + + \- **High filtering columns** used to discard rows with values above a given threshold (typically for p-value filtering). If several columns are selected, rows satisfying threshold condition in at least one of these columns are kept. + + \- **High filtering threshold** above which the rows are discarded, the same threshold is applied for all selected columns. + +- **From list of genes** to focus on pre-identified genes. + + \- **Gene list file** with genes identifiers as one column file without header. + + :: + + DDX11L2 + VWA1 + TTTY10 + ARSD + +----- + +**Advanced parameters** + +\- **Genes cluster number** used by hierarchical clustering (minimum is 2). See generated screeplot to adjust this number before re-running a clustering. + +\- **Samples/comparisons clusters number** used by hierarchical clustering applied on columns/conditions. Set to 1 (ie. no clustering) if you need to conserve input columns order for visualization purposes. Columns clusters information is not included (yet) in output tabular file. + +\- **Mathematical transformation** can be applied to clustered data before clustering and visualization. Data used for the filtering step are not modified by this transformation. + +\- **Distance measure** used to cluster rows and columns. + +\- **Agglomeration method** used to cluster rows and columns. + +\- **Add side bar** to vizualize factor values for displayed columns/conditions, represented as a colored side bar in the heatmap. + +- **Factor file** that contains factor information for coloring (same format as the factor file used for input data columns reordering). + +- **Factor to use** to color side bar depending on its values for displayed columns/conditions. + +- **Color palette used** for coloring factor values (see RColorBrewer R package documentation for more information on proposed palettes). + +\- **Maximum gene number** : for readability and running time considerations only, number of displayed rows (genes) in heatmaps/circular plot can be limited. Clustering information in generated tabular file and scree plot are computed from a global clustering considering all genes (excepting those filtered out before clustering). Heatmap and circular plot are displayed for a random gene selection, to avoid such random selection we advise you to use input filtering option before clustering to have a gene number below this limit. + +\- **Personalized heatmap colors** to build color gradient choosing start, middle and end colors. + +\- **Output format** for circular plots only. + +\- **Html snapshot format** for interactive plotly plots. + +\- **Scale html snapshots** to increase resolution of snapshots taken from interactive plotly plots. + +----- + +**Outputs** + +\- **Tabular clustering file** containing cluster information for each gene satifying filtering steps. If expression or generic data was clustered, a two columns file is generated with gene identifiers and cluster numbers with possibly additional columns containing informations used for filtering. If differential results was clustered, a similar file is returned with an additional column containing cluster numbers and differential statistics coresponding to comparisons used for filtering. + +\- **HTML file** to access interactive version of heatmap and screeplot through PlotLy html pages, circular plot image and tabulated clustering results. As a reminder, when the number of genes to display in heatmap/circular plot exceeds the maximum gene number parameter, a random sampling is performed for plotting efficiency. Thus, clustering displayed on heatmap/circular plot may slighlty differ from clustering information contained in tabular file as heatmap/circular plot clustering is done over a subset of genes whereas tabular file contains clustering results performed on all genes. + +\- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file in any bug report. + +]]> + </help> + <citations> + <citation type="bibtex">@misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870} + }</citation> + + <citation type="bibtex">@article{, + author = {Galili, Tal and O'Callaghan, Alan and + Sidi, Jonathan and Sievert, Carson}, + title = {heatmaply: an R package for creating interactive cluster + heatmaps for online publishing}, + journal = {Bioinformatics}, + year = {2017}, + doi = {10.1093/bioinformatics/btx657}, + url = {http://dx.doi.org/10.1093/bioinformatics/btx657}, + eprint = + {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btx657/21358327/btx657.pdf} + }</citation> + + <citation type="bibtex">@article{doi:10.1093/bioinformatics/btu393, + author = {Gu, Zuguang and Gu, Lei and Eils, Roland and Schlesner, Matthias and Brors, Benedikt}, + title = {circlize implements and enhances circular visualization in R }, + journal = {Bioinformatics}, + volume = {30}, + number = {19}, + pages = {2811-2812}, + year = {2014}, + doi = {10.1093/bioinformatics/btu393}, + URL = {http://dx.doi.org/10.1093/bioinformatics/btu393}, + eprint = {/oup/backfile/content_public/journal/bioinformatics/30/19/10.1093_bioinformatics_btu393/2/btu393.pdf} + }</citation> + + <citation type="bibtex">@online{plotly, author = {Plotly Technologies Inc.}, title = {Collaborative data science}, publisher = {Plotly Technologies Inc.}, address = {Montreal, QC}, year = {2015}, url = {https://plot.ly} + }</citation> + + + </citations> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ExprPlotsScript.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,465 @@ +# A command-line interface to basic plots for use with Galaxy +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +#get options +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "dataFile", "i", 1, "character", + "factorInfo","t", 1, "character", + "dataFileFormat","j",1,"character", + "conditionNames","c",1,"character", + "format", "f", 1, "character", + "quiet", "q", 0, "logical", + "log", "l", 1, "character", + "histo" , "h", 1, "character", + "maPlot" , "a", 1, "character", + "boxplot" , "b", 1, "character", + "microarray" , "m", 1, "character", + "acp" , "p" , 1, "character", + "screePlot" , "s" , 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$dataFile) || is.null(opt$dataFileFormat)) { + addComment("[ERROR]'dataFile' and it format are required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$histo) & is.null(opt$maPlot) & is.null(opt$boxplot) & is.null(opt$microarray) & is.null(opt$acp)){ + addComment("[ERROR]Select at least one plot to draw",T,opt$log) + q( "no", 1, F ) +} + +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE} + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#silent package loading +suppressPackageStartupMessages({ + library("oligo") + library("ff") + library("ggplot2") + library("plotly") +}) + + +#chargement des fichiers en entrée +#fichier de type CEL +dataAreFromCel=FALSE +if(toupper(opt$dataFileFormat)=="CEL"){ + dataAreFromCel=TRUE + celData=read.celfiles(unlist(strsplit(opt$dataFile,","))) + #load all expressions + dataMatrix=exprs(celData) + #select "pm" probes + probeInfo=getProbeInfo(celData,probeType = c("pm"),target="probeset") + #reduce dataMatrix to log expression matrix for a randomly probe selection + dataMatrix=log2(dataMatrix[sample(unique(probeInfo[,1]),min(100000,length(unique(probeInfo[,1])))),]) + addComment("[INFO]Raw data are log2 transformed",TRUE,opt$log,display=FALSE) + remove(probeInfo) +}else{ + #fichier deja tabule + dataMatrix=read.csv(file=opt$dataFile,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames (to avoid X before colnames with header=T) + colNamesData=dataMatrix[1,-1] + dataMatrix=dataMatrix[-1,] + #remove first colum to convert it as rownames + rowNamesData=dataMatrix[,1] + dataMatrix=dataMatrix[,-1] + if(is.data.frame(dataMatrix)){ + dataMatrix=data.matrix(dataMatrix) + }else{ + dataMatrix=data.matrix(as.numeric(dataMatrix)) + } + dimnames(dataMatrix)=list(rowNamesData,colNamesData) + if(any(duplicated(rowNamesData)))addComment("[WARNING] several rows share the same probe/gene name, you should merge or rename them to avoid further analysis mistakes",TRUE,opt$log,display=FALSE) +} + +addComment("[INFO]Input data loaded",TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Dim of data matrix:",dim(dataMatrix)),T,opt$log,display=FALSE) + +#get number of conditions +nbConditions=ncol(dataMatrix) + +#get condition names if they are specified +if(!is.null(opt$conditionNames) && length(opt$conditionNames)==nbConditions){ + nameConditions=opt$conditionNames + colnames(dataMatrix)=nameConditions + #rownames(phenoData(celData)@data)=nameConditions + #rownames(protocolData(celData)@data)=nameConditions +}else{ + nameConditions=colnames(dataMatrix) +} + +#create a correspondance table between plot file names and name displayed in figure legend and html items +correspondanceNameTable=matrix("",ncol=2,nrow=nbConditions) +correspondanceNameTable[,1]=paste("Condition",1:nbConditions,sep="") +correspondanceNameTable[,2]=nameConditions +rownames(correspondanceNameTable)=correspondanceNameTable[,2] + +addComment("[INFO]Retreive condition names",TRUE,opt$log,display=FALSE) + +if(!is.null(opt$factorInfo)){ + #chargement du fichier des facteurs + factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(factorInfoMatrix)=factorInfoMatrix[1,] + factorInfoMatrix=factorInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(factorInfoMatrix)=factorInfoMatrix[,1] + + + if(length(setdiff(colnames(dataMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log) + q( "no", 1, F ) + } + + #order sample as in expression matrix and remove spurious sample + factorInfoMatrix=factorInfoMatrix[colnames(dataMatrix),] + + addComment("[INFO]Factors OK",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) + +} + +addComment("[INFO]Ready to plot",T,opt$log,display=FALSE) + + +##---------------------- + +###plot histograms### +histogramPerFigure=50 +if (!is.null(opt$histo)) { + for(iToPlot in 1:(((nbConditions-1)%/%histogramPerFigure)+1)){ + firstPlot=1+histogramPerFigure*(iToPlot-1) + lastPlot=min(nbConditions,histogramPerFigure*iToPlot) + dataToPlot=data.frame(x=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix))) + p <- ggplot(data=dataToPlot, aes(x = x, color=Experiment)) + stat_density(geom="line", size=1, position="identity") + + ggtitle("Intensity densities") + theme_bw() + ylab(label="Density") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + if(dataAreFromCel){ + #original ploting function + #hist(celData[,firstPlot:lastPlot],lty=rep(1,nbConditions)[firstPlot:lastPlot],lwd=2,which='pm',target="probeset",transfo=log2,col=rainbow(nbConditions)[firstPlot:lastPlot]) + p <- p + xlab(label="Log2 intensities") + }else{ + p <- p + xlab(label="Intensities") + } + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse="")) + } + print(p) + dev.off() + #save plotly files + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,iToPlot,".html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot) + addComment("[INFO]Histograms drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot MAplots### +MAplotPerPage=4 +if (!is.null(opt$maPlot)) { + iToPlot=1 + plotVector=list() + toTake=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix))) + refMedianColumn=rowMedians(as.matrix(dataMatrix[toTake,])) + if(length(toTake)>100000)addComment(c("[INFO]high number of input data rows ",length(toTake),"; the generation of MA plot can take a while, please be patient"),TRUE,opt$log,display=FALSE) + for (iCondition in 1:nbConditions){ + #MAplot(celData,which=i,what=pm,transfo=log2) + #smoothScatter(x=xToPlot,y=yToPlot,main=nameConditions[iCondition]) + dataA=dataMatrix[toTake,iCondition] + dataB=refMedianColumn####ATTENTION PAR DEFAUT + xToPlot=0.5*(dataA+dataB) + yToPlot=dataA-dataB + tempX=seq(min(xToPlot),max(xToPlot),0.1) + tempY=unlist(lapply(tempX,function(x){median(yToPlot[intersect(which(xToPlot>=(x-0.1/2)),which(xToPlot<(x+0.1/2)))])})) + + dataToPlot=data.frame(x=xToPlot,y=yToPlot) + dataMedianToPlot=data.frame(x=tempX,y=tempY) + p <- ggplot(data=dataToPlot, aes(x,y)) + stat_density2d(aes(fill = ..density..^0.25), geom = "tile", contour = FALSE, n = 100) + + scale_fill_continuous(low = "white", high = "dodgerblue4") + geom_smooth(data=dataMedianToPlot,colour="red", size=0.5, se=FALSE) + + ggtitle(correspondanceNameTable[iCondition,2]) + theme_bw() + xlab(label="") + ylab(label="") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$maPlot,"_",correspondanceNameTable[iCondition,1],".html"),collapse=""),selfcontained = F) + + if(iCondition==nbConditions || length(plotVector)==MAplotPerPage){ + #define a new plotting file + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$maPlot,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$maPlot,iToPlot,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=2) + dev.off() + if(iCondition<nbConditions){ + #prepare for a new plotting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } + } + remove(p,dataToPlot,dataA,dataB,toTake,xToPlot,yToPlot) + addComment("[INFO]MAplots drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot boxplots### +boxplotPerFigure=50 +if (!is.null(opt$boxplot)) { + for(iToPlot in 1:(((nbConditions-1)%/%boxplotPerFigure)+1)){ + firstPlot=1+boxplotPerFigure*(iToPlot-1) + lastPlot=min(nbConditions,boxplotPerFigure*iToPlot) + dataToPlot=data.frame(intensities=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix))) + #to make HTML file lighter, sampling will be done amongst outliers + #get outliers for each boxplot + boxplotsOutliers=apply(dataMatrix[,firstPlot:lastPlot],2,function(x)boxplot.stats(x)$out) + #sample amongst them to keep at maximum of 1000 points and include both min and max outliers values + boxplotsOutliers=lapply(boxplotsOutliers,function(x)if(length(x)>0)c(sample(c(x),min(length(x),1000)),max(c(x)),min(c(x)))) + dataOutliers=data.frame(yVal=unlist(boxplotsOutliers),xVal=unlist(lapply(seq_along(boxplotsOutliers),function(x)rep(names(boxplotsOutliers)[x],length(boxplotsOutliers[[x]]))))) + #plot boxplots without outliers + p <- ggplot(data=dataToPlot, aes(y = intensities, x=Experiment ,color=Experiment)) + geom_boxplot(outlier.colour=NA,outlier.shape =NA) + + ggtitle("Intensities") + theme_bw() + xlab(label="") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 45, hjust = 1),plot.margin=unit(c(10,10,max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x))))),15+max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x)))))),"mm")) + #add to plot sampled outliers + p <- p + geom_point(data=dataOutliers,aes(x=xVal,y=yVal,color=xVal),inherit.aes = F) + if(dataAreFromCel){ + #original plotting function + #boxplot(celData[,firstPlot:lastPlot],which='pm',col=rainbow(nbConditions)[firstPlot:lastPlot],target="probeset",transfo=log2,names=nameConditions[firstPlot:lastPlot],main="Intensities") + p <- p + ylab(label="Log2 intensities") + }else{ + p <- p + ylab(label="Intensities") + } + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$boxplot,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$boxplot,iToPlot,".png"),collapse="")) + } + print(p) + dev.off() + #save plotly files + pp <- ggplotly(p) + + #modify plotly object to get HTML file not too heavy for loading + for(iData in 1:length(pp$x$data)){ + ##get kept outliers y values + #yPointsToKeep=dataOutliers$yVal[which(dataOutliers$xVal==pp$x$data[[iData]]$name)] + if(pp$x$data[[iData]]$type=="scatter"){ + ##scatter plot represent outliers points added to boxplot through geom_point + ##nothing to do as outliers have been sampled allready, just have to modify hover text + #if(length(yPointsToKeep)>0){ + #pointsToKeep=which(pp$x$data[[iData]]$y %in% yPointsToKeep) + #pp$x$data[[iData]]$x=pp$x$data[[iData]]$x[pointsToKeep] + #pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[pointsToKeep] + #pp$x$data[[iData]]$text=pp$x$data[[iData]]$text[pointsToKeep] + #}else{ + #pp$x$data[[iData]]$x=NULL + #pp$x$data[[iData]]$y=NULL + #pp$x$data[[iData]]$marker$opacity=0 + #pp$x$data[[iData]]$hoverinfo=NULL + #pp$x$data[[iData]]$text=NULL + #} + #modify text to display + if(dataAreFromCel){ + pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("log2(intensity) ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = "")))) + }else{ + pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("intensity ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = "")))) + } + }else{ + ##disable marker plotting to keep only box and whiskers plot (outliers are displayed through scatter plot) + pp$x$data[[iData]]$marker$opacity=0 + + #sample 50000 points amongst all data to get a lighter html file, sampling size should not be too low to avoid modifying limit of boxplots + pp$x$data[[iData]]$y=c(sample(dataMatrix[,pp$x$data[[iData]]$name],min(length(dataMatrix[,pp$x$data[[iData]]$name]),50000)),min(dataMatrix[,pp$x$data[[iData]]$name]),max(dataMatrix[,pp$x$data[[iData]]$name])) + pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y)) + + ##first remove outliers info + #downUpValues=boxplot.stats(dataMatrix[,pp$x$data[[iData]]$name])$stats + #if(verbose)addComment(c("filter values for boxplot",pp$x$data[[iData]]$name,"between",min(downUpValues),"and",max(downUpValues)),T,opt$log) + #pointsToRemove=which(pp$x$data[[iData]]$y<min(downUpValues)) + #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove] + #pointsToRemove=which(pp$x$data[[iData]]$y>max(downUpValues)) + #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove] + #then add sampled outliers info + #pp$x$data[[iData]]$y=c(yPointsToKeep,pp$x$data[[iData]]$y) + #pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y)) + } + } + + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$boxplot,iToPlot,".html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot) + addComment("[INFO]Boxplots drawn",T,opt$log,display=FALSE) + +} + +##---------------------- + +###plot microarrays (only for .CEL files)### +if (!is.null(opt$microarray) && dataAreFromCel) { + for (iCondition in 1:nbConditions){ + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".pdf"),collapse=""),onefile = F,width = 5,height = 5)}else{ + png(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".png"),collapse="")) + } + image(celData[,iCondition],main=correspondanceNameTable[iCondition,2]) + dev.off() + } + addComment("[INFO]Microarray drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot PCA plot### +if (!is.null(opt$acp)){ + ##to avoid error when nrow is too large, results quite stable with 200k random selected rows + randomSelection=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix))) + #remove constant variables + + dataFiltered=dataMatrix[randomSelection,] + toRemove=which(unlist(apply(dataFiltered,1,var))==0) + if(length(toRemove)>0){ + dataFiltered=dataFiltered[-toRemove,] + } + ##geom_text(aes(label=Experiments,hjust=1, vjust=1.3), y = PC2+0.01) + PACres = prcomp(t(dataFiltered),scale.=TRUE) + + if(!is.null(opt$screePlot)){ + #scree plot + #p <- fviz_eig(PACres) + dataToPlot=data.frame(compo=seq(1,length(PACres$sdev)),var=(PACres$sdev^2/sum(PACres$sdev^2))*100) + p<-ggplot(data=dataToPlot, aes(x=compo, y=var)) + geom_bar(stat="identity", fill="steelblue") + geom_line() + geom_point() + + ggtitle("Scree plot") + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + + xlab(label="Dimensions") + ylab(label="% explained variances") + scale_x_discrete(limits=dataToPlot$compo) + pp <- ggplotly(p) + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$screePlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$screePlot,".png"),collapse="")) + } + plot(p) + dev.off() + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$screePlot,".html"),collapse=""),selfcontained = F) + } + + #now plot pca plots + + if(!is.null(opt$factorInfo)){ + fileIdent="" + symbolset = c("circle","cross","square","diamond","circle-open","square-open","diamond-open","x") + + #save equivalence between real factor names and generic ones in correspondanceNameTable + correspondanceNameTable=rbind(correspondanceNameTable,matrix(c(paste("Factor",1:(ncol(factorInfoMatrix)-1),sep=""),colnames(factorInfoMatrix)[-1]),ncol=2,nrow=ncol(factorInfoMatrix)-1)) + rownames(correspondanceNameTable)=correspondanceNameTable[,2] + + #first order factors from decreasing groups number + orderedFactors=colnames(factorInfoMatrix)[-1][order(unlist(lapply(colnames(factorInfoMatrix)[-1],function(x)length(table(factorInfoMatrix[,x])))),decreasing = T)] + allFactorsBigger=length(table(factorInfoMatrix[,orderedFactors[length(orderedFactors)]]))>length(symbolset) + if(allFactorsBigger)addComment("[WARNING]All factors are composed of too many groups to display two factors at same time, each PCA plot will display only one factor groups",T,opt$log,display=FALSE) + for(iFactor in 1:length(orderedFactors)){ + #if it is the last factor of the list or if all factor + if(iFactor==length(orderedFactors) || allFactorsBigger){ + if(length(orderedFactors)==1 || allFactorsBigger){ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=",")))) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + fileIdent=correspondanceNameTable[orderedFactors[iFactor],1] + #add text label to plot + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + #save the plotly plot + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F) + } + }else{ + for(iFactorBis in (iFactor+1):length(orderedFactors)){ + if(length(table(factorInfoMatrix[,orderedFactors[iFactorBis]]))<=length(symbolset)){ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], Attribute2=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactorBis]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=",")))) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),symbol=~Attribute2,symbols = symbolset,hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + fileIdent=paste(correspondanceNameTable[orderedFactors[c(iFactor,iFactorBis)],1],collapse="_AND_") + #add text label to plot + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + #save the plotly plot + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F) + }else{ + addComment(c("[WARNING]PCA with",orderedFactors[iFactor],"and",orderedFactors[iFactorBis],"groups cannot be displayed, too many groups (max",length(symbolset),")"),T,opt$log,display=FALSE) + } + } + } + } + }else{ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x)) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers",marker=list(size=5,color="salmon"),hoverinfo = 'text',text = ~paste(Experiments))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + + #save plotly files + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_plot.html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot,dataFiltered) + addComment("[INFO]ACP plot drawn",T,opt$log,display=FALSE) +} + +#write correspondances between plot file names and displayed names in figure legends, usefull to define html items in xml file +write.table(correspondanceNameTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F) + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) +#sessionInfo()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/General_functions.py Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,206 @@ +import re +import numpy as np + +def get_column_names( file_path, toNotConsider=-1, each=1): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i!=toNotConsider:#to squeeze the first column + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + return options + +def get_column_names_filteredList( file_path, toNotConsider=[], each=1): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i not in toNotConsider:#to squeeze the first columns + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + return options + +def get_column_names_mergeNumber(file_path, numberToMerge=1, toNotConsider=[]): + options=[] + inputfile = open(file_path) + if int(numberToMerge)>0: + iHeader=0 + for iCurrentLine in inputfile: + iHeader=iHeader+1 + if iHeader>int(numberToMerge): + break + currentLine=iCurrentLine.strip().split("\t") + iOption=-1 + for i, field_component in enumerate( currentLine ): + if i not in toNotConsider:#to squeeze specified columns + iOption=iOption+1 + if iHeader==1: + options.append( ( str(field_component), str(field_component), False ) ) + else: + options[iOption]=(options[iOption][0]+"_"+str(field_component),options[iOption][1]+"_"+str(field_component),False) + else: + currentLine = next(inputfile).strip().split("\t") + for i, field_component in enumerate( currentLine ): + if i not in toNotConsider:#to squeeze specified columns + options.append( ( "Column_"+str(i), "Column_"+str(i), False ) ) + inputfile.close() + return options + +def get_row_names( file_path, factorName ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumn=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorName:#to test + iColumn=i + options=[] + if iColumn!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if (nextLine[iColumn], nextLine[iColumn], False) not in options: + options.append( (nextLine[iColumn], nextLine[iColumn], False) ) + inputfile.close() + return options + +def get_condition_file_names( file_list, toNotConsider=-1, each=1): + options=[] + if not isinstance(file_list,list):#if input file is a tabular file, act as get_column_names + inputfile = open(file_list.file_name) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i!=toNotConsider:#to squeeze the first column + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + else:#if input file is a .cel file list or a collection + if not hasattr(file_list[0],'collection'):#if it is not a collection, get name easily + for i, field_component in enumerate( file_list ): + options.append( ( field_component.name, field_component.name, False ) ) + else:#if the file is a collection, have to get deeper in the corresponding HistoryDatasetCollectionAssociation object + for i, field_component in enumerate( file_list[0].collection.elements ): + options.append( ( field_component.element_identifier, field_component.element_identifier, False ) ) + return options + +def generateFactorFile( file_list, factor_list, outputFileName, logFile): + forbidenCharacters={"*",":",",","|"} + outputfile = open(outputFileName, 'w') + outputLog = open(logFile, 'w') + sampleList=[] + if not isinstance(file_list,list): + conditionNames=get_condition_file_names(file_list,0) #unique expression file, remove the first column (index=0) + else : + conditionNames=get_condition_file_names(file_list) #.CEL files + for iSample, sample_component in enumerate (conditionNames): + sampleList.append(str(sample_component[1])) + outputLog.write("[INFO] "+str(len(sampleList))+" sample are detected as input\n") + globalDict=dict() + factorNameList=[] + firstLine="Conditions" + if len(factor_list)==0:#check if there is at least one factor available + outputLog.write("[ERROR] no factor was defined !\n") + return 1 + else: + for iFactor, factor_component in enumerate( factor_list ): + currentSampleList=list(sampleList) + currentFactor=str(factor_component['factorName']) + #check if factor name contains forbidden characters + for specialCharacter in forbidenCharacters: + if currentFactor.find(specialCharacter)!=-1: + outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in factor name : '"+currentFactor+"'\n") + return 4 + #check if factor allready named like that + if not globalDict.get(currentFactor) is None: + outputLog.write("[ERROR] '"+currentFactor+"' is used several times as factor name\n") + return 3 + globalDict[currentFactor]=dict() + firstLine=firstLine+"\t"+currentFactor + factorNameList.append(currentFactor) + if len(factor_component['valueList'])<=1:#check if there is at least two value available + outputLog.write("[ERROR] at least two different values are necessary for '"+currentFactor+"' factor\n") + return 1 + else: + for iValue, value_component in enumerate( factor_component['valueList'] ): + currentValue=str(value_component['valueName']) + #check if factor name contains forbidden characters + for specialCharacter in forbidenCharacters: + if currentValue.find(specialCharacter)!=-1: + outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in value name : '"+currentValue+"'\n") + return 4 + currentSample=str(value_component['valueConditions']).split(",") + for iSample, sample_component in enumerate (currentSample): + if not sample_component in currentSampleList: + outputLog.write("[ERROR] sample "+sample_component+" was assigned several times for factor '"+currentFactor+"'\n") + return 2 + currentSampleList.remove(sample_component) + globalDict[currentFactor][sample_component]=currentValue + if(len(currentSampleList)>0): + outputLog.write("[ERROR] for factor '"+currentFactor+"'' sample "+str(currentSampleList)+" are not assigned to any value\n") + return 2 + outputLog.write("[INFO] "+str(len(globalDict))+" factors are detected\n") + #start writing the factor file + outputfile.write(firstLine+"\n") + for iSample, sample_component in enumerate(sampleList): + newLine=sample_component + for iFactor, factor_component in enumerate(factorNameList): + newLine=newLine+"\t"+globalDict[factor_component][sample_component] + outputfile.write(newLine+"\n") + outputfile.close() + outputLog.close() + return 0 + +def selectSubSetTable(file_path,headerLine_number,columnsToAdd,columnNamesToKeep,outputFileName,logFile): + outputLog = open(logFile, 'w') + outputLog.write("[INFO] header line number : "+ headerLine_number+" lines\n") + availableColumnsTuple=get_column_names_mergeNumber(file_path, headerLine_number) + #convert tuple list as a simple array + availableColumns=[] + for iTuple, tuple_content in enumerate (availableColumnsTuple): + availableColumns.append(str(tuple_content[0])) + if len(availableColumns)==0: + outputLog.write("[ERROR] No detected columns in input file\n") + return 1 + selectedColumns=list(columnsToAdd) + for iVolcano, volcano_content in enumerate(columnNamesToKeep): + selectedColumns.append(availableColumns.index(volcano_content['pvalColumn'])) + if volcano_content['fdrColumn'] in availableColumns: + selectedColumns.append(availableColumns.index(volcano_content['fdrColumn'])) + else: + selectedColumns.append(0) + selectedColumns.append(availableColumns.index(volcano_content['fcColumn'])) + if len(selectedColumns)!=(3*len(columnNamesToKeep)+len(columnsToAdd)): + outputLog.write("[ERROR] matching between input file colnames and requested column names failed\n") + return 1 + outputLog.write("[INFO] columns kept : "+str(selectedColumns)+"\n") + #start writting formatted file + inputfile = open(file_path) + outputfile = open(outputFileName, 'w') + iLineCpt=-1 + for iCurrentLine in inputfile: + iLineCpt=iLineCpt+1 + if iLineCpt>=int(headerLine_number): + currentLineFields=np.array(iCurrentLine.strip().split("\t")) + newLine="\t".join(currentLineFields[selectedColumns]) + outputfile.write(newLine+"\n") + if iLineCpt<int(headerLine_number): + outputLog.write("[ERROR] not enough lines in input files ("+(iLineCpt+1)+" lines)\n") + return 1 + inputfile.close() + outputfile.close() + outputLog.close() + return 0 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/LIMMA_options.py Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,330 @@ +import re + +def get_column_names( file_path, toNotConsider=None, toNotConsiderBis=None): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for i, field_component in enumerate( firstLine ): + if i!=0 and field_component!=toNotConsider and field_component!=toNotConsiderBis:#to squeeze the first column + options.append( ( field_component, field_component, False ) ) + inputfile.close() + return options + +def get_row_names( file_path, factorName ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumn=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorName:#to test + iColumn=i + options=[] + if iColumn!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if (nextLine[iColumn], nextLine[iColumn], False) not in options: + options.append( (nextLine[iColumn], nextLine[iColumn], False) ) + inputfile.close() + return options + +def get_row_names_interaction( file_path, factorNameA, factorNameB ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumnA=-1 + iColumnB=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorNameA:#to test + iColumnA=i + if field_component==factorNameB:#to test + iColumnB=i + possibleValuesA=[] + possibleValuesB=[] + if iColumnA!=-1 and iColumnB!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if nextLine[iColumnA] not in possibleValuesA: + possibleValuesA.append(nextLine[iColumnA]) + if nextLine[iColumnB] not in possibleValuesB: + possibleValuesB.append(nextLine[iColumnB]) + inputfile.close() + options=[] + if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None": + for counterA in range(len(possibleValuesA)): + for counterB in range(len(possibleValuesB)): + options.append( (possibleValuesA[counterA]+"*"+possibleValuesB[counterB], possibleValuesA[counterA]+"*"+possibleValuesB[counterB], False) ) + return options + +def get_comparisonsA( factorA, valuesA ): + options=[] + formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA)) + possibleValues=formatValuesA.split("', u'") + if len(possibleValues)>=2: + for counter in range(len(possibleValues)-1): + for innerCounter in range(counter+1,len(possibleValues)): + options.append( (possibleValues[counter]+" - "+possibleValues[innerCounter], possibleValues[counter]+" - "+possibleValues[innerCounter], False) ) + options.append( (possibleValues[innerCounter]+" - "+possibleValues[counter], possibleValues[innerCounter]+" - "+possibleValues[counter], False) ) + return options + +def get_comparisonsAB(factorA, valuesA, factorB, valuesB, interaction): + options=[] + formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA)) + possibleValuesA=formatValuesA.split("', u'") + formatValuesB=re.sub("(^\[u')|('\]$)","", str(valuesB)) + possibleValuesB=formatValuesB.split("', u'") + if str(interaction)=="False": + if len(possibleValuesA)>=2: + for counter in range(len(possibleValuesA)-1): + for innerCounter in range(counter+1,len(possibleValuesA)): + options.append( (possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], False) ) + options.append( (possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], False) ) + if len(possibleValuesB)>=2: + for counter in range(len(possibleValuesB)-1): + for innerCounter in range(counter+1,len(possibleValuesB)): + options.append( (possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], False) ) + options.append( (possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], False) ) + else: + if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None": + for counterA in range(len(possibleValuesA)): + for innerCounterA in range(len(possibleValuesA)): + for counterB in range(len(possibleValuesB)): + for innerCounterB in range(len(possibleValuesB)): + if not(counterA==innerCounterA and counterB==innerCounterB): + options.append( ("("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")","("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")", False) ) + return options + +def get_row_names_allInteractions( file_path, factorSelected): + formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected)) + factorsList=formatFactors.split("', u'") + iColumn=[None] * len(factorsList) + valuesList=[None] * len(factorsList) + + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for iField, fieldComponent in enumerate( firstLine ): + for iFactor, factorComponent in enumerate(factorsList): + if fieldComponent==factorComponent: + iColumn[iFactor]=iField + valuesList[iFactor]=[] + + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + for iFactor, factorComponent in enumerate(factorsList): + if nextLine[iColumn[iFactor]] not in valuesList[iFactor]: + valuesList[iFactor].append(nextLine[iColumn[iFactor]]) + inputfile.close() + + allCombinations=[] + for iFactor, factorComponent in enumerate(factorsList): + if iFactor==0: + allCombinations=valuesList[iFactor] + else: + currentCombinations=allCombinations + allCombinations=[] + for iValue, valueComponent in enumerate(valuesList[iFactor]): + for iCombination, combination in enumerate(currentCombinations): + allCombinations.append(combination+"*"+valueComponent) + + options=[] + for iCombination, combination in enumerate(allCombinations): + options.append((combination,combination,False)) + + return options + +def get_allrow_names( file_path, factorSelected ): + formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected)) + factorsList=formatFactors.split("', u'") + iColumn=[None] * len(factorsList) + valuesList=[None] * len(factorsList) + + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for iField, fieldComponent in enumerate( firstLine ): + for iFactor, factorComponent in enumerate(factorsList): + if fieldComponent==factorComponent: + iColumn[iFactor]=iField + valuesList[iFactor]=[] + + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + for iFactor, factorComponent in enumerate(factorsList): + if nextLine[iColumn[iFactor]] not in valuesList[iFactor]: + valuesList[iFactor].append(nextLine[iColumn[iFactor]]) + inputfile.close() + + allValues=[] + for iFactor, factorComponent in enumerate(factorsList): + for iValue, valueComponent in enumerate(valuesList[iFactor]): + allValues.append(factorComponent+":"+valueComponent) + + options=[] + for iValue, valueComponent in enumerate(allValues): + options.append((valueComponent,valueComponent,False)) + + return options + +def replaceNamesInFiles(expressionFile_name,conditionFile_name,outputExpressionFile,outputConditionFile,ouputDictionnary): + dico={} + forbidenCharacters={"*",":",",","|"} + ##start with expression file, read only the first line + inputfile = open(expressionFile_name) + outputfile = open(outputExpressionFile, 'w') + firstLine = next(inputfile).rstrip().split("\t") + iCondition=1 + newFirstLine="" + for i, field_component in enumerate( firstLine ): + if (i>0): + #conditions names should not be redundant with other conditions + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newFirstLine+="\t"+"Condition"+str(iCondition) + iCondition+=1 + else: + raise NameError('condition name allready exists!') + else: + newFirstLine+=field_component + outputfile.write(newFirstLine+"\n") + for line in inputfile: + outputfile.write(line) + outputfile.close() + inputfile.close() + #then parse condition file, read all lines in this case + inputfile = open(conditionFile_name) + outputfile = open(outputConditionFile, 'w') + firstLine=1 + iFactor=1 + iValue=1 + for line in inputfile: + currentLine = line.rstrip().split("\t") + newCurrentLine="" + for i, field_component in enumerate( currentLine ): + #special treatment for the first line + if (firstLine==1): + if (i==0): + newCurrentLine=field_component + else: + #factor names should not be redundant with other factors or conditions + if(field_component not in dico): + dico[field_component]="Factor"+str(iFactor) + newCurrentLine+="\t"+"Factor"+str(iFactor) + iFactor+=1 + else: + raise NameError('factor name allready exists!') + else: + if (i==0): + #check if condition name allready exist and used it if it is, or create a new one if not + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newCurrentLine="Condition"+str(iCondition) + iCondition+=1 + else: + newCurrentLine=dico[field_component] + else: + if(field_component not in dico): + dico[field_component]="Value"+str(iValue) + newCurrentLine+="\tValue"+str(iValue) + iValue+=1 + else: + newCurrentLine+="\t"+dico[field_component] + outputfile.write(newCurrentLine+"\n") + firstLine=0 + outputfile.close() + inputfile.close() + ##check if any entries in dictionnary contains forbiden character + for key, value in dico.items(): + for specialCharacter in forbidenCharacters: + if value.startswith("Condition")==False and key.find(specialCharacter)!=-1: + return 1 + ##then write dictionnary in a additional file + outputfile = open(ouputDictionnary, 'w') + for key, value in dico.items(): + outputfile.write(key+"\t"+value+"\n") + outputfile.close() + return 0 + + +def replaceNamesBlockInFiles(expressionFile_name,conditionFile_name,blockingFile_name,outputExpressionFile,outputConditionFile,outputBlockingFile,ouputDictionnary): + dico={} + forbidenCharacters={"*",":",",","|"} + ##start with expression file, read only the first line + inputfile = open(expressionFile_name) + outputfile = open(outputExpressionFile, 'w') + firstLine = next(inputfile).rstrip().split("\t") + iCondition=1 + newFirstLine="" + for i, field_component in enumerate( firstLine ): + if (i>0): + #conditions names should not be redundant with other conditions + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newFirstLine+="\t"+"Condition"+str(iCondition) + iCondition+=1 + else: + raise NameError('condition name allready exists!') + else: + newFirstLine+=field_component + outputfile.write(newFirstLine+"\n") + for line in inputfile: + outputfile.write(line) + outputfile.close() + inputfile.close() + #then parse condition file, read all lines in this case + iFactor=1 + iValue=1 + for fileNum in range(2): + if fileNum==0: + inputfile = open(conditionFile_name) + outputfile = open(outputConditionFile, 'w') + else: + inputfile = open(blockingFile_name) + outputfile = open(outputBlockingFile, 'w') + firstLine=1 + for line in inputfile: + currentLine = line.rstrip().split("\t") + newCurrentLine="" + for i, field_component in enumerate( currentLine ): + #special treatment for the first line + if (firstLine==1): + if (i==0): + newCurrentLine=field_component + else: + #factor names should not be redundant with other factors or conditions + if(field_component not in dico): + dico[field_component]="Factor"+str(iFactor) + newCurrentLine+="\t"+"Factor"+str(iFactor) + iFactor+=1 + else: + raise NameError('factor name allready exists!') + else: + if (i==0): + #check if condition name allready exist and used it if it is, or create a new one if not + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newCurrentLine="Condition"+str(iCondition) + iCondition+=1 + else: + newCurrentLine=dico[field_component] + else: + if(field_component not in dico): + dico[field_component]="Value"+str(iValue) + newCurrentLine+="\tValue"+str(iValue) + iValue+=1 + else: + newCurrentLine+="\t"+dico[field_component] + outputfile.write(newCurrentLine+"\n") + firstLine=0 + outputfile.close() + inputfile.close() + ##check if any entries in dictionnary contains forbiden character + for key, value in dico.items(): + for specialCharacter in forbidenCharacters: + if value.startswith("Condition")==False and key.find(specialCharacter)!=-1: + return 1 + ##then write dictionnary in a additional file + outputfile = open(ouputDictionnary, 'w') + for key, value in dico.items(): + outputfile.write(key+"\t"+value+"\n") + outputfile.close() + return 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/LIMMAscriptV4.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,1002 @@ +# A command-line interface for LIMMA to use with Galaxy +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "dataFile", "i", 1, "character", + "factorInfo","a", 1, "character", + "blockingInfo","b", 1, "character", + "dicoRenaming","g",1,"character", + "blockingPolicy","u", 1, "character", + "fdrThreshold","t", 1, "double", + "thresholdFC","d", 1, "double", + "format", "f", 1, "character", + "histo","h", 1, "character", + "volcano","v", 1, "character", + "factorsContrast","r", 1, "character", + "contrastNames","p", 1, "character", + "firstGroupContrast","m", 1, "character", + "secondGroupContrast","n", 1, "character", + "controlGroups","c", 1, "character", + "fratioFile","s",1,"character", + "organismID","x",1,"character", + "rowNameType","y",1,"character", + "quiet", "q", 0, "logical", + "log", "l", 1, "character", + "outputFile" , "o", 1, "character", + "outputDfFile" , "z", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required\n") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$dataFile)) { + addComment("[ERROR]'dataFile' is required",T,opt$log) + q( "no", 1, F ) +} +if (!is.null(opt$blockingInfo) && is.null(opt$blockingPolicy) ) { + addComment("[ERROR]blocking policy is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$dicoRenaming)) { + addComment("[ERROR]renaming dictionnary is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$factorsContrast)) { + addComment("[ERROR]factor informations are missing",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$firstGroupContrast)!=length(opt$secondGroupContrast)) { + addComment("[ERROR]some contrast groups seems to be empty",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$factorInfo)) { + addComment("[ERROR]factors info is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fdrThreshold)) { + addComment("[ERROR]'FDR threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile) || is.null(opt$outputDfFile)){ + addComment("[ERROR]'output files' are required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$thresholdFC)){ + addComment("[ERROR]'FC threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fratioFile)) { + addComment("[ERROR]F-ratio parameter is missing",T,opt$log) + q( "no", 1, F ) +} + +#demande si le script sera bavard +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE +} + +#paramètres internes +#pour savoir si on remplace les FC calculés par LIMMA par un calcul du LS-MEAN (ie moyenne de moyennes de chaque groupe dans chaque terme du contraste plutôt qu'une moyenne globale dans chaque terme) +useLSmean=FALSE + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#charge des packages silencieusement +suppressPackageStartupMessages({ + library("methods") + library("limma") + library("biomaRt") + library("ggplot2") + library("plotly") + library("stringr") + library("RColorBrewer") +}) + + +#chargement du fichier dictionnaire de renommage +renamingDico=read.csv(file=file.path(getwd(), opt$dicoRenaming),header=F,sep="\t",colClasses="character") +rownames(renamingDico)=renamingDico[,2] + + +#chargement des fichiers en entrée +expDataMatrix=read.csv(file=file.path(getwd(), opt$dataFile),header=F,sep="\t",colClasses="character") +#remove first row to convert it as colnames (to avoid X before colnames with header=T) +colNamesData=expDataMatrix[1,-1] +expDataMatrix=expDataMatrix[-1,] +#remove first colum to convert it as rownames +rowNamesData=expDataMatrix[,1] +expDataMatrix=expDataMatrix[,-1] +if(is.data.frame(expDataMatrix)){ + expDataMatrix=data.matrix(expDataMatrix) +}else{ + expDataMatrix=data.matrix(as.numeric(expDataMatrix)) +} +dimnames(expDataMatrix)=list(rowNamesData,colNamesData) + +#test the number of rows that are constant in dataMatrix +nbConstantRows=length(which(unlist(apply(expDataMatrix,1,var))==0)) +if(nbConstantRows>0){ + addComment(c("[WARNING]",nbConstantRows,"rows are constant across conditions in input data file"),T,opt$log,display=FALSE) +} + +#test if all condition names are present in dico +if(!all(colnames(expDataMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing condition names in renaming dictionary",T,opt$log) + q( "no", 1, F ) +} + +addComment("[INFO]Expression data loaded and checked",T,opt$log,display=FALSE) +addComment(c("[INFO]Dim of expression matrix:",dim(expDataMatrix)),T,opt$log,display=FALSE) + +#chargement du fichier des facteurs +factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character") +#remove first row to convert it as colnames +colnames(factorInfoMatrix)=factorInfoMatrix[1,] +factorInfoMatrix=factorInfoMatrix[-1,] +#use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case +rownames(factorInfoMatrix)=factorInfoMatrix[,1] + +if(length(setdiff(colnames(expDataMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log) + q( "no", 1, F ) +} + +#order sample as in expression matrix and remove spurious sample +factorInfoMatrix=factorInfoMatrix[colnames(expDataMatrix),] + +#test if all values names are present in dico +if(!all(unlist(factorInfoMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing factor names in renaming dictionary",T,opt$log) + q( "no", 1, F ) +} + +addComment("[INFO]Factors OK",T,opt$log,display=FALSE) +addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) + +##manage blocking factor +blockingFactor=NULL +blockinFactorsList=NULL +if(!is.null(opt$blockingInfo)){ + + #chargement du fichier des blocking factors + blockingInfoMatrix=read.csv(file=file.path(getwd(), opt$blockingInfo),header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(blockingInfoMatrix)=blockingInfoMatrix[1,] + blockingInfoMatrix=blockingInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(blockingInfoMatrix)=blockingInfoMatrix[,1] + + + if(length(setdiff(colnames(expDataMatrix),rownames(blockingInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in blocking factor file",T,opt$log) + q( "no", 1, F ) + } + + #order sample as in expression matrix + blockingInfoMatrix=blockingInfoMatrix[colnames(expDataMatrix),] + + #test if all blocking names are present in dico + if(!all(unlist(blockingInfoMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing blocking names in renaming dictionary",T,opt$log) + q( "no", 1, F ) + } + + #remove blocking factors allready present as real factors + blockingNotInMainFactors=setdiff(colnames(blockingInfoMatrix)[-1],colnames(factorInfoMatrix)[-1]) + + if(length(blockingNotInMainFactors)<(ncol(blockingInfoMatrix)-1))addComment("[WARNING]Blocking factors cannot be principal factors",T,opt$log,display=FALSE) + + if(length(blockingNotInMainFactors)>0){ + + blockingInfoMatrix=blockingInfoMatrix[,c(colnames(blockingInfoMatrix)[1],blockingNotInMainFactors)] + + groupBlocking=rep("c",ncol(expDataMatrix)) + #for each blocking factor + for(blockingFact in blockingNotInMainFactors){ + if(opt$blockingPolicy=="correlated"){ + indNewFact=as.numeric(factor(blockingInfoMatrix[,blockingFact])) + groupBlocking=paste(groupBlocking,indNewFact,sep="_") + }else{ + if(is.null(blockinFactorsList))blockinFactorsList=list() + blockinFactorsList[[blockingFact]]=factor(unlist(lapply(blockingInfoMatrix[,blockingFact],function(x)paste(c(blockingFact,"_",x),collapse="")))) + } + } + if(opt$blockingPolicy=="correlated"){ + blockingFactor=factor(groupBlocking) + if(length(levels(blockingFactor))==1){ + addComment("[ERROR]Selected blocking factors seems to be constant",T,opt$log) + q( "no", 1, F ) + } + } + addComment("[INFO]Blocking info OK",T,opt$log,display=FALSE) + }else{ + addComment("[WARNING]No blocking factors will be considered",T,opt$log,display=FALSE) + } +} + + +##rename different input parameters using renamingDictionary +opt$factorsContrast=renamingDico[unlist(lapply(unlist(strsplit(opt$factorsContrast,",")),function(x)which(renamingDico[,1]==x))),2] + +userDefinedContrasts=FALSE +if(!is.null(opt$firstGroupContrast) && !is.null(opt$secondGroupContrast)){ + userDefinedContrasts=TRUE + for(iContrast in 1:length(opt$firstGroupContrast)){ + opt$firstGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",") + opt$secondGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",") + } +} + +if(!is.null(opt$controlGroups)){ + renamedGroups=c() + for(iGroup in unlist(strsplit(opt$controlGroups,","))){ + renamedControlGroup=paste(renamingDico[unlist(lapply(unlist(strsplit(iGroup,":")),function(x)which(renamingDico[,1]==x))),2],collapse=":") + if(length(renamedControlGroup)==0 || any(which(unlist(gregexpr(text = renamedControlGroup,pattern = ":"))==-1))){ + addComment("[ERROR]Control groups for interaction seem to mismatch, please check them.",T,opt$log) + q( "no", 1, F ) + } + renamedGroups=c(renamedGroups,renamedControlGroup) + } + opt$controlGroups=renamedGroups +} +addComment("[INFO]Contrast variables are renamed to avoid confusion",T,opt$log,display=FALSE) +##renaming done + +#to convert factor as numeric value --> useless now ? +#expDataMatrix=apply(expDataMatrix,c(1,2),function(x)as.numeric(paste(x))) + +#get factors info for LIMMA +factorsList=list() +for(iFactor in opt$factorsContrast){ + if(!(iFactor %in% colnames(factorInfoMatrix))){ + addComment("[ERROR]Required factors are missing in input file",T,opt$log) + q( "no", 1, F ) + } + factorsList[[iFactor]]=factor(unlist(lapply(factorInfoMatrix[,iFactor],function(x)paste(c(iFactor,"_",x),collapse="")))) + if(length(levels(factorsList[[iFactor]]))==1){ + addComment("[ERROR]One selected factor seems to be constant",T,opt$log) + q( "no", 1, F ) + } +} + +#check if there is at least 2 factors to allow interaction computation +if(!is.null(opt$controlGroups) && length(factorsList)<2){ + addComment("[ERROR]You cannot ask for interaction with less than 2 factors",T,opt$log) + q( "no", 1, F ) +} + +#merge all factors as a single one +factorsMerged=as.character(factorsList[[opt$factorsContrast[1]]]) +for(iFactor in opt$factorsContrast[-1]){ + factorsMerged=paste(factorsMerged,as.character(factorsList[[iFactor]]),sep=".") +} +factorsMerged=factor(factorsMerged) + +#checked that coefficient number (ie. factorsMerged levels) is strictly smaller than sample size +if(length(levels(factorsMerged))>=length(factorsMerged)){ + addComment(c("[ERROR]No enough samples (",length(factorsMerged),") to estimate ",length(levels(factorsMerged))," coefficients"),T,opt$log) + q( "no", 1, F ) +} + +#get the sample size of each factor values +sampleSizeFactor=table(factorsMerged) + + +if(!is.null(blockinFactorsList)){ + factorString=c("blockinFactorsList[['", names(blockinFactorsList)[1],"']]") + for(blockingFact in names(blockinFactorsList)[-1]){ + factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]") + } + design = model.matrix(as.formula(paste(c("~ factorsMerged +",factorString," + 0"),collapse=""))) + + #rename design columns + coeffMeaning = levels(factorsMerged) + for(blockingFact in blockinFactorsList){ + coeffMeaning=c(coeffMeaning,levels(blockingFact)[-1]) + } + colnames(design) = coeffMeaning +}else{ + design = model.matrix(as.formula( ~ factorsMerged + 0)) + + #rename degin columns + coeffMeaning = levels(factorsMerged) + colnames(design) = coeffMeaning +} + +addComment(c("[INFO]Available coefficients: ",coeffMeaning),T,opt$log,display=F) + +estimableCoeff=which(colSums(design)!=0) + +addComment("[INFO]Design done",T,opt$log,display=F) + + #use blocking factor if exists +if(!is.null(blockingFactor)){ + corfit <- duplicateCorrelation(expDataMatrix, design, block=blockingFactor) + + addComment(c("[INFO]Correlation within groups: ",corfit$consensus.correlation),T,opt$log,display=F) + + #run linear model fit + data.fit = lmFit(expDataMatrix,design,block = blockingFactor, correlation=corfit$consensus.correlation) +}else{ + #run linear model fit + data.fit = lmFit(expDataMatrix,design) +} + +estimatedCoeff=which(!is.na(data.fit$coefficients[1,])) + +addComment("[INFO]Lmfit done",T,opt$log,display=F) + +#catch situation where some coefficients cannot be estimated, probably due to dependances between design columns +#if(length(setdiff(estimableCoeff,estimatedCoeff))>0){ +# addComment("[ERROR]Error in design matrix, check your group definitions",T,opt$log) +# q( "no", 1, F ) +#} +#to strong condition, should return ERROR only when coefficients relative to principal factors cannot be estimated, otherwise, return a simple WARNING + +#define requested contrasts +requiredContrasts=c() +humanReadingContrasts=c() +persoContrastName=c() +if(userDefinedContrasts){ + for(iContrast in 1:length(opt$firstGroupContrast)){ + posGroup=unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse="."))) + negGroup=unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse="."))) + #clear posGroup and negGroup from empty groups + emptyPosGroups=which(!(posGroup%in%coeffMeaning)) + if(length(emptyPosGroups)>0){ + addComment(c("[WARNING]The group(s)",posGroup[emptyPosGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE) + posGroup=posGroup[-emptyPosGroups] + currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],","))[-emptyPosGroups],collapse="+") + }else{ + currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),collapse="+") + } + emptyNegGroups=which(!(negGroup%in%coeffMeaning)) + if(length(emptyNegGroups)>0){ + addComment(c("[WARNING]The group(s)",negGroup[emptyNegGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE) + negGroup=negGroup[-emptyNegGroups] + currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))[-emptyNegGroups]),collapse="-") + }else{ + currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))),collapse="-") + } + if(length(posGroup)==0 || length(negGroup)==0 ){ + addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to empty group"),T,opt$log,display=FALSE) + }else{ + if(all(posGroup%in%negGroup) && all(negGroup%in%posGroup)){ + addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to null contrast"),T,opt$log,display=FALSE) + }else{ + #get coefficients required for first group added as positive + positiveCoeffWeights=sampleSizeFactor[posGroup]/sum(sampleSizeFactor[posGroup]) + #positiveCoeffWeights=rep(1,length(posGroup)) + #names(positiveCoeffWeights)=names(sampleSizeFactor[posGroup]) + #get coefficients required for second group added as negative + negativeCoeffWeights=sampleSizeFactor[negGroup]/sum(sampleSizeFactor[negGroup]) + #negativeCoeffWeights=rep(1,length(negGroup)) + #names(negativeCoeffWeights)=names(sampleSizeFactor[negGroup]) + #build the resulting contrast + currentContrast=paste(paste(positiveCoeffWeights[posGroup],posGroup,sep="*"),collapse="+") + currentContrast=paste(c(currentContrast,paste(paste(negativeCoeffWeights[negGroup],negGroup,sep="*"),collapse="-")),collapse="-") + requiredContrasts=c(requiredContrasts,currentContrast) + + #build the human reading contrast + humanReadingContrasts=c(humanReadingContrasts,currentHumanContrast) + if(!is.null(opt$contrastNames) && nchar(opt$contrastNames[iContrast])>0){ + persoContrastName=c(persoContrastName,opt$contrastNames[iContrast]) + }else{ + persoContrastName=c(persoContrastName,"") + } + + addComment(c("[INFO]Contrast added : ",currentHumanContrast),T,opt$log,display=F) + addComment(c("with complete formula ",currentContrast),T,opt$log,display=F) + } + } + } +} + + + #define the true formula with interactions to get interaction coefficients + factorString=c("factorsList[['", names(factorsList)[1],"']]") + for(iFactor in names(factorsList)[-1]){ + factorString=c(factorString," * factorsList[['",iFactor,"']]") + } + + if(!is.null(blockinFactorsList)){ + for(blockingFact in names(blockinFactorsList)){ + factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]") + } + } + + #should not be null at the end + allFtestMeanSquare=NULL + #to get the F-test values + estimatedInteractions=rownames(anova(lm(as.formula(paste(c("expDataMatrix[1,] ~ ",factorString),collapse=""))))) + estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x){temp=unlist(strsplit(x,"[ \" | : ]"));paste(temp[seq(2,length(temp),3)],collapse=":")})),estimatedInteractions[length(estimatedInteractions)]) + #rename estimated interaction terms using renamingDico + estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x)paste(renamingDico[unlist(strsplit(x,":")),1],collapse=":"))),estimatedInteractions[length(estimatedInteractions)]) + t <- unlist(apply(expDataMatrix,1,function(x){temp=anova(lm(as.formula(paste(c("x ~ ",factorString),collapse=""))))$`Mean Sq`;temp/temp[length(temp)]})) + allFtestMeanSquare <- t(matrix(t,nrow=length(estimatedInteractions))) + #remove from allFtest rows containing NA + if(length(which(is.na(allFtestMeanSquare[,1])))>0)allFtestMeanSquare=allFtestMeanSquare[-(which(is.na(allFtestMeanSquare[,1]))),] + colnames(allFtestMeanSquare)=estimatedInteractions + + #add contrasts corresponding to interaction terms + if(!is.null(opt$controlGroups)){ + #first load user defined control group for each factor + controlGroup=rep(NA,length(factorsList)) + names(controlGroup)=names(factorsList) + for(iGroup in opt$controlGroups){ + splitGroup=unlist(strsplit(iGroup,":")) + splitGroup[2]=paste(splitGroup[1],splitGroup[2],sep = "_") + #check if defined control group is really a level of the corresponding factor + if(!splitGroup[1]%in%names(controlGroup) || !splitGroup[2]%in%factorsList[[splitGroup[1]]]){ + addComment(c("[ERROR]The factor name",splitGroup[1],"does not exist or group name",splitGroup[2]),T,opt$log) + q( "no", 1, F ) + } + if(!is.na(controlGroup[splitGroup[1]])){ + addComment("[ERROR]Several control groups are defined for the same factor, please select only one control group for each factor if you want to compute interaction contrasts",T,opt$log) + q( "no", 1, F ) + } + controlGroup[splitGroup[1]]=splitGroup[2] + } + + #check if all factor have a defined control group + if(any(is.na(controlGroup))){ + addComment("[ERROR]Missing control group for some factors, please check them if you want to compute interaction contrasts",T,opt$log) + q( "no", 1, F ) + } + + nbFactors=length(factorsList) + interactionContrasts=c() + contrastClass=c() + #initialize list for the first level + newPreviousLoopContrast=list() + for(iFactorA in 1:(nbFactors-1)){ + nameFactorA=names(factorsList)[iFactorA] + compA=c() + for(levelA in setdiff(levels(factorsList[[iFactorA]]),controlGroup[nameFactorA])){ + compA=c(compA,paste(levelA,controlGroup[nameFactorA],sep="-")) + } + newPreviousLoopContrast[[as.character(iFactorA)]]=compA + } + #make a loop for growing interaction set + for(globalIfactor in 1:(nbFactors-1)){ + previousLoopContrast=newPreviousLoopContrast + newPreviousLoopContrast=list() + #factor A reuse contrasts made at previsous loop + for(iFactorA in names(previousLoopContrast)){ + compA=previousLoopContrast[[iFactorA]] + + if(max(as.integer(unlist(strsplit(iFactorA,"\\."))))<nbFactors){ + #factor B is the new factor to include in intreraction set + for(iFactorB in (max(as.integer(unlist(strsplit(iFactorA,"\\."))))+1):nbFactors){ + nameFactorB=names(factorsList)[iFactorB] + #keep contrasts identified trough interacting factors set + newPreviousLoopContrast[[paste(iFactorA,iFactorB,sep=".")]]=c() + for(iCompA in compA){ + for(levelB in setdiff(levels(factorsList[[iFactorB]]),controlGroup[nameFactorB])){ + #decompose the contrast compA to apply the new level of factor B on each term + temp=unlist(strsplit(iCompA,"[ + ]")) + splitCompA=temp[1] + for(iTemp in temp[-1])splitCompA=c(splitCompA,"+",iTemp) + splitCompA=unlist(lapply(splitCompA,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + #apply on each contrast term the new level of factor B + firstTerm=paste(unlist(lapply(splitCompA,function(x)if(x!="+" && x!="-"){paste(x,levelB,sep=".")}else{x})),collapse="") + secondTerm=negativeExpression(paste(unlist(lapply(splitCompA,function(x)if(x!="+" && x!="-"){paste(x,controlGroup[nameFactorB],sep=".")}else{x})),collapse="")) + currentContrast=paste(c(firstTerm,secondTerm),collapse="") + + newPreviousLoopContrast[[paste(iFactorA,iFactorB,sep=".")]]=c(newPreviousLoopContrast[[paste(iFactorA,iFactorB,sep=".")]],currentContrast) + } + } + } + } + } + for(iContrast in names(newPreviousLoopContrast)){ + contrastClass=c(contrastClass,rep(iContrast,length(newPreviousLoopContrast[[iContrast]]))) + } + interactionContrasts=c(interactionContrasts,unlist(newPreviousLoopContrast)) + } + #make human title for interactions + names(interactionContrasts)=contrastClass + humanReadingInteraction=unlist(lapply(interactionContrasts,function(x)gsub("\\.",":",unlist(strsplit(x,"[+-]"))[1]))) + + contrastToIgnore=c() + + #complete with control groups and order to match to coeffs + for(iContrast in 1:length(interactionContrasts)){ + missingFactor=setdiff(1:nbFactors,as.integer(unlist(strsplit(names(interactionContrasts[iContrast]),"\\.")))) + #decompose the contrast + temp=unlist(strsplit(interactionContrasts[iContrast],"[ + ]")) + splitContrast=temp[1] + for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp) + splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + for(iFactor in missingFactor){ + for(iTerm in 1:length(splitContrast)){ + if(splitContrast[iTerm]!="+" && splitContrast[iTerm]!="-"){ + splitTerm=unlist(strsplit(splitContrast[iTerm],"\\.")) + if(iFactor==1)splitContrast[iTerm]=paste(c(controlGroup[names(factorsList)[iFactor]],splitTerm),collapse=".") + if(iFactor==nbFactors)splitContrast[iTerm]=paste(c(splitTerm,controlGroup[names(factorsList)[iFactor]]),collapse=".") + if(iFactor>1 && iFactor<nbFactors)splitContrast[iTerm]=paste(c(splitTerm[1:(iFactor-1)],controlGroup[names(factorsList)[iFactor]],splitTerm[iFactor:length(splitTerm)]),collapse=".") + } + } + } + interactionContrasts[iContrast]=paste(splitContrast,collapse="") + if(all(splitContrast[seq(1,length(splitContrast),2)]%in%coeffMeaning)){ + addComment(c("[INFO]Interaction contrast added : ",humanReadingInteraction[iContrast]),T,opt$log,display=F) + addComment(c("with complete formula ",interactionContrasts[iContrast]),T,opt$log,display=F) + }else{ + contrastToIgnore=c(contrastToIgnore,iContrast) + addComment(c("[WARNING]Interaction contrast",humanReadingInteraction[iContrast],"is removed due to empty group"),T,opt$log,display=F) + } + } + + #add interaction contrasts to global contrast list + if(length(contrastToIgnore)>0){ + requiredContrasts=c(requiredContrasts,interactionContrasts[-contrastToIgnore]) + humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction[-contrastToIgnore]) + persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction[-contrastToIgnore]))) + }else{ + requiredContrasts=c(requiredContrasts,interactionContrasts) + humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction) + persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction))) + } + }#end of intreaction contrasts + + + #remove from requiredContrasts contrasts that cannot be estimated + toRemove=unique(unlist(lapply(setdiff(coeffMeaning,names(estimatedCoeff)),function(x)grep(x,requiredContrasts)))) + if(length(toRemove)>0){ + addComment(c("[WARNING]",length(toRemove)," contrasts are removed, due to missing coefficients"),T,opt$log,display=FALSE) + requiredContrasts=requiredContrasts[-toRemove] + humanReadingContrasts=humanReadingContrasts[-toRemove] + persoContrastName=persoContrastName[-toRemove] + } + + if(length(requiredContrasts)==0){ + addComment("[ERROR]No contrast to compute, please check your contrast definition.",T,opt$log) + q( "no", 1, F ) + } + + #compute for each contrast mean of coefficients in posGroup and negGroup for FC computation of log(FC) with LSmean as in Partek + meanPosGroup=list() + meanNegGroup=list() + for(iContrast in 1:length(requiredContrasts)){ + #define posGroup and negGroup + #first split contrast + temp=unlist(strsplit(requiredContrasts[iContrast],"[ + ]")) + splitContrast=temp[1] + for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp) + splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + #and then put each term in good group + posGroup=c() + negGroup=c() + nextIsPos=TRUE + for(iSplit in splitContrast){ + if(iSplit=="+")nextIsPos=TRUE + if(iSplit=="-")nextIsPos=FALSE + if(iSplit!="-" && iSplit!="+"){ + #remove weights of contrast terms + iSplitBis=unlist(strsplit(iSplit,"[*]")) + iSplitBis=iSplitBis[length(iSplitBis)] + if(nextIsPos)posGroup=c(posGroup,iSplitBis) + else negGroup=c(negGroup,iSplitBis) + } + } + #compute means for each group + meanPosGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,posGroup],ncol=length(posGroup)),1,mean) + meanNegGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,negGroup],ncol=length(negGroup)),1,mean) + } + + + contrast.matrix = makeContrasts(contrasts=requiredContrasts,levels=design) + data.fit.con = contrasts.fit(data.fit,contrast.matrix) + + addComment("[INFO]Contrast definition done",T,opt$log,T,display=FALSE) + + #compute LIMMA statistics + data.fit.eb = eBayes(data.fit.con) + + addComment("[INFO]Estimation done",T,opt$log,T,display=FALSE) + + #adjust p.value through FDR + data.fit.eb$adj_p.value=data.fit.eb$p.value + for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){ + data.fit.eb$adj_p.value[,iComparison]=p.adjust(data.fit.eb$p.value[,iComparison],"fdr") + } + + #add a new field based on LS-means for each contrast instead of global mean like they were calculated in coefficients field + data.fit.eb$coefficientsLS=data.fit.eb$coefficients + if(ncol(data.fit.eb$coefficients)!=length(meanPosGroup)){ + addComment("[ERROR]Estimated contrasts number unexpected",T,opt$log) + q( "no", 1, F ) + } + for(iContrast in 1:length(meanPosGroup)){ + data.fit.eb$coefficientsLS[,iContrast]=meanPosGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]-meanNegGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)] + } + + #if requested replace coefficient computed as global mean by LS-means values + if(useLSmean)data.fit.eb$coefficients=data.fit.eb$coefficientsLS + +addComment("[INFO]Core treatment done",T,opt$log,T,display=FALSE) + + +##convert humanReadingContrasts with namingDictionary to create humanReadingContrastsRenamed and keep original humanReadingContrasts names for file names +humanReadingContrastsRenamed=rep("",length(humanReadingContrasts)) +for(iContrast in 1:length(humanReadingContrasts)){ + if(persoContrastName[iContrast]==""){ + #if(verbose)addComment(humanReadingContrasts[iContrast]) + specialCharacters=str_extract_all(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]] + #if(verbose)addComment(specialCharacters) + nameConverted=unlist(lapply(strsplit(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]],function(x)renamingDico[x,1])) + #if(verbose)addComment(nameConverted) + humanReadingContrastsRenamed[iContrast]=paste(nameConverted,specialCharacters,collapse="",sep="") + #if(verbose)addComment(humanReadingContrastsRenamed[iContrast]) + humanReadingContrastsRenamed[iContrast]=substr(humanReadingContrastsRenamed[iContrast],1,nchar(humanReadingContrastsRenamed[iContrast])-1) + }else{ + humanReadingContrastsRenamed[iContrast]=persoContrastName[iContrast] + } +} + +#write correspondances between plot file names (humanReadingContrasts) and displayed names in figure legends (humanReadingContrastsRenamed), usefull to define html items in xml file +correspondanceTable=matrix("",ncol=2,nrow=ncol(data.fit.eb$p.value)) +correspondanceTable[,1]=unlist(lapply(humanReadingContrasts,function(x)gsub(":","_INT_",gsub("\\+","_PLUS_",gsub("\\*","_AND_",x))))) +correspondanceTable[,2]=humanReadingContrastsRenamed +rownames(correspondanceTable)=correspondanceTable[,2] +write.table(correspondanceTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F) + +#plot nominal p-val histograms for selected comparisons +histogramPerPage=6 +if (!is.null(opt$histo)) { + iToPlot=1 + plotVector=list() + nbComparisons=ncol(data.fit.eb$p.value) + for (iComparison in 1:nbComparisons){ + dataToPlot=data.frame(pval=data.fit.eb$p.value[,iComparison],id=rownames(data.fit.eb$p.value)) + p <- ggplot(data=dataToPlot, aes(x=pval)) + geom_histogram(colour="red", fill="salmon") + + theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="Frequencies") + xlab(label="Nominal p-val") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + plotVector[[length(plotVector)+1]]=p + + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F) + + if(iComparison==nbComparisons || length(plotVector)==histogramPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=2) + dev.off() + if(iComparison<nbComparisons){ + #prepare for a new plotting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } + } + addComment("[INFO]Histograms drawn",T,opt$log,T,display=FALSE) + +} + +#plot F-test sum square barplot +if(!is.null(allFtestMeanSquare)){ + dataToPlot=data.frame(Fratio=apply(allFtestMeanSquare,2,mean),Factors=factor(colnames(allFtestMeanSquare),levels = colnames(allFtestMeanSquare))) + + p <- ggplot(data=dataToPlot, aes(x=Factors, y=Fratio, fill=Factors)) + + geom_bar(stat="identity") + scale_fill_manual(values = colorRampPalette(brewer.pal(9,"Set1"))(ncol(allFtestMeanSquare))[sample(ncol(allFtestMeanSquare))]) + ylab(label="mean F-ratio") + + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + ggtitle("Source of variation") + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$fratioFile,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$fratioFile,".png"),collapse="")) + } + plot(p) + dev.off() + + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$fratioFile,".html"),collapse=""),selfcontained = F) + + addComment("[INFO]SumSquareTest drawn",T,opt$log,T,display=FALSE) +} + +#plot VOLCANO plot +#volcanoplot(data.fit.eb,coef=1,highlight=10) +volcanoPerPage=1 +logFCthreshold=log2(opt$thresholdFC) +if (!is.null(opt$volcano)) { + iToPlot=1 + plotVector=list() + nbComparisons=ncol(data.fit.eb$adj_p.value) + for (iComparison in 1:nbComparisons){ + + #define the log10(p-val) threshold corresponding to FDR threshold fixed by user + probeWithLowFDR=-log10(data.fit.eb$p.value[which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold),iComparison]) + pvalThresholdFDR=NULL + if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR) + + #get significant points over FC and FDR thresholds + significativePoints=intersect(which(abs(data.fit.eb$coefficients[,iComparison])>=logFCthreshold),which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold)) + + #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC))) + htmlPointsToRemove=intersect(which(abs(data.fit.eb$coefficients[,iComparison])<=quantile(abs(data.fit.eb$coefficients[,iComparison]),c(0.66))),which(data.fit.eb$p.value[,iComparison]>=quantile(abs(data.fit.eb$p.value[,iComparison]),c(0.33)))) + if(length(htmlPointsToRemove)>20000){ + htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000)) + }else{ + htmlPointsToRemove=c() + } + + xMinLimPlot=min(data.fit.eb$coefficients[,iComparison])-0.2 + xMaxLimPlot=max(data.fit.eb$coefficients[,iComparison])+0.2 + yMaxLimPlot= max(-log10(data.fit.eb$p.value[,iComparison]))+0.2 + + if(length(significativePoints)>0){ + dataSignifToPlot=data.frame(pval=-log10(data.fit.eb$p.value[significativePoints,iComparison]),FC=data.fit.eb$coefficients[significativePoints,iComparison],description=paste(names(data.fit.eb$coefficients[significativePoints,iComparison]),"\n","FC: " , round(2^data.fit.eb$coefficients[significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[significativePoints,iComparison],digits=4), sep="")) + #to test if remains any normal points to draw + if(length(significativePoints)<nrow(data.fit.eb$p.value)){ + dataToPlot=data.frame(pval=-log10(data.fit.eb$p.value[-significativePoints,iComparison]),FC=data.fit.eb$coefficients[-significativePoints,iComparison],description=paste("FC: " , round(2^data.fit.eb$coefficients[-significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[-significativePoints,iComparison],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + }else{ + dataToPlot=data.frame(pval=-log10(data.fit.eb$p.value[,iComparison]),FC=data.fit.eb$coefficients[,iComparison],description=paste("FC: " , round(2^data.fit.eb$coefficients[,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[,iComparison],digits=4), sep="")) + } + + ##traditional plot + p <- ggplot(data=dataToPlot, aes(x=FC, y=pval)) + geom_point() + + theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="-log10(p-val)") + xlab(label="Log2 Fold Change") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position="none") + if(logFCthreshold!=0) p <- p + geom_vline(xintercept=-logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_vline(xintercept=logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_text(data.frame(text=c(paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),paste(c("log2(FC=",opt$thresholdFC,")"),collapse="")),x=c(-logFCthreshold,logFCthreshold),y=c(0,0)),mapping=aes(x=x, y=y, label=text), size=4, angle=90, vjust=-0.4, hjust=0, color="salmon") + if(!is.null(pvalThresholdFDR)) p <- p + geom_hline(yintercept=pvalThresholdFDR, color="skyblue1",linetype="dotted", size=0.5) + geom_text(data.frame(text=c(paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse="")),x=c(xMinLimPlot),y=c(pvalThresholdFDR)),mapping=aes(x=x, y=y, label=text), size=4, vjust=0, hjust=0, color="skyblue3") + if(length(significativePoints)>0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description)) + + ##interactive plot + if(length(htmlPointsToRemove)>0){ + pointToRemove=union(htmlPointsToRemove,significativePoints) + #to test if remains any normal points to draw + if(length(pointToRemove)<nrow(data.fit.eb$p.value)){ + dataToPlot=data.frame(pval=-log10(data.fit.eb$p.value[-pointToRemove,iComparison]),FC=data.fit.eb$coefficients[-pointToRemove,iComparison],description=paste("FC: " , round(2^data.fit.eb$coefficients[-pointToRemove,iComparison],2) , " | FDR p-val: ", prettyNum(data.fit.eb$adj_p.value[-pointToRemove,iComparison],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + } + + if((nrow(dataToPlot)+nrow(dataSignifToPlot))>40000)addComment(c("[WARNING]For",humanReadingContrastsRenamed[iComparison],"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log) + + phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>% + layout(title = humanReadingContrastsRenamed[iComparison],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-log10(p-val)", showgrid=TRUE, zeroline=FALSE)) + if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar() + if(logFCthreshold!=0){ + phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + } + if(!is.null(pvalThresholdFDR)){ + phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue")) + } + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(phtml) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F) + + + if(iComparison==nbComparisons || length(plotVector)==volcanoPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=1) + dev.off() + if(iComparison<nbComparisons){ + #prepare for a new ploting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } + } + remove(dataToPlot,dataSignifToPlot) + addComment("[INFO]Volcanos drawn",T,opt$log,T,display=FALSE) +} + +rowItemInfo=NULL +if(!is.null(opt$rowNameType) && !is.null(opt$organismID)){ +##get gene information from BioMart +#if(!require("biomaRt")){ +# source("https://bioconductor.org/biocLite.R") +# biocLite("biomaRt") +#} + +ensembl_hs_mart <- useMart(biomart="ensembl", dataset=opt$organismID) +ensembl_df <- getBM(attributes=c(opt$rowNameType,"description"),mart=ensembl_hs_mart) +rowItemInfo=ensembl_df[which(ensembl_df[,1]!=""),2] +rowItemInfo=unlist(lapply(rowItemInfo,function(x)substr(unlist(strsplit(x," \\[Source"))[1],1,30))) +names(rowItemInfo)=ensembl_df[which(ensembl_df[,1]!=""),1] +} + +#write(unlist(dimnames(data.fit.eb$adj_p.value)),opt$log,append = T) + +#prepare additional output containing df informations +dfMatrix=matrix(0,ncol=3,nrow = nrow(data.fit.eb$coefficients),dimnames = list(rownames(data.fit.eb$coefficients),c("df.residual","df.prior","df.total"))) +dfMatrix[,"df.residual"]=data.fit.eb$df.residual +dfMatrix[,"df.prior"]=data.fit.eb$df.prior +dfMatrix[,"df.total"]=data.fit.eb$df.total + +#filter out genes with higher p-values for all comparisons +genesToKeep=names(which(apply(data.fit.eb$adj_p.value,1,function(x)length(which(x<=opt$fdrThreshold))>0))) +#filter out genes with lower FC for all comparisons +genesToKeep=intersect(genesToKeep,names(which(apply(data.fit.eb$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0)))) + +if(length(genesToKeep)>0){ + data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[genesToKeep,],ncol=ncol(data.fit.eb$adj_p.value)) + rownames(data.fit.eb$adj_p.value)=genesToKeep + colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value) + + data.fit.eb$p.value=matrix(data.fit.eb$p.value[genesToKeep,],ncol=ncol(data.fit.eb$p.value)) + rownames(data.fit.eb$p.value)=genesToKeep + colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[genesToKeep,],ncol=ncol(data.fit.eb$coefficients)) + rownames(data.fit.eb$coefficients)=genesToKeep + colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$t=matrix(data.fit.eb$t[genesToKeep,],ncol=ncol(data.fit.eb$t)) + rownames(data.fit.eb$t)=genesToKeep + colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value) + + dfMatrix=dfMatrix[genesToKeep,,drop=FALSE] + +}else{ + addComment(c("[WARNING]No significative genes considering the given FDR threshold : ",opt$fdrThreshold),T,opt$log,display=FALSE) +} + +addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE) + + +#plot VennDiagramm for genes below threshold between comparisons +#t=apply(data.fit.eb$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold))) +#get.venn.partitions(t) +#vennCounts(data.fit.eb$adj_p.value[,1:4]<=opt$threshold) + +#make a simple sort genes based only on the first comparison +#newOrder=order(data.fit.eb$adj_p.value[,1]) +#data.fit.eb$adj_p.value=data.fit.eb$adj_p.value[newOrder,] + +#alternative sorting strategy based on the mean gene rank over all comparisons +if(length(genesToKeep)>1){ + currentRank=rep(0,nrow(data.fit.eb$adj_p.value)) + for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){ + currentRank=currentRank+rank(data.fit.eb$adj_p.value[,iComparison]) + } + currentRank=currentRank/ncol(data.fit.eb$adj_p.value) + newOrder=order(currentRank) + + data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[newOrder,],ncol=ncol(data.fit.eb$adj_p.value)) + rownames(data.fit.eb$adj_p.value)=rownames(data.fit.eb$p.value)[newOrder] + colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value) + + data.fit.eb$p.value=matrix(data.fit.eb$p.value[newOrder,],ncol=ncol(data.fit.eb$p.value)) + rownames(data.fit.eb$p.value)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[newOrder,],ncol=ncol(data.fit.eb$coefficients)) + rownames(data.fit.eb$coefficients)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$t=matrix(data.fit.eb$t[newOrder,],ncol=ncol(data.fit.eb$t)) + rownames(data.fit.eb$t)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value) + + dfMatrix=dfMatrix[newOrder,,drop=FALSE] +} + + +#formating output matrices depending on genes to keep +if(length(genesToKeep)==0){ + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene","noGene") + outputData[,2]=c("Comparison","Info","noInfo") + + outputDfData=matrix(0,ncol=3+1,nrow=2) + outputDfData[1,]=c("X","df.residual","df.prior","df.total") + outputDfData[,1]=c("Statistics","noGene") +}else{ + if(length(genesToKeep)==1){ + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene",genesToKeep) + outputData[,2]=c("Comparison","Info","na") + if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep] + outputData[3,seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4) + outputData[3,seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4) + outputData[3,seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4) + outputData[3,seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4) + outputData[3,seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4) + + outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix)) + outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total") + outputDfData[2,]=c(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual","df.prior","df.total")],digits=4)) + }else{ + #format matrix to be correctly read by galaxy (move headers in first column and row) + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=nrow(data.fit.eb$adj_p.value)+2) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene",rownames(data.fit.eb$adj_p.value)) + outputData[,2]=c("Comparison","Info",rep("na",nrow(data.fit.eb$adj_p.value))) + if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(data.fit.eb$adj_p.value)] + outputData[3:nrow(outputData),seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4) + outputData[3:nrow(outputData),seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4) + outputData[3:nrow(outputData),seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4) + outputData[3:nrow(outputData),seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4) + outputData[3:nrow(outputData),seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4) + + outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix)) + outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total") + outputDfData[2:(1+nrow(dfMatrix)),]=cbind(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual")],digits=4),prettyNum(dfMatrix[,c("df.prior")],digits=4),prettyNum(dfMatrix[,c("df.total")],digits=4)) + } +} +addComment("[INFO]Formated output",T,opt$log,display=FALSE) + +#write output results +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +#write df info file +write.table(outputDfData,file=opt$outputDfFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) +#sessionInfo() + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/VolcanoPlotsScript.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,426 @@ +# R script to plot volcanos through Galaxy based GIANT tool +# written by Jimmy Vandel +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "statisticsFile", "i", 1, "character", + "volcanoName" , "n", 1, "character", + "pvalColumnName" , "p", 1, "character", + "fdrColumnName" , "m", 1, "character", + "fcColumnName" , "c", 1, "character", + "fcKind","d", 1, "character", + "fdrThreshold","s", 1, "double", + "fcThreshold","e", 1, "double", + "organismID","x",1,"character", + "rowNameType","y",1,"character", + "log", "l", 1, "character", + "outputFile" , "o", 1, "character", + "format", "f", 1, "character", + "quiet", "q", 0, "logical"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required\n") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$statisticsFile)) { + addComment("[ERROR]'statisticsFile' is required",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$pvalColumnName)==0 || length(opt$fdrColumnName)==0 || length(opt$fcColumnName)==0) { + addComment("[ERROR]no selected columns",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$pvalColumnName)!=length(opt$fcColumnName) || length(opt$pvalColumnName)!=length(opt$fdrColumnName)) { + addComment("[ERROR]different number of selected columns between p.val, adj-p.val and FC ",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fcKind)) { + addComment("[ERROR]'fcKind' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fdrThreshold)) { + addComment("[ERROR]'FDR threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fcThreshold)) { + addComment("[ERROR]'FC threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile)) { + addComment("[ERROR]'output file' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} + +#demande si le script sera bavard +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE +} + +#paramètres internes +addComment("[INFO]Parameters checked test mode !",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#charge des packages silencieusement +suppressPackageStartupMessages({ + library("methods") + library("biomaRt") + library("ggplot2") + library("plotly") + library("stringr") +}) + +#define some usefull variable +nbVolcanosToPlot=length(opt$pvalColumnName) + +#load input file +statDataMatrix=read.csv(file=file.path(getwd(), opt$statisticsFile),header=F,sep="\t",colClasses="character") +#remove first colum to convert it as rownames +rownames(statDataMatrix)=statDataMatrix[,1] +statDataMatrix=statDataMatrix[,-1] + +#identify lines without adjusted p-value info (should contain the same content as rownames) and replace them with NA values +FDRinfo=rep(TRUE,nbVolcanosToPlot) +for(iVolcano in 1:nbVolcanosToPlot){ + #input parameter should be None when adjusted p-val are not available + if(opt$fdrColumnName[iVolcano]=="None"){ + #content of the corresponding column should also be the same as rownames + if(!all(statDataMatrix[,(iVolcano-1)*3+2]==rownames(statDataMatrix))){ + addComment(c("[ERROR]It seems that input stat matrix contains adjusted p-values for volcano",iVolcano,"whereas input parameter indicates that not."),T,opt$log) + q( "no", 1, F ) + } + FDRinfo[iVolcano]=FALSE + statDataMatrix[,(iVolcano-1)*3+2]=NA + } +} + +if(is.data.frame(statDataMatrix)){ + statDataMatrix=data.matrix(statDataMatrix) +}else{ + statDataMatrix=data.matrix(as.numeric(statDataMatrix)) +} + +#check if available column number match with volcano requested number +if(ncol(statDataMatrix)!=3*nbVolcanosToPlot){ + addComment("[ERROR]Input file column number is different from requested volcano number",T,opt$log) + q( "no", 1, F ) +} + +#build global dataFrame with data and fill with p.val and log2(FC) and FDR +dataFrame=data.frame(row.names = rownames(statDataMatrix)) +#start with p-value +dataFrame$p.value=statDataMatrix[,seq(1,nbVolcanosToPlot*3,3),drop=FALSE] +#compute FDR if needed or just get available info +dataFrame$adj_p.value=dataFrame$p.value +for(iVolcano in 1:nbVolcanosToPlot){ + #adjusted p-value are already computed + if(FDRinfo[iVolcano]){ + dataFrame$adj_p.value[,iVolcano]=statDataMatrix[,(iVolcano-1)*3+2,drop=FALSE] + }else{ + #adjusted p-value should be computed based on p-val using FDR + dataFrame$adj_p.value[,iVolcano]=p.adjust(dataFrame$p.value[,iVolcano,drop=FALSE],"fdr") + addComment(c("[INFO]Adjusted p-values are not available in input for volcano",iVolcano,", FDR approach will be used on available raw p-values"),T,opt$log) + } +} +if(opt$fcKind=="FC"){ + #we should transform as Log2FC + dataFrame$coefficients=log2(statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE]) + addComment(c("[INFO]FC are converted in log2(FC) for plotting"),T,opt$log) +}else{ + dataFrame$coefficients=statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE] +} + +addComment(c("[INFO]Input data available for",nbVolcanosToPlot,"volcano(s) with",nrow(statDataMatrix),"rows"),T,opt$log) + + +#plot VOLCANOs +volcanoPerPage=1 +logFCthreshold=log2(opt$fcThreshold) +iToPlot=1 +plotVector=list() +volcanoNameList=c() +for (iVolcano in 1:nbVolcanosToPlot){ + + if(nchar(opt$volcanoName[iVolcano])>0){ + curentVolcanoName=opt$volcanoName[iVolcano] + }else{ + curentVolcanoName=paste(iVolcano,opt$pvalColumnName[iVolcano],sep="_") + } + + #keep only rows without NA for p-val, adjusted p-val and coeff + pValToPlot=dataFrame$p.value[,iVolcano] + fdrToPlot=dataFrame$adj_p.value[,iVolcano] + coeffToPlot=dataFrame$coefficients[,iVolcano] + + rowToRemove=unique(c(which(is.na(pValToPlot)),which(is.na(fdrToPlot)),which(is.na(coeffToPlot)))) + if(length(rowToRemove)>0){ + pValToPlot=pValToPlot[-rowToRemove] + fdrToPlot=fdrToPlot[-rowToRemove] + coeffToPlot=coeffToPlot[-rowToRemove] + } + addComment(c("[INFO]For",curentVolcanoName,"volcano,",length(rowToRemove),"rows are discarded due to NA values,",length(pValToPlot),"remaining rows."),T,opt$log) + + #save volcano name + volcanoNameList=c(volcanoNameList,curentVolcanoName) + + #remove characters possibly troubling + volcanoFileName=iVolcano + + #define the log10(p-val) threshold corresponding to FDR threshold fixed by user + probeWithLowFDR=-log10(pValToPlot[which(fdrToPlot<=opt$fdrThreshold)]) + pvalThresholdFDR=NULL + if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR) + + #get significant points over FC and FDR thresholds + significativePoints=intersect(which(abs(coeffToPlot)>=logFCthreshold),which(fdrToPlot<=opt$fdrThreshold)) + + #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC))) + htmlPointsToRemove=intersect(which(abs(coeffToPlot)<=quantile(abs(coeffToPlot),c(0.66))),which(pValToPlot>=quantile(abs(pValToPlot),c(0.33)))) + if(length(htmlPointsToRemove)>20000){ + htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000)) + }else{ + htmlPointsToRemove=c() + } + + xMinLimPlot=min(coeffToPlot)-0.2 + xMaxLimPlot=max(coeffToPlot)+0.2 + yMaxLimPlot= max(-log10(pValToPlot))+0.2 + + if(length(significativePoints)>0){ + dataSignifToPlot=data.frame(pval=-log10(pValToPlot[significativePoints]),FC=coeffToPlot[significativePoints],description=paste(names(coeffToPlot[significativePoints]),"\n","FC: " , round(2^coeffToPlot[significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[significativePoints],digits=4), sep="")) + #to test if remains any normal points to draw + if(length(significativePoints)<length(pValToPlot)){ + dataToPlot=data.frame(pval=-log10(pValToPlot[-significativePoints]),FC=coeffToPlot[-significativePoints],description=paste("FC: " , round(2^coeffToPlot[-significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[-significativePoints],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + }else{ + dataToPlot=data.frame(pval=-log10(pValToPlot),FC=coeffToPlot,description=paste("FC: " , round(2^coeffToPlot,2) , " | Adjusted p-val: ",prettyNum(fdrToPlot,digits=4), sep="")) + } + + ##traditional plot + + p <- ggplot(data=dataToPlot, aes(x=FC, y=pval)) + geom_point() + + theme_bw() + ggtitle(curentVolcanoName) + ylab(label="-Log10(p-val)") + xlab(label="Log2 Fold Change") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position="none") + if(logFCthreshold!=0) p <- p + geom_vline(xintercept=-logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_vline(xintercept=logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_text(data.frame(text=c(paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),paste(c("log2(FC=",opt$fcThreshold,")"),collapse="")),x=c(-logFCthreshold,logFCthreshold),y=c(0,0)),mapping=aes(x=x, y=y, label=text), size=4, angle=90, vjust=-0.4, hjust=0, color="salmon") + if(!is.null(pvalThresholdFDR)) p <- p + geom_hline(yintercept=pvalThresholdFDR, color="skyblue1",linetype="dotted", size=0.5) + geom_text(data.frame(text=c(paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse="")),x=c(xMinLimPlot),y=c(pvalThresholdFDR)),mapping=aes(x=x, y=y, label=text), size=4, vjust=0, hjust=0, color="skyblue3") + if(length(significativePoints)>0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description)) + + ##interactive plot + + if(length(htmlPointsToRemove)>0){ + pointToRemove=union(htmlPointsToRemove,significativePoints) + #to test if it remains any normal points to draw + if(length(pointToRemove)<length(pValToPlot)){ + dataToPlot=data.frame(pval=-log10(pValToPlot[-pointToRemove]),FC=coeffToPlot[-pointToRemove],description=paste("FC: " , round(2^coeffToPlot[-pointToRemove],2) , " | Adjusted p-val: ", prettyNum(fdrToPlot[-pointToRemove],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + } + + if((nrow(dataToPlot)+length(significativePoints))>40000)addComment(c("[WARNING]For",curentVolcanoName,"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log) + + phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>% + layout(title = curentVolcanoName[iVolcano],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-Log10(p-val)", showgrid=TRUE, zeroline=FALSE)) + if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar() + if(logFCthreshold!=0){ + phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + } + if(!is.null(pvalThresholdFDR)){ + phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue")) + } + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(phtml) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Volcanos_",volcanoFileName,".html"),collapse=""),selfcontained = F) + + + if(iVolcano==nbVolcanosToPlot || length(plotVector)==volcanoPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/Volcanos_",volcanoFileName,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/Volcanos_",volcanoFileName,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=1) + dev.off() + if(iVolcano<nbVolcanosToPlot){ + #prepare for a new ploting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } +} +remove(dataToPlot,dataSignifToPlot) +addComment("[INFO]Volcanos drawn",T,opt$log,T,display=FALSE) + + +#now add anotation infos about genes + +rowItemInfo=NULL +if(!is.null(opt$rowNameType) && !is.null(opt$organismID)){ + ##get gene information from BioMart + #if(!require("biomaRt")){ + # source("https://bioconductor.org/biocLite.R") + # biocLite("biomaRt") + #} + + ensembl_hs_mart <- useMart(biomart="ensembl", dataset=opt$organismID) + ensembl_df <- getBM(attributes=c(opt$rowNameType,"description"),mart=ensembl_hs_mart) + rowItemInfo=ensembl_df[which(ensembl_df[,1]!=""),2] + rowItemInfo=unlist(lapply(rowItemInfo,function(x)substr(unlist(strsplit(x," \\[Source"))[1],1,30))) + names(rowItemInfo)=ensembl_df[which(ensembl_df[,1]!=""),1] +} + +#filter out genes with higher p-values for all comparisons +genesToKeep=names(which(apply(dataFrame$adj_p.value,1,function(x)length(which(x<=opt$fdrThreshold))>0))) +#filter out genes with lower FC for all comparisons +genesToKeep=intersect(genesToKeep,names(which(apply(dataFrame$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0)))) + +if(length(genesToKeep)>0){ + dataFrameNew=data.frame(row.names=genesToKeep) + + dataFrameNew$adj_p.value=matrix(dataFrame$adj_p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$adj_p.value)) + rownames(dataFrameNew$adj_p.value)=genesToKeep + colnames(dataFrameNew$adj_p.value)=colnames(dataFrame$p.value) + + dataFrameNew$p.value=matrix(dataFrame$p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$p.value)) + rownames(dataFrameNew$p.value)=genesToKeep + colnames(dataFrameNew$p.value)=colnames(dataFrame$adj_p.value) + + dataFrameNew$coefficients=matrix(dataFrame$coefficients[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$coefficients)) + rownames(dataFrameNew$coefficients)=genesToKeep + colnames(dataFrameNew$coefficients)=colnames(dataFrame$adj_p.value) + + dataFrame=dataFrameNew + rm(dataFrameNew) +}else{ + addComment("[WARNING]No significative genes",T,opt$log,display=FALSE) +} + +addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE) + + +#plot VennDiagramm for genes below threshold between comparisons +#t=apply(dataFrame$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold))) +#get.venn.partitions(t) +#vennCounts(dataFrame$adj_p.value[,1:4]<=opt$threshold) + +#make a simple sort genes based only on the first comparison +#newOrder=order(dataFrame$adj_p.value[,1]) +#dataFrame$adj_p.value=dataFrame$adj_p.value[newOrder,] + +#alternative sorting strategy based on the mean gene rank over all comparisons +if(length(genesToKeep)>1){ + currentRank=rep(0,nrow(dataFrame$adj_p.value)) + for(iVolcano in 1:ncol(dataFrame$adj_p.value)){ + currentRank=currentRank+rank(dataFrame$adj_p.value[,iVolcano]) + } + currentRank=currentRank/ncol(dataFrame$adj_p.value) + newOrder=order(currentRank) + rownames(dataFrame)=rownames(dataFrame)[newOrder] + + dataFrame$adj_p.value=matrix(dataFrame$adj_p.value[newOrder,],ncol=ncol(dataFrame$adj_p.value)) + rownames(dataFrame$adj_p.value)=rownames(dataFrame$p.value)[newOrder] + colnames(dataFrame$adj_p.value)=colnames(dataFrame$p.value) + + dataFrame$p.value=matrix(dataFrame$p.value[newOrder,],ncol=ncol(dataFrame$p.value)) + rownames(dataFrame$p.value)=rownames(dataFrame$adj_p.value) + colnames(dataFrame$p.value)=colnames(dataFrame$adj_p.value) + + dataFrame$coefficients=matrix(dataFrame$coefficients[newOrder,],ncol=ncol(dataFrame$coefficients)) + rownames(dataFrame$coefficients)=rownames(dataFrame$adj_p.value) + colnames(dataFrame$coefficients)=colnames(dataFrame$adj_p.value) +} + +#formating output matrix depending on genes to keep +if(length(genesToKeep)==0){ + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene","noGene") + outputData[,2]=c("Comparison","Info","noInfo") +}else{ + if(length(genesToKeep)==1){ + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene",genesToKeep) + outputData[,2]=c("Comparison","Info","na") + if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep] + outputData[3,seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4) + outputData[3,seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4) + outputData[3,seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4) + outputData[3,seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4) + }else{ + #format matrix to be correctly read by galaxy (move headers in first column and row) + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=nrow(dataFrame$adj_p.value)+2) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene",rownames(dataFrame$adj_p.value)) + outputData[,2]=c("Comparison","Info",rep("na",nrow(dataFrame$adj_p.value))) + if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(dataFrame$adj_p.value)] + outputData[3:nrow(outputData),seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4) + outputData[3:nrow(outputData),seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4) + outputData[3:nrow(outputData),seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4) + outputData[3:nrow(outputData),seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4) + } +} +addComment("[INFO]Formated output",T,opt$log,display=FALSE) + +#write output results +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) + +#sessionInfo() \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/getopt.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,773 @@ +# Copyright (c) 2008-2010 Allen Day +# Copyright (c) 2011-2013 Trevor L. Davis <trevor.l.davis@stanford.edu> +# +# Modified by J.Vandel 2017 to consider situation of multiple identical flag +# and concatenate as a vector the set of parameter for the same flag instead of +# keeping only the last value as done by the previous version. +# +# This file is free software: you may copy, redistribute and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 2 of the License, or (at your +# option) any later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +#' C-like getopt behavior +#' +#' getopt is primarily intended to be used with ``\link{Rscript}''. It +#' facilitates writing ``\#!'' shebang scripts that accept short and long +#' flags/options. It can also be used from ``R'' directly, but is probably less +#' useful in this context. +#' +#' getopt() returns a \link{list} data structure containing \link{names} of the +#' flags that were present in the \link{character} \link{vector} passed in under +#' the \emph{opt} argument. Each value of the \link{list} is coerced to the +#' data type specified according to the value of the \emph{spec} argument. See +#' below for details. +#' +#' Notes on naming convention: +#' +#' 1. An \emph{option} is one of the shell-split input strings. +#' +#' 2. A \emph{flag} is a type of \emph{option}. a \emph{flag} can be defined as +#' having no \emph{argument} (defined below), a required \emph{argument}, or an +#' optional \emph{argument}. +#' +#' 3. An \emph{argument} is a type of \emph{option}, and is the value associated +#' with a flag. +#' +#' 4. A \emph{long flag} is a type of \emph{flag}, and begins with the string +#' ``--''. If the \emph{long flag} has an associated \emph{argument}, it may be +#' delimited from the \emph{long flag} by either a trailing \emph{=}, or may be +#' the subsequent \emph{option}. +#' +#' 5. A \emph{short flag} is a type of \emph{flag}, and begins with the string +#' ``-''. If a \emph{short flag} has an associated \emph{argument}, it is the +#' subsequent \emph{option}. \emph{short flags} may be bundled together, +#' sharing a single leading ``-'', but only the final \emph{short flag} is able +#' to have a corresponding \emph{argument}. +#' +#' Many users wonder whether they should use the getopt package, optparse package, +#' or argparse package. +#' Here is some of the major differences: +#' +#' Features available in \code{getopt} unavailable in \code{optparse} +#' +#' 1. As well as allowing one to specify options that take either +#' no argument or a required argument like \code{optparse}, +#' \code{getopt} also allows one to specify option with an optional argument. +#' +#' Some features implemented in \code{optparse} package unavailable in \code{getopt} +#' +#' 1. Limited support for capturing positional arguments after the optional arguments +#' when \code{positional_arguments} set to TRUE in \code{parse_args} +#' +#' 2. Automatic generation of an help option and printing of help text when encounters an "-h" +#' +#' 3. Option to specify default arguments for options as well the +#' variable name to store option values +#' +#' There is also new package \code{argparse} introduced in 2012 which contains +#' all the features of both getopt and optparse but which has a dependency on +#' Python 2.7 or 3.2+ and has not been used in production since 2008 or 2009 +#' like the getopt and optparse packages. +#' +#' Some Features unlikely to be implemented in \code{getopt}: +#' +#' 1. Support for multiple, identical flags, e.g. for "-m 3 -v 5 -v", the +#' trailing "-v" overrides the preceding "-v 5", result is v=TRUE (or equivalent +#' typecast). +#' +#' 2. Support for multi-valued flags, e.g. "--libpath=/usr/local/lib +#' --libpath=/tmp/foo". +#' +#' 3. Support for lists, e.g. "--define os=linux --define os=redhat" would +#' set result$os$linux=TRUE and result$os$redhat=TRUE. +#' +#' 4. Support for incremental, argument-less flags, e.g. "/path/to/script +#' -vvv" should set v=3. +#' +#' 5. Support partial-but-unique string match on options, e.g. "--verb" and +#' "--verbose" both match long flag "--verbose". +#' +#' 6. No support for mixing in positional arguments or extra arguments that +#' don't match any options. For example, you can't do "my.R --arg1 1 foo bar +#' baz" and recover "foo", "bar", "baz" as a list. Likewise for "my.R foo +#' --arg1 1 bar baz". +#' +#' @aliases getopt getopt-package +#' @param spec The getopt specification, or spec of what options are considered +#' valid. The specification must be either a 4-5 column \link{matrix}, or a +#' \link{character} \link{vector} coercible into a 4 column \link{matrix} using +#' \link{matrix}(x,ncol=4,byrow=TRUE) command. The \link{matrix}/\link{vector} +#' contains: +#' +#' Column 1: the \emph{long flag} name. A multi-\link{character} string. +#' +#' Column 2: \emph{short flag} alias of Column 1. A single-\link{character} +#' string. +#' +#' Column 3: \emph{Argument} mask of the \emph{flag}. An \link{integer}. +#' Possible values: 0=no argument, 1=required argument, 2=optional argument. +#' +#' Column 4: Data type to which the \emph{flag}'s argument shall be cast using +#' \link{storage.mode}. A multi-\link{character} string. This only considered +#' for same-row Column 3 values of 1,2. Possible values: \link{logical}, +#' \link{integer}, \link{double}, \link{complex}, \link{character}. +#' If \link{numeric} is encountered then it will be converted to double. +#' +#' Column 5 (optional): A brief description of the purpose of the option. +#' +#' The terms \emph{option}, \emph{flag}, \emph{long flag}, \emph{short flag}, +#' and \emph{argument} have very specific meanings in the context of this +#' document. Read the ``Description'' section for definitions. +#' @param opt This defaults to the return value of \link{commandArgs}(TRUE). +#' +#' If R was invoked directly via the ``R'' command, this corresponds to all +#' arguments passed to R after the ``--args'' flag. +#' +#' If R was invoked via the ``\link{Rscript}'' command, this corresponds to all +#' arguments after the name of the R script file. +#' +#' Read about \link{commandArgs} and \link{Rscript} to learn more. +#' @param command The string to use in the usage message as the name of the +#' script. See argument \emph{usage}. +#' @param usage If TRUE, argument \emph{opt} will be ignored and a usage +#' statement (character string) will be generated and returned from \emph{spec}. +#' @param debug This is used internally to debug the getopt() function itself. +#' @author Allen Day +#' @seealso \code{\link{getopt}} +#' @keywords data +#' @export +#' @examples +#' +#' #!/path/to/Rscript +#' library('getopt'); +#' #get options, using the spec as defined by the enclosed list. +#' #we read the options from the default: commandArgs(TRUE). +#' spec = matrix(c( +#' 'verbose', 'v', 2, "integer", +#' 'help' , 'h', 0, "logical", +#' 'count' , 'c', 1, "integer", +#' 'mean' , 'm', 1, "double", +#' 'sd' , 's', 1, "double" +#' ), byrow=TRUE, ncol=4); +#' opt = getopt(spec); +#' +#' # if help was asked for print a friendly message +#' # and exit with a non-zero error code +#' if ( !is.null(opt$help) ) { +#' cat(getopt(spec, usage=TRUE)); +#' q(status=1); +#' } +#' +#' #set some reasonable defaults for the options that are needed, +#' #but were not specified. +#' if ( is.null(opt$mean ) ) { opt$mean = 0 } +#' if ( is.null(opt$sd ) ) { opt$sd = 1 } +#' if ( is.null(opt$count ) ) { opt$count = 10 } +#' if ( is.null(opt$verbose ) ) { opt$verbose = FALSE } +#' +#' #print some progress messages to stderr, if requested. +#' if ( opt$verbose ) { write("writing...",stderr()); } +#' +#' #do some operation based on user input. +#' cat(paste(rnorm(opt$count,mean=opt$mean,sd=opt$sd),collapse="\n")); +#' cat("\n"); +#' +#' #signal success and exit. +#' #q(status=0); +getopt = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) { + + # littler compatibility - map argv vector to opt + if (exists("argv", where = .GlobalEnv, inherits = FALSE)) { + opt = get("argv", envir = .GlobalEnv); + } + + ncol=4; + maxcol=6; + col.long.name = 1; + col.short.name = 2; + col.has.argument = 3; + col.mode = 4; + col.description = 5; + + flag.no.argument = 0; + flag.required.argument = 1; + flag.optional.argument = 2; + + result = list(); + result$ARGS = vector(mode="character"); + + #no spec. fail. + if ( is.null(spec) ) { + stop('argument "spec" must be non-null.'); + + #spec is not a matrix. attempt to coerce, if possible. issue a warning. + } else if ( !is.matrix(spec) ) { + if ( length(spec)/4 == as.integer(length(spec)/4) ) { + warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion'); + spec = matrix( spec, ncol=ncol, byrow=TRUE ); + } else { + stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.'); + } + + #spec is a matrix, but it has too few columns. + } else if ( dim(spec)[2] < ncol ) { + stop(paste('"spec" should have at least ",ncol," columns.',sep='')); + + #spec is a matrix, but it has too many columns. + } else if ( dim(spec)[2] > maxcol ) { + stop(paste('"spec" should have no more than ",maxcol," columns.',sep='')); + + #spec is a matrix, and it has some optional columns. + } else if ( dim(spec)[2] != ncol ) { + ncol = dim(spec)[2]; + } + + #sanity check. make sure long names are unique, and short names are unique. + if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) { + stop(paste('redundant long names for flags (column ',col.long.name,').',sep='')); + } + if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) { + stop(paste('redundant short names for flags (column ',col.short.name,').',sep='')); + } + # convert numeric type to double type + spec[,4] <- gsub("numeric", "double", spec[,4]) + + # if usage=TRUE, don't process opt, but generate a usage string from the data in spec + if ( usage ) { + ret = ''; + ret = paste(ret,"Usage: ",command,sep=''); + for ( j in 1:(dim(spec))[1] ) { + ret = paste(ret,' [-[-',spec[j,col.long.name],'|',spec[j,col.short.name],']',sep=''); + if (spec[j,col.has.argument] == flag.no.argument) { + ret = paste(ret,']',sep=''); + } else if (spec[j,col.has.argument] == flag.required.argument) { + ret = paste(ret,' <',spec[j,col.mode],'>]',sep=''); + } else if (spec[j,col.has.argument] == flag.optional.argument) { + ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep=''); + } + } + # include usage strings + if ( ncol >= 5 ) { + max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]]))); + ret = paste(ret,"\n",sep=''); + for (j in 1:(dim(spec))[1] ) { + ret = paste(ret,sprintf(paste(" -%s|--%-",max.long,"s %s\n",sep=''), + spec[j,col.short.name],spec[j,col.long.name],spec[j,col.description] + ),sep=''); + } + } + else { + ret = paste(ret,"\n",sep=''); + } + return(ret); + } + + #XXX check spec validity here. e.g. column three should be convertible to integer + + i = 1; + + while ( i <= length(opt) ) { + if ( debug ) print(paste("processing",opt[i])); + + current.flag = 0; #XXX use NA + optstring = opt[i]; + + + #long flag + if ( substr(optstring, 1, 2) == '--' ) { + if ( debug ) print(paste(" long option:",opt[i])); + + optstring = substring(optstring,3); + + this.flag = NA; + this.argument = NA; + kv = strsplit(optstring, '=')[[1]]; + if ( !is.na(kv[2]) ) { + this.flag = kv[1]; + this.argument = paste(kv[-1], collapse="="); + } else { + this.flag = optstring; + } + + rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE ); + + #long flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('long flag "', this.flag, '" is invalid', sep='')); + + #long flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + # check if there is an exact match and use that + rowmatch = which(this.flag == spec[,col.long.name]) + if(length(rowmatch) == 0) { + stop(paste('long flag "', this.flag, '" is ambiguous', sep='')); + } + } + + #if we have an argument + if ( !is.na(this.argument) ) { + #if we can't accept the argument, bail out + if ( spec[rowmatch, col.has.argument] == flag.no.argument ) { + stop(paste('long flag "', this.flag, '" accepts no arguments', sep='')); + + #otherwise assign the argument to the flag + } else { + storage.mode(this.argument) = spec[rowmatch, col.mode]; + #don't need here to remove the last value of the vector as argument is in the same string as + #the flag name "--flag=argument" so no spurious TRUE was added + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument); + i = i + 1; + next; + } + + #otherwise, we don't have an argument + } else { + #if we require an argument, bail out + ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) { + ### stop(paste('long flag "', this.flag, '" requires an argument', sep='')); + + #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there + ###} else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + ###} + } + + #short flag(s) + } else if ( substr(optstring, 1, 1) == '-' ) { + if ( debug ) print(paste(" short option:",opt[i])); + + these.flags = strsplit(optstring,'')[[1]]; + + done = FALSE; + for ( j in 2:length(these.flags) ) { + this.flag = these.flags[j]; + rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE ); + + #short flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('short flag "', this.flag, '" is invalid', sep='')); + + #short flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + stop(paste('short flag "', this.flag, '" is ambiguous', sep='')); + + #short flag has an argument, but is not the last in a compound flag string + } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) { + stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep='')); + + #short flag has no argument, flag it as present + } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + done = TRUE; + + #can't definitively process this flag yet, need to see if next option is an argument or not + } else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + done = FALSE; + } + } + if ( done ) { + i = i + 1; + next; + } + } + + #invalid opt + if ( current.flag == 0 ) { + stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep='')); + #TBD support for positional args + #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep='')); + #result$ARGS = append(result$ARGS, optstring); + + # some dangling flag, handle it + } else if ( current.flag > 0 ) { + if ( debug ) print(' dangling flag'); + if ( length(opt) > i ) { + peek.optstring = opt[i + 1]; + if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep='')); + + #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS + #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals). + if ( substr(peek.optstring, 1, 1) != '-' | + #match negative double + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'double' + ) | + #match negative integer + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'integer' + ) + ) { + if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep='')); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #a lone dash + } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) { + if ( debug ) print(' consuming "lone dash" argument'); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #no argument + } else { + if ( debug ) print(' no argument!'); + + #if we require an argument, bail out + if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #otherwise set flag as present. + } else if ( + spec[current.flag, col.has.argument] == flag.optional.argument | + spec[current.flag, col.has.argument] == flag.no.argument + ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop(paste("This should never happen.", + "Is your spec argument correct? Maybe you forgot to set", + "ncol=4, byrow=TRUE in your matrix call?")); + } + } + #trailing flag without required argument + } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #trailing flag without optional argument + } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + + #trailing flag without argument + } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop("this should never happen (2). please inform the author."); + } + #no dangling flag, nothing to do. + } else { + } + + i = i+1; + } + return(result); +} + + + +######################### +#set a modified version using only long named parameters + +getoptLong = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) { + + # littler compatibility - map argv vector to opt + if (exists("argv", where = .GlobalEnv, inherits = FALSE)) { + opt = get("argv", envir = .GlobalEnv); + } + + ncol=4; + maxcol=6; + col.long.name = 1; + #col.short.name = 2; + col.has.argument = 3; + col.mode = 4; + col.description = 5; + + flag.no.argument = 0; + flag.required.argument = 1; + flag.optional.argument = 2; + + result = list(); + result$ARGS = vector(mode="character"); + + #no spec. fail. + if ( is.null(spec) ) { + stop('argument "spec" must be non-null.'); + + #spec is not a matrix. attempt to coerce, if possible. issue a warning. + } else if ( !is.matrix(spec) ) { + if ( length(spec)/4 == as.integer(length(spec)/4) ) { + warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion'); + spec = matrix( spec, ncol=ncol, byrow=TRUE ); + } else { + stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.'); + } + + #spec is a matrix, but it has too few columns. + } else if ( dim(spec)[2] < ncol ) { + stop(paste('"spec" should have at least ",ncol," columns.',sep='')); + + #spec is a matrix, but it has too many columns. + } else if ( dim(spec)[2] > maxcol ) { + stop(paste('"spec" should have no more than ",maxcol," columns.',sep='')); + + #spec is a matrix, and it has some optional columns. + } else if ( dim(spec)[2] != ncol ) { + ncol = dim(spec)[2]; + } + + #sanity check. make sure long names are unique, and short names are unique. + if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) { + stop(paste('redundant long names for flags (column ',col.long.name,').',sep='')); + } + # if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) { + # stop(paste('redundant short names for flags (column ',col.short.name,').',sep='')); + # } + # convert numeric type to double type + spec[,4] <- gsub("numeric", "double", spec[,4]) + + # if usage=TRUE, don't process opt, but generate a usage string from the data in spec + if ( usage ) { + ret = ''; + ret = paste(ret,"Usage: ",command,sep=''); + for ( j in 1:(dim(spec))[1] ) { + ret = paste(ret,' [-[-',spec[j,col.long.name],']',sep=''); + if (spec[j,col.has.argument] == flag.no.argument) { + ret = paste(ret,']',sep=''); + } else if (spec[j,col.has.argument] == flag.required.argument) { + ret = paste(ret,' <',spec[j,col.mode],'>]',sep=''); + } else if (spec[j,col.has.argument] == flag.optional.argument) { + ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep=''); + } + } + # include usage strings + if ( ncol >= 5 ) { + max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]]))); + ret = paste(ret,"\n",sep=''); + for (j in 1:(dim(spec))[1] ) { + ret = paste(ret,sprintf(paste("--%-",max.long,"s %s\n",sep='') + ,spec[j,col.long.name],spec[j,col.description] + ),sep=''); + } + } + else { + ret = paste(ret,"\n",sep=''); + } + return(ret); + } + + #XXX check spec validity here. e.g. column three should be convertible to integer + + i = 1; + + while ( i <= length(opt) ) { + if ( debug ) print(paste("processing",opt[i])); + + current.flag = 0; #XXX use NA + optstring = opt[i]; + + + #long flag + if ( substr(optstring, 1, 2) == '--' ) { + if ( debug ) print(paste(" long option:",opt[i])); + + optstring = substring(optstring,3); + + this.flag = NA; + this.argument = NA; + kv = strsplit(optstring, '=')[[1]]; + if ( !is.na(kv[2]) ) { + this.flag = kv[1]; + this.argument = paste(kv[-1], collapse="="); + } else { + this.flag = optstring; + } + + rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE ); + + #long flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('long flag "', this.flag, '" is invalid', sep='')); + + #long flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + # check if there is an exact match and use that + rowmatch = which(this.flag == spec[,col.long.name]) + if(length(rowmatch) == 0) { + stop(paste('long flag "', this.flag, '" is ambiguous', sep='')); + } + } + + #if we have an argument + if ( !is.na(this.argument) ) { + #if we can't accept the argument, bail out + if ( spec[rowmatch, col.has.argument] == flag.no.argument ) { + stop(paste('long flag "', this.flag, '" accepts no arguments', sep='')); + + #otherwise assign the argument to the flag + } else { + storage.mode(this.argument) = spec[rowmatch, col.mode]; + #don't need here to remove the last value of the vector as argument is in the same string as + #the flag name "--flag=argument" so no spurious TRUE was added + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument); + i = i + 1; + next; + } + + #otherwise, we don't have an argument + } else { + #if we require an argument, bail out + ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) { + ### stop(paste('long flag "', this.flag, '" requires an argument', sep='')); + + #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there + ###} else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + ###} + } + + #short flag(s) + } + #else if ( substr(optstring, 1, 1) == '-' ) { + # if ( debug ) print(paste(" short option:",opt[i])); + # + # these.flags = strsplit(optstring,'')[[1]]; + # + # done = FALSE; + # for ( j in 2:length(these.flags) ) { + # this.flag = these.flags[j]; + # rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE ); + # + # #short flag is invalid, matches no options + # if ( length(rowmatch) == 0 ) { + # stop(paste('short flag "', this.flag, '" is invalid', sep='')); + # + # #short flag is ambiguous, matches too many options + # } else if ( length(rowmatch) > 1 ) { + # stop(paste('short flag "', this.flag, '" is ambiguous', sep='')); + # + # #short flag has an argument, but is not the last in a compound flag string + # } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) { + # stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep='')); + # + # #short flag has no argument, flag it as present + # } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) { + # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + # done = TRUE; + # + # #can't definitively process this flag yet, need to see if next option is an argument or not + # } else { + # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + # current.flag = rowmatch; + # done = FALSE; + # } + # } + # if ( done ) { + # i = i + 1; + # next; + # } + # } + + #invalid opt + if ( current.flag == 0 ) { + stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep='')); + #TBD support for positional args + #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep='')); + #result$ARGS = append(result$ARGS, optstring); + + # some dangling flag, handle it + } else if ( current.flag > 0 ) { + if ( debug ) print(' dangling flag'); + if ( length(opt) > i ) { + peek.optstring = opt[i + 1]; + if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep='')); + + #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS + #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals). + if ( substr(peek.optstring, 1, 1) != '-' | + #match negative double + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'double' + ) | + #match negative integer + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'integer' + ) + ) { + if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep='')); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #a lone dash + } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) { + if ( debug ) print(' consuming "lone dash" argument'); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #no argument + } else { + if ( debug ) print(' no argument!'); + + #if we require an argument, bail out + if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #otherwise set flag as present. + } else if ( + spec[current.flag, col.has.argument] == flag.optional.argument | + spec[current.flag, col.has.argument] == flag.no.argument + ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop(paste("This should never happen.", + "Is your spec argument correct? Maybe you forgot to set", + "ncol=4, byrow=TRUE in your matrix call?")); + } + } + #trailing flag without required argument + } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #trailing flag without optional argument + } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + + #trailing flag without argument + } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop("this should never happen (2). please inform the author."); + } + #no dangling flag, nothing to do. + } else { + } + + i = i+1; + } + return(result); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/heatMapClustering.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,896 @@ +# A command-line interface to plot heatmap based on expression or diff. exp. analysis +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE, OutDec=".") + +#get options +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "expressionFile", "x", 1, "character", + "diffAnalyseFile", "x", 1, "character", + "factorInfo","x", 1, "character", + "genericData","x", 0, "logical", + "comparisonName","x",1,"character", + "comparisonNameLow","x",1,"character", + "comparisonNameHigh","x",1,"character", + "filterInputOutput","x", 1, "character", + "FCthreshold","x", 1, "double", + "pvalThreshold","x", 1, "double", + "geneListFiltering","x",1,"character", + "clusterNumber","x",1,"integer", + "maxRows","x",1,"integer", + "sampleClusterNumber","x",1,"integer", + "dataTransformation","x",1,"character", + "distanceMeasure","x",1,"character", + "aggloMethod","x",1,"character", + "personalColors","x",1,"character", + "sideBarColorPalette","x",1,"character", + "format", "x", 1, "character", + "quiet", "x", 0, "logical", + "log", "x", 1, "character", + "outputFile" , "x", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getoptLong(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile)) { + addComment("[ERROR]'output file' is required",T,opt$log) + q( "no", 1, F ) +} + +if(is.null(opt$expressionFile) && !is.null(opt$genericData)){ + addComment("[ERROR]generic data clustering is based on expression clustering",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$clusterNumber) || opt$clusterNumber<2) { + addComment("[ERROR]valid genes clusters number is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$sampleClusterNumber) || opt$sampleClusterNumber<1) { + addComment("[ERROR]valid samples clusters number is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$dataTransformation)) { + addComment("[ERROR]data transformation option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$distanceMeasure)) { + addComment("[ERROR]distance measure option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$aggloMethod)) { + addComment("[ERROR]agglomeration method option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$maxRows) || opt$maxRows<2) { + addComment("[ERROR]valid plotted row number is required",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt[["comparisonName"]]) && nchar(opt[["comparisonName"]])==0){ + addComment("[ERROR]you have to specify comparison",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$comparisonNameLow) && nchar(opt$comparisonNameLow)==0){ + addComment("[ERROR]you have to specify comparisonLow",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$comparisonNameHigh) && nchar(opt$comparisonNameHigh)==0){ + addComment("[ERROR]you have to specify comparisonHigh",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$genericData) && (!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh))){ + addComment("[ERROR]comparisonLow and comparisonHigh can be specified only with generic data",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$genericData) && !is.null(opt[["comparisonName"]])){ + addComment("[ERROR]basic comparison cannot be specified for generic data",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt[["comparisonName"]]) || !is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)) && is.null(opt$diffAnalyseFile)) { + addComment("[ERROR]'diff. exp. analysis file' is required",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$genericData) && !is.null(opt$diffAnalyseFile) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh)){ + addComment("[ERROR]Missing comparison information for filtering",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && (is.null(opt[["comparisonName"]]) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh))) { + addComment("[ERROR]'comparisons' are missing for filtering",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && !is.null(opt$geneListFiltering)) { + addComment("[ERROR]Cannot have two filtering strategies",T,opt$log) + q( "no", 1, F ) +} + +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE} + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots and HTML +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#silent package loading +suppressPackageStartupMessages({ + library("plotly") + library("dendextend") + #library("ggdendro") + #library("plyr") + library("ggplot2") + library("heatmaply") + library("circlize") + #library("RColorBrewer") + #source("https://bioconductor.org/biocLite.R") + #biocLite("ComplexHeatmap") + library("ComplexHeatmap") + #library("processx") +}) + +expressionToCluster=!is.null(opt$expressionFile) + +#load input data files +if(expressionToCluster){ + #first expression data + expressionMatrix=read.csv(file=opt$expressionFile,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames (to avoid X before colnames with header=T) + colNamesData=expressionMatrix[1,-1] + expressionMatrix=expressionMatrix[-1,] + #remove first colum to convert it as rownames + rowNamesData=expressionMatrix[,1] + expressionMatrix=expressionMatrix[,-1] + if(is.data.frame(expressionMatrix)){ + expressionMatrix=data.matrix(expressionMatrix) + }else{ + expressionMatrix=data.matrix(as.numeric(expressionMatrix)) + } + dimnames(expressionMatrix)=list(rowNamesData,colNamesData) + + #check input files + if (!is.numeric(expressionMatrix)) { + addComment("[ERROR]Expression data is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Expression data loaded and checked") + addComment(c("[INFO]Dim of expression matrix:",dim(expressionMatrix)),T,opt$log,display=FALSE) +} + +nbComparisons=0 +nbColPerContrast=5 +comparisonMatrix=NULL +comparisonMatrixInfoGene=NULL +#if available comparisons +if(!is.null(opt[["comparisonName"]])){ + #load results from differential expression analysis + #consider first row contains column names + comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t") + colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,])) + #remove the second line also as it's information line (p-val,FDR.p-val,FC,logFC) + comparisonMatrix=comparisonMatrix[-c(1,2),] + #remove first and second colums, convert the first one as rownames + rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1])) + #and save second column content that contain geneInfo + comparisonMatrixInfoGene=as.character(unlist(comparisonMatrix[,2])) + names(comparisonMatrixInfoGene)=as.character(unlist(comparisonMatrix[,1])) + comparisonMatrix=comparisonMatrix[,-c(1,2)] + + comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix)) + + if (ncol(comparisonMatrix)%%nbColPerContrast != 0) { + addComment("[ERROR]Diff. exp. data does not contain good number of columns per contrast, should contains in this order:p-val,FDR.p-val,FC,log2(FC) and t-stat",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(max(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])>1 || min(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])<0){ + addComment("[ERROR]Seem that diff. exp. data does not contain correct values for p-val and FDR.p-val columns, should be including in [0,1] interval",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if (!is.numeric(comparisonMatrix)) { + addComment("[ERROR]Diff. exp. data is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){ + addComment("[WARNING]All genes from diff. exp. file are not included in expression file",T,opt$log,display=FALSE) + } + + if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){ + addComment("[WARNING]All genes from expression file are not included in diff. exp. file",T,opt$log,display=FALSE) + } + + addComment("[INFO]Diff. exp. analysis loaded and checked",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of original comparison matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) + + #restrict to user specified comparisons + restrictedComparisons=unlist(strsplit(opt[["comparisonName"]],",")) + #should be improved to avoid selection of column names starting too similarly + colToKeep=which(unlist(lapply(colnames(comparisonMatrix),function(x)any(startsWith(x,restrictedComparisons))))) + comparisonMatrix=matrix(comparisonMatrix[,colToKeep],ncol=length(colToKeep),dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[colToKeep])) + + #get number of required comparisons + nbComparisons=ncol(comparisonMatrix)/nbColPerContrast + + addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) +} + +#should be only the case with generic data +if(!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)){ + #load generic data used for filtering + nbColPerContrast=1 + #consider first row contains column names + comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t") + colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,])) + #remove first colum, convert the first one as rownames + rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1])) + comparisonMatrix=comparisonMatrix[-1,-1] + + comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix)) + + if (!is.numeric(comparisonMatrix)) { + addComment("[ERROR]Filtering matrix is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){ + addComment("[WARNING]All genes from filtering file are not included in expression file",T,opt$log,display=FALSE) + } + + if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){ + addComment("[WARNING]All genes from expression file are not included in filtering file",T,opt$log,display=FALSE) + } + + addComment("[INFO]Filtering file loaded and checked",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of original filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) + + #restrict to user specified comparisons + restrictedComparisons=c() + if(!is.null(opt$comparisonNameLow))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameLow,",")))) + if(!is.null(opt$comparisonNameHigh))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameHigh,",")))) + + if (!all(restrictedComparisons%in%colnames(comparisonMatrix))){ + addComment("[ERROR]Selected columns in filtering file are not present in filtering matrix!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + comparisonMatrix=matrix(comparisonMatrix[,restrictedComparisons],ncol=length(restrictedComparisons),dimnames = list(rownames(comparisonMatrix),restrictedComparisons)) + + #get number of required comparisons + nbComparisons=ncol(comparisonMatrix) + + addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) +} + + + +factorInfoMatrix=NULL +if(!is.null(opt$factorInfo)){ + #get group information + #load factors file + factorInfoMatrix=read.csv(file=opt$factorInfo,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(factorInfoMatrix)=factorInfoMatrix[1,] + factorInfoMatrix=factorInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(factorInfoMatrix)=factorInfoMatrix[,1] + + factorBarColor=colnames(factorInfoMatrix)[2] + + if(ncol(factorInfoMatrix)>2){ + addComment("[ERROR]Factors file should not contain more than 2 columns",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + #factor file is used for color band on heatmap, so all expression matrix column should be in the factor file + if(expressionToCluster && length(setdiff(colnames(expressionMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + #factor file is used for color band on heatmap, so all comparison matrix column should be in the factor file + if(!expressionToCluster && length(setdiff(colnames(comparisonMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing differential contrasts in factor file",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Factors OK",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) +} + +if(!is.null(opt$personalColors)){ + ##parse personal colors + personalColors=unlist(strsplit(opt$personalColors,",")) + if(length(personalColors)==2){ + ##add medium color between two to get three colors + personalColors=c(personalColors[1],paste(c("#",as.character(as.hexmode(floor(apply(col2rgb(personalColors),1,mean))))),collapse=""),personalColors[2]) + } + if(length(personalColors)!=3){ + addComment("[ERROR]Personalized colors doesn't contain enough colors",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + +} + + +if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="input"){ + #filter input data + + if(is.null(opt$geneListFiltering)){ + #filtering using stat thresholds + #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold))) + if(is.null(opt$genericData)){ + #diff. expression matrix + rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>log2(opt$FCthreshold))))!=0)))) + }else{ + #generic filtering matrix + rowToKeep=rownames(comparisonMatrix) + if(!is.null(opt$comparisonNameLow)){ + restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0))))) + } + if(!is.null(opt$comparisonNameHigh)){ + restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]<opt$pvalThreshold))!=0))))) + } + } + }else{ + #filtering using user gene list + geneListFiltering=read.csv(opt$geneListFiltering,as.is = 1,header=F) + rowToKeep=unlist(c(geneListFiltering)) + } + + if(!is.null(comparisonMatrix) && !all(rowToKeep%in%rownames(comparisonMatrix))){ + #should arrive only with user gene list filtering with diff.exp. results clustering + addComment("[WARNING] some genes of the user defined list are not in the diff. exp. input file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(comparisonMatrix)) + } + + if(expressionToCluster && !all(rowToKeep%in%rownames(expressionMatrix))){ + addComment("[WARNING] some genes selected by the input filter are not in the expression file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(expressionMatrix)) + } + + if(length(rowToKeep)==0){ + addComment("[ERROR]No gene survived to the input filtering thresholds, execution will be aborted. + Please consider to change threshold values and re-run the tool.",T,opt$log) + q( "no", 1, F ) + } + + #filter comparison matrix + if(!is.null(comparisonMatrix)){ + comparisonMatrix=matrix(comparisonMatrix[rowToKeep,],ncol=ncol(comparisonMatrix),dimnames = list(rowToKeep,colnames(comparisonMatrix))) + if(!is.null(comparisonMatrixInfoGene))comparisonMatrixInfoGene=comparisonMatrixInfoGene[rowToKeep] + } + #then expression matrix + if(expressionToCluster)expressionMatrix=matrix(expressionMatrix[rowToKeep,],ncol=ncol(expressionMatrix),dimnames = list(rowToKeep,colnames(expressionMatrix))) + + if(!is.null(comparisonMatrix) && expressionToCluster && nrow(comparisonMatrix)!=nrow(expressionMatrix)){ + addComment("[ERROR]Problem during input filtering, please check code",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Filtering step done",T,opt$log,display=FALSE) + addComment(c("[INFO]Input filtering step:",length(rowToKeep),"remaining rows"),T,opt$log,display=FALSE) +} + + +addComment("[INFO]Ready to plot",T,opt$log,display=FALSE) + +##--------------------- + +#plot heatmap +if(expressionToCluster){ + #will make clustering based on expression value or generic value + dataToHeatMap=expressionMatrix + valueMeaning="Intensity" + if(!is.null(opt$genericData))valueMeaning="Value" +}else{ + #will make clustering on log2(FC) values + dataToHeatMap=matrix(comparisonMatrix[,seq(4,ncol(comparisonMatrix),nbColPerContrast)],ncol=nbComparisons,dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[seq(1,ncol(comparisonMatrix),nbColPerContrast)])) + valueMeaning="Log2(FC)" +} +addComment(c("[INFO]Dim of heatmap matrix:",dim(dataToHeatMap)),T,opt$log,display=FALSE) + +if(nrow(dataToHeatMap)==1 && ncol(dataToHeatMap)==1){ + addComment("[ERROR]Cannot make clustering with unique cell tab",T,opt$log,display=FALSE) + q( "no", 1, F ) +} + + +#apply data transformation if needed +if(opt$dataTransformation=="log"){ + dataToHeatMap=log(dataToHeatMap) + valueMeaning=paste(c("log(",valueMeaning,")"),collapse="") + addComment("[INFO]Data to cluster and to display in the heatmap are log transformed",T,opt$log,display=FALSE) +} +if(opt$dataTransformation=="log2"){ + dataToHeatMap=log2(dataToHeatMap) + valueMeaning=paste(c("log2(",valueMeaning,")"),collapse="") + addComment("[INFO]Data to cluster and to display in the heatmap are log2 transformed",T,opt$log,display=FALSE) +} + +maxRowsToDisplay=opt$maxRows + +nbClusters=opt$clusterNumber +if(nbClusters>nrow(dataToHeatMap)){ + #correct number of clusters if needed + nbClusters=nrow(dataToHeatMap) + addComment(c("[WARNING]Not enough rows to reach required clusters number, it is reduced to number of rows:",nbClusters),T,opt$log,display=FALSE) +} + +nbSampleClusters=opt$sampleClusterNumber +if(nbSampleClusters>ncol(dataToHeatMap)){ + #correct number of clusters if needed + nbSampleClusters=ncol(dataToHeatMap) + addComment(c("[WARNING]Not enough columns to reach required conditions clusters number, it is reduced to number of columns:",nbSampleClusters),T,opt$log,display=FALSE) +} + +colClust=FALSE +rowClust=FALSE +effectiveRowClust=FALSE + +#make appropriate clustering if needed +if(nrow(dataToHeatMap)>1 && nbClusters>1)rowClust=hclust(distExtended(dataToHeatMap,method = opt$distanceMeasure),method = opt$aggloMethod) +if(ncol(dataToHeatMap)>1 && nbSampleClusters>1)colClust=hclust(distExtended(t(dataToHeatMap),method = opt$distanceMeasure),method = opt$aggloMethod) + +if(nrow(dataToHeatMap)>maxRowsToDisplay){ + #make subsampling based on preliminary global clustering + #clusteringResults=cutree(rowClust,nbClusters) + #heatMapGenesToKeep=unlist(lapply(seq(1,nbClusters),function(x)sample(which(clusteringResults==x),min(length(which(clusteringResults==x)),round(maxRowsToDisplay/nbClusters))))) + ##OR + #basic subsampling + heatMapGenesToKeep=sample(rownames(dataToHeatMap),maxRowsToDisplay) + effectiveDataToHeatMap=matrix(dataToHeatMap[heatMapGenesToKeep,],ncol=ncol(dataToHeatMap),dimnames=list(heatMapGenesToKeep,colnames(dataToHeatMap))) + effectiveNbClusters=min(nbClusters,maxRowsToDisplay) + if(nrow(effectiveDataToHeatMap)>1 && effectiveNbClusters>1)effectiveRowClust=hclust(distExtended(effectiveDataToHeatMap, method = opt$distanceMeasure),method = opt$aggloMethod) + addComment(c("[WARNING]Too many rows for efficient heatmap drawing",maxRowsToDisplay,"subsampling is done for vizualization only"),T,opt$log,display=FALSE) + rm(heatMapGenesToKeep) +}else{ + effectiveDataToHeatMap=dataToHeatMap + effectiveRowClust=rowClust + effectiveNbClusters=nbClusters +} + +addComment(c("[INFO]Dim of plotted heatmap matrix:",dim(effectiveDataToHeatMap)),T,opt$log,display=FALSE) + +personalized_hoverinfo=matrix("",ncol = ncol(effectiveDataToHeatMap),nrow = nrow(effectiveDataToHeatMap),dimnames = dimnames(effectiveDataToHeatMap)) +if(expressionToCluster){ + for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\n",valueMeaning,": ",effectiveDataToHeatMap[iRow,iCol]),collapse="")}} +}else{ + for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\nFC: ",round(2^effectiveDataToHeatMap[iRow,iCol],2)),collapse="")}} +} + +#trying to overcome limitation of heatmaply package to modify xtick and ytick label, using directly plotly functions, but for now plotly do not permit to have personalized color for each x/y tick separately +test=FALSE +if(test==TRUE){ + + #define dendogram shapes + dd.row <- as.dendrogram(effectiveRowClust) + dd.col <- as.dendrogram(colClust) + + #and color them + dd.row=color_branches(dd.row, k = effectiveNbClusters, groupLabels = T) + dd.col=color_branches(dd.col, k = nbSampleClusters, groupLabels = T) + + #generating function for dendogram from segment list + ggdend <- function(df) { + ggplot() + + geom_segment(data = df, aes(x=x, y=y, xend=xend, yend=yend)) + + labs(x = "", y = "") + theme_minimal() + + theme(axis.text = element_blank(), axis.ticks = element_blank(), + panel.grid = element_blank()) + } + + # generate x/y dendogram plots + px <- ggdend(dendro_data(dd.col)$segments) + py <- ggdend(dendro_data(dd.row)$segments) + coord_flip() + + # reshape data matrix + col.ord <- order.dendrogram(dd.col) + row.ord <- order.dendrogram(dd.row) + xx <- effectiveDataToHeatMap[row.ord, col.ord] + # and also personalized_hoverinfo + personalized_hoverinfo=personalized_hoverinfo[row.ord, col.ord] + + # hide axis ticks and grid lines + eaxis <- list( + showticklabels = FALSE, + showgrid = FALSE, + zeroline = FALSE + ) + + #make the empty plot + p_empty <- plot_ly() %>% + layout(margin = list(l = 200), + xaxis = eaxis, + yaxis = eaxis) + + heatmap.plotly <- plot_ly( + z = xx, x = 1:ncol(xx), y = 1:nrow(xx), colors = viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno"), + type = "heatmap", showlegend = FALSE, text = personalized_hoverinfo, hoverinfo = "text", + colorbar = list( + # Capitalise first letter + title = valueMeaning, + tickmode = "array", + len = 0.3 + ) + ) %>% + layout( + xaxis = list( + tickfont = list(size = 10,color=get_leaves_branches_col(dd.row)), + tickangle = 45, + tickvals = 1:ncol(xx), ticktext = colnames(xx), + linecolor = "#ffffff", + range = c(0.5, ncol(xx) + 0.5), + showticklabels = TRUE + ), + yaxis = list( + tickfont = list(size = 10, color=get_leaves_branches_col(dd.col)), + tickangle = 0, + tickvals = 1:nrow(xx), ticktext = rownames(xx), + linecolor = "#ffffff", + range = c(0.5, nrow(xx) + 0.5), + showticklabels = TRUE + ) + ) + + #generate plotly + pp <- subplot(px, p_empty, heatmap.plotly, py, nrows = 2, margin = 0,widths = c(0.8,0.2),heights = c(0.2,0.8), shareX = TRUE, + shareY = TRUE) + + #save image file + export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + #rise a bug due to token stuf + #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + + + #save plotLy file + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F) + + #htmlwidgets::saveWidget(as_widget(pp),"~/Bureau/test.html",selfcontained = F) + +}else{ #test + label_names=c("Probe","Condition",valueMeaning) + + # #color hclust objects + # dd.row=color_branches(effectiveRowClust, k = effectiveNbClusters) + # #rowColors=get_leaves_branches_col(dd.row) + # #rowColors[order.dendrogram(dd.row)]=rowColors + # rowGroup=cutree(effectiveRowClust, k = effectiveNbClusters) + # + # #get order of class as they will be displayed on the dendogram + # rowGroupRenamed=data.frame(cluster=mapvalues(rowGroup, unique(rowGroup[order.dendrogram(dd.row)[nleaves(dd.row):1]]), 1:effectiveNbClusters)) + # + # dd.col=color_branches(colClust, k = nbSampleClusters) + # #colColors=get_leaves_branches_col(dd.col) + # #colColors[order.dendrogram(dd.col)]=colColors + # colGroup=cutree(colClust, k = nbSampleClusters) + # + # # #get order of class as they will be displayed on the dendogram + # colGroupRenamed=data.frame(sampleCluster=mapvalues(colGroup, unique(colGroup[order.dendrogram(dd.col)[nleaves(dd.col):1]]), 1:nbSampleClusters)) + + + #while option is not correctly managed by heatmap apply, put personalized_hoverinfo to NULL + personalized_hoverinfo=NULL + + if(is.null(opt$personalColors)){ + heatmapColors=viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno") + }else{ + heatmapColors=personalColors + } + + colGroupRenamed=NULL + if(!is.null(factorInfoMatrix)){ + colGroupRenamed=eval(parse(text=(paste("data.frame(",factorBarColor,"=factorInfoMatrix[colnames(effectiveDataToHeatMap),2])",sep="")))) + sideBarGroupNb=length(table(factorInfoMatrix[colnames(effectiveDataToHeatMap),2])) + sideBarColorPaletteName="Spectral" + if(!is.null(opt$sideBarColorPalette) && opt$sideBarColorPalette%in%rownames(RColorBrewer::brewer.pal.info)){ + sideBarColorPaletteName=opt$sideBarColorPalette + } + sideBarColorPalette=setNames(colorRampPalette(RColorBrewer::brewer.pal(RColorBrewer::brewer.pal.info[sideBarColorPaletteName,"maxcolors"], sideBarColorPaletteName))(sideBarGroupNb),unique(factorInfoMatrix[colnames(effectiveDataToHeatMap),2])) + } + + if(!is.null(colGroupRenamed)){ + pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,col_side_colors=colGroupRenamed,col_side_palette=sideBarColorPalette,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors) + }else{ + pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors) + } + + + #save image file + export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + #rise a bug due to token stuf + #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + + + #save plotLy file + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F) + +} +addComment("[INFO]Heatmap drawn",T,opt$log,display=FALSE) + + +#plot circular heatmap +if(!class(effectiveRowClust)=="logical"){ + dendo=as.dendrogram(effectiveRowClust) + + if(is.null(opt$personalColors)){ + col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.01)), viridis(101,option = "inferno")) + }else{ + col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.5)), personalColors) + } + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/circularPlot.pdf"),collapse=""))}else{ + png(paste(c("./plotDir/circularPlot.png"),collapse="")) + } + + circos.par(cell.padding = c(0, 0, 0, 0), gap.degree = 5) + circos.initialize(c(rep("a",nrow(effectiveDataToHeatMap)),"b"),xlim=cbind(c(0,0),c(nrow(effectiveDataToHeatMap),5))) + circos.track(ylim = c(0, 1), bg.border = NA, panel.fun = function(x, y) { + if(CELL_META$sector.index=="a"){ + nr = ncol(effectiveDataToHeatMap) + nc = nrow(effectiveDataToHeatMap) + circos.text(1:nc- 0.5, rep(0,nc), adj = c(0, 0), + rownames(effectiveDataToHeatMap)[order.dendrogram(dendo)], facing = "clockwise", niceFacing = TRUE, cex = 0.3) + } + }) + + circos.track(ylim = c(0, ncol(effectiveDataToHeatMap)), bg.border = NA, panel.fun = function(x, y) { + + m = t(matrix(effectiveDataToHeatMap[order.dendrogram(dendo),],ncol=ncol(effectiveDataToHeatMap))) + col_mat = col_fun(m) + nr = nrow(m) + nc = ncol(m) + if(CELL_META$sector.index=="a"){ + for(i in 1:nr) { + circos.rect(1:nc - 1, rep(nr - i, nc), + 1:nc, rep(nr - i + 1, nc), + border = col_mat[i, ], col = col_mat[i, ]) + } + }else{ + circos.text(rep(1,nr), seq(nr,1,-1) , colnames(effectiveDataToHeatMap),cex = 0.3) + } + }) + + #dendo = color_branches(dendo, k = effectiveNbClusters, col = colorRampPalette(brewer.pal(12,"Set3"))(effectiveNbClusters)) + dendo = color_branches(dendo, k = effectiveNbClusters, col = rev(colorspace::rainbow_hcl(effectiveNbClusters))) + + + circos.track(ylim = c(0, attributes(dendo)$height), bg.border = NA, track.height = 0.25, + panel.fun = function(x, y) { + if(CELL_META$sector.index=="a")circos.dendrogram(dendo)} ) + + circos.clear() + ##add legend + lgd_links = Legend(at = seq(ceiling(min(effectiveDataToHeatMap)),floor(max(effectiveDataToHeatMap)),ceiling((floor(max(effectiveDataToHeatMap))-ceiling(min(effectiveDataToHeatMap)))/4)), col_fun = col_fun, + title_position = "topleft", grid_width = unit(5, "mm") ,title = valueMeaning) + + pushViewport(viewport(x = 0.85, y = 0.80, + width = 0.1, + height = 0.1, + just = c("left", "bottom"))) + grid.draw(lgd_links) + upViewport() + + + dev.off() + + addComment("[INFO]Circular heatmap drawn",T,opt$log,display=FALSE) + loc <- Sys.setlocale("LC_NUMERIC","C") +}else{ + addComment(c("[WARNING]Circular plot will not be plotted considering row or cluster number < 2"),T,opt$log,display=FALSE) +} +rm(effectiveDataToHeatMap,effectiveRowClust,effectiveNbClusters) + +#plot screeplot +if(class(rowClust)!="logical" && nrow(dataToHeatMap)>2){ + screePlotData=c() + for(iNbClusters in 2:(nbClusters+min(10,max(0,nrow(dataToHeatMap)-nbClusters)))){ + clusteringResults=cutree(rowClust,iNbClusters) + #clusteringResults=kmeans(dataToHeatMap,iNbClusters)$cluster + + #compute variance between each intra-class points amongst themselves (need at least 3 points by cluster) + #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>2){var(dist(dataToHeatMap[temp,]))}else{0}}))) ) + #compute variance between each intra-class points and fictive mean point (need at least 2 points by cluster) + #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ var(dist(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]))[1:length(temp)]) }else{0}}))) ) + if(ncol(dataToHeatMap)>1)screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((distExtended(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]),method = opt$distanceMeasure)[1:length(temp)])^2) }else{0}}))) ) + else screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((dataToHeatMap[temp,]-mean(dataToHeatMap[temp,]))^2) }else{0}}))) ) + } + + dataToPlot=data.frame(clusterNb=seq(2,length(screePlotData)+1),wcss=screePlotData) + p <- ggplot(data=dataToPlot, aes(clusterNb,wcss)) + geom_point(colour="#EE4444") + geom_line(colour="#DD9999") + + ggtitle("Scree plot") + theme_bw() + xlab(label="Cluster number") + ylab(label="Within cluster sum of squares") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") + + scale_x_continuous(breaks=seq(min(dataToPlot$clusterNb), max(dataToPlot$clusterNb), 1)) + + #save plotly files + pp <- ggplotly(p) + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/screePlot.pdf"),collapse=""))}else{ + png(paste(c("./plotDir/screePlot.png"),collapse="")) + } + plot(p) + dev.off() + + #save plotly files + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/screePlot.html"),collapse=""),selfcontained = F) + + addComment("[INFO]Scree plot drawn",T,opt$log,display=FALSE) +}else{ + addComment(c("[WARNING]Scree plot will not be plotted considering row number <= 2"),T,opt$log,display=FALSE) +} + +##---------------------- + +#filter output based on parameters + +rowToKeep=rownames(dataToHeatMap) +if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="output"){ + #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold))) + if(is.null(opt$geneListFiltering)){ + if(is.null(opt$genericData)){ + #diff. expression matrix + rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<=opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>=log2(opt$FCthreshold))))!=0)))) + }else{ + #generic filtering matrix + rowToKeep=rownames(comparisonMatrix) + if(!is.null(opt$comparisonNameLow)){ + restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0))))) + } + if(!is.null(opt$comparisonNameHigh)){ + restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]<opt$pvalThreshold))!=0))))) + } + } + }else{ + geneListFiltering=read.csv(opt$geneListFiltering,as.is = 1,header=F) + rowToKeep=unlist(c(geneListFiltering)) + } + if(!is.null(comparisonMatrix) && !all(rowToKeep%in%rownames(comparisonMatrix))){ + #should arrive only with user gene list filtering with diff.exp. results clustering + addComment("[WARNING] some genes of the user defined list are not in the diff. exp. input file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(comparisonMatrix)) + } + + if(expressionToCluster && !all(rowToKeep%in%rownames(expressionMatrix))){ + addComment("[WARNING] some genes selected by the output filter are not in the expression file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(expressionMatrix)) + } + addComment(c("[INFO]Output filtering step:",length(rowToKeep),"remaining rows"),T,opt$log,display=FALSE) +} + +#we add differential analysis info in output if it was directly used for clustering or when it was used for filtering with expression + +#in case of expression or generic data clustering without filtering based on external stats +if(expressionToCluster && is.null(comparisonMatrix)){ + if(length(rowToKeep)==0){ + addComment("[WARNING]No more gene after output filtering step, tabular output will be empty",T,opt$log,display=FALSE) + outputData=matrix(c("Gene","Cluster","noGene","noClustering"),ncol=2,nrow=2,byrow = TRUE) + }else{ + outputData=matrix(0,ncol=2,nrow=length(rowToKeep)+1) + outputData[1,]=c("Gene","Cluster") + outputData[2:(length(rowToKeep)+1),1]=rowToKeep + if(class(rowClust)!="logical" ){ + outputData[2:(length(rowToKeep)+1),2]=cutree(rowClust,nbClusters)[rowToKeep] + }else{ + outputData[2:(length(rowToKeep)+1),2]=0 + } + } +} + +#in case of generic data clustering with filtering based on generic external data +if(!is.null(opt$genericData) && !is.null(comparisonMatrix)){ + if(length(rowToKeep)==0){ + addComment("[WARNING]No more gene after output filtering step, tabular output will be empty",T,opt$log,display=FALSE) + outputData=matrix(c("Gene","Cluster","noGene","noClustering"),ncol=2,nrow=2,byrow = TRUE) + }else{ + outputData=matrix(0,ncol=2+nbComparisons,nrow=length(rowToKeep)+1) + outputData[1,]=c("Gene","Cluster",colnames(comparisonMatrix)) + outputData[2:(length(rowToKeep)+1),1]=rowToKeep + if(class(rowClust)!="logical" ){ + outputData[2:(length(rowToKeep)+1),2]=cutree(rowClust,nbClusters)[rowToKeep] + }else{ + outputData[2:(length(rowToKeep)+1),2]=0 + } + outputData[2:(length(rowToKeep)+1),3:(ncol(comparisonMatrix)+2)]=prettyNum(comparisonMatrix[rowToKeep,],digits=4) + } +} + +#in case of expression data clustering with filtering based on diff. exp. results or diff. exp. results clustering +if(is.null(opt$genericData) && !is.null(comparisonMatrix)){ + if(length(rowToKeep)==0){ + addComment("[WARNING]No more gene after output filtering step, tabular output will be empty",T,opt$log,display=FALSE) + outputData=matrix(0,ncol=3,nrow=3) + outputData[1,]=c("","","Comparison") + outputData[2,]=c("Gene","Info","Cluster") + outputData[3,]=c("noGene","noInfo","noClustering") + }else{ + outputData=matrix(0,ncol=3+nbComparisons*nbColPerContrast,nrow=length(rowToKeep)+2) + outputData[1,]=c("","","Comparison",rep(colnames(comparisonMatrix)[seq(1,ncol(comparisonMatrix),nbColPerContrast)],each=nbColPerContrast)) + outputData[2,]=c("Gene","Info","Cluster",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),nbComparisons)) + outputData[3:(length(rowToKeep)+2),1]=rowToKeep + outputData[3:(length(rowToKeep)+2),2]=comparisonMatrixInfoGene[rowToKeep] + if(class(rowClust)!="logical" ){ + outputData[3:(length(rowToKeep)+2),3]=cutree(rowClust,nbClusters)[rowToKeep] + }else{ + outputData[3:(length(rowToKeep)+2),3]=0 + } + outputData[3:(length(rowToKeep)+2),4:(ncol(comparisonMatrix)+3)]=prettyNum(comparisonMatrix[rowToKeep,],digits=4) + } +} + +addComment("[INFO]Formated output",T,opt$log,display=FALSE) +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +##---------------------- + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) + +#sessionInfo() + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/utils.R Fri Jun 26 09:38:23 2020 -0400 @@ -0,0 +1,143 @@ +# Copyright (c) 2011-2013 Trevor L. Davis <trevor.l.davis@stanford.edu> +# +# This file is free software: you may copy, redistribute and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 2 of the License, or (at your +# option) any later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +#extendedDist function to correlation measure +distExtended <- function(x,method) { + if(method %in% c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"))return(dist(x,method = method)) + if(method %in% c("pearson", "spearman", "kendall"))return(as.dist(1-cor(t(x),method=method))/2) + if(method %in% c("absPearson", "absSpearman", "absKendall"))return(as.dist(1-abs(cor(t(x),method=method)))) + return(NULL) +} + +##comment function to display message and optionnaly add it to log file + +addComment <- function(text,addToFile=FALSE,fileName=NULL,append=TRUE,display=TRUE){ + if(display)cat(paste(c(text,"\n"),collapse = " ")) + if(addToFile)write(paste(text,collapse = " "),fileName,append=append) +} + +printSessionInfo <- function(fileName=NULL,append=TRUE){ + addComment("[INFO]R session info :",T,fileName,display=FALSE) + tempInfo=sessionInfo() + write(paste(tempInfo$R.version$version.string),fileName,append=append) + write(paste("Platform",tempInfo$platform,sep = " : "),fileName,append=append) + write(paste("Running under",tempInfo$running,sep = " : "),fileName,append=append) + write(paste("Local variables",tempInfo$locale,sep = " : "),fileName,append=append) + write(paste("Attached base packages",paste(tempInfo$basePkgs,collapse = "; "),sep = " : "),fileName,append=append) + if(length(tempInfo$otherPkgs)>0){ + lineToPrint="" + for(iPack in tempInfo$otherPkgs){ + lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "") + } + write(paste("Other attached packages",lineToPrint,sep = " : "),fileName,append=append) + } + if(length(tempInfo$loadedOnly)>0){ + lineToPrint="" + for(iPack in tempInfo$loadedOnly){ + lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "") + } + write(paste("Loaded packages",lineToPrint,sep = " : "),fileName,append=append) + } +} + +##negative of a mathematical expression +negativeExpression <- function(expression){ + expression=gsub("\\+","_toMinus_",expression) + expression=gsub("\\-","+",expression) + expression=gsub("_toMinus_","-",expression) + if(substr(expression,1,1)!="-" && substr(expression,1,1)!="+"){ + expression=paste(c("-",expression),collapse="") + } + + return(expression) +} + +#' Returns file name of calling Rscript +#' +#' \code{get_Rscript_filename} returns the file name of calling Rscript +#' @return A string with the filename of the calling script. +#' If not found (i.e. you are in a interactive session) returns NA. +#' +#' @export +get_Rscript_filename <- function() { + prog <- sub("--file=", "", grep("--file=", commandArgs(), value=TRUE)[1]) + if( .Platform$OS.type == "windows") { + prog <- gsub("\\\\", "\\\\\\\\", prog) + } + prog +} + +#' Recursively sorts a list +#' +#' \code{sort_list} returns a sorted list +#' @param unsorted_list A list. +#' @return A sorted list. +#' @export +sort_list <- function(unsorted_list) { + for(ii in seq(along=unsorted_list)) { + if(is.list(unsorted_list[[ii]])) { + unsorted_list[[ii]] <- sort_list(unsorted_list[[ii]]) + } + } + unsorted_list[sort(names(unsorted_list))] +} + + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# +multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { + library(grid) + + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +}