Mercurial > repos > eschen42 > w4mcorcov
changeset 13:2ae2d26e3270 draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author | eschen42 |
---|---|
date | Wed, 12 Dec 2018 09:20:02 -0500 |
parents | ddaf84e15d06 |
children | 90708fdbc22d |
files | w4mcorcov.xml w4mcorcov_calc.R w4mcorcov_lib.R w4mcorcov_salience.R w4mcorcov_wrapper.R |
diffstat | 5 files changed, 640 insertions(+), 536 deletions(-) [+] |
line wrap: on
line diff
--- a/w4mcorcov.xml Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov.xml Wed Dec 12 09:20:02 2018 -0500 @@ -1,10 +1,10 @@ -<tool id="w4mcorcov" name="OPLS-DA_Contrasts" version="0.98.16"> +<tool id="w4mcorcov" name="OPLS-DA_Contrasts" version="0.98.17"> <description>OPLS-DA Contrasts of Univariate Results</description> <macros> <xml name="paramPairSigFeatOnly"> <param name="pairSigFeatOnly" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Retain only pairwise-significant features" - help="When this option is set to 'Yes', analysis will be performed including only features that differ significantly for the pair of levels being contrasted; when set to 'No', any feature that varies significantly across all levels will be included (i.e., exclude any feature that is not significantly different across all levels). See examples below." /> + help="When this option is set to 'Yes', analysis will be performed including only features scored by the univariate test as differing significantly for the pair of levels being contrasted; when set to 'No', any feature that varies significantly across all levels will be included (i.e., exclude only features not scored by the univariate test as significantly varying when all levels are considered). See examples below." /> </xml> <xml name="cplots"> <param name="cplot_y" label="C-plot Y-axis" type="select" help="Choose the Y-axis for C-plots."> @@ -17,14 +17,12 @@ <param name="cplot_o" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Produce orthogonal C-plot" help="When this option is set to 'Yes', correlation will be plotted against vip4 for orthogonal loadings." /> + <param name="fdr_features" type="text" value="ALL" + label="How many features for p-value calculation?" + help="Specify how many features should be used to perform family-wise error rate adjustment of p-values for covariance and correlation. If you were to eliminate features from the data matrix based on significance criteria prior to running this tool, you would want to include them in the count here to avoid underestimating the p-value. Specify 'ALL' to signify that all features that could impact p-value calculation are included in the data matrix."/> </xml> </macros> <requirements> - <!-- - <requirement type="package" version="3.4.1">r-base</requirement> - <requirement type="package" version="1.1_4">r-batch</requirement> - <requirement type="package" version="1.2.14">bioconductor-ropls</requirement> - --> <requirement type="package">r-base</requirement> <requirement type="package">r-batch</requirement> <requirement type="package" version="1.10.0">bioconductor-ropls</requirement> @@ -35,9 +33,9 @@ sampleMetadata_in '$sampleMetadata_in' variableMetadata_in '$variableMetadata_in' facC '$facC' - #if str( $signif_test.tesC ) == "none": - tesC "none" - pairSigFeatOnly "FALSE" + #if str( $signif_test.tesC ) == 'none': + tesC 'none' + pairSigFeatOnly 'FALSE' #else: tesC '$signif_test.tesC' pairSigFeatOnly '$signif_test.pairSigFeatOnly' @@ -45,20 +43,21 @@ levCSV '$levCSV' matchingC '$matchingC' labelFeatures '$labelFeatures' - #if str( $xplots.expPlot ) == "none": - cplot_p "FALSE" - cplot_o "FALSE" - cplot_y "correlation" - #else if str( $xplots.expPlot ) == "cplot": - cplot_p '$xplots.cplot_p' - cplot_o '$xplots.cplot_o' - cplot_y '$xplots.cplot_y' + #if str( $advanced.advancedFeatures ) == 'none': + fdr_features 'ALL' + cplot_p 'FALSE' + cplot_o 'FALSE' + cplot_y 'correlation' + #else if str( $advanced.advancedFeatures ) == 'advanced': + fdr_features '$advanced.fdr_features' + cplot_p '$advanced.cplot_p' + cplot_o '$advanced.cplot_o' + cplot_y '$advanced.cplot_y' #end if contrast_detail '$contrast_detail' contrast_corcov '$contrast_corcov' contrast_salience '$contrast_salience' - ]]></command> - + ]]></command> <inputs> <param name="dataMatrix_in" format="tabular" label="Data matrix file" type="data" help="variables ✖ samples" /> @@ -79,7 +78,7 @@ </sanitizer> </param> <conditional name="signif_test"> - <param name="tesC" label="Univariate significance-test" type="select" help="Either 'none' or the name of the statistical test that was run by the 'Univariate' tool to produce the variableMetadata file; that name must also be a portion of the column names in that file."> + <param name="tesC" label="Univariate significance-test" type="select" help="Either 'none' or the name of the statistical test that was run by the 'Univariate' tool to produce the variableMetadata file."> <option value="none">none - Display all features from variableMetadata (rather than choosing a subset based on significance in univariate testing)</option> <option value="ttest">ttest - Student's t-test (parametric test, qualitative factor with exactly 2 levels)</option> <option value="anova">anova - Analysis of variance (parametric test, qualitative factor with more than 2 levels)</option> @@ -101,7 +100,7 @@ </when> </conditional> <param name="levCSV" type="text" value="*" label="Levels of interest" - help="Comma-separated level-names (or comma-less regular expressions to match level-names) to consider in analysis; must match at least two levels; levels must be non-numeric; may include wild cards or regular expressions. Note that extra space characters will affect results - 'a,b' is correct, but 'a , b' is not and may fail or give different results."> + help="Comma-separated level-names (or comma-separated regular expressions to match level-names) to consider in analysis; must match at least two levels; levels must be non-numeric; may include wild cards or regular expressions. Note that extra space characters will affect results - when 'a,b' is correct, 'a, b' is not equivalent and likely will fail or give different results."> <sanitizer> <valid initial="string.letters"> <add preset="string.digits"/> @@ -134,15 +133,17 @@ <option value="regex">use regular expressions for matching level-names</option> </param> <param name="labelFeatures" type="text" value="3" - label="How many features having extreme loadings should be labelled on cov-vs.-cor plot" + label="How many features having extreme loadings should be labelled on cov-vs.-cor plot?" help="Specify the number of features at each of the loading-extremes that should be labelled (with the name of the feature) on the covariance-vs.-correlation plot; specify 'ALL' to label all features or '0' to label no features; this choice has no effect on the OPLS-DA loadings plot."/> - <conditional name="xplots"> - <param name="expPlot" label="Extra plots to include" type="select" help="Choosing 'none' hides further choices."> - <option value="none">Do not include additonal extra plots.</option> - <option value="cplot">Include C-plots (predictor-loading vs. 'vip4p' and orthogonal-loading versus 'vip4o')</option> + <conditional name="advanced"> + <param name="advancedFeatures" type="select" + label="Advanced (C-plots and customized p-value adjustment)" + help="Choose 'Do not include ...' to hides further choices."> + <option value="advanced">Include C-plots and customize p-value adjustment.</option> + <option value="none">Do not include additonal C-plots or customize p-value adjustment.</option> </param> <when value="none" /> - <when value="cplot"> + <when value="advanced"> <expand macro="cplots" /> </when> </conditional> @@ -151,8 +152,9 @@ <!-- pdf1: summaries of each contrasts, clearly labelled by level=pair name * first PCA score-plot - * then PLS score-plot - * then PLS S-PLOT; color in red features with VIP > 1; color in grey any non-pairwise-significant features, if these are included + * then OPLS score-plot + * then OPLS S-PLOT; color saturation increases with VIP + * then C-plots if requrested --> <data name="contrast_detail" format="pdf" label="${tool.name}_${variableMetadata_in.name}_detail" /> <!-- @@ -186,6 +188,7 @@ <param name="facC" value="k10"/> <param name="pairSigFeatOnly" value="FALSE"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="250"/> <param name="levCSV" value="k[12],k[3-4]"/> <param name="matchingC" value="regex"/> <output name="contrast_corcov"> @@ -201,16 +204,27 @@ <has_text text="level1Level2Sig" /> <!-- first matched line --> <has_text text="M349.2383T700" /> - <has_text text="-0.462909875" /> - <has_text text="-36.6668927" /> + <has_text text="-0.49037231902" /> + <has_text text="-2111932280.94" /> <has_text text="0.4914638" /> <has_text text="0.01302117" /> + <has_text text="-0.049216260" /> + <has_text text="-0.00152098716" /> + <has_text text="2.0603074801" /> + <has_text text="-0.60020597" /> + <has_text text="-0.3623876130" /> <!-- second matched line --> <has_text text="M207.9308T206" /> <has_text text="0.504885262" /> - <has_text text="5.97529097" /> + <has_text text="293403792" /> <has_text text="0.207196379" /> <has_text text="0.04438632" /> + <has_text text="0.020749097" /> + <has_text text="0.005184709" /> + <has_text text="1.47082346" /> + <has_text text="2.24325407" /> + <has_text text="0.38157919" /> + <has_text text="0.610536188" /> </assert_contents> </output> <output name="contrast_salience"> @@ -218,20 +232,12 @@ <!-- column-labels line --> <has_text text="featureID" /> <has_text text="salientLevel" /> - <has_text text="salientRCV" /> + <has_text text="salienceRCV" /> <has_text text="salience" /> - <!-- first matched line --> - <has_text text="M349.2383T700" /> - <has_text text="0.659554" /> - <has_text text="8.81866595" /> - <!-- second matched line --> - <has_text text="M207.9308T206" /> - <has_text text="0.0578578" /> - <has_text text="2.27527985" /> - <!-- third matched line --> - <has_text text="M211.0607T263" /> - <has_text text="9999" /> - <has_text text="12.87766096" /> + <!-- first three matched lines --> + <has_text text="M207.0654T373" /><has_text text="k4" /><has_text text="0.822733190" /><has_text text="134.087771" /><has_text text="3.9994434" /><has_text text="207.0654" /><has_text text="373" /> + <has_text text="M222.9585T226" /><has_text text="k2" /><has_text text="0.761200229" /><has_text text="87.3672719" /><has_text text="3.9995358" /><has_text text="222.9585" /><has_text text="226" /> + <has_text text="M235.0975T362" /><has_text text="k4" /><has_text text="0.209363850" /><has_text text="77.6255643" /><has_text text="3.99606600" /><has_text text="235.0975" /><has_text text="362" /> </assert_contents> </output> </test> @@ -244,6 +250,7 @@ <param name="facC" value="k10"/> <param name="pairSigFeatOnly" value="TRUE"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="ALL"/> <param name="levCSV" value="k[12],k[3-4]"/> <param name="matchingC" value="regex"/> <output name="contrast_corcov"> @@ -259,8 +266,8 @@ <has_text text="level1Level2Sig" /> <!-- first matched line --> <has_text text="M200.005T296" /> - <has_text text="-0.28035717" /> - <has_text text="-3.3573953" /> + <has_text text="0.0050579682" /> + <has_text text="2607493" /> <has_text text="0.1157346" /> <has_text text="0.0647860" /> </assert_contents> @@ -270,20 +277,12 @@ <!-- column-labels line --> <has_text text="featureID" /> <has_text text="salientLevel" /> - <has_text text="salientRCV" /> + <has_text text="salienceRCV" /> <has_text text="salience" /> - <!-- first matched line --> - <has_text text="M349.2383T700" /> - <has_text text="0.659554" /> - <has_text text="8.81866595" /> - <!-- second matched line --> - <has_text text="M207.9308T206" /> - <has_text text="0.0578578" /> - <has_text text="2.27527985" /> - <!-- third matched line --> - <has_text text="M211.0607T263" /> - <has_text text="9999" /> - <has_text text="12.87766096" /> + <!-- first three matched lines --> + <has_text text="M207.0654T373" /><has_text text="k4" /><has_text text="0.822733190" /><has_text text="134.087771" /><has_text text="3.9994434" /><has_text text="207.0654" /><has_text text="373" /> + <has_text text="M222.9585T226" /><has_text text="k2" /><has_text text="0.761200229" /><has_text text="87.3672719" /><has_text text="3.9995358" /><has_text text="222.9585" /><has_text text="226" /> + <has_text text="M235.0975T362" /><has_text text="k4" /><has_text text="0.209363850" /><has_text text="77.6255643" /><has_text text="3.99606600" /><has_text text="235.0975" /><has_text text="362" /> </assert_contents> </output> </test> @@ -295,6 +294,7 @@ <param name="tesC" value="none"/> <param name="facC" value="k10"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="ALL"/> <param name="levCSV" value="k[12],k[3-4]"/> <param name="matchingC" value="regex"/> <output name="contrast_corcov"> @@ -309,38 +309,29 @@ <has_text text="vip4o" /> <!-- first matched line --> <has_text text="M349.2383T700" /> - <has_text text="-0.4732226665" /> - <has_text text="-37.71066" /> + <has_text text="-0.499225" /> + <has_text text="-2135165209" /> <has_text text="0.5246766" /> <has_text text="0.0103341" /> <!-- second matched line --> <has_text text="M207.9308T206" /> <has_text text="0.4927151212" /> - <has_text text="5.86655640" /> + <has_text text="284608538" /> <has_text text="0.2111623" /> <has_text text="0.0488654" /> </assert_contents> </output> - <!-- test #4 --> <output name="contrast_salience"> <assert_contents> <!-- column-labels line --> <has_text text="featureID" /> <has_text text="salientLevel" /> - <has_text text="salientRCV" /> + <has_text text="salienceRCV" /> <has_text text="salience" /> - <!-- first matched line --> - <has_text text="M349.2383T700" /> - <has_text text="0.659554" /> - <has_text text="8.81866595" /> - <!-- second matched line --> - <has_text text="M207.9308T206" /> - <has_text text="0.0578578" /> - <has_text text="2.27527985" /> - <!-- third matched line --> - <has_text text="M211.0607T263" /> - <has_text text="9999" /> - <has_text text="12.87766096" /> + <!-- first three matched lines --> + <has_text text="M207.0654T373" /><has_text text="k4" /><has_text text="0.822733190" /><has_text text="134.087771" /><has_text text="3.9994434" /><has_text text="207.0654" /><has_text text="373" /> + <has_text text="M222.9585T226" /><has_text text="k2" /><has_text text="0.761200229" /><has_text text="87.3672719" /><has_text text="3.9995358" /><has_text text="222.9585" /><has_text text="226" /> + <has_text text="M235.0975T362" /><has_text text="k4" /><has_text text="0.209363850" /><has_text text="77.6255643" /><has_text text="3.99606600" /><has_text text="235.0975" /><has_text text="362" /> </assert_contents> </output> </test> @@ -352,6 +343,7 @@ <param name="tesC" value="none"/> <param name="facC" value="tissue_flowering"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="ALL"/> <param name="levCSV" value="*"/> <param name="matchingC" value="wildcard"/> <output name="contrast_corcov"> @@ -369,7 +361,7 @@ <has_text text="flower_yes" /> <has_text text="other" /> <has_text text="0.3499550705" /> - <has_text text="0.03526926" /> + <has_text text="11.609255" /> <has_text text="0.43664386" /> <has_text text="0.587701897" /> <has_text text="0.026082688" /> @@ -383,17 +375,14 @@ <!-- column-labels line --> <has_text text="featureID" /> <has_text text="salientLevel" /> - <has_text text="salientRCV" /> + <has_text text="salienceRCV" /> <has_text text="salience" /> <has_text text="mz" /> <has_text text="rt" /> <!-- first matched line --> - <has_text text="PM518T369" /> - <has_text text="flower_yes" /> - <has_text text="0.58655260" /> - <has_text text="4.414469" /> - <has_text text="518.1656" /> - <has_text text="368.59817" /> + <has_text text="NM517T428" /><has_text text="flower_yes" /><has_text text="0.02765631" /><has_text text="7.8343993" /><has_text text="1.27813793" /><has_text text="517.121714" /><has_text text="428.306854248" /> + <has_text text="NM517T426_1" /><has_text text="0.0290065" /><has_text text="7.7151305" /><has_text text="1.2758886" /><has_text text="517.09367125" /><has_text text="426.233886719" /> + <has_text text="NM516T284_2" /><has_text text="0.022883902" /><has_text text="7.6379724" /><has_text text="1.25603610" /><has_text text="516.082005177" /><has_text text="283.569198608" /> </assert_contents> </output> </test> @@ -405,6 +394,7 @@ <param name="tesC" value="none"/> <param name="facC" value="k._10"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="ALL"/> <param name="levCSV" value="k1,k.2"/> <param name="matchingC" value="none"/> <output name="contrast_corcov"> @@ -420,13 +410,13 @@ <!-- first matched line --> <has_text text="M349.2383T700" /> <has_text text="0.61594030" /> - <has_text text="37.76875778" /> + <has_text text="3489481837.9" /> <has_text text="0.54672558" /> <has_text text="0.3920409" /> <!-- second matched line --> <has_text text="M207.9308T206" /> <has_text text="-0.89716403" /> - <has_text text="-6.337903" /> + <has_text text="-585563327.7" /> <has_text text="0.270297" /> <has_text text="0.037661" /> </assert_contents> @@ -440,6 +430,7 @@ <param name="tesC" value="none"/> <param name="facC" value="k._10"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="ALL"/> <param name="levCSV" value="k_3,k-4"/> <param name="matchingC" value="none"/> <output name="contrast_corcov"> @@ -454,10 +445,15 @@ <has_text text="vip4o" /> <!-- first matched line --> <has_text text="M349.2383T700" /> - <has_text text="-0.331230562" /> - <has_text text="-2.47167915" /> - <has_text text="0.0892595" /> - <has_text text="0.0049228872" /> + <has_text text="-0.1221966" /> + <has_text text="-917311734" /> + <has_text text="0.0304592" /> + <has_text text="0.104748883" /> + <has_text text="-0.002736415" /> + <has_text text="-0.0113968" /> + <has_text text="0.387723" /> + <has_text text="-0.3812168081" /> + <has_text text="0.154611878" /> </assert_contents> </output> </test> @@ -469,6 +465,7 @@ <param name="tesC" value="none"/> <param name="facC" value="k._10"/> <param name="labelFeatures" value="3"/> + <param name="fdr_features" value="ALL"/> <param name="levCSV" value="k_3,k-4"/> <param name="matchingC" value="none"/> <output name="contrast_corcov"> @@ -484,14 +481,25 @@ <!-- k1 rejected by levCSV, leaving only k_3 and k-4 --> <not_has_text text="k1" /> <not_has_text text="other" /> + <!-- first matched line --> + <has_text text="M200.005T296" /> + <has_text text="-0.1829149760" /> + <has_text text="-115723402" /> + <has_text text="0.0892595" /> + <has_text text="0.00492288" /> + <has_text text="-0.00801895" /> + <has_text text="0.0005356178" /> + <has_text text="0.1848186" /> + <has_text text="-0.428802311" /> + <has_text text="0.0882045811" /> </assert_contents> </output> </test> </tests> <help><![CDATA[ -**Run PLS-DA Contrasts of Univariate Results** ----------------------------------------------- +**Run OPLS-DA Contrasts of Univariate Results** +----------------------------------------------- **Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) @@ -500,9 +508,9 @@ Motivation ---------- -OPLS-DA and the SIMCA S-PLOT (Wiklund *et al.*, 2008) may be employed to draw attention to metabolomic features that are potential biomarkers, i.e. features that are potentially useful to discriminate to which class a sample should be assigned (e.g. Sun *et al.*, 2016). Workflow4Metabolomics (W4M, Giacomoni *et al.*, 2014, Guitton *et al.*, 2017) provides a suite of tools for preprocessing and statistical analysis of LC-MS, GC-MS, and NMR metabolomics data; however, it does not (as of release 3.0) include a tool for making the equivalent of an S-PLOT. +OPLS-DA and the SIMCA S-PLOT (Wiklund *et al.*, 2008) may be employed to draw attention to metabolomic features that are potential biomarkers, i.e. features that are potentially useful when assigning a sample to one of two classes (e.g. Sun *et al.*, 2016). Workflow4Metabolomics (W4M, Giacomoni *et al.*, 2014, Guitton *et al.*, 2017) provides a suite of tools for preprocessing and statistical analysis of LC-MS, GC-MS, and NMR metabolomics data; however, it does not (as of release 3.2) include a tool for making the equivalent of an S-PLOT. -The S-PLOT is computed from mean-centered, pareto-scaled data. This plot presents the correlation of the first score vector from an OPLS-DA model with the sample-variables used to produce that model versus the covariance of the scores with the sample-variables. For OPLS-DA, the first score vector represents the variation among the sample-variables that is related to the predictor (i.e., the contrasting factor). +The S-PLOT is computed from mean-centered, pareto-scaled data. This plot presents the correlation of the first score vector from an OPLS-DA model with the sample-variables used to produce that model versus the covariance of the scores with the sample-variables. For OPLS-DA, the first score vector represents the variation among the sample-variables that is related to the predictor (i.e., the contrasting factor); the second score vector, variation that is orthogonal to the predictor. The primary aims of this tool are: @@ -514,19 +522,19 @@ Description ----------- -The purpose of the 'PLS-DA Contrasts' tool is to visualize GC-MS or LC-MS features that are possible biomarkers. +The purpose of the 'OPLS-DA Contrasts' tool is to visualize GC-MS or LC-MS features that are possible biomarkers. The W4M 'Univariate' tool (Th]]>é<![CDATA[venot *et al.*, 2015) adds the results of family-wise corrected pairwise significance-tests as columns of the **variableMetadata** dataset. -For instance, suppose that you ran Kruskal-Wallis testing for a column named 'cluster' in sampleMetadata that has values 'k1' and 'k2' and at least one other value. +For instance, if Kruskal-Wallis testing were perfomred on a column named 'cluster' in sampleMetadata that has values 'k1' and 'k2' and at least one other value: - A column of variableMetadata would be labelled 'cluster_kruskal_sig' and would have values '1' and '0'; when the samples are grouped by 'cluster', '1' means that there is strong evidence against the hypothesis that there is no difference among the intensities for the feature across all sample-groups. - A column of variableMetadata would be labelled 'cluster_kruskal_k1.k2_sig' and would have values '1' and '0', where '1' means that there is significant evidence against the hypothesis that samples from sampleMetadata whose 'cluster' column contains 'k1' or 'k2' have the same intensity for that feature. -The 'PLS-DA Contrasts' tool produces graphics and data for OPLS-DA contrasts of feature-intensities between significantly different pairs of factor-levels. For each factor-level, the tool performs a contrast with all other factor-levels combined and then separately with each other factor-level. +The 'OPLS-DA Contrasts' tool produces graphics and data for OPLS-DA contrasts of feature-intensities between significantly different pairs of factor-levels. For each factor-level, the tool performs a contrast with all other factor-levels combined and then separately with each other factor-level. **Along the left-to-right axis, the plots show the supervised projection of the variation explained by the predictor** (i.e., the factor specified when invoking the tool); **the top-to-bottom axis displays the variation that is orthogonal to the predictor level** (i.e., independent of it). -Although this tool can be used in a purely exploratory manner by supplying the variableMetadata file without the columns added by the W4M 'Univariate' tool, **the preferred workflow may be to use univariate testing to exclude features that are not significantly different and then to use OPLS-DA to visualize the differences identified in univariate testing** (Th]]>é<![CDATA[venot *et al.*, 2015); an appropriate exception would be to visualize contrasts of a specific list of metabolites. +Although this tool can be used in a purely exploratory manner by supplying the variableMetadata file without the columns added by the W4M 'Univariate' tool, **a preferable workflow may be to use univariate testing to exclude features that are not significantly different and then to use OPLS-DA to visualize the differences identified in univariate testing** (Th]]>é<![CDATA[venot *et al.*, 2015); an appropriate exception would be to visualize contrasts of a specific list of metabolites. If you do exclude features, however, make sure that you set the advanced parameter "How many features for p-value calculation?" accordingly. It must be stressed that there may be no *single* definitive computational approach to select features that are reliable biomarkers, especially from a small number of samples or experiments. A few possible choices are: @@ -537,15 +545,13 @@ In this spirit, this tool reports the S-PLOT covariance and correlation (Wiklund *op. cit.*) and VIP metrics, and it introduces an informal "salience" metric to flag features that may merit attention without dimensional reduction; future versions may add selectivity ratio. -For a more systematic approach to biomarker identification, please consider the W4M 'biosigner' tool (Rinuardo *et al.* 2016), which applies three different identification metrics to the selection process. - -Regardless of how any potential biomarker is identified, further validation analysis (e.g., independent confirmatory experiments) is needed before it is recommended for general application. +For a more systematic approach to biomarker identification, please consider the W4M 'biosigner' tool (Rinuardo *et al.* 2016), which applies three different identification metrics to the selection process. Regardless of how any potential biomarker is identified, further validation analysis (e.g., independent confirmatory experiments) is needed before it can be recommended for general application. W4M Workflow Position --------------------- -- Upstream tool: **Univariate** (category: Statistical Analysis) or (not generally recommended) any **Preprocessing** tool that produces or updates a 'variableMetadata' file. +- Upstream tool: **Univariate** (category: Statistical Analysis) or any **Preprocessing** tool that produces or updates a 'variableMetadata' file. - Downstream tool categories: **Statistical Analysis** Input files @@ -616,49 +622,68 @@ | Specify the number of features at each of the loading-extremes that should be labelled (with the name of the feature) on the covariance-vs.-correlation plot; specify 'ALL' to label all features; this choice has no effect on the OPLS-DA loadings plot. | +[IN] (Advanced) C-plot Y-axis + | Choose whether C-plots should plot the correlation (the default) or the covariance *vs.* VIP. + | + +[IN] (Advanced) Produce predictor C-plot + | Choose whether a C-plot should be produced for the projections parallel to the predictor. + | + +[IN] (Advanced) Produce orthogonal C-plot + | Choose whether a C-plot should be produced for the projections orthogonal to the predictor. + | + +[IN] (Advanced) How many features for p-value calculation? + | You will need to use this option when statistical criteria have previously been applied to remove features in the data matrix. This is important for adjusting the p-values for correlation of the scores with each feature; this adjustment is necessary to avoid underestimation of the p-values. If this is applicable, specify the sum of the number of features removed and the number of features in the data matrix. + | + [OUT] Contrast-detail output PDF | File containing several plots for each two-projection OPLS-DA analysis. - (first row, left) **correlation-versus-covariance plot** of OPLS-DA results - - This is a work-alike for the S-PLOT, computed using formula in equations 1 and 2 from Wiklund, (*op. cit.*); - - point-color becomes saturated as the "variable importance in projection to the predictive components" (VIP\ :subscript:`4,p` from Galindo-Prieto *et al.* 2014) increases through the range from 0.83 and 1.21 (Mehmood *et al.* 2012), for use to identify features for consideration as biomarkers; - - plot symbols are diamonds when the adjusted p-value of correlation is greater than 0.05, circles when it is less than 0.01, and triangles when it between these limits. + - This is a work-alike for the S-PLOT described in Wiklund, (*op. cit.*), ignoring samples with missing values; + - point-color becomes saturated as the "variable importance in projection to the predictive components" (VIP\ :subscript:`4,p` from Galindo-Prieto *op. cit.*) ranges from 0.83 and 1.21 (Mehmood *et al.* 2012), for use to identify features for consideration as biomarkers; + - plot symbols are diamonds when the p-value of the correlation, adjusted for family-wise error rate (Yekutieli *et al.*, 2001), is greater than 0.05, circles when it is less than 0.01, and triangles when between 0.01 and 0.05. - (second row, left) **model-overview plot** for the two projections; grey bars are the correlation coefficient for the fitted data; black bars indicate performance in cross-validation tests (Th]]>é<![CDATA[venot, 2017) - (first row, right) OPLS-DA **scores-plot** for the two projections (Th]]>é<![CDATA[venot *et al.*, 2015) -- (second row, right) **correlation-versus-covariance plot** of OPLS-DA results **orthogonal to the predictor** (see section "S-Plot of Orthogonal Component" in Wiklund, *op. cit.*, pp. 120-121; this characterizes features with the greatest variation independent of the predictor). -- (third row, left, when "**predictor C-plot**" is chosen under "Extra plots to include") plot of the covariance or correlation vs. the VIP for the projection *parallel to the predictor*, for use to identify features for consideration as biomarkers. -- (third row, right, when "**orthogonal C-plot**" is chosen under "Extra plots to include") plotof the covariance or correlation vs. the VIP for the projection *orthogonal to the predictor*, for use to identify features varying considerably without regard to the predictor. +- (second row, right) **correlation-versus-covariance plot** of OPLS-DA results **orthogonal to the predictor** (see section "S-Plot of Orthogonal Component" in Wiklund, *op. cit.*, pp. 120-121; this characterizes variation of features that is *independent of the predictor*). +- (third row, left, when "**predictor C-plot**" is chosen under "Advanced") plot of the correlation (or covariance) vs. the VIP\ :subscript:`4,p` (Galindo-Prieto *op. cit.*), to assist in identifying features for consideration as biomarkers. +- (third row, right, when "**orthogonal C-plot**" is chosen under "Advanced") plot of the correlation (or covariance) vs. the VIP\ :subscript:`4,o` (*ibid.*), to assist in identifying features varying considerably without regard to the predictor. [OUT] Contrast Correlation-Covarinace data TABULAR | A tab-separated values file of metadata for each feature for each contrast in which it was included. | Thus, a given feature may appear many times, but *the combination of featureID, factorLevel1, and factorLevel2 will be unique.* | This file has the following columns: -- **featureID** - feature-identifier +- **featureID** - feature identifier - **factorLevel1** - factor-level 1 - **factorLevel2** - factor-level 2 (or "other" when contrasting factor-level 1 with all other levels) -- **correlation** - correlation of the features projection explaining the difference between the features, < 0 when intensity for level 1 is greater (from equation 2 in Wiklund, *op. cit.*). Note that, for a given contrast, there is a linear relationship between 'loadp' and 'correlation'. -- **covariance** - relative covariance of the features projection explaining the difference between the features, < 0 when intensity for level 1 is greater (from formula in *ibid.*, but scaled so that the greatest value has a magnitude of 1) +- **correlation**\ (t\ :subscript:`p`,X\ :subscript:`i`) - for this feature (i), correlation of sample intensities for this feature (X\ :subscript:`i`) with the OPLS-DA projection's first set of scores (t\ :subscript:`p`, i.e., the scores explaining the difference between the features), computed (omitting samples missing values) using the R *stats::cor* function with the 'pearson' method (R Core Team, 2018); this is negative when intensity for level 1 is greater than for level 2 +- **covariance**\ (t\ :subscript:`p`,X\ :subscript:`i`) - computed as for correlation but using the R *stats::cov* function (*ibid.*) in lieu of *stats::cor*; this is also negative when intensity for level 1 is greater than for level 2 - **vip4p** - "variable importance in projection" to the predictive projection, VIP\ :subscript:`4,p` (Galindo-Prieto *op. cit.*) - **vip4o** - "variable importance in projection" to the orthogonal projection, VIP\ :subscript:`4,o` (*ibid.*) - **loadp** - variable loading for the predictive projection (Wiklund *op. cit.*) - **loado** - variable loading for the orthogonal projection (*ibid.*) - **cor_p_val_raw** - p-value for Fisher-transformed correlation (Fisher, 1921; Snedecor, 1980; see also https://en.wikipedia.org/wiki/Fisher_transformation), with no family-wise error-rate correction. -- **cor_p_value** - p-value for Fisher-transformed correlation, adjusted for family-wise error rate (Yekutieli *et al.*, 2001). Caveat: any previous selection for features that vary notably by factor level may result in too little adjustment. -- **cor_ci_lower** - lower limit of 95% confidence interval for correlation (see e.g. https://en.wikipedia.org/wiki/Fisher_transformation) -- **cor_ci_upper** - upper limit of 95% confidence interval for correlation (*ibid.*) +- **cor_p_value** - p-value for Fisher-transformed correlation, adjusted for family-wise error rate (Yekutieli *et al.*, 2001). +- **cor_ci_lower** - lower limit of 95% confidence interval for correlation, based on cor_p_value +- **cor_ci_upper** - upper limit of 95% confidence interval for correlation, based on cor_p_value - **mz** - *m/z* ratio for feature, copied from input variableMetadata - **rt** - retention time for feature, copied from input variableMetadata -- **level1Level2Sig** - (Only present when a test other than "none" is chosen) '1' when feature varies significantly across all classes (i.e., not pair-wise); '0' otherwise +- **level1Level2Sig** (Only present when a test other than "none" is chosen) - '1' when feature varies significantly across all classes (i.e., not pair-wise); '0' otherwise [OUT] Feature "Salience" data TABULAR | Metrics for the "salient level" for each feature, i.e., the level at which the feature is more prominent than any other level. This is *not* at all related to the SIMCA OPLS-DA S-PLOT; rather, it is intended as a potential way to discover features for consideration as potential biomarkers without dimensionally reducting the data. This is a tab-separated values file having the following columns: - **featureID** - feature identifier - **salientLevel** - salient level, i.e., for the feature, the class-level having the greatest median intensity -- **salientRCV** - salient robust coefficient of variation, i.e., for the feature, the mean absolute deviation of the intensity for the salient level divided by the median intensity for the salient level +- **salienceRCV** - salience robust coefficient of variation, i.e., for the feature, the mean absolute deviation of the intensity for the salient level divided by the median intensity for the salient level +- **relativeSalientDistance** - relative salient distance, i.e., for the feature, the distance between the two highest class-level medians divided by the square root of the mean absolute deviations of those two class-level's intensities - **salience** - salience, i.e., for the feature, the median of the class-level having the greatest intensity divided by the mean of the medians for all class-levels +- **mz** - *m/z* ratio for feature, copied from input variableMetadata +- **rt** - retention time for feature, copied from input variableMetadata Wild card patterns to match level-names --------------------------------------- @@ -744,6 +769,8 @@ +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+ | Number of features having extreme loadings | ALL | +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+ + | How many features for p-value calculation? | 250 | + +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+ | Output primary table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_corcov.tsv | +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+ | Output salience table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_salience.tsv | @@ -768,6 +795,8 @@ +--------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ | Number of features having extreme loadings | 5 | +--------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ + | How many features for p-value calculation? | ALL | + +--------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ | Output primary table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_corcov_all.tsv | +--------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ | Output salience table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_salience_all.tsv | @@ -790,6 +819,8 @@ +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | Number of features having extreme loadings | 0 | +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ + | How many features for p-value calculation? | ALL | + +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | Output primary table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_corcov_global.tsv | +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | Output salience table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_salience_global.tsv | @@ -812,6 +843,8 @@ +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | Number of features having extreme loadings | 3 | +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ + | How many features for p-value calculation? | ALL | + +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | Output primary table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_corcov_lohi.tsv | +--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | Output salience table | https://raw.githubusercontent.com/HegemanLab/w4mcorcov_galaxy_wrapper/master/tools/w4mcorcov/test-data/expected_contrast_salience_lohi.tsv | @@ -830,6 +863,17 @@ <citations> <!-- this tool --> <citation type="doi">10.5281/zenodo.1034784</citation> + <!-- R project --> + <citation type="bibtex"><![CDATA[ + @Manual{, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2018}, + url = {https://www.R-project.org/}, + } + ]]></citation> <!-- Fisher_1921: Fisher z-transformation of correlation coefficient --> <citation type="bibtex"><![CDATA[ @article{Fisher_1921, @@ -887,7 +931,7 @@ address = {Roswell Park Cancer Institute}, } ]]></citation> - <!-- Wiklund_2008 OPLS PLS-DA and S-PLOT --> + <!-- Wiklund_2008 OPLS-DA and S-PLOT --> <citation type="doi">10.1021/ac0713510</citation> <!-- Yekutieli_2001 The control of the false discovery rate in multiple testing under dependency --> <citation type="doi">10.1214/aos/1013699998</citation>
--- a/w4mcorcov_calc.R Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov_calc.R Wed Dec 12 09:20:02 2018 -0500 @@ -1,12 +1,4 @@ -# center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/ -center_colmeans <- function(x) { - xcenter = colMeans(x) - x - rep(xcenter, rep.int(nrow(x), ncol(x))) -} - -#### OPLS-DA -algoC <- "nipals" - +# compute and output detail plots do_detail_plot <- function( x_dataMatrix , x_predictor @@ -21,7 +13,7 @@ off <- function(x) if (x_show_labels == "0") 0 else x if ( x_is_match && ncol(x_dataMatrix) > 0 - && length(unique(x_predictor))> 1 + && length(unique(x_predictor)) > 1 && x_crossval_i < nrow(x_dataMatrix) ) { my_oplsda <- opls( @@ -36,20 +28,22 @@ , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2. ) # strip out variables having negligible variance - x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE] + x_dataMatrix <- x_dataMatrix[ , names(my_oplsda@vipVn), drop = FALSE] my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y)) fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1] fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2] do_s_plot <- function( - x_env - , predictor_projection_x = TRUE - , cplot_x = FALSE - , cor_vs_cov_x = NULL - ) - { + x_env + , predictor_projection_x = TRUE + , cplot_x = FALSE + , cor_vs_cov_x = NULL + ) { if (cplot_x) { cplot_y_correlation <- (x_env$cplot_y == "correlation") + default_lim_x <- 10 + } else { + default_lim_x <- 1.2 } if (is.null(cor_vs_cov_x)) { my_cor_vs_cov <- cor_vs_cov( @@ -57,14 +51,17 @@ , ropls_x = my_oplsda , predictor_projection_x = predictor_projection_x , x_progress + , x_env ) } else { my_cor_vs_cov <- cor_vs_cov_x } - # str(my_cor_vs_cov) + if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) { if (is.null(cor_vs_cov_x)) { - x_progress("No cor_vs_cov data produced") + x_progress( + sprintf("No cor_vs_cov data produced for level %s versus %s", fctr_lvl_1, fctr_lvl_2) + ) } plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") text(x=1, y=1, labels="too few covariance data") @@ -76,216 +73,240 @@ min_x <- min(covariance, na.rm = TRUE) max_x <- max(covariance, na.rm = TRUE) lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs)) - covariance <- covariance / lim_x - lim_x <- 1.2 - # "It is generally accepted that a variable should be selected if vj>1, [27–29], + + # Regarding using VIP as a guide to selecting a biomarker: + # "It is generally accepted that a variable should be selected if vj>1, [27–29], # but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." # (Mehmood 2012 doi:10.1186/1748-7188-6-27) plus_cor <- correlation plus_cov <- covariance - cex <- 0.65 - which_projection <- if (projection == 1) "t1" else "o1" - which_loading <- if (projection == 1) "parallel" else "orthogonal" - if (projection == 1) { # predictor-projection - vipcp <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83))) - if (!cplot_x) { # S-plot predictor-projection - my_xlab <- "relative covariance(feature,t1)" - my_x <- plus_cov - my_ylab <- "correlation(feature,t1)" - my_y <- plus_cor - } else { # C-plot predictor-projection - my_xlab <- "variable importance in predictor-projection" - my_x <- vip4p - if (cplot_y_correlation) { + if (length(plus_cor) != 0 && length(plus_cor) != 0) { + cex <- 0.65 + if (projection == 1) { + # predictor-projection + vipcp <- pmax(0, pmin(1, (vip4p - 0.83) / (1.21 - 0.83))) + if (!cplot_x) { + # S-plot predictor-projection + my_xlab <- "covariance(feature,t1)" + my_x <- plus_cov my_ylab <- "correlation(feature,t1)" my_y <- plus_cor + # X,Y limits for S-PLOT + my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3)) + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) } else { - my_ylab <- "relative covariance(feature,t1)" - my_y <- plus_cov + # C-plot predictor-projection + my_xlab <- "variable importance in predictor-projection" + my_x <- vip4p + if (cplot_y_correlation) { + my_ylab <- "correlation(feature,t1)" + my_y <- plus_cor + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) + } else { + my_ylab <- "covariance(feature,t1)" + my_y <- plus_cov + my_ylim <- max(abs(plus_cov)) + my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) ) + } + # X,Y limits for C-plot + lim_x <- max(my_x, na.rm = TRUE) * 1.1 + lim_x <- min(lim_x, default_lim_x) + my_xlim <- c( 0, lim_x ) # + off(0.2) ) } - } - if (cplot_x) { - lim_x <- max(my_x, na.rm = TRUE) * 1.1 - my_xlim <- c( 0, lim_x + off(0.2) ) + my_load_distal <- loadp + my_load_proximal <- loado + red <- as.numeric(correlation > 0) * vipcp + blue <- as.numeric(correlation < 0) * vipcp + alpha <- 0.1 + 0.4 * vipcp + red[is.na(red)] <- 0 + blue[is.na(blue)] <- 0 + alpha[is.na(alpha)] <- 0 + my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha) + main_label <- sprintf("%s for level %s versus %s" + , x_prefix, fctr_lvl_1, fctr_lvl_2) } else { - my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) ) - } - my_ylim <- c( -1.0 - off(0.2), 1.0 + off(0.2) ) - my_load_distal <- loadp - my_load_proximal <- loado - red <- as.numeric(correlation > 0) * vipcp - blue <- as.numeric(correlation < 0) * vipcp - alpha <- 0.1 + 0.4 * vipcp - red[is.na(red)] <- 0 - blue[is.na(blue)] <- 0 - alpha[is.na(alpha)] <- 0 - my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha) - main_label <- sprintf("%s for level %s versus %s" - , x_prefix, fctr_lvl_1, fctr_lvl_2) - } else { # orthogonal projection - vipco <- pmax(0, pmin(1,(vip4o-0.83)/(1.21-0.83))) - if (!cplot_x) { - my_xlab <- "relative covariance(feature,to1)" - my_x <- -plus_cov - } else { - my_xlab <- "variable importance in orthogonal projection" - my_x <- vip4o - } - if (!cplot_x) { # S-plot orthogonal projection - my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) ) - my_ylab <- "correlation(feature,to1)" - my_y <- plus_cor - } else { # C-plot orthogonal projection - lim_x <- max(my_x, na.rm = TRUE) * 1.1 - my_xlim <- c( 0, lim_x + off(0.2) ) - if (cplot_y_correlation) { + # orthogonal projection + vipco <- pmax(0, pmin(1, (vip4o - 0.83) / (1.21 - 0.83))) + if (!cplot_x) { + # S-PLOT orthogonal-projection + my_xlab <- "covariance(feature,to1)" + my_x <- -plus_cov + # X,Y limits for S-PLOT + my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3)) my_ylab <- "correlation(feature,to1)" my_y <- plus_cor + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) } else { - my_ylab <- "relative covariance(feature,to1)" - my_y <- plus_cov + # C-plot orthogonal-projection + my_xlab <- "variable importance in orthogonal projection" + my_x <- vip4o + # C-plot orthogonal projection + # X,Y limits for C-plot + lim_x <- max(my_x, na.rm = TRUE) * 1.1 + my_xlim <- c( 0, lim_x ) # + off(0.2) ) + if (cplot_y_correlation) { + my_ylab <- "correlation(feature,to1)" + my_y <- plus_cor + my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) ) + } else { + my_ylab <- "covariance(feature,to1)" + my_y <- plus_cov + my_ylim <- max(abs(plus_cov)) + my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) ) + } } - } - my_ylim <- c( -1.0 - off(0.2), 1.0 + off(0.2) ) - my_load_distal <- loado - my_load_proximal <- loadp - alpha <- 0.1 + 0.4 * vipco - alpha[is.na(alpha)] <- 0 - my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha) - main_label <- sprintf( - "Features influencing orthogonal projection for %s versus %s" - , fctr_lvl_1, fctr_lvl_2) - } - main_cex <- min(1.0, 46.0/nchar(main_label)) - my_feature_label_slant <- -30 # slant feature labels 30 degrees downward - my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18) - plot( - y = my_y - , x = my_x - , type = "p" - , xlim = my_xlim - , ylim = my_ylim - , xlab = my_xlab - , ylab = my_ylab - , main = main_label - , cex.main = main_cex - , cex = cex - , pch = my_pch - , col = my_col - ) - low_x <- -0.7 * lim_x - high_x <- 0.7 * lim_x - if (projection == 1 && !cplot_x) { - text(x = low_x, y = -0.05, labels = fctr_lvl_1, col = "blue") - text(x = high_x, y = 0.05, labels = fctr_lvl_2, col = "red") - } - if ( x_show_labels != "0" ) { - names(my_load_distal) <- tsv1$featureID - names(my_load_proximal) <- tsv1$featureID - if ( x_show_labels == "ALL" ) { - n_labels <- length(my_load_distal) - } else { - n_labels <- as.numeric(x_show_labels) + my_load_distal <- loado + my_load_proximal <- loadp + alpha <- 0.1 + 0.4 * vipco + alpha[is.na(alpha)] <- 0 + my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha) + main_label <- sprintf( + "Features influencing orthogonal projection for %s versus %s" + , fctr_lvl_1, fctr_lvl_2) } - n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 ) - labels_to_show <- c( - names(head(sort(my_load_distal),n = n_labels)) - , names(tail(sort(my_load_distal),n = n_labels)) - ) - labels <- unname( - sapply( - X = tsv1$featureID - , FUN = function(x) if( x %in% labels_to_show ) x else "" + main_cex <- min(1.0, 46.0/nchar(main_label)) + my_feature_label_slant <- -30 # slant feature labels 30 degrees downward + my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18) + if ( sum(is.infinite(my_xlim)) == 0 ) { + plot( + y = my_y + , x = my_x + , type = "p" + , xlim = my_xlim + , ylim = my_ylim + , xlab = my_xlab + , ylab = my_ylab + , main = main_label + , cex.main = main_cex + , cex = cex + , pch = my_pch + , col = my_col ) - ) - x_text_offset <- 0.024 - y_text_off <- 0.017 - if (!cplot_x) { # S-plot - y_text_offset <- if (projection == 1) -y_text_off else y_text_off - } else { # C-plot - y_text_offset <- - sapply( - X = (my_y > 0) - , FUN = function(x) { if (x) y_text_off else -y_text_off } + low_x <- -0.7 * lim_x + high_x <- 0.7 * lim_x + if (projection == 1 && !cplot_x) { + text(x = low_x, y = -0.05, labels = fctr_lvl_1, col = "blue") + text(x = high_x, y = 0.05, labels = fctr_lvl_2, col = "red") + } + if ( x_show_labels != "0" ) { + names(my_load_distal) <- tsv1$featureID + names(my_load_proximal) <- tsv1$featureID + if ( x_show_labels == "ALL" ) { + n_labels <- length(my_load_distal) + } else { + n_labels <- as.numeric(x_show_labels) + } + n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 ) + labels_to_show <- c( + names(head(sort(my_load_distal), n = n_labels)) + , names(tail(sort(my_load_distal), n = n_labels)) + ) + labels <- unname( + sapply( + X = tsv1$featureID + , FUN = function(x) if ( x %in% labels_to_show ) x else "" + ) ) - } - label_features <- function(x_arg, y_arg, labels_arg, slant_arg) { - if (length(labels_arg) > 0) { - unique_slant <- unique(slant_arg) - if (length(unique_slant) == 1) { - text( - y = y_arg - , x = x_arg + x_text_offset - , cex = 0.4 - , labels = labels_arg - , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels - , srt = slant_arg - , adj = 0 # left-justified + x_text_offset <- 0.024 + y_text_off <- 0.017 + if (!cplot_x) { + # S-plot + y_text_offset <- if (projection == 1) -y_text_off else y_text_off + } else { + # C-plot + y_text_offset <- + sapply( + X = (my_y > 0) + , FUN = function(x) { + if (x) y_text_off else -y_text_off + } + ) + } + label_features <- function(x_arg, y_arg, labels_arg, slant_arg) { + if (length(labels_arg) > 0) { + unique_slant <- unique(slant_arg) + if (length(unique_slant) == 1) { + text( + y = y_arg + , x = x_arg + x_text_offset + , cex = 0.4 + , labels = labels_arg + , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels + , srt = slant_arg + , adj = 0 # left-justified + ) + } else { + for (slant in unique_slant) { + text( + y = y_arg[slant_arg == slant] + , x = x_arg[slant_arg == slant] + x_text_offset + , cex = 0.4 + , labels = labels_arg[slant_arg == slant] + , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels + , srt = slant + , adj = 0 # left-justified + ) + } + } + } + } + if (!cplot_x) { + my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant + } else { + my_slant <- sapply( + X = (my_y > 0) + , FUN = function(x) if (x) 2 else -2 + ) * my_feature_label_slant + } + if (length(my_x) > 1) { + label_features( + x_arg = my_x [my_x > 0] + , y_arg = my_y [my_x > 0] - y_text_offset + , labels_arg = labels[my_x > 0] + , slant_arg = (if (!cplot_x) -my_slant else (my_slant)) ) - } else { - for (slant in unique_slant) { - text( - y = y_arg[slant_arg == slant] - , x = x_arg[slant_arg == slant] + x_text_offset - , cex = 0.4 - , labels = labels_arg[slant_arg == slant] - , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels - , srt = slant - , adj = 0 # left-justified + if (!cplot_x) { + label_features( + x_arg = my_x [my_x < 0] + , y_arg = my_y [my_x < 0] + y_text_offset + , labels_arg = labels[my_x < 0] + , slant_arg = my_slant ) } - } - } - } - if (!cplot_x) { - my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant + } else { + if (!cplot_x) { + my_slant <- (if (my_x > 1) -1 else 1) * my_slant + my_y_arg <- my_y + (if (my_x > 1) -1 else 1) * y_text_offset + } else { + my_slant <- (if (my_y > 1) -1 else 1) * my_slant + my_y_arg <- my_y + (if (my_y > 1) -1 else 1) * y_text_offset + } + label_features( + x_arg = my_x + , y_arg = my_y_arg + , labels_arg = labels + , slant_arg = my_slant + ) + } # end if (length(my_x) > 1) + } # end if ( x_show_labels != "0" ) } else { - my_slant <- sapply( - X = (my_y > 0) - , FUN = function(x) if (x) 2 else -2 - ) * my_feature_label_slant - } - if (length(my_x) > 1) { - label_features( - x_arg = my_x [my_x > 0] - , y_arg = my_y [my_x > 0] - y_text_offset - , labels_arg = labels[my_x > 0] - , slant_arg = (if (!cplot_x) -my_slant else (my_slant)) - ) - if (!cplot_x) { - label_features( - x_arg = my_x [my_x < 0] - , y_arg = my_y [my_x < 0] + y_text_offset - , labels_arg = labels[my_x < 0] - , slant_arg = my_slant - ) - } - } else { - if (!cplot_x) { - my_slant <- (if (my_x > 1) -1 else 1) * my_slant - my_y_arg = my_y + (if (my_x > 1) -1 else 1) * y_text_offset - } else { - my_slant <- (if (my_y > 1) -1 else 1) * my_slant - my_y_arg = my_y + (if (my_y > 1) -1 else 1) * y_text_offset - } - label_features( - x_arg = my_x - , y_arg = my_y_arg - , labels_arg = labels - , slant_arg = my_slant - ) - } - } - } - ) + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no S-plot is possible") + } # end if (sum(is.infinte(my_xlim))==0) + } else { + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no S-plot is possible") + } # end if (length(plus_cor) != 0 && length(plus_cor) != 0) + } # end action + ) # end with( my_cor_vs_cov, action ) return (my_cor_vs_cov) - } + } # end function do_s_plot my_cor_vs_cov <- do_s_plot( x_env = x_env , predictor_projection_x = TRUE , cplot_x = FALSE ) - typeVc <- c("correlation", # 1 + typevc <- c("correlation", # 1 "outlier", # 2 "overview", # 3 "permutation", # 4 @@ -299,36 +320,37 @@ "xy-weight" # 12 ) # [c(3,8,9)] # [c(4,3,8,9)] if ( length(my_oplsda@orthoVipVn) > 0 ) { - my_typevc <- typeVc[c(9,3,8)] + my_typevc <- typevc[c(9,3,8)] } else { my_typevc <- c("(dummy)","overview","(dummy)") } my_ortho_cor_vs_cov_exists <- FALSE for (my_type in my_typevc) { - if (my_type %in% typeVc) { + if (my_type %in% typevc) { tryCatch({ - if ( my_type != "x-loading" ) { - plot( - x = my_oplsda - , typeVc = my_type - , parCexN = 0.4 - , parDevNewL = FALSE - , parLayL = TRUE - , parEllipsesL = TRUE - ) - if (my_type == "overview") { - sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2) - title(sub = sub_label) - } - } else { - my_ortho_cor_vs_cov <- do_s_plot( - x_env = x_env - , predictor_projection_x = FALSE - , cplot_x = FALSE - ) - my_ortho_cor_vs_cov_exists <- TRUE + if ( my_type != "x-loading" ) { + plot( + x = my_oplsda + , typeVc = my_type + , parCexN = 0.4 + , parDevNewL = FALSE + , parLayL = TRUE + , parEllipsesL = TRUE + ) + if (my_type == "overview") { + sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2) + title(sub = sub_label) + } + } else { + my_ortho_cor_vs_cov <- do_s_plot( + x_env = x_env + , predictor_projection_x = FALSE + , cplot_x = FALSE + ) + my_ortho_cor_vs_cov_exists <- TRUE + } } - }, error = function(e) { + , error = function(e) { x_progress( sprintf( "factor level %s or %s may have only one sample - %s" @@ -347,12 +369,17 @@ cplot_o <- x_env$cplot_o if (cplot_p || cplot_o) { if (cplot_p) { - do_s_plot( - x_env = x_env - , predictor_projection_x = TRUE - , cplot_x = TRUE - , cor_vs_cov_x = my_cor_vs_cov - ) + if (!is.null(my_cor_vs_cov)) { + do_s_plot( + x_env = x_env + , predictor_projection_x = TRUE + , cplot_x = TRUE + , cor_vs_cov_x = my_cor_vs_cov + ) + } else { + plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n") + text(x=1, y=1, labels="no predictor projection is possible") + } did_plots <- 1 } else { did_plots <- 0 @@ -405,10 +432,10 @@ # extract parameters from the environment vrbl_metadata <- calc_env$vrbl_metadata - vrbl_metadata_names <- vrbl_metadata[,1] + vrbl_metadata_names <- vrbl_metadata[, 1] smpl_metadata <- calc_env$smpl_metadata data_matrix <- calc_env$data_matrix - pairSigFeatOnly <- calc_env$pairSigFeatOnly + pair_significant_features_only <- calc_env$pairSigFeatOnly facC <- calc_env$facC tesC <- calc_env$tesC # extract the levels from the environment @@ -433,22 +460,27 @@ names(rt) <- vrbl_metadata$variableMetadata rt_lookup <- function(feature) unname(rt[feature]) - # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv) + # calculate salience_df as data.frame(feature, max_level, max_median, salience_rcv, mean_median, salience, salience_rcv) salience_df <- calc_env$salience_df <- w4msalience( data_matrix = data_matrix , sample_class = smpl_metadata[,facC] , failure_action = failure_action ) salience_tsv_action({ - my_df <- data.frame( - featureID = salience_df$feature - , salientLevel = salience_df$max_level - , salientRCV = salience_df$salient_rcv - , salience = salience_df$salience - , mz = mz_lookup(salience_df$feature) - , rt = rt_lookup(salience_df$feature) - ) - my_df[order(-my_df$salience),] + with ( + salience_df + , { + my_df <<- data.frame( + featureID = feature + , salientLevel = max_level + , salienceRCV = salience_rcv + , relativeSalientDistance = relative_salient_distance + , salience = salience + , mz = mz_lookup(feature) + , rt = rt_lookup(feature) + ) + }) + my_df[order(-my_df$relativeSalientDistance),] }) # transform wildcards to regexen @@ -560,8 +592,8 @@ x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = TRUE - , x_algorithm = algoC - , x_prefix = if (pairSigFeatOnly) { + , x_algorithm = "nipals" + , x_prefix = if (pair_significant_features_only) { "Significantly contrasting features" } else { "Significant features" @@ -627,15 +659,15 @@ } ) col_selector <- vrbl_metadata_names[ - if ( pairSigFeatOnly ) fully_significant else overall_significant + if (pair_significant_features_only) fully_significant else overall_significant ] my_matrix <- tdm[ chosen_samples, col_selector, drop = FALSE ] my_cor_cov <- do_detail_plot( x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match - , x_algorithm = algoC - , x_prefix = if (pairSigFeatOnly) { + , x_algorithm = "nipals" + , x_prefix = if (pair_significant_features_only) { "Significantly contrasting features" } else { "Significant features" @@ -668,7 +700,8 @@ } } } - } else { # tesC == "none" + } else { + # tesC == "none" # find all the levels for factor facC in sampleMetadata level_union <- unique(sort(smpl_metadata_facC)) # identify the selected levels for factor facC from sampleMetadata @@ -686,7 +719,6 @@ fctr_lvl_2 <- { if ( fctr_lvl_1 %in% completed ) return("DUMMY") - # strF(completed) completed <<- c(completed, fctr_lvl_1) setdiff(level_union, fctr_lvl_1) } @@ -717,7 +749,7 @@ x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match - , x_algorithm = algoC + , x_algorithm = "nipals" , x_prefix = "Features" , x_show_labels = labelFeatures , x_progress = progress_action @@ -770,7 +802,7 @@ x_dataMatrix = my_matrix , x_predictor = predictor , x_is_match = is_match - , x_algorithm = algoC + , x_algorithm = "nipals" , x_prefix = "Features" , x_show_labels = labelFeatures , x_progress = progress_action @@ -804,7 +836,7 @@ if (!did_plot) { failure_action( sprintf( - "bad parameter! sampleMetadata must have at least two levels of factor '%s' matching '%s'" + "Did not plot. Does sampleMetadata have at least two levels of factor '%s' matching '%s' ?" , facC, originalLevCSV)) return ( FALSE ) } @@ -821,12 +853,14 @@ , ropls_x , predictor_projection_x = TRUE , x_progress = print +, x_env ) { tryCatch({ - return( - cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress) - ) - }, error = function(e) { + return( + cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress, x_env) + ) + } + , error = function(e) { x_progress( sprintf( "cor_vs_cov fatal error - %s" @@ -842,7 +876,12 @@ , ropls_x # an instance of ropls::opls , predictor_projection_x = TRUE # TRUE for predictor projection; FALSE for orthogonal projection , x_progress = print # function to produce progress and error messages +, x_env ) { + my_matrix_x <- matrix_x + my_matrix_x[my_matrix_x==0] <- NA + fdr_features <- x_env$fdr_features + x_class <- class(ropls_x) if ( !( as.character(x_class) == "opls" ) ) { stop( @@ -866,12 +905,12 @@ # I used equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510 # (and not from the supplement despite the statement that, for the NIPALS algorithm, - # the equations from the supplement should be used) because of the definition of the + # the equations from the supplement should be used) because of the definition of the # Pearson/Galton coefficient of correlation is defined as # $$ # \rho_{X,Y}= \frac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y} # $$ - # as described (among other places) on Wikipedia at + # as described (among other places) on Wikipedia at # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_population # The equations in the supplement said to use, for the predictive component t1, # \rho_{t1,X_i}= \frac{\operatorname{cov}(t1,X_i)}{(\operatorname{mag}(t1))(\operatorname{mag}(X_i))} @@ -879,112 +918,116 @@ # perhaps my data are not centered exactly the same way that theirs were. # The correlations calculated here are in agreement with those calculated with the code from # page 22 of https://cran.r-project.org/web/packages/muma/muma.pdf - # I did transform covariance to "relative covariance" (relative to the maximum value) - # to keep the figures consistent with one another. + - # count the features (one column for each sample) - Nfeatures <- ncol(matrix_x) - # count the samples (one row for each sample) - Nobservations <- nrow(matrix_x) - # a one-dimensional magnitude function (i.e., take the vector norm) - vector_norm <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional)) - # calculate the standard deviation for each feature - sd_xi <- sapply(X = 1:Nfeatures, FUN = function(x) sd(matrix_x[,x])) + # count the features/variables (one column for each sample) + # count the features/variables (one column for each sample) + n_features <- ncol(my_matrix_x) + all_n_features <- x_env$fdr_features + if (length(grep("^[0-9][0-9]*$", all_n_features)) > 0) { + all_n_features <- as.integer(all_n_features) + } else { + all_n_features <- n_features + } + fdr_n_features <- max(n_features, all_n_features) + # print("n_features") + # print(n_features) + + # count the samples/observations (one row for each sample) + n_observations <- nrow(my_matrix_x) + # choose whether to plot the predictive score vector or orthogonal score vector if (predictor_projection_x) - score_matrix <- ropls_x@scoreMN + score_vector <- ropls_x@scoreMN else - score_matrix <- ropls_x@orthoScoreMN - # transpose the score (or orthoscore) vector for use as a premultiplier in covariance calculation - score_matrix_transposed <- t(score_matrix) - # compute the norm of the vector (i.e., the magnitude) - score_matrix_magnitude <- vector_norm(score_matrix) - # compute the standard deviation of the vector - score_matrix_sd <- sd(score_matrix) - # compute the relative covariance of each feature with the score vector + score_vector <- ropls_x@orthoScoreMN + + # compute the covariance of each feature with the score vector result$covariance <- - score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude ) + sapply( + X = 1:n_features + , FUN = function(i) { + cov(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs") + } + ) + # access covariance by feature name + names(result$covariance) <- colnames(my_matrix_x) + # compute the correlation of each feature with the score vector result$correlation <- - score_matrix_transposed %*% matrix_x / ( (Nobservations - 1) * ( score_matrix_sd * sd_xi ) ) - - # convert covariance and correlation from one-dimensional matrices to arrays of values, - # which are accessed by feature name below - p1 <- result$covariance <- result$covariance [ 1, , drop = TRUE ] - # x_progress("strF(p1)") - # x_progress(strF(p1)) + sapply( + X = 1:n_features + , FUN = function(i) { + cor(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs") + } + ) + # access correlation by feature name + names(result$correlation) <- colnames(my_matrix_x) - pcorr1 <- result$correlation <- result$correlation[ 1, , drop = TRUE ] - # x_progress("pearson strF(pcorr1)") - # x_progress(strF(pcorr1)) - # x_progress(typeof(pcorr1)) - # x_progress(str(pcorr1)) - - # # this is how to use Spearman correlation instead of pearson - # result$spearcor <- sapply( - # X = 1:Nfeatures - # , FUN = function(i) { - # stats::cor( - # x = as.vector(score_matrix) - # , y = as.vector(matrix_x[,i]) - # # , method = "spearman" - # , method = "pearson" - # ) - # } - # ) - # names(result$spearcor) <- names(p1) - # pcorr1 <- result$spearcor - # x_progress("spearman strF(pcorr1)") - # x_progress(strF(pcorr1)) - # x_progress(typeof(pcorr1)) - # x_progress(str(pcorr1)) - # pcorr1 <- result$correlation <- result$spearcor + # eliminate NAs in either correlation or covariance + nas <- is.na(result$correlation) | is.na(result$covariance) + featureID <- names(ropls_x@vipVn) + featureID <- featureID[!nas] + result$correlation <- result$correlation[!nas] + result$covariance <- result$covariance[!nas] + n_features <- length(featureID) - # correl.ci(r, n, a = 0.05, rho = 0) correl_pci <- lapply( - X = 1:Nfeatures - , FUN = function(i) correl.ci(r = pcorr1[i], n = Nobservations) + X = 1:n_features + , FUN = function(i) { + correl.ci( + r = result$correlation[i] + , n_obs = n_observations + , n_vars = fdr_n_features + ) + } ) result$p_value_raw <- sapply( - X = 1:Nfeatures + X = 1:n_features + , FUN = function(i) correl_pci[[i]]$p.value.raw + ) + result$p_value_raw[is.na(result$p_value_raw)] <- 1.0 + result$p_value <- sapply( + X = 1:n_features , FUN = function(i) correl_pci[[i]]$p.value ) - result$p_value_raw[is.na(result$p_value_raw)] <- 0.0 + result$p_value[is.na(result$p_value)] <- 1.0 result$ci_lower <- sapply( - X = 1:Nfeatures - , FUN = function(i) correl_pci[[i]]$CI['lower'] + X = 1:n_features + , FUN = function(i) correl_pci[[i]]$CI["lower"] ) result$ci_upper <- sapply( - X = 1:Nfeatures - , FUN = function(i) correl_pci[[i]]$CI['upper'] + X = 1:n_features + , FUN = function(i) correl_pci[[i]]$CI["upper"] ) # extract "variant 4 of Variable Influence on Projection for OPLS" (see Galindo_Prieto_2014, DOI 10.1002/cem.2627) # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) - result$vip4p <- as.numeric(ropls_x@vipVn) - result$vip4o <- as.numeric(ropls_x@orthoVipVn) + result$vip4p <- as.numeric(ropls_x@vipVn)[!nas] + result$vip4o <- as.numeric(ropls_x@orthoVipVn)[!nas] if (length(result$vip4o) == 0) result$vip4o <- NA # extract the loadings - result$loadp <- as.numeric(ropls_x@loadingMN) - result$loado <- as.numeric(ropls_x@orthoLoadingMN) + result$loadp <- as.numeric(ropls_x@loadingMN)[!nas] + result$loado <- as.numeric(ropls_x@orthoLoadingMN)[!nas] # get the level names level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) fctr_lvl_1 <- level_names[1] fctr_lvl_2 <- level_names[2] - feature_count <- length(ropls_x@vipVn) - result$level1 <- rep.int(x = fctr_lvl_1, times = feature_count) - result$level2 <- rep.int(x = fctr_lvl_2, times = feature_count) + result$level1 <- rep.int(x = fctr_lvl_1, times = n_features) + result$level2 <- rep.int(x = fctr_lvl_2, times = n_features) greaterLevel <- sapply( X = result$correlation - , FUN = function(my_corr) + , FUN = function(my_corr) { tryCatch({ - if ( is.nan( my_corr ) ) { - NA + if ( is.na(my_corr) || ! is.numeric( my_corr ) ) { + NA } else { if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 } - }, error = function(e) { + } + , error = function(e) { + print(my_corr) x_progress( sprintf( "cor_vs_cov -> sapply: error - substituting NA - %s" @@ -992,16 +1035,11 @@ ) ) NA - }) + } + ) + } ) - # begin fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1 - featureID <- names(ropls_x@vipVn) - greaterLevel <- greaterLevel[featureID] - result$correlation <- result$correlation[featureID] - result$covariance <- result$covariance[featureID] - # end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1 - # build a data frame to hold the content for the tab-separated values file tsv1 <- data.frame( featureID = featureID @@ -1016,8 +1054,8 @@ , loadp = result$loadp , loado = result$loado , cor_p_val_raw = result$p_value_raw - , cor_p_value = p.adjust(p = result$p_value_raw, method = "BY") - , cor_ci_lower = result$ci_lower + , cor_p_value = result$p_value + , cor_ci_lower = result$ci_lower , cor_ci_upper = result$ci_upper ) rownames(tsv1) <- tsv1$featureID @@ -1039,7 +1077,7 @@ tsv1 <- tsv1[!is.na(tsv1$covariance),] superresult$tsv1 <- tsv1 - # # I did not include these but left them commentd out in case future + # # I did not include these but left them commentd out in case future # # consumers of this routine want to use it in currently unanticipated ways # result$superresult <- superresult # result$oplsda <- ropls_x @@ -1059,22 +1097,28 @@ # which follows # https://en.wikipedia.org/wiki/Fisher_transformation#Definition -correl.ci <- function(r, n, a = 0.05, rho = 0) { - ## r is the calculated correlation coefficient for n pairs +correl.ci <- function(r, n_obs, n_vars, a = 0.05, rho = 0) { + ## r is the calculated correlation coefficient for n_obs pairs of observations of one variable ## a is the significance level ## rho is the hypothesised correlation zh0 <- atanh(rho) # 0.5*log((1+rho)/(1-rho)), i.e., Fisher's z-transformation for Ho zh1 <- atanh(r) # 0.5*log((1+r)/(1-r)), i.e., Fisher's z-transformation for H1 - se <- (1 - r^2)/sqrt(n - 3) ## standard error for Fisher's z-transformation of Ho + se <- (1 - r^2)/sqrt(n_obs - 3) ## standard error for Fisher's z-transformation of Ho test <- (zh1 - zh0)/se ### test statistic pvalue <- 2*(1 - pnorm(abs(test))) ## p-value - zL <- zh1 - qnorm(1 - a/2)*se - zH <- zh1 + qnorm(1 - a/2)*se - fishL <- tanh(zL) # (exp(2*zL)-1)/(exp(2*zL)+1), i.e., lower confidence limit - fishH <- tanh(zH) # (exp(2*zH)-1)/(exp(2*zH)+1), i.e., upper confidence limit - CI <- c(fishL, fishH) - names(CI) <- c('lower', 'upper') - list(correlation = r, p.value = pvalue, CI = CI) + pvalue.adj <- p.adjust(p = pvalue, method = "BY", n = n_vars) + z_L <- zh1 - qnorm(1 - a/2)*se + z_h <- zh1 + qnorm(1 - a/2)*se + fish_l <- tanh(z_L) # (exp(2*z_l)-1)/(exp(2*z_l)+1), i.e., lower confidence limit + fish_h <- tanh(z_h) # (exp(2*z_h)-1)/(exp(2*z_h)+1), i.e., upper confidence limit + ci <- c(fish_l, fish_h) + names(ci) <- c("lower", "upper") + list( + correlation = r + , p.value.raw = pvalue + , p.value = pvalue.adj + , CI = ci + ) } -# vim: sw=2 ts=2 et : +# vim: sw=2 ts=2 et ai :
--- a/w4mcorcov_lib.R Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov_lib.R Wed Dec 12 09:20:02 2018 -0500 @@ -1,3 +1,3 @@ suppressMessages(library(batch)) suppressMessages(library(ropls)) -suppressMessages(library(methods)) +#suppressMessages(library(methods))
--- a/w4mcorcov_salience.R Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov_salience.R Wed Dec 12 09:20:02 2018 -0500 @@ -19,14 +19,21 @@ failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") return (NULL) } + + feature_names <- colnames(t_data_matrix) + n_features <- ncol(t_data_matrix) - n_features_plus_1 <- 1 + n_features - features <- colnames(t_data_matrix) n_samples <- nrow(t_data_matrix) if ( length(sample_class) != n_samples ) { strF(data_matrix) strF(sample_class) - failure_action(sprintf("w4msalience: The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class))) + failure_action( + sprintf( + "w4msalience: The data_matrix has %d samples but sample_class has %d" + , n_samples + , length(sample_class) + ) + ) return (NULL) } # end sanity checks @@ -39,13 +46,6 @@ , FUN = "median" ) - # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'." - maxOfFeatureBySampleClassLevel <- aggregate( - x = as.data.frame(t_data_matrix) - , by = list(sample_class) - , FUN = "max" - ) - # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." # cv is less robust; deviation from normality degrades performance # cv(x) == sd(x) / mean(x) @@ -56,61 +56,72 @@ , by = list(sample_class) , FUN = "mad" ) - rcvOfFeatureBySampleClassLevel <- as.matrix( - madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1] - ) - rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) - # "For each feature, 'select max(max_feature_intensity) from feature'." - maxApplyMaxOfFeatureBySampleClassLevel <- sapply( - X = 1:n_features - , FUN = function(i) { - match( - max(maxOfFeatureBySampleClassLevel[, i + 1]) - , maxOfFeatureBySampleClassLevel[, i + 1] - ) - } - ) - - # "For each feature, 'select mean(median_feature_intensity) from feature'." - meanApplyMedianOfFeatureBySampleClassLevel <- sapply( - X = 1:n_features - , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1]) - ) + # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))` + # produces a matrix with two rows and ten columns - # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature - # is for one particular class-level relative to the intensity across all class-levels. - salience_df <- data.frame( - # the feature name - feature = features - # the name (or factor-level) of the class-level with the highest median intensity for the feature - , max_level = medianOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel,1] - # the median intensity for the feature and the level max_level - , max_median = sapply( - X = 1:n_features - , FUN = function(i) { - maxOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], 1 + i] - } + my_list <- apply( + X = array(1:n_features) + , MARGIN = 1 + , FUN = function(x) { + my_df <- data.frame( + median = medianOfFeatureBySampleClassLevel[ , 1 + x] + , mad = madOfFeatureBySampleClassLevel[ , 1 + x] + ) + my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1] + my_df <- my_df[ order(my_df$median, decreasing = TRUE), ] + my_dist_df <- my_df[ 1:2, ] + # "robust coefficient of variation", i.e., + # mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) + rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1] + dist_result <- + ( my_dist_df$median[1] - my_dist_df$median[2] ) / + sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] ) + if (is.infinite(dist_result) || is.nan(dist_result)) + dist_result <- 0 + mean_median <- mean(my_df$median) + salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0 + return ( + data.frame( + dist_result = dist_result + , max_median = my_df$median[1] + , mean_median = mean_median + , salience_result = salience_result + , salient_level = my_df$salient_level[1] + , rcv_result = rcv_result + ) + ) + } + ) + results_matrix <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]]) + results_df <- as.data.frame(t(results_matrix)) + + relative_salient_distance <- unlist(results_df$dist_result) + salience <- unlist(results_df$salience_result) + salient_level <- unlist(results_df$salient_level) + max_median <- unlist(results_df$max_median) + mean_median <- unlist(results_df$mean_median) + rcv_result <- unlist(results_df$rcv_result) + + salience_df <- + data.frame( + # the feature name + feature = feature_names + # the name (or factor-level) of the class-level with the highest median intensity for the feature + , max_level = salient_level + # the median intensity for the feature and the level max_level + , max_median = max_median + # the distance between the maximum intensities for the feature at the two highest levels + , relative_salient_distance = relative_salient_distance + # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level + , salience_rcv = rcv_result + # the mean of the medians of intensity for all class-levels for the feature + , mean_median = mean_median + # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature + , salience = salience + # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) + , stringsAsFactors = FALSE ) - # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level - , max_rcv = sapply( - X = 1:n_features - , FUN = function(i) { - rcvOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], i] - } - ) - # the mean of the medians of intensity for all class-levels for the feature - , mean_median = meanApplyMedianOfFeatureBySampleClassLevel - # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) - , stringsAsFactors = FALSE - ) - # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature - salience_df$salience <- sapply( - X = 1:nrow(salience_df) - , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } ) - ) - # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) - salience_df$salient_rcv <- salience_df$max_rcv return (salience_df) }
--- a/w4mcorcov_wrapper.R Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov_wrapper.R Wed Dec 12 09:20:02 2018 -0500 @@ -73,6 +73,10 @@ ######## errorPrint(sessionInfo()) +errorCat("\nsearch path:") +errorPrint(search()) +# errorCat("\nCurrently loaded namespaces:\n") +# errorPrint(loadedNamespaces()) argVc <- unlist(parseCommandArgs(evaluate=FALSE)) errorCat("\n\n---\n\nArguments that were passed to R are as follows:\n") @@ -104,6 +108,7 @@ my_env$levCSV <- as.character(argVc["levCSV"]) my_env$matchingC <- as.character(argVc["matchingC"]) my_env$labelFeatures <- as.character(argVc["labelFeatures"]) # number of features to label at each extreme of the loadings or 'ALL' +my_env$fdr_features <- as.character(argVc["fdr_features"]) # number of features to consider when adjusting p-value, or 'ALL' my_env$cplot_o <- as.logical(argVc["cplot_o"]) # TRUE if orthogonal C-plot is requested my_env$cplot_p <- as.logical(argVc["cplot_p"]) # TRUE if parallel C-plot is requested my_env$cplot_y <- as.character(argVc["cplot_y"]) # Choice of covariance/correlation for Y-axis on C-plot