Mercurial > repos > recetox > waveica
changeset 10:821062fc5782 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit 2b8b1dcb2947c6503fd4f82904df708e4f88ea1d
author | recetox |
---|---|
date | Fri, 04 Jul 2025 09:43:22 +0000 |
parents | 6fc9f6dbcef5 |
children | |
files | macros.xml test-data/data_matrix.csv test-data/data_matrix.parquet test-data/data_matrix.tsv test-data/feature_table.csv test-data/feature_table.parquet test-data/feature_table.tsv test-data/metadata.csv test-data/metadata.parquet test-data/metadata.tsv test-data/normalized_data.csv test-data/normalized_data.parquet test-data/normalized_data.tsv test-data/normalized_data_nobatch.tsv test-data/sample_metadata.csv test-data/sample_metadata.parquet test-data/sample_metadata.tsv test-data/test10_output1.csv test-data/test10_output1.tsv test-data/test10_output2.tsv test-data/test1_output.csv test-data/test3_output.parquet test-data/test9_output2.parquet waveica.xml waveica_wrapper.R |
diffstat | 25 files changed, 342 insertions(+), 338 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Fri May 16 10:14:27 2025 +0000 +++ b/macros.xml Fri Jul 04 09:43:22 2025 +0000 @@ -26,10 +26,6 @@ </xrefs> </xml> - <xml name="input_data"> - <param type="data" name="data" label="Feature table" format="csv,tsv,tabular,parquet" help=""/> - </xml> - <xml name="wf"> <conditional name="wf"> <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="wavelet function and filter length [1] (see footnotes for more details)"> @@ -81,22 +77,15 @@ </when> </conditional> </xml> - <xml name="split_output"> - <param name = "keep_two_output" label="Output metadata and data matrix as separate files" type="boolean" checked="false" - truevalue="TRUE" falsevalue="FALSE" help="Keep two output files, one being the data matrix (feature table) and the second being the metadata table." /> - </xml> <xml name="outputs"> <outputs> <data name="normalized_data" format="tabular" label="Normalized table of ${on_string}"> <change_format> - <when input_dataset="data" attribute="ext" value="parquet" format="parquet" /> - </change_format> - </data> - <data name="metadata" format="tabular" label="Metadata table of ${on_string}"> - <filter>keep_two_output</filter> - <change_format> - <when input_dataset="data" attribute="ext" value="parquet" format="parquet" /> + <when input_dataset="data_matrix" attribute="ext" value="parquet" format="parquet" /> + <when input_dataset="data_matrix" attribute="ext" value="csv" format="csv" /> + <when input_dataset="data_matrix" attribute="ext" value="tsv" format="tsv" /> + <when input_dataset="data_matrix" attribute="ext" value="tabular" format="tabular" /> </change_format> </data> </outputs>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/data_matrix.csv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,6 @@ +sampleName,M85T34,M86T41,M86T518,M86T539 +VT_160120_002,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468 +VT_160120_004,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563 +VT_160120_006,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585 +VT_160120_008,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 +VT_160120_010,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/data_matrix.tsv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,6 @@ +sampleName M85T34 M86T41 M86T518 M86T539 +VT_160120_002 228520.06430737 35646729.21543971 2386896.97966461 1026645.83653468 +VT_160120_004 90217.384387202 35735702.457215995 2456290.69621518 1089246.46040563 +VT_160120_006 235656.75288383896 37021134.452711605 8873450.40260241 837856.449608585 +VT_160120_008 16622.9351783435 44302499.262606 2466946.89667101 994979.069689685 +VT_160120_010 62385.0742465736 44639738.0735709 2389372.85729467 954938.131337246
--- a/test-data/feature_table.csv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName,M85T34,M86T41,M86T518,M86T539 -VT_160120_002,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468 -VT_160120_004,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563 -VT_160120_006,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585 -VT_160120_008,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 -VT_160120_010,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- a/test-data/feature_table.tsv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName M85T34 M86T41 M86T518 M86T539 -VT_160120_002 228520.06430737 35646729.21543971 2386896.97966461 1026645.83653468 -VT_160120_004 90217.384387202 35735702.457215995 2456290.69621518 1089246.46040563 -VT_160120_006 235656.75288383896 37021134.452711605 8873450.40260241 837856.449608585 -VT_160120_008 16622.9351783435 44302499.262606 2466946.89667101 994979.069689685 -VT_160120_010 62385.0742465736 44639738.0735709 2389372.85729467 954938.131337246
--- a/test-data/metadata.csv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName,class,sampleType,injectionOrder,batch -VT_160120_002,sample,sample,1,1 -VT_160120_004,sample,sample,2,1 -VT_160120_006,sample,sample,3,1 -VT_160120_008,sample,sample,4,1 -VT_160120_010,sample,sample,5,1
--- a/test-data/metadata.tsv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName class sampleType injectionOrder batch -VT_160120_002 sample sample 1 1 -VT_160120_004 sample sample 2 1 -VT_160120_006 sample sample 3 1 -VT_160120_008 sample sample 4 1 -VT_160120_010 sample sample 5 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/normalized_data.csv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,5 @@ +id,VT_160120_002,VT_160120_004,VT_160120_006,VT_160120_008,VT_160120_010 +M85T34,355200.506508035,216897.826587868,362337.195084505,143303.377379009,189065.516447239 +M86T41,75115889.9077485,75204863.1495248,76490295.1450204,83771659.9549148,84108898.7658797 +M86T518,6101488.54615418,6170882.26270475,12588041.969092,6181538.46316058,6103964.42378424 +M86T539,2007379.02604984,2069979.64992079,1818589.63912375,1975712.25920485,1935671.32085241
--- a/test-data/normalized_data.tsv Fri May 16 10:14:27 2025 +0000 +++ b/test-data/normalized_data.tsv Fri Jul 04 09:43:22 2025 +0000 @@ -1,6 +1,6 @@ -sampleName class sampleType injectionOrder batch M85T34 M86T41 M86T518 M86T539 -VT_160120_002 sample sample 1 1 355200.506508035 75115889.9077485 6101488.54615418 2007379.02604984 -VT_160120_004 sample sample 2 1 216897.826587868 75204863.1495248 6170882.26270475 2069979.64992079 -VT_160120_006 sample sample 3 1 362337.195084504 76490295.1450204 12588041.969092 1818589.63912375 -VT_160120_008 sample sample 4 1 143303.377379009 83771659.9549148 6181538.46316058 1975712.25920485 -VT_160120_010 sample sample 5 1 189065.516447239 84108898.7658797 6103964.42378424 1935671.32085241 +sampleName M85T34 M86T41 M86T518 M86T539 +VT_160120_002 355200.506508035 75115889.9077485 6101488.54615418 2007379.02604984 +VT_160120_004 216897.826587868 75204863.1495248 6170882.26270475 2069979.64992079 +VT_160120_006 362337.195084505 76490295.1450204 12588041.969092 1818589.63912375 +VT_160120_008 143303.377379009 83771659.9549148 6181538.46316058 1975712.25920485 +VT_160120_010 189065.516447239 84108898.7658797 6103964.42378424 1935671.32085241
--- a/test-data/normalized_data_nobatch.tsv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName class sampleType injectionOrder M85T34 M86T41 M86T518 M86T539 -VT_160120_002 sample sample 1 -9795801.68327296 29546678.5668352 -6207890.55898405 -8941748.93595845 -VT_160120_004 sample sample 2 -9798910.74239713 29543569.5077111 -6210999.61810821 -8944857.99508262 -VT_160120_006 sample sample 3 -9797307.93141959 29545172.3186886 -6209396.80713068 -8943255.18410509 -VT_160120_008 sample sample 4 -9793706.69204694 29548773.5580612 -6205795.56775803 -8939653.94473244 -VT_160120_010 sample sample 5 -9800711.45464277 29541768.7954654 -6212800.33035386 -8946658.70732827
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample_metadata.csv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,6 @@ +sampleName,class,sampleType,injectionOrder,batch +VT_160120_002,sample,sample,1,1 +VT_160120_004,sample,sample,2,1 +VT_160120_006,sample,sample,3,1 +VT_160120_008,sample,sample,4,1 +VT_160120_010,sample,sample,5,1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample_metadata.tsv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,6 @@ +sampleName class sampleType injectionOrder batch +VT_160120_002 sample sample 1 1 +VT_160120_004 sample sample 2 1 +VT_160120_006 sample sample 3 1 +VT_160120_008 sample sample 4 1 +VT_160120_010 sample sample 5 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test10_output1.csv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,5 @@ +id,VT_160120_002,VT_160120_004,VT_160120_006,VT_160120_008,VT_160120_010 +M85T34,355200.506508035,216897.826587868,362337.195084505,143303.377379009,189065.516447239 +M86T41,75115889.9077485,75204863.1495248,76490295.1450204,83771659.9549148,84108898.7658797 +M86T518,6101488.54615418,6170882.26270475,12588041.969092,6181538.46316058,6103964.42378424 +M86T539,2007379.02604984,2069979.64992079,1818589.63912375,1975712.25920485,1935671.32085241
--- a/test-data/test10_output1.tsv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -id VT_160120_002 VT_160120_004 VT_160120_006 VT_160120_008 VT_160120_010 -M85T34 355200.506508035 216897.826587868 362337.195084504 143303.377379009 189065.516447239 -M86T41 75115889.9077485 75204863.1495248 76490295.1450204 83771659.9549148 84108898.7658797 -M86T518 6101488.54615418 6170882.26270475 12588041.969092 6181538.46316058 6103964.42378424 -M86T539 2007379.02604984 2069979.64992079 1818589.63912375 1975712.25920485 1935671.32085241
--- a/test-data/test10_output2.tsv Fri May 16 10:14:27 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName class sampleType injectionOrder batch -VT_160120_002 sample sample 1 1 -VT_160120_004 sample sample 2 1 -VT_160120_006 sample sample 3 1 -VT_160120_008 sample sample 4 1 -VT_160120_010 sample sample 5 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_output.csv Fri Jul 04 09:43:22 2025 +0000 @@ -0,0 +1,6 @@ +sampleName,M85T34,M86T41,M86T518,M86T539 +VT_160120_002,355200.506508035,75115889.9077485,6101488.54615418,2007379.02604984 +VT_160120_004,216897.826587868,75204863.1495248,6170882.26270475,2069979.64992079 +VT_160120_006,362337.195084505,76490295.1450204,12588041.969092,1818589.63912375 +VT_160120_008,143303.377379009,83771659.9549148,6181538.46316058,1975712.25920485 +VT_160120_010,189065.516447239,84108898.7658797,6103964.42378424,1935671.32085241
--- a/waveica.xml Fri May 16 10:14:27 2025 +0000 +++ b/waveica.xml Fri Jul 04 09:43:22 2025 +0000 @@ -1,4 +1,4 @@ -<tool id="waveica" name="WaveICA" version="@TOOL_VERSION@+galaxy9" profile="23.0"> +<tool id="waveica" name="WaveICA" version="@TOOL_VERSION@+galaxy10" profile="23.0"> <description>removal of batch effects for untargeted metabolomics data</description> <macros> <import>macros.xml</import> @@ -9,67 +9,49 @@ <requirements> <requirement type="package" version="@TOOL_VERSION@">r-recetox-waveica</requirement> <requirement type="package" version="8.0.0">r-arrow</requirement> + <requirement type="package" version="1.0.10">r-dplyr</requirement> </requirements> <command detect_errors="aggressive"><![CDATA[ - Rscript - -e 'source("${__tool_directory__}/waveica_wrapper.R")' - - #if $batch_correction.mode == "batchwise": - -e 'normalized_data <- waveica( - file = "$input_num.data", - #if $input_num.input_choice == "2": - metadata = "$input_num.metadata", - ext = "$input_num.data.ext,$input_num.metadata.ext", - transpose = $input_num.transpose_feature_table, - #else: - ext = "$input_num.data.ext", - #end if - wavelet_filter = "$wf.wavelet_filter", - wavelet_length = "$wf.wavelet_length", + Rscript -e "source('${__tool_directory__}/waveica_wrapper.R'); + normalized_data <- + #if $batch_correction.mode == 'batchwise': + waveica( + data_matrix_file = '$data_matrix', + sample_metadata_file = '$sample_metadata', + ft_ext = '$data_matrix.ext', + mt_ext = '$sample_metadata.ext', + wavelet_filter = '$wf.wavelet_filter', + wavelet_length = '$wf.wavelet_length', k = $k, t = $batch_correction.t, t2 = $batch_correction.t2, alpha = $alpha, - exclude_blanks = $exclude_blanks - )' - #else if $batch_correction.mode == "single_batch": - -e 'normalized_data <- waveica_singlebatch( - file = "$input_num.data", - #if $input_num.input_choice == "2": - metadata = "$input_num.metadata", - ext = "$input_num.data.ext,$input_num.metadata.ext", - transpose = 
$input_num.transpose_feature_table, - #else: - ext = "$input_num.data.ext", - #end if - wavelet_filter = "$wf.wavelet_filter", - wavelet_length = "$wf.wavelet_length", + exclude_blanks = $exclude_blanks, + transpose = $transpose_feature_table + ) + #else: + waveica_singlebatch( + data_matrix_file = '$data_matrix', + sample_metadata_file = '$sample_metadata', + ft_ext = '$data_matrix.ext', + mt_ext = '$sample_metadata.ext', + wavelet_filter = '$wf.wavelet_filter', + wavelet_length = '$wf.wavelet_length', k = $k, alpha = $alpha, cutoff = $batch_correction.cutoff, - exclude_blanks = $exclude_blanks - )' - #end if - - -e 'store_data(normalized_data, "$normalized_data", "$metadata", "$input_num.data.ext", $keep_two_output)' + exclude_blanks = $exclude_blanks, + transpose = $transpose_feature_table + ) + #end if + ;store_data(normalized_data, '$normalized_data', '$data_matrix.ext')" ]]></command> <inputs> - <conditional name="input_num"> - <param name="input_choice" type="select" label="Choose input files:"> - <option value="1" selected="true">1: intensity-by-feature table with metadata</option> - <option value="2">2: intensity-by-feature table and metadata table separately</option> - </param> - <when value="1"> - <expand macro="input_data"/> - </when> - <when value="2"> - <expand macro="input_data"/> - <param name="metadata" label="Input metadata" type="data" format="csv,tsv,tabular,parquet" help="" /> - <param name = "transpose_feature_table" label="Transpose feature table" type="boolean" checked="false" + <param name="data_matrix" type="data" label="Feature table" format="csv,tsv,tabular,parquet" help="Table of measured features for each sample."/> + <param name="sample_metadata" label="Input sample metadata" type="data" format="csv,tsv,tabular,parquet" help="Table with sample information (e.g., sample name, class, batch, injection order) for each sample." 
/> + <param name = "transpose_feature_table" label="Transpose feature table" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" help="Swap sample names with feature names as column headers (to fit recetox-aplcms outputs)." /> - </when> - </conditional> <param type="integer" value="20" name="k" label="Number of components to decompose" help="maximal component that ICA decomposes"/> <param type="float" value="0" name="alpha" label="Alpha" help="trade-off value between the independence of samples (temporal ICA) and variables (spatial ICA), should be between 0 and 1"/> <expand macro="wf"/> @@ -89,16 +71,14 @@ </when> </conditional> <param name="exclude_blanks" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" /> - <expand macro="split_output"/> </inputs> <expand macro="outputs"/> <tests> - <test expect_num_outputs="1"><!-- TEST 1 --> - <conditional name="input_num"> - <param name="data" value="input_data.csv" ftype="csv"/> - </conditional> + <test><!-- TEST 1: CSV input --> + <param name="data_matrix" value="data_matrix.csv" ftype="csv"/> + <param name="sample_metadata" value="sample_metadata.csv" ftype="csv"/> <param name="alpha" value="0"/> <param name="k" value="20"/> <conditional name="wf"> @@ -110,12 +90,11 @@ <param name="t" value="0.05"/> <param name="t2" value="0.05"/> </conditional> - <output name="normalized_data" file="normalized_data.tsv" ftype="tabular"/> + <output name="normalized_data" file="test1_output.csv" ftype="csv"/> </test> - <test expect_num_outputs="1"><!-- TEST 2 --> - <conditional name="input_num"> - <param name="data" value="input_data.tsv" ftype="tsv"/> - </conditional> + <test><!-- TEST 2: TSV input --> + <param name="data_matrix" value="data_matrix.tsv" ftype="tabular"/> + <param name="sample_metadata" value="sample_metadata.tsv" ftype="tabular"/> <param name="alpha" value="0"/> <param name="k" value="20"/> <conditional name="wf"> @@ 
-129,29 +108,9 @@ </conditional> <output name="normalized_data" file="normalized_data.tsv" ftype="tabular"/> </test> - <test expect_num_outputs="1"><!-- TEST 3 --> - <conditional name="input_num"> - <param name="data" value="input_data.parquet" ftype="parquet"/> - </conditional> - <param name="k" value="20"/> - <param name="alpha" value="0"/> - <conditional name="wf"> - <param name="wavelet_filter" value="d"/> - <param name="wavelet_length" value="2"/> - </conditional> - <conditional name="batch_correction"> - <param name="mode" value="batchwise"/> - <param name="t" value="0.05"/> - <param name="t2" value="0.05"/> - </conditional> - <output name="normalized_data" file="normalized_data.parquet" ftype="parquet"/> - </test> - <test expect_num_outputs="1"><!-- TEST 4 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - <param name="data" value="feature_table.csv" ftype="csv"/> - <param name="metadata" value="metadata.csv" ftype="csv"/> - </conditional> + <test><!-- TEST 3: Parquet input --> + <param name="data_matrix" value="data_matrix.parquet" ftype="parquet"/> + <param name="sample_metadata" value="sample_metadata.csv" ftype="csv"/> <param name="alpha" value="0"/> <param name="k" value="20"/> <conditional name="wf"> @@ -163,73 +122,40 @@ <param name="t" value="0.05"/> <param name="t2" value="0.05"/> </conditional> - <output name="normalized_data" file="normalized_data.tsv" ftype="tabular"/> + <output name="normalized_data" file="test3_output.parquet" compare="sim_size" delta="200" ftype="parquet"/> </test> - <test expect_num_outputs="1"><!-- TEST 5 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - <param name="data" value="feature_table.tsv" ftype="tabular"/> - <param name="metadata" value="metadata.tsv" ftype="tabular"/> - </conditional> - <output name="normalized_data" file="normalized_data.tsv" ftype="tabular"/> + <test expect_failure="true"><!-- TEST 4: NA data should fail --> + <param name="data_matrix" 
value="na_data.csv" ftype="csv"/> + <param name="sample_metadata" value="sample_metadata.csv" ftype="csv"/> </test> - <test expect_num_outputs="1"><!-- TEST 6 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - <param name="data" value="feature_table.parquet" ftype="parquet"/> - <param name="metadata" value="metadata.csv" ftype="csv"/> - </conditional> - <output name="normalized_data" file="normalized_data.parquet" compare="sim_size" delta="200" ftype="parquet"/> + <test expect_failure="true"><!-- TEST 5: Incomplete metadata should fail --> + <param name="data_matrix" value="data_matrix.csv" ftype="csv"/> + <param name="sample_metadata" value="incomplete_metadata_data.csv" ftype="csv"/> </test> - <test expect_num_outputs="1"><!-- TEST 7 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - <param name="data" value="feature_table_transpose_version.parquet" ftype="parquet"/> - <param name="metadata" value="metadata.parquet" ftype="parquet"/> - <param name="transpose_feature_table" value="TRUE"/> - </conditional> + + <test><!-- TEST 6 --> + <param name="data_matrix" value="feature_table_transpose_version.parquet" ftype="parquet"/> + <param name="sample_metadata" value="sample_metadata.parquet" ftype="parquet"/> + <param name="transpose_feature_table" value="TRUE"/> <output name="normalized_data" file="normalized_data.parquet" compare="sim_size" delta="200" ftype="parquet"/> </test> - <test expect_num_outputs="1"><!-- TEST 8 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - <param name="data" value="feature_table_transpose_version.csv" ftype="csv"/> - <param name="metadata" value="metadata.csv" ftype="csv"/> - <param name="transpose_feature_table" value="TRUE"/> - </conditional> - <output name="normalized_data" file="normalized_data.tsv" ftype="tabular"/> - </test> - <test expect_num_outputs="2"><!-- TEST 9 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - 
<param name="data" value="feature_table_transpose_version.parquet" ftype="parquet"/> - <param name="metadata" value="metadata.parquet" ftype="parquet"/> - <param name="transpose_feature_table" value="TRUE"/> - </conditional> - <param name="keep_two_output" value="TRUE"/> - <output name="normalized_data" file="test9_output1.parquet" ftype="parquet"/> - <output name="metadata" file="test9_output2.parquet" ftype="parquet"/> + <test><!-- TEST 7 --> + <param name="data_matrix" value="feature_table_transpose_version.csv" ftype="csv"/> + <param name="sample_metadata" value="sample_metadata.csv" ftype="csv"/> + <param name="transpose_feature_table" value="TRUE"/> + <output name="normalized_data" file="normalized_data.csv" ftype="csv"/> </test> - <test expect_num_outputs="2"><!-- TEST 10 --> - <conditional name="input_num"> - <param name="input_choice" value="2"/> - <param name="data" value="feature_table_transpose_version.csv" ftype="csv"/> - <param name="metadata" value="metadata.csv" ftype="csv"/> - <param name="transpose_feature_table" value="TRUE"/> - </conditional> - <param name="keep_two_output" value="TRUE"/> - <output name="normalized_data" file="test10_output1.tsv" ftype="tabular"/> - <output name="metadata" file="test10_output2.tsv" ftype="tabular"/> + <test><!-- TEST 8 --> + <param name="data_matrix" value="feature_table_transpose_version.parquet" ftype="parquet"/> + <param name="sample_metadata" value="sample_metadata.parquet" ftype="parquet"/> + <param name="transpose_feature_table" value="TRUE"/> + <output name="normalized_data" file="test9_output1.parquet" ftype="parquet"/> </test> - <test expect_failure="true"><!-- TEST 11 --> - <conditional name="input_num"> - <param name="data" value="na_data.csv" ftype="csv"/> - </conditional> - </test> - <test expect_failure="true"><!-- TEST 12 --> - <conditional name="input_num"> - <param name="data" value="incomplete_metadata_data.csv" ftype="csv"/> - </conditional> + <test><!-- TEST 9 --> + <param name="data_matrix" 
value="feature_table_transpose_version.csv" ftype="csv"/> + <param name="sample_metadata" value="sample_metadata.csv" ftype="csv"/> + <param name="transpose_feature_table" value="TRUE"/> + <output name="normalized_data" file="test10_output1.csv" ftype="csv"/> </test> </tests>
--- a/waveica_wrapper.R Fri May 16 10:14:27 2025 +0000 +++ b/waveica_wrapper.R Fri Jul 04 09:43:22 2025 +0000 @@ -1,65 +1,112 @@ -read_file <- function(file, metadata, ft_ext, mt_ext, transpose) { - data <- read_data(file, ft_ext) +# Read data from a file in the specified format (csv, tsv/tabular, or parquet) +read_data <- function(file, ext, transpose = FALSE) { + # Reads a data file based on its extension and returns a data frame. + if (ext == "csv") { + tryCatch( + { + data <- read.csv(file, header = TRUE) + }, + error = function(e) { + stop( + paste0( + "Failed to read as CSV. The file may not ", + "be a valid text file or may be corrupted: ", + file, "\nError: ", e$message + ) + ) + } + ) + } else if (ext == "tsv" || ext == "tabular") { + tryCatch( + { + data <- read.csv(file, header = TRUE, sep = "\t") + }, + error = function(e) { + stop( + paste0( + "Failed to read as TSV/tabular.", + "The file may not be a valid text file or may be corrupted: ", + file, "\nError: ", e$message + ) + ) + } + ) + } else if (ext == "parquet") { + data <- arrow::read_parquet(file) + } else { + stop(paste("Unsupported file extension or format for reading:", ext)) + } + original_first_colname <- colnames(data)[1] if (transpose) { col_names <- c("sampleName", data[[1]]) data <- tranpose_data(data, col_names) } - - if (!is.na(metadata)) { - mt_data <- read_data(metadata, mt_ext) - data <- merge(mt_data, data, by = "sampleName") - } - - return(data) + return(list(data = data, original_first_colname = original_first_colname)) } -read_data <- function(file, ext) { - if (ext == "csv") { - data <- read.csv(file, header = TRUE) - } else if (ext == "tsv" || ext == "tabular") { - data <- read.csv(file, header = TRUE, sep = "\t") - } else { - data <- arrow::read_parquet(file) - } - - return(data) -} - -waveica <- function(file, - metadata = NA, - ext, - transpose = FALSE, +# Main function for batchwise WaveICA normalization +waveica <- function(data_matrix_file, + sample_metadata_file, + 
ft_ext, + mt_ext, wavelet_filter, wavelet_length, k, t, t2, alpha, - exclude_blanks) { - # get input from the Galaxy, preprocess data - ext <- strsplit(x = ext, split = "\\,")[[1]] + exclude_blanks, + transpose = FALSE) { + # Reads feature and metadata tables, merges them, + # verifies columns, runs WaveICA, and returns normalized data. + read_features_response <- read_data( + data_matrix_file, ft_ext, + transpose + ) + features <- read_features_response$data + original_first_colname <- read_features_response$original_first_colname - ft_ext <- ext[1] - mt_ext <- ext[2] - - data <- read_file(file, metadata, ft_ext, mt_ext, transpose) + read_metadata_response <- read_data(sample_metadata_file, mt_ext) + metadata <- read_metadata_response$data required_columns <- c( "sampleName", "class", "sampleType", "injectionOrder", "batch" ) + + metadata <- dplyr::select(metadata, dplyr::all_of(required_columns)) + + # Ensure both tables have a sampleName column + if (!"sampleName" %in% colnames(features) || !"sampleName" %in% colnames(metadata)) { # nolint + stop("Both feature and metadata tables must contain a 'sampleName' column.") + } + data <- merge(metadata, features, by = "sampleName") + + data <- verify_input_dataframe(data, required_columns) data <- sort_by_injection_order(data) - # separate data into features, batch and group + # Separate features, batch, and group columns feature_columns <- colnames(data)[!colnames(data) %in% required_columns] features <- data[, feature_columns] group <- enumerate_groups(as.character(data$sampleType)) batch <- data$batch - # run WaveICA + # Check that wavelet level is not too high for the number of samples + max_level <- floor(log2(nrow(features))) + requested_level <- as.numeric(wavelet_length) + if (requested_level > max_level) { + stop(sprintf( + paste0( + "Wavelet length/level (%d) is too high for ", + "the number of samples (%d). Maximum allowed is %d." 
+ ), + requested_level, nrow(features), max_level + )) + } + # Run WaveICA normalization features <- recetox.waveica::waveica( data = features, wf = get_wf(wavelet_filter, wavelet_length), @@ -70,47 +117,63 @@ t2 = t2, alpha = alpha ) + non_feature_columns <- setdiff(colnames(data), feature_columns) + # Update the data frame with normalized features data[, feature_columns] <- features - # remove blanks from dataset + # Optionally remove blank samples if (exclude_blanks) { data <- exclude_group(data, group) } - - return(data) + data <- final_data_processing( + data, non_feature_columns, + transpose, original_first_colname + ) + data } -waveica_singlebatch <- function(file, - metadata = NA, - ext, - transpose = FALSE, +# Main function for single-batch WaveICA normalization +waveica_singlebatch <- function(data_matrix_file, + sample_metadata_file, + ft_ext, + mt_ext, wavelet_filter, wavelet_length, k, alpha, cutoff, - exclude_blanks) { - # get input from the Galaxy, preprocess data - ext <- strsplit(x = ext, split = "\\,")[[1]] + exclude_blanks, + transpose = FALSE) { + # Reads feature and metadata tables, merges them, + # verifies columns, runs WaveICA (single batch), and returns normalized data. 
+ read_features_response <- read_data(data_matrix_file, ft_ext, transpose) + features <- read_features_response$data + original_first_colname <- read_features_response$original_first_colname - ft_ext <- ext[1] - mt_ext <- ext[2] + read_data_response <- read_data(sample_metadata_file, mt_ext) + metadata <- read_data_response$data - data <- read_file(file, metadata, ft_ext, mt_ext, transpose) + # Ensure both tables have a sampleName column + if (!"sampleName" %in% colnames(features) || + !"sampleName" %in% colnames(metadata)) { # nolint + stop("Both feature and metadata tables must contain a 'sampleName' column.") + } + data <- merge(metadata, features, by = "sampleName") required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") optional_columns <- c("batch") - data <- verify_input_dataframe(data, required_columns) data <- sort_by_injection_order(data) - feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] + feature_columns <- colnames(data)[ + !colnames(data) %in% c(required_columns, optional_columns) + ] features <- data[, feature_columns] injection_order <- data$injectionOrder - # run WaveICA + # Run WaveICA normalization (single batch) features <- recetox.waveica::waveica_nonbatchwise( data = features, wf = get_wf(wavelet_filter, wavelet_length), @@ -119,45 +182,59 @@ alpha = alpha, cutoff = cutoff ) + non_feature_columns <- setdiff(colnames(data), feature_columns) + # Update the data frame with normalized features data[, feature_columns] <- features group <- enumerate_groups(as.character(data$sampleType)) - # remove blanks from dataset + # Optionally remove blank samples if (exclude_blanks) { data <- exclude_group(data, group) } - - return(data) + data <- final_data_processing( + data, non_feature_columns, + transpose, original_first_colname + ) + data } +# Sorts the data frame by batch and injection order (if batch exists), +# otherwise by injection order only sort_by_injection_order <- function(data) 
{ if ("batch" %in% colnames(data)) { - data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ] + data <- data[ + order(data[, "batch"], + data[, "injectionOrder"], + decreasing = FALSE + ), + ] } else { data <- data[order(data[, "injectionOrder"], decreasing = FALSE), ] } - return(data) + data } +# Verifies that required columns exist and that there are no missing values verify_input_dataframe <- function(data, required_columns) { if (anyNA(data)) { stop("Error: dataframe cannot contain NULL values! -Make sure that your dataframe does not contain empty cells") + \nMake sure that your dataframe does not contain empty cells") } else if (!all(required_columns %in% colnames(data))) { stop( "Error: missing metadata! -Make sure that the following columns are present in your dataframe: ", + \nMake sure that the following columns are present in your dataframe: ", paste(required_columns, collapse = ", ") ) } - data <- verify_column_types(data, required_columns) - - return(data) + data } +# Checks column types for required and feature columns +# and removes problematic feature columns verify_column_types <- function(data, required_columns) { - # Specify the column names and their expected types + # Checks that required columns have the correct type + # and removes non-numeric feature columns efficiently. 
column_types <- list( "sampleName" = c("character", "factor"), "class" = c("character", "factor", "integer"), @@ -165,120 +242,133 @@ "injectionOrder" = "integer", "batch" = "integer" ) - column_types <- column_types[required_columns] - for (col_name in names(data)) { + # Check required columns' types (fast, vectorized) + for (col_name in names(column_types)) { + if (!col_name %in% names(data)) next + expected_types <- column_types[[col_name]] actual_type <- class(data[[col_name]]) - if (col_name %in% names(column_types)) { - expected_types <- column_types[[col_name]] - - if (!actual_type %in% expected_types) { - stop( - "Column ", col_name, " is of type ", actual_type, - " but expected type is ", - paste(expected_types, collapse = " or "), "\n" - ) - } - } else { - if (actual_type != "numeric") { - data[[col_name]] <- as.numeric(as.character(data[[col_name]])) - } + if (!actual_type %in% expected_types) { + stop( + "Column ", col_name, " is of type ", actual_type, + " but expected type is ", + paste(expected_types, collapse = " or "), "\n" + ) } } - return(data) + + # Identify feature columns (not required columns) + feature_cols <- setdiff(names(data), required_columns) + # Try to convert all feature columns to numeric in one go + # as well as suppressing warnings + data[feature_cols] <- suppressWarnings( + lapply( + data[feature_cols], + function(x) as.numeric(as.character(x)) + ) + ) + # Find columns that are problematic (contain any NA after conversion) + na_counts <- vapply(data[feature_cols], function(x) any(is.na(x)), logical(1)) + removed_columns <- names(na_counts)[na_counts] + if (length(removed_columns) > 0) { + message( + "Removed problematic columns (non-numeric): ", + paste(removed_columns, collapse = ", ") + ) + } + + # Keep only good columns + keep_cols <- !(names(data) %in% removed_columns) + data <- data[, keep_cols, drop = FALSE] + data } -# Match group labels with [blank/sample/qc] and enumerate them +# Enumerates group labels: blank=0, 
sample=1, qc=2, standard=3 enumerate_groups <- function(group) { group[grepl("blank", tolower(group))] <- 0 group[grepl("sample", tolower(group))] <- 1 group[grepl("qc", tolower(group))] <- 2 group[grepl("standard", tolower(group))] <- 3 - - return(group) + group } -# Create appropriate input for R wavelets function +# Returns the correct wavelet filter string for the R wavelets function get_wf <- function(wavelet_filter, wavelet_length) { wf <- paste(wavelet_filter, wavelet_length, sep = "") - - # exception to the wavelet function + # Exception for Daubechies-2 if (wf == "d2") { wf <- "haar" } - - return(wf) + wf } -# Exclude blanks from a dataframe +# Removes blank samples (group==0) from the data frame exclude_group <- function(data, group) { row_idx_to_exclude <- which(group %in% 0) if (length(row_idx_to_exclude) > 0) { data_without_blanks <- data[-c(row_idx_to_exclude), ] cat("Blank samples have been excluded from the dataframe.\n") - return(data_without_blanks) + data_without_blanks } else { - return(data) + data } } -store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) { +# Stores the output data in the requested format (csv, tsv/tabular, parquet), +# optionally splitting metadata and features +store_data <- function(data, feature_output, ext) { if (ext == "parquet") { - if (split_output == TRUE) { - split_df <- split_output(data) - arrow::write_parquet(split_df$metadata, metadata_output) - arrow::write_parquet(split_df$feature_table, feature_output) - } else { - arrow::write_parquet(data, feature_output) - } + arrow::write_parquet(data, feature_output) + } else if (ext == "csv") { + write.csv(data, file = feature_output, row.names = FALSE, quote = FALSE) + } else if (ext == "tsv" || ext == "tabular") { + write.table(data, + file = feature_output, sep = "\t", + row.names = FALSE, quote = FALSE + ) } else { - if (split_output == TRUE) { - split_df <- split_output(data) - write.table(split_df$metadata, - file = 
metadata_output, sep = "\t", - row.names = FALSE, quote = FALSE - ) - write.table(split_df$feature_table, - file = feature_output, sep = "\t", - row.names = FALSE, quote = FALSE - ) - } else { - write.table(data, - file = feature_output, sep = "\t", - row.names = FALSE, quote = FALSE - ) - } + stop(paste("Unsupported file extension:", ext)) } cat("Normalization has been completed.\n") } -split_output <- function(df) { - required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch") - required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder") - - if (all(required_columns_set1 %in% colnames(df))) { - metadata_df <- df[, required_columns_set1, drop = FALSE] - df <- df[, -c(2:5)] - } else if (all(required_columns_set2 %in% colnames(df))) { - metadata_df <- df[, required_columns_set2, drop = FALSE] - df <- df[, -c(2:4)] - } else { - stop("Neither set of required columns is present in the dataframe.") - } - - # Transpose the feature table - col_names <- c("id", as.vector(df[[1]])) - feature_table <- tranpose_data(df, col_names) - - return(list(metadata = metadata_df, feature_table = feature_table)) -} - tranpose_data <- function(data, column_names) { t_data <- data[-1] t_data <- t(t_data) tranposed_data <- data.frame(rownames(t_data), t_data) colnames(tranposed_data) <- column_names - return(tranposed_data) + tranposed_data } + + +final_data_processing <- function( + data, non_feature_columns, + transpose, original_first_colname) { + # Remove all columns that are in non_ + # feature_columns, except the first column + cols_to_keep <- !(colnames(data) %in% non_feature_columns) + cols_to_keep[1] <- TRUE # Always keep the first column + data <- data[, cols_to_keep, drop = FALSE] + + + if (transpose) { + # The first column becomes the new column names + new_colnames <- as.character(data[[1]]) + # Remove the first column + data <- data[, -1, drop = FALSE] + # Transpose the rest + data <- t(data) + # Convert to data frame + 
data <- as.data.frame(data, stringsAsFactors = FALSE) + # The first row becomes the first column + first_col <- rownames(data) + data <- cbind(first_col, data) + # Set column names + colnames(data) <- c(colnames(data)[1], new_colnames) + rownames(data) <- NULL + } + colnames(data)[1] <- original_first_colname + data +}