Mercurial > repos > recetox > recetox_aplcms_align_features

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/help.xml	Mon Feb 13 10:26:59 2023 +0000
@@ -0,0 +1,255 @@
+<macros>
+
+<token name="@GENERAL_HELP@">
+General Information
+===================
+
+Overview
+--------
+
+recetox-aplcms is a software package for peak detection in high resolution mass spectrometry (HRMS) data.
+It supports reading .mzml files in raw profile mode and uses a bi-Gaussian chromatographic peak shape for feature detection and quantification.
+
+recetox-aplcms is based on the apLCMS package developed by Tianwei Yu at Emory University - see the citations and the apLCMS section beneath.
+This version includes various software updates and is actively developed and maintained on `GitHub`_.
+Please submit eventual bug reports as `issues`_ on the repository.
+
+.. _GitHub: https://github.com/RECETOX/recetox-aplcms
+.. _issues: https://github.com/RECETOX/recetox-aplcms/issues/new
+
+
+Workflow
+--------
+
+.. image:: https://raw.githubusercontent.com/RECETOX/galaxytools/aee0dd6cf6c05936269efe4337c50e27cc68e86b/tools/recetox_aplcms/images/scheme.png
+   :width: 2560
+   :height: 788
+   :scale: 40
+   :alt: A picture of a workflow diagram.
+
+The individual steps of the recetox-aplcms package can be combined in 2 separate workflows processing HRMS data in an unsupervised manner or by including a-priori knowledge.
+The workflows consist of the following building blocks:
+
+(1) remove noise - denoise the raw data and extract the EIC
+(2) generate feature table - group features in EIC into peaks using peak-shape model
+(3) compute clusters - compute mz and rt clusters across samples
+(4) compute template - find the template for rt correction
+(5) correct time - correct the rt across samples using splines
+(6) align features - align identical features across samples
+(7) recover weaker signals - recover missed features in samples based on the aligned features
+(8) merge known table - add known features to detected features table and vice versa
+
+For detailed documentation on the individual steps please see the individual tool wrappers.
+
+
+apLCMS (Original Reference)
+---------------------------
+
+apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time
+tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise.
+Non-parametric statistical methods are used to find-tune peak selection and grouping. After retention time
+correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS
+please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/.
+</token>
+
+<token name="@REMOVE_NOISE_HELP@">
+recetox-aplcms - remove noise
+=============================
+
+This tool is the first step of recetox-aplcms.
+It removes noise from the raw data and performs a first clustering step of points with close m/z values into the extracted ion chromatograms (EICs).
+Only peaks with a minimum elution length of `min_run` seconds are kept.
+
+Example Output
+--------------
+The raw data points contained in the scans of the `mzml` file are filtered for noise and grouped into clusters based on m/z values.
+See an example output in the table below. The `group_number` column indicates the cluster index.
+
++----------------------+-------------------+-----------------------+--------------------+
+| mz                   |    rt             |    intensity          |    group_number    |
++======================+===================+=======================+====================+
+| 70.01060119055192    |    350.58654      |    21178.330810546875 |    5               |
++----------------------+-------------------+-----------------------+--------------------+
+| 70.02334120404554    |    130.175262     |    287869.5478515625  |    10              |
++----------------------+-------------------+-----------------------+--------------------+
+| 70.0287408273165     |    134.801352     |    60883.15185546875  |    11              |
++----------------------+-------------------+-----------------------+--------------------+
+| 70.02872416715464    |    183.991896     |    9201.574584960938  |    11              |
++----------------------+-------------------+-----------------------+--------------------+
+| ...                  |    ...            |    ...                |    ...             |
++----------------------+-------------------+-----------------------+--------------------+
+</token>
+
+<token name="@GENERATE_FEATURE_TABLE_HELP@">
+recetox-aplcms - generate feature table
+=======================================
+The second step in the recetox-aplcms workflow performing peak shape parameter estimation.
+
+This tool takes the grouped features created with `recetox-aplcms-remove-noise` and computes the peak shape in `rt` domain and integrates the peak area.
+
+
+Example Output
+--------------
+The output contains the `mz` and `rt` of the peaks as well as the standard deviation in both direction of the peak for the bi-gaussian peak shape.
+
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| mz                   |   rt              |    sd1          |    sd2            |   area               |
++======================+===================+=================+===================+======================+
+| 70.02317542938793    |   142.36033       |   11.436659559  |    14.592754933   |   4159269.24595184   |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| 70.02869594233522    |   205.48765       |   0.263230763   |    0.285101428707 |   8849767.11861127   |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| 78.04643252598305    |   294.01713       |   0.51677558617 |    1.317028944141 |   1333044.50659719   |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+| ...                  |    ...            |    ...          |    ...            |   ...                |
++----------------------+-------------------+-----------------+-------------------+----------------------+
+</token>
+
+<token name="@COMPUTE_CLUSTERS_HELP@">
+recetox-aplcms - compute clusters
+=================================
+
+Group features with `mz` and `rt` using tolerances within the tolerance into clusters, creating larger features from raw data points.
+Custom tolerances for `mz` and `rt` are computed based on the given parameters.
+The tool takes a collection of all detected features and computes the clusters over a global feature table, adding the `sample_id` and `cluster` columns to the table.
+
+Example Output
+--------------
+
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| mz                   |   rt              |    sd1          |    sd2            |   area               |  sample_id          |  cluster      |
++======================+===================+=================+===================+======================+=====================+===============+
+| 70.02317542938793    |   142.36033       |   11.436659559  |    14.592754933   |   4159269.245951841  | 21_qc_no_dil_milliq |  7            |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 70.02869594233522    |   205.48765       |   0.263230763   |    0.285101428707 |   8849767.11861127   | 21_qc_no_dil_milliq |  9            |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 78.04643252598305    |   294.01713       |   0.51677558617 |    1.317028944141 |   1333044.506597194  | 21_qc_no_dil_milliq |  13           |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| ...                  |    ...            |    ...          |    ...            |   ...                | ...                 |  ...          |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+</token>
+
+<token name="@CORRECT_TIME_HELP@">
+recetox-aplcms - correct time
+=============================
+
+Apply spline-based retention time correction to a feature table given the template table and the computed `mz` and `rt` tolerances.
+
+Example Output
+--------------
+The output has the same format as `compute clusters` but the retention time values are corrected based on the template table.
+
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| mz                   |   rt              |    sd1          |    sd2            |   area               |  sample_id          |  cluster      |
++======================+===================+=================+===================+======================+=====================+===============+
+| 70.02317542938793    |   142.36033       |   11.436659559  |    14.592754933   |   4159269.245951841  | 21_qc_no_dil_milliq |  7            |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 70.02869594233522    |   205.48765       |   0.263230763   |    0.285101428707 |   8849767.11861127   | 21_qc_no_dil_milliq |  9            |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 78.04643252598305    |   294.01713       |   0.51677558617 |    1.317028944141 |   1333044.506597194  | 21_qc_no_dil_milliq |  13           |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| ...                  |    ...            |    ...          |    ...            |   ...                | ...                 |  ...          |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+</token>
+<token name="@COMPUTE_TEMPLATE_HELP@">
+recetox-aplcms - compute template
+=================================
+Compute the template from a set of feature tables, choosing the one with the most features as the template.
+</token>
+
+<token name="@RECOVER_WEAKER_SIGNALS_HELP@">
+recetox-aplcms - recover weaker signals
+=======================================
+Second stage peak detection based on the aligned feature table from the `feature alignment` step.
+If a feature is contained in the aligned feature table, this step revisits the raw data and searches
+for this feature at the retention time obtained by mapping the corrected retention time back to the original sample.
+
+This recovers features which are present in a sample but might have been filtered out initially as noise due to low signal intensity.
+
+Example Output
+--------------
+The table has the same format as the `compute clusters` output but might contain additional features which have been extracted based
+on their presence in the aligned feature table.
+
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| mz                   |   rt              |    sd1          |    sd2            |   area               |  sample_id          |  cluster      |
++======================+===================+=================+===================+======================+=====================+===============+
+| 70.02317542938793    |   142.36033       |   11.436659559  |    14.592754933   |   4159269.245951841  | 21_qc_no_dil_milliq |  7            |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 70.02869594233522    |   205.48765       |   0.263230763   |    0.285101428707 |   8849767.11861127   | 21_qc_no_dil_milliq |  9            |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| 78.04643252598305    |   294.01713       |   0.51677558617 |    1.317028944141 |   1333044.506597194  | 21_qc_no_dil_milliq |  13           |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+| ...                  |    ...            |    ...          |    ...            |   ...                | ...                 |  ...          |
++----------------------+-------------------+-----------------+-------------------+----------------------+---------------------+---------------+
+</token>
+
+<token name="@ALIGN_FEATURES_HELP@">
+recetox-aplcms - align features
+===============================
+This step performs feature alignment after clustering and retention time correction.
+The peaks clustered across samples are grouped based on the given tolerances to create an aligned feature table, connecting identical features across samples.
+The parameter controls in how many samples a feature has to be detected at least in order to be included in the aligned feature table.
+
+Example Output
+--------------
+The tool outputs 3 tables: the peak related `metadata`, the `retention times` and the `intensities` for all features across all samples.
+
+Metadata Table
+~~~~~~~~~~~~~~
+The `npeaks` column denotes the number of peaks which have been grouped into this feature. The columns with the sample names indicate whether this feature is present in the sample.
+
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  id   | mz           |  mzmin       |  mzmax        |  rt            |  rtmin        |  rtmax        |   npeaks  |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |
++=======+==============+==============+===============+================+===============+===============+===========+========================+========================+========================+
+|  1    | 70.03707021  |  70.037066   |  70.0370750   |  294.1038014   |  294.0634942  |  294.149985   |   3       |  1                     |  1                     |  1                     |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  2    | 70.06505677  |  70.065045   |  70.0650676   |  141.9560055   |  140.5762528  |  143.335758   |   2       |  1                     |  0                     |  1                     |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  57   | 78.04643252  |  78.046429   |  78.0464325   |  294.0063397   |  293.9406777  |  294.072001   |   2       |  1                     |  1                     |  0                     |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  ...  | ...          |   ...        |  ...          |  ...           |  ...          |  ...          |   ...     |  ...                   |  ...                   |  ...                   |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+
+Intensity Table
+~~~~~~~~~~~~~~~
+This table contains the peak area for aligned features in all samples.
+
++-------+------------------------+------------------------+------------------------+
+|  id   |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |
++=======+========================+========================+========================+
+|  1    |  13187487.20482895     |  7957395.699119729     |  11700594.397257797    |
++-------+------------------------+------------------------+------------------------+
+|  2    |  2075168.6398983458    |  0                     |  2574362.159289044     |
++-------+------------------------+------------------------+------------------------+
+|  57   |  2934524.4406785755    |  1333044.5065971944    |  0                     |
++-------+------------------------+------------------------+------------------------+
+|  ...  |  ...                   |  ...                   |  ...                   |
++-------+------------------------+------------------------+------------------------+
+
+Retention Time Table
+~~~~~~~~~~~~~~~~~~~~
+This table contains the retention times for all aligned features in all samples.
+
++-------+------------------------+------------------------+------------------------+
+|  id   |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |
++=======+========================+========================+========================+
+|  1    |  294.09792478513236    |  294.1499853056912     |  294.0634942428341     |
++-------+------------------------+------------------------+------------------------+
+|  2    |  140.57625284242982    |  0                     |  143.33575827589172    |
++-------+------------------------+------------------------+------------------------+
+|  57   |  294.07200187644435    |  293.9406777222317     |  0                     |
++-------+------------------------+------------------------+------------------------+
+|  ...  |  ...                   |  ...                   |  ...                   |
++-------+------------------------+------------------------+------------------------+
+</token>
+
+<token name="@MERGE_KNOWN_TABLES_HELP@">
+recetox-aplcms - merge known table
+==================================
+
+This tool allows merging the detected features back into the table of known features and vice versa.
+It is used in the hybrid version of recetox-aplcms to augment the aligned feature table with the suspect peaks
+and to augment this table with successfully detected features.
+</token>
+</macros>
Binary file images/scheme.png has changed
--- a/macros.xml	Thu Jun 16 10:26:58 2022 +0000
+++ b/macros.xml	Mon Feb 13 10:26:59 2023 +0000
@@ -1,11 +1,9 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.9.4</token>
+    <token name="@TOOL_VERSION@">0.10.1</token>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="4.1.0">r-base</requirement>
-            <requirement type="package" version="4.0.1">r-arrow</requirement>
             <requirement type="package" version="@TOOL_VERSION@">r-recetox-aplcms</requirement>
-            <requirement type="package" version="1.0.7">r-dplyr</requirement>
+            <requirement type="package" version="2.5.2">pymzml</requirement>
         </requirements>
     </xml>

@@ -31,6 +29,11 @@
                 familyName="Novotný"
                 url="https://github.com/xtracko"
                 identifier="0000-0001-5449-3523" />
+            <person
+                givenName="Helge"
+                familyName="Hecht"
+                url="https://github.com/hechth"
+                identifier="0000-0001-6744-996X" />
             <organization
                 url="https://www.recetox.muni.cz/"
                 email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
@@ -38,157 +41,104 @@
         </creator>
     </xml>

-    <xml name="inputs">
-        <inputs>
-            <param name="files" type="data" format="mzdata,mzml,mzxml,netcdf" multiple="true" min="3" label="data"
-                   help="Mass spectrometry files for peak extraction." />
-            <yield />
-       </inputs>
-    </xml>
-
-    <xml name="history_db">
-        <param name="known_table" type="data" format="parquet" label="known_table"
-               help="A data table containing the known metabolite ions and previously found features. The table must contain these 18 columns: chemical_formula (optional), HMDB_ID (optional), KEGG_compound_ID (optional), neutral.mass (optional), ion.type (the ion form - optional), m.z (either theoretical or mean observed m/z value of previously found features), Number_profiles_processed (the total number of processed samples to build this database), Percent_found (the percentage of historically processed samples in which the feature appeared), mz_min (minimum  observed m/z value), mz_max (maximum observed m/z value), RT_mean (mean observed retention time), RT_sd (standard deviation of observed retention time), RT_min (minimum observed retention time), RT_max (maximum observed retention time), int_mean.log. (mean observed log intensity), int_sd.log. (standard deviation of observed log intensity), int_min.log. (minimum observed log intensity), int_max.log. (maximum observed log intensity)." />
-        <section name="history_db" title="Known-Table settings">
-            <param name="match_tol_ppm" type="integer" optional="true" min="0" label="match_tol_ppm (optional)"
-                   help="The ppm tolerance to match identified features to known metabolites/features." />
-            <param name="new_feature_min_count" type="integer" value="2" min="1" label="new_feature_min_count"
-                   help="The minimum number of occurrences of a historically unseen (unknown) feature to add this feature into the database of known features." />
-        </section>
+    <xml name="remove_noise_params">
+        <param name="min_pres" type="float" value="0.5" label="min_pres"
+               help="The minimum proportion of presence in the time period for a series of signals grouped by m/z to be considered a peak." />
+        <param name="min_run" type="float" value="12" label="min_run"
+               help="The minimum length of elution time for a series of signals grouped by m/z to be considered a peak." />
+        <param name="mz_tol" type="float" value="1e-05" label="mz_tol"
+               help="The m/z tolerance level for the grouping of data points. This value is expressed as the fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended." />
+        <param name="baseline_correct" type="float" value="0" label="baseline_correct"
+               help="After grouping the observations, the highest intensity in each group is found. If the highest is lower than this value, the entire group will be deleted. The default value is NA, in which case the program uses a percentile of the height of the noise groups. If given a value, the value will be used as the threshold, and baseline.correct.noise.percentile will be ignored." />
+        <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="intensity_weighted"
+               help="Whether to weight the local density by signal intensities in initial peak detection." />
     </xml>

-    <xml name="noise_filtering">
-        <section name="noise_filtering" title="Noise filtering and peak detection">
-            <yield />
-            <param name="min_pres" type="float" value="0.5"
-                   label="min_pres"
-                   help="The minimum proportion of presence in the time period for a series of signals grouped by m/z to be considered a peak." />
-            <param name="min_run" type="float" value="12"
-                   label="min_run"
-                   help="The minimum length of elution time for a series of signals grouped by m/z to be considered a peak." />
-            <param name="mz_tol" type="float" value="1e-05"
-                   label="mz_tol"
-                   help="The m/z tolerance level for the grouping of data points. This value is expressed as the fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended." />
-            <param name="baseline_correct" type="float" value="0" label="baseline_correct"
-                   help="After grouping the observations, the highest intensity in each group is found. If the highest is lower than this value, the entire group will be deleted. The default value is NA, in which case the program uses a percentile of the height of the noise groups. If given a value, the value will be used as the threshold, and baseline.correct.noise.percentile will be ignored." />
-            <param name="baseline_correct_noise_percentile" type="float" value="0.05"
-                   label="baseline_correct_noise_percentile"
-                   help="The percentile of signal strength of those EIC that don't pass the run filter, to be used as the baseline threshold of signal strength." />
-            <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE"
-                   label="intensity_weighted"
-                   help="Whether to weight the local density by signal intensities in initial peak detection." />
-        </section>
-    </xml>
-
-    <xml name="feature_detection">
-        <section name="feature_detection" title="Feature detection">
-            <param name="shape_model" type="select" display="radio"
-                   label="shape_model"
+    <xml name="generate_feature_table_params">
+        <param name="sd_cut_min" type="float" value="0.01" label="sd_cut_min"
+               help="The minimum standard deviation of a feature to be not eliminated." />
+        <param name="sd_cut_max" type="float" value="500" label="sd_cut_max"
+               help="The maximum standard deviation of a feature to be not eliminated." />
+        <conditional name="shape">
+            <param name="shape_model" type="select" display="radio" label="shape_model"
                    help="The mathematical model for the shape of a peak. There are two choices - bi-Gaussian and Gaussian. When the peaks are asymmetric, the bi-Gaussian is better.">
                 <option value="Gaussian">Gaussian</option>
                 <option value="bi-Gaussian" selected="true">bi-Gaussian</option>
             </param>
-            <param name="BIC_factor" type="float" value="2.0"
-                   label="BIC_factor"
-                   help="The factor that is multiplied on the number of parameters to modify the BIC criterion. If larger than 1, models with more peaks are penalized more." />
-            <param name="peak_estim_method" type="select" display="radio"
-                   label="peak_estim_method"
-                   help="The estimation method for the bi-Gaussian peak model. Two possible values: moment and EM.">
-                <option value="moment" selected="true">Moment</option>
-                <option value="EM">EM</option>
-            </param>
-            <param name="min_bandwidth" type="float" optional="true"
-                   label="min_bandwidth (optional)"
-                   help="The minimum bandwidth to use in the kernel smoother." />
-            <param name="max_bandwidth" type="float" optional="true"
-                   label="max_bandwidth (optional)"
-                   help="The maximum bandwidth to use in the kernel smoother." />
-            <param name="sd_cut_min" type="float" value="0.01"
-                   label="sd_cut_min"
-                   help="The minimum standard deviation of a feature to be not eliminated." />
-            <param name="sd_cut_max" type="float" value="500"
-                   label="sd_cut_max"
-                   help="The maximum standard deviation of a feature to be not eliminated." />
-            <param name="sigma_ratio_lim_min" type="float" value="0.01"
-                   label="sigma_ratio_lim_min"
-                   help="The lower limit of the believed ratio range between the left-standard deviation and the right-standard deviation of the bi-Gaussian function used to fit the data." />
-            <param name="sigma_ratio_lim_max" type="float" value="100"
-                   label="sigma_ratio_lim_max"
-                   help="The upper limit of the believed ratio range between the left-standard deviation and the right-standard deviation of the bi-Gaussian function used to fit the data." />
-            <param name="component_eliminate" type="float" value="0.01"
-                   label="component_eliminate"
-                   help="In fitting mixture of bi-Gaussian (or Gaussian) model of an EIC, when a component accounts for a proportion of intensities less than this value, the component will be ignored." />
-            <param name="moment_power" type="float" value="1"
-                   label="moment_power"
-                   help="The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC." />
-        </section>
+            <when value="bi-Gaussian">
+                <param name="sigma_ratio_lim_min" type="float" value="0.01" label="sigma_ratio_lim_min"
+                       help="The lower limit of the believed ratio range between the left-standard deviation and the right-standard deviation of the bi-Gaussian function used to fit the data." />
+                <param name="sigma_ratio_lim_max" type="float" value="100" label="sigma_ratio_lim_max"
+                       help="The upper limit of the believed ratio range between the left-standard deviation and the right-standard deviation of the bi-Gaussian function used to fit the data." />
+            </when>
+        </conditional>
+        <param name="peak_estim_method" type="select" display="radio" label="peak_estim_method"
+               help="The estimation method for the bi-Gaussian peak model. Two possible values: moment and EM.">
+            <option value="moment" selected="true">Moment</option>
+            <option value="EM">EM</option>
+        </param>
+        <param name="moment_power" type="float" value="1" label="moment_power"
+               help="The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC." />
+        <param name="component_eliminate" type="float" value="0.01" label="component_eliminate"
+               help="In fitting mixture of bi-Gaussian (or Gaussian) model of an EIC, when a component accounts for a proportion of intensities less than this value, the component will be ignored." />
+        <param name="BIC_factor" type="float" value="2.0" label="BIC_factor"
+               help="A factor influencing Bayesian information criterion (BIC) in estimation of RT peak shape. If the value is larger than 1, models with more peaks are penalized more." />
     </xml>

-    <xml name="peak_alignment">
-        <section name="peak_alignment" title="Peak Alignment">
-            <param name="align_chr_tol" type="float" optional="true"
-                   label="align_chr_tol (optional)"
-                   help="The retention time tolerance level for peak alignment. The default is NA, which allows the program to search for the tolerance level based on the data." />
-            <param name="align_mz_tol" type="float" optional="true"
-                   label="align_mz_tol (optional)"
-                   help="The m/z tolerance level for peak alignment. The default is NA, which allows the program to search for the tolerance level based on the data. The tolerance is given in absolute numbers, not scaled, i.e. for 10ppm tolerance enter '1e-05'. This value, multiplied by the m/z value, becomes the cutoff level." />
-            <param name="max_align_mz_diff" type="float" value="0.01"
-                   label="max_align_mz_diff"
-                   help="As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly influences feature matching in higher m/z range." />
-        </section>
+    <xml name="compute_clusters_params">
+        <conditional name="tolerances_input_method">
+            <param name="input_method" type="select" display="radio" label="Tolerances input method"
+                   help="Tolerances can be entered directly or loaded from a file.">
+                <option value="direct" selected="true">direct</option>
+                <option value="file">file</option>
+            </param>
+            <when value="direct">
+                <param name="mz_tol_relative" type="float" optional="true" label="mz_tol_relative"
+                       help="Relative m/z tolerance to use for grouping features." />
+                <param name="rt_tol_relative" type="float" optional="true" label="rt_tol_relative"
+                       help="Relative retention time tolerance to use for grouping features." />
+            </when>
+            <when value="file">
+                <param label="Input tolerances values" name="input_tolerances" type="data" format="parquet"
+                       help="Table containing tolerance values." />
+            </when>
+        </conditional>
+        <param name="mz_tol_absolute" type="float" label="mz_tol_absolute" value="1e-05"
+               help="Absolute m/z tolerance to use for grouping features." />
+        <param name="mz_max_diff" type="float" label="mz_max_diff" value="0.01"
+               help="Maximum difference between feature m/z values to belong to the same cluster." />
     </xml>

-    <xml name="weak_signal_recovery">
-        <section name="weak_signal_recovery" title="Weak Signal Recovery">
-            <param name="recover_mz_range" type="float" optional="true"
-                   label="recover_mz_range (optional)"
-                   help="The m/z around the feature m/z to search for observations. The default value is NA, in which case 1.5 times the m/z tolerance in the aligned object will be used." />
-            <param name="recover_chr_range" type="float" optional="true"
-                   label="recover_chr_range (optional)"
-                   help="The retention time around the feature retention time to search for observations. The default value is NA, in which case 0.5 times the retention time tolerance in the aligned object will be used." />
-            <param name="use_observed_range" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"
-                   label="use_observed_range"
-                   help="If the value is true, the actual range of the observed locations of the feature in all the spectra will be used." />
-            <param name="recover_min_count" type="integer" value="3"
-                   label="recover_min_count"
-                   help="The minimum number of raw data points to be considered as a true feature." />
-        </section>
-    </xml>
-    <xml name="multibatch_processing">
-        <section name="multibatch_processing" title="Multibatch processing">
-            <param name="min_within_batch_prop_detect" type="float" min="0" max="1" value="0.1"
-                   label="minimum_batchwise_prop_detect"
-                   help="The minimum detection frequency (relative) of a feature for it to be included in the final feature table." />
-            <param name="min_batch_prop" type="float" min="0" max="1" value="0.5"
-                   label="minimum_batch_prop"
-                   help="The minimum proportion of batches that must have a given feature. The features that are less abundant than the value won't be reported." />
-            <param name="batch_align_mz_tol" type="float" min="0" value="0.00001"
-                   label="batch_align_mz_tol"
-                   help="The m/z tolerance level for peak alignment within batch." />
-            <param name="batch_align_chr_tol" type="float" min="0" value="50.0"
-                   label="batch_align_chr_tol"
-                   help="The retention time tolerance level for peak alignment within batch." />
-        </section>
-    </xml>
-
-    <xml name="mz_tol_macro">
+    <xml name="recover_weaker_params">
         <param name="mz_tol" type="float" value="1e-05" label="mz_tol"
                help="The m/z tolerance level for the grouping of data points. This value is expressed as the
                fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level.
                The recommended value is the machine's nominal accuracy level. Divide the ppm value by 1e6.
                For FTMS, 1e-5 is recommended." />
+        <param name="recover_mz_range" type="float" optional="true" label="recover_mz_range"
+               help="The m/z around the feature m/z to search for observations. The default value is NA, in which
+               case 1.5 times the m/z tolerance in the aligned object will be used." />
+        <param name="recover_rt_range" type="float" optional="true" label="recover_rt_range"
+               help="The retention time around the feature retention time to search for observations.
+               The default value is NA, in which case 0.5 times the retention time tolerance in the aligned
+                object will be used." />
+        <param name="use_observed_range" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"
+               label="use_observed_range" help="If the value is true, the actual range of the observed locations of
+               the feature in all the spectra will be used." />
+        <param name="recover_min_count" type="integer" value="3" label="recover_min_count"
+               help="The minimum number of raw data points to be considered as a true feature." />
+        <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE"
+               label="intensity_weighted" help="Whether to weight the local density by signal intensities in initial peak detection." />
     </xml>

-    <xml name="output_format">
-       <section name="output_format" title="Output Format">
-              <param name="out_format" type="boolean" checked="false" truevalue="recetox" falsevalue="original" label="Use custom RECETOX output format?" />
-       </section>
-    </xml>
-
-    <xml name="unsupervised_outputs">
-        <data name="recovered_feature_sample_table" format="parquet" label="${tool.name} recovered_feature_sample_table on ${on_string}" />
-        <data name="aligned_feature_sample_table" format="parquet" label="${tool.name} aligned_feature_sample_table on ${on_string}" hidden="true" />
-        <yield />
+    <xml name="bandwidth_params">
+        <param name="bandwidth" type="float" value="0.5" label="bandwidth"
+               help="A value between zero and one. Multiplying this value to the length of the signal along
+               the time axis helps determine the bandwidth in the kernel smoother used for peak identification." />
+        <param name="min_bandwidth" type="float" optional="true" label="min_bandwidth"
+               help="The minimum bandwidth to use in the kernel smoother." />
+        <param name="max_bandwidth" type="float" optional="true" label="max_bandwidth"
+               help="The maximum bandwidth to use in the kernel smoother." />
     </xml>

     <xml name="citations">
@@ -197,51 +147,8 @@
             <citation type="doi">10.1186/1471-2105-11-559</citation>
             <citation type="doi">10.1021/pr301053d</citation>
             <citation type="doi">10.1093/bioinformatics/btu430</citation>
+            <citation type="doi">10.1038/s41598-020-70850-0</citation>
             <yield />
         </citations>
     </xml>
-
-    <token name="@HELP_hybrid@">
-        <![CDATA[
-            This is the Hybrid version of apLCMS which is incorporating the knowledge of known metabolites and historically
-            detected features on the same machinery to help detect and quantify lower-intensity peaks.
-
-            CAUTION: To use such knowledge, especially historical data, you must keep using (1) the same chromatography
-            system (otherwise the retention time will not match), and (2) the same type of samples with similar extraction
-            technique, such as human serum.
-
-            @GENERAL_HELP@
-        ]]>
-    </token>
-
-    <token name="@HELP_unsupervised@">
-        <![CDATA[
-            This is the Unsupervised version of apLCMS which is not relying on any existing knowledge about metabolites or
-            any historically detected features. For such functionality please use the Hybrid version of apLCMS.
-
-            @GENERAL_HELP@
-        ]]>
-    </token>
-
-    <token name="@HELP_two-step-hybrid@">
-        <![CDATA[
-              This is the **Two-Step Hybrid** version of **apLCMS**. This tool is improved upon the Hybrid version by accounting for the batch
-              effects in multi-batch experiments. As in the Hybrid version, this tool incorporates the knowledge of known metabolites and
-              historically detected features on the same machinery to help detect and quantify lower-intensity peaks.
-
-              **CAUTION**: To use such knowledge, especially historical data, you must keep using (1) the same chromatography
-              system (otherwise the retention time will not match), and (2) the same type of samples with similar extraction
-              technique, such as human serum.
-
-            @GENERAL_HELP@
-        ]]>
-    </token>
-
-    <token name="@GENERAL_HELP@">
-        apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time
-        tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise.
-        Non-parametric statistical methods are used to find-tune peak selection and grouping. After retention time
-        correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS
-        please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/.
-    </token>
 </macros>
--- a/macros_split.xml	Thu Jun 16 10:26:58 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-<macros>
-    <xml name="noise_filtering_split">
-        <section name="noise_filtering" title="Noise filtering and peak detection">
-            <param name="min_pres" type="float" value="0.5"
-                   label="min_pres"
-                   help="The minimum proportion of presence in the time period for a series of signals grouped by m/z to be considered a peak." />
-            <param name="min_run" type="float" value="12"
-                   label="min_run"
-                   help="The minimum length of elution time for a series of signals grouped by m/z to be considered a peak." />
-            <param name="mz_tol" type="float" value="1e-05"
-                   label="mz_tol"
-                   help="The m/z tolerance level for the grouping of data points. This value is expressed as the fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended." />
-            <param name="baseline_correct" type="float" value="0" label="baseline_correct"
-                   help="After grouping the observations, the highest intensity in each group is found. If the highest is lower than this value, the entire group will be deleted. The default value is NA, in which case the program uses a percentile of the height of the noise groups. If given a value, the value will be used as the threshold, and baseline.correct.noise.percentile will be ignored." />
-            <param name="baseline_correct_noise_percentile" type="float" value="0.05"
-                   label="baseline_correct_noise_percentile"
-                   help="The percentile of signal strength of those EIC that don't pass the run filter, to be used as the baseline threshold of signal strength." />
-            <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE"
-                   label="intensity_weighted"
-                   help="Whether to weight the local density by signal intensities in initial peak detection." />
-        </section>
-    </xml>
-</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mzml_id_getter.py	Mon Feb 13 10:26:59 2023 +0000
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+
+from pymzml.run import Reader
+
+
+def main(argv):
+    parser = argparse.ArgumentParser(description='Get run ID from an mzML file.')
+    parser.add_argument('mzml_file', help='Path to an mzML file to get run ID from.')
+    args = parser.parse_args()
+
+    mzml = Reader(args.mzml_file)
+    id = mzml.info['run_id']
+
+    if id is not None:
+        with open("sample_name.txt", mode='x') as f:
+            f.write(id)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
--- a/recetox_aplcms_align_features.xml	Thu Jun 16 10:26:58 2022 +0000
+++ b/recetox_aplcms_align_features.xml	Mon Feb 13 10:26:59 2023 +0000
@@ -1,91 +1,63 @@
-<tool id="recetox_aplcms_align_features" name="RECETOX apLCMS - align features" version="@TOOL_VERSION@+galaxy1">
-    <description>align features from LC/MS spectra across samples</description>
+<tool id="recetox_aplcms_align_features" name="recetox-aplcms - align features" version="@TOOL_VERSION@+galaxy0">
+    <description>align peaks across samples</description>
     <macros>
         <import>macros.xml</import>
-        <import>macros_split.xml</import>
+        <import>help.xml</import>
     </macros>
     <expand macro="creator"/>
+    <expand macro="requirements"/>

-    <expand macro="requirements"/>
     <command detect_errors="aggressive"><![CDATA[
-        sh ${symlink_inputs} &&
         Rscript -e 'source("${__tool_directory__}/utils.R")' -e 'source("${run_script}")'
     ]]></command>
     <configfiles>
-        <configfile name="symlink_inputs">
-            #for $infile in $ms_files
-                ln -s '${infile}' '${infile.element_identifier}'
-            #end for
-             #for $infile in $corrected_files
-                ln -s '${infile}' '${infile.element_identifier}'
-            #end for
-        </configfile>
         <configfile name="run_script"><![CDATA[
-            #set filenames_str = str("', '").join([str($f.element_identifier) for $f in $ms_files])
-            files_list <- sort_samples_by_acquisition_number(c('$filenames_str'))
-            sample_names <- get_sample_name(files_list)
+             #set filenames = str("', '").join([str($f) for $f in $files])
+             feature_tables <- load_parquet_collection(c('$filenames'))
+             sample_names <- unlist(lapply(feature_tables, load_sample_name))
+
+             validate_sample_names(sample_names)
+
+             ordering <- order(sample_names)
+             feature_tables <- feature_tables[ordering]
+             sample_names <- sample_names[ordering]

-            #set corrected_files = str("', '").join([str($f.element_identifier) for $f in $corrected_files])
-            corrected_features <- load_features(c('$corrected_files'))
+             tolerances <- load_data_from_parquet_file('$input_tolerances')

-            aligned <- align_features(
-                sample_names = sample_names,
-                features = corrected_features,
-                min.exp = $min_exp,
-                mz.tol = $peak_alignment.align_mz_tol,
-                chr.tol = $peak_alignment.align_chr_tol,
-                find.tol.max.d = 10 * $mz_tol,
-                max.align.mz.diff = $peak_alignment.max_align_mz_diff,
-                do.plot = FALSE
-            )
+             aligned_features <- create_aligned_feature_table(
+                  features_table = dplyr::bind_rows(feature_tables),
+                  min_occurrence = $min_occurrence,
+                  sample_names = sample_names,
+                  mz_tol_relative = get_mz_tol(tolerances),
+                  rt_tol_relative = get_rt_tol(tolerances)
+             )

-            save_aligned_features(aligned, "$rt_cross_table", "$int_cross_table", "$tolerances")
+             save_aligned_features(aligned_features, '$metadata_file', '$rt_file', '$intensity_file')
         ]]></configfile>
     </configfiles>

     <inputs>
-        <param name="ms_files" type="data_collection" collection_type="list" format="mzdata,mzml,mzxml,netcdf"
-               label="Input data collection" help="Mass spectrometry file for peak extraction." />
-        <param name="corrected_files" type="data_collection" collection_type="list" format="parquet"
-               label="Input corrected feature samples collection"
-               help="Mass spectrometry files containing corrected feature samples." />
-        <expand macro="mz_tol_macro"/>
-        <param name="min_exp" type="integer" min="1" value="2" label="min_exp"
-               help="If a feature is to be included in the final feature table, it must be present in at least this number of spectra." />
-        <expand macro="peak_alignment"/>
+        <param name="files" type="data_collection" collection_type="list" format="parquet"
+               label="Clustered features" help="List of tables containing clustered features." />
+        <param label="Input tolerances values" name="input_tolerances" type="data" format="parquet"
+               help="Table containing tolerance values." />
+        <param name="min_occurrence" type="integer" min="2" value="2" label="min_occurrence"
+               help="A feature has to show up in at least this number of profiles to be included in the final result." />
     </inputs>

     <outputs>
-        <data name="tolerances" format="parquet" label="${tool.name} on ${on_string} (tolerances)" />
-        <data name="rt_cross_table" format="parquet" label="${tool.name} on ${on_string} (rt cross table)" />
-        <data name="int_cross_table" format="parquet" label="${tool.name} on ${on_string} (int cross table)" />
+        <data name="metadata_file" format="parquet" label="${tool.name} on ${on_string} (metadata table)"/>
+        <data name="rt_file" format="parquet" label="${tool.name} on ${on_string} (rt table)"/>
+        <data name="intensity_file" format="parquet" label="${tool.name} on ${on_string} (intensity table)"/>
     </outputs>

     <tests>
-        <test>
-            <param name="ms_files">
-                <collection type="list">
-                    <element name="mbr_test0.mzml" value="mbr_test0.mzml"/>
-                    <element name="mbr_test1.mzml" value="mbr_test1.mzml"/>
-                    <element name="mbr_test2.mzml" value="mbr_test2.mzml"/>
-                </collection>
-            </param>
-            <param name="corrected_files">
-                <collection type="list">
-                    <element name="corrected_features_0.parquet" value="corrected_expected/corrected_0.parquet"/>
-                    <element name="corrected_features_1.parquet" value="corrected_expected/corrected_1.parquet"/>
-                    <element name="corrected_features_2.parquet" value="corrected_expected/corrected_2.parquet"/>
-                </collection>
-            </param>
-            <output name="tolerances" file="tolerances.parquet" ftype="parquet"/>
-            <output name="rt_cross_table" file="rt_cross_table.parquet" ftype="parquet"/>
-            <output name="int_cross_table" file="int_cross_table.parquet" ftype="parquet"/>
-        </test>
+
     </tests>

     <help>
         <![CDATA[
-            This is a tool which runs apLCMS alignment of features.
+            @ALIGN_FEATURES_HELP@

             @GENERAL_HELP@
         ]]>
--- a/utils.R	Thu Jun 16 10:26:58 2022 +0000
+++ b/utils.R	Mon Feb 13 10:26:59 2023 +0000
@@ -1,149 +1,125 @@
 library(recetox.aplcms)

-align_features <- function(sample_names, ...) {
-    aligned <- feature.align(...)
-    feature_names <- seq_len(nrow(aligned$pk.times))
-
-  list(
-    mz_tolerance = as.numeric(aligned$mz.tol),
-    rt_tolerance = as.numeric(aligned$chr.tol),
-    rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times),
-    int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs)
-    )
+get_env_sample_name <- function() {
+    sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA)
+    if (nchar(sample_name) == 0) {
+        sample_name <- NA
+    }
+    if (is.na(sample_name)) {
+        message("The mzML file does not contain run ID.")
+    }
+    return(sample_name)
 }

-get_sample_name <- function(filename) {
-    tools::file_path_sans_ext(basename(filename))
-}
-
-as_feature_crosstab <- function(feature_names, sample_names, data) {
-  colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names)
-  rownames(data) <- feature_names
-  as.data.frame(data)
+save_sample_name <- function(df, sample_name) {
+    attr(df, "sample_name") <- sample_name
+    return(df)
 }

-as_feature_sample_table <- function(rt_crosstab, int_crosstab) {
-  feature_names <- rownames(rt_crosstab)
-  sample_names <- colnames(rt_crosstab)[- (1:4)]
-
-  feature_table <- data.frame(
-    feature = feature_names,
-    mz = rt_crosstab[, 1],
-    rt = rt_crosstab[, 2]
-  )
-
-  # series of conversions to produce a table type from data.frame
-  rt_crosstab <- as.table(as.matrix(rt_crosstab[, - (1:4)]))
-  int_crosstab <- as.table(as.matrix(int_crosstab[, - (1:4)]))
-
-  crosstab_axes <- list(feature = feature_names, sample = sample_names)
-  dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes
-
-  x <- as.data.frame(rt_crosstab, responseName = "sample_rt")
-  y <- as.data.frame(int_crosstab, responseName = "sample_intensity")
-
-  data <- merge(x, y, by = c("feature", "sample"))
-  data <- merge(feature_table, data, by = "feature")
-  data
+load_sample_name <- function(df) {
+    sample_name <- attr(df, "sample_name")
+    if (is.null(sample_name)) {
+        return(NA)
+    } else {
+        return(sample_name)
+    }
 }

-load_features <- function(files) {
-    files_list <- sort_samples_by_acquisition_number(files)
-    features <- lapply(files_list, arrow::read_parquet)
-    features <- lapply(features, as.matrix)
+save_data_as_parquet_file <- function(data, filename) {
+    arrow::write_parquet(data, filename)
+}
+
+load_data_from_parquet_file <- function(filename) {
+    return(arrow::read_parquet(filename))
+}
+
+load_parquet_collection <- function(files) {
+    features <- lapply(files, arrow::read_parquet)
+    features <- lapply(features, tibble::as_tibble)
     return(features)
 }

-save_data_as_parquet_files <- function(data, subdir) {
-  dir.create(subdir)
-  for (i in 0:(length(data) - 1)) {
-    filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet"))
-    arrow::write_parquet(as.data.frame(data[i + 1]), filename)
-  }
+save_parquet_collection <- function(table, sample_names, subdir) {
+    dir.create(subdir)
+    for (i in seq_len(length(table$feature_tables))) {
+      filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet"))
+      feature_table <- as.data.frame(table$feature_tables[[i]])
+      feature_table <- save_sample_name(feature_table, sample_names[i])
+      arrow::write_parquet(feature_table, filename)
+    }
+}
+
+sort_by_sample_name <- function(tables, sample_names) {
+    return(tables[order(sample_names)])
 }

-save_aligned_features <- function(aligned, rt_file, int_file, tol_file) {
-  arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file)
-  arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file)
-
-  mz_tolerance <- c(aligned$mz_tolerance)
-  rt_tolerance <- c(aligned$rt_tolerance)
-  arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
+save_tolerances <- function(table, tol_file) {
+    mz_tolerance <- c(table$mz_tol_relative)
+    rt_tolerance <- c(table$rt_tol_relative)
+    arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
 }

-load_aligned_features <- function(rt_file, int_file, tol_file) {
-  rt_cross_table <- arrow::read_parquet(rt_file)
-  int_cross_table <- arrow::read_parquet(int_file)
-  tolerances_table <- arrow::read_parquet(tol_file)
+get_mz_tol <- function(tolerances) {
+    return(tolerances$mz_tolerance)
+}

-  result <- list()
-  result$mz_tolerance <- tolerances_table$mz_tolerance
-  result$rt_tolerance <- tolerances_table$rt_tolerance
-  result$rt_crosstab <- rt_cross_table
-  result$int_crosstab <- int_cross_table
-  return(result)
+get_rt_tol <- function(tolerances) {
+    return(tolerances$rt_tolerance)
+}
+
+save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) {
+    save_data_as_parquet_file(aligned_features$metadata, metadata_file)
+    save_data_as_parquet_file(aligned_features$rt, rt_file)
+    save_data_as_parquet_file(aligned_features$intensity, intensity_file)
 }

-recover_signals <- function(cluster,
-                            filenames,
-                            extracted,
-                            corrected,
-                            aligned,
-                            mz_tol = 1e-05,
-                            mz_range = NA,
-                            rt_range = NA,
-                            use_observed_range = TRUE,
-                            min_bandwidth = NA,
-                            max_bandwidth = NA,
-                            recover_min_count = 3) {
-  if (!is(cluster, "cluster")) {
-    cluster <- parallel::makeCluster(cluster)
-    on.exit(parallel::stopCluster(cluster))
-  }
-
-  clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker"))
-  clusterEvalQ(cluster, library("splines"))
+select_table_with_sample_name <- function(tables, sample_name) {
+    sample_names <- lapply(tables, load_sample_name)
+    index <- which(sample_names == sample_name)
+    if (length(index) > 0) {
+        return(tables[[index]])
+    } else {
+        stop(sprintf("Mismatch - sample name '%s' not present in %s",
+                     sample_name, paste(sample_names, collapse = ", ")))
+    }
+}

-  recovered <- parLapply(cluster, seq_along(filenames), function(i) {
-    recover.weaker(
-      loc = i,
-      filename = filenames[[i]],
-      this.f1 = extracted[[i]],
-      this.f2 = corrected[[i]],
-      pk.times = aligned$rt_crosstab,
-      aligned.ftrs = aligned$int_crosstab,
-      orig.tol = mz_tol,
-      align.mz.tol = aligned$mz_tolerance,
-      align.chr.tol = aligned$rt_tolerance,
-      mz.range = mz_range,
-      chr.range = rt_range,
-      use.observed.range = use_observed_range,
-      bandwidth = 0.5,
-      min.bw = min_bandwidth,
-      max.bw = max_bandwidth,
-      recover.min.count = recover_min_count
-    )
-  })
+select_adjusted <- function(recovered_features) {
+    return(recovered_features$adjusted_features)
+}

-  feature_table <- aligned$rt_crosstab[, 1:4]
-  rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times))
-  int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs))
-
-  feature_names <- rownames(feature_table)
-  sample_names <- colnames(aligned$rt_crosstab[, - (1:4)])
-
-  list(
-    extracted_features = lapply(recovered, function(x) x$this.f1),
-    corrected_features = lapply(recovered, function(x) x$this.f2),
-    rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab),
-    int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab)
-  )
+known_table_columns <- function() {
+  c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
+    "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max",
+    "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)",
+    "int_min(log)", "int_max(log)")
 }

-create_feature_sample_table <- function(features) {
-  table <- as_feature_sample_table(
-      rt_crosstab = features$rt_crosstab,
-      int_crosstab = features$int_crosstab
-  )
-  return(table)
+save_known_table <- function(table, filename) {
+  columns <- known_table_columns()
+  arrow::write_parquet(table$known_table[columns], filename)
+}
+
+read_known_table <- function(filename) {
+  arrow::read_parquet(filename, col_select = known_table_columns())
+}
+
+save_pairing <- function(table, filename) {
+  df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old"))
+  arrow::write_parquet(df, filename)
 }
+
+join_tables_to_list <- function(metadata, rt_table, intensity_table) {
+  features <- new("list")
+  features$metadata <- metadata
+  features$intensity <- intensity_table
+  features$rt <- rt_table
+  return(features)
+}
+
+validate_sample_names <- function(sample_names) {
+    if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) {
+        stop(sprintf("Sample names absent or not unique - provided sample names: %s",
+                     paste(sample_names, collapse = ", ")))
+    }
+}