<!--
view immuneml_train_ml_model.xml @ 17:b63bd4447c24 draft

"planemo upload commit baf8a7b46ce256e676d2c14b18ab58ee6f5b3cca"
author immuneml
date Fri, 04 Feb 2022 14:25:21 +0000
parents cd57c1c66f8b
children 051d349fdc8c
line wrap: on
line source
-->

<tool id="immuneml_train_ml_model" name="Train machine learning models" version="@VERSION@.0">
  <!-- Short description shown next to the tool name; intentionally left empty. -->
  <description/>
    <!-- Shared definitions are pulled in from prod_macros.xml; presumably this is also
         where the @VERSION@ token used in the tool version comes from (confirm in that file). -->
    <macros>
        <import>prod_macros.xml</import>
    </macros>
    <!-- Expands the macro named "requirements" from the imported macro definitions. -->
    <expand macro="requirements"/>
  <command><![CDATA[

      ## Optional immuneML dataset: copy the contents of its result/ folder into the
      ## working directory, then flatten the repertoires/ subfolder (if there is one)
      ## so that its files sit next to the other dataset files.
      #if $iml_input
          cp -r '${iml_input.extra_files_path}'/result/* . &&
          (mv repertoires/* . &>/dev/null || :) &&
          rm -rf repertoires &&
      #end if

      ## Optional additional files: symlink every file under its dataset name.
      ## A name that already exists in the working directory is reported and skipped
      ## instead of being overwritten. All substituted values are single-quoted so that
      ## names containing spaces or shell metacharacters are passed through literally.
      #if $data_input
          #for $input in $data_input
              ([ -e ./'$input.element_identifier' ] && printf "File '%s' already exists in the input folder, skipping.\n" '$input.element_identifier' || ln -s '$input' '$input.element_identifier') &&
          #end for
      #end if

      ## Run immuneML on a copy of the YAML specification, writing into the extra-files
      ## directory of the HTML output, then move the three result files onto the
      ## corresponding Galaxy outputs.
      cp '$yaml_input' yaml_copy &&
      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyTrainMLModel &&
      mv '${html_outfile.files_path}'/index.html '${html_outfile}' &&
      mv '${html_outfile.files_path}'/exported_models/*.zip '${optimal_model}' &&
      mv '${html_outfile.files_path}'/immuneML_output.zip '$archive'
      ]]>
  </command>
  <inputs>
      <!-- Optional immuneML dataset; the command copies the result/ folder of its
           extra files into the working directory when it is supplied. -->
      <param name="iml_input"
             type="data"
             format="immuneml_receptors"
             optional="true"
             label="immuneML dataset"
             help="This field accepts an ImmuneML dataset, as created by the Create Dataset tool."/>

      <!-- Required YAML specification; the command copies it to yaml_copy and passes
           that copy to immune-ml. -->
      <param name="yaml_input"
             type="data"
             format="txt"
             multiple="false"
             label="YAML specification"/>

      <!-- Optional extra files; the command symlinks each one into the working
           directory under its dataset name. -->
      <param name="data_input"
             type="data"
             multiple="true"
             optional="true"
             label="Additional files"
             help="This field should include individual repertoire files, metadata files, receptor data and others."/>
  </inputs>
    <outputs>
        <!-- Receives the .zip that the command moves out of exported_models/. -->
        <data name="optimal_model" format="zip" label="optimal_ml_settings.zip"/>
        <!-- Receives immuneML_output.zip from the output folder. -->
        <data name="archive" format="zip" label="Archive: ML model training"/>
        <!-- Receives index.html; its extra-files directory is the immuneML output folder. -->
        <data name="html_outfile" format="html" label="Summary: ML model training"/>
    </outputs>


  <help>
      <![CDATA[

      This tool can be used to run hyperparameter optimization over several different ML settings,
      which include ML models and their parameters, encodings and preprocessing steps. Nested cross-validation is used to identify the optimal combination of ML settings.

      This is a YAML-based Galaxy tool. If you prefer a button-based interface that assumes less ML knowledge,
      see `Train immune receptor classifiers (easy interface) <root?tool_id=immuneml_train_classifiers>`_ and
      `Train immune repertoire classifiers (easy interface) <root?tool_id=novice_immuneml_interface>`_.

      For more details on how to train ML models in Galaxy, see `the documentation <https://docs.immuneml.uio.no/latest/galaxy/galaxy_train_ml_models.html>`_.

      **Tool output**

      This Galaxy tool will produce the following history elements:

      - Summary: ML model training: an HTML page that allows you to browse through all results, including prediction accuracies on
        the various data splits and report results.

      - Archive: ML model training: a .zip file containing the complete output folder as it was produced by immuneML. This folder
        contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
        Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.

      - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding, and
        optionally preprocessing steps). This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/latest/galaxy/galaxy_apply_ml_models.html>`_.

    ]]>

  </help>

</tool>