view immuneml_train_ml_model.xml @ 9:d9a2f5b7448a draft

"planemo upload commit 97c6b0b37098f81d2454174e7b8e50224270f25d-dirty"
author immuneml
date Fri, 30 Jul 2021 13:23:25 +0000
parents 45ca02982e1f
children e33b3d370124
line wrap: on
line source

<tool id="immuneml_train_ml_model" name="Train machine learning models" version="@VERSION@.0">
  <!-- Tool description (currently empty). -->
  <description></description>
    <!-- Shared macro definitions are imported from prod_macros.xml.
         NOTE(review): the "requirements" macro expanded below and the @VERSION@
         token used in the tool version are presumably defined in that file;
         confirm against prod_macros.xml. -->
    <macros>
        <import>prod_macros.xml</import>
    </macros>
    <expand macro="requirements" />
  <!-- Job command (Cheetah template rendered to a shell command line).
       Steps:
         1. If an immuneML dataset is supplied, copy the contents of its
            extra-files "result" folder into the working directory and flatten
            the optional "repertoires" sub-folder.
         2. Symlink every additional input file into the working directory
            under its element identifier, skipping names that already exist.
         3. Copy the YAML specification, run immune-ml with the
            GalaxyTrainMLModel tool, and move the generated index.html,
            exported model .zip and immuneML_output.zip onto the tool outputs.
       All Galaxy-substituted paths and identifiers are single-quoted so that
       spaces and shell metacharacters in them are not interpreted by the shell. -->
  <command><![CDATA[

      ## Stage the optional immuneML dataset into the working directory.
      #if $iml_input
          cp -r '${iml_input.extra_files_path}'/result/* . &&
          ## Best effort: the repertoires folder may be absent or empty.
          (mv repertoires/* . >/dev/null 2>&1 || :) &&
          rm -rf repertoires &&
      #end if

      ## Link additional files under their original names; never overwrite
      ## a file that is already present in the working directory.
      #if $data_input
          #for $input in $data_input
              if [ -e ./'$input.element_identifier' ]; then printf "File '%s' already exists in the input folder, skipping.\n" '$input.element_identifier'; else ln -s '$input' '$input.element_identifier'; fi &&
          #end for
      #end if

      cp '$yaml_input' yaml_copy &&
      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyTrainMLModel &&
      mv '${html_outfile.files_path}'/index.html '${html_outfile}' &&
      mv '${html_outfile.files_path}'/exported_models/*.zip '${optimal_model}' &&
      mv '${html_outfile.files_path}'/immuneML_output.zip '$archive'
      ]]>
  </command>
  <inputs>
      <!-- YAML specification that is copied to yaml_copy and passed to immune-ml. -->
      <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
      <!-- Optional extra files, symlinked into the working directory by the command.
           format="data" states the default explicitly so any datatype is accepted
           by declaration rather than by omission. -->
      <param name="data_input" type="data" format="data" multiple="true" label="Additional files" optional="true" help="This field should include individual repertoire files, metadata files, receptor data and others."/>
      <!-- Optional immuneML dataset; the command copies the "result" folder from its extra files. -->
      <param name="iml_input" type="data" format="immuneml_receptors" label="Dataset input" optional="true" help="This field accepts an ImmuneML dataset, as created by the Create Dataset tool."/>
  </inputs>
    <outputs>
        <!-- The .zip found under exported_models/ in the immuneML output folder. -->
        <data name="optimal_model"
              format="zip"
              label="optimal_ml_settings.zip"/>
        <!-- immuneML_output.zip: the complete output folder of the run. -->
        <data name="archive"
              format="zip"
              label="Archive: ML model training"/>
        <!-- index.html from the immuneML output folder; its files_path is the run's output directory. -->
        <data name="html_outfile"
              format="html"
              label="Summary: ML model training"/>
    </outputs>


  <help>
      <![CDATA[

      This tool can be used to run hyperparameter optimization over several different ML settings,
      which include ML models and their parameters, encodings and preprocessing steps. Nested cross-validation is used to identify the optimal combination of ML settings.

      This is a YAML-based Galaxy tool. If you prefer a button-based interface that assumes less ML knowledge,
      see `Train immune receptor classifiers (easy interface) <root?tool_id=immuneml_train_classifiers>`_ and
      `Train immune repertoire classifiers (easy interface) <root?tool_id=novice_immuneml_interface>`_.

      For more details on how to train ML models in Galaxy, see `the documentation <https://docs.immuneml.uio.no/latest/galaxy/galaxy_train_ml_models.html>`_.

      **Tool output**

      This Galaxy tool will produce the following history elements:

      - Summary: ML model training: an HTML page that allows you to browse through all results, including prediction accuracies on
        the various data splits and report results.

      - Archive: ML model training: a .zip file containing the complete output folder as it was produced by immuneML. This folder
        contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
        Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.

      - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding, and
        optionally preprocessing steps). This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/latest/galaxy/galaxy_apply_ml_models.html>`_.

    ]]>

  </help>

</tool>