# HG changeset patch
# User immuneml
# Date 1625139403 0
# Node ID 629e7e403e19ef784c5aa81e3e4a8fe4fc422157
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
diff -r 000000000000 -r 629e7e403e19 README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,23 @@
+# immuneml_tools
+Galaxy tool wrappers for immuneML.
+https://immuneml.uio.no/
+
+## Installation:
+The tools can be installed from a Galaxy toolshed. You can also install them offline by editing Galaxy config files in the usual way.
+
+### New datatype `iml_dataset`
+No matter how you install the tools, you will need to define a new datatype, which is done as follows:
+
+1. In your `galaxy.yml` look up the name of your `datatypes_config_file`. If the name is not yet defined, set
+```
+datatypes_config_file: datatypes_conf.xml
+```
+2. Make `datatypes_conf.xml` by copying `datatypes_conf.xml.sample` unless a `datatypes_config_file` was already defined.
+3. Add the following line to your `datatypes_config_file`:
+```
+
+```
+The line has to be inside the `<registration>` element along with the other datatypes.
+
+### The immuneML conda package
+Galaxy will need to install the immuneML conda package. This conda installation typically takes several minutes.
diff -r 000000000000 -r 629e7e403e19 build_dataset_yaml_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/build_dataset_yaml_wrapper.py Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,4 @@
"""Galaxy entry point for immuneML's dataset-YAML builder.

Forwards all command-line arguments (excluding the program name) to
``immuneML.api.galaxy.build_dataset_yaml.main``, which constructs the
YAML specification for dataset creation.
"""
import sys
from immuneML.api.galaxy.build_dataset_yaml import main
if __name__ == "__main__":
    # Pass only the user-supplied arguments, not sys.argv[0].
    main(sys.argv[1:])
diff -r 000000000000 -r 629e7e403e19 build_yaml_from_arguments_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/build_yaml_from_arguments_wrapper.py Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,4 @@
"""Galaxy entry point for immuneML's argument-based YAML builder.

Forwards all command-line arguments (excluding the program name) to
``immuneML.api.galaxy.build_yaml_from_arguments.main``, which turns the
Galaxy form arguments into a full immuneML YAML specification.
"""
import sys
from immuneML.api.galaxy.build_yaml_from_arguments import main
if __name__ == "__main__":
    # Pass only the user-supplied arguments, not sys.argv[0].
    main(sys.argv[1:])
diff -r 000000000000 -r 629e7e403e19 immuneml_create_dataset.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_create_dataset.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,178 @@
+
+
+
+ prod_macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `_).
+
+ The imported immuneML dataset is stored in a Galaxy collection, which will appear as a history item on the right side of the screen,
+ and can later be selected as input to other tools.
+
+ The tool has a simplified and an advanced interface. The simplified interface is fully button-based, and relies
+ on default settings for importing datasets. The advanced interface gives full control over import settings through a YAML
+ specification. In most cases, the simplified interface will suffice.
+
+ For the exhaustive documentation of this tool and more information about immuneML datasets, see the tutorial `How to make an immuneML dataset in Galaxy `_.
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - ImmuneML dataset: a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+ (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+ - create_dataset.yaml: the YAML specification file that was used by immuneML to create the dataset.
+ This file can be downloaded and altered (for example to export files in AIRR format, or use non-standard import parameters),
+ and run again using the 'Advanced' interface.
+
+ ]]>
+
+
+
diff -r 000000000000 -r 629e7e403e19 immuneml_simulate_dataset.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_simulate_dataset.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,61 @@
+
+
+
+ prod_macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `_ Galaxy tool.
+
+ For the exhaustive documentation of this tool and an example YAML specification, see the tutorial `How to simulate an AIRR dataset in Galaxy `_.
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - ImmuneML dataset (simulated sequences): a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+ (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+ - Archive: dataset simulation: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+ contains the output of the DatasetExport instruction including raw data files.
+ Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+ ]]>
+
+
+
diff -r 000000000000 -r 629e7e403e19 immuneml_simulate_events.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_simulate_events.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,59 @@
+
+
+
+ prod_macros.xml
+
+
+ /dev/null || :) &&
+ rm -rf repertoires &&
+ #end if
+
+ cp "$yaml_input" yaml_copy &&
+ immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxySimulationTool &&
+
+ mv ${html_outfile.files_path}/index.html ${html_outfile} &&
+ mv ${html_outfile.files_path}/immuneML_output.zip $archive
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ `_.
+
+ For the exhaustive documentation of this tool and an example YAML specification, see the tutorial `How to simulate immune events into an existing AIRR dataset in Galaxy `_.
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - ImmuneML dataset (simulated immune signals): a repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+ (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+ - Archive: immune signal simulation: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+ contains the output of the Simulation instruction including all raw data files.
+ Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+ ]]>
+
+
+
diff -r 000000000000 -r 629e7e403e19 immuneml_train_ml_model.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_train_ml_model.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,72 @@
+
+
+
+ prod_macros.xml
+
+
+ /dev/null || :) &&
+ rm -rf repertoires &&
+ #end if
+
+ #set $input_orig_names = []
+ #if $data_input
+ #for $input in $data_input
+ #set input_orig_names += [str($input.element_identifier)]
+ ([ -e ./"$input.element_identifier" ] && echo "File '$input.element_identifier' already exists in the input folder, skipping." || ln -s $input "$input.element_identifier") &&
+ #end for#
+ #end if
+
+ cp "$yaml_input" yaml_copy &&
+ immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxyTrainMLModel &&
+ mv ${html_outfile.files_path}/index.html ${html_outfile} &&
+ mv ${html_outfile.files_path}/exported_models/*.zip ${optimal_model} &&
+ mv ${html_outfile.files_path}/immuneML_output.zip $archive
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `_ and
+ `Train immune repertoire classifiers (easy interface) `_.
+
+ For more details on how to train ML models in Galaxy, see `the documentation `_.
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - Summary: ML model training: an HTML page that allows you to browse through all results, including prediction accuracies on
+ the various data splits and report results.
+
+ - Archive: ML model training: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+ contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
+ Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+ - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding, and
+ optionally preprocessing steps). This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy `_.
+
+ ]]>
+
+
+
+
diff -r 000000000000 -r 629e7e403e19 immuneml_train_recept.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_train_recept.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,206 @@
+
+
+
+ prod_macros.xml
+
+
+ /dev/null || : ) &&
+ rm -rf repertoires &&
+ #end if
+
+ python '$__tool_directory__/build_yaml_from_arguments_wrapper.py' --output_path $specs.files_path
+ #if $labels
+ --labels "$labels"
+ #end if
+ #if $ml_methods
+ #set methods_splitted = str($ml_methods).replace(",", " ")
+ --ml_methods $methods_splitted
+ #end if
+ #if $training_percentage
+ --training_percentage $training_percentage
+ #end if
+ #if $split_count
+ --split_count $split_count
+ #end if
+
+ --gap_type $gap_cond.gap_type
+ #if $gap_cond.gap_type == "ungapped"
+ --k $gap_cond.k
+ #end if
+ #if $gap_cond.gap_type == "gapped"
+ --k_left $gap_cond.k_left
+ --k_right $gap_cond.k_right
+ --min_gap $gap_cond.min_gap
+ --max_gap $gap_cond.max_gap
+ #end if
+ --position_type $position_type
+
+ && cp ${specs.files_path}/specs.yaml yaml_copy &&
+
+ immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxyTrainMLModel
+
+ && mv ${html_outfile.files_path}/index.html ${html_outfile}
+ && mv ${specs.files_path}/specs.yaml ${specs}
+ && mv ${html_outfile.files_path}/immuneML_output.zip $archive
+ && mv ${html_outfile.files_path}/exported_models/*.zip ${optimal_model}
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `_ tool instead.
+
+ The full documentation can be found `here `_.
+
+ **Basic terminology**
+
+ In the context of ML, the characteristics to predict per receptor are called **labels** and the values that these labels can
+ take on are **classes**. One could thus have a label named ‘epitope’ with possible classes ‘binding_gluten’ and ‘not_binding_gluten’.
+ The labels and classes must be present in the receptor metadata.
+
+ When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different
+ classes. An ML model that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions,
+ including the presence of a specific subsequence or conserved positions. Our assumptions about what makes up a ‘signal’
+ determines how we should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on
+ the user's assumptions about the dataset.
+
+ .. image:: https://docs.immuneml.uio.no/_images/receptor_classification_overview.png
+ :height: 500
+
+ |
+ |
+
+ **An overview of the components of the immuneML receptor classification tool.**
+ ImmuneML reads in receptor data with labels (+ and -), encodes the data, trains user-specified ML models and summarizes
+ the performance statistics per ML method.
+ Encoding: position dependent and invariant encoding are shown. The specificity-associated subsequences are highlighted
+ with color. The different colors represent independent elements of the antigen specificity signal. Each color represents
+ one subsequence, and position dependent subsequences can only have the same color when they occur in the same position,
+ although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position.
+ Training: the training and validation data is used to train ML models and find the optimal hyperparameters through
+ 5-fold cross-validation. The test set is left out and is used to obtain a fair estimate of the model performance.
+
+
+ **Encoding**
+
+ Encodings for immune receptor data represent the immune receptor based on the subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers)
+ in the CDR3 regions. The CDR3 regions are divided into overlapping subsequences and the (antigen specificity)
+ signal may be characterized by the presence or absence of certain sequence motifs in the CDR3 region.
+ A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor
+ (here: antibody) is shown in this figure:
+
+ .. image:: https://docs.immuneml.uio.no/_images/3mer_to_3d.png
+ :height: 250
+
+ |
+
+ The subsequences may be position dependent or invariant. Position invariant means that if a subsequence, e.g.,
+ ‘EDNA’ occurs in different positions in the CDR3 it will still be considered the same signal. This is not the case for
+ position dependent subsequences, if ‘EDNA’ often occurs in the beginning of the CDR3 in antigen binding receptors,
+ then finding ‘EDNA’ in the end of a CDR3 in a new receptor will be considered unrelated. Positions are determined based
+ on the IMGT numbering scheme.
+
+ Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a
+ motif is defined by two subsequences separated by a region of varying nucleotide or amino acid length. Thus, the
+ subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be considered to be part of the same motif: ‘ED’ followed by ‘NA’
+ with a gap of 0 – 5 amino acids in between.
+
+ Note that in any case, the subsequences that are associated with the ‘positive’ class may still be present in the ‘negative’
+ class, albeit at a lower rate.
+
+ **Training a machine learning model**
+
+ Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune receptor.
+ Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**; these
+ hyperparameters do not directly change the predictions of a model, but they control the learning process (for example: the learning speed).
+
+ The immune receptors are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters
+ and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model.
+
+ In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the
+ training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining
+ portion is used to validate the performance of these hyperparameters settings. This is repeated 5 times such that each portion has been used for
+ validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure
+ is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into
+ training + validation and testing sets is done (see the overview figure).
+
+ Finally, the whole process is repeated one or more times with different randomly selected receptors in the test set, to see how robust the performance
+ of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question.
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - Summary: receptor classification: an HTML page that allows you to browse through all results, including prediction accuracies on
+ the various data splits and plots showing the performance of classifiers and learned parameters.
+
+ - Archive: receptor classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+ contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
+ Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+ - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding).
+ This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy `_.
+
+ - receptor_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be
+ downloaded, altered, and run again by immuneML using the `Train machine learning models `_ Galaxy tool.
+
+ **More analysis options**
+
+ A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using
+ the `Train machine learning models `_ Galaxy tool.
+ This tool provides other encodings and machine learning methods to choose from, as well as
+ data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about
+ machine learning from the user.
+
+ ]]>
+
+
+
diff -r 000000000000 -r 629e7e403e19 immuneml_train_repert.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_train_repert.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,235 @@
+
+
+
+ prod_macros.xml
+
+
+ /dev/null || :) &&
+ rm -rf repertoires &&
+ #end if
+
+ python '$__tool_directory__/build_yaml_from_arguments_wrapper.py' --output_path $specs.files_path
+ #if $labels
+ --labels "$labels"
+ #end if
+ #if $ml_methods
+ #set methods_splitted = str($ml_methods).replace(",", " ")
+ --ml_methods $methods_splitted
+ #end if
+ #if $training_percentage
+ --training_percentage $training_percentage
+ #end if
+ #if $split_count
+ --split_count $split_count
+ #end if
+ #if $sequence_cond.sequence_type
+ --sequence_type $sequence_cond.sequence_type
+ #end if
+ #if $sequence_cond.sequence_type == "subsequence"
+ --position_type $sequence_cond.position_type
+ --gap_type $sequence_cond.gap_cond.gap_type
+ #if $sequence_cond.gap_cond.gap_type == "ungapped"
+ --k $sequence_cond.gap_cond.k
+ #end if
+ #if $sequence_cond.gap_cond.gap_type == "gapped"
+ --k_left $sequence_cond.gap_cond.k_left
+ --k_right $sequence_cond.gap_cond.k_right
+ --min_gap $sequence_cond.gap_cond.min_gap
+ --max_gap $sequence_cond.gap_cond.max_gap
+ #end if
+ #end if
+ #if $reads
+ --reads $reads
+ #end if
+
+ && cp ${specs.files_path}/specs.yaml yaml_copy &&
+
+ immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxyTrainMLModel
+
+ && mv ${html_outfile.files_path}/index.html ${html_outfile}
+ && mv ${specs.files_path}/specs.yaml ${specs}
+ && mv ${html_outfile.files_path}/immuneML_output.zip $archive
+ && mv ${html_outfile.files_path}/exported_models/*.zip ${optimal_model}
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `_ tool instead.
+
+ The full documentation can be found `here `_.
+
+ **Basic terminology**
+
+ In the context of ML, the characteristics to predict per repertoire are called **labels** and the values that these labels can take on are **classes**.
+ One could thus have a label named ‘CMV_status’ with possible classes ‘positive’ and ‘negative’. The labels and classes must be present in the metadata
+ file, in columns where the header and values correspond to the label and classes respectively.
+
+ .. image:: https://docs.immuneml.uio.no/_images/metadata_repertoire_classification.png
+ :height: 150
+
+ |
+
+ When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different classes. An ML model
+ that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions, including the presence of specific receptors,
+ groups of similar receptors or short CDR3 subsequences in an immune repertoire. Our assumptions about what makes up a ‘signal’ determines how we
+ should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on
+ the user's assumptions about the dataset.
+
+
+ .. image:: https://docs.immuneml.uio.no/_images/repertoire_classification_overview.png
+ :height: 500
+
+ |
+ |
+
+ **An overview of the components of the immuneML repertoire classification tool.**
+ immuneML reads in repertoire data with labels (+ and -), encodes the
+ data, trains user-specified ML models and summarizes the performance statistics per ML method.
+ Encoding: different forms of encoding are shown; full sequence encoding and position dependent and invariant subsequence encoding.
+ The disease-associated sequences or sub-sequences are highlighted with color. The different colors represent independent elements of the disease signal.
+ Each color represents one (sub)sequence, and position dependent subsequences can only have the same color when they occur in the same position,
+ although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position.
+ Training: the training and validation data is used to train ML models and find the optimal hyperparameters through 5-fold cross-validation.
+ The test set is left out and is used to obtain a fair estimate of the model performance.
+
+ **Encoding**
+
+ The simplest encoding represents an immune repertoire based on the full CDR3 sequences that it contains. This means the ML models will learn to look
+ at which CDR3 sequences are more often present in the ‘positive’ or ‘negative’ classes. It also means that two similar (non-identical) CDR3 sequences
+ are treated as independent pieces of information; if a particular sequence often occurs in diseased repertoires, then finding a similar sequence in a
+ new repertoire is no evidence for this repertoire also being diseased.
+
+ Other encoding variants are based on shorter subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers) in the CDR3 regions of an immune repertoire. With this
+ encoding, the CDR3 regions are divided into overlapping subsequences and the (disease) signal may be characterized by the presence or absence of
+ certain sequence motifs in the CDR3 regions. Here, two similar CDR3 sequences are no longer independent, because they contain many identical subsequences.
+ A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor
+ (here: antibody) is shown in this figure:
+
+ .. image:: https://docs.immuneml.uio.no/_images/3mer_to_3d.png
+ :height: 250
+
+ |
+
+ The subsequences may be position-dependent or invariant. Position invariant means that if a subsequence, e.g., ‘EDNA’ occurs in different positions
+ in the CDR3 it will still be considered the same signal. This is not the case for position dependent subsequences, if ‘EDNA’ often occurs in the
+ beginning of the CDR3 in diseased repertoires, then finding ‘EDNA’ in the end of a CDR3 in a new repertoire will be considered unrelated. Positions
+ are determined based on the IMGT numbering scheme.
+
+ Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a motif is defined by two
+ subsequences separated by a region of varying nucleotide or amino acid length. Thus, the subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be
+ considered to be part of the same motif: ‘ED’ followed by ‘NA’ with a gap of 0 – 5 amino acids in between.
+
+ Note that in any case, the (sub)sequences that are associated with the ‘positive’ class may still be present in the ‘negative’ class, albeit at a lower rate.
+
+
+
+ **Training a machine learning model**
+
+ Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune repertoire.
+ Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**, which
+ do not directly change the predictions of a model, but they control the learning process (for example: the learning speed).
+
+ The immune repertoires are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters
+ and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model.
+
+ In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the
+ training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining
+ portion is used to validate the performance of these hyperparameter settings. This is repeated 5 times such that each portion has been used for
+ validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure
+ is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into
+ training + validation and testing sets is done (see the overview figure).
+
+ Finally, the whole process is repeated one or more times with different randomly selected repertoires in the test set, to see how robust the performance
+ of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question.
+
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - Summary: repertoire classification: an HTML page that allows you to browse through all results, including prediction accuracies on
+ the various data splits and plots showing the performance of classifiers and learned parameters.
+
+ - Archive: repertoire classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+ contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
+ Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+ - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding).
+ This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy `_.
+
+ - repertoire_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be
+ downloaded, altered, and run again by immuneML using the `Train machine learning models `_ Galaxy tool.
+
+ **More analysis options**
+
+ A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using
+ the `Train machine learning models `_ Galaxy tool.
+ This tool provides other encodings and machine learning methods to choose from, as well as
+ data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about
+ machine learning from the user.
+
+ ]]>
+
+
+
diff -r 000000000000 -r 629e7e403e19 immuneml_yaml.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_yaml.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,74 @@
+
+
+
+ prod_macros.xml
+
+
+ /dev/null || :) &&
+ rm -rf repertoires &&
+ #end if
+
+ #set $input_orig_names = []
+ #if $data_input
+ #for $input in $data_input
+ #set input_orig_names += [str($input.element_identifier)]
+ ([ -e ./"$input.element_identifier" ] && echo "File '$input.element_identifier' already exists in the input folder, skipping." || ln -s $input "$input.element_identifier") &&
+ #end for#
+ #end if
+
+ cp "$yaml_input" yaml_copy &&
+ immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxyYamlTool &&
+ mv ${html_outfile.files_path}/index.html ${html_outfile} &&
+ mv ${html_outfile.files_path}/immuneML_output.zip $archive
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ `_,
+ `simulating synthetic data `_,
+ `implanting synthetic immune signals `_ or
+ `training `_ and
+ `applying `_ ML models instead of this tool.
+ These other tools are able to export the relevant output files to Galaxy history elements.
+
+ However, when you want to run the `ExploratoryAnalysis `_ instruction,
+ or other analyses that do not have a corresponding Galaxy tool, this generic tool can be used.
+
+ For the exhaustive documentation of this tool and an example YAML specification for exploratory analysis, see the tutorial `How to run any AIRR ML analysis in Galaxy `_.
+
+
+ **Tool output**
+
+ This Galaxy tool will produce the following history elements:
+
+ - Summary: immuneML analysis: an HTML page that allows you to browse through all results.
+
+ - ImmuneML Analysis Archive: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+ contains the output of the instruction that was used, including all raw data files.
+ Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+ ]]>
+
+
+
diff -r 000000000000 -r 629e7e403e19 metadata.png
Binary file metadata.png has changed
diff -r 000000000000 -r 629e7e403e19 prod_macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prod_macros.xml Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,9 @@
+
+ 2.0.1
+
+
+ immuneML
+
+
+
+
diff -r 000000000000 -r 629e7e403e19 repertoire_classification_overview.png
Binary file repertoire_classification_overview.png has changed
diff -r 000000000000 -r 629e7e403e19 test.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test.py Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,60 @@
+import argparse
+import os
+from shutil import copyfile
+
+#immuneml --inputs file1 file2 file3 --output_dir /some/path --yaml_path abc.yml --metadata abc.csv --tool galaxy_yaml_tool
+
def get_args():
    """Parse the command-line arguments of this immuneML Galaxy wrapper.

    Returns:
        argparse.Namespace with ``output_dir``, ``inputs`` (list of files),
        ``yaml``, ``metadata`` and ``tool`` attributes.
    """
    # Fixed description: the previous one ("Tool for detecting known and novel
    # MicroRNAs") was copy-pasted from an unrelated tool and misled --help users.
    parser = argparse.ArgumentParser(description='Galaxy wrapper tool for immuneML')
    # 'default' is never consulted for required arguments, so it is omitted there.
    parser.add_argument('-o', '--output_dir', help='Output directory', required=True)
    # nargs='+' collects one or more input files (see usage example above).
    parser.add_argument('-i', '--inputs', help='Input files', required=True, nargs='+')
    parser.add_argument('-y', '--yaml', help='Yaml input', required=True)
    parser.add_argument('-m', '--metadata', help='Metadata input', default='.', required=False)
    parser.add_argument('-t', '--tool', help='Tool', default='.', required=False)

    return parser.parse_args()
+
+
+def main():
+ print('main')
+ args = get_args()
+
+ print(args.output_dir)
+ print(args.inputs)
+
+ #os.mkdir(args.output_dir)
+ i = 0
+ html_files_links = ''
+ for f in args.inputs:
+ filename = str(i) + '.txt'
+ copyfile(f, os.path.join(args.output_dir, str(i) + '.txt'))
+ i += 1
+ html_files_links += '