Mercurial > repos > immuneml > immuneml_tools
annotate immuneml_train_recept.xml @ 2:9bf78cb6b91d draft
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
author | immuneml |
---|---|
date | Thu, 01 Jul 2021 14:43:41 +0000 |
parents | 629e7e403e19 |
children | ed3932e6d616 |
rev | line source |
---|---|
0
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
1 <tool id="immuneml_train_classifiers" name="Train immune receptor classifiers (simplified interface)" version="@VERSION@.0"> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
2 <description></description> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
3 <macros> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
4 <import>prod_macros.xml</import> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
5 </macros> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
6 <expand macro="requirements" /> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
7 <command><![CDATA[ |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
8 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
9 #if $iml_input |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
10 cp -r ${iml_input.extra_files_path}/result/* . && |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
11 (mv repertoires/* . &>/dev/null || : ) && |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
12 rm -rf repertoires && |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
13 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
14 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
15 python '$__tool_directory__/build_yaml_from_arguments_wrapper.py' --output_path $specs.files_path |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
16 #if $labels |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
17 --labels "$labels" |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
18 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
19 #if $ml_methods |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
20 #set methods_splitted = str($ml_methods).replace(",", " ") |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
21 --ml_methods $methods_splitted |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
22 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
23 #if $training_percentage |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
24 --training_percentage $training_percentage |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
25 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
26 #if $split_count |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
27 --split_count $split_count |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
28 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
29 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
30 --gap_type $gap_cond.gap_type |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
31 #if $gap_cond.gap_type == "ungapped" |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
32 --k $gap_cond.k |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
33 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
34 #if $gap_cond.gap_type == "gapped" |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
35 --k_left $gap_cond.k_left |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
36 --k_right $gap_cond.k_right |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
37 --min_gap $gap_cond.min_gap |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
38 --max_gap $gap_cond.max_gap |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
39 #end if |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
40 --position_type $position_type |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
41 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
42 && cp ${specs.files_path}/specs.yaml yaml_copy && |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
43 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
44 immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxyTrainMLModel |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
45 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
46 && mv ${html_outfile.files_path}/index.html ${html_outfile} |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
47 && mv ${specs.files_path}/specs.yaml ${specs} |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
48 && mv ${html_outfile.files_path}/immuneML_output.zip $archive |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
49 && mv ${html_outfile.files_path}/exported_models/*.zip ${optimal_model} |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
50 ]]> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
51 </command> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
52 <inputs> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
53 <param name="iml_input" type="data" format="iml_dataset" label="Input dataset (immune receptors)" help="This field accepts receptor datasets in the ImmuneML dataset format, as created by the Create Dataset tool."/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
54 <param type="text" name="labels" optional="false" label="Which property (“label”) of the receptors would you like to predict?" help="A receptor property to predict could for example be the epitope it binds to. Such properties must be present as receptor sequence metadata. For example, when using data in VDJdb format, the default fields are named ‘epitope’, ‘epitope_gene’ and ‘epitope_species’. One of these labels must be chosen here."/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
55 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
56 <conditional name="gap_cond"> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
57 <param type="select" name="gap_type" label="The immune receptors will be classified based on subsequences found in the CDR3 region. I assume that the signal that determines the receptor label is:" display="radio"> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
58 <option value="ungapped">A contiguous subsequence of amino acids</option> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
59 <option value="gapped">Two subsequences of amino acids, possibly separated by a gap</option> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
60 </param> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
61 <when value="gapped"> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
62 <param type="integer" name="k_left" label="Given a gapped signal, the sequence length before the gap is:" value="2" min="0"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
63 <param type="integer" name="k_right" label="And the sequence length after the gap is:" value="2" min="0"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
64 <param type="integer" name="min_gap" label="While the minimal gap length is:" value="0" min="0"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
65 <param type="integer" name="max_gap" label="And the maximal gap length is:" value="5" min="0"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
66 </when> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
67 <when value="ungapped"> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
68 <param type="integer" name="k" label="Given a contiguous subsequence of amino acids containing a signal, the expected length of this subsequence is:" value="3" min="0"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
69 </when> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
70 </conditional> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
71 <param type="boolean" name="position_type" label="If the same subsequence occurs in a different position in two receptors, is this expected to be the same signal? " |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
72 truevalue="invariant" falsevalue="positional" checked="true"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
73 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
74 <param type="select" name="ml_methods" label="Which ML methods would you like to include?" help="For each ML method, the optimal hyper parameter settings are determined and the performance of the methods is compared to each other." |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
75 display="checkboxes" multiple="true"> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
76 <option value="RandomForestClassifier">Random forest</option> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
77 <option value="SimpleLogisticRegression">Logistic regression</option> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
78 <option value="SVM">Support Vector Machine</option> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
79 <option value="KNN">K-nearest neighbors</option> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
80 </param> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
81 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
82 <param type="integer" name="training_percentage" label="Percentage of data that is used for training + validation (the remainder is used for testing):" value="70" min="50" max="90" help="This part of the data is used for training the classifier i.e., learning the relevant patterns in the data and determining the optimal hyper parameter settings for the classifier. The remaining data is used to test the performance of the classifier. There is no golden rule that determines the optimal percentage of training data, but typically a value between 60 and 80% is chosen."/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
83 <param type="integer" name="split_count" label="Number of times to repeat the training process with different random splits of data:" value="5" min="0" help="The more often the experiment is repeated, the better the performance of the ML models can be estimated, but the longer it will take for the analysis to complete."/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
84 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
85 </inputs> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
86 <outputs> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
87 <data format="txt" name="specs" label="receptor_classification.yaml"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
88 <data format="zip" name="optimal_model" label="optimal_ml_settings.zip"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
89 <data format="zip" name="archive" label="Archive: receptor classification"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
90 <data format="html" name="html_outfile" label="Summary: receptor classification"/> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
91 </outputs> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
92 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
93 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
94 <help><![CDATA[ |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
95 The purpose of this tool is to train machine learning (ML) models to predict a characteristic per immune receptor, such as |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
96 antigen specificity. One or more ML models are trained to classify receptors based on the information within the CDR3 sequence(s). Finally, the performance |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
97 of the different methods is compared. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
98 Alternatively, if you want to predict a property per immune repertoire, such as disease status, check out the |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
99 `Train immune repertoire classifiers (simplified interface) <https://galaxy.immuneml.uio.no/root?tool_id=novice_immuneml_interface>`_ tool instead. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
100 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
101 The full documentation can be found `here <https://docs.immuneml.uio.no/galaxy/galaxy_simple_receptors.html>`_. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
102 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
103 **Basic terminology** |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
104 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
105 In the context of ML, the characteristics to predict per receptor are called **labels** and the values that these labels can |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
106 take on are **classes**. One could thus have a label named ‘epitope’ with possible classes ‘binding_gluten’ and ‘not_binding_gluten’. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
107 The labels and classes must be present in the receptor metadata. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
108 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
109 When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
110 classes. An ML model that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions, |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
111 including the presence of a specific subsequence or conserved positions. Our assumptions about what makes up a ‘signal’ |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
112 determines how we should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
113 the user's assumptions about the dataset. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
114 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
115 .. image:: https://docs.immuneml.uio.no/_images/receptor_classification_overview.png |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
116 :height: 500 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
117 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
118 | |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
119 | |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
120 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
121 **An overview of the components of the immuneML receptor classification tool.** |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
122 ImmuneML reads in receptor data with labels (+ and -), encodes the data, trains user-specified ML models and summarizes |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
123 the performance statistics per ML method. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
124 Encoding: position dependent and invariant encoding are shown. The specificity-associated subsequences are highlighted |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
125 with color. The different colors represent independent elements of the antigen specificity signal. Each color represents |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
126 one subsequence, and position dependent subsequences can only have the same color when they occur in the same position, |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
127 although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
128 Training: the training and validation data is used to train ML models and find the optimal hyperparameters through |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
129 5-fold cross-validation. The test set is left out and is used to obtain a fair estimate of the model performance. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
130 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
131 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
132 **Encoding** |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
133 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
134 Encodings for immune receptor data represent the immune receptor based on the subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers) |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
135 in the CDR3 regions. The CDR3 regions are divided into overlapping subsequences and the (antigen specificity) |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
136 signal may be characterized by the presence or absence of certain sequence motifs in the CDR3 region. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
137 A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
138 (here: antibody) is shown in this figure: |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
139 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
140 .. image:: https://docs.immuneml.uio.no/_images/3mer_to_3d.png |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
141 :height: 250 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
142 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
143 | |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
144 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
145 The subsequences may be position dependent or invariant. Position invariant means that if a subsequence, e.g., |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
146 ‘EDNA’ occurs in different positions in the CDR3 it will still be considered the same signal. This is not the case for |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
147 position dependent subsequences, if ‘EDNA’ often occurs in the beginning of the CDR3 in antigen binding receptors, |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
148 then finding ‘EDNA’ in the end of a CDR3 in a new receptor will be considered unrelated. Positions are determined based |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
149 on the IMGT numbering scheme. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
150 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
151 Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
152 motif is defined by two subsequences separated by a region of varying nucleotide or amino acid length. Thus, the |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
153 subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be considered to be part of the same motif: ‘ED’ followed by ‘NA’ |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
154 with a gap of 0 – 5 amino acids in between. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
155 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
156 Note that in any case, the subsequences that are associated with the ‘positive’ class may still be present in the ‘negative’ |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
157 class, albeit at a lower rate. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
158 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
159 **Training a machine learning model** |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
160 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
161 Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune receptor. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
162 Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**, these |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
163 hyperparameters do not directly change the predictions of a model, but they control the learning process (for example: the learning speed). |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
164 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
165 The immune receptors are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
166 and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
167 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
168 In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
169 training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
170 portion is used to validate the performance of these hyperparameters settings. This is repeated 5 times such that each portion has been used for |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
171 validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
172 is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
173 training + validation and testing sets is done (see the overview figure). |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
174 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
175 Finally, the whole process is repeated one or more times with different randomly selected receptors in the test set, to see how robust the performance |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
176 of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
177 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
178 **Tool output** |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
179 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
180 This Galaxy tool will produce the following history elements: |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
181 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
182 - Summary: receptor classification: a HTML page that allows you to browse through all results, including prediction accuracies on |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
183 the various data splits and plots showing the performance of classifiers and learned parameters. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
184 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
185 - Archive: receptor classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
186 contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
187 Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
188 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
189 - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding). |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
190 This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_apply_ml_models.html>`_. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
191 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
192 - receptor_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
193 downloaded, altered, and run again by immuneML using the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
194 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
195 **More analysis options** |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
196 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
197 A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
198 the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
199 This tool provides other encodings and machine learning methods to choose from, as well as |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
200 data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
201 machine learning from the user. |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
202 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
203 ]]> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
204 </help> |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
205 |
629e7e403e19
"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff
changeset
|
206 </tool> |