diff immuneml_simulate_dataset.xml @ 0:629e7e403e19 draft

"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
author immuneml
date Thu, 01 Jul 2021 11:36:43 +0000
parents
children ed3932e6d616
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_simulate_dataset.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,61 @@
+<tool id="immuneml_simulate_dataset" name="Simulate a synthetic immune receptor or repertoire dataset" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+  <command><![CDATA[
+
+      cp "$yaml_input" yaml_copy &&
+      immune-ml ./yaml_copy ${html_outfile.files_path} --tool DataSimulationTool &&
+
+      mv ${html_outfile.files_path}/index.html ${html_outfile} && 
+      mv ${html_outfile.files_path}/immuneML_output.zip $archive
+
+      ]]>
+  </command>
+  <inputs>
+      <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+  </inputs>
+    <outputs>
+        <data format="zip" name="archive" label="Archive: dataset simulation"/>
+        <data format="iml_dataset" name="html_outfile" label="ImmuneML dataset (simulated sequences)"/>
+    </outputs>
+
+
+  <help><![CDATA[
+
+        This Galaxy tool allows you to quickly make a dummy dataset.
+        The tool generates a SequenceDataset, ReceptorDataset or RepertoireDataset consisting of random CDR3 sequences, which could be used for benchmarking machine learning methods or encodings,
+        or testing out other functionalities.
+        The amino acids in the sequences are chosen from a uniform random distribution, and there is no underlying structure in the sequences.
+
+        You can control:
+
+        - The amount of sequences in the dataset, and in the case of a RepertoireDataset, the amount of repertoires
+
+        - The length of the generated sequences
+
+        - Labels, which can be used as a target when training ML models
+
+        Note that since these labels are randomly assigned, they do not bear any meaning and it is not possible to train a ML model with high classification accuracy on this data.
+        Meaningful labels can be added using the `Simulate immune events into existing repertoire/receptor dataset <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_simulation>`_ Galaxy tool.
+
+        For the exhaustive documentation of this tool and an example YAML specification, see the tutorial `How to simulate an AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_simulate_dataset.html>`_.
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - ImmuneML dataset (simulated sequences): a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+          (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+        - Archive: dataset simulation: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+          contains the output of the DatasetExport instruction including raw data files.
+          Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+    ]]>
+  </help>
+
+</tool>