Mercurial > repos > bgruening > split_file_to_collection
diff split_file_to_collection.xml @ 0:de3c2c88e710 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
author | bgruening |
---|---|
date | Tue, 17 Jul 2018 14:37:13 -0400 |
parents | |
children | 750c1684d47c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/split_file_to_collection.xml Tue Jul 17 14:37:13 2018 -0400 @@ -0,0 +1,289 @@ +<tool id="split_file_to_collection" name="Split file" version="0.1.1"> + <description>to dataset collection</description> + <macros> + <xml name="numnew_fname"> + <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/> + <param name="newfilenames" type="text" label="Base name for new files in collection" + help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/> + <conditional name="select_allocate"> + <param name="allocate" type="select" label="Method to allocate records to new files" help="See the information section for a diagram"> + <option value="random">At random</option> + <option value="batch">Maintain record order</option> + <option value="byrow" selected="true">Alternate output files</option> + </param> + <when value="random"> + <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (i.e. '1010')" value="1010"/> + </when> + <when value="batch"> + </when> + <when value="byrow"> + </when> + </conditional> + </xml> + </macros> + <requirements> + <requirement type="package" version="3.5">python</requirement> + </requirements> + <command detect_errors="aggressive"><![CDATA[ + mkdir ./out && + python '$__tool_directory__/split_file_to_collection.py' + --out ./out + --in '$split_parms.input' + --ftype '$split_parms.select_ftype' + #if $split_parms.select_ftype == "tabular": + --top '$split_parms.top' + --by '$split_parms.split_by.select_split_by' + #if $split_parms.split_by.select_split_by == "col": + --id_column '$split_parms.split_by.id_col' + --match '$split_parms.split_by.match_regex' + --sub '$split_parms.split_by.sub_regex' + #else + --numnew '$split_parms.split_by.numnew' + #if $split_parms.split_by.select_allocate.allocate == "random": + --rand + --seed '$split_parms.split_by.rand.seed' + #end if + #if $split_parms.split_by.select_allocate.allocate == "batch": + --batch + #end if + #end if + #else + --numnew '$split_parms.numnew' + #if $split_parms.select_allocate.allocate == "random": + --rand + --seed '$split_parms.select_allocate.seed' + #end if + #if $split_parms.select_allocate.allocate == "batch": + --batch + #end if + #end if + #if ($split_parms.select_ftype == "tabular" and $split_parms.split_by.select_split_by == "row"): + --file_names '$split_parms.split_by.newfilenames' + --file_ext '$split_parms.select_ftype' + #end if + #if $split_parms.select_ftype != "tabular": + --file_names '$split_parms.newfilenames' + --file_ext '$split_parms.select_ftype' + #end if + > '$log' + ]]></command> + <inputs> + <conditional name="split_parms"> + <param name="select_ftype" type="select" label="Select the file type to split"> + <option value="mgf">MGF</option> + <option value="fastq">FASTQ</option> + <option value="tabular">Tabular</option> + <option value="fasta">FASTA</option> + </param> + <when value="tabular"> + <param name="input" type="data" format="tabular" label="Tabular file to split"/> + <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/> + <conditional name="split_by"> + <param name="select_split_by" type="select" label="Split by row or by a column?"> + <option value="row">By row</option> + <option value="col">By column</option> + </param> + <when value="col"> + <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/> + <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)"> + <sanitizer> + <valid> + <add preset="string.printable"/> + <remove value="\" /> + <remove value="'" /> + </valid> + <mapping initial="none"> + <add source="\" target="__backslash__" /> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1"> + <sanitizer> + <valid> + <add preset="string.printable"/> + <remove value="\" /> + <remove value="'" /> + </valid> + <mapping initial="none"> + <add source="\" target="__backslash__" /> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + </when> + <when value="row"> + <expand macro="numnew_fname"/> + </when> + </conditional> + </when> + <when value="mgf"> + <param name="input" type="data" format="mgf" label="MGF file to split"/> + <expand macro="numnew_fname"/> + </when> + <when value="fastq"> + <param name="input" type="data" format="fastq" label="FASTQ file to split"/> + <expand macro="numnew_fname"/> + </when> + <when value="fasta"> + <param name="input" type="data" format="fasta" label="FASTA file to split"/> + <expand macro="numnew_fname"/> + </when> + </conditional> + </inputs> + <outputs> + <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: output collection"> + <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/> + <filter>split_parms['select_ftype'] == "tabular"</filter> + </collection> + <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: output collection"> + <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/> + <filter>split_parms['select_ftype'] == "mgf"</filter> + </collection> + <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: output collection"> + <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/> + <filter>split_parms['select_ftype'] == "fasta"</filter> + </collection> + <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: output collection"> + <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/> + <filter>split_parms['select_ftype'] == "fastq"</filter> + </collection> + <data name="log" format="txt" label="${tool.name} on ${on_string}: log" /> + </outputs> + <tests> + <test> + <param name="input" value="test.tabular" ftype="tabular"/> + <param name="select_ftype" value="tabular"/> + <param name="select_split_by" value="col"/> + <param name="id_col" value="1"/> + <param name="match_regex" value="(.*)\.mgf"/> + <param name="sub_regex" value="\1.tab"/> + <param name="top" value="2"/> + <output_collection name="list_output_tab" type="list"> + <element name="foo.tab" file="foo.tab" ftype="tabular"/> + <element name="foo2.tab" file="foo2.tab" ftype="tabular"/> + <element name="foo3.tab" file="foo3.tab" ftype="tabular"/> + </output_collection> + </test> + <test> + <param name="input" value="test.tabular" ftype="tabular"/> + <param name="select_ftype" value="tabular"/> + <param name="select_split_by" value="row"/> + <param name="top" value="2"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="test"/> + <output_collection name="list_output_tab" type="list"> + <element name="test_0.tabular" file="test_0.tabular" ftype="tabular"/> + <element name="test_1.tabular" file="test_1.tabular" ftype="tabular"/> + </output_collection> + </test> + <test> + <param name="input" value="test.tabular" ftype="tabular"/> + <param name="select_ftype" value="tabular"/> + <param name="select_split_by" value="row"/> + <param name="top" value="2"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="batch_tab"/> + <param name="allocate" value="batch"/> + <output_collection name="list_output_tab" type="list"> + <element name="batch_tab_0.tabular" file="batch_tab_0.tabular" ftype="tabular"/> + <element name="batch_tab_1.tabular" file="batch_tab_1.tabular" ftype="tabular"/> + </output_collection> + </test> + <test> + <param name="input" value="psm.tabular" ftype="tabular"/> + <param name="select_ftype" value="tabular"/> + <param name="select_split_by" value="col"/> + <param name="id_col" value="10"/> + <param name="match_regex" value="(.*)\.mgf"/> + <param name="sub_regex" value="\1.tab"/> + <param name="top" value="1"/> + <output_collection name="list_output_tab" type="list"> + <element name="file1.tab" file="file1.tab" ftype="tabular"/> + <element name="file2.tab" file="file2.tab" ftype="tabular"/> + <element name="file3.tab" file="file3.tab" ftype="tabular"/> + <element name="file4.tab" file="file4.tab" ftype="tabular"/> + </output_collection> + </test> + <test> + <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> + <param name="select_ftype" value="mgf"/> + <param name="numnew" value="3"/> + <param name="newfilenames" value="demo"/> + <output_collection name="list_output_mgf" type="list"> + <element name="demo_0.mgf" file="demo_0.mgf" ftype="mgf"/> + <element name="demo_1.mgf" file="demo_1.mgf" ftype="mgf"/> + <element name="demo_2.mgf" file="demo_2.mgf" ftype="mgf"/> + </output_collection> + </test> + <test> + <param name="input" value="test.fasta" ftype="fasta"/> + <param name="select_ftype" value="fasta"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="test"/> + <output_collection name="list_output_fasta" type="list"> + <element name="test_0.fasta" file="test_0.fasta" ftype="fasta"/> + <element name="test_1.fasta" file="test_1.fasta" ftype="fasta"/> + </output_collection> + </test> + <test> + <param name="input" value="test.fastq" ftype="fastq"/> + <param name="select_ftype" value="fastq"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="test"/> + <output_collection name="list_output_fastq" type="list"> + <element name="test_0.fastq" file="test_0.fastq" ftype="fastq"/> + <element name="test_1.fastq" file="test_1.fastq" ftype="fastq"/> + </output_collection> + </test> + <test> + <param name="input" value="test.fasta" ftype="fasta"/> + <param name="select_ftype" value="fasta"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="rand"/> + <param name="allocate" value="random"/> + <param name="seed" value="1010"/> + <output_collection name="list_output_fasta" type="list"> + <element name="rand_0.fasta" file="rand_0.fasta" ftype="fasta"/> + <element name="rand_1.fasta" file="rand_1.fasta" ftype="fasta"/> + </output_collection> + </test> + <test> + <param name="input" value="test.fasta" ftype="fasta"/> + <param name="select_ftype" value="fasta"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="fasta_batch"/> + <param name="allocate" value="batch"/> + <output_collection name="list_output_fasta" type="list"> + <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> + <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> + </output_collection> + </test> + </tests> + <help><![CDATA[ +**Split file into a dataset collection** + +This tool can split five types of files into a separate files within a dataset collection: MGF, FASTA, FASTQ, and tabular. +If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. +In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. +The default regular expression uses each value in the column without modifying it. + +If splitting by line (or by some other item, like a FASTA entry or an MGF section), the splitting can be either done sequentially or at random. +Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. + +**Note** + +Due to current limitations with dataset collections, a log file is produced when running this tool. It will usually be empty, but if the tool fails, any errors will be printed to the log file. + ]]></help> + <citations> + <citation type="bibtex"> +@misc{githubsplit, + author = {Easterly, Caleb}, + year = {2018}, + title = {split_file_to_collection: a Galaxy tool}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/galaxyproteomics/tools-galaxyp/tools/split_file_to_collection}, +}</citation> + </citations> +</tool>