Mercurial > repos > bgruening > split_file_on_column

<tool id="tp_split_on_column" name="Split file" version="0.4">
    <description>according to the values of a column</description>
    <requirements>
        <requirement type="package" version="5.0.1">gawk</requirement>
    </requirements>
    <command>
<![CDATA[
    mkdir tmp_out &&
    #if $include_header
        awk -F '\t' 'NR==1{hdr=$0;next}f!="tmp_out/"\$$column".$infile.ext"{if(f) close(f); f="tmp_out/"\$$column".$infile.ext";print hdr>f} {print >> f}' $infile
    #else
        awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' '$infile'
    #end if
]]>
    </command>
    <inputs>
        <param format="tabular" name="infile" type="data" label="File to select" />
        <param name="column" label="on column" type="data_column" data_ref="infile" accept_default="true" />

        <param name="include_header" type="boolean" label="Include the header in all splitted files?"
               help="Include the first line (the assumed header line) in all splitted files." />
    </inputs>
    <outputs>
        <collection name="split_output" type="list" label="Table split on first column">
            <discover_datasets pattern="__name_and_ext__" directory="tmp_out" />
        </collection>
    </outputs>
    <tests>
        <test>
            <param name="infile" value="5cols.tabular" ftype="tabular"/>
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test>
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="true"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="3" />
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="4" />
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test>
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help>
<![CDATA[

**What it does**

This tool splits a file into different smaller files using a specific column.
It will work like the group tool, but every group is saved to its own file.
You have the option to include the header (first line) in all splitted files.
If you have a header and don't want keep it, please remove it before you use this tool.
For example with the "Remove beginning of a file" tool.

-----

**Example**

Splitting a file without header on column 5 from this::

    chr7  56632  56652  cluster 1
    chr7  56736  56756  cluster 1
    chr7  56761  56781  cluster 2
    chr7  56772  56792  cluster 2
    chr7  56775  56795  cluster 2

will produce 2 files with different clusters::

    chr7  56632  56652  cluster 1
    chr7  56736  56756  cluster 1


    chr7  56761  56781  cluster 2
    chr7  56772  56792  cluster 2
    chr7  56775  56795  cluster 2

]]>
    </help>
    <citations>
        <citation type="bibtex">
@misc{githubsplit_file_on_column,
      author = {Gruening, Bjoern},
      year = {2015},
      title = {split_file_on_column},
      publisher = {GitHub},
      journal = {GitHub repository},
      url = {https://github.com/bgruening/galaxytools},
     }
        </citation>
    </citations>
</tool>
author	bgruening
date	Thu, 25 Feb 2021 15:54:13 +0000
parents	b60f2452580e
children	d4b5b70e82cb