Mercurial > repos > bgruening > split_file_on_column

<tool id="tp_split_on_column" name="Split by group" version="0.6">
    <requirements>
        <!-- The command section runs awk; this pins the gawk package at 5.1.0 (presumably the awk found on PATH resolves to it; verify in the tool environment). -->
        <requirement type="package" version="5.1.0">gawk</requirement>
    </requirements>
    <!--
        Split the input into one file per distinct value of the selected column.
        Each group is written to tmp_out/[column value].[input extension]; tmp_out
        is the directory the split_output collection is discovered from.

        Header branch: the first line is held back in "hdr" and written once at
        the top of every group file. The "seen" array makes sure the header is
        written only the first time a file is opened, so groups that reappear
        later in unsorted input are appended to without repeating the header.
        The previously open file is closed whenever the group changes.

        The redirection target in the no-header branch is parenthesised because
        an unparenthesised concatenation after a redirection is ambiguous in awk.
        The input path is single-quoted in both branches so that the shell
        passes it to awk as one argument.
        Each awk program is kept on a single line, with statements separated by
        semicolons.
    -->
    <command>
<![CDATA[
    mkdir tmp_out &&
    #if $include_header
        awk -F '\t' 'NR==1{hdr=$0;next}{out="tmp_out/"\$$column".$infile.ext"; if(f!=out){if(f) close(f); f=out}; if(!seen[f]++) print hdr > f; print >> f}' '$infile'
    #else
        awk -F '\t' '{print >> ("tmp_out/"\$$column".$infile.ext")}' '$infile'
    #end if
]]>
    </command>
    <inputs>
        <!-- Tabular dataset to be split; its path and extension are used by the command. -->
        <param format="tabular" name="infile" type="data" label="File to split" />
        <!-- Column of infile whose values define the groups; the command inserts it as the awk field number. -->
        <param name="column" label="on column" type="data_column" data_ref="infile" accept_default="true" />

        <!-- When true, the command takes its header-aware branch and writes the first input line at the top of every output file. -->
        <param name="include_header" type="boolean" label="Include header in splits?"
               help="Include the first line (the assumed header line) to all split files." />
    </inputs>
    <outputs>
        <!-- List collection built from the files the command writes into tmp_out. The __name_and_ext__ pattern presumably derives the element name and datatype from "[value].[ext]" file names; confirm against the Galaxy discover_datasets documentation. -->
        <collection name="split_output" type="list" label="Split by group collection">
            <discover_datasets pattern="__name_and_ext__" directory="tmp_out" />
        </collection>
    </outputs>
    <tests>
        <test><!-- no header: splitting 5cols.tabular on column 5 must yield elements "1" and "2" containing the expected rows -->
            <param name="infile" value="5cols.tabular" ftype="tabular"/>
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- header included: each element must contain the header line plus its data rows (3 lines in element "1", 4 lines in element "2") -->
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="true"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="3" />
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="4" />
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- input file with a header line but include_header switched off: only checks that the data rows reach elements "1" and "2" -->
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- test with unsorted column, no header: element "1" must hold 3 lines and element "2" must hold 2 lines -->
            <param name="infile" value="5cols-unsorted.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_n_lines n="3" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_n_lines n="2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- test with unsorted column, with header: each element must contain the header line; element "1" must hold 4 lines and element "2" must hold 3 lines -->
            <param name="infile" value="5cols-unsorted-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="true"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_n_lines n="4" />
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />

                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_n_lines n="3" />
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help>
<![CDATA[

========
Synopsis
========

Given a single input dataset this tool splits the file on unique values from a specified column.

===========
Description
===========

This tool splits a file into a collection based on unique values of a specific column.
It performs a grouping operation with every group saved as a separate collection element.
You have the option to include the header (first line) in all splits.
If you have a header and don't want to keep it, please remove it before you use this tool,
for example with the "Remove beginning of a file" tool.

-----

**Example**

Splitting this file on column 1::

    chr1 10 20
    chr1 30 40
    chr2 40 70
    chr4 60 80


will produce a collection with 3 elements::

    chr1 10 20
    chr1 30 40

    chr2 40 70

    chr4 60 80

------

.. image:: $PATH_TO_IMAGES/split_by_group.svg
  :width: 800
  :alt: Split by group

]]>
    </help>
    <citations>
        <!-- BibTeX entry for the tool; its url field points at the galaxytools repository on GitHub. -->
        <citation type="bibtex">
@misc{githubsplit_file_on_column,
      author = {Gruening, Bjoern},
      year = {2015},
      title = {split_file_on_column},
      publisher = {GitHub},
      journal = {GitHub repository},
      url = {https://github.com/bgruening/galaxytools},
     }
        </citation>
    </citations>
</tool>
author	bgruening
date	Tue, 19 Jul 2022 13:25:20 +0000
parents	d4b5b70e82cb
children