Mercurial > repos > bgruening > split_file_on_column
view split_file_on_column.xml @ 6:ff2a81aa3f08 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_on_column commit 6a2deb2f38472a2845123bd54e73b6bd115b3a0b
author | bgruening |
---|---|
date | Tue, 19 Jul 2022 13:25:20 +0000 |
parents | d4b5b70e82cb |
children |
line wrap: on
line source
<tool id="tp_split_on_column" name="Split by group" version="0.6">
    <!--
        Splits a tabular dataset into one file per distinct value of a chosen
        column and returns the files as a list collection.
    -->
    <requirements>
        <requirement type="package" version="5.1.0">gawk</requirement>
    </requirements>
    <!--
        Command overview (Cheetah template, "\$" yields a literal "$" for awk):
          * Every output is written to tmp_out/VALUE.EXT, where VALUE is the
            content of the selected column and EXT is the input's extension.
          * Header branch: the first line is stored in "hdr" and skipped. For
            each following line the target file name is computed; when it
            differs from the previous one the previous file is closed. The
            header is written the first time a target file is seen, then the
            line is appended.
          * No-header branch: every line is appended to its target file.
        The input path is single-quoted in both branches so that the shell
        passes it to awk as one argument.
    -->
    <command><![CDATA[
        mkdir tmp_out &&
        #if $include_header
            awk -F '\t' 'NR==1{hdr=$0;next}f!="tmp_out/"\$$column".$infile.ext"{if(f) close(f); f="tmp_out/"\$$column".$infile.ext"}; {if (!seen[f]++) print hdr>f; print >> f}' '$infile'
        #else
            awk -F '\t' '{print >> "tmp_out/"\$$column".$infile.ext" }' '$infile'
        #end if
    ]]></command>
    <inputs>
        <param format="tabular" name="infile" type="data" label="File to split" />
        <param name="column" label="on column" type="data_column" data_ref="infile" accept_default="true" />
        <param name="include_header" type="boolean" label="Include header in splits?" help="Include the first line (the assumed header line) to all split files." />
    </inputs>
    <outputs>
        <!-- One collection element per file that the command created in tmp_out. -->
        <collection name="split_output" type="list" label="Split by group collection">
            <discover_datasets pattern="__name_and_ext__" directory="tmp_out" />
        </collection>
    </outputs>
    <tests>
        <test><!-- sorted column, no header line in the input -->
            <param name="infile" value="5cols.tabular" ftype="tabular"/>
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- sorted column, header line copied into every split -->
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="true"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="3" />
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="4" />
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- input has a header line, but the header option is switched off -->
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- test with unsorted column, no header -->
            <param name="infile" value="5cols-unsorted.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_n_lines n="3" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_n_lines n="2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test><!-- test with unsorted column, with header -->
            <param name="infile" value="5cols-unsorted-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="true"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_n_lines n="4" />
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_n_lines n="3" />
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
========
Synopsis
========

Given a single input dataset this tool splits the file on unique values from a specified column.

===========
Description
===========

This tool splits a file into a collection based on unique values of a specific column. It performs a grouping operation with every group saved as a separate collection element.
You have the option to include the header (first line) to all splits.
If you have a header and don't want to keep it, please remove it before you use this tool.
For example with the "Remove beginning of a file" tool.

-----

**Example**

Splitting this file on column 1::

    chr1    10    20
    chr1    30    40
    chr2    40    70
    chr4    60    80

will produce a collection with 3 elements::

    chr1    10    20
    chr1    30    40

    chr2    40    70

    chr4    60    80

------

.. image:: $PATH_TO_IMAGES/split_by_group.svg
    :width: 800
    :alt: Split by group

    ]]></help>
    <citations>
        <citation type="bibtex">
            @misc{githubsplit_file_on_column,
                author = {Gruening, Bjoern},
                year = {2015},
                title = {split_file_on_column},
                publisher = {GitHub},
                journal = {GitHub repository},
                url = {https://github.com/bgruening/galaxytools},
            }
        </citation>
    </citations>
</tool>