Mercurial > repos > bgruening > split_file_on_column
view split_file_on_column.xml @ 4:37a53100b67e draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_on_column commit 4d0bfcf37bfbedafc7ff0672dfe452766ca8a606"
author | bgruening |
---|---|
date | Thu, 25 Feb 2021 15:54:13 +0000 |
parents | b60f2452580e |
children | d4b5b70e82cb |
line wrap: on
line source
<!--
    Galaxy tool wrapper: split a tabular dataset into one file per distinct
    value found in a user-selected column. Every file written to tmp_out/
    is collected into a list collection; the element name is the column
    value and the datatype is taken from the input dataset's extension.

    Version 0.5: the header branch no longer loses rows when the input is
    not sorted by the split column, and the input path is quoted in both
    branches.
-->
<tool id="tp_split_on_column" name="Split file" version="0.5">
    <description>according to the values of a column</description>
    <requirements>
        <requirement type="package" version="5.0.1">gawk</requirement>
    </requirements>
    <!--
        Cheetah notes for the command below:
          * "\$$column" renders as a literal dollar sign followed by the
            selected column number, i.e. the awk field reference.
          * "$infile.ext" renders as the input dataset's extension and is
            used as the suffix of every output file.

        Header branch, rule by rule:
          1. The first line is stored as the header and skipped.
          2. The target file name for the current row is computed.
          3. When the target differs from the file currently being written,
             the previous file is closed (so only one output file handle is
             held at a time) and the header is written only if this target
             has not been seen before.
          4. The row is appended. Append mode is used for every write so
             that reopening a previously closed file never truncates it.

        Each awk program is kept on a single line on purpose, so that it
        does not depend on how line breaks in the command are handled.
    -->
    <command>
<![CDATA[
        mkdir tmp_out &&
        #if $include_header
            awk -F '\t' 'NR==1{hdr=$0;next} {cur="tmp_out/"\$$column".$infile.ext"} cur!=f{if(f) close(f); f=cur; if(!(f in seen)){seen[f]=1; print hdr >> f}} {print >> f}' '$infile'
        #else
            awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' '$infile'
        #end if
]]>
    </command>
    <inputs>
        <param format="tabular" name="infile" type="data" label="File to select" />
        <param name="column" label="on column" type="data_column" data_ref="infile" accept_default="true" />
        <param name="include_header" type="boolean" label="Include the header in all split files?"
            help="Include the first line (the assumed header line) in all split files." />
    </inputs>
    <outputs>
        <!-- Every file in tmp_out/ named "<value>.<ext>" becomes one element of the list. -->
        <collection name="split_output" type="list" label="Table split on selected column">
            <discover_datasets pattern="__name_and_ext__" directory="tmp_out" />
        </collection>
    </outputs>
    <tests>
        <!-- No header in the file, header option off. -->
        <test>
            <param name="infile" value="5cols.tabular" ftype="tabular"/>
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Header present and copied into every split file. -->
        <test>
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="true"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="3" />
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
                        <has_n_lines n="4" />
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Header present in the file, header option off. -->
        <test>
            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
            <param name="column" value="5" />
            <param name="include_header" value="false"/>
            <output_collection name="split_output" type="list">
                <element name="1">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
                    </assert_contents>
                </element>
                <element name="2">
                    <assert_contents>
                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

This tool splits a file into different smaller files using a specific column.
It will work like the group tool, but every group is saved to its own file.
Rows do not need to be sorted by the chosen column.

You have the option to include the header (first line) in all split files.
If you have a header and don't want to keep it, please remove it before you use this tool,
for example with the "Remove beginning of a file" tool.

-----

**Example**

Splitting a file without header on column 5 from this::

    chr7    56632   56652   cluster 1
    chr7    56736   56756   cluster 1
    chr7    56761   56781   cluster 2
    chr7    56772   56792   cluster 2
    chr7    56775   56795   cluster 2

will produce 2 files with different clusters::

    chr7    56632   56652   cluster 1
    chr7    56736   56756   cluster 1


    chr7    56761   56781   cluster 2
    chr7    56772   56792   cluster 2
    chr7    56775   56795   cluster 2

]]>
    </help>
    <citations>
        <citation type="bibtex">
            @misc{githubsplit_file_on_column,
                author = {Gruening, Bjoern},
                year = {2015},
                title = {split_file_on_column},
                publisher = {GitHub},
                journal = {GitHub repository},
                url = {https://github.com/bgruening/galaxytools},
            }
        </citation>
    </citations>
</tool>