Mercurial > repos > bgruening > split_file_on_column

Binary file split_file_on_column.tar.gz has changed
--- a/split_file_on_column.xml	Wed Dec 23 03:50:48 2015 -0500
+++ b/split_file_on_column.xml	Thu Feb 25 15:54:13 2021 +0000
@@ -1,17 +1,24 @@
-<tool id="tp_split_on_column" name="Split file" version="0.2">
+<tool id="tp_split_on_column" name="Split file" version="0.4">
     <description>according to the values of a column</description>
     <requirements>
-        <requirement type="package" version="4.1.0">gnu_awk</requirement>
+        <requirement type="package" version="5.0.1">gawk</requirement>
     </requirements>
     <command>
 <![CDATA[
-        mkdir tmp_out &&
-        awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' $infile
+    mkdir tmp_out &&
+    #if $include_header
+        awk -F '\t' 'NR==1{hdr=$0;next} {o="tmp_out/"\$$column".$infile.ext"} o!=f{if(f) close(f); f=o; if(!(f in seen)){seen[f]=1; print hdr >> f}} {print >> f}' '$infile'
+    #else
+        awk -F'\t' '{print > "tmp_out/"\$$column".$infile.ext" }' '$infile'
+    #end if
 ]]>
     </command>
     <inputs>
         <param format="tabular" name="infile" type="data" label="File to select" />
         <param name="column" label="on column" type="data_column" data_ref="infile" accept_default="true" />
+
+        <param name="include_header" type="boolean" label="Include the header in all split files?"
+               help="Include the first line (the assumed header line) in all split files." />
     </inputs>
     <outputs>
         <collection name="split_output" type="list" label="Table split on first column">
@@ -22,6 +29,45 @@
         <test>
             <param name="infile" value="5cols.tabular" ftype="tabular"/>
             <param name="column" value="5" />
+            <param name="include_header" value="false"/>
+            <output_collection name="split_output" type="list">
+                <element name="1">
+                    <assert_contents>
+                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
+                    </assert_contents>
+                </element>
+                <element name="2">
+                    <assert_contents>
+                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
+            <param name="column" value="5" />
+            <param name="include_header" value="true"/>
+            <output_collection name="split_output" type="list">
+                <element name="1">
+                    <assert_contents>
+                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
+                        <has_n_lines n="3" />
+                        <has_text_matching expression="chr7\t56632\t56652\tcluster\t1" />
+                    </assert_contents>
+                </element>
+                <element name="2">
+                    <assert_contents>
+                        <has_line_matching expression="Column1\tColumn2\tColumn3\tColumn4\tColumn5" />
+                        <has_n_lines n="4" />
+                        <has_text_matching expression="chr7\t56761\t56781\tcluster\t2" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="infile" value="5cols-with-header.tabular" ftype="tabular" />
+            <param name="column" value="5" />
+            <param name="include_header" value="false"/>
             <output_collection name="split_output" type="list">
                 <element name="1">
                     <assert_contents>
@@ -43,12 +89,15 @@

 This tool splits a file into different smaller files using a specific column.
 It will work like the group tool, but every group is saved to its own file.
+You have the option to include the header (first line) in all split files.
+If you have a header and don't want to keep it, please remove it before you use this tool,
+for example with the "Remove beginning of a file" tool.

 -----

 **Example**

-Splitting on column 5 from this::
+Splitting a file without header on column 5 from this::

     chr7  56632  56652  cluster 1
     chr7  56736  56756  cluster 1
@@ -66,7 +115,19 @@
     chr7  56772  56792  cluster 2
     chr7  56775  56795  cluster 2

-
 ]]>
     </help>
+    <citations>
+        <citation type="bibtex">
+@misc{githubsplit_file_on_column,
+      author = {Gruening, Bjoern},
+      year = {2015},
+      title = {split_file_on_column},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      url = {https://github.com/bgruening/galaxytools},
+     }
+        </citation>
+    </citations>
 </tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/5cols-with-header.tabular	Thu Feb 25 15:54:13 2021 +0000
@@ -0,0 +1,6 @@
+Column1	Column2	Column3	Column4	Column5
+chr7	56632	56652	cluster	1
+chr7	56736	56756	cluster	1
+chr7	56761	56781	cluster	2
+chr7	56772	56792	cluster	2
+chr7	56775	56795	cluster	2
--- a/test-data/5cols.tabular	Wed Dec 23 03:50:48 2015 -0500
+++ b/test-data/5cols.tabular	Thu Feb 25 15:54:13 2021 +0000
@@ -1,5 +1,5 @@
 chr7	56632	56652	cluster	1
-chr7	56736	56756	cluster	1
+chr7	56736	56756	cluster 1
 chr7	56761	56781	cluster	2
 chr7	56772	56792	cluster	2
 chr7	56775	56795	cluster	2
--- a/tool_dependencies.xml	Wed Dec 23 03:50:48 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="gnu_awk" version="4.1.0">
-        <repository changeset_revision="f145f856ec57" name="package_gnu_awk_4_1_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>