Mercurial > repos > xuebing > sharplabtool
diff tools/stats/grouping.xml @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/stats/grouping.xml Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,123 @@ +<tool id="Grouping1" name="Group" version="2.0.0"> + <description>data by a column and perform aggregate operation on other columns.</description> + <command interpreter="python"> + grouping.py + $out_file1 + $input1 + $groupcol + $ignorecase + #for $op in $operations + '${op.optype} + ${op.opcol} + ${op.opround}' + #end for + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/> + <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" /> + <param name="ignorecase" type="boolean" truevalue="1" falsevalue="0"> + <label>Ignore case while grouping?</label> + </param> + <repeat name="operations" title="Operation"> + <param name="optype" type="select" label="Type"> + <option value="mean">Mean</option> + <option value="median">Median</option> + <option value="mode">Mode</option> + <option value="max">Maximum</option> + <option value="min">Minimum</option> + <option value="sum">Sum</option> + <option value="length">Count</option> + <option value="unique">Count Distinct</option> + <option value="cat">Concatenate</option> + <option value="cat_uniq">Concatenate Distinct</option> + <option value="random">Randomly pick</option> + <option value="std">Standard deviation</option> + </param> + <param name="opcol" label="On column" type="data_column" data_ref="input1" /> + <param name="opround" type="select" label="Round result to nearest integer?"> + <option value="no">NO</option> + <option value="yes">YES</option> + </param> + </repeat> + </inputs> + <outputs> + <data format="tabular" name="out_file1" /> + </outputs> + <requirements> + <requirement type="python-module">numpy</requirement> + </requirements> + <tests> + <!-- Test valid data --> + <test> + <param name="input1" value="1.bed"/> + <param name="groupcol" value="1"/> + <param name="ignorecase" value="true"/> + <param name="optype" value="mean"/> + <param name="opcol" value="2"/> + <param name="opround" value="no"/> + <output name="out_file1" file="groupby_out1.dat"/> + </test> + <!-- Long case but test framework doesn't allow yet + <test> + <param name="input1" value="1.bed"/> + <param name="groupcol" value="1"/> + <param name="ignorecase" value="false"/> + <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/> + <output name="out_file1" file="groupby_out3.tabular"/> + </test> + --> + <!-- Test data with an invalid value in a column. Can't do it because test framework doesn't allow testing of errors + <test> + <param name="input1" value="1.tabular"/> + <param name="groupcol" value="1"/> + <param name="ignorecase" value="true"/> + <param name="optype" value="mean"/> + <param name="opcol" value="2"/> + <param name="opround" value="no"/> + <output name="out_file1" file="groupby_out2.dat"/> + </test> + --> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +This tool allows you to group the input dataset by a particular column and perform aggregate functions: Mean, Median, Mode, Sum, Max, Min, Count, Concatenate, and Randomly pick on any column(s). + +The Concatenate function will take, for each group, each item in the specified column and build a comma delimited list. Concatenate Unique will do the same but will build a list of unique items with no repetition. + +Count and Count Unique are equivalent to Concatenate and Concatenate Unique, but will only count the number of items and will return an integer. + +- If multiple modes are present, all are reported. + +----- + +**Example** + +- For the following input:: + + chr22 1000 1003 TTT + chr22 2000 2003 aaa + chr10 2200 2203 TTT + chr10 1200 1203 ttt + chr22 1600 1603 AAA + +- **Grouping on column 4** while ignoring case, and performing operation **Count on column 1** will return:: + + AAA 2 + TTT 3 + +- **Grouping on column 4** while not ignoring case, and performing operation **Count on column 1** will return:: + + aaa 1 + AAA 1 + ttt 1 + TTT 2 + </help> +</tool>