changeset 0:63706c95c9ed draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scikit_bio commit d46d41c5fec10407bd6b5cb77a11d9b43b82b95e
author iuc
date Fri, 23 Sep 2016 12:17:38 -0400
parents
children 024a9b86f853
files macros.xml scikit_bio_diversity_beta_diversity.py scikit_bio_diversity_beta_diversity.xml test-data/input_abundance_1.tabular test-data/input_tree_1.newick test-data/output_weighted_unifrac_1.tabular
diffstat 6 files changed, 341 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Sep 23 12:17:38 2016 -0400
@@ -0,0 +1,39 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.4.2">scikit-bio</requirement>
+            <yield />
+        </requirements>
+    </xml>
+
+    <xml name="version_command">
+        <version_command><![CDATA[python -c "import skbio;print 'scikit-bio version', skbio.__version__"]]></version_command>
+    </xml>
+
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" />
+            <exit_code range=":-1" />
+        </stdio>
+    </xml>
+    <token name="@VERSION@">0.4.2</token>
+
+    <xml name="params_tree">
+        <param name="input_tree" type="data" format="txt" label="Newick Tree file" optional="True" help="You can provide a file or a string"/>
+        <param name="tree" type="text" value="" label="Newick Tree text" help="You can provide a file or a string">
+            <sanitizer invalid_char="">
+                <valid initial="string.printable"/>
+            </sanitizer>
+        </param>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">@unpublished{scikit-bio:2016,
+      title  = "scikit-bio",
+      author = "Contributors",
+      url    = "http://scikit-bio.org/",
+      year   = "2016 (accessed April 1, 2016)"
+    }</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scikit_bio_diversity_beta_diversity.py	Fri Sep 23 12:17:38 2016 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+# Reports a beta diversity matrix for a tabular input file
+# using scikit-bio
+# Daniel Blankenberg
+
+
+import sys
+import optparse
+import codecs
+from skbio.diversity import beta_diversity
+from skbio import TreeNode
+
+
+__VERSION__ = "0.0.1"  # reported by the --version option
+
+DELIMITER = '\t'  # field separator used to split lines of the abundance file
+
+NEEDS_TREE = [ 'unweighted_unifrac', 'weighted_unifrac' ]  # metrics for which --tree is required
+
+NEEDS_OTU_NAMES = [ 'unweighted_unifrac', 'weighted_unifrac' ]  # metrics that are passed otu_ids
+
+
+def __main__():
+    """Write a beta diversity distance matrix for a tabular abundance file.
+
+    Input rows are OTUs and the selected columns are samples; UniFrac metrics also need --tree.
+    """
+    parser = optparse.OptionParser( usage="%prog [options]" )
+    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
+    parser.add_option( '-i', '--input', dest='input', action='store', type="string", default=None, help='Input abundance Filename' )
+    parser.add_option( '', '--otu_column', dest='otu_column', action='store', type="int", default=None, help='OTU ID Column (1 based)' )
+    parser.add_option( '', '--sample_columns', dest='sample_columns', action='store', type="string", default=None, help='Comma separated list of sample columns, unset to use all.' )
+    parser.add_option( '', '--header', dest='header', action='store_true', default=False, help='Abundance file has a header line' )
+    parser.add_option( '', '--distance_metric', dest='distance_metric', action='store', type="string", default=None, help='Distance metric to use' )
+    parser.add_option( '', '--tree', dest='tree', action='store', type="string", default=None, help='Newick Tree Filename' )
+    parser.add_option( '-o', '--output', dest='output', action='store', type="string", default=None, help='Output Filename' )
+    (options, args) = parser.parse_args()
+    if options.version:
+        print >> sys.stderr, "scikit-bio betadiversity from tabular file", __VERSION__
+        sys.exit()
+
+    # Columns are 1 based on the command line and 0 based internally.
+    otu_column = options.otu_column - 1 if options.otu_column is not None else None
+
+    if options.sample_columns is None:
+        with open( options.input, 'rb' ) as fh:
+            line = fh.readline()
+            columns = range( len( line.split( DELIMITER ) ) )
+            if otu_column in columns:
+                columns.remove( otu_column )
+    else:
+        columns = map( lambda x: int( x ) - 1, options.sample_columns.split( "," ) )
+
+    # otu_column may be None; keep it out of max() rather than rely on None < int ordering.
+    max_col = max( columns + [ otu_column or 0 ] )
+    counts = [ [] for x in columns ]
+    otu_names = []
+    with open( options.input, 'rb' ) as fh:
+        if options.header:
+            header = fh.readline().rstrip('\n\r').split( DELIMITER )
+            sample_names = [ header[i] for i in columns ]
+        else:
+            sample_names = [ "SAMPLE_%i" % x for x in range( len( columns ) ) ]
+        for i, line in enumerate( fh ):
+            fields = line.rstrip('\n\r').split( DELIMITER )
+            if len(fields) <= max_col:
+                print >> sys.stderr, "Bad data line: ", fields
+                continue
+            otu_names.append( fields[ otu_column ] if otu_column is not None else "OTU_%i" % i )
+            for j, col in enumerate( columns ):
+                counts[ j ].append( int( fields[ col ] ) )
+
+    extra_kwds = {}
+    if options.distance_metric in NEEDS_OTU_NAMES:
+        extra_kwds['otu_ids'] = otu_names
+    if options.distance_metric in NEEDS_TREE:
+        if not options.tree:
+            parser.error( "You must provide a newick tree when using '%s'" % options.distance_metric )
+        # NB: TreeNode apparently needs unicode files
+        with codecs.open( options.tree, 'rb', 'utf-8' ) as fh:
+            extra_kwds['tree'] = TreeNode.read( fh )
+
+    bd_dm = beta_diversity( options.distance_metric, counts, ids=sample_names, **extra_kwds )
+    bd_dm.write( options.output )
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scikit_bio_diversity_beta_diversity.xml	Fri Sep 23 12:17:38 2016 -0400
@@ -0,0 +1,123 @@
+<tool id="scikit_bio_diversity_beta_diversity" name="Beta Diversity" version="@VERSION@.0">
+    <description>
+        using scikit-bio
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <command><![CDATA[
+        python ${__tool_directory__}/scikit_bio_diversity_beta_diversity.py
+        --input "${input_abundance}"
+        #if $otu_column:
+            --otu_column "${otu_column}"
+        #end if
+        #if $sample_columns:
+            --sample_columns "${sample_columns}"
+        #end if
+        ${header}
+        --distance_metric "${distance_metric.beta_diversity_method}"
+        #if str( $distance_metric.beta_diversity_method ) in [ 'unweighted_unifrac', 'weighted_unifrac' ]:
+            --tree
+            #if $distance_metric.input_tree:
+                "${distance_metric.input_tree}"
+            #else:
+                "${input_tree_config_file}"
+            #end if
+        #end if
+        --output "${output_beta_diversity}"
+    ]]>
+    </command>
+    <configfiles>
+        <configfile name="input_tree_config_file">#if str( $distance_metric.beta_diversity_method ) in [ 'unweighted_unifrac', 'weighted_unifrac' ] then $distance_metric.tree else ''#</configfile>
+    </configfiles>
+    <inputs>
+        <param name="input_abundance" type="data" format="tabular" label="File with abundance values for community" help="Rows are species/phyla/community classifier; columns are samples"/>
+        <param name="otu_column" label="Group name column" type="data_column" data_ref="input_abundance" value="1" optional="True" help="Species, phylum, etc"/>
+        <param name="sample_columns" label="Select Sample count columns" type="data_column" multiple="True" value="" optional="True" data_ref="input_abundance" help="Leave blank for all"/>
+        <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False" label="Input has a header line"/>
+        <conditional name="distance_metric">
+            <param name="beta_diversity_method" type="select" multiple="False" label="Diversity index to compute">
+                <option value="unweighted_unifrac">unweighted_unifrac</option>
+                <option value="weighted_unifrac" selected="True">weighted_unifrac</option>
+                <option value="euclidean">euclidean</option>
+                <option value="minkowski">minkowski</option>
+                <option value="cityblock">cityblock</option>
+                <option value="seuclidean">seuclidean</option>
+                <option value="sqeuclidean">sqeuclidean</option>
+                <option value="cosine">cosine</option>
+                <option value="correlation">correlation</option>
+                <option value="hamming">hamming</option>
+                <option value="jaccard">jaccard</option>
+                <option value="chebyshev">chebyshev</option>
+                <option value="canberra">canberra</option>
+                <option value="braycurtis">braycurtis</option>
+                <option value="mahalanobis">mahalanobis</option>
+                <option value="yule">yule</option>
+                <option value="matching">matching</option>
+                <option value="dice">dice</option>
+                <option value="kulsinski">kulsinski</option>
+                <option value="rogerstanimoto">rogerstanimoto</option>
+                <option value="russellrao">russellrao</option>
+                <option value="sokalmichener">sokalmichener</option>
+                <option value="sokalsneath">sokalsneath</option>
+                <option value="wminkowski">wminkowski</option>
+            </param>
+            <when value="euclidean"/>
+            <when value="minkowski"/>
+            <when value="cityblock"/>
+            <when value="seuclidean"/>
+            <when value="sqeuclidean"/>
+            <when value="cosine"/>
+            <when value="correlation"/>
+            <when value="hamming"/>
+            <when value="jaccard"/>
+            <when value="chebyshev"/>
+            <when value="canberra"/>
+            <when value="braycurtis"/>
+            <when value="mahalanobis"/>
+            <when value="yule"/>
+            <when value="matching"/>
+            <when value="dice"/>
+            <when value="kulsinski"/>
+            <when value="rogerstanimoto"/>
+            <when value="russellrao"/>
+            <when value="sokalmichener"/>
+            <when value="sokalsneath"/>
+            <when value="wminkowski"/>
+            <when value="unweighted_unifrac">
+                <expand macro="params_tree" />
+            </when>
+            <when value="weighted_unifrac">
+                <expand macro="params_tree" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_beta_diversity" label="${tool.name} on ${on_string} (${distance_metric.beta_diversity_method})"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_abundance" ftype="tabular" value="input_abundance_1.tabular"/>
+            <param name="otu_column" value="1"/>
+            <param name="sample_columns" value="2,3,4"/>
+            <param name="header" value="True"/>
+            <conditional name="distance_metric">
+                <param name="beta_diversity_method" value="weighted_unifrac"/>
+                <param name="input_tree" value="input_tree_1.newick"/>
+                <param name="tree" value=""/>
+            </conditional>
+            <output name="output_beta_diversity" ftype="tabular" file="output_weighted_unifrac_1.tabular" />
+        </test>
+    </tests>
+    <help>
+        <![CDATA[
+        
+Calculates beta diversity using the selected metric.
+
+        ]]>
+    </help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_abundance_1.tabular	Fri Sep 23 12:17:38 2016 -0400
@@ -0,0 +1,86 @@
+#ID	sample_one	sample_two	sample_3
+Crenarchaeota	0	0	1
+Euryarchaeota	0	1	0
+AC1	0	1	2
+AD3	1	1	4
+Acidobacteria	13	14	372
+Actinobacteria	16758	1443	101451
+AncK6	0	0	0
+Aquificae	1	0	12
+Armatimonadetes	4	7	13
+BHI80-139	0	0	8
+BRC1	1	5	9
+Bacteroidetes	5868	270336	13264
+CD12	0	0	0
+Caldiserica	0	0	2
+Caldithrix	0	0	0
+Chlamydiae	1	1	13
+Chlorobi	3	9	11
+Chloroflexi	31	21	463
+Chrysiogenetes	0	0	2
+Cyanobacteria	5	16	123
+Deferribacteres	0	1	1
+EM19	0	0	0
+EM3	0	0	0
+Elusimicrobia	4	4	3
+FBP	0	0	0
+FCPU426	0	0	2
+Fibrobacteres	4	9	24
+Firmicutes	136317	71445	302692
+Fusobacteria	1268	1636	5463
+GAL15	0	0	0
+GN01	0	0	4
+GN02	0	3	48
+GN04	2	6	3
+GOUTA4	0	1	0
+Gemmatimonadetes	1	4	46
+H-178	0	0	0
+Hyd24-12	0	0	0
+KSB3	0	0	11
+Kazan-3B-28	0	0	1
+LCP-89	0	0	0
+LD1	1	1	1
+Lentisphaerae	0	2	12
+MAT-CR-M4-B07	0	0	0
+MVP-21	0	0	0
+MVS-104	0	0	0
+NC10	0	0	0
+NKB19	4	11	17
+NPL-UPA2	0	0	0
+Nitrospirae	2	1	9
+OD1	1	3	19
+OP1	2	2	102
+OP11	0	0	15
+OP3	0	1	8
+OP8	1	0	9
+OP9	1	0	57
+OctSpA1-106	0	0	0
+PAUC34f	0	0	0
+Planctomycetes	16	7	131
+Poribacteria	0	0	0
+Proteobacteria	48361	12121	153808
+SAR406	1	2	7
+SBR1093	0	0	3
+SC4	0	0	2
+SR1	16	4	61
+Spirochaetes	6	11	184
+Synergistetes	2	2	13
+TA06	0	0	0
+TM6	0	2	4
+TM7	76	61	2210
+TPD-58	0	0	0
+Tenericutes	2	3	25
+Thermotogae	1	0	11
+VHS-B3-43	0	0	0
+Verrucomicrobia	55	1240	44
+WPS-2	1	0	0
+WS1	1	0	5
+WS2	0	0	2
+WS3	1	3	0
+WS4	0	0	0
+WS5	0	1	1
+WS6	0	0	1
+WWE1	0	0	7
+ZB3	0	0	2
+[Caldithrix]	3	2	4
+[Thermi]	1	1	22
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_tree_1.newick	Fri Sep 23 12:17:38 2016 -0400
@@ -0,0 +1,1 @@
+((Crenarchaeota:1.00000,Euryarchaeota:1.00000,Nanoarchaeota:1.00000,'[Parvarchaeota]':1.00000)Archaea:1.00000,(AC1:1.00000,AD3:1.00000,Acidobacteria:1.00000,Actinobacteria:1.00000,AncK6:1.00000,Aquificae:1.00000,Armatimonadetes:1.00000,BHI80-139:1.00000,BRC1:1.00000,Bacteroidetes:1.00000,CD12:1.00000,Caldiserica:1.00000,Caldithrix:1.00000,Chlamydiae:1.00000,Chlorobi:1.00000,Chloroflexi:1.00000,Chrysiogenetes:1.00000,Cyanobacteria:1.00000,Deferribacteres:1.00000,Dictyoglomi:1.00000,EM19:1.00000,EM3:1.00000,Elusimicrobia:1.00000,FBP:1.00000,FCPU426:1.00000,Fibrobacteres:1.00000,Firmicutes:1.00000,Fusobacteria:1.00000,GAL15:1.00000,GN01:1.00000,GN02:1.00000,GN04:1.00000,GOUTA4:1.00000,Gemmatimonadetes:1.00000,H-178:1.00000,Hyd24-12:1.00000,KSB3:1.00000,Kazan-3B-28:1.00000,LCP-89:1.00000,LD1:1.00000,Lentisphaerae:1.00000,MAT-CR-M4-B07:1.00000,MVP-21:1.00000,MVS-104:1.00000,NC10:1.00000,NKB19:1.00000,NPL-UPA2:1.00000,Nitrospirae:1.00000,OC31:1.00000,OD1:1.00000,OP1:1.00000,OP11:1.00000,OP3:1.00000,OP8:1.00000,OP9:1.00000,OctSpA1-106:1.00000,PAUC34f:1.00000,Planctomycetes:1.00000,Poribacteria:1.00000,Proteobacteria:1.00000,SAR406:1.00000,SBR1093:1.00000,SC4:1.00000,SR1:1.00000,Spirochaetes:1.00000,Synergistetes:1.00000,TA06:1.00000,TM6:1.00000,TM7:1.00000,TPD-58:1.00000,Tenericutes:1.00000,Thermotogae:1.00000,VHS-B3-43:1.00000,Verrucomicrobia:1.00000,WPS-2:1.00000,WS1:1.00000,WS2:1.00000,WS3:1.00000,WS4:1.00000,WS5:1.00000,WS6:1.00000,WWE1:1.00000,ZB3:1.00000,'[Caldithrix]':1.00000,'[Thermi]':1.00000)Bacteria:1.00000)root:1.00000;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_weighted_unifrac_1.tabular	Fri Sep 23 12:17:38 2016 -0400
@@ -0,0 +1,4 @@
+	sample_one	sample_two	sample_3
+sample_one	0.0	1.45881907807	0.274219368588
+sample_two	1.45881907807	0.0	1.46956460092
+sample_3	0.274219368588	1.46956460092	0.0