Mercurial > repos > crs4 > hadoop_galaxy

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cat_paths.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,62 @@
+<tool id="hadoop_galaxy_cat_paths" name="Cat paths" version="0.1.0">
+  <description>Concatenate all components of a pathset into a single file.</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+  <!-- Runs dist_cat_paths or cat_paths depending on use_hadoop; dataset paths are quoted for the shell. -->
+  <command>
+    #if $use_hadoop
+      dist_cat_paths
+    #else
+      cat_paths
+    #end if
+    #if $delete_source
+      --delete-source
+    #end if
+    "$input_pathset" "$output_path"
+  </command>
+
+  <inputs>
+    <param name="input_pathset" type="data" format="pathset" label="Input pathset">
+      <validator type="empty_field" />
+    </param>
+    <param name="delete_source" type="boolean" checked="false" label="Delete remote input data"
+        help="This option makes the tool move the data rather than copy it" />
+    <param name="use_hadoop" type="boolean" checked="false" label="Use Hadoop-based program"
+        help="The Galaxy workspace must be accessible by the Hadoop cluster (see help for details)" />
+  </inputs>
+
+  <outputs>
+      <!-- TODO: can we read the format from input pathset and transfer it to output? -->
+      <data name="output_path" format="data" label="Concatenated dataset $input_pathset.name" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Datasets represented as pathsets can be split in a number of files.
+This tool takes all of them and concatenates them into a single output file.
+
+In your workflow, you'll need to explicitly set the appropriate data format on the
+output dataset with an Action to "Change Datatype".
+
+"Delete remote input data" option
+====================================
+With this option, after the data has been concatenated into the new Galaxy dataset,
+the original files that were referenced by the pathset are deleted.  This effectively
+tells the action to "move" the data instead of "copying" it and helps
+avoid amassing intermediate data in your Hadoop workspace.
+
+
+"Use Hadoop-based program" option
+====================================
+
+With this option you will use your entire Hadoop cluster to simultaneously write
+multiple parts of the final file.  For this to be possible, the Hadoop nodes
+must be able to access the Galaxy file space directly.  In addition, to achieve
+reasonable results the Galaxy workspace should be on a parallel shared file system.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<datatypes>
+  <registration> <!-- "pathset" is registered as a subclass of Galaxy's plain Text datatype. -->
+    <datatype extension="pathset" type="galaxy.datatypes.data:Text" mimetype="text/plain" subclass="true" display_in_upload="true" />
+  </registration>
+</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dist_text_zipper.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,30 @@
+<tool id="hadoop_galaxy_dist_text_zipper" name="Dist TextZipper" version="0.1.0">
+  <description>Compress lots of text files on Hadoop</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+  <!-- Invokes hadoop_galaxy with dist_text_zipper as its executable argument; dataset paths are quoted for the shell. -->
+  <command>
+    hadoop_galaxy
+    --input "$input_data"
+    --output "$output"
+    --executable dist_text_zipper
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Source data set"/>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="pathset" label="Zipped $input_data.name" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+This is a Pydoop-based distributed text file compression program.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_pathset.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,58 @@
+<tool id="hadoop_galaxy_make_pathset" name="Make Pathset" version="0.1.0">
+  <description>Create a pathset for a set of files</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+  <!-- Parameter tests go through str() so they compare plain values rather than Galaxy wrapper objects. -->
+  <command>
+    make_pathset
+    #if str($paths.source) == 'tool_input'
+      --force-local --data-format "$paths.datapaths.ext" "$output_path" "$paths.datapaths"
+    #elif str($paths.source) == 'text_box'
+      #if str($paths.filesystem_select) == 'local_fs'
+        --force-local
+      #end if
+      #if str($paths.data_format).strip() != ''
+        --data-format "$paths.data_format"
+      #end if
+      "$output_path" "$paths.datapaths"
+    #else
+      #raise ValueError("BUG!! unknown paths.source value")
+    #end if
+  </command>
+
+  <inputs>
+    <conditional name="paths">
+      <param name="source" type="select" label="Path source">
+        <option value="tool_input">Dataset from your history</option>
+        <option value="text_box">User input</option>
+      </param>
+      <when value="tool_input">
+        <param name="datapaths" type="data" label="A dataset in any format" />
+      </when>
+      <when value="text_box">
+        <param name="filesystem_select" type="select" label="File system type">
+          <option value="default_fs">Default</option>
+          <option value="local_fs">Local FS</option>
+        </param>
+        <param name="datapaths" type="text" label="Paths or URIs" size="60">
+          <validator type="empty_field" />
+        </param>
+        <param name="data_format" type="text" label="Extension representing data format" size="20" />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output_path" format="pathset" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Create a pathset for a set of files to be used as input for Hadoop tools.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/put_dataset.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,43 @@
+<tool id="hadoop_galaxy_put_dataset" name="Put dataset" version="0.1.0">
+  <description>Copy data from Galaxy storage to Hadoop storage.</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+  <!-- The workspace test uses str(): comparing the raw parameter wrapper to "" is not reliably a string comparison. -->
+  <command>
+    put_dataset
+    #if str($workspace).strip() != ""
+      --hadoop-workspace "$workspace"
+    #end if
+    #if $use_distcp
+      --distcp
+    #end if
+    "$input_pathset" "$output_path"
+  </command>
+
+  <inputs>
+    <param name="input_pathset" type="data" format="pathset" label="Galaxy pathset" />
+
+    <param name="workspace" type="text" label="Path to workspace for Hadoop data"
+       help="The data will be copied to a new directory under this path. The value can also be set through the HADOOP_GALAXY_PUT_DIR environment variable." />
+
+    <param name="use_distcp" type="boolean" checked="false" label="Use Hadoop distcp2"
+       help="Use distcp2 if Hadoop can access Galaxy's storage space and you're copying a large dataset." />
+  </inputs>
+
+  <outputs>
+    <data name="output_path" format="pathset" label="Hadoop pathset from $input_pathset.name" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+This tool copies data from Galaxy's storage to storage that is suitable for
+Hadoop jobs.  An example of a use case may be to copy data from the Galaxy server
+to HDFS. Whether this tool is required depends on your specific local setup.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_pathset.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,60 @@
+<tool id="hadoop_galaxy_split_pathset" name="Split pathset" version="0.1.0">
+  <description>Split a pathset according to a regular expression criteria</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+  <!-- The expression is single-quoted so the shell does not expand it; dataset paths are quoted as well. -->
+  <command>
+      split_pathset '$criteria_expr'
+      #if $anchor_end
+          --anchor-end
+      #end if
+      --expand-levels $expand_levels
+      "$input_pathset" "$output_true" "$output_false"
+  </command>
+
+  <inputs>
+    <param name="criteria_expr" type="text" label="Regular expression criteria">
+      <validator type="empty_field" />
+    </param>
+    <param name="anchor_end" type="boolean"
+      checked="false"
+      label="Anchor expression at the end of the string (like $)"
+      />
+    <param name="expand_levels" type="integer"
+      value="0"
+      label="Expand paths by at least this many levels before applying criteria"
+      />
+    <param name="input_pathset" type="data" format="pathset" label="Input pathset" />
+    <param name="match_name" type="text" value="match" label="Name of dataset matching criteria">
+      <validator type="empty_field" />
+    </param>
+    <param name="no_match_name" type="text" value="no_match" label="Name of dataset not matching criteria">
+      <validator type="empty_field" />
+    </param>
+  </inputs>
+
+  <outputs>
+    <data name="output_true" format="pathset" label="$match_name" />
+    <data name="output_false" format="pathset" label="$no_match_name" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Splits a pathset according to a regular expression.
+
+You can have the tool expand the paths in the pathset by a certain number
+of levels prior to testing whether it matches the regular expression.
+
+
+**Note**: you can't use '$' in your regular expression. To anchor the
+expression to the end of the path use the checkbox.
+
+
+**Note**: the regular expression must match the path from its beginning.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_conf.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<toolbox> <!-- Groups the five Hadoop-Galaxy tools under a single tool panel section. -->
+  <section id="hadoop_galaxy" name="Hadoop-Galaxy">
+    <tool file="hadoop_galaxy/make_pathset.xml"/>
+    <tool file="hadoop_galaxy/put_dataset.xml"/>
+    <tool file="hadoop_galaxy/cat_paths.xml"/>
+    <tool file="hadoop_galaxy/split_pathset.xml"/>
+    <tool file="hadoop_galaxy/dist_text_zipper.xml"/>
+  </section>
+</toolbox>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool_dependency>
+  <package name="pydoop" version="0.11">
+    <repository changeset_revision="2055590ae81c" name="package_pydoop_0_11" owner="crs4" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="hadoop-galaxy" version="0.1.1">
+    <install version="1.0">
+      <actions>
+        <action type="shell_command">git clone https://github.com/crs4/hadoop-galaxy/</action>
+        <action type="shell_command">git reset --hard 0.1.1</action>
+        <action type="set_environment_for_install">
+          <repository changeset_revision="2055590ae81c" name="package_pydoop_0_11" owner="crs4" toolshed="http://toolshed.g2.bx.psu.edu">
+            <package name="pydoop" version="0.11" />
+          </repository>
+        </action>
+        <action type="make_directory">$INSTALL_DIR/lib/python</action>
+        <action type="shell_command">export PYTHONPATH=$INSTALL_DIR/lib/python:$PYTHONPATH &amp;&amp; python setup.py install --prefix=$INSTALL_DIR --install-lib=$INSTALL_DIR/lib/python</action>
+        <action type="set_environment">
+          <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
+          <environment_variable action="prepend_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable>
+        </action>
+      </actions>
+    </install>
+    <readme>Installs hadoop-galaxy 0.1.1 from the 0.1.1 tag of https://github.com/crs4/hadoop-galaxy. The build runs
+    git and python from the PATH, and needs the pydoop 0.11 package, which is installed beforehand.</readme>
+  </package>
+</tool_dependency>