Mercurial > repos > crs4 > hadoop_galaxy
changeset 1:30bd2584b6a0 draft default tip
Uploaded
author | crs4 |
---|---|
date | Wed, 15 Oct 2014 09:39:16 -0400 |
parents | 7698311d4466 |
children | |
files | cat_paths.xml datatypes_conf.xml dist_text_zipper.xml hadoop_galaxy-13348e73/cat_paths.xml hadoop_galaxy-13348e73/datatypes_conf.xml hadoop_galaxy-13348e73/dist_text_zipper.xml hadoop_galaxy-13348e73/make_pathset.xml hadoop_galaxy-13348e73/put_dataset.xml hadoop_galaxy-13348e73/split_pathset.xml hadoop_galaxy-13348e73/tool_dependencies.xml make_pathset.xml put_dataset.xml split_pathset.xml tool_conf.xml tool_dependencies.xml |
diffstat | 15 files changed, 286 insertions(+), 296 deletions(-) [+] |
line wrap: on
line diff
--- a/cat_paths.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -<tool id="hadoop_galaxy_cat_paths" name="Cat paths" version="0.1.0"> - <description>Concatenate all components of a pathset into a single file.</description> - <requirements> - <requirement type="package" version="0.11">pydoop</requirement> - <requirement type="package" version="0.1.1">hadoop-galaxy</requirement> - </requirements> - - <command> - #if $use_hadoop - dist_cat_paths - #else - cat_paths - #end if - #if $delete_source - --delete-source - #end if - $input_pathset $output_path - </command> - - <inputs> - <param name="input_pathset" type="data" format="pathset" label="Input pathset"> - <validator type="empty_field" /> - </param> - <param name="delete_source" type="boolean" checked="false" label="Delete remote input data" - help="This option makes the tool move the data rather than copy it" /> - <param name="use_hadoop" type="boolean" checked="false" label="Use Hadoop-based program" - help="The Galaxy workspace must be accessible by the Hadoop cluster (see help for details)" /> - </inputs> - - <outputs> - <!-- TODO: can we read the format from input pathset and transfer it to output? --> - <data name="output_path" format="data" label="Concatenated dataset $input_pathset.name" /> - </outputs> - - <stdio> - <exit_code range="1:" level="fatal" /> - </stdio> - - <help> -Datasets represented as pathsets can be split in a number of files. -This tool takes all of them and concatenates them into a single output file. - -In your workflow, you'll need to explicitly set the appropriate data format on the -output dataset with an Action to "Change Datatype". - -"Delete remote input data" option -==================================== -With this option, after the data has been concated into the new Galaxy dataset, -the original files that were referenced by the pathset are deleted. 
This effectively -tells the action to "move" the data instead of a "copying" it and helps -avoid amassing intermediate data in your Hadoop workspace. - - -"Use Hadoop-based program" option -==================================== - -With this option you will use your entire Hadoop cluster to simultaneously write -multiple parts of the final file. For this to be possible, the Hadoop nodes -must be able to access the Galaxy file space directly. In addition, to achieve -reasonable results the Galaxy workspace should on a parallel shared file system. - </help> -</tool>
--- a/datatypes_conf.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<datatypes> - <registration> - <datatype extension="pathset" type="galaxy.datatypes.data:Text" mimetype="text/plain" subclass="True" display_in_upload="true" /> - </registration> -</datatypes>
--- a/dist_text_zipper.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -<tool id="hadoop_galaxy_dist_text_zipper" name="Dist TextZipper" version="0.1.0"> - <description>Compress lots of text files on Hadoop</description> - <requirements> - <requirement type="package" version="0.11">pydoop</requirement> - <requirement type="package" version="0.1.1">hadoop-galaxy</requirement> - </requirements> - - <command> - hadoop_galaxy - --input $input_data - --output $output - --executable dist_text_zipper - </command> - - <inputs> - <param name="input_data" type="data" format="pathset" label="Source data set"/> - </inputs> - - <outputs> - <data name="output" format="pathset" label="Zipped $input_data.name" /> - </outputs> - - <stdio> - <exit_code range="1:" level="fatal" /> - </stdio> - - <help> -This is a Pydoop-based distributed text file compression program. - </help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/cat_paths.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,62 @@ +<tool id="hadoop_galaxy_cat_paths" name="Cat paths" version="0.1.4"> + <description>Concatenate all components of a pathset into a single file.</description> + <requirements> + <requirement type="package" version="0.11">pydoop</requirement> + <requirement type="package" version="0.1.4">hadoop-galaxy</requirement> + </requirements> + + <command> + #if $use_hadoop + dist_cat_paths + #else + cat_paths + #end if + #if $delete_source + --delete-source + #end if + $input_pathset $output_path + </command> + + <inputs> + <param name="input_pathset" type="data" format="pathset" label="Input pathset"> + <validator type="empty_field" /> + </param> + <param name="delete_source" type="boolean" checked="false" label="Delete remote input data" + help="This option makes the tool move the data rather than copy it" /> + <param name="use_hadoop" type="boolean" checked="false" label="Use Hadoop-based program" + help="The Galaxy workspace must be accessible by the Hadoop cluster (see help for details)" /> + </inputs> + + <outputs> + <!-- TODO: can we read the format from input pathset and transfer it to output? --> + <data name="output_path" format="data" label="Concatenated dataset $input_pathset.name" /> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> +Datasets represented as pathsets can be split in a number of files. +This tool takes all of them and concatenates them into a single output file. + +In your workflow, you'll need to explicitly set the appropriate data format on the +output dataset with an Action to "Change Datatype". + +"Delete remote input data" option +==================================== +With this option, after the data has been concated into the new Galaxy dataset, +the original files that were referenced by the pathset are deleted. 
This effectively +tells the action to "move" the data instead of "copying" it and helps +avoid amassing intermediate data in your Hadoop workspace. + + +"Use Hadoop-based program" option +==================================== + +With this option you will use your entire Hadoop cluster to simultaneously write +multiple parts of the final file. For this to be possible, the Hadoop nodes +must be able to access the Galaxy file space directly. In addition, to achieve +reasonable results the Galaxy workspace should be on a parallel shared file system. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/datatypes_conf.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<datatypes> + <registration> + <datatype extension="pathset" type="galaxy.datatypes.data:Text" mimetype="text/plain" subclass="True" display_in_upload="true" /> + </registration> +</datatypes>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/dist_text_zipper.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,30 @@ +<tool id="hadoop_galaxy_dist_text_zipper" name="Dist TextZipper" version="0.1.4"> + <description>Compress lots of text files on Hadoop</description> + <requirements> + <requirement type="package" version="0.11">pydoop</requirement> + <requirement type="package" version="0.1.4">hadoop-galaxy</requirement> + </requirements> + + <command> + hadoop_galaxy + --input $input_data + --output $output + --executable dist_text_zipper + </command> + + <inputs> + <param name="input_data" type="data" format="pathset" label="Source data set"/> + </inputs> + + <outputs> + <data name="output" format="pathset" label="Zipped $input_data.name" /> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> +This is a Pydoop-based distributed text file compression program. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/make_pathset.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,58 @@ +<tool id="hadoop_galaxy_make_pathset" name="Make Pathset" version="0.1.4"> + <description>Create a pathset for a set of files</description> + <requirements> + <requirement type="package" version="0.11">pydoop</requirement> + <requirement type="package" version="0.1.4">hadoop-galaxy</requirement> + </requirements> + + <command> + make_pathset + #if str($paths.source) == 'tool_input' + --force-local --data-format $paths.datapaths.ext "$output_path" "$paths.datapaths" + #elif str($paths.source) == 'text_box' + #if str($paths.filesystem_select) == "local_fs" + --force-local + #end if + #if $paths.data_format + --data-format "$paths.data_format" + #end if + "$output_path" "$paths.datapaths" + #else + #raise ValueError("BUG!! unknown paths.source value") + #end if + </command> + + <inputs> + <conditional name="paths"> + <param name="source" type="select" label="Path source"> + <option value="tool_input">Dataset from your history</option> + <option value="text_box">User input</option> + </param> + <when value="tool_input"> + <param name="datapaths" type="data" label="A dataset in any format" /> + </when> + <when value="text_box"> + <param name="filesystem_select" type="select" label="File system type"> + <option value="default_fs">Default</option> + <option value="local_fs">Local FS</option> + </param> + <param name="datapaths" type="text" label="Paths or URIs" size="60"> + <validator type="empty_field" /> + </param> + <param name="data_format" type="text" label="Extension representing data format" size="20" /> + </when> + </conditional> + </inputs> + + <outputs> + <data name="output_path" format="pathset" /> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> +Create a pathset for a set of files to be used as input for Hadoop tools. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/put_dataset.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,43 @@ +<tool id="hadoop_galaxy_put_dataset" name="Put dataset" version="0.1.4"> + <description>Copy data from Galaxy storage to Hadoop storage.</description> + <requirements> + <requirement type="package" version="0.11">pydoop</requirement> + <requirement type="package" version="0.1.4">hadoop-galaxy</requirement> + </requirements> + + <command> + put_dataset + #if $workspace != "" + --hadoop-workspace "$workspace" + #end if + #if $use_distcp + --distcp + #end if + "$input_pathset" "$output_path" + </command> + + <inputs> + <param name="input_pathset" type="data" format="pathset" label="Galaxy pathset" /> + + <param name="workspace" type="text" label="Path to workspace for Hadoop data" + help="The data will be copied to a new directory under this path. The value can also be set through the HADOOP_GALAXY_PUT_DIR environment variable." /> + + <param name="use_distcp" type="boolean" checked="false" label="Use Hadoop distcp2" + help="Use distcp2 if Hadoop can access Galaxy's storage space and you're copying a large dataset." /> + </inputs> + + <outputs> + <data name="output_path" format="pathset" label="Hadoop pathset from $input_pathset.name" /> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> + This tool copies data from Galaxy's storage to storage that is suitable for + Hadoop jobs. An example of a use case may be to copy data from the Galaxy server + to HDFS. Whether this tool is required depends on your specific local setup. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/split_pathset.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,60 @@ +<tool id="hadoop_galaxy_split_pathset" name="Split pathset" version="0.1.4"> + <description>Split a pathset according to a regular expression criteria</description> + <requirements> + <requirement type="package" version="0.11">pydoop</requirement> + <requirement type="package" version="0.1.4">hadoop-galaxy</requirement> + </requirements> + + <command> + split_pathset '$criteria_expr' + #if $anchor_end + --anchor-end + #end if + --expand-levels $expand_levels + $input_pathset $output_true $output_false + </command> + + <inputs> + <param name="criteria_expr" type="text" label="Regular expression criteria"> + <validator type="empty_field" /> + </param> + <param name="anchor_end" type="boolean" + checked="false" + label="Anchor expression and the end of the string (like $)" + /> + <param name="expand_levels" type="integer" + value="0" + label="Expand paths by at least this many levels before applying criteria" + /> + <param name="input_pathset" type="data" format="pathset" label="Input pathset" /> + <param name="match_name" type="text" value="match" label="Name of dataset matching criteria"> + <validator type="empty_field" /> + </param> + <param name="no_match_name" type="text" value="no_match" label="Name of dataset not matching criteria"> + <validator type="empty_field" /> + </param> + </inputs> + + <outputs> + <data name="output_true" type="data" format="pathset" label="$match_name" /> + <data name="output_false" type="data" format="pathset" label="$no_match_name" /> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> + Splits a pathset according to a regular expression. + + You can have the tool expand the paths in the pathset by a certain number + of levels prior to testing whether it matches the regular expression. + + + **Note**: you can't use '$' in your regular expression. 
To anchor the + expression to the end of the path use the checkbox. + + + *Note*: the regular expression must match the path from its beginning. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/tool_dependencies.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="pydoop" version="0.11"> + <repository changeset_revision="2055590ae81c" name="package_pydoop_0_11" owner="crs4" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="hadoop-galaxy" version="0.1.4"> + <install version="1.0"> + <actions> + <action type="shell_command">git clone https://github.com/crs4/hadoop-galaxy/</action> + <action type="shell_command">git reset --hard 0.1.4</action> + <action type="set_environment_for_install"> + <repository changeset_revision="2055590ae81c" name="package_pydoop_0_11" owner="crs4" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="pydoop" version="0.11" /> + </repository> + </action> + <action type="make_directory">$INSTALL_DIR/lib/python</action> + <action type="shell_command">export PYTHONPATH=$INSTALL_DIR/lib/python:$PYTHONPATH && python setup.py install --prefix=$INSTALL_DIR --install-lib=$INSTALL_DIR/lib/python</action> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> + <environment_variable action="prepend_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable> + </action> + </actions> + </install> + <readme> + </readme> + </package> +</tool_dependency>
--- a/make_pathset.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -<tool id="hadoop_galaxy_make_pathset" name="Make Pathset" version="0.1.0"> - <description>Create a pathset for a set of files</description> - <requirements> - <requirement type="package" version="0.11">pydoop</requirement> - <requirement type="package" version="0.1.1">hadoop-galaxy</requirement> - </requirements> - - <command> - make_pathset - #if str($paths.source) == 'tool_input' - --force-local --data-format $paths.datapaths.ext "$output_path" "$paths.datapaths" - #elif str($paths.source) == 'text_box' - #if str($paths.filesystem_select) == "local_fs" - --force-local - #end if - #if $paths.data_format - --data-format "$paths.data_format" - #end if - "$output_path" "$paths.datapaths" - #else - #raise ValueError("BUG!! unknown paths.source value") - #end if - </command> - - <inputs> - <conditional name="paths"> - <param name="source" type="select" label="Path source"> - <option value="tool_input">Dataset from your history</option> - <option value="text_box">User input</option> - </param> - <when value="tool_input"> - <param name="datapaths" type="data" label="A dataset in any format" /> - </when> - <when value="text_box"> - <param name="filesystem_select" type="select" label="File system type"> - <option value="default_fs">Default</option> - <option value="local_fs">Local FS</option> - </param> - <param name="datapaths" type="text" label="Paths or URIs" size="60"> - <validator type="empty_field" /> - </param> - <param name="data_format" type="text" label="Extension representing data format" size="20" /> - </when> - </conditional> - </inputs> - - <outputs> - <data name="output_path" format="pathset" /> - </outputs> - - <stdio> - <exit_code range="1:" level="fatal" /> - </stdio> - - <help> -Create a pathset for a set of files to be used as input for Hadoop tools. - </help> -</tool>
--- a/put_dataset.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -<tool id="hadoop_galaxy_put_dataset" name="Put dataset" version="0.1.0"> - <description>Copy data from Galaxy storage to Hadoop storage.</description> - <requirements> - <requirement type="package" version="0.11">pydoop</requirement> - <requirement type="package" version="0.1.1">hadoop-galaxy</requirement> - </requirements> - - <command> - put_dataset - #if $workspace != "" - --hadoop-workspace "$workspace" - #end if - #if $use_distcp - --distcp - #end if - "$input_pathset" "$output_path" - </command> - - <inputs> - <param name="input_pathset" type="data" format="pathset" label="Galaxy pathset" /> - - <param name="workspace" type="text" label="Path to workspace for Hadoop data" - help="The data will be copied to a new directory under this path. The value can also be set through the HADOOP_GALAXY_PUT_DIR environment variable." /> - - <param name="use_distcp" type="boolean" checked="false" label="Use Hadoop distcp2" - help="Use distcp2 if Hadoop can access Galaxy's storage space and you're copying a large dataset." /> - </inputs> - - <outputs> - <data name="output_path" format="pathset" label="Hadoop pathset from $input_pathset.name" /> - </outputs> - - <stdio> - <exit_code range="1:" level="fatal" /> - </stdio> - - <help> - This tools copies data from Galaxy's storage to storage that is suitable for - Hadoop jobs. An example of a use case may be to copy data from the Galaxy server - to HDFS. Whether this tool is required depends on your specific local setup. - </help> - -</tool>
--- a/split_pathset.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -<tool id="hadoop_galaxy_split_pathset" name="Split pathset" version="0.1.0"> - <description>Split a pathset according to a regular expression criteria</description> - <requirements> - <requirement type="package" version="0.11">pydoop</requirement> - <requirement type="package" version="0.1.1">hadoop-galaxy</requirement> - </requirements> - - <command> - split_pathset '$criteria_expr' - #if $anchor_end - --anchor-end - #end if - --expand-levels $expand_levels - $input_pathset $output_true $output_false - </command> - - <inputs> - <param name="criteria_expr" type="text" label="Regular expression criteria"> - <validator type="empty_field" /> - </param> - <param name="anchor_end" type="boolean" - checked="false" - label="Anchor expression and the end of the string (like $)" - /> - <param name="expand_levels" type="integer" - value="0" - label="Expand paths by at least this many levels before applying criteria" - /> - <param name="input_pathset" type="data" format="pathset" label="Input pathset" /> - <param name="match_name" type="text" value="match" label="Name of dataset matching criteria"> - <validator type="empty_field" /> - </param> - <param name="no_match_name" type="text" value="no_match" label="Name of dataset not matching criteria"> - <validator type="empty_field" /> - </param> - </inputs> - - <outputs> - <data name="output_true" type="data" format="pathset" label="$match_name" /> - <data name="output_false" type="data" format="pathset" label="$no_match_name" /> - </outputs> - - <stdio> - <exit_code range="1:" level="fatal" /> - </stdio> - - <help> - Splits a pathset according to a regular expression. - - You can have the tool expand the paths in the pathset by a certain number - of levels prior to testing whether it matches the regular expression. - - - **Note**: you can't use '$' in your regular expression. 
To anchor the - expression to the end of the path use the checkbox. - - - *Note*: the regular expression must match the path from its beginning. - </help> -</tool>
--- a/tool_conf.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -<?xml version="1.0"?> -<toolbox> - <section name="Hadoop-Galaxy" id="hadoop_galaxy"> - <tool file="hadoop_galaxy/make_pathset.xml" /> - <tool file="hadoop_galaxy/put_dataset.xml" /> - <tool file="hadoop_galaxy/cat_paths.xml" /> - <tool file="hadoop_galaxy/split_pathset.xml" /> - <tool file="hadoop_galaxy/dist_text_zipper.xml" /> - </section> -</toolbox>
--- a/tool_dependencies.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="pydoop" version="0.11"> - <repository changeset_revision="2055590ae81c" name="package_pydoop_0_11" owner="crs4" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="hadoop-galaxy" version="0.1.1"> - <install version="1.0"> - <actions> - <action type="shell_command">git clone https://github.com/crs4/hadoop-galaxy/</action> - <action type="shell_command">git reset --hard 0.1.1</action> - <action type="set_environment_for_install"> - <repository changeset_revision="2055590ae81c" name="package_pydoop_0_11" owner="crs4" toolshed="http://toolshed.g2.bx.psu.edu"> - <package name="pydoop" version="0.11" /> - </repository> - </action> - <action type="make_directory">$INSTALL_DIR/lib/python</action> - <action type="shell_command">export PYTHONPATH=$INSTALL_DIR/lib/python:$PYTHONPATH && python setup.py install --prefix=$INSTALL_DIR --install-lib=$INSTALL_DIR/lib/python</action> - <action type="set_environment"> - <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> - <environment_variable action="prepend_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable> - </action> - </actions> - </install> - <readme> - </readme> - </package> -</tool_dependency>