comparison cat_paths.xml @ 1:30bd2584b6a0 draft default tip

Uploaded
author crs4
date Wed, 15 Oct 2014 09:39:16 -0400
parents 7698311d4466
children
comparison
equal deleted inserted replaced
0:7698311d4466 1:30bd2584b6a0
1 <tool id="hadoop_galaxy_cat_paths" name="Cat paths" version="0.1.0">
2 <description>Concatenate all components of a pathset into a single file.</description>
3 <requirements> <!-- Packages this tool declares as dependencies, each with an explicit version. -->
4 <requirement type="package" version="0.11">pydoop</requirement>
5 <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
6 </requirements>
7 <!-- Command line: run the Hadoop-based or the plain concatenator, optionally ask it to delete the source data, then pass the input pathset and the output path. Both paths are single-quoted so whitespace or shell metacharacters in a path cannot split or alter the command. -->
8 <command>
9 #if $use_hadoop
10 dist_cat_paths
11 #else
12 cat_paths
13 #end if
14 #if $delete_source
15 --delete-source
16 #end if
17 '$input_pathset' '$output_path'
18 </command>
19 <!-- User-settable parameters: the pathset to concatenate plus two boolean switches, both unchecked by default. -->
20 <inputs>
21 <param type="data" format="pathset" name="input_pathset" label="Input pathset">
22 <validator type="empty_field"/>
23 </param>
24 <param type="boolean" name="delete_source" label="Delete remote input data" checked="false"
25 help="This option makes the tool move the data rather than copy it"/>
26 <param type="boolean" name="use_hadoop" label="Use Hadoop-based program" checked="false"
27 help="The Galaxy workspace must be accessible by the Hadoop cluster (see help for details)"/>
28 </inputs>
29 <!-- Single output dataset receiving the concatenated content; it is declared with the generic "data" format. -->
30 <outputs>
31 <!-- TODO: can the format be read from the input pathset and carried over to the output? -->
32 <data format="data" name="output_path" label="Concatenated dataset $input_pathset.name"/>
33 </outputs>
34 <!-- Any exit code of 1 or greater is reported as a fatal error. -->
35 <stdio>
36 <exit_code level="fatal" range="1:"/>
37 </stdio>
38
39 <help>
40 Datasets represented as pathsets can be split into a number of files.
41 This tool takes all of them and concatenates them into a single output file.
42
43 In your workflow, you'll need to explicitly set the appropriate data format on the
44 output dataset with an Action to "Change Datatype".
45
46 "Delete remote input data" option
47 ====================================
48 With this option, after the data has been concatenated into the new Galaxy dataset,
49 the original files that were referenced by the pathset are deleted. This effectively
50 tells the action to "move" the data instead of "copying" it and helps
51 avoid amassing intermediate data in your Hadoop workspace.
52
53
54 "Use Hadoop-based program" option
55 ====================================
56
57 With this option you will use your entire Hadoop cluster to simultaneously write
58 multiple parts of the final file. For this to be possible, the Hadoop nodes
59 must be able to access the Galaxy file space directly. In addition, to achieve
60 reasonable results the Galaxy workspace should be on a parallel shared file system.
61 </help>
62 </tool>