comparison tools/cgatools17/join_v17.xml @ 1:3a2e0f376f26 draft

Minor change to tv2vcf.xml to allow for workflow automation
author dgdekoning
date Wed, 21 Oct 2015 10:09:15 -0400
parents
children
comparison
equal deleted inserted replaced
0:751b62d30ae1 1:3a2e0f376f26
1 <tool id="cg_join" name="Join" version="1.7.1">
2
3 <description>Join two tsv files based on equal fields or overlapping regions.</description>
4 <!-- Declares a package-type requirement on cgatools17 (version 1); the command section of this tool invokes the cgatools executable. -->
5 <requirements>
6 <requirement type="package" version="1">cgatools17</requirement>
7 </requirements>
8 <!-- Command template (Cheetah): prints the first line of the bare cgatools output, then runs cgatools join. Dataset paths and free-text values are single-quoted so the shell passes them through literally (for example, the default select value A.*,B.* is not glob-expanded). $dump stays unquoted because its empty option must expand to no argument at all. The overlap options are only emitted when range matching is switched on. -->
9 <command>
10 cgatools | head -1;
11 cgatools join --beta
12 --input '$inputA'
13 --input '$inputB'
14 --output '$output'
15 --output-mode $outmode
16 $dump
17 --select '$col'
18 #for $m in $matches
19 --match '${m.match}'
20 #end for
21 #if $range_overlap.range == 'yes'
22 #for $o in $range_overlap.overlaps
23 --overlap '${o.overlap}'
24 #end for
25 --overlap-mode $range_overlap.overlapmode
26 --overlap-fraction-A $range_overlap.fractionA
27 --boundary-uncertainty-A $range_overlap.boundaryA
28 --overlap-fraction-B $range_overlap.fractionB
29 --boundary-uncertainty-B $range_overlap.boundaryB
30 #end if
31 </command>
32
33 <!-- Form parameters for cgatools join: the two tabular inputs, output column selection and output mode, exact-match column pairs, optional range-overlap matching, and an optional label prefix for the output dataset. -->
34 <inputs>
35 <param name="inputA" type="data" format="tabular" label="Select input file A "/>
36 <param name="inputB" type="data" format="tabular" label="Select input file B "/>
37 <param name="col" type="text" value="A.*,B.*" size="40" label="Specify columns for output" help="The default value A.*,B.* prints all columns from both files, other selections enter in the format A.col_name1,A.col_name3,B.col_name1" />
38 <!-- The selected value is passed to the output-mode option of cgatools join. -->
39 <param name="outmode" type="select" label="Select output mode">
40 <option value="full" selected="true">full (1 line for each match of records in A and B)</option>
41 <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
42 <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
43 </param>
44 <!-- The selected value is inserted verbatim into the command line; the second option is intentionally empty so that no flag is added. -->
45 <param name="dump" type="select" label="Select records to print">
46 <option value="--always-dump" selected="true">print all records of A even if not matched in B</option>
47 <option value="">print only records of A that are matched in B</option>
48 </param>
49 <!-- Each repeat entry adds one exact-match specification (column_from_A:column_from_B) to the command line. -->
50 <repeat name="matches" title="Exact match column">
51 <param name="match" type="text" size="40" label="Enter column:column" help="Enter column_from_A:column_from_B, e.g. chromosome:chromosome"/>
52 </repeat>
53 <!-- Range-overlap options are only added to the command line when "yes" is selected here. -->
54 <conditional name="range_overlap">
55 <param name="range" type="select" label="Do you want to match columns by overlapping range?">
56 <option value="no">no</option>
57 <option value="yes">yes</option>
58 </param>
59 <when value="no">
60 <!-- no options -->
61 </when>
62 <when value="yes">
63 <repeat name="overlaps" title="Range column">
64 <param name="overlap" type="text" size="40" label="Enter column&#91;,column&#93;:column&#91;,column&#93;" help="Enter range_start_from_A&#91;,range_stop_from_A&#93;:range_start_from_B&#91;,range_stop_from_B&#93;, e.g. begin,end:begin,end (overlapping range of positions) or begin,end:position"/>
65 </repeat>
66 <!-- The overlap rules in the labels below follow the command line reference in the help section of this tool. -->
67 <param name="overlapmode" type="select" label="Select overlap mode">
68 <option value="strict" selected="true">strict (overlap if A.begin&lt;B.end and B.begin&lt;A.end)</option>
69 <option value="allow-abutting-points">allow-abutting-points (overlap if A.begin&lt;B.end and B.begin&lt;A.end, or if A.begin&lt;=B.end and B.begin&lt;=A.end and either A or B has zero length.)</option>
70 </param>
71 <!-- Overlap fractions are multiplied with a range length in the filter predicate, so they are float parameters; boundary uncertainties remain integers. -->
72 <param name="fractionA" type="float" value="0" label="Minimum fraction of A region overlap " />
73 <param name="boundaryA" type="integer" value="0" label="Boundary uncertainty for A for overlap filtering " help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length &gt;= overlap-fraction-A * (A-range-length - boundary-uncertainty-A)"/>
74
75 <param name="fractionB" type="float" value="0" label="Minimum fraction of B region overlap " />
76 <param name="boundaryB" type="integer" value="0" label="Boundary uncertainty for B for overlap filtering " help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length &gt;= overlap-fraction-B * (B-range-length - boundary-uncertainty-B)"/>
77 </when>
78 </conditional>
79
80 <!-- prefix for output file so you don't have to manually rename history items -->
81 <param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/>
82
83 </inputs>
84 <!-- Single tabular dataset that cgatools join writes its result to; the history label starts with the optional prefix entered in the fname parameter. -->
85 <outputs>
86 <data name="output" format="tabular" label="$fname ${tool.name} on data ${on_string}" />
87 </outputs>
88
89
90 <help>
91
92 **What it does**
93
94 This tool joins two tab-delimited files based on equal fields or overlapping regions.
95
96 **cgatools 1.7.1 Documentation**
97
98 Userguide: http://cgatools.sourceforge.net/docs/1.7.1/cgatools-user-guide.pdf
99
100 Release notes: http://cgatools.sourceforge.net/docs/1.7.1/cgatools-release-notes.pdf
101
102 **Command line reference**::
103
104 COMMAND NAME
105 join - Joins two tab-delimited files based on equal fields or overlapping regions.
106
107 DESCRIPTION
108 Joins two tab-delimited files based on equal fields or overlapping regions.
109 By default, an output record is produced for each match found between file
110 A and file B, but output format can be controlled by the --output-mode
111 parameter.
112
113 OPTIONS
114 -h [ --help ]
115 Print this help message.
116
117 --beta
118 This is a beta command. To run this command, you must pass the --beta
119 flag.
120
121 --input arg
122 File name to use as input (may be passed in as arguments at the end of
123 the command, or omitted for stdin). There must be exactly two input
124 files to join. If only one file is specified by name, file A is taken
125 to be stdin and file B is the named file. File B is read fully into
126 memory, and file A is streamed. File A's columns appear first in the
127 output.
128
129 --output arg (=STDOUT)
130 The output file name (may be omitted for stdout).
131
132 --match arg
133 A match specification, which is a column from A and a column from B
134 separated by a colon.
135
136 --overlap arg
137 Overlap specification. An overlap specification consists of a range
138 definition for files A and B, separated by a colon. A range definition
139 may be two columns, in which case they are interpreted as the beginning
140 and end of the range. Or it may be one column, in which case the range
141 is defined as the 1-base range starting at the given value. The records
142 from the two files must overlap in order to be considered for output.
143 Two ranges are considered to overlap if the overlap is at least one
144 base long, or if one of the ranges is length 0 and the ranges overlap
145 or abut. For example, "begin,end:offset" will match wherever end-begin
146 &gt; 0, begin&lt;offset+1, and end&gt;offset, or wherever end-begin = 0,
147 begin&lt;=offset+1, and end&gt;=offset.
148
149
150 -m [ --output-mode ] arg (=full)
151 Output mode, one of the following:
152 full Print an output record for each match found between
153 file A and file B.
154 compact Print at most one record for each record of file A,
155 joining the file B values by a semicolon and
156 suppressing repeated B values and empty B values.
157 compact-pct Same as compact, but for each distinct B value,
158 annotate with the percentage of the A record that is
159 overlapped by B records with that B value. Percentage
160 is rounded up to nearest integer.
161
162 --overlap-mode arg (=strict)
163 Overlap mode, one of the following:
164 strict Range A and B overlap if A.begin &lt; B.end and
165 B.begin &lt; A.end.
166 allow-abutting-points Range A and B overlap if they meet the strict
167 requirements, or if A.begin &lt;= B.end and
168 B.begin &lt;= A.end and either A or B has zero
169 length.
170
171 --select arg (=A.*,B.*)
172 Set of fields to select for output.
173
174 -a [ --always-dump ]
175 Dump every record of A, even if there are no matches with file B.
176
177 --overlap-fraction-A arg (=0)
178 Minimum fraction of A region overlap for filtering output.
179
180 --boundary-uncertainty-A arg (=0)
181 Boundary uncertainty for overlap filtering. Specifically, records
182 failing the following predicate are filtered away: overlap &gt;=
183 overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
184
185 --overlap-fraction-B arg (=0)
186 Minimum fraction of B region overlap for filtering output.
187
188 --boundary-uncertainty-B arg (=0)
189 Boundary uncertainty for overlap filtering. Specifically, records
190 failing the following predicate are filtered away: overlap &gt;=
191 overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
192
193 SUPPORTED FORMAT_VERSION
194 Any
195 </help>
196 </tool>