comparison gtf_to_bed.xml @ 0:ed0d0eda36a9 draft default tip

"planemo upload for repository https://github.com/usegalaxy-be/galaxytools/tree/main/gtf_to_bed commit 66fba7c9dccfddadce13aad591f441c66c3c309b-dirty"
author padge
date Wed, 29 Sep 2021 13:50:53 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ed0d0eda36a9
1 <tool name="gtf_to_bed" id="gtf_to_bed" version="0.01">
2 <!--Source in git at: https://github.com/fubar2/toolfactory-->
3 <!--Created by admin@galaxy.org at 29/09/2021 09:26:48 using the Galaxy Tool Factory.-->
4 <description>Takes as input a GTF file and writes a BED file in 12 column format</description>
5 <requirements>
6 <requirement type="package">perl</requirement>
7 </requirements>
8 <stdio>
9 <exit_code range="1:" level="fatal"/>
10 </stdio>
11 <version_command><![CDATA[echo "0.01"]]></version_command>
12 <command><![CDATA[perl
13 $runme
14 $input_gtf
15 $converted_from_gtf]]></command>
16 <configfiles>
17 <configfile name="runme"><![CDATA[#raw
18
19 #!/usr/bin/perl
20 # written by Guy Bottu for the GenePattern server of VIB Bioinformatics Core,
21 # takes as input a GTF file and writes a BED file in 12 column format
22 # with information about transcripts, for use with RSeqC.
23 #
24 # The "thick" information is about the coding region, ideally it goes from
25 # start codon to stop codon, but if information is lacking (e.g. because
26 # of missing sequence or missing annotation), we use the CDS information.
27 # For some transcripts there are multiple start or stop codons. We always
28 # choose the "thick" so that it has maximum length.
29 #
30 # If there is no CDS information (as for ncRNA) the "thick" will have just a
31 # repeat of the transcript start position, as per BED convention.
32 #
33 # modified for integration under GenePattern
34 #
35 # usage : perl gtf_to_bed.pl <GTF file> <output file>
36 use List::Util qw (min max);
37
use strict;
use warnings;

# Converts a GTF annotation into a 12-column BED file with one line per
# transcript (sorted by transcript id).
#
# Data model: every transcript id seen in the GTF gets one record (hash ref):
#   chrom, strand            - taken from the 'transcript' line when there is
#                              one, otherwise from the first feature line seen
#   begin, end               - 1-based inclusive transcript extent (GTF style)
#   exons                    - { "begin-end" => [begin, end] }, duplicates merge
#   start_codon, stop_codon,
#   CDS                      - [smallest begin, largest end] over all lines of
#                              that feature type

# gtf_attribute($attributes, $key)
# Returns the value of attribute $key from a GTF attribute column, or nothing
# (undef in scalar context) when the attribute is absent or empty.  Both the
# quoted form (transcript_id "X";) and the unquoted form (exon_number 1;) are
# understood.  The key must start the column or follow a ';', so that e.g.
# 'orig_transcript_id' is never mistaken for 'transcript_id'.
sub gtf_attribute {
    my ($attributes, $key) = @_;
    if ($attributes =~ /(?:^|;)\s*\Q$key\E\s+"([^"]+)"/) {
        return $1;
    }
    if ($attributes =~ /(?:^|;)\s*\Q$key\E\s+([^\s;"]+)/) {
        return $1;
    }
    return;
}

# widen($span, $beginpos, $endpos)
# Returns $span ([begin, end] or undef) enlarged so that it also covers
# $beginpos..$endpos.  A new span is created when $span is undef.
sub widen {
    my ($span, $beginpos, $endpos) = @_;
    return [$beginpos, $endpos] unless $span;
    $span->[0] = $beginpos if $beginpos < $span->[0];
    $span->[1] = $endpos   if $endpos > $span->[1];
    return $span;
}

# read_gtf($gtf_fh)
# Reads GTF lines from the open filehandle and returns a hash ref
# { transcript_id => record }.  Comment lines, lines with fewer than nine
# columns, lines with non-numeric coordinates and lines without a
# transcript_id (e.g. gene lines) are skipped.  A warning is printed on
# STDOUT for every feature whose strand is neither '+' nor '-'.
sub read_gtf {
    my ($gtf_fh) = @_;
    my %transcript_for;

    LINEPARSER: while (my $line = <$gtf_fh>) {
        next LINEPARSER if $line =~ /^#/;    # skip comment lines
        $line =~ s/[\r\n]+\z//;              # tolerate DOS line endings
        my @fields = split /\t/, $line;
        next LINEPARSER if @fields < 9;      # blank or truncated line

        my ($chrom, $type, $beginpos, $endpos, $strand, $attributes)
            = @fields[0, 2, 3, 4, 6, 8];
        next LINEPARSER
            unless $beginpos =~ /\A\d+\z/ and $endpos =~ /\A\d+\z/;

        my $transcript_id = gtf_attribute($attributes, 'transcript_id');
        next LINEPARSER unless defined $transcript_id;

        if ($strand ne '+' and $strand ne '-') {
            print "WARNING : $transcript_id has strand information $strand\n";
        }

        my $rec = $transcript_for{$transcript_id} ||= {};
        if ($type eq 'transcript') {
            # the transcript line is authoritative, wherever it appears
            @{$rec}{qw(chrom strand begin end)}
                = ($chrom, $strand, $beginpos, $endpos);
            next LINEPARSER;
        }

        $rec->{chrom}  = $chrom  unless defined $rec->{chrom};
        $rec->{strand} = $strand unless defined $rec->{strand};
        if ($type eq 'exon') {
            # exon_number is deliberately ignored: blocks are ordered by
            # position later, which works whatever numbering the GTF uses
            $rec->{exons}{"$beginpos-$endpos"} = [$beginpos, $endpos];
        }
        elsif ($type eq 'start_codon' or $type eq 'stop_codon'
            or $type eq 'CDS') {
            $rec->{$type} = widen($rec->{$type}, $beginpos, $endpos);
        }
    }

    # GTF files without 'transcript' lines : derive the extent from the exons
    foreach my $rec (values %transcript_for) {
        next if defined $rec->{begin};
        my @exons = values %{ $rec->{exons} || {} };
        next unless @exons;
        $rec->{begin} = min(map { $_->[0] } @exons);
        $rec->{end}   = max(map { $_->[1] } @exons);
    }
    return \%transcript_for;
}

# thick_bounds($rec)
# Returns (thickStart, thickEnd) in BED coordinates (0-based start).
# The thick region is as long as the annotation allows :
#   '+' strand : from start_codon/CDS begin to stop_codon/CDS end
#   '-' strand : from stop_codon/CDS begin to start_codon/CDS end
#   other      : the span of all coding features present
# A side without information falls back on the transcript boundary.  Without
# any coding feature (ncRNA) both values repeat the transcript start, as per
# BED convention.
sub thick_bounds {
    my ($rec) = @_;
    my ($start_codon, $stop_codon, $cds)
        = @{$rec}{qw(start_codon stop_codon CDS)};

    my $bed_begin = $rec->{begin} - 1;
    return ($bed_begin, $bed_begin)
        unless $start_codon or $stop_codon or $cds;

    my (@left, @right);    # features that may delimit each side
    if ($rec->{strand} eq '+') {
        @left  = ($start_codon, $cds);
        @right = ($stop_codon,  $cds);
    }
    elsif ($rec->{strand} eq '-') {
        @left  = ($stop_codon,  $cds);
        @right = ($start_codon, $cds);
    }
    else {
        @left = @right = ($start_codon, $stop_codon, $cds);
    }
    my @begins = map { $_->[0] } grep { $_ } @left;
    my @ends   = map { $_->[1] } grep { $_ } @right;

    my $thick_begin = @begins ? min(@begins) : $rec->{begin};
    my $thick_end   = @ends   ? max(@ends)   : $rec->{end};
    ## in BED numbering starts with 0, not 1 like in GTF
    return ($thick_begin - 1, $thick_end);
}

# exon_blocks($rec)
# Returns (blockCount, blockSizes, blockStarts) for the BED line.  Blocks are
# sorted by genomic position, so it does not matter whether the exons of a
# reverse strand transcript were numbered along the forward or the reverse
# strand.  A transcript without exon lines is reported as a single block
# covering the whole transcript.
sub exon_blocks {
    my ($rec) = @_;
    my @exons = sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
        values %{ $rec->{exons} || {} };
    @exons = ([ $rec->{begin}, $rec->{end} ]) unless @exons;

    my $blocksizes  = join '', map { ($_->[1] - $_->[0] + 1) . ',' } @exons;
    my $blockstarts = join '', map { ($_->[0] - $rec->{begin}) . ',' } @exons;
    return (scalar @exons, $blocksizes, $blockstarts);
}

# bed_line($transcript_id, $rec)
# Returns the complete BED12 line (with trailing newline) for one transcript.
# Score and itemRgb are always 0.
sub bed_line {
    my ($transcript_id, $rec) = @_;
    my ($thick_begin, $thick_end) = thick_bounds($rec);
    my ($Nexons, $blocksizes, $blockstarts) = exon_blocks($rec);
    return join("\t",
        $rec->{chrom}, $rec->{begin} - 1, $rec->{end},
        $transcript_id, 0, $rec->{strand},
        $thick_begin, $thick_end,
        0, $Nexons, $blocksizes, $blockstarts) . "\n";
}

# convert_gtf_to_bed($gtf, $bed)
# Reads the GTF file $gtf and writes the BED file $bed.  Dies (non-zero exit
# status) when a file cannot be opened or the output cannot be written.
# Transcript ids for which no extent is known (neither a transcript line nor
# exon lines) are not written.
sub convert_gtf_to_bed {
    my ($gtf, $bed) = @_;

    open my $gtf_fh, '<', $gtf
        or die "Cannot open GTF file '$gtf' for reading: $!\n";
    my $transcript_for = read_gtf($gtf_fh);
    close $gtf_fh;

    open my $bed_fh, '>', $bed
        or die "Cannot open BED file '$bed' for writing: $!\n";
    foreach my $transcript_id (sort keys %$transcript_for) {
        my $rec = $transcript_for->{$transcript_id};
        next unless defined $rec->{begin};
        print {$bed_fh} bed_line($transcript_id, $rec);
    }
    close $bed_fh
        or die "Error while writing BED file '$bed': $!\n";
    return;
}

# Run the conversion only when executed as a script, so that the subs above
# can also be loaded (e.g. with require) without side effects.
unless (caller) {
    my $usage = "usage : perl gtf_to_bed.pl <GTF file> <output file>\n";
    if (@ARGV >= 2) {
        convert_gtf_to_bed(@ARGV[0, 1]);
    }
    elsif (@ARGV) {
        die $usage;
    }
    else {
        print STDERR $usage;
    }
}
1;
184
185 #end raw]]></configfile>
186 </configfiles>
187 <inputs>
188 <param name="input_gtf" type="data" optional="false" label="input_gtf" help="Input Gene Transfer Format (.gtf) file" format="gtf,txt" multiple="false"/>
189 </inputs>
190 <outputs>
191 <data name="converted_from_gtf" format="bed" label="converted_from_gtf" hidden="false"/>
192 </outputs>
193 <tests>
194 <test>
195 <output name="converted_from_gtf" value="test_output.bed" compare="diff" lines_diff="0"/>
196 <param name="input_gtf" value="test_input.gtf"/>
197 </test>
198 </tests>
199 <help><![CDATA[
200
201 Conversion script written by Guy Bottu for the GenePattern server of VIB Bioinformatics Core, takes as input a GTF file and writes a BED file in 12 column format with information about transcripts, for use with RSeqC. Modified for integration under GenePattern.
202
203
204
205 The "thick" information is about the coding region, ideally it goes from start codon to stop codon, but if information is lacking (e.g. because of missing sequence or missing annotation), we use the CDS information.
206
207
208
209 For some transcripts there are multiple start or stop codons. We always choose the "thick" so that it has maximum length.
210
211 If there is no CDS information (as for ncRNA) the "thick" will have just a repeat of the transcript start position, as per BED convention.
212
213
214
215
216
217 usage : perl gtf_to_bed.pl <GTF file> <output file>
218
219
220
221 ------
222
223
224 Script::
225
226 #!/usr/bin/perl
227 # written by Guy Bottu for the GenePattern server of VIB Bioinformatics Core,
228 # takes as input a GTF file and writes a BED file in 12 column format
229 # with information about transcripts, for use with RSeqC.
230 #
231 # The "thick" information is about the coding region, ideally it goes from
232 # start codon to stop codon, but if information is lacking (e.g. because
233 # of missing sequence or missing annotation), we use the CDS information.
234 # For some transcripts there are multiple start or stop codons. We always
235 # choose the "thick" so that it has maximum length.
236 #
237 # If there is no CDS information (as for ncRNA) the "thick" will have just a
238 # repeat of the transcript start position, as per BED convention.
239 #
240 # modified for integration under GenePattern
241 #
242 # usage : perl gtf_to_bed.pl <GTF file> <output file>
243 use List::Util qw (min max);
use strict;
use warnings;

# Converts a GTF annotation into a 12-column BED file with one line per
# transcript (sorted by transcript id).
#
# Data model: every transcript id seen in the GTF gets one record (hash ref):
#   chrom, strand            - taken from the 'transcript' line when there is
#                              one, otherwise from the first feature line seen
#   begin, end               - 1-based inclusive transcript extent (GTF style)
#   exons                    - { "begin-end" => [begin, end] }, duplicates merge
#   start_codon, stop_codon,
#   CDS                      - [smallest begin, largest end] over all lines of
#                              that feature type

# gtf_attribute($attributes, $key)
# Returns the value of attribute $key from a GTF attribute column, or nothing
# (undef in scalar context) when the attribute is absent or empty.  Both the
# quoted form (transcript_id "X";) and the unquoted form (exon_number 1;) are
# understood.  The key must start the column or follow a ';', so that e.g.
# 'orig_transcript_id' is never mistaken for 'transcript_id'.
sub gtf_attribute {
    my ($attributes, $key) = @_;
    if ($attributes =~ /(?:^|;)\s*\Q$key\E\s+"([^"]+)"/) {
        return $1;
    }
    if ($attributes =~ /(?:^|;)\s*\Q$key\E\s+([^\s;"]+)/) {
        return $1;
    }
    return;
}

# widen($span, $beginpos, $endpos)
# Returns $span ([begin, end] or undef) enlarged so that it also covers
# $beginpos..$endpos.  A new span is created when $span is undef.
sub widen {
    my ($span, $beginpos, $endpos) = @_;
    return [$beginpos, $endpos] unless $span;
    $span->[0] = $beginpos if $beginpos < $span->[0];
    $span->[1] = $endpos   if $endpos > $span->[1];
    return $span;
}

# read_gtf($gtf_fh)
# Reads GTF lines from the open filehandle and returns a hash ref
# { transcript_id => record }.  Comment lines, lines with fewer than nine
# columns, lines with non-numeric coordinates and lines without a
# transcript_id (e.g. gene lines) are skipped.  A warning is printed on
# STDOUT for every feature whose strand is neither '+' nor '-'.
sub read_gtf {
    my ($gtf_fh) = @_;
    my %transcript_for;

    LINEPARSER: while (my $line = <$gtf_fh>) {
        next LINEPARSER if $line =~ /^#/;    # skip comment lines
        $line =~ s/[\r\n]+\z//;              # tolerate DOS line endings
        my @fields = split /\t/, $line;
        next LINEPARSER if @fields < 9;      # blank or truncated line

        my ($chrom, $type, $beginpos, $endpos, $strand, $attributes)
            = @fields[0, 2, 3, 4, 6, 8];
        next LINEPARSER
            unless $beginpos =~ /\A\d+\z/ and $endpos =~ /\A\d+\z/;

        my $transcript_id = gtf_attribute($attributes, 'transcript_id');
        next LINEPARSER unless defined $transcript_id;

        if ($strand ne '+' and $strand ne '-') {
            print "WARNING : $transcript_id has strand information $strand\n";
        }

        my $rec = $transcript_for{$transcript_id} ||= {};
        if ($type eq 'transcript') {
            # the transcript line is authoritative, wherever it appears
            @{$rec}{qw(chrom strand begin end)}
                = ($chrom, $strand, $beginpos, $endpos);
            next LINEPARSER;
        }

        $rec->{chrom}  = $chrom  unless defined $rec->{chrom};
        $rec->{strand} = $strand unless defined $rec->{strand};
        if ($type eq 'exon') {
            # exon_number is deliberately ignored: blocks are ordered by
            # position later, which works whatever numbering the GTF uses
            $rec->{exons}{"$beginpos-$endpos"} = [$beginpos, $endpos];
        }
        elsif ($type eq 'start_codon' or $type eq 'stop_codon'
            or $type eq 'CDS') {
            $rec->{$type} = widen($rec->{$type}, $beginpos, $endpos);
        }
    }

    # GTF files without 'transcript' lines : derive the extent from the exons
    foreach my $rec (values %transcript_for) {
        next if defined $rec->{begin};
        my @exons = values %{ $rec->{exons} || {} };
        next unless @exons;
        $rec->{begin} = min(map { $_->[0] } @exons);
        $rec->{end}   = max(map { $_->[1] } @exons);
    }
    return \%transcript_for;
}

# thick_bounds($rec)
# Returns (thickStart, thickEnd) in BED coordinates (0-based start).
# The thick region is as long as the annotation allows :
#   '+' strand : from start_codon/CDS begin to stop_codon/CDS end
#   '-' strand : from stop_codon/CDS begin to start_codon/CDS end
#   other      : the span of all coding features present
# A side without information falls back on the transcript boundary.  Without
# any coding feature (ncRNA) both values repeat the transcript start, as per
# BED convention.
sub thick_bounds {
    my ($rec) = @_;
    my ($start_codon, $stop_codon, $cds)
        = @{$rec}{qw(start_codon stop_codon CDS)};

    my $bed_begin = $rec->{begin} - 1;
    return ($bed_begin, $bed_begin)
        unless $start_codon or $stop_codon or $cds;

    my (@left, @right);    # features that may delimit each side
    if ($rec->{strand} eq '+') {
        @left  = ($start_codon, $cds);
        @right = ($stop_codon,  $cds);
    }
    elsif ($rec->{strand} eq '-') {
        @left  = ($stop_codon,  $cds);
        @right = ($start_codon, $cds);
    }
    else {
        @left = @right = ($start_codon, $stop_codon, $cds);
    }
    my @begins = map { $_->[0] } grep { $_ } @left;
    my @ends   = map { $_->[1] } grep { $_ } @right;

    my $thick_begin = @begins ? min(@begins) : $rec->{begin};
    my $thick_end   = @ends   ? max(@ends)   : $rec->{end};
    ## in BED numbering starts with 0, not 1 like in GTF
    return ($thick_begin - 1, $thick_end);
}

# exon_blocks($rec)
# Returns (blockCount, blockSizes, blockStarts) for the BED line.  Blocks are
# sorted by genomic position, so it does not matter whether the exons of a
# reverse strand transcript were numbered along the forward or the reverse
# strand.  A transcript without exon lines is reported as a single block
# covering the whole transcript.
sub exon_blocks {
    my ($rec) = @_;
    my @exons = sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
        values %{ $rec->{exons} || {} };
    @exons = ([ $rec->{begin}, $rec->{end} ]) unless @exons;

    my $blocksizes  = join '', map { ($_->[1] - $_->[0] + 1) . ',' } @exons;
    my $blockstarts = join '', map { ($_->[0] - $rec->{begin}) . ',' } @exons;
    return (scalar @exons, $blocksizes, $blockstarts);
}

# bed_line($transcript_id, $rec)
# Returns the complete BED12 line (with trailing newline) for one transcript.
# Score and itemRgb are always 0.
sub bed_line {
    my ($transcript_id, $rec) = @_;
    my ($thick_begin, $thick_end) = thick_bounds($rec);
    my ($Nexons, $blocksizes, $blockstarts) = exon_blocks($rec);
    return join("\t",
        $rec->{chrom}, $rec->{begin} - 1, $rec->{end},
        $transcript_id, 0, $rec->{strand},
        $thick_begin, $thick_end,
        0, $Nexons, $blocksizes, $blockstarts) . "\n";
}

# convert_gtf_to_bed($gtf, $bed)
# Reads the GTF file $gtf and writes the BED file $bed.  Dies (non-zero exit
# status) when a file cannot be opened or the output cannot be written.
# Transcript ids for which no extent is known (neither a transcript line nor
# exon lines) are not written.
sub convert_gtf_to_bed {
    my ($gtf, $bed) = @_;

    open my $gtf_fh, '<', $gtf
        or die "Cannot open GTF file '$gtf' for reading: $!\n";
    my $transcript_for = read_gtf($gtf_fh);
    close $gtf_fh;

    open my $bed_fh, '>', $bed
        or die "Cannot open BED file '$bed' for writing: $!\n";
    foreach my $transcript_id (sort keys %$transcript_for) {
        my $rec = $transcript_for->{$transcript_id};
        next unless defined $rec->{begin};
        print {$bed_fh} bed_line($transcript_id, $rec);
    }
    close $bed_fh
        or die "Error while writing BED file '$bed': $!\n";
    return;
}

# Run the conversion only when executed as a script, so that the subs above
# can also be loaded (e.g. with require) without side effects.
unless (caller) {
    my $usage = "usage : perl gtf_to_bed.pl <GTF file> <output file>\n";
    if (@ARGV >= 2) {
        convert_gtf_to_bed(@ARGV[0, 1]);
    }
    elsif (@ARGV) {
        die $usage;
    }
    else {
        print STDERR $usage;
    }
}
1;
385
386 ]]></help>
387 <citations>
388 <citation type="doi">10.1093/bioinformatics/bts573</citation>
389 </citations>
390 </tool>
391