Mercurial > repos > padge > gtf_to_bed_script
comparison gtf_to_bed.xml @ 0:ed0d0eda36a9 draft default tip
"planemo upload for repository https://github.com/usegalaxy-be/galaxytools/tree/main/gtf_to_bed commit 66fba7c9dccfddadce13aad591f441c66c3c309b-dirty"
author | padge |
---|---|
date | Wed, 29 Sep 2021 13:50:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ed0d0eda36a9 |
---|---|
1 <tool name="gtf_to_bed" id="gtf_to_bed" version="0.01"> | |
2 <!--Source in git at: https://github.com/fubar2/toolfactory--> | |
3 <!--Created by admin@galaxy.org at 29/09/2021 09:26:48 using the Galaxy Tool Factory.--> | |
4 <description>Takes as input a GTF file and writes a BED file in 12 column format</description> | |
5 <requirements> | |
6 <requirement type="package">perl</requirement> | |
7 </requirements> | |
8 <stdio> | |
9 <exit_code range="1:" level="fatal"/> | |
10 </stdio> | |
11 <version_command><![CDATA[echo "0.01"]]></version_command> | |
12 <command><![CDATA[perl | |
13 $runme | |
14 $input_gtf | |
15 $converted_from_gtf]]></command> | |
16 <configfiles> | |
17 <configfile name="runme"><![CDATA[#raw | |
18 | |
19 #!/usr/bin/perl | |
20 # written by Guy Bottu for the GenePattern server of VIB Bioinformatics Core, | |
21 # takes as input a GTF file and writes a BED file in 12 column format | |
22 # with information about transcripts, for use with RSeqC. | |
23 # | |
24 # The "thick" information is about the coding region, ideally it goes from | |
25 # start codon to stop codon, but if information is lacking (e.g. because | |
26 # of missing sequence or missing annotation), we use the CDS information. | |
27 # For some transcripts there are multiple start or stop codons. We always | |
28 # choose the "thick" so that it has maximum length. | |
29 # | |
30 # If there is no CDS information (as for ncRNA) the "thick" will have just a | |
31 # repeat of the transcript start position, as per BED convention. | |
32 # | |
33 # modified for integration under GenePattern | |
34 # | |
35 # usage : perl gtf_to_bed.pl <GTF file> <output file> | |
36 use List::Util qw (min max); | |
37 | |
# First command-line argument: path of the GTF file to read. | |
38 $gtf = $ARGV[0]; | |
39 $gtf =~ /.*\/([^\/]+)\.gtf3?/; | |
# NOTE(review): the capture made by the match above is not assigned to any variable here. | |
40 # print $gtf; | |
# Second command-line argument: path of the BED file to write. | |
41 $bed = $ARGV[1]; | |
42 | |
# NOTE(review): both opens use the two-argument form with bareword handles and their return values are not checked. | |
43 open GTF, $gtf; | |
44 open BED, ">$bed"; | |
# Parsing pass: read the GTF line by line and store per-transcript coordinates in hashes keyed by transcript_id. | |
45 LINEPARSER: while (<GTF>) { | |
46 if (/^#/) { next LINEPARSER } # skip comment lines | |
47 @fields = split /\t/; | |
# Fields read after the tab split (0-based): 0 chrom, 2 feature type, 3 begin, 4 end, 6 strand, 8 attribute text. | |
48 $chrom = $fields[0]; $type = $fields[2]; $beginpos = $fields[3]; | |
49 $endpos = $fields[4]; $strand = $fields[6]; | |
50 chomp $fields[8]; $documentation = $fields[8]; | |
51 $documentation =~ /transcript_id "([^"]+)";/; | |
# NOTE(review): $1 is read on the next line without testing whether the match above succeeded. | |
52 $transcript_id = $1; | |
# The warning below is written with a bare print, i.e. to the currently selected output handle, not to BED. | |
53 if ($strand ne '+' and $strand ne '-') { | |
54 print "WARNING : $transcript_id has strand information $strand\n"; | |
55 } | |
# 'transcript' records fill the chrom, strand and begin/end hashes for this transcript_id. | |
56 if ($type eq 'transcript') { | |
57 $chrom{$transcript_id} = $chrom; | |
58 $strand{$transcript_id} = $strand; | |
59 $transcript_beginpos{$transcript_id} = $beginpos; | |
60 $transcript_endpos{$transcript_id} = $endpos; | |
61 } elsif ($type eq 'exon') { | |
62 | |
# Exon coordinates are stored at the array index given by the exon_number attribute. | |
# NOTE(review): as for transcript_id, $1 is used without checking that exon_number was found. | |
63 $documentation =~ /exon_number "([^"]+)";/; | |
64 $exon_number = $1; | |
65 # print $exon_number; | |
66 $exon_beginpos{$transcript_id}[$exon_number] = $beginpos; | |
67 $exon_endpos{$transcript_id}[$exon_number] = $endpos; | |
68 } elsif ($type eq 'start_codon') { | |
# $ORFpos{id}[0] and [1] hold begin/end of the kept start_codon: the first one seen, | |
# replaced by a later one that begins earlier ('+' strand) or ends later ('-' strand). | |
69 if (not exists $ORFpos{$transcript_id}[0] | |
70 or ($strand eq '+' and $beginpos < $ORFpos{$transcript_id}[0]) | |
71 or ($strand eq '-' and $endpos > $ORFpos{$transcript_id}[1])) { | |
72 $ORFpos{$transcript_id}[0] = $beginpos; | |
73 $ORFpos{$transcript_id}[1] = $endpos; | |
74 } | |
75 } elsif ($type eq 'stop_codon') { | |
# $ORFpos{id}[2] and [3] hold begin/end of the kept stop_codon: the first one seen, | |
# replaced by a later one that ends later ('+' strand) or begins earlier ('-' strand). | |
76 if (not exists $ORFpos{$transcript_id}[2] | |
77 or ($strand eq '+' and $endpos > $ORFpos{$transcript_id}[3]) | |
78 or ($strand eq '-' and $beginpos < $ORFpos{$transcript_id}[2])) { | |
79 $ORFpos{$transcript_id}[2] = $beginpos; | |
80 $ORFpos{$transcript_id}[3] = $endpos; | |
81 } | |
82 } elsif ($type eq 'CDS') { | |
# $CDSpos{id}[0] ends up as the smallest CDS begin and [1] as the largest CDS end seen for the transcript. | |
83 if (not exists $CDSpos{$transcript_id}[0] | |
84 or $beginpos < $CDSpos{$transcript_id}[0]) { | |
85 $CDSpos{$transcript_id}[0] = $beginpos; | |
86 } | |
87 if (not exists $CDSpos{$transcript_id}[1] | |
88 or $endpos > $CDSpos{$transcript_id}[1]) { | |
89 $CDSpos{$transcript_id}[1] = $endpos; | |
90 } | |
91 } | |
92 } | |
93 | |
# Output pass: write one line to BED per transcript_id that had a 'transcript' record, in sorted id order. | |
# Each line is assembled by three prints: chrom/begin/end/id/0/strand, then thick begin/end, then 0/exon count/block sizes/block starts. | |
94 foreach $transcript_id (sort keys %transcript_beginpos) { | |
95 $beginpos = $transcript_beginpos{$transcript_id} - 1; | |
96 ## in BED numbering starts with 0, not 1 like in GTF | |
97 $endpos = $transcript_endpos{$transcript_id}; | |
98 print BED "$chrom{$transcript_id}\t$beginpos\t$endpos\t$transcript_id\t0\t$strand{$transcript_id}"; | |
99 | |
# Thick range: taken when any of start_codon, stop_codon or CDS was recorded for this transcript. | |
# NOTE(review): if the stored strand is neither '+' nor '-', neither branch below assigns | |
# $beginthickpos/$endthickpos, so whatever they held before is decremented and printed. | |
100 if (exists $ORFpos{$transcript_id}[0] or exists $ORFpos{$transcript_id}[2] | |
101 or exists $CDSpos{$transcript_id}[0]) { | |
102 if ($strand{$transcript_id} eq '+') { | |
103 if (exists $ORFpos{$transcript_id}[0]) { | |
104 if (exists $CDSpos{$transcript_id}[0]) { # both start_codon and CDS | |
105 $beginthickpos = min($ORFpos{$transcript_id}[0], | |
106 $CDSpos{$transcript_id}[0]); | |
107 } else { # only start_codon | |
108 $beginthickpos = $ORFpos{$transcript_id}[0]; | |
109 } | |
110 } elsif (exists $CDSpos{$transcript_id}[0]) { # only CDS | |
111 $beginthickpos = $CDSpos{$transcript_id}[0]; | |
112 } else { # -- (but there is a stop_codon) | |
113 $beginthickpos = $transcript_beginpos{$transcript_id}; | |
114 } | |
115 if (exists $ORFpos{$transcript_id}[3]) { | |
116 if (exists $CDSpos{$transcript_id}[1]) { # both stop_codon and CDS | |
117 $endthickpos = max($ORFpos{$transcript_id}[3], | |
118 $CDSpos{$transcript_id}[1]); | |
119 } else { # only stop_codon | |
120 $endthickpos = $ORFpos{$transcript_id}[3]; | |
121 } | |
122 } elsif (exists $CDSpos{$transcript_id}[1]) { # only CDS | |
123 $endthickpos = $CDSpos{$transcript_id}[1]; | |
124 } else { # -- (but there is a start_codon) | |
125 $endthickpos = $transcript_endpos{$transcript_id}; | |
126 } | |
127 } elsif ($strand{$transcript_id} eq '-') { | |
# On '-' the roles swap: the stop_codon begin bounds the thick range on the left and the start_codon end on the right. | |
128 if (exists $ORFpos{$transcript_id}[2]) { | |
129 if (exists $CDSpos{$transcript_id}[0]) { # both stop_codon and CDS | |
130 $beginthickpos = min($ORFpos{$transcript_id}[2], | |
131 $CDSpos{$transcript_id}[0]); | |
132 } else { # only stop_codon | |
133 $beginthickpos = $ORFpos{$transcript_id}[2]; | |
134 } | |
135 } elsif (exists $CDSpos{$transcript_id}[0]) { # only CDS | |
136 $beginthickpos = $CDSpos{$transcript_id}[0]; | |
137 } else { # -- (but there is a start_codon) | |
138 $beginthickpos = $transcript_beginpos{$transcript_id}; | |
139 } | |
140 if (exists $ORFpos{$transcript_id}[1]) { | |
141 if (exists $CDSpos{$transcript_id}[1]) { # both start_codon and CDS | |
142 $endthickpos = max($ORFpos{$transcript_id}[1], | |
143 $CDSpos{$transcript_id}[1]); | |
144 } else { # only start_codon | |
145 $endthickpos = $ORFpos{$transcript_id}[1]; | |
146 } | |
147 } elsif (exists $CDSpos{$transcript_id}[1]) { # only CDS | |
148 $endthickpos = $CDSpos{$transcript_id}[1]; | |
149 } else { # -- (but there is a stop_codon) | |
150 $endthickpos = $transcript_endpos{$transcript_id}; | |
151 } | |
152 } | |
# Same shift by one as applied to the transcript begin above (GTF begin to BED begin). | |
153 $beginthickpos -= 1; | |
154 } else { | |
# No coding information at all: both thick positions repeat the already shifted transcript begin. | |
155 $beginthickpos = $beginpos; $endthickpos = $beginpos; | |
156 } | |
157 print BED "\t$beginthickpos\t$endthickpos"; | |
158 | |
159 $blocksizes = ''; $blockstarts = ''; | |
# $Nexons is the last index of the exon array; it equals the exon count only when exon_number values run 1..N. | |
# NOTE(review): nothing here guards against a transcript that has no exon records. | |
160 $Nexons = $#{$exon_beginpos{$transcript_id}}; | |
161 ## In some GTF files the exons of a transcript on the reverse strand | |
162 ## are numbered according to their position on the forward strand | |
163 ## and in others according to their position on the reverse strand | |
# Block size is end - begin + 1; block start is the exon begin minus the (unshifted) transcript begin. | |
# '.' has the same precedence as '+' and '-' and associates left, so the comma is appended after the arithmetic. | |
164 if ($Nexons == 1) { | |
165 $blocksizes .= $exon_endpos{$transcript_id}[1] - $exon_beginpos{$transcript_id}[1] + 1 . ','; | |
166 $blockstarts .= $exon_beginpos{$transcript_id}[1] - $transcript_beginpos{$transcript_id} . ','; | |
167 } else { | |
# Exon 2 beginning after exon 1 is taken to mean ascending numbering; otherwise the exons are walked from last to first. | |
168 if ($exon_beginpos{$transcript_id}[2] > $exon_beginpos{$transcript_id}[1]) { | |
169 foreach $exon_number (1 .. $Nexons) { | |
170 $blocksizes .= $exon_endpos{$transcript_id}[$exon_number] - $exon_beginpos{$transcript_id}[$exon_number] + 1 . ','; | |
171 $blockstarts .= $exon_beginpos{$transcript_id}[$exon_number] - $transcript_beginpos{$transcript_id} . ','; | |
172 } | |
173 } else { # (is <) | |
174 for($exon_number = $Nexons ; $exon_number > 0 ; $exon_number--) { | |
175 $blocksizes .= $exon_endpos{$transcript_id}[$exon_number] - $exon_beginpos{$transcript_id}[$exon_number] + 1 . ','; | |
176 $blockstarts .= $exon_beginpos{$transcript_id}[$exon_number] - $transcript_beginpos{$transcript_id} . ','; | |
177 } | |
178 } | |
179 } | |
180 print BED "\t0\t$Nexons\t$blocksizes\t$blockstarts\n"; | |
181 } | |
# NOTE(review): the close calls are not checked for errors, including the one on the output handle. | |
182 close( GTF ); | |
183 close( BED ); | |
184 | |
185 #end raw]]></configfile> | |
186 </configfiles> | |
187 <inputs> | |
188 <param name="input_gtf" type="data" optional="false" label="input_gtf" help="Input Gene Transfer Format (.gtf) file" format="gtf,txt" multiple="false"/> | |
189 </inputs> | |
190 <outputs> | |
191 <data name="converted_from_gtf" format="bed" label="converted_from_gtf" hidden="false"/> | |
192 </outputs> | |
193 <tests> | |
194 <test> | |
195 <output name="converted_from_gtf" value="test_output.bed" compare="diff" lines_diff="0"/> | |
196 <param name="input_gtf" value="test_input.gtf"/> | |
197 </test> | |
198 </tests> | |
199 <help><![CDATA[ | |
200 | |
201 Conversion script written by Guy Bottu for the GenePattern server of VIB Bioinformatics Core. It takes as input a GTF file and writes a BED file in 12 column format with information about transcripts, for use with RSeqC. Modified for integration under GenePattern. | |
202 | |
203 | |
204 | |
205 The "thick" information is about the coding region, ideally it goes from start codon to stop codon, but if information is lacking (e.g. because of missing sequence or missing annotation), we use the CDS information. | |
206 | |
207 | |
208 | |
209 For some transcripts there are multiple start or stop codons. We always choose the "thick" so that it has maximum length. | |
210 | |
211 If there is no CDS information (as for ncRNA) the "thick" will have just a repeat of the transcript start position, as per BED convention. | |
212 | |
213 | |
214 | |
215 | |
216 | |
217 usage : perl gtf_to_bed.pl <GTF file> <output file> | |
218 | |
219 | |
220 | |
221 ------ | |
222 | |
223 | |
224 Script:: | |
225 | |
226 #!/usr/bin/perl | |
227 # written by Guy Bottu for the GenePattern server of VIB Bioinformatics Core, | |
228 # takes as input a GTF file and writes a BED file in 12 column format | |
229 # with information about transcripts, for use with RSeqC. | |
230 # | |
231 # The "thick" information is about the coding region, ideally it goes from | |
232 # start codon to stop codon, but if information is lacking (e.g. because | |
233 # of missing sequence or missing annotation), we use the CDS information. | |
234 # For some transcripts there are multiple start or stop codons. We always | |
235 # choose the "thick" so that it has maximum length. | |
236 # | |
237 # If there is no CDS information (as for ncRNA) the "thick" will have just a | |
238 # repeat of the transcript start position, as per BED convention. | |
239 # | |
240 # modified for integration under GenePattern | |
241 # | |
242 # usage : perl gtf_to_bed.pl <GTF file> <output file> | |
243 use List::Util qw (min max); | |
244 $gtf = $ARGV[0]; | |
245 $gtf =~ /.*\/([^\/]+)\.gtf3?/; | |
246 # print $gtf; | |
247 $bed = $ARGV[1]; | |
248 open GTF, $gtf; | |
249 open BED, ">$bed"; | |
250 LINEPARSER: while (<GTF>) { | |
251 if (/^#/) { next LINEPARSER } # skip comment lines | |
252 @fields = split /\t/; | |
253 $chrom = $fields[0]; $type = $fields[2]; $beginpos = $fields[3]; | |
254 $endpos = $fields[4]; $strand = $fields[6]; | |
255 chomp $fields[8]; $documentation = $fields[8]; | |
256 $documentation =~ /transcript_id "([^"]+)";/; | |
257 $transcript_id = $1; | |
258 if ($strand ne '+' and $strand ne '-') { | |
259 print "WARNING : $transcript_id has strand information $strand\n"; | |
260 } | |
261 if ($type eq 'transcript') { | |
262 $chrom{$transcript_id} = $chrom; | |
263 $strand{$transcript_id} = $strand; | |
264 $transcript_beginpos{$transcript_id} = $beginpos; | |
265 $transcript_endpos{$transcript_id} = $endpos; | |
266 } elsif ($type eq 'exon') { | |
267 $documentation =~ /exon_number "([^"]+)";/; | |
268 $exon_number = $1; | |
269 # print $exon_number; | |
270 $exon_beginpos{$transcript_id}[$exon_number] = $beginpos; | |
271 $exon_endpos{$transcript_id}[$exon_number] = $endpos; | |
272 } elsif ($type eq 'start_codon') { | |
273 if (not exists $ORFpos{$transcript_id}[0] | |
274 or ($strand eq '+' and $beginpos < $ORFpos{$transcript_id}[0]) | |
275 or ($strand eq '-' and $endpos > $ORFpos{$transcript_id}[1])) { | |
276 $ORFpos{$transcript_id}[0] = $beginpos; | |
277 $ORFpos{$transcript_id}[1] = $endpos; | |
278 } | |
279 } elsif ($type eq 'stop_codon') { | |
280 if (not exists $ORFpos{$transcript_id}[2] | |
281 or ($strand eq '+' and $endpos > $ORFpos{$transcript_id}[3]) | |
282 or ($strand eq '-' and $beginpos < $ORFpos{$transcript_id}[2])) { | |
283 $ORFpos{$transcript_id}[2] = $beginpos; | |
284 $ORFpos{$transcript_id}[3] = $endpos; | |
285 } | |
286 } elsif ($type eq 'CDS') { | |
287 if (not exists $CDSpos{$transcript_id}[0] | |
288 or $beginpos < $CDSpos{$transcript_id}[0]) { | |
289 $CDSpos{$transcript_id}[0] = $beginpos; | |
290 } | |
291 if (not exists $CDSpos{$transcript_id}[1] | |
292 or $endpos > $CDSpos{$transcript_id}[1]) { | |
293 $CDSpos{$transcript_id}[1] = $endpos; | |
294 } | |
295 } | |
296 } | |
297 foreach $transcript_id (sort keys %transcript_beginpos) { | |
298 $beginpos = $transcript_beginpos{$transcript_id} - 1; | |
299 ## in BED numbering starts with 0, not 1 like in GTF | |
300 $endpos = $transcript_endpos{$transcript_id}; | |
301 print BED "$chrom{$transcript_id}\t$beginpos\t$endpos\t$transcript_id\t0\t$strand{$transcript_id}"; | |
302 if (exists $ORFpos{$transcript_id}[0] or exists $ORFpos{$transcript_id}[2] | |
303 or exists $CDSpos{$transcript_id}[0]) { | |
304 if ($strand{$transcript_id} eq '+') { | |
305 if (exists $ORFpos{$transcript_id}[0]) { | |
306 if (exists $CDSpos{$transcript_id}[0]) { # both start_codon and CDS | |
307 $beginthickpos = min($ORFpos{$transcript_id}[0], | |
308 $CDSpos{$transcript_id}[0]); | |
309 } else { # only start_codon | |
310 $beginthickpos = $ORFpos{$transcript_id}[0]; | |
311 } | |
312 } elsif (exists $CDSpos{$transcript_id}[0]) { # only CDS | |
313 $beginthickpos = $CDSpos{$transcript_id}[0]; | |
314 } else { # -- (but there is a stop_codon) | |
315 $beginthickpos = $transcript_beginpos{$transcript_id}; | |
316 } | |
317 if (exists $ORFpos{$transcript_id}[3]) { | |
318 if (exists $CDSpos{$transcript_id}[1]) { # both stop_codon and CDS | |
319 $endthickpos = max($ORFpos{$transcript_id}[3], | |
320 $CDSpos{$transcript_id}[1]); | |
321 } else { # only stop_codon | |
322 $endthickpos = $ORFpos{$transcript_id}[3]; | |
323 } | |
324 } elsif (exists $CDSpos{$transcript_id}[1]) { # only CDS | |
325 $endthickpos = $CDSpos{$transcript_id}[1]; | |
326 } else { # -- (but there is a start_codon) | |
327 $endthickpos = $transcript_endpos{$transcript_id}; | |
328 } | |
329 } elsif ($strand{$transcript_id} eq '-') { | |
330 if (exists $ORFpos{$transcript_id}[2]) { | |
331 if (exists $CDSpos{$transcript_id}[0]) { # both stop_codon and CDS | |
332 $beginthickpos = min($ORFpos{$transcript_id}[2], | |
333 $CDSpos{$transcript_id}[0]); | |
334 } else { # only stop_codon | |
335 $beginthickpos = $ORFpos{$transcript_id}[2]; | |
336 } | |
337 } elsif (exists $CDSpos{$transcript_id}[0]) { # only CDS | |
338 $beginthickpos = $CDSpos{$transcript_id}[0]; | |
339 } else { # -- (but there is a start_codon) | |
340 $beginthickpos = $transcript_beginpos{$transcript_id}; | |
341 } | |
342 if (exists $ORFpos{$transcript_id}[1]) { | |
343 if (exists $CDSpos{$transcript_id}[1]) { # both start_codon and CDS | |
344 $endthickpos = max($ORFpos{$transcript_id}[1], | |
345 $CDSpos{$transcript_id}[1]); | |
346 } else { # only start_codon | |
347 $endthickpos = $ORFpos{$transcript_id}[1]; | |
348 } | |
349 } elsif (exists $CDSpos{$transcript_id}[1]) { # only CDS | |
350 $endthickpos = $CDSpos{$transcript_id}[1]; | |
351 } else { # -- (but there is a stop_codon) | |
352 $endthickpos = $transcript_endpos{$transcript_id}; | |
353 } | |
354 } | |
355 $beginthickpos -= 1; | |
356 } else { | |
357 $beginthickpos = $beginpos; $endthickpos = $beginpos; | |
358 } | |
359 print BED "\t$beginthickpos\t$endthickpos"; | |
360 $blocksizes = ''; $blockstarts = ''; | |
361 $Nexons = $#{$exon_beginpos{$transcript_id}}; | |
362 ## In some GTF files the exons of a transcript on the reverse strand | |
363 ## are numbered according to their position on the forward strand | |
364 ## and in others according to their position on the reverse strand | |
365 if ($Nexons == 1) { | |
366 $blocksizes .= $exon_endpos{$transcript_id}[1] - $exon_beginpos{$transcript_id}[1] + 1 . ','; | |
367 $blockstarts .= $exon_beginpos{$transcript_id}[1] - $transcript_beginpos{$transcript_id} . ','; | |
368 } else { | |
369 if ($exon_beginpos{$transcript_id}[2] > $exon_beginpos{$transcript_id}[1]) { | |
370 foreach $exon_number (1 .. $Nexons) { | |
371 $blocksizes .= $exon_endpos{$transcript_id}[$exon_number] - $exon_beginpos{$transcript_id}[$exon_number] + 1 . ','; | |
372 $blockstarts .= $exon_beginpos{$transcript_id}[$exon_number] - $transcript_beginpos{$transcript_id} . ','; | |
373 } | |
374 } else { # (is <) | |
375 for($exon_number = $Nexons ; $exon_number > 0 ; $exon_number--) { | |
376 $blocksizes .= $exon_endpos{$transcript_id}[$exon_number] - $exon_beginpos{$transcript_id}[$exon_number] + 1 . ','; | |
377 $blockstarts .= $exon_beginpos{$transcript_id}[$exon_number] - $transcript_beginpos{$transcript_id} . ','; | |
378 } | |
379 } | |
380 } | |
381 print BED "\t0\t$Nexons\t$blocksizes\t$blockstarts\n"; | |
382 } | |
383 close( GTF ); | |
384 close( BED ); | |
385 | |
386 ]]></help> | |
387 <citations> | |
388 <citation type="doi">10.1093/bioinformatics/bts573</citation> | |
389 </citations> | |
390 </tool> | |
391 |