comparison fromgtfTobed12.py @ 0:418e4d0fe0bd draft

planemo upload for repository https://github.com/lldelisle/tools-lldelisle/tree/master/tools/fromgtfTobed12 commit 1aaffda5b95e0389e315179345642c0d005867c1
author lldelisle
date Fri, 04 Nov 2022 15:37:12 +0000
parents
children 6fd4b3b90220
comparison
equal deleted inserted replaced
-1:000000000000 0:418e4d0fe0bd
1 import argparse
2 import sys
3 import warnings
4
5 import gffutils
6
7 warnings.filterwarnings("ignore", message="It appears you have a gene feature"
8 " in your GTF file. You may want to use the "
9 "`disable_infer_genes` option to speed up database "
10 "creation")
11 warnings.filterwarnings("ignore", message="It appears you have a transcript "
12 "feature in your GTF file. You may want to use the "
13 "`disable_infer_transcripts` option to speed up "
14 "database creation")
15 # In gffutils v0.10 they changed the error message:
16 warnings.filterwarnings("ignore", message="It appears you have a gene feature"
17 " in your GTF file. You may want to use the "
18 "`disable_infer_genes=True` option to speed up "
19 "database creation")
20 warnings.filterwarnings("ignore", message="It appears you have a transcript "
21 "feature in your GTF file. You may want to use the "
22 "`disable_infer_transcripts=True` option to speed up "
23 "database creation")
24
25
26 def convert_gtf_to_bed(fn, fo, useGene, mergeTranscripts,
27 mergeTranscriptsAndOverlappingExons, ucsc):
28 db = gffutils.create_db(fn, ':memory:')
29 # For each transcript:
30 prefered_name = "transcript_name"
31 if useGene or mergeTranscripts or mergeTranscriptsAndOverlappingExons:
32 prefered_name = "gene_name"
33 if mergeTranscripts or mergeTranscriptsAndOverlappingExons:
34 all_items = db.features_of_type("gene", order_by='start')
35 else:
36 all_items = db.features_of_type("transcript", order_by='start')
37 for tr in all_items:
38 # The name would be the name of the transcript/gene if exists
39 try:
40 # First try to have it directly on the feature
41 trName = tr.attributes[prefered_name][0]
42 except KeyError:
43 # Else try to guess the name of the transcript/gene from exons:
44 try:
45 trName = set([e.attributes[prefered_name][0]
46 for e in
47 db.children(tr,
48 featuretype='exon',
49 order_by='start')]).pop()
50 except KeyError:
51 # Else take the transcript id
52 trName = tr.id
53 # If the cds is defined in the gtf,
54 # use it to define the thick start and end
55 # The gtf is 1-based closed intervalls and
56 # bed are 0-based half-open so:
57 # I need to remove one from each start
58 try:
59 # In case of multiple CDS (when there is one entry per gene)
60 # I use the first one to get the start
61 # and the last one to get the end (order_by=-start)
62 cds_start = next(db.children(tr,
63 featuretype='CDS',
64 order_by='start')).start - 1
65 cds_end = next(db.children(tr,
66 featuretype='CDS',
67 order_by='-start')).end
68 except StopIteration:
69 # If the CDS is not defined, then it is set to the start
70 # as proposed here:
71 # https://genome.ucsc.edu/FAQ/FAQformat.html#format1
72 cds_start = tr.start - 1
73 cds_end = tr.start - 1
74 # Get all exons starts and lengths
75 if mergeTranscriptsAndOverlappingExons:
76 # We merge overlapping exons:
77 exons_starts = []
78 exons_length = []
79 current_start = -1
80 current_end = None
81 for e in db.children(tr, featuretype='exon', order_by='start'):
82 if current_start == -1:
83 current_start = e.start - 1
84 current_end = e.end
85 else:
86 if e.start > current_end:
87 # This is a non-overlapping exon
88 # We store the previous exon:
89 exons_starts.append(current_start)
90 exons_length.append(current_end - current_start)
91 # We set the current:
92 current_start = e.start - 1
93 current_end = e.end
94 else:
95 # This is an overlapping exon
96 # We update current_end if necessary
97 current_end = max(current_end, e.end)
98 if current_start != -1:
99 # There is a last exon to store:
100 exons_starts.append(current_start)
101 exons_length.append(current_end - current_start)
102 else:
103 exons_starts = [e.start - 1
104 for e in
105 db.children(tr, featuretype='exon',
106 order_by='start')]
107 exons_length = [len(e)
108 for e in
109 db.children(tr, featuretype='exon',
110 order_by='start')]
111 # Rewrite the chromosome name if needed:
112 chrom = tr.chrom
113 if ucsc and chrom[0:3] != 'chr':
114 chrom = 'chr' + chrom
115 fo.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n" %
116 (chrom, tr.start - 1, tr.end, trName, 0, tr.strand,
117 cds_start, cds_end, "0", len(exons_starts),
118 ",".join([str(ex_l) for ex_l in exons_length]),
119 ",".join([str(s - (tr.start - 1)) for s in exons_starts])))
120
121
122 argp = argparse.ArgumentParser(
123 description=("Convert a gtf to a bed12 with one entry"
124 " per transcript/gene"))
125 argp.add_argument('input', default=None,
126 help="Input gtf file (can be gzip).")
127 argp.add_argument('--output', default=sys.stdout,
128 type=argparse.FileType('w'),
129 help="Output bed12 file.")
130 argp.add_argument('--useGene', action="store_true",
131 help="Use the gene name instead of the "
132 "transcript name.")
133 argp.add_argument('--ucscformat', action="store_true",
134 help="If you want that all chromosome names "
135 "begin with 'chr'.")
136 group = argp.add_mutually_exclusive_group()
137 group.add_argument('--mergeTranscripts', action="store_true",
138 help="Merge all transcripts into a single "
139 "entry to have one line per gene.")
140 group.add_argument('--mergeTranscriptsAndOverlappingExons',
141 action="store_true",
142 help="Merge all transcripts into a single "
143 "entry to have one line per gene and merge"
144 " overlapping exons.")
145
146 args = argp.parse_args()
147 convert_gtf_to_bed(args.input, args.output, args.useGene,
148 args.mergeTranscripts,
149 args.mergeTranscriptsAndOverlappingExons,
150 args.ucscformat)