Mercurial > repos > petr-novak > various_galaxy_tools
comparison gff_to_bed_converter.py @ 0:696e702ebf74 draft
"planemo upload commit 0f6eca49bafc3c946189d793161a7f81d595e1a1-dirty"
author | petr-novak |
---|---|
date | Mon, 09 May 2022 08:26:30 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:696e702ebf74 |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import print_function | |
3 | |
4 import sys | |
5 | |
6 from galaxy.datatypes.util.gff_util import parse_gff_attributes | |
7 | |
8 | |
9 def get_bed_line(chrom, name, strand, blocks): | |
10 """Returns a BED line for given data.""" | |
11 | |
12 if len(blocks) == 1: | |
13 # Use simple BED format if there is only a single block: | |
14 # chrom, chromStart, chromEnd, name, score, strand | |
15 # | |
16 start, end = blocks[0] | |
17 return "%s\t%i\t%i\t%s\t0\t%s\n" % (chrom, start, end, name, strand) | |
18 | |
19 # | |
20 # Build lists for transcript blocks' starts, sizes. | |
21 # | |
22 | |
23 # Get transcript start, end. | |
24 t_start = sys.maxsize | |
25 t_end = -1 | |
26 for block_start, block_end in blocks: | |
27 if block_start < t_start: | |
28 t_start = block_start | |
29 if block_end > t_end: | |
30 t_end = block_end | |
31 | |
32 # Get block starts, sizes. | |
33 block_starts = [] | |
34 block_sizes = [] | |
35 for block_start, block_end in blocks: | |
36 block_starts.append(str(block_start - t_start)) | |
37 block_sizes.append(str(block_end - block_start)) | |
38 | |
39 # | |
40 # Create BED entry. | |
41 # Bed format: chrom, chromStart, chromEnd, name, score, strand, \ | |
42 # thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts | |
43 # | |
44 # Render complete feature with thick blocks. There's no clear way to do this unless | |
45 # we analyze the block names, but making everything thick makes more sense than | |
46 # making everything thin. | |
47 # | |
48 return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % ( | |
49 chrom, | |
50 t_start, | |
51 t_end, | |
52 name, | |
53 strand, | |
54 t_start, | |
55 t_end, | |
56 len(block_starts), | |
57 ",".join(block_sizes), | |
58 ",".join(block_starts), | |
59 ) | |
60 | |
61 | |
62 def __main__(): | |
63 input_name = sys.argv[1] | |
64 output_name = sys.argv[2] | |
65 skipped_lines = 0 | |
66 first_skipped_line = 0 | |
67 i = 0 | |
68 cur_transcript_chrome = None | |
69 cur_transcript_id = None | |
70 cur_transcript_strand = None | |
71 cur_transcripts_blocks = [] # (start, end) for each block. | |
72 with open(output_name, "w") as out, open(input_name) as in_fh: | |
73 for i, line in enumerate(in_fh): | |
74 line = line.rstrip("\r\n") | |
75 if line and not line.startswith("#"): | |
76 try: | |
77 # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes | |
78 elems = line.split("\t") | |
79 start = str(int(elems[3]) - 1) | |
80 coords = [int(start), int(elems[4])] | |
81 strand = elems[6] | |
82 if strand not in ["+", "-"]: | |
83 strand = "+" | |
84 attributes = parse_gff_attributes(elems[8]) | |
85 t_id = attributes.get("transcript_id", None) | |
86 | |
87 if not t_id: | |
88 # | |
89 # No transcript ID, so write last transcript and write current line as its own line. | |
90 # | |
91 | |
92 # Write previous transcript. | |
93 if cur_transcript_id: | |
94 # Write BED entry. | |
95 out.write( | |
96 get_bed_line( | |
97 cur_transcript_chrome, | |
98 cur_transcript_id, | |
99 cur_transcript_strand, | |
100 cur_transcripts_blocks, | |
101 ) | |
102 ) | |
103 | |
104 # Replace any spaces in the name with underscores so UCSC will not complain. | |
105 name = elems[2].replace(" ", "_") | |
106 out.write(get_bed_line(elems[0], name, strand, [coords])) | |
107 continue | |
108 | |
109 # There is a transcript ID, so process line at transcript level. | |
110 if t_id == cur_transcript_id: | |
111 # Line is element of transcript and will be a block in the BED entry. | |
112 cur_transcripts_blocks.append(coords) | |
113 continue | |
114 | |
115 # | |
116 # Line is part of new transcript; write previous transcript and start | |
117 # new transcript. | |
118 # | |
119 | |
120 # Write previous transcript. | |
121 if cur_transcript_id: | |
122 # Write BED entry. | |
123 out.write( | |
124 get_bed_line( | |
125 cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks | |
126 ) | |
127 ) | |
128 | |
129 # Start new transcript. | |
130 cur_transcript_chrome = elems[0] | |
131 cur_transcript_id = t_id | |
132 cur_transcript_strand = strand | |
133 cur_transcripts_blocks = [] | |
134 cur_transcripts_blocks.append(coords) | |
135 except Exception: | |
136 skipped_lines += 1 | |
137 if not first_skipped_line: | |
138 first_skipped_line = i + 1 | |
139 else: | |
140 skipped_lines += 1 | |
141 if not first_skipped_line: | |
142 first_skipped_line = i + 1 | |
143 | |
144 # Write last transcript. | |
145 if cur_transcript_id: | |
146 # Write BED entry. | |
147 out.write( | |
148 get_bed_line(cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks) | |
149 ) | |
150 info_msg = "%i lines converted to BED. " % (i + 1 - skipped_lines) | |
151 if skipped_lines > 0: | |
152 info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( | |
153 skipped_lines, | |
154 first_skipped_line, | |
155 ) | |
156 print(info_msg) | |
157 | |
158 | |
159 if __name__ == "__main__": | |
160 __main__() |