Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 0:28879ca33b5f draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
author | earlhaminst |
---|---|
date | Wed, 15 Mar 2017 20:18:57 -0400 |
parents | |
children | a36645976342 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:28879ca33b5f |
---|---|
1 from __future__ import print_function | |
2 | |
3 import collections | |
4 import json | |
5 import optparse | |
6 import sqlite3 | |
7 import sys | |
8 | |
9 version = "0.3.0" | |
10 gene_count = 0 | |
11 | |
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | |
13 | |
14 | |
15 def FASTAReader_gen(fasta_filename): | |
16 with open(fasta_filename) as fasta_file: | |
17 line = fasta_file.readline() | |
18 while True: | |
19 if not line: | |
20 return | |
21 assert line.startswith('>'), "FASTA headers must start with >" | |
22 header = line.rstrip() | |
23 sequence_parts = [] | |
24 line = fasta_file.readline() | |
25 while line and line[0] != '>': | |
26 sequence_parts.append(line.rstrip()) | |
27 line = fasta_file.readline() | |
28 sequence = "\n".join(sequence_parts) | |
29 yield Sequence(header, sequence) | |
30 | |
31 | |
32 def create_tables(conn): | |
33 cur = conn.cursor() | |
34 | |
35 cur.execute('''CREATE TABLE meta ( | |
36 version VARCHAR PRIMARY KEY NOT NULL)''') | |
37 | |
38 cur.execute('INSERT INTO meta (version) VALUES (?)', | |
39 (version, )) | |
40 | |
41 cur.execute('''CREATE TABLE gene ( | |
42 gene_id VARCHAR PRIMARY KEY NOT NULL, | |
43 gene_symbol VARCHAR, | |
44 species VARCHAR NOT NULL, | |
45 gene_json VARCHAR NOT NULL)''') | |
46 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') | |
47 | |
48 cur.execute('''CREATE TABLE transcript ( | |
49 transcript_id VARCHAR PRIMARY KEY NOT NULL, | |
50 protein_id VARCHAR UNIQUE, | |
51 protein_sequence VARCHAR, | |
52 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') | |
53 | |
54 cur.execute('''CREATE VIEW transcript_species AS | |
55 SELECT transcript_id, species | |
56 FROM transcript JOIN gene | |
57 ON transcript.gene_id = gene.gene_id''') | |
58 | |
59 conn.commit() | |
60 | |
61 | |
62 def remove_type_from_list_of_ids(l): | |
63 return ','.join(remove_type_from_id(_) for _ in l.split(',')) | |
64 | |
65 | |
66 def remove_type_from_id(id_): | |
67 colon_index = id_.find(':') | |
68 if colon_index >= 0: | |
69 return id_[colon_index + 1:] | |
70 else: | |
71 return id_ | |
72 | |
73 | |
74 def feature_to_dict(cols, parent_dict=None): | |
75 d = { | |
76 'end': int(cols[4]), | |
77 'start': int(cols[3]), | |
78 } | |
79 for attr in cols[8].split(';'): | |
80 if '=' in attr: | |
81 (tag, value) = attr.split('=') | |
82 if tag == 'ID': | |
83 tag = 'id' | |
84 value = remove_type_from_id(value) | |
85 elif tag == 'Parent': | |
86 value = remove_type_from_list_of_ids(value) | |
87 d[tag] = value | |
88 if cols[6] == '+': | |
89 d['strand'] = 1 | |
90 elif cols[6] == '-': | |
91 d['strand'] = -1 | |
92 else: | |
93 raise Exception("Unrecognized strand '%s'" % cols[6]) | |
94 if parent_dict is not None and 'Parent' in d: | |
95 # a 3' UTR can be split among multiple exons | |
96 # a 5' UTR can be split among multiple exons | |
97 # a CDS can be part of multiple transcripts | |
98 for parent in d['Parent'].split(','): | |
99 if parent not in parent_dict: | |
100 parent_dict[parent] = [d] | |
101 else: | |
102 parent_dict[parent].append(d) | |
103 return d | |
104 | |
105 | |
106 def add_gene_to_dict(cols, species, gene_dict): | |
107 global gene_count | |
108 gene = feature_to_dict(cols) | |
109 gene.update({ | |
110 'member_id': gene_count, | |
111 'object_type': 'Gene', | |
112 'seq_region_name': cols[0], | |
113 'species': species, | |
114 'Transcript': [], | |
115 'display_name': gene['Name'] | |
116 }) | |
117 if gene['id']: | |
118 gene_dict[gene['id']] = gene | |
119 gene_count = gene_count + 1 | |
120 | |
121 | |
122 def add_transcript_to_dict(cols, species, transcript_dict): | |
123 transcript = feature_to_dict(cols) | |
124 transcript.update({ | |
125 'object_type': 'Transcript', | |
126 'seq_region_name': cols[0], | |
127 'species': species, | |
128 }) | |
129 transcript_dict[transcript['id']] = transcript | |
130 | |
131 | |
132 def add_exon_to_dict(cols, species, exon_parent_dict): | |
133 exon = feature_to_dict(cols, exon_parent_dict) | |
134 exon.update({ | |
135 'length': int(cols[4]) - int(cols[3]) + 1, | |
136 'object_type': 'Exon', | |
137 'seq_region_name': cols[0], | |
138 'species': species, | |
139 }) | |
140 if 'id' not in exon and 'Name' in exon: | |
141 exon['id'] = exon['Name'] | |
142 | |
143 | |
144 def add_cds_to_dict(cols, cds_parent_dict): | |
145 cds = feature_to_dict(cols, cds_parent_dict) | |
146 if 'id' not in cds: | |
147 if 'Name' in cds: | |
148 cds['id'] = cds['Name'] | |
149 elif 'Parent' in cds and ',' not in cds['Parent']: | |
150 cds['id'] = cds['Parent'] | |
151 | |
152 | |
153 def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict): | |
154 for parent, exon_list in exon_parent_dict.items(): | |
155 if parent in transcript_dict: | |
156 exon_list.sort(key=lambda _: _['start']) | |
157 transcript_dict[parent]['Exon'] = exon_list | |
158 | |
159 for transcript_id, transcript in transcript_dict.items(): | |
160 translation = { | |
161 'CDS': [], | |
162 'id': None, | |
163 'end': transcript['end'], | |
164 'object_type': 'Translation', | |
165 'species': transcript['species'], | |
166 'start': transcript['start'], | |
167 } | |
168 found_cds = False | |
169 derived_translation_start = None | |
170 derived_translation_end = None | |
171 if transcript_id in cds_parent_dict: | |
172 cds_list = cds_parent_dict[transcript_id] | |
173 cds_ids = set(_['id'] for _ in cds_list) | |
174 if len(cds_ids) > 1: | |
175 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent) | |
176 translation['id'] = cds_ids.pop() | |
177 cds_list.sort(key=lambda _: _['start']) | |
178 translation['CDS'] = cds_list | |
179 translation['start'] = cds_list[0]['start'] | |
180 translation['end'] = cds_list[-1]['end'] | |
181 found_cds = True | |
182 if transcript_id in five_prime_utr_parent_dict: | |
183 five_prime_utr_list = five_prime_utr_parent_dict[transcript_id] | |
184 five_prime_utr_list.sort(key=lambda _: _['start']) | |
185 if transcript['strand'] == 1: | |
186 derived_translation_start = five_prime_utr_list[-1]['end'] + 1 | |
187 else: | |
188 derived_translation_end = five_prime_utr_list[0]['start'] - 1 | |
189 if transcript_id in three_prime_utr_parent_dict: | |
190 three_prime_utr_list = three_prime_utr_parent_dict[transcript_id] | |
191 three_prime_utr_list.sort(key=lambda _: _['start']) | |
192 if transcript['strand'] == 1: | |
193 derived_translation_end = three_prime_utr_list[0]['start'] - 1 | |
194 else: | |
195 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 | |
196 if derived_translation_start is not None: | |
197 if found_cds: | |
198 if derived_translation_start > translation['start']: | |
199 raise Exception("UTR overlaps with CDS") | |
200 else: | |
201 translation['start'] = derived_translation_start | |
202 if derived_translation_end is not None: | |
203 if found_cds: | |
204 if derived_translation_end < translation['end']: | |
205 raise Exception("UTR overlaps with CDS") | |
206 else: | |
207 translation['end'] = derived_translation_end | |
208 if found_cds or derived_translation_start is not None or derived_translation_end is not None: | |
209 transcript['Translation'] = translation | |
210 | |
211 for transcript in transcript_dict.values(): | |
212 if 'Parent' in transcript: | |
213 # A polycistronic transcript can have multiple parents | |
214 for parent in transcript['Parent'].split(','): | |
215 if parent in gene_dict: | |
216 gene_dict[parent]['Transcript'].append(transcript) | |
217 | |
218 | |
219 def write_gene_dict_to_db(conn, gene_dict): | |
220 cur = conn.cursor() | |
221 | |
222 for gene in gene_dict.values(): | |
223 gene_id = gene['id'] | |
224 cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)', | |
225 (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene))) | |
226 | |
227 if "Transcript" in gene: | |
228 for transcript in gene["Transcript"]: | |
229 transcript_id = transcript['id'] | |
230 protein_id = transcript.get('Translation', {}).get('id', None) | |
231 try: | |
232 cur.execute('INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)', | |
233 (transcript_id, protein_id, gene_id)) | |
234 except Exception as e: | |
235 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) | |
236 | |
237 conn.commit() | |
238 | |
239 | |
240 def fetch_species_for_transcript(conn, transcript_id): | |
241 cur = conn.cursor() | |
242 | |
243 cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?', | |
244 (transcript_id, )) | |
245 results = cur.fetchone() | |
246 if not results: | |
247 return None | |
248 return results[0] | |
249 | |
250 | |
251 def remove_id_version(s): | |
252 """ | |
253 Remove the optional '.VERSION' from an Ensembl id. | |
254 """ | |
255 if s.startswith('ENS'): | |
256 return s.split('.')[0] | |
257 else: | |
258 return s | |
259 | |
260 | |
261 def __main__(): | |
262 parser = optparse.OptionParser() | |
263 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | |
264 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | |
265 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') | |
266 parser.add_option('-o', '--output', help='Path of the output SQLite file') | |
267 parser.add_option('--of', help='Path of the output FASTA file') | |
268 options, args = parser.parse_args() | |
269 if args: | |
270 raise Exception('Use options to provide inputs') | |
271 | |
272 conn = sqlite3.connect(options.output) | |
273 conn.execute('PRAGMA foreign_keys = ON') | |
274 create_tables(conn) | |
275 | |
276 for gff3_arg in options.gff3: | |
277 try: | |
278 (species, filename) = gff3_arg.split(':') | |
279 except ValueError: | |
280 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) | |
281 gene_dict = dict() | |
282 transcript_dict = dict() | |
283 exon_parent_dict = dict() | |
284 cds_parent_dict = dict() | |
285 five_prime_utr_parent_dict = dict() | |
286 three_prime_utr_parent_dict = dict() | |
287 with open(filename) as f: | |
288 for i, line in enumerate(f, start=1): | |
289 line = line.strip() | |
290 if not line: | |
291 # skip empty lines | |
292 continue | |
293 if line[0] == '#': | |
294 # skip comment lines | |
295 continue | |
296 cols = line.split('\t') | |
297 if len(cols) != 9: | |
298 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) | |
299 feature_type = cols[2] | |
300 try: | |
301 if feature_type == 'gene': | |
302 add_gene_to_dict(cols, species, gene_dict) | |
303 elif feature_type in ('mRNA', 'transcript'): | |
304 add_transcript_to_dict(cols, species, transcript_dict) | |
305 elif feature_type == 'exon': | |
306 add_exon_to_dict(cols, species, exon_parent_dict) | |
307 elif feature_type == 'five_prime_UTR': | |
308 feature_to_dict(cols, five_prime_utr_parent_dict) | |
309 elif feature_type == 'three_prime_UTR': | |
310 feature_to_dict(cols, three_prime_utr_parent_dict) | |
311 elif feature_type == 'CDS': | |
312 add_cds_to_dict(cols, cds_parent_dict) | |
313 else: | |
314 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) | |
315 except Exception as e: | |
316 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) | |
317 | |
318 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) | |
319 write_gene_dict_to_db(conn, gene_dict) | |
320 | |
321 for json_arg in options.json: | |
322 with open(json_arg) as f: | |
323 write_gene_dict_to_db(conn, json.load(f)) | |
324 | |
325 with open(options.of, 'w') as output_fasta_file: | |
326 for fasta_arg in options.fasta: | |
327 for entry in FASTAReader_gen(fasta_arg): | |
328 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id | |
329 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | |
330 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) | |
331 if not species_for_transcript: | |
332 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) | |
333 continue | |
334 # Remove any underscore in the species | |
335 species_for_transcript = species_for_transcript.replace('_', '') | |
336 # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest | |
337 output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) | |
338 | |
339 conn.close() | |
340 | |
341 | |
342 if __name__ == '__main__': | |
343 __main__() |