Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 5:b3ba0c84667c draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
author | earlhaminst |
---|---|
date | Mon, 16 Apr 2018 14:05:09 -0400 |
parents | 284f64ad9d43 |
children | 56bbdbfe3eaa |
comparison
equal
deleted
inserted
replaced
4:284f64ad9d43 | 5:b3ba0c84667c |
---|---|
170 derived_translation_end = None | 170 derived_translation_end = None |
171 if transcript_id in cds_parent_dict: | 171 if transcript_id in cds_parent_dict: |
172 cds_list = cds_parent_dict[transcript_id] | 172 cds_list = cds_parent_dict[transcript_id] |
173 cds_ids = set(_['id'] for _ in cds_list) | 173 cds_ids = set(_['id'] for _ in cds_list) |
174 if len(cds_ids) > 1: | 174 if len(cds_ids) > 1: |
175 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent) | 175 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) |
176 translation['id'] = cds_ids.pop() | 176 cds_id = cds_ids.pop() |
177 translation['id'] = cds_id | |
177 cds_list.sort(key=lambda _: _['start']) | 178 cds_list.sort(key=lambda _: _['start']) |
178 translation['CDS'] = cds_list | 179 translation['CDS'] = cds_list |
179 translation['start'] = cds_list[0]['start'] | 180 translation['start'] = cds_list[0]['start'] |
180 translation['end'] = cds_list[-1]['end'] | 181 translation['end'] = cds_list[-1]['end'] |
181 found_cds = True | 182 found_cds = True |
194 else: | 195 else: |
195 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 | 196 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 |
196 if derived_translation_start is not None: | 197 if derived_translation_start is not None: |
197 if found_cds: | 198 if found_cds: |
198 if derived_translation_start > translation['start']: | 199 if derived_translation_start > translation['start']: |
199 raise Exception("UTR overlaps with CDS") | 200 raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id)) |
200 else: | 201 else: |
201 translation['start'] = derived_translation_start | 202 translation['start'] = derived_translation_start |
202 if derived_translation_end is not None: | 203 if derived_translation_end is not None: |
203 if found_cds: | 204 if found_cds: |
204 if derived_translation_end < translation['end']: | 205 if derived_translation_end < translation['end']: |
205 raise Exception("UTR overlaps with CDS") | 206 raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id)) |
206 else: | 207 else: |
207 translation['end'] = derived_translation_end | 208 translation['end'] = derived_translation_end |
208 if found_cds or derived_translation_start is not None or derived_translation_end is not None: | 209 if found_cds or derived_translation_start is not None or derived_translation_end is not None: |
209 transcript['Translation'] = translation | 210 transcript['Translation'] = translation |
210 | 211 |
298 transcript_dict = dict() | 299 transcript_dict = dict() |
299 exon_parent_dict = dict() | 300 exon_parent_dict = dict() |
300 cds_parent_dict = dict() | 301 cds_parent_dict = dict() |
301 five_prime_utr_parent_dict = dict() | 302 five_prime_utr_parent_dict = dict() |
302 three_prime_utr_parent_dict = dict() | 303 three_prime_utr_parent_dict = dict() |
304 unimplemented_feature_nlines_dict = dict() | |
303 | 305 |
304 with open(filename) as f: | 306 with open(filename) as f: |
305 for i, line in enumerate(f, start=1): | 307 for i, line in enumerate(f, start=1): |
306 line = line.strip() | 308 line = line.strip() |
307 if not line: | 309 if not line: |
325 feature_to_dict(cols, five_prime_utr_parent_dict) | 327 feature_to_dict(cols, five_prime_utr_parent_dict) |
326 elif feature_type == 'three_prime_UTR': | 328 elif feature_type == 'three_prime_UTR': |
327 feature_to_dict(cols, three_prime_utr_parent_dict) | 329 feature_to_dict(cols, three_prime_utr_parent_dict) |
328 elif feature_type == 'CDS': | 330 elif feature_type == 'CDS': |
329 add_cds_to_dict(cols, cds_parent_dict) | 331 add_cds_to_dict(cols, cds_parent_dict) |
332 elif feature_type in unimplemented_feature_nlines_dict: | |
333 unimplemented_feature_nlines_dict[feature_type] += 1 | |
330 else: | 334 else: |
331 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) | 335 unimplemented_feature_nlines_dict[feature_type] = 0 |
332 except Exception as e: | 336 except Exception as e: |
333 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) | 337 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) |
338 | |
339 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): | |
340 print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) | |
334 | 341 |
335 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) | 342 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
336 write_gene_dict_to_db(conn, gene_dict) | 343 write_gene_dict_to_db(conn, gene_dict) |
337 | 344 |
338 for json_arg in options.json: | 345 for json_arg in options.json: |
346 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id | 353 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id |
347 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | 354 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
348 | 355 |
349 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | 356 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) |
350 if not gene_id: | 357 if not gene_id: |
358 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | |
351 continue | 359 continue |
352 | 360 |
353 if gene_id in gene_transcripts_dict: | 361 if gene_id in gene_transcripts_dict: |
354 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) | 362 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) |
355 else: | 363 else: |
367 if options.longestCDS and transcript_id not in selected_transcript_ids: | 375 if options.longestCDS and transcript_id not in selected_transcript_ids: |
368 continue | 376 continue |
369 | 377 |
370 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) | 378 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) |
371 if not species_for_transcript: | 379 if not species_for_transcript: |
372 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) | 380 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
373 continue | 381 continue |
374 | 382 |
375 if options.headers: | 383 if options.headers: |
376 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 384 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
377 # Remove any underscore in the species | 385 # Remove any underscore in the species |