comparison gstf_preparation.py @ 5:b3ba0c84667c draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
author earlhaminst
date Mon, 16 Apr 2018 14:05:09 -0400
parents 284f64ad9d43
children 56bbdbfe3eaa
comparison
equal deleted inserted replaced
4:284f64ad9d43 5:b3ba0c84667c
170 derived_translation_end = None 170 derived_translation_end = None
171 if transcript_id in cds_parent_dict: 171 if transcript_id in cds_parent_dict:
172 cds_list = cds_parent_dict[transcript_id] 172 cds_list = cds_parent_dict[transcript_id]
173 cds_ids = set(_['id'] for _ in cds_list) 173 cds_ids = set(_['id'] for _ in cds_list)
174 if len(cds_ids) > 1: 174 if len(cds_ids) > 1:
175 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent) 175 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
176 translation['id'] = cds_ids.pop() 176 cds_id = cds_ids.pop()
177 translation['id'] = cds_id
177 cds_list.sort(key=lambda _: _['start']) 178 cds_list.sort(key=lambda _: _['start'])
178 translation['CDS'] = cds_list 179 translation['CDS'] = cds_list
179 translation['start'] = cds_list[0]['start'] 180 translation['start'] = cds_list[0]['start']
180 translation['end'] = cds_list[-1]['end'] 181 translation['end'] = cds_list[-1]['end']
181 found_cds = True 182 found_cds = True
194 else: 195 else:
195 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 196 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
196 if derived_translation_start is not None: 197 if derived_translation_start is not None:
197 if found_cds: 198 if found_cds:
198 if derived_translation_start > translation['start']: 199 if derived_translation_start > translation['start']:
199 raise Exception("UTR overlaps with CDS") 200 raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id))
200 else: 201 else:
201 translation['start'] = derived_translation_start 202 translation['start'] = derived_translation_start
202 if derived_translation_end is not None: 203 if derived_translation_end is not None:
203 if found_cds: 204 if found_cds:
204 if derived_translation_end < translation['end']: 205 if derived_translation_end < translation['end']:
205 raise Exception("UTR overlaps with CDS") 206 raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id))
206 else: 207 else:
207 translation['end'] = derived_translation_end 208 translation['end'] = derived_translation_end
208 if found_cds or derived_translation_start is not None or derived_translation_end is not None: 209 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
209 transcript['Translation'] = translation 210 transcript['Translation'] = translation
210 211
298 transcript_dict = dict() 299 transcript_dict = dict()
299 exon_parent_dict = dict() 300 exon_parent_dict = dict()
300 cds_parent_dict = dict() 301 cds_parent_dict = dict()
301 five_prime_utr_parent_dict = dict() 302 five_prime_utr_parent_dict = dict()
302 three_prime_utr_parent_dict = dict() 303 three_prime_utr_parent_dict = dict()
304 unimplemented_feature_nlines_dict = dict()
303 305
304 with open(filename) as f: 306 with open(filename) as f:
305 for i, line in enumerate(f, start=1): 307 for i, line in enumerate(f, start=1):
306 line = line.strip() 308 line = line.strip()
307 if not line: 309 if not line:
325 feature_to_dict(cols, five_prime_utr_parent_dict) 327 feature_to_dict(cols, five_prime_utr_parent_dict)
326 elif feature_type == 'three_prime_UTR': 328 elif feature_type == 'three_prime_UTR':
327 feature_to_dict(cols, three_prime_utr_parent_dict) 329 feature_to_dict(cols, three_prime_utr_parent_dict)
328 elif feature_type == 'CDS': 330 elif feature_type == 'CDS':
329 add_cds_to_dict(cols, cds_parent_dict) 331 add_cds_to_dict(cols, cds_parent_dict)
332 elif feature_type in unimplemented_feature_nlines_dict:
333 unimplemented_feature_nlines_dict[feature_type] += 1
330 else: 334 else:
331 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) 335 unimplemented_feature_nlines_dict[feature_type] = 0
332 except Exception as e: 336 except Exception as e:
333 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) 337 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
338
339 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
340 print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
334 341
335 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) 342 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
336 write_gene_dict_to_db(conn, gene_dict) 343 write_gene_dict_to_db(conn, gene_dict)
337 344
338 for json_arg in options.json: 345 for json_arg in options.json:
346 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id 353 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
347 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) 354 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
348 355
349 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) 356 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
350 if not gene_id: 357 if not gene_id:
358 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
351 continue 359 continue
352 360
353 if gene_id in gene_transcripts_dict: 361 if gene_id in gene_transcripts_dict:
354 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) 362 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
355 else: 363 else:
367 if options.longestCDS and transcript_id not in selected_transcript_ids: 375 if options.longestCDS and transcript_id not in selected_transcript_ids:
368 continue 376 continue
369 377
370 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) 378 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
371 if not species_for_transcript: 379 if not species_for_transcript:
372 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) 380 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
373 continue 381 continue
374 382
375 if options.headers: 383 if options.headers:
376 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest 384 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
377 # Remove any underscore in the species 385 # Remove any underscore in the species