comparison gstf_preparation.py @ 13:51a7a2a82902 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
author earlhaminst
date Tue, 06 Oct 2020 17:10:37 +0000
parents 99bae410128c
children 598e9172b8e7
comparison
equal deleted inserted replaced
12:99bae410128c 13:51a7a2a82902
1 from __future__ import print_function
2
3 import json 1 import json
4 import optparse 2 import optparse
5 import os 3 import os
6 import sqlite3 4 import sqlite3
7 import sys 5 import sys
8 6
9 version = "0.5.0" 7 version = "0.5.0"
10 gene_count = 0 8 gene_count = 0
11 9
12 10
13 class Sequence(object): 11 def asbool(val):
12 if isinstance(val, str):
13 val_lower = val.strip().lower()
14 if val_lower in ('true', '1'):
15 return True
16 elif val_lower in ('false', '0'):
17 return False
18 else:
19 raise ValueError(f"Cannot convert {val} to bool")
20 else:
21 return bool(val)
22
23
24 class Sequence:
14 def __init__(self, header, sequence_parts): 25 def __init__(self, header, sequence_parts):
15 self.header = header 26 self.header = header
16 self.sequence_parts = sequence_parts 27 self.sequence_parts = sequence_parts
17 self._sequence = None 28 self._sequence = None
18 29
202 found_cds = False 213 found_cds = False
203 derived_translation_start = None 214 derived_translation_start = None
204 derived_translation_end = None 215 derived_translation_end = None
205 if transcript_id in cds_parent_dict: 216 if transcript_id in cds_parent_dict:
206 cds_list = cds_parent_dict[transcript_id] 217 cds_list = cds_parent_dict[transcript_id]
207 cds_ids = set(_['id'] for _ in cds_list) 218 cds_ids = {_['id'] for _ in cds_list}
208 if len(cds_ids) > 1: 219 if len(cds_ids) > 1:
209 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) 220 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
210 cds_id = cds_ids.pop() 221 cds_id = cds_ids.pop()
211 translation['id'] = cds_id 222 translation['id'] = cds_id
212 cds_list.sort(key=lambda _: _['start']) 223 cds_list.sort(key=lambda _: _['start'])
229 else: 240 else:
230 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 241 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
231 if derived_translation_start is not None: 242 if derived_translation_start is not None:
232 if found_cds: 243 if found_cds:
233 if derived_translation_start > translation['start']: 244 if derived_translation_start > translation['start']:
234 raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id)) 245 raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end")
235 else: 246 else:
236 translation['start'] = derived_translation_start 247 translation['start'] = derived_translation_start
237 if derived_translation_end is not None: 248 if derived_translation_end is not None:
238 if found_cds: 249 if found_cds:
239 if derived_translation_end < translation['end']: 250 if derived_translation_end < translation['end']:
240 raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id)) 251 raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start")
241 else: 252 else:
242 translation['end'] = derived_translation_end 253 translation['end'] = derived_translation_end
243 if found_cds or derived_translation_start is not None or derived_translation_end is not None: 254 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
244 transcript['Translation'] = translation 255 transcript['Translation'] = translation
245 256
257 for gene in gene_dict.values(): 268 for gene in gene_dict.values():
258 if gene is None: 269 if gene is None:
259 # This can happen when loading a JSON file from Ensembl 270 # This can happen when loading a JSON file from Ensembl
260 continue 271 continue
261 if 'confidence' in gene and gene['confidence'].lower() != 'high': 272 if 'confidence' in gene and gene['confidence'].lower() != 'high':
262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) 273 print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr)
263 continue 274 continue
264 gene_id = gene['id'] 275 gene_id = gene['id']
265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', 276 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) 277 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene)))
267 278
269 for transcript in gene["Transcript"]: 280 for transcript in gene["Transcript"]:
270 transcript_id = transcript['id'] 281 transcript_id = transcript['id']
271 transcript_symbol = transcript.get('display_name') 282 transcript_symbol = transcript.get('display_name')
272 protein_id = transcript.get('Translation', {}).get('id') 283 protein_id = transcript.get('Translation', {}).get('id')
273 biotype = transcript.get('biotype') 284 biotype = transcript.get('biotype')
274 is_canonical = transcript.get('is_canonical', False) 285 is_canonical = asbool(transcript.get('is_canonical', False))
275 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) 286 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id)
276 try: 287 try:
277 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', 288 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)',
278 to_insert) 289 to_insert)
279 except Exception as e: 290 except Exception as e:
280 raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e)) 291 raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e))
281 292
282 conn.commit() 293 conn.commit()
283 294
284 295
285 def remove_id_version(s, force=False): 296 def remove_id_version(s, force=False):
395 if transcript: 406 if transcript:
396 force_remove_id_version = True 407 force_remove_id_version = True
397 force_remove_id_version_file_list.append(fasta_arg) 408 force_remove_id_version_file_list.append(fasta_arg)
398 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) 409 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
399 if not transcript: 410 if not transcript:
400 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 411 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
401 continue 412 continue
402 if options.filter != 'canonical': 413 if options.filter != 'canonical':
403 break 414 break
404 found_gene_transcript = True 415 found_gene_transcript = True
405 416
432 for entry in FASTAReader_gen(fasta_arg): 443 for entry in FASTAReader_gen(fasta_arg):
433 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) 444 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
434 445
435 transcript = fetch_transcript_and_gene(conn, transcript_id) 446 transcript = fetch_transcript_and_gene(conn, transcript_id)
436 if not transcript: 447 if not transcript:
437 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 448 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
438 continue 449 continue
439 450
440 if options.filter == 'canonical': 451 if options.filter == 'canonical':
441 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict 452 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict
442 if transcript_id not in selected_transcript_ids: 453 if transcript_id not in selected_transcript_ids:
443 continue 454 continue
444 elif options.filter == 'coding': 455 elif options.filter == 'coding':
445 if len(entry.sequence) % 3 != 0: 456 if len(entry.sequence) % 3 != 0:
446 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr) 457 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr)
447 continue 458 continue
448 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene 459 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene
449 if transcript_biotype and transcript_biotype != 'protein_coding': 460 if transcript_biotype and transcript_biotype != 'protein_coding':
450 print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr) 461 print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr)
451 continue 462 continue
452 463
453 if options.headers == "TranscriptId_species": 464 if options.headers == "TranscriptId_species":
454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest 465 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
455 # Remove any underscore in the species 466 # Remove any underscore in the species
456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) 467 entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', ''))
457 elif options.headers == "TranscriptID-GeneSymbol_species": 468 elif options.headers == "TranscriptID-GeneSymbol_species":
458 # Remove any underscore in the species 469 # Remove any underscore in the species
459 entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) 470 entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', ''))
460 elif options.headers == "TranscriptID-TranscriptSymbol_species": 471 elif options.headers == "TranscriptID-TranscriptSymbol_species":
461 # Remove any underscore in the species 472 # Remove any underscore in the species
462 entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) 473 entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', ''))
463 474
464 if transcript['seq_region_name'].lower() in regions: 475 if transcript['seq_region_name'].lower() in regions:
465 entry.print(filtered_fasta_file) 476 entry.print(filtered_fasta_file)
466 else: 477 else:
467 entry.print(output_fasta_file) 478 entry.print(output_fasta_file)