Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 13:51a7a2a82902 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
| author | earlhaminst |
|---|---|
| date | Tue, 06 Oct 2020 17:10:37 +0000 |
| parents | 99bae410128c |
| children | 598e9172b8e7 |
comparison
equal
deleted
inserted
replaced
| 12:99bae410128c | 13:51a7a2a82902 |
|---|---|
| 1 from __future__ import print_function | |
| 2 | |
| 3 import json | 1 import json |
| 4 import optparse | 2 import optparse |
| 5 import os | 3 import os |
| 6 import sqlite3 | 4 import sqlite3 |
| 7 import sys | 5 import sys |
| 8 | 6 |
| 9 version = "0.5.0" | 7 version = "0.5.0" |
| 10 gene_count = 0 | 8 gene_count = 0 |
| 11 | 9 |
| 12 | 10 |
| 13 class Sequence(object): | 11 def asbool(val): |
| 12 if isinstance(val, str): | |
| 13 val_lower = val.strip().lower() | |
| 14 if val_lower in ('true', '1'): | |
| 15 return True | |
| 16 elif val_lower in ('false', '0'): | |
| 17 return False | |
| 18 else: | |
| 19 raise ValueError(f"Cannot convert {val} to bool") | |
| 20 else: | |
| 21 return bool(val) | |
| 22 | |
| 23 | |
| 24 class Sequence: | |
| 14 def __init__(self, header, sequence_parts): | 25 def __init__(self, header, sequence_parts): |
| 15 self.header = header | 26 self.header = header |
| 16 self.sequence_parts = sequence_parts | 27 self.sequence_parts = sequence_parts |
| 17 self._sequence = None | 28 self._sequence = None |
| 18 | 29 |
| 202 found_cds = False | 213 found_cds = False |
| 203 derived_translation_start = None | 214 derived_translation_start = None |
| 204 derived_translation_end = None | 215 derived_translation_end = None |
| 205 if transcript_id in cds_parent_dict: | 216 if transcript_id in cds_parent_dict: |
| 206 cds_list = cds_parent_dict[transcript_id] | 217 cds_list = cds_parent_dict[transcript_id] |
| 207 cds_ids = set(_['id'] for _ in cds_list) | 218 cds_ids = {_['id'] for _ in cds_list} |
| 208 if len(cds_ids) > 1: | 219 if len(cds_ids) > 1: |
| 209 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) | 220 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) |
| 210 cds_id = cds_ids.pop() | 221 cds_id = cds_ids.pop() |
| 211 translation['id'] = cds_id | 222 translation['id'] = cds_id |
| 212 cds_list.sort(key=lambda _: _['start']) | 223 cds_list.sort(key=lambda _: _['start']) |
| 229 else: | 240 else: |
| 230 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 | 241 derived_translation_start = three_prime_utr_list[-1]['end'] + 1 |
| 231 if derived_translation_start is not None: | 242 if derived_translation_start is not None: |
| 232 if found_cds: | 243 if found_cds: |
| 233 if derived_translation_start > translation['start']: | 244 if derived_translation_start > translation['start']: |
| 234 raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id)) | 245 raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end") |
| 235 else: | 246 else: |
| 236 translation['start'] = derived_translation_start | 247 translation['start'] = derived_translation_start |
| 237 if derived_translation_end is not None: | 248 if derived_translation_end is not None: |
| 238 if found_cds: | 249 if found_cds: |
| 239 if derived_translation_end < translation['end']: | 250 if derived_translation_end < translation['end']: |
| 240 raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id)) | 251 raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start") |
| 241 else: | 252 else: |
| 242 translation['end'] = derived_translation_end | 253 translation['end'] = derived_translation_end |
| 243 if found_cds or derived_translation_start is not None or derived_translation_end is not None: | 254 if found_cds or derived_translation_start is not None or derived_translation_end is not None: |
| 244 transcript['Translation'] = translation | 255 transcript['Translation'] = translation |
| 245 | 256 |
| 257 for gene in gene_dict.values(): | 268 for gene in gene_dict.values(): |
| 258 if gene is None: | 269 if gene is None: |
| 259 # This can happen when loading a JSON file from Ensembl | 270 # This can happen when loading a JSON file from Ensembl |
| 260 continue | 271 continue |
| 261 if 'confidence' in gene and gene['confidence'].lower() != 'high': | 272 if 'confidence' in gene and gene['confidence'].lower() != 'high': |
| 262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) | 273 print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr) |
| 263 continue | 274 continue |
| 264 gene_id = gene['id'] | 275 gene_id = gene['id'] |
| 265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', | 276 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', |
| 266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) | 277 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) |
| 267 | 278 |
| 269 for transcript in gene["Transcript"]: | 280 for transcript in gene["Transcript"]: |
| 270 transcript_id = transcript['id'] | 281 transcript_id = transcript['id'] |
| 271 transcript_symbol = transcript.get('display_name') | 282 transcript_symbol = transcript.get('display_name') |
| 272 protein_id = transcript.get('Translation', {}).get('id') | 283 protein_id = transcript.get('Translation', {}).get('id') |
| 273 biotype = transcript.get('biotype') | 284 biotype = transcript.get('biotype') |
| 274 is_canonical = transcript.get('is_canonical', False) | 285 is_canonical = asbool(transcript.get('is_canonical', False)) |
| 275 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) | 286 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) |
| 276 try: | 287 try: |
| 277 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', | 288 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', |
| 278 to_insert) | 289 to_insert) |
| 279 except Exception as e: | 290 except Exception as e: |
| 280 raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e)) | 291 raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e)) |
| 281 | 292 |
| 282 conn.commit() | 293 conn.commit() |
| 283 | 294 |
| 284 | 295 |
| 285 def remove_id_version(s, force=False): | 296 def remove_id_version(s, force=False): |
| 395 if transcript: | 406 if transcript: |
| 396 force_remove_id_version = True | 407 force_remove_id_version = True |
| 397 force_remove_id_version_file_list.append(fasta_arg) | 408 force_remove_id_version_file_list.append(fasta_arg) |
| 398 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) | 409 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) |
| 399 if not transcript: | 410 if not transcript: |
| 400 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 411 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
| 401 continue | 412 continue |
| 402 if options.filter != 'canonical': | 413 if options.filter != 'canonical': |
| 403 break | 414 break |
| 404 found_gene_transcript = True | 415 found_gene_transcript = True |
| 405 | 416 |
| 432 for entry in FASTAReader_gen(fasta_arg): | 443 for entry in FASTAReader_gen(fasta_arg): |
| 433 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) | 444 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
| 434 | 445 |
| 435 transcript = fetch_transcript_and_gene(conn, transcript_id) | 446 transcript = fetch_transcript_and_gene(conn, transcript_id) |
| 436 if not transcript: | 447 if not transcript: |
| 437 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 448 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
| 438 continue | 449 continue |
| 439 | 450 |
| 440 if options.filter == 'canonical': | 451 if options.filter == 'canonical': |
| 441 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict | 452 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict |
| 442 if transcript_id not in selected_transcript_ids: | 453 if transcript_id not in selected_transcript_ids: |
| 443 continue | 454 continue |
| 444 elif options.filter == 'coding': | 455 elif options.filter == 'coding': |
| 445 if len(entry.sequence) % 3 != 0: | 456 if len(entry.sequence) % 3 != 0: |
| 446 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr) | 457 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr) |
| 447 continue | 458 continue |
| 448 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene | 459 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene |
| 449 if transcript_biotype and transcript_biotype != 'protein_coding': | 460 if transcript_biotype and transcript_biotype != 'protein_coding': |
| 450 print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr) | 461 print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr) |
| 451 continue | 462 continue |
| 452 | 463 |
| 453 if options.headers == "TranscriptId_species": | 464 if options.headers == "TranscriptId_species": |
| 454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 465 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
| 455 # Remove any underscore in the species | 466 # Remove any underscore in the species |
| 456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) | 467 entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', '')) |
| 457 elif options.headers == "TranscriptID-GeneSymbol_species": | 468 elif options.headers == "TranscriptID-GeneSymbol_species": |
| 458 # Remove any underscore in the species | 469 # Remove any underscore in the species |
| 459 entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) | 470 entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) |
| 460 elif options.headers == "TranscriptID-TranscriptSymbol_species": | 471 elif options.headers == "TranscriptID-TranscriptSymbol_species": |
| 461 # Remove any underscore in the species | 472 # Remove any underscore in the species |
| 462 entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) | 473 entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) |
| 463 | 474 |
| 464 if transcript['seq_region_name'].lower() in regions: | 475 if transcript['seq_region_name'].lower() in regions: |
| 465 entry.print(filtered_fasta_file) | 476 entry.print(filtered_fasta_file) |
| 466 else: | 477 else: |
| 467 entry.print(output_fasta_file) | 478 entry.print(output_fasta_file) |
