Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 12:99bae410128c draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
author | earlhaminst |
---|---|
date | Mon, 05 Oct 2020 13:33:59 +0000 |
parents | dbe37a658cd2 |
children | 51a7a2a82902 |
comparison legend:
- equal
- deleted
- inserted
- replaced
11:dbe37a658cd2 | 12:99bae410128c |
---|---|
256 | 256 |
257 for gene in gene_dict.values(): | 257 for gene in gene_dict.values(): |
258 if gene is None: | 258 if gene is None: |
259 # This can happen when loading a JSON file from Ensembl | 259 # This can happen when loading a JSON file from Ensembl |
260 continue | 260 continue |
261 if 'confidence' in gene and gene['confidence'] != 'high': | 261 if 'confidence' in gene and gene['confidence'].lower() != 'high': |
262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) | 262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) |
263 continue | 263 continue |
264 gene_id = gene['id'] | 264 gene_id = gene['id'] |
265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', | 265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', |
266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) | 266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) |
298 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | 298 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
299 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | 299 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
300 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') | 300 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') |
301 parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep') | 301 parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep') |
302 parser.add_option('--headers', type='choice', | 302 parser.add_option('--headers', type='choice', |
303 choices=['TranscriptId_species', 'GeneSymbol-TranscriptID_species', 'TranscriptSymbol-TranscriptID_species', ''], | 303 choices=['TranscriptId_species', 'TranscriptID-GeneSymbol_species', 'TranscriptID-TranscriptSymbol_species', ''], |
304 default='', help='Change the header line of the FASTA sequences to this format') | 304 default='', help='Change the header line of the FASTA sequences to this format') |
305 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') | 305 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') |
306 parser.add_option('-o', '--output', help='Path of the output SQLite file') | 306 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
307 parser.add_option('--of', help='Path of the output FASTA file') | 307 parser.add_option('--of', help='Path of the output FASTA file') |
308 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') | 308 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') |
452 | 452 |
453 if options.headers == "TranscriptId_species": | 453 if options.headers == "TranscriptId_species": |
454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
455 # Remove any underscore in the species | 455 # Remove any underscore in the species |
456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) | 456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) |
457 elif options.headers == "GeneSymbol-TranscriptID_species": | 457 elif options.headers == "TranscriptID-GeneSymbol_species": |
458 # Remove any underscore in the species | 458 # Remove any underscore in the species |
459 entry.header = ">%s-%s_%s" % (transcript['gene_symbol'], transcript_id, transcript['species'].replace('_', '')) | 459 entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) |
460 elif options.headers == "TranscriptSymbol-TranscriptID_species": | 460 elif options.headers == "TranscriptID-TranscriptSymbol_species": |
461 # Remove any underscore in the species | 461 # Remove any underscore in the species |
462 entry.header = ">%s-%s_%s" % (transcript['transcript_symbol'], transcript_id, transcript['species'].replace('_', '')) | 462 entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) |
463 | 463 |
464 if transcript['seq_region_name'].lower() in regions: | 464 if transcript['seq_region_name'].lower() in regions: |
465 entry.print(filtered_fasta_file) | 465 entry.print(filtered_fasta_file) |
466 else: | 466 else: |
467 entry.print(output_fasta_file) | 467 entry.print(output_fasta_file) |