Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.py @ 11:dbe37a658cd2 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
author | earlhaminst |
---|---|
date | Sun, 27 Sep 2020 18:54:31 +0000 |
parents | e8e75a79de59 |
children | 99bae410128c |
line wrap: on
line diff
--- a/gstf_preparation.py Thu Oct 31 08:16:51 2019 -0400 +++ b/gstf_preparation.py Sun Sep 27 18:54:31 2020 +0000 @@ -6,7 +6,7 @@ import sqlite3 import sys -version = "0.4.0" +version = "0.5.0" gene_count = 0 @@ -61,25 +61,39 @@ seq_region_end INTEGER NOT NULL, seq_region_strand INTEGER NOT NULL, species VARCHAR NOT NULL, + biotype VARCHAR, gene_json VARCHAR NOT NULL)''') cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') cur.execute('''CREATE TABLE transcript ( transcript_id VARCHAR PRIMARY KEY NOT NULL, + transcript_symbol VARCHAR, protein_id VARCHAR UNIQUE, protein_sequence VARCHAR, + biotype VARCHAR, + is_canonical BOOLEAN NOT NULL DEFAULT FALSE, gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') - cur.execute('''CREATE VIEW transcript_species AS - SELECT transcript_id, species, seq_region_name + # The following temporary view is not used in GAFA, so schema changes to it + # don't require a meta version upgrade. + cur.execute('''CREATE TEMPORARY VIEW transcript_join_gene AS + SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species FROM transcript JOIN gene - ON transcript.gene_id = gene.gene_id''') + USING (gene_id)''') conn.commit() -def remove_type_from_list_of_ids(l): - return ','.join(remove_type_from_id(_) for _ in l.split(',')) +def fetch_transcript_and_gene(conn, transcript_id): + cur = conn.cursor() + + cur.execute('SELECT * FROM transcript_join_gene WHERE transcript_id=?', + (transcript_id, )) + return cur.fetchone() + + +def remove_type_from_list_of_ids(ids): + return ','.join(remove_type_from_id(id_) for id_ in ids.split(',')) def remove_type_from_id(id_): @@ -103,6 +117,8 @@ value = remove_type_from_id(value) elif tag == 'Parent': value = remove_type_from_list_of_ids(value) + elif tag == 'representative': + tag = 'is_canonical' d[tag] = value if cols[6] == '+': d['strand'] = 1 @@ -122,27 +138,27 @@ def add_gene_to_dict(cols, species, gene_dict): global gene_count gene = feature_to_dict(cols) + if not gene['id']: + raise Exception("Id not found among column 9 attribute tags: %s" % cols[8]) gene.update({ 'member_id': gene_count, 'object_type': 'Gene', 'seq_region_name': cols[0], 'species': species, 'Transcript': [], - 'display_name': gene.get('Name', None) + 'display_name': gene.get('Name'), }) - if gene['id']: - gene_dict[gene['id']] = gene - gene_count = gene_count + 1 + gene_dict[gene['id']] = gene + gene_count = gene_count + 1 def add_transcript_to_dict(cols, species, transcript_dict): transcript = feature_to_dict(cols) - if 'biotype' in transcript and transcript['biotype'] != 'protein_coding': - return transcript.update({ 'object_type': 'Transcript', 'seq_region_name': cols[0], 'species': species, + 'display_name': transcript.get('Name'), }) transcript_dict[transcript['id']] = transcript @@ -242,45 +258,30 @@ if gene is None: # This can happen when loading a JSON file from Ensembl continue + if 'confidence' in gene and gene['confidence'] != 'high': + print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) + continue gene_id = gene['id'] - cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)', - (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene))) + cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', + (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene))) if "Transcript" in gene: for transcript in gene["Transcript"]: transcript_id = transcript['id'] - protein_id = transcript.get('Translation', {}).get('id', None) + transcript_symbol = transcript.get('display_name') + protein_id = transcript.get('Translation', {}).get('id') + biotype = transcript.get('biotype') + is_canonical = transcript.get('is_canonical', False) + to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) try: - cur.execute('INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)', - (transcript_id, protein_id, gene_id)) + cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', + to_insert) except Exception as e: - raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) + raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e)) conn.commit() -def fetch_species_and_seq_region_for_transcript(conn, transcript_id): - cur = conn.cursor() - - cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', - (transcript_id, )) - row = cur.fetchone() - if not row: - return (None, None) - return row - - -def fetch_gene_id_for_transcript(conn, transcript_id): - cur = conn.cursor() - - cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', - (transcript_id, )) - row = cur.fetchone() - if not row: - return None - return row[0] - - def remove_id_version(s, force=False): """ Remove the optional '.VERSION' from an id if it's an Ensembl id or if @@ -297,8 +298,10 @@ parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') - parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') - parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') + parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep') + parser.add_option('--headers', type='choice', + choices=['TranscriptId_species', 'GeneSymbol-TranscriptID_species', 'TranscriptSymbol-TranscriptID_species', ''], + default='', help='Change the header line of the FASTA sequences to this format') parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') @@ -309,6 +312,7 @@ raise Exception('Use options to provide inputs') conn = sqlite3.connect(options.output) + conn.row_factory = sqlite3.Row conn.execute('PRAGMA foreign_keys = ON') create_tables(conn) @@ -371,7 +375,7 @@ # Read the FASTA files a first time to: # - determine for each file if we need to force the removal of the version # from the transcript id - # - fill gene_transcripts_dict when keeping only the longest CDS per gene + # - fill gene_transcripts_dict when keeping only the canonical transcripts force_remove_id_version_file_list = [] gene_transcripts_dict = dict() for fasta_arg in options.fasta: @@ -381,35 +385,45 @@ # Extract the transcript id by removing everything after the first space and then removing the version if needed transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) - if len(entry.sequence) % 3 != 0: - continue - - gene_id = fetch_gene_id_for_transcript(conn, transcript_id) - if not gene_id and not found_gene_transcript: + transcript = fetch_transcript_and_gene(conn, transcript_id) + if not transcript and not found_gene_transcript: # We have not found a proper gene transcript in this file yet, # try to force the removal of the version from the transcript id transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True) - gene_id = fetch_gene_id_for_transcript(conn, transcript_id) + transcript = fetch_transcript_and_gene(conn, transcript_id) # Remember that we need to force the removal for this file - if gene_id: + if transcript: force_remove_id_version = True force_remove_id_version_file_list.append(fasta_arg) print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) - if not gene_id: + if not transcript: print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) continue - if options.longestCDS: - found_gene_transcript = True - else: + if options.filter != 'canonical': break + found_gene_transcript = True + + if len(entry.sequence) % 3 != 0: + continue + transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene + if transcript_biotype and transcript_biotype != 'protein_coding': + continue + gene_transcripts_dict.setdefault(transcript['gene_id'], []).append((transcript_id, transcript['is_canonical'], len(entry.sequence))) - gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence))) - - if options.longestCDS: - # For each gene, select the transcript with the longest sequence. - # If more than one transcripts have the same longest sequence for a - # gene, the first one to appear in the FASTA file is selected - selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] + if options.filter == 'canonical': + selected_transcript_ids = [] + for gene_id, transcript_tuples in gene_transcripts_dict.items(): + canonical_transcript_ids = [id_ for (id_, is_canonical, _) in transcript_tuples if is_canonical] + if not canonical_transcript_ids: + # Select the transcript with the longest sequence. If more than + # one transcripts have the same longest sequence for a gene, the + # first one to appear in the FASTA file is selected. + selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0] + elif len(canonical_transcript_ids) > 1: + raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id)) + else: + selected_transcript_id = canonical_transcript_ids[0] + selected_transcript_ids.append(selected_transcript_id) regions = [_.strip().lower() for _ in options.regions.split(",")] with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: @@ -417,24 +431,37 @@ force_remove_id_version = fasta_arg in force_remove_id_version_file_list for entry in FASTAReader_gen(fasta_arg): transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) - if options.longestCDS and transcript_id not in selected_transcript_ids: - continue - if len(entry.sequence) % 3 != 0: - print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) - continue - - species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) - if not species_for_transcript: + transcript = fetch_transcript_and_gene(conn, transcript_id) + if not transcript: print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) continue - if options.headers: + if options.filter == 'canonical': + # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict + if transcript_id not in selected_transcript_ids: + continue + elif options.filter == 'coding': + if len(entry.sequence) % 3 != 0: + print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr) + continue + transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene + if transcript_biotype and transcript_biotype != 'protein_coding': + print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr) + continue + + if options.headers == "TranscriptId_species": # Change the FASTA header to '>TranscriptId_species', as required by TreeBest # Remove any underscore in the species - entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) + entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) + elif options.headers == "GeneSymbol-TranscriptID_species": + # Remove any underscore in the species + entry.header = ">%s-%s_%s" % (transcript['gene_symbol'], transcript_id, transcript['species'].replace('_', '')) + elif options.headers == "TranscriptSymbol-TranscriptID_species": + # Remove any underscore in the species + entry.header = ">%s-%s_%s" % (transcript['transcript_symbol'], transcript_id, transcript['species'].replace('_', '')) - if seq_region_for_transcript.lower() in regions: + if transcript['seq_region_name'].lower() in regions: entry.print(filtered_fasta_file) else: entry.print(output_fasta_file)