view gstf_preparation.py @ 15:9c62ad7dd113 draft default tip

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
author earlhaminst
date Thu, 29 Apr 2021 19:46:38 +0000
parents 598e9172b8e7
children
line wrap: on
line source

import json
import optparse
import os
import sqlite3
import sys

version = "0.5.0"
gene_count = 0


def asbool(val):
    """Coerce *val* to a bool.

    Strings are matched case-insensitively against 'true'/'1' and
    'false'/'0'; any other string raises ValueError. Non-strings are
    passed through ``bool()``.
    """
    if not isinstance(val, str):
        return bool(val)
    normalized = val.strip().lower()
    if normalized in ('true', '1'):
        return True
    if normalized in ('false', '0'):
        return False
    raise ValueError(f"Cannot convert {val} to bool")


class Sequence:
    """One FASTA record: a '>' header line plus the sequence's wrapped lines."""

    def __init__(self, header, sequence_parts):
        self.header = header
        self.sequence_parts = sequence_parts
        # Joined sequence, computed lazily on first access
        self._sequence = None

    @property
    def sequence(self):
        """Full sequence as one string; joined once and cached."""
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=sys.stdout):
        """Write the record in FASTA format (header, then the original lines) to *fh*."""
        print(self.header, file=fh)
        for part in self.sequence_parts:
            print(part, file=fh)


def FASTAReader_gen(fasta_filename):
    """Yield a Sequence for each record in the FASTA file *fasta_filename*.

    Raises:
        ValueError: if a record does not start with a '>' header line.
            (This was previously an ``assert``, which is silently skipped
            when Python runs with the ``-O`` flag; a real exception makes
            the validation unconditional.)
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while True:
            if not line:
                return
            if not line.startswith('>'):
                raise ValueError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, sequence_parts)


def create_tables(conn):
    """Create the meta, gene and transcript tables (and a helper view) in *conn*.

    Also records the producer version in the meta table so that consumers
    can check schema compatibility.
    """
    cursor = conn.cursor()

    # Single-row table holding the schema/producer version
    cursor.execute('''CREATE TABLE meta (
        version VARCHAR PRIMARY KEY NOT NULL)''')
    cursor.execute('INSERT INTO meta (version) VALUES (?)',
                   (version, ))

    # One row per gene; the full gene structure is kept as JSON in gene_json
    cursor.execute('''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        seq_region_name VARCHAR NOT NULL,
        seq_region_start INTEGER NOT NULL,
        seq_region_end INTEGER NOT NULL,
        seq_region_strand INTEGER NOT NULL,
        species VARCHAR NOT NULL,
        biotype VARCHAR,
        gene_json VARCHAR NOT NULL)''')
    cursor.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')

    # One row per transcript, linked to its gene via a foreign key
    cursor.execute('''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        transcript_symbol VARCHAR,
        protein_id VARCHAR UNIQUE,
        protein_sequence VARCHAR,
        biotype VARCHAR,
        is_canonical BOOLEAN NOT NULL DEFAULT FALSE,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')

    # The following temporary view is not used in GAFA, so schema changes to it
    # don't require a meta version upgrade.
    cursor.execute('''CREATE TEMPORARY VIEW transcript_join_gene AS
        SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species
        FROM transcript JOIN gene
        USING (gene_id)''')

    conn.commit()


def fetch_transcript_and_gene(conn, transcript_id):
    """Return the transcript_join_gene row for *transcript_id*, or None if absent."""
    result = conn.cursor().execute(
        'SELECT * FROM transcript_join_gene WHERE transcript_id=?',
        (transcript_id, ))
    return result.fetchone()


def remove_type_from_list_of_ids(ids):
    """Strip the optional 'type:' prefix from each id in a comma-separated list."""
    cleaned = []
    for single_id in ids.split(','):
        colon_index = single_id.find(':')
        # Keep only what follows the first colon; ids without one pass through
        cleaned.append(single_id[colon_index + 1:] if colon_index >= 0 else single_id)
    return ','.join(cleaned)


def remove_type_from_id(id_):
    """Strip an optional 'type:' prefix (up to the first colon) from *id_*."""
    # split(..., 1)[-1] keeps everything after the first colon, or the
    # whole string when there is no colon at all.
    return id_.split(':', 1)[-1]


def feature_to_dict(cols, parent_dict=None):
    """Convert the 9 columns of a GFF3 line into a feature dict.

    The returned dict always has integer 'start', 'end' and 'strand' (+1/-1)
    keys, plus one key per column-9 attribute. The 'ID' attribute is stored
    as 'id' (with any 'type:' prefix removed), 'Parent' values are similarly
    cleaned, and 'representative' is renamed to 'is_canonical'.

    If *parent_dict* is given and the feature has a Parent attribute, the
    feature is appended to parent_dict[parent] for each of its parents.

    Raises:
        Exception: if column 7 is neither '+' nor '-'.
    """
    d = {
        'end': int(cols[4]),
        'start': int(cols[3]),
    }
    for attr in cols[8].split(';'):
        if '=' in attr:
            # Split on the first '=' only: attribute values may themselves
            # contain '=' characters, which would make a plain split() raise
            # ValueError on unpacking.
            (tag, value) = attr.split('=', 1)
            if tag == 'ID':
                tag = 'id'
                value = remove_type_from_id(value)
            elif tag == 'Parent':
                value = remove_type_from_list_of_ids(value)
            elif tag == 'representative':
                tag = 'is_canonical'
            d[tag] = value
    if cols[6] == '+':
        d['strand'] = 1
    elif cols[6] == '-':
        d['strand'] = -1
    else:
        raise Exception(f"Unrecognized strand: {cols[6]}")
    if parent_dict is not None and 'Parent' in d:
        # a 3' UTR can be split among multiple exons
        # a 5' UTR can be split among multiple exons
        # a CDS can be part of multiple transcripts
        for parent in d['Parent'].split(','):
            parent_dict.setdefault(parent, []).append(d)
    return d


def add_gene_to_dict(cols, species, gene_dict):
    """Parse a GFF3 gene line and register it in *gene_dict*, keyed by gene id.

    Each gene also receives a file-wide incremental 'member_id'.

    Raises:
        Exception: if the line has no (non-empty) ID attribute.
    """
    global gene_count
    gene = feature_to_dict(cols)
    # Use .get(): feature_to_dict only sets 'id' when an ID attribute is
    # present, so a plain gene['id'] would raise a bare KeyError instead of
    # the explicit error below.
    if not gene.get('id'):
        raise Exception(f"Id not found among column 9 attribute tags: {cols[8]}")
    gene.update({
        'member_id': gene_count,
        'object_type': 'Gene',
        'seq_region_name': cols[0],
        'species': species,
        'Transcript': [],
        'display_name': gene.get('Name'),
    })
    gene_dict[gene['id']] = gene
    gene_count += 1


def add_transcript_to_dict(cols, species, transcript_dict):
    """Parse a GFF3 mRNA/transcript line and register it in *transcript_dict*, keyed by id."""
    transcript = feature_to_dict(cols)
    transcript['object_type'] = 'Transcript'
    transcript['seq_region_name'] = cols[0]
    transcript['species'] = species
    # display_name is the optional GFF3 Name attribute (None when absent)
    transcript['display_name'] = transcript.get('Name')
    transcript_dict[transcript['id']] = transcript


def add_exon_to_dict(cols, species, exon_parent_dict):
    """Parse a GFF3 exon line and append it to its parents' lists in *exon_parent_dict*."""
    exon = feature_to_dict(cols, exon_parent_dict)
    # GFF3 coordinates are 1-based and inclusive at both ends
    exon['length'] = int(cols[4]) - int(cols[3]) + 1
    exon['object_type'] = 'Exon'
    exon['seq_region_name'] = cols[0]
    exon['species'] = species
    # Fall back to the Name attribute when the exon line has no ID
    if 'id' not in exon and 'Name' in exon:
        exon['id'] = exon['Name']


def add_cds_to_dict(cols, cds_parent_dict):
    """Parse a GFF3 CDS line and append it to its parents' lists in *cds_parent_dict*."""
    cds = feature_to_dict(cols, cds_parent_dict)
    if 'id' in cds:
        return
    # No ID attribute: prefer the Name attribute, otherwise use the Parent
    # id, but only when there is exactly one parent (no ambiguity).
    if 'Name' in cds:
        cds['id'] = cds['Name']
    elif 'Parent' in cds and ',' not in cds['Parent']:
        cds['id'] = cds['Parent']


def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
    """Assemble the per-feature dicts into the nested gene structure.

    Attaches sorted exon lists to their transcripts, builds a Translation
    dict per transcript from its CDS and/or UTR features, and finally
    appends each transcript to its parent gene's 'Transcript' list.
    Modifies gene_dict and transcript_dict in place.

    Raises:
        Exception: if a transcript's CDS lines use more than one CDS ID, or
            if a derived UTR boundary overlaps the CDS coordinates.
    """
    # Attach exons to their transcripts, sorted by genomic start
    for parent, exon_list in exon_parent_dict.items():
        if parent in transcript_dict:
            exon_list.sort(key=lambda _: _['start'])
            transcript_dict[parent]['Exon'] = exon_list

    for transcript_id, transcript in transcript_dict.items():
        # Default translation spans the whole transcript; narrowed below by
        # CDS coordinates or UTR-derived boundaries
        translation = {
            'CDS': [],
            'id': None,
            'end': transcript['end'],
            'object_type': 'Translation',
            'species': transcript['species'],
            'start': transcript['start'],
        }
        found_cds = False
        derived_translation_start = None
        derived_translation_end = None
        if transcript_id in cds_parent_dict:
            cds_list = cds_parent_dict[transcript_id]
            # All CDS lines of one transcript must share a single ID
            unique_cds_ids = {cds['id'] for cds in cds_list}
            if len(unique_cds_ids) > 1:
                msg = f"""Found multiple CDS IDs ({unique_cds_ids}) for transcript '{transcript_id}'.
This is not supported by the Ensembl JSON format. If a CDS is split across
multiple discontinuous genomic locations, the GFF3 standard requires that all
corresponding lines use the same ID attribute."""
                raise Exception(msg)
            cds_id = unique_cds_ids.pop()
            translation['id'] = cds_id
            cds_list.sort(key=lambda _: _['start'])
            translation['CDS'] = cds_list
            # The translation spans from the first CDS start to the last CDS end
            translation['start'] = cds_list[0]['start']
            translation['end'] = cds_list[-1]['end']
            found_cds = True
        if transcript_id in five_prime_utr_parent_dict:
            five_prime_utr_list = five_prime_utr_parent_dict[transcript_id]
            five_prime_utr_list.sort(key=lambda _: _['start'])
            # The 5' UTR bounds the translation on the strand-dependent side:
            # just after its end on the + strand, just before its start on -
            if transcript['strand'] == 1:
                derived_translation_start = five_prime_utr_list[-1]['end'] + 1
            else:
                derived_translation_end = five_prime_utr_list[0]['start'] - 1
        if transcript_id in three_prime_utr_parent_dict:
            three_prime_utr_list = three_prime_utr_parent_dict[transcript_id]
            three_prime_utr_list.sort(key=lambda _: _['start'])
            # Mirror image of the 5' UTR case above
            if transcript['strand'] == 1:
                derived_translation_end = three_prime_utr_list[0]['start'] - 1
            else:
                derived_translation_start = three_prime_utr_list[-1]['end'] + 1
        # UTR-derived boundaries are used as a consistency check when CDS
        # coordinates exist, and as the translation boundaries otherwise
        if derived_translation_start is not None:
            if found_cds:
                if derived_translation_start > translation['start']:
                    raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end")
            else:
                translation['start'] = derived_translation_start
        if derived_translation_end is not None:
            if found_cds:
                if derived_translation_end < translation['end']:
                    raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start")
            else:
                translation['end'] = derived_translation_end
        # Only transcripts with some coding evidence get a Translation
        if found_cds or derived_translation_start is not None or derived_translation_end is not None:
            transcript['Translation'] = translation

    for transcript in transcript_dict.values():
        if 'Parent' in transcript:
            # A polycistronic transcript can have multiple parents
            for parent in transcript['Parent'].split(','):
                if parent in gene_dict:
                    gene_dict[parent]['Transcript'].append(transcript)


def write_gene_dict_to_db(conn, gene_dict):
    """Insert every gene in *gene_dict* (and its transcripts) into the database.

    Genes that are None or carry a 'confidence' attribute other than 'high'
    are skipped with a message on stderr. The full gene structure is stored
    as JSON in the gene_json column. Commits the transaction at the end.

    Raises:
        Exception: if a transcript row cannot be inserted (e.g. duplicate
            transcript_id or protein_id).
    """
    cur = conn.cursor()

    for gene in gene_dict.values():
        if gene is None:
            # This can happen when loading a JSON file from Ensembl
            continue
        # Only genes annotated with high confidence (or no confidence at
        # all) are kept
        if 'confidence' in gene and gene['confidence'].lower() != 'high':
            print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr)
            continue
        gene_id = gene['id']
        cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene)))

        if "Transcript" in gene:
            for transcript in gene["Transcript"]:
                transcript_id = transcript['id']
                transcript_symbol = transcript.get('display_name')
                # protein_id comes from the Translation's CDS id, if any
                protein_id = transcript.get('Translation', {}).get('id')
                biotype = transcript.get('biotype')
                is_canonical = asbool(transcript.get('is_canonical', False))
                to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id)
                try:
                    cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)',
                                to_insert)
                except Exception as e:
                    # Re-raise with the offending row in the message
                    raise Exception(f"Error while inserting {to_insert} into transcript table: {e}")

    conn.commit()


def remove_id_version(s, force=False):
    """
    Remove the optional '.VERSION' from an id if it's an Ensembl id or if
    `force` is True.
    """
    # Ensembl stable ids all start with 'ENS'; partition keeps everything
    # before the first dot (the whole string when there is no dot)
    if not (force or s.startswith('ENS')):
        return s
    return s.partition('.')[0]


def __main__():
    """Command-line entry point.

    Phase 1: parse the --gff3 files into nested gene dicts and write them
    (plus any --json gene dicts) into the output SQLite database.
    Phase 2: scan the --fasta files once to detect whether transcript-id
    versions must be force-stripped per file and, for --filter=canonical,
    collect candidate transcripts per gene.
    Phase 3: re-read the FASTA files, apply the transcript filter, rewrite
    headers as requested and split output by --regions.
    """
    parser = optparse.OptionParser()
    parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
    parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
    parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
    parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep')
    parser.add_option('--headers', type='choice',
                      choices=['TranscriptId_species', 'TranscriptID-GeneSymbol_species', 'TranscriptID-TranscriptSymbol_species', ''],
                      default='', help='Change the header line of the FASTA sequences to this format')
    parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
    parser.add_option('-o', '--output', help='Path of the output SQLite file')
    parser.add_option('--of', help='Path of the output FASTA file')
    # Filtered-out sequences are discarded by default (os.devnull)
    parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file')

    options, args = parser.parse_args()
    if args:
        raise Exception('Use options to provide inputs')

    conn = sqlite3.connect(options.output)
    # Row objects allow access to columns by name in the rest of the script
    conn.row_factory = sqlite3.Row
    conn.execute('PRAGMA foreign_keys = ON')
    create_tables(conn)

    for gff3_arg in options.gff3:
        try:
            (species, filename) = gff3_arg.split(':')
        except ValueError:
            raise Exception(f"Argument for --gff3 '{gff3_arg}' is not in the SPECIES:FILENAME format")
        # Per-file accumulators; parent dicts map a parent feature id to the
        # list of its child features
        gene_dict = dict()
        transcript_dict = dict()
        exon_parent_dict = dict()
        cds_parent_dict = dict()
        five_prime_utr_parent_dict = dict()
        three_prime_utr_parent_dict = dict()
        unimplemented_feature_nlines_dict = dict()

        with open(filename) as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # skip empty lines
                    continue
                if line[0] == '#':
                    # skip comment lines
                    continue
                cols = line.split('\t')
                if len(cols) != 9:
                    raise Exception(f"Line {i} in file '(unknown)': '{line}' does not have 9 columns")
                feature_type = cols[2]
                try:
                    # Dispatch on the GFF3 feature type (column 3)
                    if feature_type == 'gene':
                        add_gene_to_dict(cols, species, gene_dict)
                    elif feature_type in ('mRNA', 'transcript'):
                        add_transcript_to_dict(cols, species, transcript_dict)
                    elif feature_type == 'exon':
                        add_exon_to_dict(cols, species, exon_parent_dict)
                    elif feature_type == 'five_prime_UTR':
                        feature_to_dict(cols, five_prime_utr_parent_dict)
                    elif feature_type == 'three_prime_UTR':
                        feature_to_dict(cols, three_prime_utr_parent_dict)
                    elif feature_type == 'CDS':
                        add_cds_to_dict(cols, cds_parent_dict)
                    elif feature_type in unimplemented_feature_nlines_dict:
                        unimplemented_feature_nlines_dict[feature_type] += 1
                    else:
                        # NOTE(review): the first occurrence is counted as 0,
                        # so the reported line count is one short per feature
                        # type — confirm whether this is intended
                        unimplemented_feature_nlines_dict[feature_type] = 0
                except Exception as e:
                    # Report the bad line but keep processing the file
                    print(f"Line {i} in file '(unknown)': {e}", file=sys.stderr)

        for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
            print(f"Skipped {nlines} lines in GFF3 file '(unknown)': '{unimplemented_feature}' is not an implemented feature type", file=sys.stderr)

        join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
        write_gene_dict_to_db(conn, gene_dict)

    # Each --json file is expected to already contain a gene_id -> gene dict
    for json_arg in options.json:
        with open(json_arg) as f:
            write_gene_dict_to_db(conn, json.load(f))

    # Read the FASTA files a first time to:
    # - determine for each file if we need to force the removal of the version
    #   from the transcript id
    # - fill gene_transcripts_dict when keeping only the canonical transcripts
    force_remove_id_version_file_list = []
    gene_transcripts_dict = dict()
    for fasta_arg in options.fasta:
        force_remove_id_version = False
        found_gene_transcript = False
        for entry in FASTAReader_gen(fasta_arg):
            # Extract the transcript id by removing everything after the first space and then removing the version if needed
            transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)

            transcript = fetch_transcript_and_gene(conn, transcript_id)
            if not transcript and not found_gene_transcript:
                # We have not found a proper gene transcript in this file yet,
                # try to force the removal of the version from the transcript id
                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True)
                transcript = fetch_transcript_and_gene(conn, transcript_id)
                # Remember that we need to force the removal for this file
                if transcript:
                    force_remove_id_version = True
                    force_remove_id_version_file_list.append(fasta_arg)
                    print(f"Forcing removal of id version in FASTA file '{fasta_arg}'", file=sys.stderr)
            if not transcript:
                print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
                continue
            # When not filtering for canonical transcripts, this first pass is
            # only needed to detect version forcing, so stop after the first
            # transcript that was found in the database
            if options.filter != 'canonical':
                break
            found_gene_transcript = True

            # Skip sequences whose length is not a whole number of codons
            if len(entry.sequence) % 3 != 0:
                continue
            transcript_biotype = transcript['biotype']  # This is the biotype of the transcript or, if that is NULL, the one of the gene
            if transcript_biotype and transcript_biotype != 'protein_coding':
                continue
            gene_transcripts_dict.setdefault(transcript['gene_id'], []).append((transcript_id, transcript['is_canonical'], len(entry.sequence)))

    if options.filter == 'canonical':
        # Pick exactly one transcript per gene
        selected_transcript_ids = []
        for gene_id, transcript_tuples in gene_transcripts_dict.items():
            canonical_transcript_ids = [id_ for (id_, is_canonical, _) in transcript_tuples if is_canonical]
            if not canonical_transcript_ids:
                # Select the transcript with the longest sequence. If more than
                # one transcripts have the same longest sequence for a gene, the
                # first one to appear in the FASTA file is selected.
                selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0]
            elif len(canonical_transcript_ids) > 1:
                raise Exception(f"Gene {gene_id} has more than 1 canonical transcripts")
            else:
                selected_transcript_id = canonical_transcript_ids[0]
            selected_transcript_ids.append(selected_transcript_id)

    # NOTE(review): an empty --regions option yields the list [''] here, so
    # the region filter below can never be fully disabled to None — confirm
    # no seq_region_name is ever the empty string
    regions = [_.strip().lower() for _ in options.regions.split(",")]
    with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
        for fasta_arg in options.fasta:
            # Reuse the per-file decision made during the first pass
            force_remove_id_version = fasta_arg in force_remove_id_version_file_list
            for entry in FASTAReader_gen(fasta_arg):
                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)

                transcript = fetch_transcript_and_gene(conn, transcript_id)
                if not transcript:
                    print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
                    continue

                if options.filter == 'canonical':
                    # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict
                    if transcript_id not in selected_transcript_ids:
                        continue
                elif options.filter == 'coding':
                    if len(entry.sequence) % 3 != 0:
                        print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr)
                        continue
                    transcript_biotype = transcript['biotype']  # This is the biotype of the transcript or, if that is NULL, the one of the gene
                    if transcript_biotype and transcript_biotype != 'protein_coding':
                        print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr)
                        continue

                if options.headers == "TranscriptId_species":
                    # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
                    # Remove any underscore in the species
                    entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', ''))
                elif options.headers == "TranscriptID-GeneSymbol_species":
                    # Remove any underscore in the species
                    entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', ''))
                elif options.headers == "TranscriptID-TranscriptSymbol_species":
                    # Remove any underscore in the species
                    entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', ''))

                # Sequences from the requested regions go to the filtered
                # file, everything else to the main output
                if transcript['seq_region_name'].lower() in regions:
                    entry.print(filtered_fasta_file)
                else:
                    entry.print(output_fasta_file)

    conn.close()


if __name__ == '__main__':
    __main__()