Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.py @ 0:28879ca33b5f draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
author | earlhaminst |
---|---|
date | Wed, 15 Mar 2017 20:18:57 -0400 |
parents | |
children | a36645976342 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gstf_preparation.py Wed Mar 15 20:18:57 2017 -0400 @@ -0,0 +1,343 @@ +from __future__ import print_function + +import collections +import json +import optparse +import sqlite3 +import sys + +version = "0.3.0" +gene_count = 0 + +Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) + + +def FASTAReader_gen(fasta_filename): + with open(fasta_filename) as fasta_file: + line = fasta_file.readline() + while True: + if not line: + return + assert line.startswith('>'), "FASTA headers must start with >" + header = line.rstrip() + sequence_parts = [] + line = fasta_file.readline() + while line and line[0] != '>': + sequence_parts.append(line.rstrip()) + line = fasta_file.readline() + sequence = "\n".join(sequence_parts) + yield Sequence(header, sequence) + + +def create_tables(conn): + cur = conn.cursor() + + cur.execute('''CREATE TABLE meta ( + version VARCHAR PRIMARY KEY NOT NULL)''') + + cur.execute('INSERT INTO meta (version) VALUES (?)', + (version, )) + + cur.execute('''CREATE TABLE gene ( + gene_id VARCHAR PRIMARY KEY NOT NULL, + gene_symbol VARCHAR, + species VARCHAR NOT NULL, + gene_json VARCHAR NOT NULL)''') + cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') + + cur.execute('''CREATE TABLE transcript ( + transcript_id VARCHAR PRIMARY KEY NOT NULL, + protein_id VARCHAR UNIQUE, + protein_sequence VARCHAR, + gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') + + cur.execute('''CREATE VIEW transcript_species AS + SELECT transcript_id, species + FROM transcript JOIN gene + ON transcript.gene_id = gene.gene_id''') + + conn.commit() + + +def remove_type_from_list_of_ids(l): + return ','.join(remove_type_from_id(_) for _ in l.split(',')) + + +def remove_type_from_id(id_): + colon_index = id_.find(':') + if colon_index >= 0: + return id_[colon_index + 1:] + else: + return id_ + + +def feature_to_dict(cols, parent_dict=None): + d = { + 'end': int(cols[4]), + 'start': int(cols[3]), + } + for attr in cols[8].split(';'): + if '=' in attr: + (tag, value) = attr.split('=') + if tag == 'ID': + tag = 'id' + value = remove_type_from_id(value) + elif tag == 'Parent': + value = remove_type_from_list_of_ids(value) + d[tag] = value + if cols[6] == '+': + d['strand'] = 1 + elif cols[6] == '-': + d['strand'] = -1 + else: + raise Exception("Unrecognized strand '%s'" % cols[6]) + if parent_dict is not None and 'Parent' in d: + # a 3' UTR can be split among multiple exons + # a 5' UTR can be split among multiple exons + # a CDS can be part of multiple transcripts + for parent in d['Parent'].split(','): + if parent not in parent_dict: + parent_dict[parent] = [d] + else: + parent_dict[parent].append(d) + return d + + +def add_gene_to_dict(cols, species, gene_dict): + global gene_count + gene = feature_to_dict(cols) + gene.update({ + 'member_id': gene_count, + 'object_type': 'Gene', + 'seq_region_name': cols[0], + 'species': species, + 'Transcript': [], + 'display_name': gene['Name'] + }) + if gene['id']: + gene_dict[gene['id']] = gene + gene_count = gene_count + 1 + + +def add_transcript_to_dict(cols, species, transcript_dict): + transcript = feature_to_dict(cols) + transcript.update({ + 'object_type': 'Transcript', + 'seq_region_name': cols[0], + 'species': species, + }) + transcript_dict[transcript['id']] = transcript + + +def add_exon_to_dict(cols, species, exon_parent_dict): + exon = feature_to_dict(cols, exon_parent_dict) + exon.update({ + 'length': int(cols[4]) - int(cols[3]) + 1, + 'object_type': 'Exon', + 'seq_region_name': cols[0], + 'species': species, + }) + if 'id' not in exon and 'Name' in exon: + exon['id'] = exon['Name'] + + +def add_cds_to_dict(cols, cds_parent_dict): + cds = feature_to_dict(cols, cds_parent_dict) + if 'id' not in cds: + if 'Name' in cds: + cds['id'] = cds['Name'] + elif 'Parent' in cds and ',' not in cds['Parent']: + cds['id'] = cds['Parent'] + + +def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict): + for parent, exon_list in exon_parent_dict.items(): + if parent in transcript_dict: + exon_list.sort(key=lambda _: _['start']) + transcript_dict[parent]['Exon'] = exon_list + + for transcript_id, transcript in transcript_dict.items(): + translation = { + 'CDS': [], + 'id': None, + 'end': transcript['end'], + 'object_type': 'Translation', + 'species': transcript['species'], + 'start': transcript['start'], + } + found_cds = False + derived_translation_start = None + derived_translation_end = None + if transcript_id in cds_parent_dict: + cds_list = cds_parent_dict[transcript_id] + cds_ids = set(_['id'] for _ in cds_list) + if len(cds_ids) > 1: + raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent) + translation['id'] = cds_ids.pop() + cds_list.sort(key=lambda _: _['start']) + translation['CDS'] = cds_list + translation['start'] = cds_list[0]['start'] + translation['end'] = cds_list[-1]['end'] + found_cds = True + if transcript_id in five_prime_utr_parent_dict: + five_prime_utr_list = five_prime_utr_parent_dict[transcript_id] + five_prime_utr_list.sort(key=lambda _: _['start']) + if transcript['strand'] == 1: + derived_translation_start = five_prime_utr_list[-1]['end'] + 1 + else: + derived_translation_end = five_prime_utr_list[0]['start'] - 1 + if transcript_id in three_prime_utr_parent_dict: + three_prime_utr_list = three_prime_utr_parent_dict[transcript_id] + three_prime_utr_list.sort(key=lambda _: _['start']) + if transcript['strand'] == 1: + derived_translation_end = three_prime_utr_list[0]['start'] - 1 + else: + derived_translation_start = three_prime_utr_list[-1]['end'] + 1 + if derived_translation_start is not None: + if found_cds: + if derived_translation_start > translation['start']: + raise Exception("UTR overlaps with CDS") + else: + translation['start'] = derived_translation_start + if derived_translation_end is not None: + if found_cds: + if derived_translation_end < translation['end']: + raise Exception("UTR overlaps with CDS") + else: + translation['end'] = derived_translation_end + if found_cds or derived_translation_start is not None or derived_translation_end is not None: + transcript['Translation'] = translation + + for transcript in transcript_dict.values(): + if 'Parent' in transcript: + # A polycistronic transcript can have multiple parents + for parent in transcript['Parent'].split(','): + if parent in gene_dict: + gene_dict[parent]['Transcript'].append(transcript) + + +def write_gene_dict_to_db(conn, gene_dict): + cur = conn.cursor() + + for gene in gene_dict.values(): + gene_id = gene['id'] + cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)', + (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene))) + + if "Transcript" in gene: + for transcript in gene["Transcript"]: + transcript_id = transcript['id'] + protein_id = transcript.get('Translation', {}).get('id', None) + try: + cur.execute('INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)', + (transcript_id, protein_id, gene_id)) + except Exception as e: + raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) + + conn.commit() + + +def fetch_species_for_transcript(conn, transcript_id): + cur = conn.cursor() + + cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?', + (transcript_id, )) + results = cur.fetchone() + if not results: + return None + return results[0] + + +def remove_id_version(s): + """ + Remove the optional '.VERSION' from an Ensembl id. + """ + if s.startswith('ENS'): + return s.split('.')[0] + else: + return s + + +def __main__(): + parser = optparse.OptionParser() + parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') + parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') + parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') + parser.add_option('-o', '--output', help='Path of the output SQLite file') + parser.add_option('--of', help='Path of the output FASTA file') + options, args = parser.parse_args() + if args: + raise Exception('Use options to provide inputs') + + conn = sqlite3.connect(options.output) + conn.execute('PRAGMA foreign_keys = ON') + create_tables(conn) + + for gff3_arg in options.gff3: + try: + (species, filename) = gff3_arg.split(':') + except ValueError: + raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) + gene_dict = dict() + transcript_dict = dict() + exon_parent_dict = dict() + cds_parent_dict = dict() + five_prime_utr_parent_dict = dict() + three_prime_utr_parent_dict = dict() + with open(filename) as f: + for i, line in enumerate(f, start=1): + line = line.strip() + if not line: + # skip empty lines + continue + if line[0] == '#': + # skip comment lines + continue + cols = line.split('\t') + if len(cols) != 9: + raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) + feature_type = cols[2] + try: + if feature_type == 'gene': + add_gene_to_dict(cols, species, gene_dict) + elif feature_type in ('mRNA', 'transcript'): + add_transcript_to_dict(cols, species, transcript_dict) + elif feature_type == 'exon': + add_exon_to_dict(cols, species, exon_parent_dict) + elif feature_type == 'five_prime_UTR': + feature_to_dict(cols, five_prime_utr_parent_dict) + elif feature_type == 'three_prime_UTR': + feature_to_dict(cols, three_prime_utr_parent_dict) + elif feature_type == 'CDS': + add_cds_to_dict(cols, cds_parent_dict) + else: + print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) + except Exception as e: + print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) + + join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) + write_gene_dict_to_db(conn, gene_dict) + + for json_arg in options.json: + with open(json_arg) as f: + write_gene_dict_to_db(conn, json.load(f)) + + with open(options.of, 'w') as output_fasta_file: + for fasta_arg in options.fasta: + for entry in FASTAReader_gen(fasta_arg): + # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id + transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + species_for_transcript = fetch_species_for_transcript(conn, transcript_id) + if not species_for_transcript: + print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) + continue + # Remove any underscore in the species + species_for_transcript = species_for_transcript.replace('_', '') + # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest + output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) + + conn.close() + + +if __name__ == '__main__': + __main__()