Mercurial > repos > earlhaminst > ensembl_longest_cds_per_gene
diff ensembl_longest_cds_per_gene.py @ 0:4dba69135845 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
author | earlhaminst |
---|---|
date | Tue, 07 Mar 2017 05:54:30 -0500 |
parents | |
children | 6cf9f7f6509c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ensembl_longest_cds_per_gene.py Tue Mar 07 05:54:30 2017 -0500 @@ -0,0 +1,78 @@ +""" +This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with +only the longest CDS sequence for each gene. The header of the sequences in the +output file will be the transcript id without version. +""" +from __future__ import print_function + +import collections +import optparse +import sys + +Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) + + +def FASTAReader_gen(fasta_filename): + with open(fasta_filename) as fasta_file: + line = fasta_file.readline() + while True: + if not line: + return + assert line.startswith('>'), "FASTA headers must start with >" + header = line.rstrip() + sequence_parts = [] + line = fasta_file.readline() + while line and line[0] != '>': + sequence_parts.append(line.rstrip()) + line = fasta_file.readline() + sequence = "\n".join(sequence_parts) + yield Sequence(header, sequence) + + +def remove_id_version(s): + """ + Remove the optional '.VERSION' from an Ensembl id. + """ + return s.split('.')[0] + + +parser = optparse.OptionParser() +parser.add_option('-f', '--fasta', dest="input_fasta_filename", + help='CDS file in FASTA format from Ensembl') +parser.add_option('-o', '--output', dest="output_fasta_filename", + help='Output FASTA file name') +options, args = parser.parse_args() + +if options.input_fasta_filename is None: + raise Exception('-f option must be specified') +if options.output_fasta_filename is None: + raise Exception('-o option must be specified') + +gene_transcripts_dict = dict() + +for entry in FASTAReader_gen(options.input_fasta_filename): + transcript_id, rest = entry.header[1:].split(' ', 1) + transcript_id = remove_id_version(transcript_id) + gene_id = None + for s in rest.split(' '): + if s.startswith('gene:'): + gene_id = remove_id_version(s[5:]) + break + else: + print("Gene id not found in header '%s'" % entry.header, file=sys.stderr) + continue + if gene_id in gene_transcripts_dict: + gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) + else: + gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] + +# For each gene, select the transcript with the longest sequence +# If more than one transcripts have the same longest sequence for a gene, the +# first one to appear in the FASTA file is selected +selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] + +with open(options.output_fasta_filename, 'w') as output_fasta_file: + for entry in FASTAReader_gen(options.input_fasta_filename): + transcript_id = remove_id_version(entry.header[1:].split(' ')[0]) + if transcript_id in selected_transcript_ids: + output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence))