diff ensembl_longest_cds_per_gene.py @ 0:4dba69135845 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
author earlhaminst
date Tue, 07 Mar 2017 05:54:30 -0500
parents
children 6cf9f7f6509c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ensembl_longest_cds_per_gene.py	Tue Mar 07 05:54:30 2017 -0500
@@ -0,0 +1,78 @@
+"""
+This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with
+only the longest CDS sequence for each gene. The header of the sequences in the
+output file will be the transcript id without version.
+"""
+from __future__ import print_function
+
+import collections
+import optparse
+import sys
+
+Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
+
+
+def FASTAReader_gen(fasta_filename):
+    with open(fasta_filename) as fasta_file:
+        line = fasta_file.readline()
+        while True:
+            if not line:
+                return
+            assert line.startswith('>'), "FASTA headers must start with >"
+            header = line.rstrip()
+            sequence_parts = []
+            line = fasta_file.readline()
+            while line and line[0] != '>':
+                sequence_parts.append(line.rstrip())
+                line = fasta_file.readline()
+            sequence = "\n".join(sequence_parts)
+            yield Sequence(header, sequence)
+
+
+def remove_id_version(s):
+    """
+    Remove the optional '.VERSION' from an Ensembl id.
+    """
+    return s.split('.')[0]
+
+
+parser = optparse.OptionParser()
+parser.add_option('-f', '--fasta', dest="input_fasta_filename",
+                  help='CDS file in FASTA format from Ensembl')
+parser.add_option('-o', '--output', dest="output_fasta_filename",
+                  help='Output FASTA file name')
+options, args = parser.parse_args()
+
+if options.input_fasta_filename is None:
+    raise Exception('-f option must be specified')
+if options.output_fasta_filename is None:
+    raise Exception('-o option must be specified')
+
+gene_transcripts_dict = dict()
+
+for entry in FASTAReader_gen(options.input_fasta_filename):
+    transcript_id, rest = entry.header[1:].split(' ', 1)
+    transcript_id = remove_id_version(transcript_id)
+    gene_id = None
+    for s in rest.split(' '):
+        if s.startswith('gene:'):
+            gene_id = remove_id_version(s[5:])
+            break
+    else:
+        print("Gene id not found in header '%s'" % entry.header, file=sys.stderr)
+        continue
+    if gene_id in gene_transcripts_dict:
+        gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+    else:
+        gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+
+# For each gene, select the transcript with the longest sequence
+# If more than one transcripts have the same longest sequence for a gene, the
+# first one to appear in the FASTA file is selected
+selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
+
+with open(options.output_fasta_filename, 'w') as output_fasta_file:
+    for entry in FASTAReader_gen(options.input_fasta_filename):
+        transcript_id = remove_id_version(entry.header[1:].split(' ')[0])
+        if transcript_id in selected_transcript_ids:
+            output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence))