diff ensembl_longest_cds_per_gene.py @ 2:6cf9f7f6509c draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
author earlhaminst
date Wed, 15 Mar 2017 20:23:13 -0400
parents 4dba69135845
children
line wrap: on
line diff
--- a/ensembl_longest_cds_per_gene.py	Tue Mar 07 11:12:55 2017 -0500
+++ b/ensembl_longest_cds_per_gene.py	Wed Mar 15 20:23:13 2017 -0400
@@ -1,7 +1,6 @@
 """
 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with
-only the longest CDS sequence for each gene. The header of the sequences in the
-output file will be the transcript id without version.
+only the longest CDS sequence for each gene.
 """
 from __future__ import print_function
 
@@ -33,7 +32,10 @@
     """
     Remove the optional '.VERSION' from an Ensembl id.
     """
-    return s.split('.')[0]
+    if s.startswith('ENS'):
+        return s.split('.')[0]
+    else:
+        return s
 
 
 parser = optparse.OptionParser()
@@ -52,7 +54,6 @@
 
 for entry in FASTAReader_gen(options.input_fasta_filename):
     transcript_id, rest = entry.header[1:].split(' ', 1)
-    transcript_id = remove_id_version(transcript_id)
     gene_id = None
     for s in rest.split(' '):
         if s.startswith('gene:'):
@@ -73,6 +74,6 @@
 
 with open(options.output_fasta_filename, 'w') as output_fasta_file:
     for entry in FASTAReader_gen(options.input_fasta_filename):
-        transcript_id = remove_id_version(entry.header[1:].split(' ')[0])
+        transcript_id = entry.header[1:].split(' ')[0]
         if transcript_id in selected_transcript_ids:
-            output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence))
+            output_fasta_file.write("%s\n%s\n" % (entry.header, entry.sequence))