Mercurial > repos > earlhaminst > ensembl_longest_cds_per_gene
diff ensembl_longest_cds_per_gene.py @ 2:6cf9f7f6509c draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
author | earlhaminst |
---|---|
date | Wed, 15 Mar 2017 20:23:13 -0400 |
parents | 4dba69135845 |
children |
line wrap: on
line diff
```diff
--- a/ensembl_longest_cds_per_gene.py Tue Mar 07 11:12:55 2017 -0500
+++ b/ensembl_longest_cds_per_gene.py Wed Mar 15 20:23:13 2017 -0400
@@ -1,7 +1,6 @@
 """
 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with
-only the longest CDS sequence for each gene. The header of the sequences in the
-output file will be the transcript id without version.
+only the longest CDS sequence for each gene.
 """
 from __future__ import print_function
 
@@ -33,7 +32,10 @@
     """
     Remove the optional '.VERSION' from an Ensembl id.
     """
-    return s.split('.')[0]
+    if s.startswith('ENS'):
+        return s.split('.')[0]
+    else:
+        return s
 
 
 parser = optparse.OptionParser()
@@ -52,7 +54,6 @@
 
 for entry in FASTAReader_gen(options.input_fasta_filename):
     transcript_id, rest = entry.header[1:].split(' ', 1)
-    transcript_id = remove_id_version(transcript_id)
     gene_id = None
     for s in rest.split(' '):
         if s.startswith('gene:'):
@@ -73,6 +74,6 @@
 
 with open(options.output_fasta_filename, 'w') as output_fasta_file:
     for entry in FASTAReader_gen(options.input_fasta_filename):
-        transcript_id = remove_id_version(entry.header[1:].split(' ')[0])
+        transcript_id = entry.header[1:].split(' ')[0]
         if transcript_id in selected_transcript_ids:
-            output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence))
+            output_fasta_file.write("%s\n%s\n" % (entry.header, entry.sequence))
```