changeset 0:4dba69135845 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
author earlhaminst
date Tue, 07 Mar 2017 05:54:30 -0500
parents
children a07680f3033a
files ensembl_longest_cds_per_gene.py ensembl_longest_cds_per_gene.xml test-data/Mus_musculus.GRCm38.cds.first100.fa test-data/Mus_musculus.GRCm38.cds.longest.fa
diffstat 4 files changed, 477 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ensembl_longest_cds_per_gene.py	Tue Mar 07 05:54:30 2017 -0500
@@ -0,0 +1,78 @@
+"""
+This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with
+only the longest CDS sequence for each gene. The header of the sequences in the
+output file will be the transcript id without version.
+"""
+from __future__ import print_function
+
+import collections
+import optparse
+import sys
+
+Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
+
+
+def FASTAReader_gen(fasta_filename):
+    with open(fasta_filename) as fasta_file:
+        line = fasta_file.readline()
+        while True:
+            if not line:
+                return
+            assert line.startswith('>'), "FASTA headers must start with >"
+            header = line.rstrip()
+            sequence_parts = []
+            line = fasta_file.readline()
+            while line and line[0] != '>':
+                sequence_parts.append(line.rstrip())
+                line = fasta_file.readline()
+            sequence = "\n".join(sequence_parts)
+            yield Sequence(header, sequence)
+
+
+def remove_id_version(s):
+    """
+    Remove the optional '.VERSION' from an Ensembl id.
+    """
+    return s.split('.')[0]
+
+
+parser = optparse.OptionParser()
+parser.add_option('-f', '--fasta', dest="input_fasta_filename",
+                  help='CDS file in FASTA format from Ensembl')
+parser.add_option('-o', '--output', dest="output_fasta_filename",
+                  help='Output FASTA file name')
+options, args = parser.parse_args()
+
+if options.input_fasta_filename is None:
+    raise Exception('-f option must be specified')
+if options.output_fasta_filename is None:
+    raise Exception('-o option must be specified')
+
+gene_transcripts_dict = dict()
+
+for entry in FASTAReader_gen(options.input_fasta_filename):
+    transcript_id, rest = entry.header[1:].split(' ', 1)
+    transcript_id = remove_id_version(transcript_id)
+    gene_id = None
+    for s in rest.split(' '):
+        if s.startswith('gene:'):
+            gene_id = remove_id_version(s[5:])
+            break
+    else:
+        print("Gene id not found in header '%s'" % entry.header, file=sys.stderr)
+        continue
+    if gene_id in gene_transcripts_dict:
+        gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+    else:
+        gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+
+# For each gene, select the transcript with the longest sequence
+# If more than one transcript has the same longest sequence length for a gene,
+# the first one to appear in the FASTA file is selected
+selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
+
+with open(options.output_fasta_filename, 'w') as output_fasta_file:
+    for entry in FASTAReader_gen(options.input_fasta_filename):
+        transcript_id = remove_id_version(entry.header[1:].split(' ')[0])
+        if transcript_id in selected_transcript_ids:
+            output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ensembl_longest_cds_per_gene.xml	Tue Mar 07 05:54:30 2017 -0500
@@ -0,0 +1,27 @@
+<tool id="ensembl_longest_cds_per_gene" name="Select longest CDS per gene" version="0.0.1">
+    <description>from Ensembl CDS FASTA</description>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/ensembl_longest_cds_per_gene.py' -f '$input' -o '$output'
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="fasta" label="CDS FASTA from Ensembl" />
+    </inputs>
+    <outputs>
+        <data name="output" format="fasta" label="${tool.name} on ${on_string}" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" ftype="fasta" value="Mus_musculus.GRCm38.cds.first100.fa" />
+            <output name="output" file="Mus_musculus.GRCm38.cds.longest.fa" />
+        </test>
+    </tests>
+    <help><![CDATA[
+This tool filters a CDS FASTA file from Ensembl, keeping only the longest CDS sequence for each gene.
+
+The headers of the input CDS FASTA file are expected to be of the following format::
+
+    >ENSMUST00000177965.1 cds chromosome:GRCm38:12:113456720:113456736:-1 gene:ENSMUSG00000094057.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-7 description:immunoglobulin heavy diversity 2-7 [Source:MGI Symbol;Acc:MGI:4439866]
+
+Among the CDS sequences having the same gene identifier (ENSMUSG00000094057 in the example above), the tool will select the one with the longest sequence. The header of the sequences in the output dataset will contain only the transcript id without version (ENSMUST00000177965 in the example above).
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Mus_musculus.GRCm38.cds.first100.fa	Tue Mar 07 05:54:30 2017 -0500
@@ -0,0 +1,200 @@
+>ENSMUST00000196221.1 cds chromosome:GRCm38:14:54113468:54113476:1 gene:ENSMUSG00000096749.2 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trdd1 description:T cell receptor delta diversity 1 [Source:MGI Symbol;Acc:MGI:4439547]
+ATGGCATAT
+>ENSMUST00000177564.1 cds chromosome:GRCm38:14:54122226:54122241:1 gene:ENSMUSG00000096176.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trdd2 description:T cell receptor delta diversity 2 [Source:MGI Symbol;Acc:MGI:4439546]
+ATCGGAGGGATACGAG
+>ENSMUST00000178537.1 cds chromosome:GRCm38:6:41533201:41533212:1 gene:ENSMUSG00000095668.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trbd1 description:T cell receptor beta, D region 1 [Source:MGI Symbol;Acc:MGI:4439571]
+GGGACAGGGGGC
+>ENSMUST00000178862.1 cds chromosome:GRCm38:6:41542163:41542176:1 gene:ENSMUSG00000094569.1 gene_biotype:TR_D_gene transcript_biotype:TR_D_gene gene_symbol:Trbd2 description:T cell receptor beta, D region 2 [Source:MGI Symbol;Acc:MGI:4439727]
+GGGACTGGGGGGGC
+>ENSMUST00000179520.1 cds chromosome:GRCm38:12:113430528:113430538:-1 gene:ENSMUSG00000094028.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd4-1 description:immunoglobulin heavy diversity 4-1 [Source:MGI Symbol;Acc:MGI:4439801]
+CTAACTGGGAC
+>ENSMUST00000179883.1 cds chromosome:GRCm38:12:113448214:113448229:-1 gene:ENSMUSG00000094552.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd3-2 description:immunoglobulin heavy diversity 3-2 [Source:MGI Symbol;Acc:MGI:4439707]
+AGACAGCTCAGGCTAC
+>ENSMUST00000195858.1 cds chromosome:GRCm38:12:113449588:113449597:-1 gene:ENSMUSG00000096420.2 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-6 description:immunoglobulin heavy diversity 5-6 [Source:MGI Symbol;Acc:MGI:4937234]
+GAATACCTAC
+>ENSMUST00000180001.1 cds chromosome:GRCm38:12:113450851:113450867:-1 gene:ENSMUSG00000095656.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-8 description:immunoglobulin heavy diversity 2-8 [Source:MGI Symbol;Acc:MGI:4439706]
+TCTACTATGGTAACTAC
+>ENSMUST00000178815.1 cds chromosome:GRCm38:12:113454942:113454951:-1 gene:ENSMUSG00000094957.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-5 description:immunoglobulin heavy diversity 5-5 [Source:MGI Symbol;Acc:MGI:4937334]
+GACTACCTAC
+>ENSMUST00000177965.1 cds chromosome:GRCm38:12:113456720:113456736:-1 gene:ENSMUSG00000094057.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-7 description:immunoglobulin heavy diversity 2-7 [Source:MGI Symbol;Acc:MGI:4439866]
+TCTACTATGGTTACGAC
+>ENSMUST00000178909.1 cds chromosome:GRCm38:12:113459864:113459892:-1 gene:ENSMUSG00000094268.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-8 description:immunoglobulin heavy diversity 5-8 [Source:MGI Symbol;Acc:MGI:4937171]
+AGACAGCTAGCCTCTGCAGTGCCACAACC
+>ENSMUST00000177646.1 cds chromosome:GRCm38:12:113460101:113460110:-1 gene:ENSMUSG00000096884.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-4 description:immunoglobulin heavy diversity 5-4 [Source:MGI Symbol;Acc:MGI:4937058]
+GAATACCTAC
+>ENSMUST00000178230.1 cds chromosome:GRCm38:12:113461369:113461385:-1 gene:ENSMUSG00000096250.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-6 description:immunoglobulin heavy diversity 2-6 [Source:MGI Symbol;Acc:MGI:4439865]
+CCTACTATAGTAACTAC
+>ENSMUST00000178483.1 cds chromosome:GRCm38:12:113464524:113464552:-1 gene:ENSMUSG00000095592.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-7 description:immunoglobulin heavy diversity 5-7 [Source:MGI Symbol;Acc:MGI:4936973]
+AGGCAGCTAGCCTCTGCAGTGCCACAACC
+>ENSMUST00000179262.1 cds chromosome:GRCm38:12:113464761:113464770:-1 gene:ENSMUSG00000093876.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-3 description:immunoglobulin heavy diversity 5-3 [Source:MGI Symbol;Acc:MGI:4937297]
+GAATACCTAC
+>ENSMUST00000178549.1 cds chromosome:GRCm38:12:113466027:113466043:-1 gene:ENSMUSG00000095897.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-5 description:immunoglobulin heavy diversity 2-5 [Source:MGI Symbol;Acc:MGI:4439705]
+CCTACTATAGTAACTAC
+>ENSMUST00000193012.1 cds chromosome:GRCm38:12:113469189:113469217:-1 gene:ENSMUSG00000103203.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Gm37327 description:predicted gene, 37327 [Source:MGI Symbol;Acc:MGI:5610555]
+AGGCAGCTAGCCTCTGCAGTGCCACAACC
+>ENSMUST00000179166.1 cds chromosome:GRCm38:12:113469426:113469435:-1 gene:ENSMUSG00000096396.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd5-2 description:immunoglobulin heavy diversity 5-2 [Source:MGI Symbol;Acc:MGI:4936898]
+GAATACCTAC
+>ENSMUST00000179560.1 cds chromosome:GRCm38:12:113470694:113470710:-1 gene:ENSMUSG00000095444.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-4 description:immunoglobulin heavy diversity 2-4 [Source:MGI Symbol;Acc:MGI:4439709]
+TCTACTATGATTACGAC
+>ENSMUST00000177839.1 cds chromosome:GRCm38:12:113475400:113475416:-1 gene:ENSMUSG00000096568.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd2-3 description:immunoglobulin heavy diversity 2-3 [Source:MGI Symbol;Acc:MGI:4439708]
+TCTATGATGGTTACTAC
+>ENSMUST00000103439.1 cds chromosome:GRCm38:12:113482170:113482192:-1 gene:ENSMUSG00000076630.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd1-1 description:immunoglobulin heavy diversity 1-1 [Source:MGI Symbol;Acc:MGI:4439871]
+TTTATTACTACGGTAGTAGCTAC
+>ENSMUST00000180266.1 cds chromosome:GRCm38:12:113525313:113525329:-1 gene:ENSMUSG00000093818.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Ighd3-1 description:immunoglobulin heavy diversity 3-1 [Source:MGI Symbol;Acc:MGI:4439891]
+GGCACAGCTCGGGCTAC
+>ENSMUST00000103441.1 cds chromosome:GRCm38:12:113528032:113528054:-1 gene:ENSMUSG00000076632.1 gene_biotype:IG_D_gene transcript_biotype:IG_D_gene gene_symbol:Gm16968 description:predicted gene, 16968 [Source:MGI Symbol;Acc:MGI:4439892]
+TATATAACTAAAGTGGTAGCTCA
+>ENSMUST00000177622.3 cds chromosome:GRCm38:14:53443249:53443839:1 gene:ENSMUSG00000096908.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav7-3 description:T cell receptor alpha variable 7-3 [Source:MGI Symbol;Acc:MGI:3649556]
+ATGAAATCCTTGAGTGTTTCCCTAGTGGTCCTGTGGCTCCAGTTAAACTGGGTGAACAGC
+CAGCAGAAGGTGCAGCAGAGCCCAGAATCCCTCATTGTCCCAGAGGGAGCCATGACCTCT
+CTCAACTGCACTTTCAGCGACAGTGCTTCTCAGTATTTTGCATGGTACAGACAGCATTCT
+GGGAAAGCCCCCAAGGCACTGATGTCCATCTTCTCCAATGGTGAAAAAGAAGAAGGCAGA
+TTCACAATTCACCTCAATAAAGCCAGTCTGCATTTCTCCCTGCACATCAGAGACTCCCAG
+CCCAGTGACTCTGCTCTCTACCTCTGTGCAGTGAGCA
+>ENSMUST00000180711.2 cds chromosome:GRCm38:14:53454296:53454784:1 gene:ENSMUSG00000094468.5 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav6-4 description:T cell receptor alpha variable 6-4 [Source:MGI Symbol;Acc:MGI:3702153]
+ATGAATACTTCTCCAGTTTTAGTAACTGTGATGCTGCTGTTCATGCTTGGGATGAGAAAG
+ACCCACGGAGATTCAGTGACCCAGAAACAAGGTCAAGTGACCCTTTCAGAAGATGACTTC
+CTATTTATAAATTGCACTTATTCTACCACAACATACCCAACTCTTTTGTGGTATGTCCAA
+TATCTTGGACAAGGTCCACAGCTCCTTCTGAAAGTGACAACTGCCAACAATAAGGGAATC
+AGCAGAGGCTTTGAAGCTACATATGACAAAGGGACCACGTCCTTCCACTTGCAGAAAGCC
+TCAGTGCAGGAGTCAGACTCAGCCGTGTACTTCTGTGCTCTGGTTGA
+>ENSMUST00000184650.1 cds chromosome:GRCm38:14:53454327:53454784:1 gene:ENSMUSG00000094468.5 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav6-4 description:T cell receptor alpha variable 6-4 [Source:MGI Symbol;Acc:MGI:3702153]
+ATGAATACTTCTCCAGTTTTAGTAACTGTGATGCTGCTGTTCATGCTTGAGACCCACGGA
+GATTCAGTGACCCAGAAACAAGGTCAAGTGACCCTTTCAGAAGATGACTTCCTATTTATA
+AATTGCACTTATTCTACCACAACATACCCAACTCTTTTGTGGTATGTCCAATATCTTGGA
+CAAGGTCCACAGCTCCTTCTGAAAGTGACAACTGCCAACAATAAGGGAATCAGCAGAGGC
+TTTGAAGCTACATATGACAAAGGGACCACGTCCTTCCACTTGCAGAAAGCCTCAGTGCAG
+GAGTCAGACTCAGCCGTGTACTTCTGTGCTCTGGTTGA
+>ENSMUST00000181728.2 cds chromosome:GRCm38:14:53461099:53461738:1 gene:ENSMUSG00000094766.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav7-4 description:T cell receptor alpha variable 7-4 [Source:MGI Symbol;Acc:MGI:3649611]
+ATGAAATCCTTGAGTGTTTCACTAGTGGTCCTGTGGCTCCAGTTAAACTGCGTGAGGAGC
+CAGCAGAAGGTGCAGCAGAGCCCAGAATCCCTCAGTGTCCCAGAGGGAGGCATGGCCTCT
+CTCAACTGCACTTCAAGTGATCGTAATTTTCAGTACTTCTGGTGGTACAGACAGCATTCT
+GGAGAAGGCCCCAAGGCACTGATGTCAATCTTCTCTGATGGTGACAAGAAAGAAGGCAGA
+TTCACAGCTCACCTCAATAAGGCCAGCCTGCATGTTTCCCTGCACATCAGAGACTCCCAG
+CCCAGTGACTCCGCTCTCTACTTCTGTGCAGCTAGTGAGCA
+>ENSMUST00000103643.3 cds chromosome:GRCm38:14:53469756:53470232:1 gene:ENSMUSG00000076831.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav8-1 description:T cell receptor alpha variable 8-1 [Source:MGI Symbol;Acc:MGI:3649608]
+ATGCACAGCCTCCTGGGGTTGTTGTTGTGGCTGCAACTGACAAGGGTGAATAGTCAACTA
+GCAGAAGAGAATTCGTGGGCCCTGAGCGTCCACGAGGGTGAAAGTGTCACGGTGAATTGT
+AGTTACAAGACATCCATAACTGCCCTACAGTGGTACAGACAGAAGTCAGGCAAAGGCCCT
+GCCCAGCTAATCTTAATACGTTCAAATGAGAGAGAGAAGCGCAATGGAAGACTCAGAGCC
+ACCCTTGACACCTCCAGCCAGAGCAGCTCCTTGTCCATCACTGCTACTCGGTGTGAAGAC
+ACCGCTGTGTACTTCTGTGCTACTGATG
+>ENSMUST00000103581.5 cds chromosome:GRCm38:14:53488045:53488567:1 gene:ENSMUSG00000096900.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav9-1 description:T cell receptor alpha  variable 9-1 [Source:MGI Symbol;Acc:MGI:3650356]
+ATGCTCCTGGTTCTCATCTCGTTCCTCGGGATACATTTCTTCCTGGATGTCCAAACACAG
+ACAGTTTCCCAGTCTGATGCCCATGTCACTGTCTTCGAAGGAGACTCGGTGGAGCTGAGA
+TGCAACTATTCCTATGGTGGATCCATTTACCTCTCCTGGTACATCCAGCACCATGGCCAT
+GGCCTCCAGTTTCTCCTCAAGTACTATTCGGGAAACCCAGTGGTTCAAGGAGTGAACGGC
+TTCGAGGCTGAGTTCAGCAAGAGCGACTCTTCCTTCCACCTTCGGAAAGCCTCTGTGCAC
+TGGAGCGACTCGGCTGTGTACTTCTGTGCTGTGAGCG
+>ENSMUST00000181210.2 cds chromosome:GRCm38:14:53491115:53491622:1 gene:ENSMUSG00000096149.5 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav6-5 description:T cell receptor alpha variable 6-5 [Source:MGI Symbol;Acc:MGI:3649609]
+ATGAACCTTTGTCCTGAACTGGGTATTCTACTCTTCCTAATGCTTGGAGAAAGCAATGGA
+GACTCAGTGACTCAGACAGAAGGCCCAGTGACACTGTCTGAAGGGACTTCTCTGACTGTG
+AACTGTTCCTATGAAACCAAACAGTACCCAACCCTGTTCTGGTATGTGCAGTATCCCGGA
+GAAGGTCCACAGCTCCTCTTTAAAGTCCCAAAGGCCAACGAGAAGGGAAGCAACAGAGGT
+TTTGAAGCTACATACAATAAAGAAGCCACCTCCTTCCACTTGCAGAAAGCCTCAGTGCAA
+GAGTCAGACTCGGCTGTGTACTACTGTGCTCTGAGTGA
+>ENSMUST00000183488.1 cds chromosome:GRCm38:14:53491152:53491622:1 gene:ENSMUSG00000096149.5 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav6-5 description:T cell receptor alpha variable 6-5 [Source:MGI Symbol;Acc:MGI:3649609]
+ATGAACCTTTGTCCTGAACTGGGTATTCTACTCTTCCTAATGCTTGAAAGCAATGGAGAC
+TCAGTGACTCAGACAGAAGGCCCAGTGACACTGTCTGAAGGGACTTCTCTGACTGTGAAC
+TGTTCCTATGAAACCAAACAGTACCCAACCCTGTTCTGGTATGTGCAGTATCCCGGAGAA
+GGTCCACAGCTCCTCTTTAAAGTCCCAAAGGCCAACGAGAAGGGAAGCAACAGAGGTTTT
+GAAGCTACATACAATAAAGAAGCCACCTCCTTCCACTTGCAGAAAGCCTCAGTGCAAGAG
+TCAGACTCGGCTGTGTACTACTGTGCTCTGAGTGA
+>ENSMUST00000103583.4 cds chromosome:GRCm38:14:53505727:53506286:1 gene:ENSMUSG00000096551.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav10 description:T cell receptor alpha variable 10 [Source:MGI Symbol;Acc:MGI:3704132]
+ATGAAGACATCCCTTCACACTGTATTCCTATTCTTGTGGCTATGGATGGACTGGGAGAGC
+CATGGAGAGAAGGTCGAGCAACATGAGTCTACACTGAGTGTTCGAGAGGGAGACAGCGCT
+GTCATCAACTGCACTTACACAGATACTGCTTCATCATACTTCCCTTGGTACAAGCAAGAA
+GCTGGAAAGAGTCTCCACTTTGTGATAGACATTCGTTCAAATGTGGACAGAAAACAGAGC
+CAAAGACTTATAGTTTTGTTGGATAAGAAAGCCAAACGATTCTCCCTGCACATCACAGCC
+ACACAGCCTGAAGATTCAGCCATCTACTTCTGTGCAGCAAGCA
+>ENSMUST00000103584.3 cds chromosome:GRCm38:14:53516929:53517366:1 gene:ENSMUSG00000095862.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav6-6 description:T cell receptor alpha variable 6-6 [Source:MGI Symbol;Acc:MGI:4439905]
+ATGGACTCTTCTCCAGGCTTCGTGGCTGTGATACTTCTCATACTTGGAAGGACCCACGGA
+GATTCCGTGACTCAAACAGAAGGCCCAGTGACCGTCTCAGAAAGCGAGTCCCTGATAATA
+AATTGCACGTATTCAGCCACAAGCATAGCTTACCCTAATCTTTTCTGGTATGTTCGATAT
+CCTGGAGAAGGTCTACAACTCCTCCTGAAAGTCATTACGGCTGGCCAGAAGGGAAGCAGC
+AGAGGGTTTGAAGCCACATACAATAAAGAAACCACCTCCTTCCACTTGCAGAAAGCCTCA
+GTGCAAGAGTCAGACTCGGCTGTGTACTACTGTGCTCTGGGTGA
+>ENSMUST00000103585.3 cds chromosome:GRCm38:14:53519303:53519859:1 gene:ENSMUSG00000096615.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav11 description:T cell receptor alpha variable 11 [Source:MGI Symbol;Acc:MGI:3642163]
+ATGAAAAAGCGCCTGAGTGCCTGCTGGGTGGTCTTGTGGCTGCATTATCAGTGGGTGGCT
+GGCAAGACCCAAGTGGAGCAGAGTCCTCAGTCCCTGGTTGTCCGTCAGGGAGAGAACTGC
+GTCCTTCAATGTAATTACAGTGTGACCCCCGACAACCACTTAAGGTGGTTCAAACAGGAC
+ACAGGCAAAGGTCTTGTGTCCCTGACAGTCCTGGTTGACCAAAAAGACAAAACGTCAAAT
+GGGAGATACTCAGCAACTCTGGATAAAGATGCTAAGCACAGCACGCTGCACATCACAGCC
+ACCCTGCTGGATGACACTGCCACCTACATCTGTGTGGTGGGCG
+>ENSMUST00000200609.1 cds chromosome:GRCm38:14:53530786:53531313:1 gene:ENSMUSG00000106620.1 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav7-5 description:T cell receptor alpha variable 7-5 [Source:MGI Symbol;Acc:MGI:3648929]
+ATGAAATCCTTGAGTGTTTCACTAGTGGTCCTGTGGCTCCAGTTTAATTGGGTGAGCAGC
+CAGCAGAAGGTGCAGCAGAGCCCAGAATCCCTCACTGTCTCAGAGGGAGCCATGGCCTCT
+CTCAACTGCACGTTCAGTGATGGTACTTCTAACAACTTCAGGTGGTACAGACAGCATTCT
+GCGAAAGGCCTTGAGGTGCTAGTGTCCATCTTCTCTGATGGTGAAAAGGAAGAAGGCAGA
+TTTACAGCTCACCTCAATAGAGCCAACTTGCATGTTTCCCTACACATCAGAGAACCACAA
+CCCAGTGACTCTGCTGTCTACCTCTGTGCAGTGAGCA
+>ENSMUST00000200115.1 cds chromosome:GRCm38:14:53538191:53538738:1 gene:ENSMUSG00000096825.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav12-1 description:T cell receptor alpha variable 12-1 [Source:MGI Symbol;Acc:MGI:4440525]
+ATGAACATGCGTCCTGTCACCTCCTCAGTTCTCGTGCTCCTCCTAATGCTCAGAAGGAGC
+AATGGAGACTCCGTGACCCAGACAGAAGGCCTGGTCACTGTCACCGAGGGGTTGCCTGTG
+AAGCTGAACTGCACCTATCAGACTACTTATTTAACTATTGCCTTTTTCTGGTATGTGCAA
+TATCTCAACGAAGCCCCTCAGGTACTCCTGAAGAGCTCCACAGACAACAAGAGGACCGAG
+CACCAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCC
+TCAGCGCAGCTGTCAGACTCTGCCCTGTACTACTGTGCTCTGAGTGA
+>ENSMUST00000103650.2 cds chromosome:GRCm38:14:53538260:53538738:1 gene:ENSMUSG00000096825.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav12-1 description:T cell receptor alpha variable 12-1 [Source:MGI Symbol;Acc:MGI:4440525]
+ATGAACATGCGTCCTGTCACCTCCTCAGTTCTCGTGCTCCTCCTAATGCTCAGGAGCAAT
+GGAGACTCCGTGACCCAGACAGAAGGCCTGGTCACTGTCACCGAGGGGTTGCCTGTGAAG
+CTGAACTGCACCTATCAGACTACTTATTTAACTATTGCCTTTTTCTGGTATGTGCAATAT
+CTCAACGAAGCCCCTCAGGTACTCCTGAAGAGCTCCACAGACAACAAGAGGACCGAGCAC
+CAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCCTCA
+GCGCAGCTGTCAGACTCTGCCCTGTACTACTGTGCTCTGAGTGA
+>ENSMUST00000103651.3 cds chromosome:GRCm38:14:53545014:53545525:1 gene:ENSMUSG00000076839.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav13-1 description:T cell receptor alpha variable 13-1 [Source:MGI Symbol;Acc:MGI:4439904]
+ATGAAGAGGCTGCTGAGCTCTCTGCTGGGGCTTCTGTGCACCCAGGTTTGCTGGGTGAAA
+GGACAGCAAGTGCAGCAGAGCCCCGCGTCCTTGGTTCTGCAGGAGGGGGAGAACGCAGAG
+CTGCAGTGTAACTTTTCCACATCTTTGAACAGTATGCAGTGGTTTTACCAACGTCCTGGG
+GGAAGTCTCGTCAGCCTGTTCTACAATCCTTCTGGGACAAAGCATAGTGGGAGACTGACA
+TCCACTACAGTCATCAAAGAACGTCGCAGCTCTTTGCACATTTCCTCCTCCCAGACAACA
+GACTCAGGCACTTATCTCTGTGCTTTGGAAC
+>ENSMUST00000198297.1 cds chromosome:GRCm38:14:53554022:53554558:1 gene:ENSMUSG00000076840.4 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav14-1 description:T cell receptor alpha variable 14-1 [Source:MGI Symbol;Acc:MGI:3646773]
+ATGGACAAGATTCTGACAGCATCATTTTTACTCCTAGGCCTTCACCTAGCTGGGGTGAAT
+GGCCAGCAGAAGGAGAAACATGACCAGCAGCAGGTGAGACAAAGTCCCCAATCTCTGACA
+GTCTGGGAAGGAGGAACCACAGTTCTGACCTGCAGTTATGAGGACAGCACTTTTAACTAC
+TTCCCATGGTACCAACAGTTCCCTGGGGAAGGCCCTGCACTTCTGATATCCATACTTTCA
+GTGTCCGATAAAAAGGAAGATGGACGATTCACAACCTTCTTCAATAAAAGGGAGAAAAAG
+CTCTCCTTGCACATCATAGACTCTCAGCCTGGAGACTCAGCCACCTACTTCTGTGCAGCA
+AGTG
+>ENSMUST00000200101.1 cds chromosome:GRCm38:14:53559632:53560247:1 gene:ENSMUSG00000094016.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav15-1-dv6-1 description:T cell receptor alpha variable 15-1-DV6-1 [Source:MGI Symbol;Acc:MGI:4439369]
+ATGCCTCCTCACAGCCTGCTCTGTGTGCTGGTGGCCTTGGCTTTCTCTGGATCTAATGTG
+GCCCAGAAAGTGATTCAGGTCTGGTCAACAACAAGCAGGCAGGAGGGCGAAAAACTCACA
+CTGGACTGTTCATATAAGACAAGTCAGGTCTTATACCATCTTTTCTGGTACAAGCACCTT
+CTTAGTGGAGAGATGGTTTTGCTTATTCGACAAATGCCTTCTACTATTGCAATAGAGAGG
+AGCGGCCGCTATTCTGTAGTCTTCCAGAAATCACGCAAATCCATCAGCCTTGTCATTTCA
+ACCTTACAACCAGACGATTCGGGAAAGTATTTCTGTGCTCTCTGGGAGCTGG
+>ENSMUST00000103653.2 cds chromosome:GRCm38:14:53559676:53560247:1 gene:ENSMUSG00000094016.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav15-1-dv6-1 description:T cell receptor alpha variable 15-1-DV6-1 [Source:MGI Symbol;Acc:MGI:4439369]
+ATGCCTCCTCACAGCCTGCTCTGTGTGCTGGTGGCCTTGGCTTTCTCTGCTAATGTGGCC
+CAGAAAGTGATTCAGGTCTGGTCAACAACAAGCAGGCAGGAGGGCGAAAAACTCACACTG
+GACTGTTCATATAAGACAAGTCAGGTCTTATACCATCTTTTCTGGTACAAGCACCTTCTT
+AGTGGAGAGATGGTTTTGCTTATTCGACAAATGCCTTCTACTATTGCAATAGAGAGGAGC
+GGCCGCTATTCTGTAGTCTTCCAGAAATCACGCAAATCCATCAGCCTTGTCATTTCAACC
+TTACAACCAGACGATTCGGGAAAGTATTTCTGTGCTCTCTGGGAGCTGG
+>ENSMUST00000103654.2 cds chromosome:GRCm38:14:53590857:53591514:1 gene:ENSMUSG00000094966.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav9-2 description:T cell receptor alpha  variable 9-2 [Source:MGI Symbol;Acc:MGI:4439903]
+ATGCTCCTGGCGCTCCTCCCAGTGCTGGGGATACACTTTGTCCTGAGAGATGCCCAAGCT
+CAGTCAGTGACGCAGCCCGATGCTCGCGTCACTGTCTCTGAAGGAGCCTCTCTGCAGCTG
+AGATGCAAGTATTCCTACTCTGGGACACCTTATCTGTTCTGGTATGTCCAGTACCCGCGG
+CAGGGGCTGCAGCTGCTCCTCAAGTACTATTCAGGAGACCCAGTGGTTCAAGGAGTGAAT
+GGCTTCGAGGCTGAGTTCAGCAAGAGTAACTCTTCCTTCCACCTGCGGAAAGCCTCTGTG
+CACTGGAGCGACTCTGCTGTGTACTTCTGTGTTTTGAGCG
+>ENSMUST00000103655.2 cds chromosome:GRCm38:14:53598828:53599410:1 gene:ENSMUSG00000093966.2 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav4-3 description:T cell receptor alpha variable 4-3 [Source:MGI Symbol;Acc:MGI:4440478]
+ATGCAGAGGAACCTGGGAGCTGTGCTGGGGATTCTGTGGGTGCAGATTTGCTGGGTGAGC
+GGAGATAAGGTGAAACAAAGTCCCTCAGCGCTGAGTCTCCAAGAAGGAACCAATTCTGCT
+CTGAGATGCAATTTTTCTATCGCCGCGACAACTGTGCAGTGGTTCCTACAGAATCCCAGG
+GGCAGCCTCATCAATCTTTTTTACCTGGTTCCAGGAACAAAGGAGAATGGGAGGTTAAAG
+TCAGCATTCGATTCTAAGGAGAGCTACAGCACCCTGCACATCAGGGATGCCCAGCTGGAG
+GACTCAGGCACTTACTTCTGTGCTGCTGAGG
+>ENSMUST00000180972.2 cds chromosome:GRCm38:14:53616315:53616914:1 gene:ENSMUSG00000096656.6 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav12-2 description:T cell receptor alpha variable 12-2 [Source:MGI Symbol;Acc:MGI:5293447]
+ATGAACATGCGTCCTGACACCTGCTCAGTTCTTGTGCTCCTCTTAATGCTCAGAAGGAAC
+AATGGAGACTCTGTGACCCAGACAGAAGGCCTGGTCACTCTCACCGAGGGGTTGCCTGTG
+ATGCTGAACTGCACCTATCAGAGTACTTACTCACCTTTCCTTTTCTGGTATGTGCAACAT
+CTCAACGAAGCCCCTAAGCTACTTTTGAAGAGCTTCACAGACAACAAGAGGCCCGAGCAC
+CAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCCTCA
+GCGCAGCTGTCAGACTCTGCCCTGTACTACTGTGCTTTGAGTGA
+>ENSMUST00000103657.5 cds chromosome:GRCm38:14:53621657:53622245:1 gene:ENSMUSG00000095958.3 gene_biotype:TR_V_gene transcript_biotype:TR_V_gene gene_symbol:Trav12-3 description:T cell receptor alpha variable 12-3 [Source:MGI Symbol;Acc:MGI:3648633]
+ATGCGTCCTGGCACCTGCTCAGTTCTTGTGCTCCTCCTAATGCTCAGGAGGAGCAATGGA
+GATGGAGACTCAGTGACCCAGAAGGAAGGCCTGGTCACTCTCACCGAGGGGTTGCCTGTG
+ATGCTGAACTGCACCTATCAGACTATTTACTCAAATGCTTTCCTTTTCTGGTATGTGCAC
+TATCTCAATGAATCCCCTCGGCTACTCCTGAAGAGCTCCACAGACAACAAGAGGACCGAG
+CACCAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Mus_musculus.GRCm38.cds.longest.fa	Tue Mar 07 05:54:30 2017 -0500
@@ -0,0 +1,172 @@
+>ENSMUST00000196221
+ATGGCATAT
+>ENSMUST00000177564
+ATCGGAGGGATACGAG
+>ENSMUST00000178537
+GGGACAGGGGGC
+>ENSMUST00000178862
+GGGACTGGGGGGGC
+>ENSMUST00000179520
+CTAACTGGGAC
+>ENSMUST00000179883
+AGACAGCTCAGGCTAC
+>ENSMUST00000195858
+GAATACCTAC
+>ENSMUST00000180001
+TCTACTATGGTAACTAC
+>ENSMUST00000178815
+GACTACCTAC
+>ENSMUST00000177965
+TCTACTATGGTTACGAC
+>ENSMUST00000178909
+AGACAGCTAGCCTCTGCAGTGCCACAACC
+>ENSMUST00000177646
+GAATACCTAC
+>ENSMUST00000178230
+CCTACTATAGTAACTAC
+>ENSMUST00000178483
+AGGCAGCTAGCCTCTGCAGTGCCACAACC
+>ENSMUST00000179262
+GAATACCTAC
+>ENSMUST00000178549
+CCTACTATAGTAACTAC
+>ENSMUST00000193012
+AGGCAGCTAGCCTCTGCAGTGCCACAACC
+>ENSMUST00000179166
+GAATACCTAC
+>ENSMUST00000179560
+TCTACTATGATTACGAC
+>ENSMUST00000177839
+TCTATGATGGTTACTAC
+>ENSMUST00000103439
+TTTATTACTACGGTAGTAGCTAC
+>ENSMUST00000180266
+GGCACAGCTCGGGCTAC
+>ENSMUST00000103441
+TATATAACTAAAGTGGTAGCTCA
+>ENSMUST00000177622
+ATGAAATCCTTGAGTGTTTCCCTAGTGGTCCTGTGGCTCCAGTTAAACTGGGTGAACAGC
+CAGCAGAAGGTGCAGCAGAGCCCAGAATCCCTCATTGTCCCAGAGGGAGCCATGACCTCT
+CTCAACTGCACTTTCAGCGACAGTGCTTCTCAGTATTTTGCATGGTACAGACAGCATTCT
+GGGAAAGCCCCCAAGGCACTGATGTCCATCTTCTCCAATGGTGAAAAAGAAGAAGGCAGA
+TTCACAATTCACCTCAATAAAGCCAGTCTGCATTTCTCCCTGCACATCAGAGACTCCCAG
+CCCAGTGACTCTGCTCTCTACCTCTGTGCAGTGAGCA
+>ENSMUST00000180711
+ATGAATACTTCTCCAGTTTTAGTAACTGTGATGCTGCTGTTCATGCTTGGGATGAGAAAG
+ACCCACGGAGATTCAGTGACCCAGAAACAAGGTCAAGTGACCCTTTCAGAAGATGACTTC
+CTATTTATAAATTGCACTTATTCTACCACAACATACCCAACTCTTTTGTGGTATGTCCAA
+TATCTTGGACAAGGTCCACAGCTCCTTCTGAAAGTGACAACTGCCAACAATAAGGGAATC
+AGCAGAGGCTTTGAAGCTACATATGACAAAGGGACCACGTCCTTCCACTTGCAGAAAGCC
+TCAGTGCAGGAGTCAGACTCAGCCGTGTACTTCTGTGCTCTGGTTGA
+>ENSMUST00000181728
+ATGAAATCCTTGAGTGTTTCACTAGTGGTCCTGTGGCTCCAGTTAAACTGCGTGAGGAGC
+CAGCAGAAGGTGCAGCAGAGCCCAGAATCCCTCAGTGTCCCAGAGGGAGGCATGGCCTCT
+CTCAACTGCACTTCAAGTGATCGTAATTTTCAGTACTTCTGGTGGTACAGACAGCATTCT
+GGAGAAGGCCCCAAGGCACTGATGTCAATCTTCTCTGATGGTGACAAGAAAGAAGGCAGA
+TTCACAGCTCACCTCAATAAGGCCAGCCTGCATGTTTCCCTGCACATCAGAGACTCCCAG
+CCCAGTGACTCCGCTCTCTACTTCTGTGCAGCTAGTGAGCA
+>ENSMUST00000103643
+ATGCACAGCCTCCTGGGGTTGTTGTTGTGGCTGCAACTGACAAGGGTGAATAGTCAACTA
+GCAGAAGAGAATTCGTGGGCCCTGAGCGTCCACGAGGGTGAAAGTGTCACGGTGAATTGT
+AGTTACAAGACATCCATAACTGCCCTACAGTGGTACAGACAGAAGTCAGGCAAAGGCCCT
+GCCCAGCTAATCTTAATACGTTCAAATGAGAGAGAGAAGCGCAATGGAAGACTCAGAGCC
+ACCCTTGACACCTCCAGCCAGAGCAGCTCCTTGTCCATCACTGCTACTCGGTGTGAAGAC
+ACCGCTGTGTACTTCTGTGCTACTGATG
+>ENSMUST00000103581
+ATGCTCCTGGTTCTCATCTCGTTCCTCGGGATACATTTCTTCCTGGATGTCCAAACACAG
+ACAGTTTCCCAGTCTGATGCCCATGTCACTGTCTTCGAAGGAGACTCGGTGGAGCTGAGA
+TGCAACTATTCCTATGGTGGATCCATTTACCTCTCCTGGTACATCCAGCACCATGGCCAT
+GGCCTCCAGTTTCTCCTCAAGTACTATTCGGGAAACCCAGTGGTTCAAGGAGTGAACGGC
+TTCGAGGCTGAGTTCAGCAAGAGCGACTCTTCCTTCCACCTTCGGAAAGCCTCTGTGCAC
+TGGAGCGACTCGGCTGTGTACTTCTGTGCTGTGAGCG
+>ENSMUST00000181210
+ATGAACCTTTGTCCTGAACTGGGTATTCTACTCTTCCTAATGCTTGGAGAAAGCAATGGA
+GACTCAGTGACTCAGACAGAAGGCCCAGTGACACTGTCTGAAGGGACTTCTCTGACTGTG
+AACTGTTCCTATGAAACCAAACAGTACCCAACCCTGTTCTGGTATGTGCAGTATCCCGGA
+GAAGGTCCACAGCTCCTCTTTAAAGTCCCAAAGGCCAACGAGAAGGGAAGCAACAGAGGT
+TTTGAAGCTACATACAATAAAGAAGCCACCTCCTTCCACTTGCAGAAAGCCTCAGTGCAA
+GAGTCAGACTCGGCTGTGTACTACTGTGCTCTGAGTGA
+>ENSMUST00000103583
+ATGAAGACATCCCTTCACACTGTATTCCTATTCTTGTGGCTATGGATGGACTGGGAGAGC
+CATGGAGAGAAGGTCGAGCAACATGAGTCTACACTGAGTGTTCGAGAGGGAGACAGCGCT
+GTCATCAACTGCACTTACACAGATACTGCTTCATCATACTTCCCTTGGTACAAGCAAGAA
+GCTGGAAAGAGTCTCCACTTTGTGATAGACATTCGTTCAAATGTGGACAGAAAACAGAGC
+CAAAGACTTATAGTTTTGTTGGATAAGAAAGCCAAACGATTCTCCCTGCACATCACAGCC
+ACACAGCCTGAAGATTCAGCCATCTACTTCTGTGCAGCAAGCA
+>ENSMUST00000103584
+ATGGACTCTTCTCCAGGCTTCGTGGCTGTGATACTTCTCATACTTGGAAGGACCCACGGA
+GATTCCGTGACTCAAACAGAAGGCCCAGTGACCGTCTCAGAAAGCGAGTCCCTGATAATA
+AATTGCACGTATTCAGCCACAAGCATAGCTTACCCTAATCTTTTCTGGTATGTTCGATAT
+CCTGGAGAAGGTCTACAACTCCTCCTGAAAGTCATTACGGCTGGCCAGAAGGGAAGCAGC
+AGAGGGTTTGAAGCCACATACAATAAAGAAACCACCTCCTTCCACTTGCAGAAAGCCTCA
+GTGCAAGAGTCAGACTCGGCTGTGTACTACTGTGCTCTGGGTGA
+>ENSMUST00000103585
+ATGAAAAAGCGCCTGAGTGCCTGCTGGGTGGTCTTGTGGCTGCATTATCAGTGGGTGGCT
+GGCAAGACCCAAGTGGAGCAGAGTCCTCAGTCCCTGGTTGTCCGTCAGGGAGAGAACTGC
+GTCCTTCAATGTAATTACAGTGTGACCCCCGACAACCACTTAAGGTGGTTCAAACAGGAC
+ACAGGCAAAGGTCTTGTGTCCCTGACAGTCCTGGTTGACCAAAAAGACAAAACGTCAAAT
+GGGAGATACTCAGCAACTCTGGATAAAGATGCTAAGCACAGCACGCTGCACATCACAGCC
+ACCCTGCTGGATGACACTGCCACCTACATCTGTGTGGTGGGCG
+>ENSMUST00000200609
+ATGAAATCCTTGAGTGTTTCACTAGTGGTCCTGTGGCTCCAGTTTAATTGGGTGAGCAGC
+CAGCAGAAGGTGCAGCAGAGCCCAGAATCCCTCACTGTCTCAGAGGGAGCCATGGCCTCT
+CTCAACTGCACGTTCAGTGATGGTACTTCTAACAACTTCAGGTGGTACAGACAGCATTCT
+GCGAAAGGCCTTGAGGTGCTAGTGTCCATCTTCTCTGATGGTGAAAAGGAAGAAGGCAGA
+TTTACAGCTCACCTCAATAGAGCCAACTTGCATGTTTCCCTACACATCAGAGAACCACAA
+CCCAGTGACTCTGCTGTCTACCTCTGTGCAGTGAGCA
+>ENSMUST00000200115
+ATGAACATGCGTCCTGTCACCTCCTCAGTTCTCGTGCTCCTCCTAATGCTCAGAAGGAGC
+AATGGAGACTCCGTGACCCAGACAGAAGGCCTGGTCACTGTCACCGAGGGGTTGCCTGTG
+AAGCTGAACTGCACCTATCAGACTACTTATTTAACTATTGCCTTTTTCTGGTATGTGCAA
+TATCTCAACGAAGCCCCTCAGGTACTCCTGAAGAGCTCCACAGACAACAAGAGGACCGAG
+CACCAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCC
+TCAGCGCAGCTGTCAGACTCTGCCCTGTACTACTGTGCTCTGAGTGA
+>ENSMUST00000103651
+ATGAAGAGGCTGCTGAGCTCTCTGCTGGGGCTTCTGTGCACCCAGGTTTGCTGGGTGAAA
+GGACAGCAAGTGCAGCAGAGCCCCGCGTCCTTGGTTCTGCAGGAGGGGGAGAACGCAGAG
+CTGCAGTGTAACTTTTCCACATCTTTGAACAGTATGCAGTGGTTTTACCAACGTCCTGGG
+GGAAGTCTCGTCAGCCTGTTCTACAATCCTTCTGGGACAAAGCATAGTGGGAGACTGACA
+TCCACTACAGTCATCAAAGAACGTCGCAGCTCTTTGCACATTTCCTCCTCCCAGACAACA
+GACTCAGGCACTTATCTCTGTGCTTTGGAAC
+>ENSMUST00000198297
+ATGGACAAGATTCTGACAGCATCATTTTTACTCCTAGGCCTTCACCTAGCTGGGGTGAAT
+GGCCAGCAGAAGGAGAAACATGACCAGCAGCAGGTGAGACAAAGTCCCCAATCTCTGACA
+GTCTGGGAAGGAGGAACCACAGTTCTGACCTGCAGTTATGAGGACAGCACTTTTAACTAC
+TTCCCATGGTACCAACAGTTCCCTGGGGAAGGCCCTGCACTTCTGATATCCATACTTTCA
+GTGTCCGATAAAAAGGAAGATGGACGATTCACAACCTTCTTCAATAAAAGGGAGAAAAAG
+CTCTCCTTGCACATCATAGACTCTCAGCCTGGAGACTCAGCCACCTACTTCTGTGCAGCA
+AGTG
+>ENSMUST00000200101
+ATGCCTCCTCACAGCCTGCTCTGTGTGCTGGTGGCCTTGGCTTTCTCTGGATCTAATGTG
+GCCCAGAAAGTGATTCAGGTCTGGTCAACAACAAGCAGGCAGGAGGGCGAAAAACTCACA
+CTGGACTGTTCATATAAGACAAGTCAGGTCTTATACCATCTTTTCTGGTACAAGCACCTT
+CTTAGTGGAGAGATGGTTTTGCTTATTCGACAAATGCCTTCTACTATTGCAATAGAGAGG
+AGCGGCCGCTATTCTGTAGTCTTCCAGAAATCACGCAAATCCATCAGCCTTGTCATTTCA
+ACCTTACAACCAGACGATTCGGGAAAGTATTTCTGTGCTCTCTGGGAGCTGG
+>ENSMUST00000103654
+ATGCTCCTGGCGCTCCTCCCAGTGCTGGGGATACACTTTGTCCTGAGAGATGCCCAAGCT
+CAGTCAGTGACGCAGCCCGATGCTCGCGTCACTGTCTCTGAAGGAGCCTCTCTGCAGCTG
+AGATGCAAGTATTCCTACTCTGGGACACCTTATCTGTTCTGGTATGTCCAGTACCCGCGG
+CAGGGGCTGCAGCTGCTCCTCAAGTACTATTCAGGAGACCCAGTGGTTCAAGGAGTGAAT
+GGCTTCGAGGCTGAGTTCAGCAAGAGTAACTCTTCCTTCCACCTGCGGAAAGCCTCTGTG
+CACTGGAGCGACTCTGCTGTGTACTTCTGTGTTTTGAGCG
+>ENSMUST00000103655
+ATGCAGAGGAACCTGGGAGCTGTGCTGGGGATTCTGTGGGTGCAGATTTGCTGGGTGAGC
+GGAGATAAGGTGAAACAAAGTCCCTCAGCGCTGAGTCTCCAAGAAGGAACCAATTCTGCT
+CTGAGATGCAATTTTTCTATCGCCGCGACAACTGTGCAGTGGTTCCTACAGAATCCCAGG
+GGCAGCCTCATCAATCTTTTTTACCTGGTTCCAGGAACAAAGGAGAATGGGAGGTTAAAG
+TCAGCATTCGATTCTAAGGAGAGCTACAGCACCCTGCACATCAGGGATGCCCAGCTGGAG
+GACTCAGGCACTTACTTCTGTGCTGCTGAGG
+>ENSMUST00000180972
+ATGAACATGCGTCCTGACACCTGCTCAGTTCTTGTGCTCCTCTTAATGCTCAGAAGGAAC
+AATGGAGACTCTGTGACCCAGACAGAAGGCCTGGTCACTCTCACCGAGGGGTTGCCTGTG
+ATGCTGAACTGCACCTATCAGAGTACTTACTCACCTTTCCTTTTCTGGTATGTGCAACAT
+CTCAACGAAGCCCCTAAGCTACTTTTGAAGAGCTTCACAGACAACAAGAGGCCCGAGCAC
+CAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCCTCA
+GCGCAGCTGTCAGACTCTGCCCTGTACTACTGTGCTTTGAGTGA
+>ENSMUST00000103657
+ATGCGTCCTGGCACCTGCTCAGTTCTTGTGCTCCTCCTAATGCTCAGGAGGAGCAATGGA
+GATGGAGACTCAGTGACCCAGAAGGAAGGCCTGGTCACTCTCACCGAGGGGTTGCCTGTG
+ATGCTGAACTGCACCTATCAGACTATTTACTCAAATGCTTTCCTTTTCTGGTATGTGCAC
+TATCTCAATGAATCCCCTCGGCTACTCCTGAAGAGCTCCACAGACAACAAGAGGACCGAG
+CACCAAGGGTTCCACGCCACTCTCCATAAGAGCAGCAGCTCCTTCCATCTGCAGAAGTCC