changeset 0:9a1626faa05c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/berokka commit 387f04ffbf5205aaaa7b46e9e3d518edb62a538f
author iuc
date Mon, 25 Mar 2019 12:55:16 -0400
parents
children f91f6054fca7
files README.rst berokka.xml test-data/berokka_test1.fasta test-data/results_1 test-data/results_2 test-data/results_3 test-data/trimmed_1 test-data/trimmed_2 test-data/trimmed_3
diffstat 9 files changed, 440 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,14 @@
+Galaxy Wrapper for Berokka
+==========================
+
+Trim, circularise, orient & filter long read bacterial genome assemblies.
+
+Detailed Description
+--------------------
+
+View original Berokka documentation here: https://github.com/tseemann/berokka/blob/master/README.md
+
+License
+-------
+
+`GPLv3 <https://raw.githubusercontent.com/tseemann/berokka/master/LICENSE>`_
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/berokka.xml	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,107 @@
+<tool id="berokka" name="Berokka" version="0.2">
+    <description>Trim, circularise, orient and filter long read bacterial genome assemblies</description>
+    <requirements>
+        <requirement type="package" version="0.2">berokka</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## berokka writes its results into ./default; the outputs below are collected from that directory.
+        berokka
+            --outdir default '${input_file}'
+
+            ## The filter FASTA is optional: only pass --filter when a dataset was supplied.
+            #if $filter_fasta:
+                --filter '${filter_fasta}'
+            #end if
+
+            --readlen '${read_length}'
+
+            --fuzz '${fuzz}'
+
+            ## "Annotation" unchecked: add --noanno so the FASTA descriptions are left unaltered.
+            #unless $anno:
+                --noanno
+            #end unless
+
+    ]]></command>
+    <inputs>
+        <!-- Assembly to process; handed to berokka as its positional argument -->
+        <param name="input_file" type="data" format="fasta" label="Input (FASTA)" help="Should be completed long-read assemblies in FASTA format, such as those from CANU or HGAP"/>
+        <!-- Optional: when left empty, no filter option is added to the command line -->
+        <param name="filter_fasta" optional="true" type="data" format="fasta" label="Filter (FASTA)" help="Give a fasta to use as a filter."/>
+        <param name="read_length" type="integer" value="60000" min="28" label="Read Length" help="Approximate max read length (default '60000')"/>
+        <!-- fuzz is a distance in bp, so negative values are rejected up front -->
+        <param name="fuzz" type="integer" value="5" min="0" label="Fuzz" help="Accept local alignment within X bp of global (default '5')"/>
+        <!-- Unchecking this adds the noanno flag to the command line -->
+        <param name="anno" type="boolean" checked="true" label="Annotation" help="Annotate Trimmed FASTA"/>
+    </inputs>
+    <outputs>
+        <!-- Both datasets are picked up from the "default" output directory named on the berokka command line -->
+        <data name="trimmed" format="fasta" label="${tool.name} on ${on_string}: Trimmed" from_work_dir="default/02.trimmed.fa"/>
+        <data name="results" format="tabular" label="${tool.name} on ${on_string}: Results" from_work_dir="default/03.results.tab"/>
+    </outputs>
+    <tests>
+        <!-- Test 1: default settings, no filter dataset -->
+        <test>
+            <param name="input_file" value="berokka_test1.fasta"/>
+            <param name="read_length" value="60000"/>
+            <param name="fuzz" value="5"/>
+            <param name="anno" value="true"/>
+            <output name="trimmed" file="trimmed_1" ftype="fasta"/>
+            <output name="results" file="results_1" ftype="tabular"/>
+        </test>
+        <!-- Test 2: optional filter FASTA supplied (the input file itself is reused as the filter) -->
+        <test>
+            <param name="input_file" value="berokka_test1.fasta"/>
+            <param name="filter_fasta" value="berokka_test1.fasta"/>
+            <param name="read_length" value="60000"/>
+            <param name="fuzz" value="5"/>
+            <param name="anno" value="true"/>
+            <output name="trimmed" file="trimmed_2" ftype="fasta"/>
+            <output name="results" file="results_2" ftype="tabular"/>
+        </test>
+        <!-- Test 3: short read length, wider fuzz, annotation switched off -->
+        <test>
+            <param name="input_file" value="berokka_test1.fasta"/>
+            <param name="read_length" value="100"/>
+            <param name="fuzz" value="50"/>
+            <param name="anno" value="false"/>
+            <output name="trimmed" file="trimmed_3" ftype="fasta"/>
+            <output name="results" file="results_3" ftype="tabular"/>
+        </test>
+    </tests>
+   <help><![CDATA[
+**Summary**
+
+Trim, circularise, orient & filter long read bacterial genome assemblies.
+
+There is already a good piece of software to trim/circularise and orient genome assemblies called Circlator. Please try that first!
+
+You should only try Berokka if:
+
+1. You only have the contig files and do not have the corrected reads anymore
+2. Your contigs are simple cases with clear overhang and could be done manually with BLAST
+3. Circlator fails on your data even after troubleshooting
+
+NOTE: orientation to dnaA or rep genes is not yet implemented.
+
+**Input**
+
+Input should be completed long-read assemblies in FASTA format, such as those from CANU or HGAP.
+
+**Output**
+
+1. trimmed: The (possibly) trimmed sequences (FASTA)
+
+2. results: Summary of results (TSV)
+
+**Options**
+
+* `Filter <FASTA>` allows you to remove contigs which match 50% of sequences in this file. Berokka comes with the standard Pacbio control sequence. You can provide your own FASTA file using this option.
+
+* `Read Length <LENGTH>` can be used for datasets that won't seem to circularise. It affects the length of the match it attempts to make using BLAST.
+
+* `Fuzz` can be used to accept local alignment within X bp of global (default '5')
+
+* `Annotation` can be set to "No" to ensure that the FASTA descriptions are not altered between the input and output FASTA files.
+
+    ]]></help>
+    <citations>
+        <citation type="bibtex">
+@UNPUBLISHED{Seemann2016,
+    author = {Seemann, Torsten},
+    title = {Berokka: Faster Trim, circularise and orient long read bacterial genome assemblies},
+    year = {2016},
+    url = {https://github.com/tseemann/berokka},
+}
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/berokka_test1.fasta	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,70 @@
+>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence
+CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAGTTACGGGAAC
+TCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGCGTATGGCCCGCATGCCGCCA
+ATGAAGCGGCGCACGAAAAACGCGAAAGCGT
+
+>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence
+CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTATTTGTACATGG
+AGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTACTGTTTACCCCTGTGACAAAA
+GCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAGCGT
+
+>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence
+TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGTGA
+AGCGGCGCACGAAAAACGCGAAAGCGT
+
+>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence
+GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCGATAAATATTC
+ACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGCAAAATAATACTTATTTATAA
+TCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCGTGACATCGACTGCATTATCAATTTGTTCCA
+TCCAGGCGAAAAAGTTCAGCGTCTGTTCTGATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACC
+TAATACGATGTTACCGTCATTTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTG
+TTGCTCAACAAAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC
+GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACGGCCACTTTAT
+TGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCTATGTTTTTATCAATATCTTC
+TGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTAAAGTGAATTTCGCCAGAACCTTCATCAGCA
+GCATAAACAGGTGCAGTGAACAGCAGAGATACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAA
+CAGAGTCCTTTAAGGATATAGAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTT
+TTAGTTTTGCTTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT
+TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATGCGACTACCAT
+GAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTATTCTTTTTAAATATCTATATA
+GGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTTTAGGCATATAAAAATCAAGCCCGCCATATG
+AACGGCGGGTTAAAATATTTACAACTTAGCAATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCG
+TTTTTCATATCCTGTATACAGCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAG
+TGGGCGATCCAGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG
+CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGTACGGGTCGTT
+CAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCGTGCCCAGCTCTTTCAGCACT
+TCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGTAATTTTTGTACACGCGGTGACCGAAGCCCA
+TCAGGCGGAAAGAATCATTTTTGTCTTTCGCACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGAT
+TTCTTCCAGCATTTTCAGCGCCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCT
+GCTGCGATACAGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT
+GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAACTTCATACGG
+TTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGAGATCGTTGCGCGGGTAAACA
+AATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCATGGTCGGCATTTTCGACAGCAGGCGGAACG
+CGGCAATTTCACGGTGACGAGGATTGTTAACATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGT
+AATACCACACATGACTGCCATTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCG
+TGGATCATGGTATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT
+TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGAAACCGCGGTG
+CAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGGATGCGGTTGAAGTGAAGCCT
+GGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGATATCAATAACATCTTGACCCAGCGTGCCTT
+TCAGCACATCCAGTTCAACAGCTGTATCCCCGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGG
+TCTCCTTAGCGCCTTATTGCGTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTAC
+CCGTTTTTTATTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA
+GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTATTAAACTTGTC
+TACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTACATAAGTTAATCTTAGGTGAA
+ATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCTGGTAATGTTTGTAACAACTTTGTTGAATGA
+TTGTCAAATTAGATGATTAAAAATTAAATAAATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAA
+CCCGACAAACTATATGTAGGTTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAA
+CACCCTGCAATCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG
+ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCCGACGTCTCCA
+GGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGAACAGCATGTGGGCGTTATTC
+ATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGGACCTACAGACCATCCGGTTCCCCATCACGG
+CGATAGCGTCCATTCTCCATCGCGTTTCCGGTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCT
+TCTGGGTACCAGCCTCTCTTCCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTC
+AAATTTATCATGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA
+TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCTTTGTTATTAC
+TGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTCCGCATTAGGACGCAATGGCG
+TACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCTGACGCTCTACATCATTTATATGGTCGGTTT
+TTTCGCTACCAGTGGCGAGCTGACATATGAAGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTG
+TTCACCCTGCTGGCGCTGTTTTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACT
+ACGTTAAACCGCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT
+TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCAGTTGTGATTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results_1	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,5 @@
+#sequence	status	old_len	new_len	trimmed
+gi|145231|gb|M33724.1|ECOALPHOA	kept	171	171	0
+gi|145232|gb|M33725.1|ECOALPHOB	kept	183	183	0
+gi|145234|gb|M33727.1|ECOALPHOE	kept	97	97	0
+gi|146195|gb|J01619.1|ECOGLTA	kept	3850	3850	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results_2	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,5 @@
+#sequence	status	old_len	new_len	trimmed
+gi|145231|gb|M33724.1|ECOALPHOA	kept	171	171	0
+gi|145232|gb|M33725.1|ECOALPHOB	kept	183	183	0
+gi|145234|gb|M33727.1|ECOALPHOE	kept	97	97	0
+gi|146195|gb|J01619.1|ECOGLTA	kept	3850	3850	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results_3	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,5 @@
+#sequence	status	old_len	new_len	trimmed
+gi|145231|gb|M33724.1|ECOALPHOA	kept	171	171	0
+gi|145232|gb|M33725.1|ECOALPHOB	kept	183	183	0
+gi|145234|gb|M33727.1|ECOALPHOE	kept	97	97	0
+gi|146195|gb|J01619.1|ECOGLTA	kept	3850	3850	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/trimmed_1	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,78 @@
+>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence
+CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAG
+TTACGGGAACTCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGC
+GTATGGCCCGCATGCCGCCAATGAAGCGGCGCACGAAAAACGCGAAAGCGT
+>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence
+CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTAT
+TTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTAC
+TGTTTACCCCTGTGACAAAAGCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAG
+CGT
+>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence
+TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATT
+GCACTGGTGAAGCGGCGCACGAAAAACGCGAAAGCGT
+>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence
+GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCG
+ATAAATATTCACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGC
+AAAATAATACTTATTTATAATCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCG
+TGACATCGACTGCATTATCAATTTGTTCCATCCAGGCGAAAAAGTTCAGCGTCTGTTCTG
+ATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACCTAATACGATGTTACCGTCAT
+TTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTGTTGCTCAACA
+AAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC
+GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACG
+GCCACTTTATTGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCT
+ATGTTTTTATCAATATCTTCTGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTA
+AAGTGAATTTCGCCAGAACCTTCATCAGCAGCATAAACAGGTGCAGTGAACAGCAGAGAT
+ACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAACAGAGTCCTTTAAGGATATA
+GAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTTTTAGTTTTGC
+TTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT
+TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATG
+CGACTACCATGAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTAT
+TCTTTTTAAATATCTATATAGGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTT
+TAGGCATATAAAAATCAAGCCCGCCATATGAACGGCGGGTTAAAATATTTACAACTTAGC
+AATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCGTTTTTCATATCCTGTATACA
+GCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAGTGGGCGATCC
+AGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG
+CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGT
+ACGGGTCGTTCAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCG
+TGCCCAGCTCTTTCAGCACTTCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGT
+AATTTTTGTACACGCGGTGACCGAAGCCCATCAGGCGGAAAGAATCATTTTTGTCTTTCG
+CACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGATTTCTTCCAGCATTTTCAGCG
+CCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCTGCTGCGATAC
+AGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT
+GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAA
+CTTCATACGGTTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGA
+GATCGTTGCGCGGGTAAACAAATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCA
+TGGTCGGCATTTTCGACAGCAGGCGGAACGCGGCAATTTCACGGTGACGAGGATTGTTAA
+CATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGTAATACCACACATGACTGCCA
+TTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCGTGGATCATGG
+TATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT
+TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGA
+AACCGCGGTGCAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGG
+ATGCGGTTGAAGTGAAGCCTGGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGA
+TATCAATAACATCTTGACCCAGCGTGCCTTTCAGCACATCCAGTTCAACAGCTGTATCCC
+CGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGGTCTCCTTAGCGCCTTATTGC
+GTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTACCCGTTTTTTA
+TTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA
+GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTAT
+TAAACTTGTCTACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTAC
+ATAAGTTAATCTTAGGTGAAATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCT
+GGTAATGTTTGTAACAACTTTGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATA
+AATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAACCCGACAAACTATATGTAGG
+TTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAACACCCTGCAA
+TCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG
+ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCC
+GACGTCTCCAGGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGA
+ACAGCATGTGGGCGTTATTCATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGG
+ACCTACAGACCATCCGGTTCCCCATCACGGCGATAGCGTCCATTCTCCATCGCGTTTCCG
+GTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCTTCTGGGTACCAGCCTCTCTT
+CCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTCAAATTTATCA
+TGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA
+TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCT
+TTGTTATTACTGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTC
+CGCATTAGGACGCAATGGCGTACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCT
+GACGCTCTACATCATTTATATGGTCGGTTTTTTCGCTACCAGTGGCGAGCTGACATATGA
+AGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTGTTCACCCTGCTGGCGCTGTT
+TTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACTACGTTAAACC
+GCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT
+TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCA
+GTTGTGATTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/trimmed_2	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,78 @@
+>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence
+CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAG
+TTACGGGAACTCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGC
+GTATGGCCCGCATGCCGCCAATGAAGCGGCGCACGAAAAACGCGAAAGCGT
+>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence
+CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTAT
+TTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTAC
+TGTTTACCCCTGTGACAAAAGCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAG
+CGT
+>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence
+TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATT
+GCACTGGTGAAGCGGCGCACGAAAAACGCGAAAGCGT
+>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence
+GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCG
+ATAAATATTCACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGC
+AAAATAATACTTATTTATAATCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCG
+TGACATCGACTGCATTATCAATTTGTTCCATCCAGGCGAAAAAGTTCAGCGTCTGTTCTG
+ATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACCTAATACGATGTTACCGTCAT
+TTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTGTTGCTCAACA
+AAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC
+GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACG
+GCCACTTTATTGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCT
+ATGTTTTTATCAATATCTTCTGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTA
+AAGTGAATTTCGCCAGAACCTTCATCAGCAGCATAAACAGGTGCAGTGAACAGCAGAGAT
+ACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAACAGAGTCCTTTAAGGATATA
+GAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTTTTAGTTTTGC
+TTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT
+TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATG
+CGACTACCATGAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTAT
+TCTTTTTAAATATCTATATAGGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTT
+TAGGCATATAAAAATCAAGCCCGCCATATGAACGGCGGGTTAAAATATTTACAACTTAGC
+AATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCGTTTTTCATATCCTGTATACA
+GCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAGTGGGCGATCC
+AGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG
+CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGT
+ACGGGTCGTTCAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCG
+TGCCCAGCTCTTTCAGCACTTCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGT
+AATTTTTGTACACGCGGTGACCGAAGCCCATCAGGCGGAAAGAATCATTTTTGTCTTTCG
+CACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGATTTCTTCCAGCATTTTCAGCG
+CCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCTGCTGCGATAC
+AGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT
+GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAA
+CTTCATACGGTTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGA
+GATCGTTGCGCGGGTAAACAAATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCA
+TGGTCGGCATTTTCGACAGCAGGCGGAACGCGGCAATTTCACGGTGACGAGGATTGTTAA
+CATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGTAATACCACACATGACTGCCA
+TTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCGTGGATCATGG
+TATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT
+TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGA
+AACCGCGGTGCAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGG
+ATGCGGTTGAAGTGAAGCCTGGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGA
+TATCAATAACATCTTGACCCAGCGTGCCTTTCAGCACATCCAGTTCAACAGCTGTATCCC
+CGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGGTCTCCTTAGCGCCTTATTGC
+GTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTACCCGTTTTTTA
+TTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA
+GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTAT
+TAAACTTGTCTACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTAC
+ATAAGTTAATCTTAGGTGAAATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCT
+GGTAATGTTTGTAACAACTTTGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATA
+AATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAACCCGACAAACTATATGTAGG
+TTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAACACCCTGCAA
+TCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG
+ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCC
+GACGTCTCCAGGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGA
+ACAGCATGTGGGCGTTATTCATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGG
+ACCTACAGACCATCCGGTTCCCCATCACGGCGATAGCGTCCATTCTCCATCGCGTTTCCG
+GTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCTTCTGGGTACCAGCCTCTCTT
+CCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTCAAATTTATCA
+TGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA
+TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCT
+TTGTTATTACTGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTC
+CGCATTAGGACGCAATGGCGTACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCT
+GACGCTCTACATCATTTATATGGTCGGTTTTTTCGCTACCAGTGGCGAGCTGACATATGA
+AGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTGTTCACCCTGCTGGCGCTGTT
+TTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACTACGTTAAACC
+GCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT
+TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCA
+GTTGTGATTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/trimmed_3	Mon Mar 25 12:55:16 2019 -0400
@@ -0,0 +1,78 @@
+>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence
+CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAG
+TTACGGGAACTCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGC
+GTATGGCCCGCATGCCGCCAATGAAGCGGCGCACGAAAAACGCGAAAGCGT
+>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence
+CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTAT
+TTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTAC
+TGTTTACCCCTGTGACAAAAGCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAG
+CGT
+>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence
+TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATT
+GCACTGGTGAAGCGGCGCACGAAAAACGCGAAAGCGT
+>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence
+GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCG
+ATAAATATTCACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGC
+AAAATAATACTTATTTATAATCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCG
+TGACATCGACTGCATTATCAATTTGTTCCATCCAGGCGAAAAAGTTCAGCGTCTGTTCTG
+ATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACCTAATACGATGTTACCGTCAT
+TTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTGTTGCTCAACA
+AAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC
+GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACG
+GCCACTTTATTGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCT
+ATGTTTTTATCAATATCTTCTGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTA
+AAGTGAATTTCGCCAGAACCTTCATCAGCAGCATAAACAGGTGCAGTGAACAGCAGAGAT
+ACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAACAGAGTCCTTTAAGGATATA
+GAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTTTTAGTTTTGC
+TTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT
+TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATG
+CGACTACCATGAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTAT
+TCTTTTTAAATATCTATATAGGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTT
+TAGGCATATAAAAATCAAGCCCGCCATATGAACGGCGGGTTAAAATATTTACAACTTAGC
+AATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCGTTTTTCATATCCTGTATACA
+GCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAGTGGGCGATCC
+AGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG
+CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGT
+ACGGGTCGTTCAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCG
+TGCCCAGCTCTTTCAGCACTTCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGT
+AATTTTTGTACACGCGGTGACCGAAGCCCATCAGGCGGAAAGAATCATTTTTGTCTTTCG
+CACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGATTTCTTCCAGCATTTTCAGCG
+CCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCTGCTGCGATAC
+AGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT
+GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAA
+CTTCATACGGTTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGA
+GATCGTTGCGCGGGTAAACAAATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCA
+TGGTCGGCATTTTCGACAGCAGGCGGAACGCGGCAATTTCACGGTGACGAGGATTGTTAA
+CATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGTAATACCACACATGACTGCCA
+TTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCGTGGATCATGG
+TATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT
+TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGA
+AACCGCGGTGCAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGG
+ATGCGGTTGAAGTGAAGCCTGGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGA
+TATCAATAACATCTTGACCCAGCGTGCCTTTCAGCACATCCAGTTCAACAGCTGTATCCC
+CGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGGTCTCCTTAGCGCCTTATTGC
+GTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTACCCGTTTTTTA
+TTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA
+GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTAT
+TAAACTTGTCTACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTAC
+ATAAGTTAATCTTAGGTGAAATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCT
+GGTAATGTTTGTAACAACTTTGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATA
+AATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAACCCGACAAACTATATGTAGG
+TTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAACACCCTGCAA
+TCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG
+ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCC
+GACGTCTCCAGGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGA
+ACAGCATGTGGGCGTTATTCATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGG
+ACCTACAGACCATCCGGTTCCCCATCACGGCGATAGCGTCCATTCTCCATCGCGTTTCCG
+GTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCTTCTGGGTACCAGCCTCTCTT
+CCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTCAAATTTATCA
+TGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA
+TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCT
+TTGTTATTACTGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTC
+CGCATTAGGACGCAATGGCGTACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCT
+GACGCTCTACATCATTTATATGGTCGGTTTTTTCGCTACCAGTGGCGAGCTGACATATGA
+AGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTGTTCACCCTGCTGGCGCTGTT
+TTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACTACGTTAAACC
+GCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT
+TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCA
+GTTGTGATTG