changeset 14:56b924f62165 draft

"Update tests files for Galaxy tool wrapper"
author althonos
date Tue, 05 Apr 2022 23:18:49 +0000
parents fde43648cba0
children 64528877558f
files CHANGELOG.md gecco.xml test-data/BGC0001866.1_cluster_1.gbk test-data/clusters.tsv test-data/features.tsv test-data/genes.tsv test-data/sideload.json
diffstat 7 files changed, 147 insertions(+), 106 deletions(-) [+]
line wrap: on
line diff
--- a/CHANGELOG.md	Thu Mar 31 18:00:15 2022 +0000
+++ b/CHANGELOG.md	Tue Apr 05 23:18:49 2022 +0000
@@ -5,7 +5,14 @@
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
-[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha4...master
+[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1...master
+
+## [v0.9.1] - 2022-04-05
+[v0.9.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha4...v0.9.1
+
+### Changed
+- Make the `genes.tsv` and `features.tsv` tables contain all genes, even when they come from a contig too short to be processed by the CRF sliding window.
+- Replaced the `--force-clusters-tsv` flag with a `--force-tsv` flag to force writing TSV tables even when no genes or clusters were found in `gecco run` or `gecco annotate`.
 
 ## [v0.9.1-alpha4] - 2022-03-31
 [v0.9.1-alpha4]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha3...v0.9.1-alpha4
@@ -15,7 +22,7 @@
 $ python -m gecco -vv train --c1 0.4 --c2 0 --select 0.25 --window-size 20 \
          -f mibig-2.0.proG2.Pfam-v35.0.features.tsv \
          -c mibig-2.0.proG2.clusters.tsv \
-         -g GECCO-data/data/embeddings/mibig-2.0.proG2.genes.gff \
+         -g GECCO-data/data/embeddings/mibig-2.0.proG2.genes.tsv \
          -o models/v0.9.1-alpha4
 ```
 
--- a/gecco.xml	Thu Mar 31 18:00:15 2022 +0000
+++ b/gecco.xml	Tue Apr 05 23:18:49 2022 +0000
@@ -1,8 +1,8 @@
 <?xml version='1.0' encoding='utf-8'?>
-<tool id="gecco" name="GECCO" version="0.8.10" python_template_version="3.5">
+<tool id="gecco" name="GECCO" version="0.9.1" python_template_version="3.5">
     <description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
     <requirements>
-        <requirement type="package" version="0.8.10">gecco</requirement>
+        <requirement type="package" version="0.9.1">gecco</requirement>
     </requirements>
     <version_command>gecco --version</version_command>
     <command detect_errors="aggressive"><![CDATA[
@@ -18,8 +18,10 @@
         --format $input.ext
         --genome input_tempfile.$file_extension
         --postproc $postproc
-        --edge-distance $edge_distance
-        --force-clusters-tsv
+        --force-tsv
+        #if $edge_distance
+            --edge-distance $edge_distance
+        #end if
         #if $mask
             --mask
         #end if
@@ -33,6 +35,7 @@
             --antismash-sideload
         #end if
 
+        && mv input_tempfile.genes.tsv '$genes'
         && mv input_tempfile.features.tsv '$features'
         && mv input_tempfile.clusters.tsv '$clusters'
         #if $antismash_sideload
@@ -49,13 +52,14 @@
             <option value="antismash">antiSMASH</option>
             <option value="gecco" selected="true">GECCO</option>
         </param>
-        <param argument="--edge-distance" type="integer" min="0" value="10" label="Number of genes from the contig edges to filter out"/>
+        <param argument="--edge-distance" type="integer" min="0" optional="true" value="" label="Number of genes from the contig edges to filter out"/>
         <param argument="--antismash-sideload" type="boolean" checked="false" label="Generate an antiSMASH v6 sideload JSON file"/>
     </inputs>
     <outputs>
         <collection name="records" type="list" label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)">
             <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gbk" ext="genbank" visible="false" />
         </collection>
+        <data name="genes" format="tabular" label="${tool.name} summary of detected genes on ${on_string} (TSV)"/>
         <data name="features" format="tabular" label="${tool.name} summary of detected features on ${on_string} (TSV)"/>
         <data name="clusters" format="tabular" label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/>
         <data name="sideload" format="json" label="antiSMASH v6 sideload file with ${tool.name} detected BGCs on ${on_string} (JSON)">
@@ -66,12 +70,14 @@
         <test>
             <param name="input" value="BGC0001866.fna"/>
             <output name="features" file="features.tsv"/>
+            <output name="genes" file="genes.tsv"/>
             <output name="clusters" file="clusters.tsv"/>
         </test>
         <test>
             <param name="input" value="BGC0001866.fna"/>
             <param name="edge_distance" value="0"/>
             <output name="features" file="features.tsv"/>
+            <output name="genes" file="genes.tsv"/>
             <output name="clusters" file="clusters.tsv"/>
             <output_collection name="records" type="list">
                 <element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" compare="diff" lines_diff="4"/>
@@ -82,6 +88,7 @@
             <param name="antismash_sideload" value="True"/>
             <param name="edge_distance" value="0"/>
             <output name="features" file="features.tsv"/>
+            <output name="genes" file="genes.tsv"/>
             <output name="clusters" file="clusters.tsv"/>
             <output name="sideload" file="sideload.json"/>
             <output_collection name="records" type="list">
@@ -107,8 +114,9 @@
 
 GECCO will create the following files once done (using the same prefix as the input file):
 
-- ``features.tsv``: The features file, containing the identified proteins and domains in the input sequences.
-- ``clusters.tsv``: If any were found, a clusters file, containing the coordinates of the predicted clusters, along their putative biosynthetic type.
+- ``genes.tsv``: The genes file, containing the genes identified in the input sequences.
+- ``features.tsv``: The features file, containing the protein domains identified in the input sequences.
+- ``clusters.tsv``: A clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type.
 - ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
 
 Contact
--- a/test-data/BGC0001866.1_cluster_1.gbk	Thu Mar 31 18:00:15 2022 +0000
+++ b/test-data/BGC0001866.1_cluster_1.gbk	Tue Apr 05 23:18:49 2022 +0000
@@ -1,4 +1,4 @@
-LOCUS       BGC0001866.1_cluster_1 32633 bp    DNA     linear   UNK 21-NOV-2021
+LOCUS       BGC0001866.1_cluster_1 32633 bp    DNA     linear   UNK 06-APR-2022
 DEFINITION  BGC0001866.1 Byssochlamys spectabilis strain CBS 101075 chromosome
             Unknown C8Q69scaffold_14, whole genome shotgun sequence.
 ACCESSION   BGC0001866.1_cluster_1
@@ -15,15 +15,15 @@
   JOURNAL   bioRxiv (2021.05.03.442509)
   REMARK    doi:10.1101/2021.05.03.442509
 COMMENT     ##GECCO-Data-START##
-            version                :: GECCO v0.8.10
-            creation_date          :: 2021-11-21T16:33:58.470847
+            version                :: GECCO v0.9.1
+            creation_date          :: 2022-04-06T01:08:36.965708
             biosyn_class           :: Polyketide
-            alkaloid_probability   :: 0.0
-            polyketide_probability :: 0.98
+            alkaloid_probability   :: 0.010000000000000009
+            polyketide_probability :: 0.96
             ripp_probability       :: 0.0
             saccharide_probability :: 0.0
-            terpene_probability    :: 0.0
-            nrp_probability        :: 0.09999999999999998
+            terpene_probability    :: 0.010000000000000009
+            nrp_probability        :: 0.14
             ##GECCO-Data-END##
 FEATURES             Location/Qualifiers
      CDS             complement(1..1143)
@@ -41,7 +41,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00394"
                      /db_xref="InterPro:IPR001117"
-                     /note="e-value: 2.1941888078432915e-08"
+                     /note="e-value: 2.262067179461254e-08"
                      /note="p-value: 8.178117062405111e-12"
                      /function="Multicopper oxidase"
                      /standard_name="PF00394"
@@ -49,7 +49,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF07731"
                      /db_xref="InterPro:IPR011706"
-                     /note="e-value: 3.9374169295176556e-23"
+                     /note="e-value: 4.059222969454281e-23"
                      /note="p-value: 1.467542649838858e-26"
                      /function="Multicopper oxidase"
                      /standard_name="PF07731"
@@ -93,7 +93,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00891"
                      /db_xref="InterPro:IPR001077"
-                     /note="e-value: 4.743887678074703e-16"
+                     /note="e-value: 4.890642309934635e-16"
                      /note="p-value: 1.7681280946979883e-19"
                      /function="O-methyltransferase domain"
                      /standard_name="PF00891"
@@ -108,7 +108,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00135"
                      /db_xref="InterPro:IPR002018"
-                     /note="e-value: 4.674605664377319e-21"
+                     /note="e-value: 4.819217021121008e-21"
                      /note="p-value: 1.7423055029360116e-24"
                      /function="Carboxylesterase family"
                      /standard_name="PF00135"
@@ -123,7 +123,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00135"
                      /db_xref="InterPro:IPR002018"
-                     /note="e-value: 3.9706994470948554e-30"
+                     /note="e-value: 4.0935350990176556e-30"
                      /note="p-value: 1.4799476135277136e-33"
                      /function="Carboxylesterase family"
                      /standard_name="PF00135"
@@ -140,7 +140,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00135"
                      /db_xref="InterPro:IPR002018"
-                     /note="e-value: 1.4185801852307574e-15"
+                     /note="e-value: 1.4624647008379705e-15"
                      /note="p-value: 5.287291037013632e-19"
                      /function="Carboxylesterase family"
                      /standard_name="PF00135"
@@ -160,7 +160,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF13434"
                      /db_xref="InterPro:IPR025700"
-                     /note="e-value: 5.777178703900199e-08"
+                     /note="e-value: 5.955898730893757e-08"
                      /note="p-value: 2.153253337271785e-11"
                      /function="L-lysine 6-monooxygenase (NADPH-requiring)"
                      /standard_name="PF13434"
@@ -168,7 +168,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00743"
                      /db_xref="InterPro:IPR020946"
-                     /note="e-value: 5.089108077410868e-07"
+                     /note="e-value: 5.246542281818287e-07"
                      /note="p-value: 1.8967976434628658e-10"
                      /function="Flavin-binding monooxygenase-like"
                      /standard_name="PF00743"
@@ -202,7 +202,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF07690"
                      /db_xref="InterPro:IPR011701"
-                     /note="e-value: 5.839871260376694e-37"
+                     /note="e-value: 6.020530714201243e-37"
                      /note="p-value: 2.1766199255969786e-40"
                      /function="Major Facilitator Superfamily"
                      /standard_name="PF07690"
@@ -210,7 +210,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF06609"
                      /db_xref="InterPro:IPR010573"
-                     /note="e-value: 9.543170598318239e-09"
+                     /note="e-value: 9.83839354265682e-09"
                      /note="p-value: 3.55690294383833e-12"
                      /function="Fungal trichothecene efflux pump (TRI12)"
                      /standard_name="PF06609"
@@ -235,8 +235,8 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF08493"
                      /db_xref="InterPro:IPR013700"
-                     /note="e-value: 2.6165794251055913e-17"
-                     /note="p-value: 9.752439154325723e-21"
+                     /note="e-value: 2.686865976406516e-17"
+                     /note="p-value: 9.713904470016327e-21"
                      /function="Aflatoxin regulatory protein"
                      /standard_name="PF08493"
      CDS             16827..18797
@@ -259,7 +259,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00109"
                      /db_xref="InterPro:IPR014030"
-                     /note="e-value: 9.025888536170949e-60"
+                     /note="e-value: 9.30510909096118e-60"
                      /note="p-value: 3.364103069761815e-63"
                      /function="Beta-ketoacyl synthase, N-terminal domain"
                      /standard_name="PF00109"
@@ -267,23 +267,23 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF02801"
                      /db_xref="InterPro:IPR014031"
-                     /note="e-value: 2.2171445990751238e-35"
+                     /note="e-value: 2.2857331200304854e-35"
                      /note="p-value: 8.263677223537547e-39"
                      /function="Beta-ketoacyl synthase, C-terminal domain"
                      /standard_name="PF02801"
-     misc_feature    17937..18287
+     misc_feature    17937..18290
                      /inference="protein motif"
                      /db_xref="PFAM:PF16197"
                      /db_xref="InterPro:IPR032821"
-                     /note="e-value: 3.8698172759236842e-25"
-                     /note="p-value: 1.4423471024687604e-28"
+                     /note="e-value: 4.800730099641783e-25"
+                     /note="p-value: 1.7356218726109122e-28"
                      /function="Ketoacyl-synthetase C-terminal extension"
                      /standard_name="PF16197"
      misc_feature    18360..18770
                      /inference="protein motif"
                      /db_xref="PFAM:PF00698"
                      /db_xref="InterPro:IPR014043"
-                     /note="e-value: 1.0799913424517567e-26"
+                     /note="e-value: 1.113401436161595e-26"
                      /note="p-value: 4.025312495161225e-30"
                      /function="Acyl transferase domain"
                      /standard_name="PF00698"
@@ -314,7 +314,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00698"
                      /db_xref="InterPro:IPR014043"
-                     /note="e-value: 2.639223271303753e-16"
+                     /note="e-value: 2.7208690154402465e-16"
                      /note="p-value: 9.836836642950999e-20"
                      /function="Acyl transferase domain"
                      /standard_name="PF00698"
@@ -322,14 +322,14 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF14765"
                      /db_xref="InterPro:IPR020807"
-                     /note="e-value: 2.520598829779557e-60"
+                     /note="e-value: 2.598574865139864e-60"
                      /note="p-value: 9.394703055458656e-64"
                      /function="Polyketide synthase dehydratase"
                      /standard_name="PF14765"
      misc_feature    20786..21256
                      /inference="protein motif"
                      /db_xref="PFAM:PF13489"
-                     /note="e-value: 1.0131254482174088e-12"
+                     /note="e-value: 1.04446701072283e-12"
                      /note="p-value: 3.776091868123029e-16"
                      /function="Methyltransferase domain"
                      /standard_name="PF13489"
@@ -337,23 +337,23 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF13847"
                      /db_xref="InterPro:IPR025714"
-                     /note="e-value: 8.939870258494623e-11"
-                     /note="p-value: 3.332042586095648e-14"
+                     /note="e-value: 8.752004453621267e-11"
+                     /note="p-value: 3.1641375465008194e-14"
                      /function="Methyltransferase domain"
                      /standard_name="PF13847"
      misc_feature    20804..21097
                      /inference="protein motif"
                      /db_xref="PFAM:PF13649"
                      /db_xref="InterPro:IPR041698"
-                     /note="e-value: 2.319131521369124e-13"
-                     /note="p-value: 8.643799930559537e-17"
+                     /note="e-value: 2.4253465299984994e-13"
+                     /note="p-value: 8.76842563267715e-17"
                      /function="Methyltransferase domain"
                      /standard_name="PF13649"
      misc_feature    20807..21103
                      /inference="protein motif"
                      /db_xref="PFAM:PF08242"
                      /db_xref="InterPro:IPR013217"
-                     /note="e-value: 3.6288099491186147e-22"
+                     /note="e-value: 3.7410690716593694e-22"
                      /note="p-value: 1.3525195486837923e-25"
                      /function="Methyltransferase domain"
                      /standard_name="PF08242"
@@ -361,7 +361,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF08241"
                      /db_xref="InterPro:IPR013216"
-                     /note="e-value: 5.245291385894328e-12"
+                     /note="e-value: 5.4075572021556884e-12"
                      /note="p-value: 1.9550098344742185e-15"
                      /function="Methyltransferase domain"
                      /standard_name="PF08241"
@@ -376,7 +376,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00107"
                      /db_xref="InterPro:IPR013149"
-                     /note="e-value: 1.0960342036668699e-15"
+                     /note="e-value: 1.1299405916297285e-15"
                      /note="p-value: 4.085106983476965e-19"
                      /function="Zinc-binding dehydrogenase"
                      /standard_name="PF00107"
@@ -396,7 +396,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF08659"
                      /db_xref="InterPro:IPR013968"
-                     /note="e-value: 1.5141662612831146e-61"
+                     /note="e-value: 1.5610077818520667e-61"
                      /note="p-value: 5.643556695054471e-65"
                      /function="KR domain"
                      /standard_name="PF08659"
@@ -404,7 +404,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00106"
                      /db_xref="InterPro:IPR002347"
-                     /note="e-value: 1.1379002942545491e-07"
+                     /note="e-value: 1.1731018314976082e-07"
                      /note="p-value: 4.2411490654288077e-11"
                      /function="short chain dehydrogenase"
                      /standard_name="PF00106"
@@ -412,7 +412,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00550"
                      /db_xref="InterPro:IPR009081"
-                     /note="e-value: 3.359618716013185e-10"
+                     /note="e-value: 3.463550267794435e-10"
                      /note="p-value: 1.2521873708584363e-13"
                      /function="Phosphopantetheine attachment site"
                      /standard_name="PF00550"
@@ -426,8 +426,8 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF16073"
                      /db_xref="InterPro:IPR032088"
-                     /note="e-value: 1.3071857188363548e-23"
-                     /note="p-value: 4.872104803713585e-27"
+                     /note="e-value: 9.422238725791962e-24"
+                     /note="p-value: 3.406449286258844e-27"
                      /function="Starter unit:ACP transacylase in aflatoxin
                      biosynthesis"
                      /standard_name="PF16073"
@@ -459,8 +459,8 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF16073"
                      /db_xref="InterPro:IPR032088"
-                     /note="e-value: 8.208876065249628e-11"
-                     /note="p-value: 3.059588544632735e-14"
+                     /note="e-value: 4.380197593141013e-11"
+                     /note="p-value: 1.5835855362042708e-14"
                      /function="Starter unit:ACP transacylase in aflatoxin
                      biosynthesis"
                      /standard_name="PF16073"
@@ -468,7 +468,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00109"
                      /db_xref="InterPro:IPR014030"
-                     /note="e-value: 2.667462237983852e-82"
+                     /note="e-value: 2.7499815692371726e-82"
                      /note="p-value: 9.942088102809735e-86"
                      /function="Beta-ketoacyl synthase, N-terminal domain"
                      /standard_name="PF00109"
@@ -476,7 +476,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF02801"
                      /db_xref="InterPro:IPR014031"
-                     /note="e-value: 2.4031043351141288e-34"
+                     /note="e-value: 2.4774456171918303e-34"
                      /note="p-value: 8.956780973217029e-38"
                      /function="Beta-ketoacyl synthase, C-terminal domain"
                      /standard_name="PF02801"
@@ -484,15 +484,15 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF16197"
                      /db_xref="InterPro:IPR032821"
-                     /note="e-value: 2.535893425129411e-07"
-                     /note="p-value: 9.451708628883381e-11"
+                     /note="e-value: 8.475099126640419e-07"
+                     /note="p-value: 3.0640271607521397e-10"
                      /function="Ketoacyl-synthetase C-terminal extension"
                      /standard_name="PF16197"
      misc_feature    28322..29233
                      /inference="protein motif"
                      /db_xref="PFAM:PF00698"
                      /db_xref="InterPro:IPR014043"
-                     /note="e-value: 4.597134671955754e-38"
+                     /note="e-value: 4.739349423268586e-38"
                      /note="p-value: 1.7134307387088164e-41"
                      /function="Acyl transferase domain"
                      /standard_name="PF00698"
@@ -509,7 +509,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF14765"
                      /db_xref="InterPro:IPR020807"
-                     /note="e-value: 7.778696660229127e-11"
+                     /note="e-value: 8.019334685871699e-11"
                      /note="p-value: 2.8992533209948296e-14"
                      /function="Polyketide synthase dehydratase"
                      /standard_name="PF14765"
@@ -533,7 +533,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00550"
                      /db_xref="InterPro:IPR009081"
-                     /note="e-value: 5.884377030377924e-14"
+                     /note="e-value: 6.066413293337807e-14"
                      /note="p-value: 2.193207987468477e-17"
                      /function="Phosphopantetheine attachment site"
                      /standard_name="PF00550"
@@ -541,7 +541,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00550"
                      /db_xref="InterPro:IPR009081"
-                     /note="e-value: 3.9212317886052276e-10"
+                     /note="e-value: 4.042537132792419e-10"
                      /note="p-value: 1.461510170930014e-13"
                      /function="Phosphopantetheine attachment site"
                      /standard_name="PF00550"
@@ -549,7 +549,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00550"
                      /db_xref="InterPro:IPR009081"
-                     /note="e-value: 1.367829688372301e-08"
+                     /note="e-value: 1.4101442109719659e-08"
                      /note="p-value: 5.098135252971677e-12"
                      /function="Phosphopantetheine attachment site"
                      /standard_name="PF00550"
@@ -557,7 +557,7 @@
                      /inference="protein motif"
                      /db_xref="PFAM:PF00975"
                      /db_xref="InterPro:IPR001031"
-                     /note="e-value: 6.711355516947163e-24"
+                     /note="e-value: 6.91897478936856e-24"
                      /note="p-value: 2.5014370171252933e-27"
                      /function="Thioesterase domain"
                      /standard_name="PF00975"
--- a/test-data/clusters.tsv	Thu Mar 31 18:00:15 2022 +0000
+++ b/test-data/clusters.tsv	Tue Apr 05 23:18:49 2022 +0000
@@ -1,2 +1,2 @@
 sequence_id	bgc_id	start	end	average_p	max_p	type	alkaloid_probability	polyketide_probability	ripp_probability	saccharide_probability	terpene_probability	nrp_probability	proteins	domains
-BGC0001866.1	BGC0001866.1_cluster_1	347	32979	0.9969495815733557	0.9999999447224028	Polyketide	0.0	0.98	0.0	0.0	0.0	0.09999999999999998	BGC0001866.1_1;BGC0001866.1_2;BGC0001866.1_3;BGC0001866.1_4;BGC0001866.1_5;BGC0001866.1_6;BGC0001866.1_7;BGC0001866.1_8;BGC0001866.1_9;BGC0001866.1_10;BGC0001866.1_11;BGC0001866.1_12;BGC0001866.1_13;BGC0001866.1_14;BGC0001866.1_15;BGC0001866.1_16;BGC0001866.1_17;BGC0001866.1_18;BGC0001866.1_19;BGC0001866.1_20;BGC0001866.1_21;BGC0001866.1_22;BGC0001866.1_23	PF00106;PF00107;PF00109;PF00135;PF00394;PF00550;PF00698;PF00743;PF00891;PF00975;PF02801;PF06609;PF07690;PF07731;PF08241;PF08242;PF08493;PF08659;PF13434;PF13489;PF13649;PF13847;PF14765;PF16073;PF16197
+BGC0001866.1	BGC0001866.1_cluster_1	347	32979	0.9958958770931704	0.9999999976946022	Polyketide	0.010000000000000009	0.96	0.0	0.0	0.010000000000000009	0.14	BGC0001866.1_1;BGC0001866.1_2;BGC0001866.1_3;BGC0001866.1_4;BGC0001866.1_5;BGC0001866.1_6;BGC0001866.1_7;BGC0001866.1_8;BGC0001866.1_9;BGC0001866.1_10;BGC0001866.1_11;BGC0001866.1_12;BGC0001866.1_13;BGC0001866.1_14;BGC0001866.1_15;BGC0001866.1_16;BGC0001866.1_17;BGC0001866.1_18;BGC0001866.1_19;BGC0001866.1_20;BGC0001866.1_21;BGC0001866.1_22;BGC0001866.1_23	PF00106;PF00107;PF00109;PF00135;PF00394;PF00550;PF00698;PF00743;PF00891;PF00975;PF02801;PF06609;PF07690;PF07731;PF08241;PF08242;PF08493;PF08659;PF13434;PF13489;PF13649;PF13847;PF14765;PF16073;PF16197
--- a/test-data/features.tsv	Thu Mar 31 18:00:15 2022 +0000
+++ b/test-data/features.tsv	Tue Apr 05 23:18:49 2022 +0000
@@ -1,38 +1,38 @@
 sequence_id	protein_id	start	end	strand	domain	hmm	i_evalue	pvalue	domain_start	domain_end	bgc_probability
-BGC0001866.1	BGC0001866.1_1	347	1489	-	PF00394	Pfam	2.1941888078432915e-08	8.178117062405111e-12	1	63	0.9852038761627908
-BGC0001866.1	BGC0001866.1_1	347	1489	-	PF07731	Pfam	3.9374169295176556e-23	1.467542649838858e-26	150	281	0.9852038761627908
-BGC0001866.1	BGC0001866.1_6	3946	4389	+	PF00891	Pfam	4.743887678074703e-16	1.7681280946979883e-19	17	121	0.9910535094227727
-BGC0001866.1	BGC0001866.1_7	4683	5138	+	PF00135	Pfam	4.674605664377319e-21	1.7423055029360116e-24	48	140	0.9913598896683397
-BGC0001866.1	BGC0001866.1_8	5384	5812	+	PF00135	Pfam	3.9706994470948554e-30	1.4799476135277136e-33	2	114	0.9925093258822111
-BGC0001866.1	BGC0001866.1_9	5823	6599	+	PF00135	Pfam	1.4185801852307574e-15	5.287291037013632e-19	2	209	0.9946019708257335
-BGC0001866.1	BGC0001866.1_10	7758	9029	+	PF13434	Pfam	5.777178703900199e-08	2.153253337271785e-11	13	124	0.9978201609931655
-BGC0001866.1	BGC0001866.1_10	7758	9029	+	PF00743	Pfam	5.089108077410868e-07	1.8967976434628658e-10	36	102	0.9978201609931655
-BGC0001866.1	BGC0001866.1_13	11550	12662	+	PF07690	Pfam	5.839871260376694e-37	2.1766199255969786e-40	1	362	0.9990971143689635
-BGC0001866.1	BGC0001866.1_13	11550	12662	+	PF06609	Pfam	9.543170598318239e-09	3.55690294383833e-12	17	244	0.9990971143689635
-BGC0001866.1	BGC0001866.1_15	14920	15912	+	PF08493	Pfam	2.6165794251055913e-17	9.752439154325723e-21	139	224	0.9999977987864139
-BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF00109	Pfam	9.025888536170949e-60	3.364103069761815e-63	2	248	0.9999994272691842
-BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF02801	Pfam	2.2171445990751238e-35	8.263677223537547e-39	257	368	0.9999994272691842
-BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF16197	Pfam	3.8698172759236842e-25	1.4423471024687604e-28	371	487	0.9999994272691842
-BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF00698	Pfam	1.0799913424517567e-26	4.025312495161225e-30	512	648	0.9999994272691842
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF00698	Pfam	2.639223271303753e-16	9.836836642950999e-20	2	151	0.9999940983719267
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF14765	Pfam	2.520598829779557e-60	9.394703055458656e-64	228	504	0.9999940983719267
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF13489	Pfam	1.0131254482174088e-12	3.776091868123029e-16	661	817	0.9999940983719267
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF13847	Pfam	8.939870258494623e-11	3.332042586095648e-14	666	776	0.9999940983719267
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF13649	Pfam	2.319131521369124e-13	8.643799930559537e-17	667	764	0.9999940983719267
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF08242	Pfam	3.6288099491186147e-22	1.3525195486837923e-25	668	766	0.9999940983719267
-BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF08241	Pfam	5.245291385894328e-12	1.9550098344742185e-15	668	767	0.9999940983719267
-BGC0001866.1	BGC0001866.1_18	22762	23235	+	PF00107	Pfam	1.0960342036668699e-15	4.085106983476965e-19	12	117	0.9999176675645223
-BGC0001866.1	BGC0001866.1_19	23268	24623	+	PF08659	Pfam	1.5141662612831146e-61	5.643556695054471e-65	65	239	0.9999724741067139
-BGC0001866.1	BGC0001866.1_19	23268	24623	+	PF00106	Pfam	1.1379002942545491e-07	4.2411490654288077e-11	68	221	0.9999724741067139
-BGC0001866.1	BGC0001866.1_19	23268	24623	+	PF00550	Pfam	3.359618716013185e-10	1.2521873708584363e-13	384	437	0.9999724741067139
-BGC0001866.1	BGC0001866.1_20	25769	26056	+	PF16073	Pfam	1.3071857188363548e-23	4.872104803713585e-27	8	94	0.999988513111687
-BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF16073	Pfam	8.208876065249628e-11	3.059588544632735e-14	2	47	0.9999999447224028
-BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF00109	Pfam	2.667462237983852e-82	9.942088102809735e-86	178	426	0.9999999447224028
-BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF02801	Pfam	2.4031043351141288e-34	8.956780973217029e-38	434	555	0.9999999447224028
-BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF16197	Pfam	2.535893425129411e-07	9.451708628883381e-11	567	673	0.9999999447224028
-BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF00698	Pfam	4.597134671955754e-38	1.7134307387088164e-41	709	1012	0.9999999447224028
-BGC0001866.1	BGC0001866.1_22	30150	30890	+	PF14765	Pfam	7.778696660229127e-11	2.8992533209948296e-14	39	244	0.9999460955852995
-BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00550	Pfam	5.884377030377924e-14	2.193207987468477e-17	67	128	0.9997314383315643
-BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00550	Pfam	3.9212317886052276e-10	1.461510170930014e-13	174	238	0.9997314383315643
-BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00550	Pfam	1.367829688372301e-08	5.098135252971677e-12	299	360	0.9997314383315643
-BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00975	Pfam	6.711355516947163e-24	2.5014370171252933e-27	443	550	0.9997314383315643
+BGC0001866.1	BGC0001866.1_1	347	1489	-	PF00394	Pfam	2.262067179461254e-08	8.178117062405111e-12	1	63	0.9791890143072265
+BGC0001866.1	BGC0001866.1_1	347	1489	-	PF07731	Pfam	4.059222969454281e-23	1.467542649838858e-26	150	281	0.9791890143072265
+BGC0001866.1	BGC0001866.1_6	3946	4389	+	PF00891	Pfam	4.890642309934635e-16	1.7681280946979883e-19	17	121	0.9955095513800687
+BGC0001866.1	BGC0001866.1_7	4683	5138	+	PF00135	Pfam	4.819217021121008e-21	1.7423055029360116e-24	48	140	0.995982045872177
+BGC0001866.1	BGC0001866.1_8	5384	5812	+	PF00135	Pfam	4.0935350990176556e-30	1.4799476135277136e-33	2	114	0.9966491071789748
+BGC0001866.1	BGC0001866.1_9	5823	6599	+	PF00135	Pfam	1.4624647008379705e-15	5.287291037013632e-19	2	209	0.9975265367646511
+BGC0001866.1	BGC0001866.1_10	7758	9029	+	PF13434	Pfam	5.955898730893757e-08	2.153253337271785e-11	13	124	0.9986351193337516
+BGC0001866.1	BGC0001866.1_10	7758	9029	+	PF00743	Pfam	5.246542281818287e-07	1.8967976434628658e-10	36	102	0.9986351193337516
+BGC0001866.1	BGC0001866.1_13	11550	12662	+	PF07690	Pfam	6.020530714201243e-37	2.1766199255969786e-40	1	362	0.9994485509803548
+BGC0001866.1	BGC0001866.1_13	11550	12662	+	PF06609	Pfam	9.83839354265682e-09	3.55690294383833e-12	17	244	0.9994485509803548
+BGC0001866.1	BGC0001866.1_15	14920	15912	+	PF08493	Pfam	2.686865976406516e-17	9.713904470016327e-21	139	224	0.9999999296901834
+BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF00109	Pfam	9.30510909096118e-60	3.364103069761815e-63	2	248	0.9999998571963613
+BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF02801	Pfam	2.2857331200304854e-35	8.263677223537547e-39	257	368	0.9999998571963613
+BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF16197	Pfam	4.800730099641783e-25	1.7356218726109122e-28	371	488	0.9999998571963613
+BGC0001866.1	BGC0001866.1_16	17173	19143	+	PF00698	Pfam	1.113401436161595e-26	4.025312495161225e-30	512	648	0.9999998571963613
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF00698	Pfam	2.7208690154402465e-16	9.836836642950999e-20	2	151	0.9999990994944158
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF14765	Pfam	2.598574865139864e-60	9.394703055458656e-64	228	504	0.9999990994944158
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF13489	Pfam	1.04446701072283e-12	3.776091868123029e-16	661	817	0.9999990994944158
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF13847	Pfam	8.752004453621267e-11	3.1641375465008194e-14	666	776	0.9999990994944158
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF13649	Pfam	2.4253465299984994e-13	8.76842563267715e-17	667	764	0.9999990994944158
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF08242	Pfam	3.7410690716593694e-22	1.3525195486837923e-25	668	766	0.9999990994944158
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	PF08241	Pfam	5.4075572021556884e-12	1.9550098344742185e-15	668	767	0.9999990994944158
+BGC0001866.1	BGC0001866.1_18	22762	23235	+	PF00107	Pfam	1.1299405916297285e-15	4.085106983476965e-19	12	117	0.9999802025553775
+BGC0001866.1	BGC0001866.1_19	23268	24623	+	PF08659	Pfam	1.5610077818520667e-61	5.643556695054471e-65	65	239	0.9999913868972266
+BGC0001866.1	BGC0001866.1_19	23268	24623	+	PF00106	Pfam	1.1731018314976082e-07	4.2411490654288077e-11	68	221	0.9999913868972266
+BGC0001866.1	BGC0001866.1_19	23268	24623	+	PF00550	Pfam	3.463550267794435e-10	1.2521873708584363e-13	384	437	0.9999913868972266
+BGC0001866.1	BGC0001866.1_20	25769	26056	+	PF16073	Pfam	9.422238725791962e-24	3.406449286258844e-27	8	94	0.9999994733759681
+BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF16073	Pfam	4.380197593141013e-11	1.5835855362042708e-14	2	47	0.9999999976946022
+BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF00109	Pfam	2.7499815692371726e-82	9.942088102809735e-86	178	426	0.9999999976946022
+BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF02801	Pfam	2.4774456171918303e-34	8.956780973217029e-38	434	555	0.9999999976946022
+BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF16197	Pfam	8.475099126640419e-07	3.0640271607521397e-10	567	673	0.9999999976946022
+BGC0001866.1	BGC0001866.1_21	26544	29999	+	PF00698	Pfam	4.739349423268586e-38	1.7134307387088164e-41	709	1012	0.9999999976946022
+BGC0001866.1	BGC0001866.1_22	30150	30890	+	PF14765	Pfam	8.019334685871699e-11	2.8992533209948296e-14	39	244	0.9999912059124727
+BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00550	Pfam	6.066413293337807e-14	2.193207987468477e-17	67	128	0.9998703656415205
+BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00550	Pfam	4.042537132792419e-10	1.461510170930014e-13	174	238	0.9998703656415205
+BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00550	Pfam	1.4101442109719659e-08	5.098135252971677e-12	299	360	0.9998703656415205
+BGC0001866.1	BGC0001866.1_23	30937	32979	+	PF00975	Pfam	6.91897478936856e-24	2.5014370171252933e-27	443	550	0.9998703656415205
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genes.tsv	Tue Apr 05 23:18:49 2022 +0000
@@ -0,0 +1,24 @@
+sequence_id	protein_id	start	end	strand	average_p	max_p
+BGC0001866.1	BGC0001866.1_1	347	1489	-	0.9791890143072265	0.9791890143072265
+BGC0001866.1	BGC0001866.1_2	1525	2016	+	0.9816626269970528	0.9816626269970528
+BGC0001866.1	BGC0001866.1_3	2513	2722	-	0.9844997726878899	0.9844997726878899
+BGC0001866.1	BGC0001866.1_4	2905	3378	+	0.9877300777686966	0.9877300777686966
+BGC0001866.1	BGC0001866.1_5	3353	3922	+	0.9913872741253911	0.9913872741253911
+BGC0001866.1	BGC0001866.1_6	3946	4389	+	0.9955095513800687	0.9955095513800687
+BGC0001866.1	BGC0001866.1_7	4683	5138	+	0.995982045872177	0.995982045872177
+BGC0001866.1	BGC0001866.1_8	5384	5812	+	0.9966491071789748	0.9966491071789748
+BGC0001866.1	BGC0001866.1_9	5823	6599	+	0.9975265367646511	0.9975265367646511
+BGC0001866.1	BGC0001866.1_10	7758	9029	+	0.9986351193337516	0.9986351193337516
+BGC0001866.1	BGC0001866.1_11	9800	10384	+	0.9988029392597757	0.9988029392597757
+BGC0001866.1	BGC0001866.1_12	11109	11537	+	0.999073142625125	0.999073142625125
+BGC0001866.1	BGC0001866.1_13	11550	12662	+	0.9994485509803548	0.9994485509803548
+BGC0001866.1	BGC0001866.1_14	12681	13127	+	0.9996778954036583	0.9996778954036583
+BGC0001866.1	BGC0001866.1_15	14920	15912	+	0.9999999296901834	0.9999999296901834
+BGC0001866.1	BGC0001866.1_16	17173	19143	+	0.9999998571963613	0.9999998571963613
+BGC0001866.1	BGC0001866.1_17	19152	22424	+	0.9999990994944158	0.9999990994944158
+BGC0001866.1	BGC0001866.1_18	22762	23235	+	0.9999802025553775	0.9999802025553775
+BGC0001866.1	BGC0001866.1_19	23268	24623	+	0.9999913868972266	0.9999913868972266
+BGC0001866.1	BGC0001866.1_20	25769	26056	+	0.9999994733759681	0.9999994733759681
+BGC0001866.1	BGC0001866.1_21	26544	29999	+	0.9999999976946022	0.9999999976946022
+BGC0001866.1	BGC0001866.1_22	30150	30890	+	0.9999912059124727	0.9999912059124727
+BGC0001866.1	BGC0001866.1_23	30937	32979	+	0.9998703656415205	0.9998703656415205
--- a/test-data/sideload.json	Thu Mar 31 18:00:15 2022 +0000
+++ b/test-data/sideload.json	Tue Apr 05 23:18:49 2022 +0000
@@ -5,14 +5,14 @@
             "subregions": [
                 {
                     "details": {
-                        "alkaloid_probability": "0.000",
-                        "average_p": "0.997",
+                        "alkaloid_probability": "0.010",
+                        "average_p": "0.996",
                         "max_p": "1.000",
-                        "nrp_probability": "0.100",
-                        "polyketide_probability": "0.980",
+                        "nrp_probability": "0.140",
+                        "polyketide_probability": "0.960",
                         "ripp_probability": "0.000",
                         "saccharide_probability": "0.000",
-                        "terpene_probability": "0.000"
+                        "terpene_probability": "0.010"
                     },
                     "end": 32979,
                     "label": "Polyketide",
@@ -25,11 +25,13 @@
         "configuration": {
             "cds": "3",
             "e-filter": "None",
+            "edge-distance": "0",
+            "mask": "False",
             "postproc": "'gecco'",
-            "threshold": "0.3"
+            "threshold": "0.8"
         },
         "description": "Biosynthetic Gene Cluster prediction with Conditional Random Fields.",
         "name": "GECCO",
-        "version": "0.8.10"
+        "version": "0.9.1"
     }
 }
\ No newline at end of file