Mercurial > repos > iuc > instrain_profile

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/instrain_profile.xml	Wed Aug 11 21:11:18 2021 +0000
@@ -0,0 +1,378 @@
+<tool id="instrain_profile" name="InStrain Profile" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <!-- One-line summary of the tool. -->
+    <description>Creates an inStrain profile (microdiversity analysis) from a mapping file </description>
+    <!-- The @TOOL_VERSION@, @VERSION_SUFFIX@ and @PROFILE@ tokens and the macros expanded below are defined in macros.xml. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <!-- The "requirements" macro provides the instrain package; zip is yielded into it because the command archives the profile directory with zip. -->
+    <expand macro="requirements">
+        <requirement type="package" version="3.0">zip</requirement>
+    </expand>
+    <version_command>inStrain profile --version</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+## Link the mapping file under a name that carries its datatype extension (bam or sam).
+#set ext=$mapping_input.datatype.file_ext
+ln -s '$mapping_input' 'inputbam.$ext'
+#if $gene_profiling.gene_file
+## inStrain treats the gene file as genbank only when its name ends in .gb or .gbk,
+## so the link name has to reflect the datatype of the selected dataset.
+#if $gene_profiling.gene_file.is_of_type('genbank')
+#set gene_link = 'gene_file.gb'
+#else
+#set gene_link = 'gene_file.fna'
+#end if
+&&
+ln -s '$gene_profiling.gene_file' '$gene_link'
+#end if
+#if $stb
+&&
+ln -s '$stb' 'stb_file.stb'
+#end if
+&&
+inStrain profile
+    'inputbam.$ext'
+    '$sequence_input'
+    --output 'inStrain.IS'
+    $use_full_fasta_header
+    --processes "\${GALAXY_SLOTS:-6}"
+    --min_mapq $read_filtering.min_mapq
+    --max_insert_relative $read_filtering.max_insert_relative
+    --min_insert $read_filtering.min_insert
+    --pairing_filter '$read_filtering.pairing_filter'
+## priority_reads is declared inside the read_filtering section, so it must be addressed through it.
+#if $read_filtering.priority_reads
+    --priority_reads '$read_filtering.priority_reads'
+#end if
+    $output.detailed_mapping_info
+    --min_cov $variant_calling.min_cov
+    --min_freq $variant_calling.min_freq
+    --fdr $variant_calling.fdr
+## gene_file is declared inside the gene_profiling section; gene_link was set above under the same condition.
+#if $gene_profiling.gene_file
+    --gene_file '$gene_link'
+#end if
+#if $stb
+    --stb 'stb_file.stb'
+#end if
+    $mm_level
+## In database mode these three options are not passed explicitly
+## (presumably because database mode sets them itself; confirm against the inStrain documentation).
+#if $profile.database_mode
+    $profile.database_mode
+#else
+    --min_read_ani $read_filtering.min_read_ani
+    --min_genome_coverage $profile.min_genome_coverage
+    $skip_mm_profiling
+#end if
+    --min_scaffold_reads $profile.min_scaffold_reads
+    --min_snp $profile.min_snp
+    $profile.store_everything
+#if $profile.scaffolds_to_profile
+    --scaffolds_to_profile '$profile.scaffolds_to_profile'
+#end if
+    --rarefied_coverage $profile.rarefied_coverage
+    --window_length $profile.window_length
+    $output.skip_genome_wide
+    $output.skip_plot_generation
+&&
+## Archive the complete profile directory; the zip is collected as the inStrain_zip output.
+cd ./inStrain.IS && zip -r ../inStrain.IS.zip *
+    ]]></command>
+    <inputs>
+        <!-- Positional inputs of inStrain profile: the mapping and the sequence it was mapped to. -->
+        <param name="mapping_input" type="data" format="bam,sam" label="A file containing metagenomic reads mapped to a DNA sequence" help="Sorted Bam file"/>
+        <param name="sequence_input" type="data" format="fasta" label="A file containing a DNA sequence."/>
+        <param argument="--use_full_fasta_header" type="boolean" truevalue="--use_full_fasta_header" falsevalue="" checked="false" label="Use full fasta header" help="Instead of using the fasta ID (the part of the header before the first space), use the full header. Needed for some mapping tools (including bbMap)"/>
+        <!-- Options controlling which reads and read pairs are used. -->
+        <section name="read_filtering" title="Read Filtering" expanded="true">
+            <param argument="--min_read_ani" type="float" value="0.95" min="0" max="1" label="Minimum percent identity" help="Minimum percent identity of read pairs to consensus to use the reads. Must be >, not >="/>
+            <param argument="--min_mapq" type="integer" value="-1" label="Minimum mapq score" help="Minimum mapq score of EITHER read in a pair to use that pair. Must be >, not >="/>
+            <param argument="--max_insert_relative" type="integer" value="3" label="Maximum insert relative" help="Multiplier to determine maximum insert size between two reads - default is to use 3x median insert size. Must be >, not >="/>
+            <param argument="--min_insert" type="integer" value="50" label="Minimum insert" help="Minimum insert size between two reads - default is 50 bp. If two reads are 50bp each and overlap completely, their insert will be 50. Must be >, not >="/>
+            <param argument="--pairing_filter" type="select" label="How should paired reads be handled?">
+                <option value="paired_only" selected="true">Only paired reads are retained</option>
+                <option value="non_discordant">Keep all paired reads and singleton reads that map to a single scaffold</option>
+                <option value="all_reads">Keep all reads regardless of pairing status (NOT RECOMMENDED; See documentation for details)</option>
+            </param>
+            <param argument="--priority_reads" type="data" format="fastqsanger,fastqsanger.gz" optional="true" label="The location of a list of reads that should be retained regardless of pairing status" help="For example long reads or merged reads. This can be a .fastq file or text file with list of read names (will assume file is compressed if ends in .gz)"/>
+        </section>
+        <!-- Thresholds applied when calling variants. -->
+        <section name="variant_calling" title="Variant Calling" expanded="true">
+            <param argument="--min_cov" type="integer" value="5" label="Minimum coverage" help="Minimum coverage to call a variant"/>
+            <param argument="--min_freq" type="float" value="0.05" label="Minimum SNP frequency" help="Minimum SNP frequency to confirm a SNV (both this AND the FDR snp count cutoff must be true to call a SNP)."/>
+            <param argument="--fdr" type="float" value="1e-06" min="0" max="1" label="FDR" help="SNP false discovery rate- based on simulation data with a 0.1 percent error rate (Q30)"/>
+        </section>
+        <!-- Optional gene annotation; the command links it with a suffix matching its datatype. -->
+        <section name="gene_profiling" title="Gene Profiling" expanded="true">
+            <param argument="--gene_file" type="data" format="fasta,genbank" optional="true" label="Path to prodigal .fna genes file. If file ends in .gb or .gbk, will treat as a genbank file" help="EXPERIMENTAL; the name of the gene must be in the gene qualifier"/>
+        </section>
+        <!-- Top-level options (not inside a section), referenced in the command without a section prefix. -->
+        <param argument="--stb" type="data" format="tabular" optional="true" label="Scaffold to bin" help="This can be a file with each line listing a scaffold and a bin name, tab-separated. This can also be a space-separated list of .fasta files, with one genome per .fasta file. If nothing is provided, all scaffolds will be treated as belonging to the same genome"/>
+        <param argument="--mm_level" type="boolean" truevalue="--mm_level" falsevalue="" checked="false" label="Create output files on the mm level"/>
+        <param argument="--skip_mm_profiling" type="boolean" truevalue="--skip_mm_profiling" falsevalue="" checked="false" label="Skip mm profiling" help="Don't perform analysis on an mm level; saves RAM and time; impacts plots and raw_data"/>
+        <!-- Profiling options. -->
+        <section name="profile" title="Profile" expanded="true">
+            <param argument="--database_mode" type="boolean" truevalue="--database_mode" falsevalue="" checked="false" label="Database mode" help="Set a number of parameters to values appropriate for mapping to a large fasta file."/>
+            <param argument="--min_scaffold_reads" type="integer" value="1" label="Minimum scaffold reads" help="Minimum number of reads mapping to a scaffold to proceed with profiling it"/>
+            <param argument="--min_genome_coverage" type="integer" value="0" label="Minimum genome coverage" help="Minimum number of reads mapping to a genome to proceed with profiling it. MUST profile .stb if this is set"/>
+            <param argument="--min_snp" type="integer" value="20" label="Minimum SNP" help="Absolute minimum number of reads connecting two SNPs to calculate LD between them."/>
+            <param argument="--store_everything" type="boolean" truevalue="--store_everything" falsevalue="" checked="false" label="Store everything" help="Store intermediate dictionaries in the pickle file; will result in significantly more RAM and disk usage"/>
+            <param argument="--scaffolds_to_profile" type="data" format="fasta" optional="true" label="Scaffolds to profile" help="File containing a list of scaffolds to profile- if provided will ONLY profile those scaffolds"/>
+            <param argument="--rarefied_coverage" type="integer" value="50" label="Rarefied coverage" help="When calculating nucleotide diversity, also calculate a rarefied version with this much coverage"/>
+            <param argument="--window_length" type="integer" value="10000" label="Window length" help="Break scaffolds into windows of this length when profiling"/>
+        </section>
+        <!-- Switches that decide which outputs are produced; two of them also drive the output filters. -->
+        <section name="output" title="Set Output Parameters" expanded="true">
+            <param argument="--detailed_mapping_info" type="boolean" truevalue="--detailed_mapping_info" falsevalue="" checked="false" label="Detailed mapping info" help="Make a detailed read report indicating details about each individual mapped read"/>
+            <param argument="--skip_genome_wide" type="boolean" truevalue="--skip_genome_wide" falsevalue="" checked="false" label="Skip genome wide" help="Do not generate tables that consider groups of scaffolds belonging to genomes"/>
+            <param argument="--skip_plot_generation" type="boolean" truevalue="--skip_plot_generation" falsevalue="" checked="false" label="Skip plot generation" help="Do not make plots"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Archive of the complete inStrain.IS directory, written by the zip step of the command. -->
+        <data name="inStrain_zip" format="zip" from_work_dir="inStrain.IS.zip" label="inStrain Profile IS zip" />
+        <!-- Individual tables picked up from the output folder of the profile. -->
+        <data name="scaffold_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_scaffold_info.tsv" label="Scaffold Info, This gives basic information about the scaffolds in your sample at the highest allowed level of read identity." />
+        <data name="mapping_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_mapping_info.tsv" label="Mapping Info, This provides an overview of the number of reads that map to each scaffold, and some basic metrics about their quality." />
+        <data name="SNVs" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_SNVs.tsv" label="SNV, This describes the SNVs and SNSs that are detected in this mapping." />
+        <data name="linkage" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_linkage.tsv" label="Linkage, This describes the linkage between pairs of SNPs in the mapping that are found on the same read pair at least min_snp times." />
+        <data name="gene_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_gene_info.tsv" label="Gene Info, This describes some basic information about the genes being profiled" />
+        <!-- Only offered when genome-wide tables are not skipped. -->
+        <data name="genome_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_genome_info.tsv" label="Genome Info, This Describes many of the above metrics on a genome-by-genome level, rather than a scaffold-by-scaffold level." >
+            <filter>(output['skip_genome_wide'] is False)</filter>
+        </data>
+        <!-- Every file in the figures folder becomes one pdf element; only offered when plots are not skipped. -->
+        <collection name="figures_pdfs" type="list" label="Figures" >
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)" directory="inStrain.IS/figures/" format="pdf"/>
+            <filter>(output['skip_plot_generation'] is False)</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Test 1: genome-wide tables and plots are both skipped, so only the zip and the five always-present tables are expected. -->
+        <test expect_num_outputs="6">
+            <param name="mapping_input" value="SmallScaffold.fa.sorted.bam"/>
+            <param name="sequence_input" value="SmallScaffold.fa"/>
+            <param name="use_full_fasta_header" value="false"/>
+            <param name="mm_level" value="false"/>
+            <param name="skip_mm_profiling" value="false"/>
+            <section name="read_filtering">
+                <param name="min_read_ani" value="0.95"/>
+                <param name="min_mapq" value="-1"/>
+                <param name="max_insert_relative" value="3"/>
+                <param name="min_insert" value="50"/>
+                <param name="pairing_filter" value="paired_only"/>
+            </section>
+            <section name="variant_calling">
+                <param name="min_cov" value="5"/>
+                <param name="min_freq" value="0.05"/>
+                <param name="fdr" value="1e-06"/>
+            </section>
+            <section name="profile">
+                <param name="database_mode" value="false"/>
+                <param name="min_scaffold_reads" value="1"/>
+                <param name="min_genome_coverage" value="0"/>
+                <param name="min_snp" value="20"/>
+                <param name="store_everything" value="false"/>
+                <param name="rarefied_coverage" value="50"/>
+                <param name="window_length" value="10000"/>
+            </section>
+            <section name="output">
+                <param name="detailed_mapping_info" value="false"/>
+                <param name="skip_genome_wide" value="true"/>
+                <param name="skip_plot_generation" value="true"/>
+            </section>
+            <output name="inStrain_zip">
+                <assert_contents>
+                    <has_size value="21606" delta="1000" />
+                </assert_contents>
+            </output>
+            <output name="scaffold_info">
+                <assert_contents>
+                    <has_text text="length"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="21"/>
+                </assert_contents>
+            </output>
+            <output name="mapping_info">
+                <assert_contents>
+                    <has_text text="scaffold"/>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="SNVs">
+                <assert_contents>
+                    <has_text text="position"/>
+                    <has_n_lines n="5"/>
+                    <has_n_columns n="16"/>
+                </assert_contents>
+            </output>
+            <output name="linkage">
+                <assert_contents>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="gene_info">
+                <assert_contents>
+                    <has_n_lines n="0"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 2: same parameters with genome-wide tables and plots enabled, adding genome_info and the figures collection. -->
+        <test expect_num_outputs="8">
+            <param name="mapping_input" value="SmallScaffold.fa.sorted.bam"/>
+            <param name="sequence_input" value="SmallScaffold.fa"/>
+            <param name="use_full_fasta_header" value="false"/>
+            <param name="mm_level" value="false"/>
+            <param name="skip_mm_profiling" value="false"/>
+            <section name="read_filtering">
+                <param name="min_read_ani" value="0.95"/>
+                <param name="min_mapq" value="-1"/>
+                <param name="max_insert_relative" value="3"/>
+                <param name="min_insert" value="50"/>
+                <param name="pairing_filter" value="paired_only"/>
+            </section>
+            <section name="variant_calling">
+                <param name="min_cov" value="5"/>
+                <param name="min_freq" value="0.05"/>
+                <param name="fdr" value="1e-06"/>
+            </section>
+            <section name="profile">
+                <param name="database_mode" value="false"/>
+                <param name="min_scaffold_reads" value="1"/>
+                <param name="min_genome_coverage" value="0"/>
+                <param name="min_snp" value="20"/>
+                <param name="store_everything" value="false"/>
+                <param name="rarefied_coverage" value="50"/>
+                <param name="window_length" value="10000"/>
+            </section>
+            <section name="output">
+                <param name="detailed_mapping_info" value="false"/>
+                <param name="skip_genome_wide" value="false"/>
+                <param name="skip_plot_generation" value="false"/>
+            </section>
+            <output name="inStrain_zip">
+                <assert_contents>
+                    <has_size value="1468006" delta="100000" />
+                </assert_contents>
+            </output>
+            <output name="scaffold_info">
+                <assert_contents>
+                    <has_text text="length"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="21"/>
+                </assert_contents>
+            </output>
+            <output name="mapping_info">
+                <assert_contents>
+                    <has_text text="scaffold"/>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="SNVs">
+                <assert_contents>
+                    <has_text text="position"/>
+                    <has_n_lines n="5"/>
+                    <has_n_columns n="16"/>
+                </assert_contents>
+            </output>
+            <output name="linkage">
+                <assert_contents>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="gene_info">
+                <assert_contents>
+                    <has_n_lines n="0"/>
+                </assert_contents>
+            </output>
+            <output name="genome_info">
+                <assert_contents>
+                    <has_text text="nucl_diversity"/>
+                    <has_n_lines n="2"/>
+                    <has_n_columns n="26"/>
+                </assert_contents>
+            </output>
+            <output_collection name="figures_pdfs" type="list">
+                <element name="inStrain.IS_CoverageAndBreadth_vs_readMismatch.pdf" ftype="pdf">
+                    <assert_contents>
+                        <has_size value="383078" delta="10000" />
+                    </assert_contents>
+                </element>
+                <element name="inStrain.IS_MajorAllele_frequency_plot.pdf" ftype="pdf">
+                    <assert_contents>
+                        <has_size value="383590" delta="10000" />
+                    </assert_contents>
+                </element>
+                <element name="inStrain.IS_ReadFiltering_plot.pdf" ftype="pdf">
+                    <assert_contents>
+                        <has_size value="383078" delta="10000" />
+                    </assert_contents>
+                </element>
+                <element name="inStrain.IS_ScaffoldInspection_plot.pdf" ftype="pdf">
+                    <assert_contents>
+                        <has_size value="208" delta="10" />
+                    </assert_contents>
+                </element>
+                <element name="inStrain.IS_genomeWide_microdiveristy_metrics.pdf" ftype="pdf">
+                    <assert_contents>
+                        <has_size value="208" delta="10" />
+                    </assert_contents>
+                </element>
+                <element name="inStrain.IS_readANI_distribution.pdf" ftype="pdf">
+                    <assert_contents>
+                        <has_size value="382771" delta="10000" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+@HELP_HEADER@
+
+Profile
+=======
+
+Profile is the heart of the inStrain tool.
+
+The functionality of inStrain profile is broken into several steps:
+
+First, all reads in the .bam file are filtered to only keep those that map with sufficient quality. All non-paired reads will be filtered out by default, and an additional set of filters are applied to each read pair (not the individual reads):
+
+  - Pairs must be mapped in the proper orientation with an expected insert size. The minimum insert distance can be set with the tool's corresponding parameter. The maximum insert distance is a multiple of the median insert distance. So if pairs have a median insert size of 500bp, by default all pairs with insert sizes over 1500bp will be excluded. For the max insert cutoff, the median_insert for all scaffolds is used.
+  - Pairs must have a minimum mapQ score. MapQ scores are confusing and how they’re calculated varies based on the mapping algorithm being used, but are meant to represent both the number of mismatches in the mapping and how unique that mapping is. With bowtie2, if the read maps equally well to two positions on the genome (multi-mapped read), its mapQ score will be set to 2. The read in the pair with the higher mapQ is used for the pair.
+  - Pairs must be above some minimum nucleotide identity (ANI) value. For example if reads in a pair are 100bp each, and each read has a single mismatch, the ANI of that pair would be 0.99
+
+Next, using only read pairs that pass filters, a number of microdiversity metrics are calculated on a scaffold-by-scaffold basis. This includes:
+
+  - Calculate the coverage at each position along the scaffold
+  - Calculate the nucleotide diversity at each position along the scaffold in which the coverage is greater than the min_cov argument.
+  - Identify SNSs and SNVs. The criteria for being reported as a divergent site are 1) More than min_cov number of bases at that position, 2) More than min_freq percentage of reads that are a variant base, 3) The number of reads with the variant base is more than the null model for that coverage.
+  - Calculate linkage between divergent sites on the same read pair. For each pair harboring a divergent site, calculate the linkage of that site with other divergent sites within that same pair. This is only done for pairs of divergent sites that are both on at least MIN_SNP reads
+  - Calculate scaffold-level properties. These include things like the overall coverage, breadth of coverage, average nucleotide identity (ANI) between the reads and the reference genome, and the expected breadth of coverage based on that true coverage.
+
+Finally, this information is stored as an IS_profile object. This includes the locations of divergent sites, the number of read pairs that passed filters (and other information) for each scaffold, the linkage between SNV pairs, etc.
+
+Inputs
+======
+
+A fasta file and a bam/sam file containing reads mapped to it.
+
+Output
+======
+
+An IS_profile (zip file), containing:
+
+1. scaffold_info.tsv
+
+   This gives basic information about the scaffolds in your sample at the highest allowed level of read identity.
+
+2. mapping_info.tsv
+
+   This provides an overview of the number of reads that map to each scaffold, and some basic metrics about their quality.
+
+3. SNVs.tsv
+
+   This describes the SNVs and SNSs that are detected in this mapping. While we should refer to these mutations as divergent sites, sometimes SNV is used to refer to both SNVs and SNSs.
+
+4. linkage.tsv
+
+   This describes the linkage between pairs of SNPs in the mapping that are found on the same read pair at least min_snp times.
+
+5. gene_info.tsv
+
+   This describes some basic information about the genes being profiled.
+
+6. genome_info.tsv
+
+   Describes many of the above metrics on a genome-by-genome level, rather than a scaffold-by-scaffold level. (To output it, set --skip_genome_wide to false)
+
+7. Figures/Plots (When --skip_plot_generation is set to false):
+
+  - Coverage and breadth vs. read mismatches
+  - Genome-wide microdiversity metrics
+  - Read-level ANI distribution
+  - Major allele frequencies
+  - Linkage decay
+  - Read filtering plots
+  - Scaffold inspection plot (large)
+  - Linkage with SNP type (GENES REQUIRED)
+  - Gene histograms (GENES REQUIRED)
+
+    ]]></help>
+    <!-- Reuse the shared citation block from macros.xml instead of repeating the DOI here. -->
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Aug 11 21:11:18 2021 +0000
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<macros>
+    <!-- Version tokens substituted into the version and profile attributes of the tool element. -->
+    <token name="@TOOL_VERSION@">1.5.3</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">20.01</token>
+    <!-- EDAM topic and operation annotations shared by tools importing this file. -->
+    <xml name="edam_ontology">
+        <edam_topics>
+            <edam_topic>topic_0796</edam_topic>
+            <edam_topic>topic_3174</edam_topic>
+        </edam_topics>
+        <edam_operations>
+            <edam_operation>operation_0484</edam_operation>
+            <edam_operation>operation_3209</edam_operation>
+            <edam_operation>operation_3730</edam_operation>
+        </edam_operations>
+    </xml>
+    <!-- Base requirement on the instrain package at @TOOL_VERSION@; the yield lets a tool append further requirements. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">instrain</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@HELP_HEADER@">
+What it does
+============
+
+inStrain is a Python program for analysis of co-occurring genome populations from metagenomes that allows highly accurate genome comparisons, analysis of coverage, microdiversity, and linkage, and sensitive SNP detection with gene localization and synonymous non-synonymous identification.
+
+Read more about the tool: https://instrain.readthedocs.io/en/latest/
+    </token>
+    <!-- Citation block holding the DOI to cite; expandable from tool files as macro "citations". -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/2020.01.22.915579</citation>
+        </citations>
+    </xml>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/N5_271_010G1.maxbin2.stb	Wed Aug 11 21:11:18 2021 +0000
@@ -0,0 +1,167 @@
+N5_271_010G1_scaffold_0 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_1 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_2 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_3 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_4 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_5 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_6 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_7 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_8 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_9 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_10 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_11 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_12 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_13 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_14 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_15 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_16 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_17 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_18 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_19 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_20 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_21 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_22 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_23 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_24 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_25 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_26 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_27 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_28 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_29 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_30 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_31 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_32 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_33 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_34 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_35 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_36 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_37 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_39 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_40 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_41 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_42 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_43 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_44 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_45 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_46 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_47 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_48 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_49 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_50 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_51 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_52 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_53 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_54 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_55 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_56 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_57 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_58 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_59 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_60 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_61 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_63 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_64 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_65 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_66 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_67 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_68 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_69 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_70 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_71 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_73 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_74 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_75 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_76 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_77 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_78 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_79 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_80 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_81 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_82 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_83 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_84 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_85 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_86 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_87 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_88 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_89 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_90 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_91 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_92 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_94 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_95 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_96 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_97 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_98 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_99 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_100 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_101 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_102 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_103 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_104 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_105 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_106 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_107 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_108 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_109 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_111 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_112 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_113 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_114 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_116 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_117 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_118 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_119 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_120 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_121 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_122 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_123 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_125 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_126 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_127 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_128 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_129 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_130 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_131 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_132 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_133 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_134 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_135 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_136 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_137 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_138 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_139 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_141 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_142 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_143 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_144 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_145 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_147 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_148 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_149 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_150 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_151 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_152 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_153 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_154 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_155 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_156 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_157 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_158 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_159 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_160 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_161 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_162 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_163 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_185 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_197 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_341 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_350 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_362 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_376 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_419 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_443 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_484 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_618 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_836 	 maxbin2.maxbin.001.fasta
+N5_271_010G1_scaffold_963	fobin.fasta
Binary file test-data/N5_271_010G1_scaffold_min1000.fa-vs-N5_271_010G1.IS.zip has changed
Binary file test-data/N5_271_010G1_scaffold_min1000.fa-vs-N5_271_010G2.IS.zip has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SmallScaffold.fa	Wed Aug 11 21:11:18 2021 +0000
@@ -0,0 +1,6 @@
+>WeirdBoi
+AAAAAAAAAAAAAAAAAAAAAAA
+>N5_271_010G1_scaffold_963 read_length_150 read_count_3782
+TCTCCATTACATTCCATTCCATTCGGGTTGTTCCATTCCATTCCATTCCA
+TTCCACTCCATTCCATTGCACTCGGGTTGATTCCATTCCATTCCATTCCG
+GATGATTCCATTCCATTGCATTCCGT
Binary file test-data/SmallScaffold.fa.sorted.bam has changed