Mercurial > repos > ieguinoa > ena_webin_cli

--- a/ena_consensus_submit.xml	Fri Feb 04 15:52:45 2022 +0000
+++ b/ena_consensus_submit.xml	Tue Feb 22 11:03:34 2022 +0000
@@ -51,8 +51,18 @@
 echo -e 'MOLECULETYPE\t$molecule_type' >> $manifest_base;

 #if $metadata_file_or_form.metadata_format == "file":
+    #import re
+    ## stage every selected FASTA in ./fasta under its element identifier:
+    ## plain fasta datasets are compressed (and get '.gz' appended),
+    ## any other datatype is linked unchanged
+    #for $file in $metadata_file_or_form.genome_fasta:
+        #if $file.is_of_type('fasta'):
+            #set $full_name = $file.element_identifier + '.gz'
+            ## dataset paths are quoted so the shell never word-splits them
+            gzip -c '$file' > './fasta/$full_name';
+        #else:
+            ln -s '$file' './fasta/$file.element_identifier';
+        #end if
+
+    #end for
     ## process the input tables, this creates an intermediate file with information
-    python3 '$__tool_directory__/process_input.py' $metadata_file_or_form.ena_receipt $genome_fasta './manifests' './fasta' $manifest_base;
+    python3 '$__tool_directory__/process_input.py' $metadata_file_or_form.ena_receipt $genome_fasta_files './manifests' $manifest_base >> $webin_cli_log;
     center_name=`grep 'center_name' $metadata_file_or_form.ena_receipt | cut -f2,2 | tr -d '\n'`;
 #else:
     #set $generated_manifest='./manifests/generated_manifest.txt'
@@ -64,6 +74,12 @@
     center_name='$metadata_file_or_form.center_name';
     echo -e 'NAME\t$metadata_file_or_form.assembly_name' >> $generated_manifest;
     echo -e 'PLATFORM\t$metadata_file_or_form.sequencing_platform' >> $generated_manifest;
+    ## form mode: a single FASTA, always exposed to Webin-CLI as consensus.fasta.gz
+    #if $metadata_file_or_form.genome_fasta.is_of_type('fasta'):
+        ## dataset path is quoted so the shell never word-splits it
+        gzip -c '$metadata_file_or_form.genome_fasta' > consensus.fasta.gz;
+    #else:
+        ln -s '$metadata_file_or_form.genome_fasta' consensus.fasta.gz;
+    #end if
+    echo -e 'FASTA\tconsensus.fasta.gz' >> $generated_manifest;
 #end if

 #set $outputs_dir = 'outputs'
@@ -73,6 +89,7 @@
     ## in case of errors, this list is empty
     while read line; do
         manifest=`echo \$line | cut -d' ' -f1,1`;
+        echo "Submitting manifest \$manifest" >> $webin_cli_log;
         ena-webin-cli
         -context genome
         -userName "'\$webin_id'"
@@ -84,12 +101,10 @@
             -validate
         #end if
         -outputDir $outputs_dir
-        >> $webin_cli_log ;
-    done < submit_list.tab
+        >> $webin_cli_log;
+    done < submit_list.tab;

 #else:
-    gzip -c $genome_fasta > consensus.fasta.gz;
-    echo -e 'FASTA\tconsensus.fasta.gz' >> $generated_manifest;
     ena-webin-cli
     #if $submit_test == "true":
         -test
@@ -109,7 +124,7 @@
     >> $webin_cli_log ;
     ##cp ./genome/$metadata_file_or_form.assembly_name/validate/webin-cli.report $validate_output
 #end if
-tar -cf $webin_cli_outputs $outputs_dir;
+tar -cf $webin_cli_outputs $outputs_dir ;
 ]]></command>
     <configfiles>
         <configfile name="credentials"><![CDATA[
@@ -121,15 +136,26 @@
 #end if

         ]]></configfile>
+    <configfile name="genome_fasta_files">
+#import json
+#import re
+## Emits a JSON list with the element identifier of every selected FASTA
+## (file mode only; the config file stays empty in form mode).
+#if $metadata_file_or_form.metadata_format == "file":
+    #set $fasta_files_list = list()
+    #for $file in $metadata_file_or_form.genome_fasta:
+        ## #silent runs the call without writing its return value to the file
+        #silent $fasta_files_list.append(str($file.element_identifier))
+    #end for
+    #echo json.dumps($fasta_files_list)
+#end if
+        </configfile>
     </configfiles>
 <inputs>
     <param name="test_submit" type="hidden" value="False" />
     <param name="submit_test" type="boolean" truevalue="true" falsevalue="false" label="Submit to test server" help="use Webin test service instead of the production service. Please note that the Webin upload area is shared between test and production services, and that test submission files will not be archived." />
     <param name="dry_run" type="boolean" truevalue="true" falsevalue="false" label="Validate files and metadata but do not submit" help="Generate input files and run Webin-CLI with -validate option. If 'No' is selected then it will validate and submit (-submit flag)"/>
-    <param name="genome_fasta" type="data" label="Select the consensus sequence assembly file" format="fasta"/>
     <param name="assembly_type" type="select" label="Assembly type">
         <option value="clone">Clone</option>
         <option value="isolate">Isolate</option>
+        <option value="COVID-19 outbreak">COVID-19 outbreak</option>
     </param>
     <param name="assembly_program" type="text" optional="False" label="Assembly program"/>
     <param name="molecule_type" type="select" label="Molecule type">
@@ -145,14 +171,16 @@
         </param>
         <when value="file">
             <param type="data" format="txt" name="ena_receipt" label="Submission receipt obtained from ENA upload tool"/>
+            <param name="genome_fasta" type="data" label="Select the consensus sequence assembly files or a collection of them" format="fasta,fasta.gz" multiple="true"/>
         </when>
         <when value="form">
             <param name="assembly_name" type="text" optional="False" label="Assembly name"/>
-            <param name="study_accession" type="text" optional="False" label="Study accession or unique name (alias)"/>
-            <param name="sample_accession" type="text" optional="False" label="Sample accession or unique name (alias)"/>
+            <param name="study_accession" type="text" optional="False" label="Study accession"/>
+            <param name="sample_accession" type="text" optional="False" label="Sample accession"/>
             <param name="sequencing_platform" type="text" optional="False" label="Sequencing platform"/>
             <param name="description" type="text" optional="True" value="" label="Description" help="Free text description of the genome assembly (optional)"/>
             <param name="center_name" type="text" optional="False" label="Center name"/>
+            <param name="genome_fasta" type="data" label="Select the consensus sequence assembly file" format="fasta,fasta.gz"/>
         </when>
     </conditional>
     <param name="min_gap_length" type="text" optional="True" label="Minimum gap length (optional)"/>
@@ -170,7 +198,6 @@
             <param name="submit_test" value="true" />
             <param name="dry_run" value="true" />
             <param name="test_submit" value="True" />
-            <param name="genome_fasta" value="phiX2.fasta"/>
             <param name="assembly_type" value="isolate"/>
             <param name="assembly_program" value="Test assembly program"/>
             <param name="molecule_type" value="viral cRNA"/>
@@ -183,6 +210,7 @@
                 <param name="sequencing_platform" value="Nanopore 0011"/>
                 <param name="description" value="Test Description"/>
                 <param name="center_name" value="Test center name"/>
+                <param name="genome_fasta" value="phiX2.fasta"/>
             </conditional>
             <param name="min_gap_length" value="30"/>
             <output name="webin_cli_log">
@@ -192,8 +220,52 @@
                 </assert_contents>
             </output>
         </test>
+        <test>
+            <param name="submit_test" value="true" />
+            <param name="dry_run" value="true" />
+            <param name="test_submit" value="True" />
+            <param name="assembly_type" value="isolate"/>
+            <param name="assembly_program" value="Test assembly program"/>
+            <param name="molecule_type" value="viral cRNA"/>
+            <param name="coverage" value="10000"/>
+            <conditional name="metadata_file_or_form">
+                <param name="metadata_format" value="file"/>
+                <param name="ena_receipt" value="receipt_sample_noPhiX.txt"/>
+                <param name="genome_fasta" value="phiX2.fasta.gz,sample_alias_001.fasta.gz"/>
+            </conditional>
+            <param name="min_gap_length" value="30"/>
+            <output name="webin_cli_log">
+                <assert_contents>
+                    <has_text_matching expression="Processing phiX2"/>
+                    <has_text_matching expression="No metadata found for sample phiX2"/>
+                    <has_text_matching expression="Processing sample_alias_001"/>
+                    <has_text_matching expression="Submitting manifest ./manifests/sample_alias_001.manifest.txt"/>
+                    <has_text_matching expression="ERROR: Invalid submission account user name or password. Please try enclosing your password in single quotes."/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="submit_test" value="true" />
+            <param name="dry_run" value="true" />
+            <param name="test_submit" value="True" />
+            <param name="assembly_type" value="isolate"/>
+            <param name="assembly_program" value="Test assembly program"/>
+            <param name="molecule_type" value="viral cRNA"/>
+            <param name="coverage" value="10000"/>
+            <conditional name="metadata_file_or_form">
+                <param name="metadata_format" value="file"/>
+                <param name="ena_receipt" value="receipt_sample.txt"/>
+                <param name="genome_fasta" value="sample_alias_001.fasta.gz"/>
+            </conditional>
+            <param name="min_gap_length" value="30"/>
+            <output name="webin_cli_log">
+                <assert_contents>
+                    <has_text_matching expression="ERROR: Invalid submission account user name or password. Please try enclosing your password in single quotes."/>
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
-        TODO: Fill in help.
+        This tool is a wrapper for the ENA Webin CLI submission tool (https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html).
     ]]></help>
 </tool>
--- a/process_input.py	Fri Feb 04 15:52:45 2022 +0000
+++ b/process_input.py	Tue Feb 22 11:03:34 2022 +0000
@@ -1,127 +1,127 @@
 import gzip
+import json
 import os
 import sys
 import shutil
 import yaml

-from Bio import SeqIO
-
-
-"""
-Takes as input:
-    1. A receipt obtained from ENA submission tool.
-    A txt file that includes a YAML section with
-
-    2. A fasta file with fasta entries ids defined after the files used for the raw submission.
-
-    3. Path to write generated manifests
-    4. Path to write generated fasta files
-    5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
-"""
-
-def get_section_string(f, start_line, end_line):
+def get_section_string(f, start_line, end_line, return_string=False):
+    """Return the lines found between two delimiter lines of an open file.
+
+    Reads ``f`` forward from its current position, discards everything up to
+    and including the first line equal to ``start_line``, then collects lines
+    until one equal to ``end_line`` is read. The delimiters are not part of
+    the result. Lines are compared exactly, so a delimiter must include its
+    trailing newline.
+
+    Reaching the end of the file also ends either phase, so a missing
+    delimiter gives a shorter (possibly empty) result instead of looping.
+
+    :param f: open file object, read line by line with ``readline``
+    :param start_line: line that opens the section
+    :param end_line: line that closes the section
+    :param return_string: when True the lines are joined into one string
+    :return: one string if ``return_string`` is True, else a list of lines
+    """
-    # consume starting lines
-    start_string = iter(f.readline, start_line)
-    start_string = ''.join(line for line in start_string)
-    # read YAML lines
-    yaml_string = iter(f.readline, end_line)
-    return ''.join(x for x in yaml_string)
+    # readline() keeps returning '' once EOF is reached, so '' is used as the
+    # iter() sentinel and the delimiters are tested explicitly; with a
+    # delimiter as sentinel the loop never ends when that line is absent
+    for line in iter(f.readline, ''):
+        if line == start_line:
+            break
+    section_lines = []
+    for line in iter(f.readline, ''):
+        if line == end_line:
+            break
+        section_lines.append(line)
+    if return_string:
+        return ''.join(section_lines)
+    return section_lines
+
+def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
+    """Record the platform of every experiment under its study and sample.
+
+    ``studies_samples_dict`` is updated in place so that
+    ``studies_samples_dict[study_alias][sample_alias]['experiments']`` is a
+    list with one ``{'platform': ...}`` entry per experiment found in
+    ``yaml_only_dict['ENA_experiment']``.
+    """
+    for experiment in yaml_only_dict['ENA_experiment'].values():
+        study_key = experiment['study_alias']
+        sample_key = experiment['sample_alias']
+        platform_entry = {'platform': experiment['platform']}
+        # create the study / sample levels on first sight, then append
+        samples_of_study = studies_samples_dict.setdefault(study_key, {})
+        sample_record = samples_of_study.setdefault(sample_key, {'experiments': []})
+        sample_record['experiments'].append(platform_entry)
+
+
+def load_receipt_data(input_file_path):
+    """Collect experiment, study and sample details from an ENA upload receipt.
+
+    The receipt is opened three times, once per section that is read: the
+    YAML block (experiments), 'Study accession details:' and
+    'Sample accession details:'.
+
+    :param input_file_path: path to the receipt text file
+    :return: dict shaped as
+        {study_alias: {'accession': ...,
+                       sample_alias: {'accession': ...,
+                                      'experiments': [{'platform': ...}]}}}
+        where an 'accession' entry exists only if the receipt lists it
+    """
+    # should do some health check of the input file?
+    # load yaml section
+    loaded_data = {}
+    yaml_delimiter = 'YAML -------------\n'
+    with open(input_file_path) as input_file:
+        yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True))
+    fill_from_yaml_data(yaml_only_section, loaded_data)
+    # read study accessions
+    study_delimiter = 'Study accession details:\n'
+    end_line = '\n'
+    with open(input_file_path) as input_file:
+        studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line)
+    for study_line in studies_accession_lines:
+        if study_line != '\n':
+            # drop the newline first so a two-column row does not leave it
+            # glued to the accession
+            alias, accession, *_ = study_line.rstrip('\n').split('\t')
+            try:
+                loaded_data[alias]['accession'] = accession
+            except KeyError:
+                # the receipt lists the study but no experiment refers to it
+                print(f"Study {alias} is not referenced by any experiment")
+    samples_delimiter = 'Sample accession details:\n'
+    with open(input_file_path) as input_file:
+        samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line)
+    # a sample row does not name its study, so every study is searched
+    for sample_line in samples_accession_lines:
+        if sample_line != '\n':
+            alias, accession, *_ = sample_line.rstrip('\n').split('\t')
+            for study in loaded_data.keys():
+                if alias in loaded_data[study].keys():
+                    loaded_data[study][alias]['accession'] = accession
+                    break
+    return loaded_data
+
+
+"""
+Takes as input:
+    1. A receipt obtained from ENA submission tool:
+        a txt file that contains sections describing submission details.
+    2. A json file with the list of fasta that the user loaded
+    3. Path to write generated manifests
+    4. Manifest template path: the manifest with the global values set
+        (e.g. COVERAGE, MINGAPLENGTH, ...)
+"""
+


 def main():
+    """Write one Webin-CLI manifest per listed FASTA that the receipt describes.
+
+    Command line arguments:
+        1. path to the ENA upload receipt
+        2. path to a JSON file holding the list of FASTA file names
+        3. directory where the manifests are written
+        4. path to the manifest template holding the global fields
+
+    The path of every manifest written is listed, one per line, in
+    'submit_list.tab' in the working directory. FASTA names for which the
+    receipt has no usable metadata are reported on stdout and skipped.
+    """
-    input_file = open(sys.argv[1])
-    fasta_in = open(sys.argv[2])
+    input_file_path = sys.argv[1]
+    fasta_names_list_path = sys.argv[2]
     out_manifest_base = sys.argv[3]
-    out_fasta_base = sys.argv[4]
-    manifest_template = sys.argv[5]
-    yaml_delimiter = 'YAML -------------\n'
-    yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
-    # print(yaml_only)
-    submission_tuples_list = []
-    # parse the sequence IDs
-    for record in SeqIO.parse(fasta_in, "fasta"):
-        seq_id = record.id
-        # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name?
-        #  ... but for paired it may not be the exact same
-        # ...initially I should attempt to find a
-        #  .. if this
-        # in any case, if I cant make the right match then I should do a kind of substring match
-
-        # find the exp_alias associated with the file
-        exp_alias = None
-        for index,run in yaml_only['ENA_run'].items():
-            if run['file_name'] == seq_id:
-                ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
-                exp_alias = run['experiment_alias']
-                break
-        if not exp_alias:
-            raise Exception("No run files match for the sequence entry {seq_id}")
-        # find the sample and study for that experiment
-        sample_alias = None
-        study_alias = None
-        for index,exp in yaml_only['ENA_experiment'].items():
-            if exp['alias'] == exp_alias:
-                sample_alias = exp['sample_alias']
-                study_alias = exp['study_alias']
-                platform = exp['platform']
-                break
-        if not sample_alias:
-            raise Exception("No sample associated with experiment {exp_alias}")
-        if not study_alias:
-            raise Exception("No study associated with experiment {exp_alias}")
-
+    manifest_template = sys.argv[4]
+    # load submitted data from receipt file
+    data_dict = load_receipt_data(input_file_path)
+    # iterate over the list of fasta files
+    with open(fasta_names_list_path, 'r') as fasta_files_json_file:
+        fasta_files_list = json.load(fasta_files_json_file)
+    with open('submit_list.tab', 'w') as written_manifests_out:
+        for fasta_file in fasta_files_list:
+            # strip a known extension only when it is really there; cutting a
+            # fixed number of characters mangles names like 'sample' or 'sample.fa'
+            sample_alias = fasta_file
+            for suffix in ('.fasta.gz', '.fasta'):
+                if fasta_file.endswith(suffix):
+                    sample_alias = fasta_file[:-len(suffix)]
+                    break
+            print(f'Processing {sample_alias}')
+            found_metadata = False
+            for study_data in data_dict.values():
+                sample_data = study_data.get(sample_alias)
+                # a study dict keeps its own 'accession' string next to the
+                # sample entries, so only dict values are samples
+                if not isinstance(sample_data, dict):
+                    continue
+                if 'accession' not in study_data or 'accession' not in sample_data:
+                    # the experiment is in the receipt but an accession row is not
+                    print(f'Missing study or sample accession for sample {sample_alias}')
+                    continue
+                sample_accession = sample_data['accession']
+                study_accession = study_data['accession']
+                ### TODO get a string that concatenates platform information from multiple exp
+                platform = sample_data['experiments'][0]['platform']
+                manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
+                with open(manifest_path, "w") as output_handle:
+                    # first dump the contents of manifest template
+                    # containing the global vars
+                    with open(manifest_template) as m_template:
+                        output_handle.write(m_template.read())
+                    output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
+                    output_handle.write("PLATFORM\t" + platform + "\n")
+                    output_handle.write("STUDY\t" + study_accession + "\n")
+                    output_handle.write("SAMPLE\t" + sample_accession + "\n")
+                    # files should be available in the corresponding dir and named:
+                    #  sample_alias.fasta.gz
+                    output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
+                found_metadata = True
+                written_manifests_out.write(manifest_path + '\n')
+                break
+            if not found_metadata:
+                print(f'No metadata found for sample {sample_alias}')

-        # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
-        fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
-        with open(fasta_path, "w") as output_handle:
-            SeqIO.write([record], output_handle, "fasta")
-        #gzip the file (required by ENA upload tool)
-        fasta_path_gz = fasta_path + '.gz'
-        with open(fasta_path, 'rb') as f_in:
-            with gzip.open(fasta_path_gz, 'wb') as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        # create the manifest
-        # add to the manifest the:
-        #
-        manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
-        with open(manifest_path, "w") as output_handle:
-            # first dump the contents of manifest template
-            # containing the global vars
-            with open(manifest_template) as m_template:
-                output_handle.write(m_template.read())
-            output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
-            output_handle.write("PLATFORM\t" + platform + "\n")
-            output_handle.write("STUDY\t" + study_alias + "\n")
-            output_handle.write("SAMPLE\t" + sample_alias + "\n")
-            output_handle.write("FASTA\t" + fasta_path_gz + "\n")
-
-        # ... and a dict  (or tuple list???) that contains for each study - sample  the name of the file that has the consensus sequence
-        # ****  is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
-        # in the latest case then I also need to parse the  Study accession details: and Sample accession details: entries
-        # samples_dir[study][sample] = seq_id + '.fasta'
-        submission_tuples_list.append((manifest_path, fasta_path))
-
-    with open('submit_list.tab', "w") as output_handle:
-        for submit_tuple in submission_tuples_list:
-            output_handle.write('\t'.join(submit_tuple) + '\n')
-    ## DEBUG CASE
-    #study details
-    # start_study = 'Study accession details:\n'
-    # empty_end = '\n'
-    # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
-    # if len(study_data.split('\n')) > 2:
-        # # more than 1 study accession
-        # raise Exception("Multiple study accessions found")
-    # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
-    # start_sample = 'Sample accession details:\n'
-    # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
-    # if len(sample_data.split('\n')) > 2:
-        # # more than 1 study accession
-        # raise Exception("Multiple sample accessions found")
-    # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
-    # platform = 'Ion Torrent'
-    # out_manifest.write(f"PLATFORM\t{platform}\n")
-    # out_manifest.close()

 if __name__ == '__main__':
     main()
Binary file test-data/.phiX.fasta.swp has changed
Binary file test-data/.receipt_sample.txt.swp has changed
Binary file test-data/phiX2.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/receipt_sample.txt	Tue Feb 22 11:03:34 2022 +0000
@@ -0,0 +1,76 @@
+YAML -------------
+ENA_experiment:
+  0:
+    alias: exp_test_alias_001
+    design_description: Lot's of coffe and magic
+    insert_size: 250.0
+    instrument_model: NextSeq 500
+    library_construction_protocol: Illumina COVIDSeq Test Kit
+    library_layout: PAIRED
+    library_name: Cov51
+    library_selection: RT-PCR
+    library_source: VIRAL RNA
+    library_strategy: AMPLICON
+    platform: ILLUMINA
+    sample_alias: sample_alias_001
+    study_alias: study_alias_001
+    title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test
+  1:
+    alias: exp_test_alias_002
+    design_description: Lot's of coffe and magic
+    insert_size: 250.0
+    instrument_model: NextSeq 500
+    library_construction_protocol: Illumina COVIDSeq Test Kit
+    library_layout: PAIRED
+    library_name: Cov51
+    library_selection: RT-PCR
+    library_source: VIRAL RNA
+    library_strategy: AMPLICON
+    platform: ILLUMINA
+    sample_alias: phiX2
+    study_alias: study_alias_001
+    title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test
+ENA_run:
+  2:
+    alias: run_alias_001
+    experiment_alias: exp_test_alias_001
+    file_format: FASTQ
+    file_name: run001.fastq.gz
+ENA_sample:
+  2:
+    alias: sample_alias_001
+    collecting institution: Umbrella Corp.
+    collection date: '2021-05-03'
+    collector name: "John Doe"
+    definition for seropositive sample: ''
+  2:
+    alias: phiX2
+    collecting institution: Umbrella Corp.
+    collection date: '2021-05-03'
+    collector name: "John Doe"
+    definition for seropositive sample: ''
+ENA_study:
+  2:
+    alias: study_alias_001
+    study_abstract: "Help"
+    study_type: Whole Genome Sequencing
+    title: Whole genome sequencing of SARS-CoV-2
+YAML -------------
+
+Printing receipt to ./receipt.xml
+
+Submission was done successfully
+
+Study accession details:
+study_alias_001	FAKE0001	2011-01-16T10:52:06.497+01:00	added
+
+Sample accession details:
+sample_alias_001	FAKESAMP001	2011-01-16T10:52:06.497+01:00	added
+phiX2	FAKESAMP002	2011-01-16T10:52:06.497+01:00	added
+
+Saving updates in new tsv tables::
+save updates in ./submission_files/studies_updated.tsv
+save updates in ./submission_files/samples_updated.tsv
+save updates in ./submission_files/experiments_updated.tsv
+save updates in ./submission_files/runs_updated.tsv
+action_option	add
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/receipt_sample_noPhiX.txt	Tue Feb 22 11:03:34 2022 +0000
@@ -0,0 +1,54 @@
+YAML -------------
+ENA_experiment:
+  0:
+    alias: exp_test_alias_001
+    design_description: Lot's of coffe and magic
+    insert_size: 250.0
+    instrument_model: NextSeq 500
+    library_construction_protocol: Illumina COVIDSeq Test Kit
+    library_layout: PAIRED
+    library_name: Cov51
+    library_selection: RT-PCR
+    library_source: VIRAL RNA
+    library_strategy: AMPLICON
+    platform: ILLUMINA
+    sample_alias: sample_alias_001
+    study_alias: study_alias_001
+    title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test
+ENA_run:
+  2:
+    alias: run_alias_001
+    experiment_alias: exp_test_alias_001
+    file_format: FASTQ
+    file_name: run001.fastq.gz
+ENA_sample:
+  2:
+    alias: sample_alias_001
+    collecting institution: Umbrella Corp.
+    collection date: '2021-05-03'
+    collector name: "John Doe"
+    definition for seropositive sample: ''
+ENA_study:
+  2:
+    alias: study_alias_001
+    study_abstract: "Help"
+    study_type: Whole Genome Sequencing
+    title: Whole genome sequencing of SARS-CoV-2
+YAML -------------
+
+Printing receipt to ./receipt.xml
+
+Submission was done successfully
+
+Study accession details:
+study_alias_001	FAKE0001	2011-01-16T10:52:06.497+01:00	added
+
+Sample accession details:
+sample_alias_001	FAKESAMP001	2011-01-16T10:52:06.497+01:00	added
+
+Saving updates in new tsv tables::
+save updates in ./submission_files/studies_updated.tsv
+save updates in ./submission_files/samples_updated.tsv
+save updates in ./submission_files/experiments_updated.tsv
+save updates in ./submission_files/runs_updated.tsv
+action_option	add
Binary file test-data/sample_alias_001.fasta.gz has changed