# HG changeset patch
# User ieguinoa
# Date 1645527814 0
# Node ID 7d751b5943b0353dc705231050d306e37b4a24dc
# Parent 1ecd8ce07db49cb4923870b8cd61694e9c87c809
Uploaded
diff -r 1ecd8ce07db4 -r 7d751b5943b0 ena_consensus_submit.xml
--- a/ena_consensus_submit.xml Fri Feb 04 15:52:45 2022 +0000
+++ b/ena_consensus_submit.xml Tue Feb 22 11:03:34 2022 +0000
@@ -51,8 +51,18 @@
echo -e 'MOLECULETYPE\t$molecule_type' >> $manifest_base;
#if $metadata_file_or_form.metadata_format == "file":
+ #import re
+ #for $file in $metadata_file_or_form.genome_fasta:
+ #if $file.is_of_type('fasta'):
+ #set $full_name = $file.element_identifier + '.gz'
+ gzip -c $file > './fasta/$full_name';
+ #else:
+ ln -s $file './fasta/$file.element_identifier';
+ #end if
+
+ #end for
## process the input tables, this creates an intermediate file with information
- python3 '$__tool_directory__/process_input.py' $metadata_file_or_form.ena_receipt $genome_fasta './manifests' './fasta' $manifest_base;
+ python3 '$__tool_directory__/process_input.py' $metadata_file_or_form.ena_receipt $genome_fasta_files './manifests' $manifest_base >> $webin_cli_log;
center_name=`grep 'center_name' $metadata_file_or_form.ena_receipt | cut -f2,2 | tr -d '\n'`;
#else:
#set $generated_manifest='./manifests/generated_manifest.txt'
@@ -64,6 +74,12 @@
center_name='$metadata_file_or_form.center_name';
echo -e 'NAME\t$metadata_file_or_form.assembly_name' >> $generated_manifest;
echo -e 'PLATFORM\t$metadata_file_or_form.sequencing_platform' >> $generated_manifest;
+ #if $metadata_file_or_form.genome_fasta.is_of_type('fasta'):
+ gzip -c $metadata_file_or_form.genome_fasta > consensus.fasta.gz;
+ #else:
+ ln -s $metadata_file_or_form.genome_fasta consensus.fasta.gz;
+ #end if
+ echo -e 'FASTA\tconsensus.fasta.gz' >> $generated_manifest;
#end if
#set $outputs_dir = 'outputs'
@@ -73,6 +89,7 @@
## in case of errors, this list is empty
while read line; do
manifest=`echo \$line | cut -d' ' -f1,1`;
+ echo "Submitting manifest \$manifest" >> $webin_cli_log;
ena-webin-cli
-context genome
-userName "'\$webin_id'"
@@ -84,12 +101,10 @@
-validate
#end if
-outputDir $outputs_dir
- >> $webin_cli_log ;
- done < submit_list.tab
+ >> $webin_cli_log;
+ done < submit_list.tab;
#else:
- gzip -c $genome_fasta > consensus.fasta.gz;
- echo -e 'FASTA\tconsensus.fasta.gz' >> $generated_manifest;
ena-webin-cli
#if $submit_test == "true":
-test
@@ -109,7 +124,7 @@
>> $webin_cli_log ;
##cp ./genome/$metadata_file_or_form.assembly_name/validate/webin-cli.report $validate_output
#end if
-tar -cf $webin_cli_outputs $outputs_dir;
+tar -cf $webin_cli_outputs $outputs_dir ;
]]>
+
+#import json
+#import re
+#if $metadata_file_or_form.metadata_format == "file":
+ #set $fasta_files_list = list()
+ #for $file in $metadata_file_or_form.genome_fasta:
+ $fasta_files_list.append(str($file.element_identifier))
+ #end for
+ #echo json.dumps($fasta_files_list)
+#end if
+
-
+
@@ -145,14 +171,16 @@
+
-
-
+
+
+
@@ -170,7 +198,6 @@
-
@@ -183,6 +210,7 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 1ecd8ce07db4 -r 7d751b5943b0 process_input.py
--- a/process_input.py Fri Feb 04 15:52:45 2022 +0000
+++ b/process_input.py Tue Feb 22 11:03:34 2022 +0000
@@ -1,127 +1,127 @@
import gzip
+import json
import os
import sys
import shutil
import yaml
-from Bio import SeqIO
-
-
-"""
-Takes as input:
- 1. A receipt obtained from ENA submission tool.
- A txt file that includes a YAML section with
-
- 2. A fasta file with fasta entries ids defined after the files used for the raw submission.
-
- 3. Path to write generated manifests
- 4. Path to write generated fasta files
- 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
-"""
-
-def get_section_string(f, start_line, end_line):
+def get_section_string(f, start_line, end_line, return_string=False):
# consume starting lines
start_string = iter(f.readline, start_line)
start_string = ''.join(line for line in start_string)
# read YAML lines
yaml_string = iter(f.readline, end_line)
- return ''.join(x for x in yaml_string)
+ if return_string:
+ return ''.join(x for x in yaml_string)
+ else:
+ return [x for x in yaml_string]
+
+def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
+    """Add each experiment's platform to studies_samples_dict, in place.
+
+    For every entry of yaml_only_dict['ENA_experiment'], appends
+    {'platform': <platform>} to
+    studies_samples_dict[study_alias][sample_alias]['experiments'],
+    creating the study and sample levels when they do not exist yet.
+    """
+    for exp in yaml_only_dict['ENA_experiment'].values():
+        study, sample, entry = exp['study_alias'], exp['sample_alias'], {'platform': exp['platform']}
+        per_sample = studies_samples_dict.setdefault(study, {}).setdefault(sample, {'experiments': []})
+        per_sample['experiments'].append(entry)
+
+
+def load_receipt_data(input_file_path):
+    """Parse an ENA submission receipt into a nested dict keyed by alias.
+
+    Returns {study_alias: {'accession': ..., sample_alias: {'accession': ...,
+    'experiments': [{'platform': ...}, ...]}}}. Study and sample aliases come
+    from the YAML section; accessions come from the tab-separated
+    'Study accession details:' and 'Sample accession details:' sections.
+    """
+    # TODO: sanity-check the receipt (all three sections present) up front
+    loaded_data = {}
+    yaml_delimiter = 'YAML -------------\n'
+    with open(input_file_path) as input_file:
+        yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True))
+    fill_from_yaml_data(yaml_only_section, loaded_data)
+    end_line = '\n'  # each accession section ends at the first blank line
+    with open(input_file_path) as input_file:
+        studies_accession_lines = get_section_string(input_file, start_line='Study accession details:\n', end_line=end_line)
+    for study_line in studies_accession_lines:
+        if study_line != '\n':
+            alias, accession, *_ = study_line.split('\t')
+            try:
+                loaded_data[alias]['accession'] = accession
+            except KeyError:
+                print(f"Study {alias} from the accession section has no experiment in the YAML section")
+    with open(input_file_path) as input_file:
+        samples_accession_lines = get_section_string(input_file, start_line='Sample accession details:\n', end_line=end_line)
+    # the receipt does not say which study a sample belongs to: scan them all
+    for sample_line in samples_accession_lines:
+        if sample_line != '\n':
+            alias, accession, *_ = sample_line.split('\t')
+            for study in loaded_data:
+                if alias in loaded_data[study]:
+                    loaded_data[study][alias]['accession'] = accession
+                    break
+    return loaded_data
+
+
+"""
+Takes as input:
+ 1. A receipt obtained from ENA submission tool:
+ a txt file that contains sections describing submission details.
+ 2. A JSON file listing the names of the FASTA files that the user loaded
+ 3. Path to write generated manifests
+ 4. Manifest template path: the manifest with the global values set
+ (e.g. COVERAGE, MINGAPLENGTH, ...)
+"""
+
def main():
- input_file = open(sys.argv[1])
- fasta_in = open(sys.argv[2])
+ input_file_path = sys.argv[1]
+ fasta_names_list_path = sys.argv[2]
out_manifest_base = sys.argv[3]
- out_fasta_base = sys.argv[4]
- manifest_template = sys.argv[5]
- yaml_delimiter = 'YAML -------------\n'
- yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
- # print(yaml_only)
- submission_tuples_list = []
- # parse the sequence IDs
- for record in SeqIO.parse(fasta_in, "fasta"):
- seq_id = record.id
- # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name?
- # ... but for paired it may not be the exact same
- # ...initially I should attempt to find a
- # .. if this
- # in any case, if I cant make the right match then I should do a kind of substring match
-
- # find the exp_alias associated with the file
- exp_alias = None
- for index,run in yaml_only['ENA_run'].items():
- if run['file_name'] == seq_id:
- ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
- exp_alias = run['experiment_alias']
- break
- if not exp_alias:
- raise Exception("No run files match for the sequence entry {seq_id}")
- # find the sample and study for that experiment
- sample_alias = None
- study_alias = None
- for index,exp in yaml_only['ENA_experiment'].items():
- if exp['alias'] == exp_alias:
- sample_alias = exp['sample_alias']
- study_alias = exp['study_alias']
- platform = exp['platform']
- break
- if not sample_alias:
- raise Exception("No sample associated with experiment {exp_alias}")
- if not study_alias:
- raise Exception("No study associated with experiment {exp_alias}")
-
+ manifest_template = sys.argv[4]
+ # load submitted data from receipt file
+ data_dict = load_receipt_data(input_file_path)
+ # iterate over the list of fasta files
+ with open(fasta_names_list_path, 'r') as fasta_files_json_file:
+ fasta_files_list = json.load(fasta_files_json_file)
+ with open('submit_list.tab', 'w') as written_manifests_out:
+ for fasta_file in fasta_files_list:
+ if fasta_file.endswith('.fasta.gz'):
+ sample_alias = fasta_file[:-9]
+ else:
+ sample_alias = fasta_file[:-6]
+ print(f'Processing {sample_alias}')
+ found_metadata = False
+ for study_alias in data_dict.keys():
+ if sample_alias in data_dict[study_alias].keys():
+ sample_accession = data_dict[study_alias][sample_alias]['accession']
+ study_accession = data_dict[study_alias]['accession']
+ ### TODO: build a string that concatenates the platform information from multiple experiments
+ platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform']
+ manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
+ with open(manifest_path, "w") as output_handle:
+ # first dump the contents of manifest template
+ # containing the global vars
+ with open(manifest_template) as m_template:
+ output_handle.write(m_template.read())
+ output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
+ output_handle.write("PLATFORM\t" + platform + "\n")
+ output_handle.write("STUDY\t" + study_accession + "\n")
+ output_handle.write("SAMPLE\t" + sample_accession + "\n")
+ # files should be available in the corresponding dir and named:
+ # sample_alias.fasta.gz
+ output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
+ found_metadata = True
+ written_manifests_out.write(manifest_path + '\n')
+ break
+ if not found_metadata:
+ print(f'No metadata found for sample {sample_alias}')
- # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
- fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
- with open(fasta_path, "w") as output_handle:
- SeqIO.write([record], output_handle, "fasta")
- #gzip the file (required by ENA upload tool)
- fasta_path_gz = fasta_path + '.gz'
- with open(fasta_path, 'rb') as f_in:
- with gzip.open(fasta_path_gz, 'wb') as f_out:
- shutil.copyfileobj(f_in, f_out)
- # create the manifest
- # add to the manifest the:
- #
- manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
- with open(manifest_path, "w") as output_handle:
- # first dump the contents of manifest template
- # containing the global vars
- with open(manifest_template) as m_template:
- output_handle.write(m_template.read())
- output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
- output_handle.write("PLATFORM\t" + platform + "\n")
- output_handle.write("STUDY\t" + study_alias + "\n")
- output_handle.write("SAMPLE\t" + sample_alias + "\n")
- output_handle.write("FASTA\t" + fasta_path_gz + "\n")
-
- # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence
- # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
- # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries
- # samples_dir[study][sample] = seq_id + '.fasta'
- submission_tuples_list.append((manifest_path, fasta_path))
-
- with open('submit_list.tab', "w") as output_handle:
- for submit_tuple in submission_tuples_list:
- output_handle.write('\t'.join(submit_tuple) + '\n')
- ## DEBUG CASE
- #study details
- # start_study = 'Study accession details:\n'
- # empty_end = '\n'
- # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
- # if len(study_data.split('\n')) > 2:
- # # more than 1 study accession
- # raise Exception("Multiple study accessions found")
- # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
- # start_sample = 'Sample accession details:\n'
- # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
- # if len(sample_data.split('\n')) > 2:
- # # more than 1 study accession
- # raise Exception("Multiple sample accessions found")
- # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
- # platform = 'Ion Torrent'
- # out_manifest.write(f"PLATFORM\t{platform}\n")
- # out_manifest.close()
if __name__ == '__main__':
main()
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/.phiX.fasta.swp
Binary file test-data/.phiX.fasta.swp has changed
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/.receipt_sample.txt.swp
Binary file test-data/.receipt_sample.txt.swp has changed
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/phiX2.fasta.gz
Binary file test-data/phiX2.fasta.gz has changed
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/receipt_sample.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/receipt_sample.txt Tue Feb 22 11:03:34 2022 +0000
@@ -0,0 +1,76 @@
+YAML -------------
+ENA_experiment:
+ 0:
+ alias: exp_test_alias_001
+ design_description: Lot's of coffe and magic
+ insert_size: 250.0
+ instrument_model: NextSeq 500
+ library_construction_protocol: Illumina COVIDSeq Test Kit
+ library_layout: PAIRED
+ library_name: Cov51
+ library_selection: RT-PCR
+ library_source: VIRAL RNA
+ library_strategy: AMPLICON
+ platform: ILLUMINA
+ sample_alias: sample_alias_001
+ study_alias: study_alias_001
+ title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test
+ 1:
+ alias: exp_test_alias_002
+ design_description: Lot's of coffe and magic
+ insert_size: 250.0
+ instrument_model: NextSeq 500
+ library_construction_protocol: Illumina COVIDSeq Test Kit
+ library_layout: PAIRED
+ library_name: Cov51
+ library_selection: RT-PCR
+ library_source: VIRAL RNA
+ library_strategy: AMPLICON
+ platform: ILLUMINA
+ sample_alias: phiX2
+ study_alias: study_alias_001
+ title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test
+ENA_run:
+ 2:
+ alias: run_alias_001
+ experiment_alias: exp_test_alias_001
+ file_format: FASTQ
+ file_name: run001.fastq.gz
+ENA_sample:
+ 2:
+ alias: sample_alias_001
+ collecting institution: Umbrella Corp.
+ collection date: '2021-05-03'
+ collector name: "John Doe"
+ definition for seropositive sample: ''
+ 2:
+ alias: phiX2
+ collecting institution: Umbrella Corp.
+ collection date: '2021-05-03'
+ collector name: "John Doe"
+ definition for seropositive sample: ''
+ENA_study:
+ 2:
+ alias: study_alias_001
+ study_abstract: "Help"
+ study_type: Whole Genome Sequencing
+ title: Whole genome sequencing of SARS-CoV-2
+YAML -------------
+
+Printing receipt to ./receipt.xml
+
+Submission was done successfully
+
+Study accession details:
+study_alias_001 FAKE0001 2011-01-16T10:52:06.497+01:00 added
+
+Sample accession details:
+sample_alias_001 FAKESAMP001 2011-01-16T10:52:06.497+01:00 added
+phiX2 FAKESAMP002 2011-01-16T10:52:06.497+01:00 added
+
+Saving updates in new tsv tables::
+save updates in ./submission_files/studies_updated.tsv
+save updates in ./submission_files/samples_updated.tsv
+save updates in ./submission_files/experiments_updated.tsv
+save updates in ./submission_files/runs_updated.tsv
+action_option add
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/receipt_sample_noPhiX.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/receipt_sample_noPhiX.txt Tue Feb 22 11:03:34 2022 +0000
@@ -0,0 +1,54 @@
+YAML -------------
+ENA_experiment:
+ 0:
+ alias: exp_test_alias_001
+ design_description: Lot's of coffe and magic
+ insert_size: 250.0
+ instrument_model: NextSeq 500
+ library_construction_protocol: Illumina COVIDSeq Test Kit
+ library_layout: PAIRED
+ library_name: Cov51
+ library_selection: RT-PCR
+ library_source: VIRAL RNA
+ library_strategy: AMPLICON
+ platform: ILLUMINA
+ sample_alias: sample_alias_001
+ study_alias: study_alias_001
+ title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test
+ENA_run:
+ 2:
+ alias: run_alias_001
+ experiment_alias: exp_test_alias_001
+ file_format: FASTQ
+ file_name: run001.fastq.gz
+ENA_sample:
+ 2:
+ alias: sample_alias_001
+ collecting institution: Umbrella Corp.
+ collection date: '2021-05-03'
+ collector name: "John Doe"
+ definition for seropositive sample: ''
+ENA_study:
+ 2:
+ alias: study_alias_001
+ study_abstract: "Help"
+ study_type: Whole Genome Sequencing
+ title: Whole genome sequencing of SARS-CoV-2
+YAML -------------
+
+Printing receipt to ./receipt.xml
+
+Submission was done successfully
+
+Study accession details:
+study_alias_001 FAKE0001 2011-01-16T10:52:06.497+01:00 added
+
+Sample accession details:
+sample_alias_001 FAKESAMP001 2011-01-16T10:52:06.497+01:00 added
+
+Saving updates in new tsv tables::
+save updates in ./submission_files/studies_updated.tsv
+save updates in ./submission_files/samples_updated.tsv
+save updates in ./submission_files/experiments_updated.tsv
+save updates in ./submission_files/runs_updated.tsv
+action_option add
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/sample_alias_001.fasta.gz
Binary file test-data/sample_alias_001.fasta.gz has changed