# HG changeset patch # User ieguinoa # Date 1645527814 0 # Node ID 7d751b5943b0353dc705231050d306e37b4a24dc # Parent 1ecd8ce07db49cb4923870b8cd61694e9c87c809 Uploaded diff -r 1ecd8ce07db4 -r 7d751b5943b0 ena_consensus_submit.xml --- a/ena_consensus_submit.xml Fri Feb 04 15:52:45 2022 +0000 +++ b/ena_consensus_submit.xml Tue Feb 22 11:03:34 2022 +0000 @@ -51,8 +51,18 @@ echo -e 'MOLECULETYPE\t$molecule_type' >> $manifest_base; #if $metadata_file_or_form.metadata_format == "file": + #import re + #for $file in $metadata_file_or_form.genome_fasta: + #if $file.is_of_type('fasta'): + #set $full_name = $file.element_identifier + '.gz' + gzip -c $file > './fasta/$full_name'; + #else: + ln -s $file './fasta/$file.element_identifier'; + #end if + + #end for ## process the input tables, this creates an intermediate file with information - python3 '$__tool_directory__/process_input.py' $metadata_file_or_form.ena_receipt $genome_fasta './manifests' './fasta' $manifest_base; + python3 '$__tool_directory__/process_input.py' $metadata_file_or_form.ena_receipt $genome_fasta_files './manifests' $manifest_base >> $webin_cli_log; center_name=`grep 'center_name' $metadata_file_or_form.ena_receipt | cut -f2,2 | tr -d '\n'`; #else: #set $generated_manifest='./manifests/generated_manifest.txt' @@ -64,6 +74,12 @@ center_name='$metadata_file_or_form.center_name'; echo -e 'NAME\t$metadata_file_or_form.assembly_name' >> $generated_manifest; echo -e 'PLATFORM\t$metadata_file_or_form.sequencing_platform' >> $generated_manifest; + #if $metadata_file_or_form.genome_fasta.is_of_type('fasta'): + gzip -c $metadata_file_or_form.genome_fasta > consensus.fasta.gz; + #else: + ln -s $metadata_file_or_form.genome_fasta consensus.fasta.gz; + #end if + echo -e 'FASTA\tconsensus.fasta.gz' >> $generated_manifest; #end if #set $outputs_dir = 'outputs' @@ -73,6 +89,7 @@ ## in case of errors, this list is empty while read line; do manifest=`echo \$line | cut -d' ' -f1,1`; + echo "Submitting manifest \$manifest" >> $webin_cli_log; ena-webin-cli -context genome -userName "'\$webin_id'" @@ -84,12 +101,10 @@ -validate #end if -outputDir $outputs_dir - >> $webin_cli_log ; - done < submit_list.tab + >> $webin_cli_log; + done < submit_list.tab; #else: - gzip -c $genome_fasta > consensus.fasta.gz; - echo -e 'FASTA\tconsensus.fasta.gz' >> $generated_manifest; ena-webin-cli #if $submit_test == "true": -test @@ -109,7 +124,7 @@ >> $webin_cli_log ; ##cp ./genome/$metadata_file_or_form.assembly_name/validate/webin-cli.report $validate_output #end if -tar -cf $webin_cli_outputs $outputs_dir; +tar -cf $webin_cli_outputs $outputs_dir ; ]]> + +#import json +#import re +#if $metadata_file_or_form.metadata_format == "file": + #set $fasta_files_list = list() + #for $file in $metadata_file_or_form.genome_fasta: + $fasta_files_list.append(str($file.element_identifier)) + #end for + #echo json.dumps($fasta_files_list) +#end if + - + @@ -145,14 +171,16 @@ + - - + + + @@ -170,7 +198,6 @@ - @@ -183,6 +210,7 @@ + @@ -192,8 +220,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 1ecd8ce07db4 -r 7d751b5943b0 process_input.py --- a/process_input.py Fri Feb 04 15:52:45 2022 +0000 +++ b/process_input.py Tue Feb 22 11:03:34 2022 +0000 @@ -1,127 +1,127 @@ import gzip +import json import os import sys import shutil import yaml -from Bio import SeqIO - - -""" -Takes as input: - 1. A receipt obtained from ENA submission tool. - A txt file that includes a YAML section with - - 2. A fasta file with fasta entries ids defined after the files used for the raw submission. - - 3. Path to write generated manifests - 4. Path to write generated fasta files - 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..) -""" - -def get_section_string(f, start_line, end_line): +def get_section_string(f, start_line, end_line, return_string=False): # consume starting lines start_string = iter(f.readline, start_line) start_string = ''.join(line for line in start_string) # read YAML lines yaml_string = iter(f.readline, end_line) - return ''.join(x for x in yaml_string) + if return_string: + return ''.join(x for x in yaml_string) + else: + return [x for x in yaml_string] + +def fill_from_yaml_data(yaml_only_dict, studies_samples_dict): + # fill experiment information (platform) **** + for index,exp in yaml_only_dict['ENA_experiment'].items(): + study_alias = exp['study_alias'] + sample_alias = exp['sample_alias'] + if study_alias in studies_samples_dict.keys(): + if sample_alias in studies_samples_dict[study_alias].keys(): + studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']}) + else: + studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]} + else: + studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}} + + +def load_receipt_data(input_file_path): + # should do some health check of the input file? + # load yaml section + loaded_data = {} + yaml_delimiter = 'YAML -------------\n' + with open(input_file_path) as input_file: + yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True)) + fill_from_yaml_data(yaml_only_section, loaded_data) + # read study accessions + study_delimiter = 'Study accession details:\n' + end_line = '\n' + with open(input_file_path) as input_file: + studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line) + # loaded_data['studies'] = {} + for study_line in studies_accession_lines: + if study_line != '\n': + alias, accession, *_ = study_line.split('\t') + try: + loaded_data[alias]['accession'] = accession + except KeyError: + print(f"Experiment {exp} has unknown study or sample") + # loaded_data['studies'][alias]['accession'] = accession + samples_delimiter = 'Sample accession details:\n' + with open(input_file_path) as input_file: + samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line) + ## need to iterate over all studies, because here I don't know which study is the sample from. + # loaded_data['samples'] = {} + for sample_line in samples_accession_lines: + if sample_line != '\n': + alias, accession, *_ = sample_line.split('\t') + for study in loaded_data.keys(): + if alias in loaded_data[study].keys(): + loaded_data[study][alias]['accession'] = accession + break + return loaded_data + + +""" +Takes as input: + 1. A receipt obtained from ENA submission tool: + a txt file that contains sections describing submission details. + 2. A json file with the list of fasta that the user loaded + 3. Path to write generated manifests + 4. Manifest template path: the manifest with the global values set + (e.g COVERAGE, MINGAPLENGHT..) +""" + def main(): - input_file = open(sys.argv[1]) - fasta_in = open(sys.argv[2]) + input_file_path = sys.argv[1] + fasta_names_list_path = sys.argv[2] out_manifest_base = sys.argv[3] - out_fasta_base = sys.argv[4] - manifest_template = sys.argv[5] - yaml_delimiter = 'YAML -------------\n' - yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) - # print(yaml_only) - submission_tuples_list = [] - # parse the sequence IDs - for record in SeqIO.parse(fasta_in, "fasta"): - seq_id = record.id - # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? - # ... but for paired it may not be the exact same - # ...initially I should attempt to find a - # .. if this - # in any case, if I cant make the right match then I should do a kind of substring match - - # find the exp_alias associated with the file - exp_alias = None - for index,run in yaml_only['ENA_run'].items(): - if run['file_name'] == seq_id: - ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something - exp_alias = run['experiment_alias'] - break - if not exp_alias: - raise Exception("No run files match for the sequence entry {seq_id}") - # find the sample and study for that experiment - sample_alias = None - study_alias = None - for index,exp in yaml_only['ENA_experiment'].items(): - if exp['alias'] == exp_alias: - sample_alias = exp['sample_alias'] - study_alias = exp['study_alias'] - platform = exp['platform'] - break - if not sample_alias: - raise Exception("No sample associated with experiment {exp_alias}") - if not study_alias: - raise Exception("No study associated with experiment {exp_alias}") - + manifest_template = sys.argv[4] + # load submitted data from receipt file + data_dict = load_receipt_data(input_file_path) + # iterate over the list of fasta files + with open(fasta_names_list_path, 'r') as fasta_files_json_file: + fasta_files_list = json.load(fasta_files_json_file) + with open('submit_list.tab', 'w') as written_manifests_out: + for fasta_file in fasta_files_list: + if fasta_file.endswith('.fasta.gz'): + sample_alias = fasta_file[:-9] + else: + sample_alias = fasta_file[:-6] + print(f'Processing {sample_alias}') + found_metadata = False + for study_alias in data_dict.keys(): + if sample_alias in data_dict[study_alias].keys(): + sample_accession = data_dict[study_alias][sample_alias]['accession'] + study_accession = data_dict[study_alias]['accession'] + ### TODO get a string that concatenates plaform information from multiple exp + platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform'] + manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt') + with open(manifest_path, "w") as output_handle: + # first dump the contents of manifest template + # containing the global vars + with open(manifest_template) as m_template: + output_handle.write(m_template.read()) + output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n") + output_handle.write("PLATFORM\t" + platform + "\n") + output_handle.write("STUDY\t" + study_accession + "\n") + output_handle.write("SAMPLE\t" + sample_accession + "\n") + # files should be available in the corresponding dir and named: + # sample_alias.fasta.gz + output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n") + found_metadata = True + written_manifests_out.write(manifest_path + '\n') + break + if not found_metadata: + print(f'No metadata found for sample {sample_alias}') - # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID) - fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta') - with open(fasta_path, "w") as output_handle: - SeqIO.write([record], output_handle, "fasta") - #gzip the file (required by ENA upload tool) - fasta_path_gz = fasta_path + '.gz' - with open(fasta_path, 'rb') as f_in: - with gzip.open(fasta_path_gz, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - # create the manifest - # add to the manifest the: - # - manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt') - with open(manifest_path, "w") as output_handle: - # first dump the contents of manifest template - # containing the global vars - with open(manifest_template) as m_template: - output_handle.write(m_template.read()) - output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n") - output_handle.write("PLATFORM\t" + platform + "\n") - output_handle.write("STUDY\t" + study_alias + "\n") - output_handle.write("SAMPLE\t" + sample_alias + "\n") - output_handle.write("FASTA\t" + fasta_path_gz + "\n") - - # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence - # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions?? - # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries - # samples_dir[study][sample] = seq_id + '.fasta' - submission_tuples_list.append((manifest_path, fasta_path)) - - with open('submit_list.tab', "w") as output_handle: - for submit_tuple in submission_tuples_list: - output_handle.write('\t'.join(submit_tuple) + '\n') - ## DEBUG CASE - #study details - # start_study = 'Study accession details:\n' - # empty_end = '\n' - # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end) - # if len(study_data.split('\n')) > 2: - # # more than 1 study accession - # raise Exception("Multiple study accessions found") - # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n') - # start_sample = 'Sample accession details:\n' - # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end) - # if len(sample_data.split('\n')) > 2: - # # more than 1 study accession - # raise Exception("Multiple sample accessions found") - # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n') - # platform = 'Ion Torrent' - # out_manifest.write(f"PLATFORM\t{platform}\n") - # out_manifest.close() if __name__ == '__main__': main() diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/.phiX.fasta.swp Binary file test-data/.phiX.fasta.swp has changed diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/.receipt_sample.txt.swp Binary file test-data/.receipt_sample.txt.swp has changed diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/phiX2.fasta.gz Binary file test-data/phiX2.fasta.gz has changed diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/receipt_sample.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/receipt_sample.txt Tue Feb 22 11:03:34 2022 +0000 @@ -0,0 +1,76 @@ +YAML ------------- +ENA_experiment: + 0: + alias: exp_test_alias_001 + design_description: Lot's of coffe and magic + insert_size: 250.0 + instrument_model: NextSeq 500 + library_construction_protocol: Illumina COVIDSeq Test Kit + library_layout: PAIRED + library_name: Cov51 + library_selection: RT-PCR + library_source: VIRAL RNA + library_strategy: AMPLICON + platform: ILLUMINA + sample_alias: sample_alias_001 + study_alias: study_alias_001 + title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test + 1: + alias: exp_test_alias_002 + design_description: Lot's of coffe and magic + insert_size: 250.0 + instrument_model: NextSeq 500 + library_construction_protocol: Illumina COVIDSeq Test Kit + library_layout: PAIRED + library_name: Cov51 + library_selection: RT-PCR + library_source: VIRAL RNA + library_strategy: AMPLICON + platform: ILLUMINA + sample_alias: phiX2 + study_alias: study_alias_001 + title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test +ENA_run: + 2: + alias: run_alias_001 + experiment_alias: exp_test_alias_001 + file_format: FASTQ + file_name: run001.fastq.gz +ENA_sample: + 2: + alias: sample_alias_001 + collecting institution: Umbrella Corp. + collection date: '2021-05-03' + collector name: "John Doe" + definition for seropositive sample: '' + 2: + alias: phiX2 + collecting institution: Umbrella Corp. + collection date: '2021-05-03' + collector name: "John Doe" + definition for seropositive sample: '' +ENA_study: + 2: + alias: study_alias_001 + study_abstract: "Help" + study_type: Whole Genome Sequencing + title: Whole genome sequencing of SARS-CoV-2 +YAML ------------- + +Printing receipt to ./receipt.xml + +Submission was done successfully + +Study accession details: +study_alias_001 FAKE0001 2011-01-16T10:52:06.497+01:00 added + +Sample accession details: +sample_alias_001 FAKESAMP001 2011-01-16T10:52:06.497+01:00 added +phiX2 FAKESAMP002 2011-01-16T10:52:06.497+01:00 added + +Saving updates in new tsv tables:: +save updates in ./submission_files/studies_updated.tsv +save updates in ./submission_files/samples_updated.tsv +save updates in ./submission_files/experiments_updated.tsv +save updates in ./submission_files/runs_updated.tsv +action_option add diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/receipt_sample_noPhiX.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/receipt_sample_noPhiX.txt Tue Feb 22 11:03:34 2022 +0000 @@ -0,0 +1,54 @@ +YAML ------------- +ENA_experiment: + 0: + alias: exp_test_alias_001 + design_description: Lot's of coffe and magic + insert_size: 250.0 + instrument_model: NextSeq 500 + library_construction_protocol: Illumina COVIDSeq Test Kit + library_layout: PAIRED + library_name: Cov51 + library_selection: RT-PCR + library_source: VIRAL RNA + library_strategy: AMPLICON + platform: ILLUMINA + sample_alias: sample_alias_001 + study_alias: study_alias_001 + title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test +ENA_run: + 2: + alias: run_alias_001 + experiment_alias: exp_test_alias_001 + file_format: FASTQ + file_name: run001.fastq.gz +ENA_sample: + 2: + alias: sample_alias_001 + collecting institution: Umbrella Corp. + collection date: '2021-05-03' + collector name: "John Doe" + definition for seropositive sample: '' +ENA_study: + 2: + alias: study_alias_001 + study_abstract: "Help" + study_type: Whole Genome Sequencing + title: Whole genome sequencing of SARS-CoV-2 +YAML ------------- + +Printing receipt to ./receipt.xml + +Submission was done successfully + +Study accession details: +study_alias_001 FAKE0001 2011-01-16T10:52:06.497+01:00 added + +Sample accession details: +sample_alias_001 FAKESAMP001 2011-01-16T10:52:06.497+01:00 added + +Saving updates in new tsv tables:: +save updates in ./submission_files/studies_updated.tsv +save updates in ./submission_files/samples_updated.tsv +save updates in ./submission_files/experiments_updated.tsv +save updates in ./submission_files/runs_updated.tsv +action_option add diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/sample_alias_001.fasta.gz Binary file test-data/sample_alias_001.fasta.gz has changed