diff process_input.py @ 0:e25357392813 draft

Uploaded
author ieguinoa
date Tue, 18 May 2021 16:30:52 +0000
parents
children 7d751b5943b0
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/process_input.py	Tue May 18 16:30:52 2021 +0000
@@ -0,0 +1,127 @@
+import gzip
+import os
+import sys
+import shutil
+import yaml
+
+from Bio import SeqIO
+
+
+"""
+Takes as input:
+    1. A receipt obtained from ENA submission tool. 
+    A txt file that includes a YAML section with 
+
+    2. A fasta file with fasta entries ids defined after the files used for the raw submission.
+
+    3. Path to write generated manifests
+    4. Path to write generated fasta files
+    5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
+"""
+
+def get_section_string(f, start_line, end_line):
+    # consume starting lines
+    start_string = iter(f.readline, start_line)
+    start_string = ''.join(line for line in start_string)
+    # read YAML lines
+    yaml_string = iter(f.readline, end_line)
+    return ''.join(x for x in yaml_string)
+
+
+def main():
+    input_file = open(sys.argv[1])
+    fasta_in = open(sys.argv[2])
+    out_manifest_base = sys.argv[3] 
+    out_fasta_base = sys.argv[4] 
+    manifest_template = sys.argv[5]
+    yaml_delimiter = 'YAML -------------\n'
+    yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
+    # print(yaml_only)
+    submission_tuples_list = []
+    # parse the sequence IDs
+    for record in SeqIO.parse(fasta_in, "fasta"):
+        seq_id = record.id
+        # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? 
+        #  ... but for paired it may not be the exact same
+        # ...initially I should attempt to find a 
+        #  .. if this 
+        # in any case, if I cant make the right match then I should do a kind of substring match
+
+        # find the exp_alias associated with the file
+        exp_alias = None
+        for index,run in yaml_only['ENA_run'].items():
+            if run['file_name'] == seq_id:
+                ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
+                exp_alias = run['experiment_alias']
+                break
+        if not exp_alias:
+            raise Exception("No run files match for the sequence entry {seq_id}")
+        # find the sample and study for that experiment
+        sample_alias = None
+        study_alias = None
+        for index,exp in yaml_only['ENA_experiment'].items():
+            if exp['alias'] == exp_alias:
+                sample_alias = exp['sample_alias']
+                study_alias = exp['study_alias']
+                platform = exp['platform']
+                break
+        if not sample_alias:
+            raise Exception("No sample associated with experiment {exp_alias}")
+        if not study_alias:
+            raise Exception("No study associated with experiment {exp_alias}")
+
+
+        # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
+        fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
+        with open(fasta_path, "w") as output_handle:
+            SeqIO.write([record], output_handle, "fasta")
+        #gzip the file (required by ENA upload tool)
+        fasta_path_gz = fasta_path + '.gz'
+        with open(fasta_path, 'rb') as f_in:
+            with gzip.open(fasta_path_gz, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        # create the manifest
+        # add to the manifest the: 
+        # 
+        manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
+        with open(manifest_path, "w") as output_handle:
+            # first dump the contents of manifest template
+            # containing the global vars
+            with open(manifest_template) as m_template:
+                output_handle.write(m_template.read())
+            output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
+            output_handle.write("PLATFORM\t" + platform + "\n")
+            output_handle.write("STUDY\t" + study_alias + "\n")
+            output_handle.write("SAMPLE\t" + sample_alias + "\n")
+            output_handle.write("FASTA\t" + fasta_path_gz + "\n")
+
+        # ... and a dict  (or tuple list???) that contains for each study - sample  the name of the file that has the consensus sequence
+        # ****  is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
+        # in the latest case then I also need to parse the  Study accession details: and Sample accession details: entries
+        # samples_dir[study][sample] = seq_id + '.fasta'
+        submission_tuples_list.append((manifest_path, fasta_path))
+
+    with open('submit_list.tab', "w") as output_handle:
+        for submit_tuple in submission_tuples_list:
+            output_handle.write('\t'.join(submit_tuple) + '\n')
+    ## DEBUG CASE
+    #study details
+    # start_study = 'Study accession details:\n'
+    # empty_end = '\n'
+    # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
+    # if len(study_data.split('\n')) > 2:
+        # # more than 1 study accession
+        # raise Exception("Multiple study accessions found")
+    # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
+    # start_sample = 'Sample accession details:\n'
+    # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
+    # if len(sample_data.split('\n')) > 2:
+        # # more than 1 study accession
+        # raise Exception("Multiple sample accessions found")
+    # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
+    # platform = 'Ion Torrent'
+    # out_manifest.write(f"PLATFORM\t{platform}\n")
+    # out_manifest.close()
+
+if __name__ == '__main__':
+    main()