Mercurial > repos > ieguinoa > ena_webin_cli
comparison process_input.py @ 0:e25357392813 draft
Uploaded
| author | ieguinoa |
|---|---|
| date | Tue, 18 May 2021 16:30:52 +0000 |
| parents | |
| children | 7d751b5943b0 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e25357392813 |
|---|---|
| 1 import gzip | |
| 2 import os | |
| 3 import sys | |
| 4 import shutil | |
| 5 import yaml | |
| 6 | |
| 7 from Bio import SeqIO | |
| 8 | |
| 9 | |
| 10 """ | |
| 11 Takes as input: | |
| 12 1. A receipt obtained from ENA submission tool. | |
| 13 A txt file that includes a YAML section with | |
| 14 | |
| 15 2. A fasta file with fasta entries ids defined after the files used for the raw submission. | |
| 16 | |
| 17 3. Path to write generated manifests | |
| 18 4. Path to write generated fasta files | |
| 19 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..) | |
| 20 """ | |
| 21 | |
| 22 def get_section_string(f, start_line, end_line): | |
| 23 # consume starting lines | |
| 24 start_string = iter(f.readline, start_line) | |
| 25 start_string = ''.join(line for line in start_string) | |
| 26 # read YAML lines | |
| 27 yaml_string = iter(f.readline, end_line) | |
| 28 return ''.join(x for x in yaml_string) | |
| 29 | |
| 30 | |
| 31 def main(): | |
| 32 input_file = open(sys.argv[1]) | |
| 33 fasta_in = open(sys.argv[2]) | |
| 34 out_manifest_base = sys.argv[3] | |
| 35 out_fasta_base = sys.argv[4] | |
| 36 manifest_template = sys.argv[5] | |
| 37 yaml_delimiter = 'YAML -------------\n' | |
| 38 yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter)) | |
| 39 # print(yaml_only) | |
| 40 submission_tuples_list = [] | |
| 41 # parse the sequence IDs | |
| 42 for record in SeqIO.parse(fasta_in, "fasta"): | |
| 43 seq_id = record.id | |
| 44 # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? | |
| 45 # ... but for paired it may not be the exact same | |
| 46 # ...initially I should attempt to find a | |
| 47 # .. if this | |
| 48 # in any case, if I cant make the right match then I should do a kind of substring match | |
| 49 | |
| 50 # find the exp_alias associated with the file | |
| 51 exp_alias = None | |
| 52 for index,run in yaml_only['ENA_run'].items(): | |
| 53 if run['file_name'] == seq_id: | |
| 54 ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something | |
| 55 exp_alias = run['experiment_alias'] | |
| 56 break | |
| 57 if not exp_alias: | |
| 58 raise Exception("No run files match for the sequence entry {seq_id}") | |
| 59 # find the sample and study for that experiment | |
| 60 sample_alias = None | |
| 61 study_alias = None | |
| 62 for index,exp in yaml_only['ENA_experiment'].items(): | |
| 63 if exp['alias'] == exp_alias: | |
| 64 sample_alias = exp['sample_alias'] | |
| 65 study_alias = exp['study_alias'] | |
| 66 platform = exp['platform'] | |
| 67 break | |
| 68 if not sample_alias: | |
| 69 raise Exception("No sample associated with experiment {exp_alias}") | |
| 70 if not study_alias: | |
| 71 raise Exception("No study associated with experiment {exp_alias}") | |
| 72 | |
| 73 | |
| 74 # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID) | |
| 75 fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta') | |
| 76 with open(fasta_path, "w") as output_handle: | |
| 77 SeqIO.write([record], output_handle, "fasta") | |
| 78 #gzip the file (required by ENA upload tool) | |
| 79 fasta_path_gz = fasta_path + '.gz' | |
| 80 with open(fasta_path, 'rb') as f_in: | |
| 81 with gzip.open(fasta_path_gz, 'wb') as f_out: | |
| 82 shutil.copyfileobj(f_in, f_out) | |
| 83 # create the manifest | |
| 84 # add to the manifest the: | |
| 85 # | |
| 86 manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt') | |
| 87 with open(manifest_path, "w") as output_handle: | |
| 88 # first dump the contents of manifest template | |
| 89 # containing the global vars | |
| 90 with open(manifest_template) as m_template: | |
| 91 output_handle.write(m_template.read()) | |
| 92 output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n") | |
| 93 output_handle.write("PLATFORM\t" + platform + "\n") | |
| 94 output_handle.write("STUDY\t" + study_alias + "\n") | |
| 95 output_handle.write("SAMPLE\t" + sample_alias + "\n") | |
| 96 output_handle.write("FASTA\t" + fasta_path_gz + "\n") | |
| 97 | |
| 98 # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence | |
| 99 # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions?? | |
| 100 # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries | |
| 101 # samples_dir[study][sample] = seq_id + '.fasta' | |
| 102 submission_tuples_list.append((manifest_path, fasta_path)) | |
| 103 | |
| 104 with open('submit_list.tab', "w") as output_handle: | |
| 105 for submit_tuple in submission_tuples_list: | |
| 106 output_handle.write('\t'.join(submit_tuple) + '\n') | |
| 107 ## DEBUG CASE | |
| 108 #study details | |
| 109 # start_study = 'Study accession details:\n' | |
| 110 # empty_end = '\n' | |
| 111 # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end) | |
| 112 # if len(study_data.split('\n')) > 2: | |
| 113 # # more than 1 study accession | |
| 114 # raise Exception("Multiple study accessions found") | |
| 115 # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n') | |
| 116 # start_sample = 'Sample accession details:\n' | |
| 117 # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end) | |
| 118 # if len(sample_data.split('\n')) > 2: | |
| 119 # # more than 1 study accession | |
| 120 # raise Exception("Multiple sample accessions found") | |
| 121 # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n') | |
| 122 # platform = 'Ion Torrent' | |
| 123 # out_manifest.write(f"PLATFORM\t{platform}\n") | |
| 124 # out_manifest.close() | |
| 125 | |
| 126 if __name__ == '__main__': | |
| 127 main() |
