comparison process_xlsx.py @ 1:57251c760cab draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ffea061c1ad6e7291abfe220230dbdbe8d19a2bd"
author iuc
date Fri, 30 Apr 2021 12:09:25 +0000
parents 382518f24d6d
children 9e2df763086c
comparison of 0:382518f24d6d (parent) with 1:57251c760cab; removed lines are prefixed with '-', added lines with '+'
 import argparse
 import pathlib
 import sys

 import xlrd
+import yaml

 FILE_FORMAT = 'fastq'


 def extract_data(xl_sheet, expected_columns):
...
         row_dict = {}
         for col in range(1, len(expected_columns)):
             sheet_col_index = sheet_columns[expected_columns[col]]
             row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value
         # should check for duplicate alias/ids?
-        data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
+        if xl_sheet.cell(row_id, index_col).value in data_dict.keys():
+            tmp = data_dict[xl_sheet.cell(row_id, index_col).value]
+            data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp]
+            data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict)
+        else:
+            data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
     return data_dict
+
+
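The change to extract_data above stops a second spreadsheet row with the same alias from silently overwriting the first one: on a repeated alias the stored row dict is wrapped in a list and the new row appended (the typical case being the two FASTQ files of a paired-end run listed under one run alias). A minimal sketch with made-up data:

def collect(rows):
    # mirrors the duplicate-alias handling added to extract_data
    data = {}
    for alias, row in rows:
        if alias in data:
            tmp = data[alias]
            data[alias] = [tmp]
            data[alias].append(row)
        else:
            data[alias] = row
    return data


print(collect([('run_1', {'file_name': 'sample1_R1.fastq.gz'}),
               ('run_1', {'file_name': 'sample1_R2.fastq.gz'})]))
# {'run_1': [{'file_name': 'sample1_R1.fastq.gz'}, {'file_name': 'sample1_R2.fastq.gz'}]}

Note that a third row with the same alias would wrap the existing list once more, so the pattern assumes at most two rows per alias, i.e. the paired-end case.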
+def paste_xls2yaml(xlsx_path):
+    print('YAML -------------')
+    xls = xlrd.open_workbook(xlsx_path)
+    content_dict = {}
+    for sheet_name in xls.sheet_names():
+        if sheet_name == 'controlled_vocabulary':
+            continue
+        xls_sheet = xls.sheet_by_name(sheet_name)
+        sheet_contents_dict = {}
+        colnames = []
+        for col in range(xls_sheet.ncols):
+            colnames.append(xls_sheet.cell(0, col).value)
+        # skip first 2 rows (column names and suggestions)
+        for row_id in range(2, xls_sheet.nrows):
+            row_dict = {}
+            for col_id in range(0, xls_sheet.ncols):
+                row_dict[colnames[col_id]] = xls_sheet.cell(row_id, col_id).value
+            # should check for duplicate alias/ids?
+            sheet_contents_dict[row_id] = row_dict
+        content_dict[sheet_name] = sheet_contents_dict
+    yaml.dump(content_dict, sys.stdout)
+    print('YAML -------------')


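The new paste_xls2yaml helper dumps every sheet of the workbook except controlled_vocabulary to stdout as YAML, keyed first by sheet name and then by row index. Roughly what that output looks like for one sheet with a single data row; the sheet and column names here are illustrative only, not taken from the template:

import sys

import yaml

content_dict = {
    'ENA_study': {
        2: {'alias': 'study_1', 'title': 'An example study', 'study_type': 'Other'},
    },
}
yaml.dump(content_dict, sys.stdout)
# ENA_study:
#   2:
#     alias: study_1
#     study_type: Other
#     title: An example study

Row indices start at 2 because the first two rows of each sheet hold the column names and the suggestions.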
 parser = argparse.ArgumentParser()
 parser.add_argument('--form', dest='xlsx_path', required=True)
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()

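With the new flag, a hypothetical direct invocation of the script could look like the line below; --form, --out_dir and --action are required, --vir and --verbose are optional, and the file name, output directory and action value are placeholders:

python process_xlsx.py --form metadata.xlsx --out_dir ./tables --action add --vir --verbose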
 xl_workbook = xlrd.open_workbook(args.xlsx_path)

 # PARSE STUDIES
...
 # PARSE EXPERIMENTS
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
 if xl_sheet.nrows < 3:
     raise ValueError('No experiments found in experiments sheet')
-exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name',
-               'library_strategy', 'library_source', 'library_selection', 'library_layout',
-               'insert_size', 'library_construction_protocol', 'platform', 'instrument_model']
+exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description',
+               'library_name', 'library_strategy', 'library_source', 'library_selection',
+               'library_layout', 'insert_size', 'library_construction_protocol',
+               'platform', 'instrument_model']
+
 experiments_dict = extract_data(xl_sheet, exp_columns)

 # PARSE RUNS SHEET
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_run')
...
 # WRITE DICTIONARIES TO TABLE FILES

 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
 # dt_oobj = datetime.now(tz=None)
 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
+runs_included = []
+exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
     studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
                                    study['study_type'], study['study_abstract'], '',
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
...
                                                    exp['library_layout'].lower(),
                                                    str(int(exp['insert_size'])),
                                                    exp['library_construction_protocol'],
                                                    exp['platform'], exp['instrument_model'],
                                                    'submission_date_ENA']) + '\n')
+                exp_included.append(exp_alias)
                 for run_alias, run in runs_dict.items():
-                    if run['experiment_alias'] == exp_alias:
-                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
-                                                    run['file_name'], FILE_FORMAT, 'file_checksum',
-                                                    'submission_date_ENA']) + '\n')
+                    # check that the experiments library_layout is set to paired
+                    # when multiple entries are associated with the same run alias
+                    if not isinstance(run, list):
+                        runs_list = [run]
+                    else:
+                        runs_list = run
+                    for run_entry in runs_list:
+                        if run_entry['experiment_alias'] == exp_alias:
+                            runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                                                        exp_alias, run_entry['file_name'],
+                                                        FILE_FORMAT, 'file_checksum',
+                                                        'submission_date_ENA']) + '\n')
+                            runs_included.append(run_alias)
+
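Because extract_data can now return a list of row dicts for a run alias, the loop above first normalises run into runs_list and then writes one runs-table line per file. A standalone sketch with made-up aliases and file names (the action value is illustrative; the checksum and date fields are the script's own placeholders):

FILE_FORMAT = 'fastq'
action = 'add'  # illustrative; in the script this comes from --action
exp_alias = 'exp_1'
runs_dict = {
    'run_1': [{'experiment_alias': 'exp_1', 'file_name': 'sample1_R1.fastq.gz'},
              {'experiment_alias': 'exp_1', 'file_name': 'sample1_R2.fastq.gz'}],
}
for run_alias, run in runs_dict.items():
    runs_list = run if isinstance(run, list) else [run]
    for run_entry in runs_list:
        if run_entry['experiment_alias'] == exp_alias:
            print('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
                             run_entry['file_name'], FILE_FORMAT, 'file_checksum',
                             'submission_date_ENA']))
# -> two tab-separated rows for run_1, one per FASTQ file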
+# check if any experiment or run was not associated with any sample
+for run in runs_dict.keys():
+    if run not in runs_included:
+        print(f'The run {run} is listed in the runs section but not associated with any \
+used experiment')
+
+for exp in experiments_dict.keys():
+    if exp not in exp_included:
+        print(f'The experiment {exp} is listed in the experiments section but not associated \
+with any used sample')
+
 studies_table.close()
 samples_table.close()
 experiments_table.close()
 runs_table.close()
+
+if args.verbose:
+    paste_xls2yaml(args.xlsx_path)