comparison process_xlsx.py @ 2:9e2df763086c draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 1eed23745846ce215e9bdc4a4934d6bc8f41b24e"
author iuc
date Thu, 15 Jul 2021 20:12:34 +0000
parents 57251c760cab
children 59bb6d34fca6
comparison
equal deleted inserted replaced
1:57251c760cab 2:9e2df763086c
2 import pathlib 2 import pathlib
3 import sys 3 import sys
4 4
5 import xlrd 5 import xlrd
6 import yaml 6 import yaml
7 from mappings import optional_samples_cols_mapping
7 8
8 FILE_FORMAT = 'fastq' 9 FILE_FORMAT = 'fastq'
9 10
10 11
11 def extract_data(xl_sheet, expected_columns): 12 def extract_data(xl_sheet, expected_columns, optional_cols=None):
12 """ 13 """
13 1. Check that the columns I expect are present in the sheet 14 1. Check that the columns I expect are present in the sheet
14 (any order and mixed with others, it's just a verification that 15 (any order and mixed with others, it's just a verification that
15 the user filled the correct template) 16 the user filled the correct template)
16 2. Fill a dictionary with the rows data indexed by first column in list""" 17 2. Fill a dictionary with the rows data indexed by first column in list"""
17 sheet_columns = {} 18 sheet_columns = {}
19 if optional_cols is None:
20 optional_cols = []
21 optional_cols_loaded = []
18 for sh_col in range(xl_sheet.ncols): 22 for sh_col in range(xl_sheet.ncols):
19 if xl_sheet.cell(0, sh_col).value in expected_columns: 23 if (xl_sheet.cell(0, sh_col).value in expected_columns) \
24 or (xl_sheet.cell(0, sh_col).value in optional_cols):
20 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): 25 if xl_sheet.cell(0, sh_col).value in sheet_columns.keys():
21 sys.exit("Duplicated columns") 26 sys.exit("Duplicated columns found")
22 else: 27 else:
23 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col 28 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col
29 if xl_sheet.cell(0, sh_col).value in optional_cols:
30 # store the list of optional cols available
31 optional_cols_loaded.append(xl_sheet.cell(0, sh_col).value)
32 provided_cols = expected_columns + optional_cols_loaded
33
34 # check that the required columns are all present
35 # TODO: revise this for optional columns
24 for col in range(len(expected_columns)): 36 for col in range(len(expected_columns)):
25 assert expected_columns[col] in sheet_columns.keys(), \ 37 assert expected_columns[col] in sheet_columns.keys(), \
26 "Expected column %s not found" % expected_columns[col] 38 "Expected column %s not found" % expected_columns[col]
27 39
28 # fetch rows in a dict 40 # fetch rows in a dict
30 # the first of the expected columns will be the index 42 # the first of the expected columns will be the index
31 index_col = sheet_columns[expected_columns[0]] 43 index_col = sheet_columns[expected_columns[0]]
32 # skip first 2 rows: column names + comments rows 44 # skip first 2 rows: column names + comments rows
33 for row_id in range(2, xl_sheet.nrows): 45 for row_id in range(2, xl_sheet.nrows):
34 row_dict = {} 46 row_dict = {}
35 for col in range(1, len(expected_columns)): 47 for col in range(1, len(provided_cols)):
36 sheet_col_index = sheet_columns[expected_columns[col]] 48 sheet_col_index = sheet_columns[provided_cols[col]]
37 row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value 49 row_dict[provided_cols[col]] = xl_sheet.cell(row_id, sheet_col_index).value
38 # should check for duplicate alias/ids? 50 # should check for duplicate alias/ids?
39 if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): 51 if xl_sheet.cell(row_id, index_col).value in data_dict.keys():
40 tmp = data_dict[xl_sheet.cell(row_id, index_col).value] 52 tmp = data_dict[xl_sheet.cell(row_id, index_col).value]
41 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp] 53 data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp]
42 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) 54 data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict)
43 else: 55 else:
44 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict 56 data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
45 return data_dict 57 return data_dict, optional_cols_loaded
46 58
47 59
48 def paste_xls2yaml(xlsx_path): 60 def paste_xls2yaml(xlsx_path):
49 print('YAML -------------') 61 print('YAML -------------')
50 xls = xlrd.open_workbook(xlsx_path) 62 xls = xlrd.open_workbook(xlsx_path)
84 xl_sheet = xl_workbook.sheet_by_name('ENA_study') 96 xl_sheet = xl_workbook.sheet_by_name('ENA_study')
85 if xl_sheet.nrows < 3: 97 if xl_sheet.nrows < 3:
86 raise ValueError('No entries found in studies sheet') 98 raise ValueError('No entries found in studies sheet')
87 studies_dict = {} 99 studies_dict = {}
88 studies_col = ['alias', 'title', 'study_type', 'study_abstract'] 100 studies_col = ['alias', 'title', 'study_type', 'study_abstract']
89 studies_dict = extract_data(xl_sheet, studies_col) 101 studies_dict, _ = extract_data(xl_sheet, studies_col)
90 102
91 # PARSE SAMPLES 103 # PARSE SAMPLES
92 ################# 104 #################
93 xl_sheet = xl_workbook.sheet_by_name('ENA_sample') 105 xl_sheet = xl_workbook.sheet_by_name('ENA_sample')
94 if xl_sheet.nrows < 3: 106 if xl_sheet.nrows < 3:
95 raise ValueError('No entries found in samples') 107 raise ValueError('No entries found in samples')
108
109 samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description']
110 # optional_samples_cols_mapping = {}
96 if args.viral_submission: 111 if args.viral_submission:
97 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description', 112 # load columns names from the table
98 'geographic location (country and/or sea)', 'host common name', 113 samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)',
99 'host health state', 'host sex', 'host scientific name', 'collector name', 114 'host common name', 'host health state',
100 'collection date', 'collecting institution', 'isolate'] 115 'host sex', 'host scientific name', 'collector name',
101 else: 116 'collecting institution', 'isolate']
102 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] 117
103 samples_dict = extract_data(xl_sheet, samples_cols) 118 samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel,
104 119 optional_samples_cols_mapping.keys())
105 # PARSE EXPERIMENTS 120 # PARSE EXPERIMENTS
106 ################# 121 #################
107 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') 122 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
108 if xl_sheet.nrows < 3: 123 if xl_sheet.nrows < 3:
109 raise ValueError('No experiments found in experiments sheet') 124 raise ValueError('No experiments found in experiments sheet')
110 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 125 exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description',
111 'library_name', 'library_strategy', 'library_source', 'library_selection', 126 'library_name', 'library_strategy', 'library_source', 'library_selection',
112 'library_layout', 'insert_size', 'library_construction_protocol', 127 'library_layout', 'insert_size', 'library_construction_protocol',
113 'platform', 'instrument_model'] 128 'platform', 'instrument_model']
114 129
115 experiments_dict = extract_data(xl_sheet, exp_columns) 130 experiments_dict, _ = extract_data(xl_sheet, exp_columns)
116 131
117 # PARSE RUNS SHEET 132 # PARSE RUNS SHEET
118 ################# 133 #################
119 xl_sheet = xl_workbook.sheet_by_name('ENA_run') 134 xl_sheet = xl_workbook.sheet_by_name('ENA_run')
120 if xl_sheet.nrows < 3: 135 if xl_sheet.nrows < 3:
121 raise ValueError('No entries found in runs sheet') 136 raise ValueError('No entries found in runs sheet')
122 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] 137 run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format']
123 runs_dict = extract_data(xl_sheet, run_cols) 138 runs_dict, _ = extract_data(xl_sheet, run_cols)
124 139
125 # WRITE HEADERS TO TABLES 140 # WRITE HEADERS TO TABLES
126 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') 141 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
127 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', 142 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type',
128 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') 143 'study_abstract', 'pubmed_id', 'submission_date']) + '\n')
129 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') 144 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
145
146 samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
147 # extend the samples_cols list to add the ones that are filled by the CLI
148 samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
130 if args.viral_submission: 149 if args.viral_submission:
131 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', 150 # extend the samples columns with the viral specific data
132 'taxon_id', 'sample_description', 'collection_date', 151 samples_cols = samples_cols + ['geographic_location', 'host_common_name',
133 'geographic_location', 'host_common_name', 'host_subject_id', 152 'host_subject_id', 'host_health_state', 'host_sex',
134 'host_health_state', 'host_sex', 'host_scientific_name', 153 'host_scientific_name', 'collector_name',
135 'collector_name', 'collecting_institution', 'isolate', 154 'collecting_institution', 'isolate']
136 'submission_date']) + '\n') 155 if len(samples_optional_cols_loaded) > 0:
137 else: 156 for optional_cols_excel in samples_optional_cols_loaded:
138 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', 157 samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
139 'taxon_id', 'sample_description', 'submission_date']) + '\n') 158 samples_table.write('\t'.join(samples_cols) + '\n')
140 159
141 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') 160 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
142 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', 161 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias',
143 'sample_alias', 'design_description', 'library_name', 162 'sample_alias', 'design_description', 'library_name',
144 'library_strategy', 'library_source', 'library_selection', 163 'library_strategy', 'library_source', 'library_selection',
162 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], 181 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
163 study['study_type'], study['study_abstract'], '', 182 study['study_type'], study['study_abstract'], '',
164 'ENA_submission_data']) + '\n') # assuming no pubmed_id 183 'ENA_submission_data']) + '\n') # assuming no pubmed_id
165 for sample_alias, sample in samples_dict.items(): 184 for sample_alias, sample in samples_dict.items():
166 # sample_alias = sample_alias + '_' + timestamp 185 # sample_alias = sample_alias + '_' + timestamp
186 samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
187 sample['sample_description'], action, 'ena_accession',
188 'tax_id_updated_by_ENA', 'ENA_submission_date']
167 if args.viral_submission: 189 if args.viral_submission:
190 # add the values that are unique for the viral samples
168 if sample['collector name'] == '': 191 if sample['collector name'] == '':
169 sample['collector name'] = 'unknown' 192 sample['collector name'] = 'unknown'
170 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], 193 samples_row_values = samples_row_values + \
171 sample['scientific_name'], 'tax_id_updated_by_ENA', 194 [sample['geographic location (country and/or sea)'], sample['host common name'],
172 sample['sample_description'], sample['collection date'], 195 'host subject id', sample['host health state'], sample['host sex'],
173 sample['geographic location (country and/or sea)'], 196 sample['host scientific name'], sample['collector name'],
174 sample['host common name'], 'host subject id', 197 sample['collecting institution'], sample['isolate']]
175 sample['host health state'], sample['host sex'], 198 # add the (possible) optional columns values
176 sample['host scientific name'], sample['collector name'], 199 if len(samples_optional_cols_loaded) > 0:
177 sample['collecting institution'], sample['isolate'], 200 for optional_col in samples_optional_cols_loaded:
178 'ENA_submission_date']) + '\n') 201 # parse values stored as in excel date format (=float)
179 else: 202 if optional_col in ('collection date', 'receipt date'):
180 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], 203 # check if excel stored it as date
181 sample['scientific_name'], 'tax_id_updated_by_ENA', 204 if isinstance(sample[optional_col], float):
182 sample['sample_description']]) + '\n') 205 year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
206 sample[optional_col], xl_workbook.datemode)
207 month = "{:02d}".format(month)
208 day = "{:02d}".format(day)
209 hour = "{:02d}".format(hour)
210 minute = "{:02d}".format(minute)
211 second = "{:02d}".format(second)
212 # format it as 2008-01-23T19:23:10
213 sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \
214 'T' + str(hour) + ':' + str(minute) + ':' + str(second)
215 # excel stores everything as float so I need to check if
216 # the value was actually an int and keep it as int
217 if isinstance(sample[optional_col], float):
218 if int(sample[optional_col]) == sample[optional_col]:
219 # it is not really a float but an int
220 sample[optional_col] = int(sample[optional_col])
221 samples_row_values.append(str(sample[optional_col]))
222 samples_table.write('\t'.join(samples_row_values) + '\n')
223
183 for exp_alias, exp in experiments_dict.items(): 224 for exp_alias, exp in experiments_dict.items():
184 # should I check here if any experiment has a study or sample alias that is incorrect? 225 # should I check here if any experiment has a study or sample alias that is incorrect?
185 # (not listed in the samples or study dict) 226 # (not listed in the samples or study dict)
186 # process the experiments for this sample 227 # process the experiments for this sample
187 if exp['sample_alias'] == sample_alias: 228 if exp['sample_alias'] == sample_alias: