diff process_xlsx.py @ 1:57251c760cab draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ffea061c1ad6e7291abfe220230dbdbe8d19a2bd"
author iuc
date Fri, 30 Apr 2021 12:09:25 +0000
parents 382518f24d6d
children 9e2df763086c
line wrap: on
line diff
--- a/process_xlsx.py	Sat Nov 28 09:45:44 2020 +0000
+++ b/process_xlsx.py	Fri Apr 30 12:09:25 2021 +0000
@@ -3,7 +3,7 @@
 import sys
 
 import xlrd
-
+import yaml
 
 FILE_FORMAT = 'fastq'
 
@@ -36,15 +36,45 @@
             sheet_col_index = sheet_columns[expected_columns[col]]
             row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value
         # should check for duplicate alias/ids?
-        data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
+        if xl_sheet.cell(row_id, index_col).value in data_dict.keys():
+            tmp = data_dict[xl_sheet.cell(row_id, index_col).value]
+            data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp]
+            data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict)
+        else:
+            data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
     return data_dict
 
 
+def paste_xls2yaml(xlsx_path):
+    print('YAML -------------')
+    xls = xlrd.open_workbook(xlsx_path)
+    content_dict = {}
+    for sheet_name in xls.sheet_names():
+        if sheet_name == 'controlled_vocabulary':
+            continue
+        xls_sheet = xls.sheet_by_name(sheet_name)
+        sheet_contents_dict = {}
+        colnames = []
+        for col in range(xls_sheet.ncols):
+            colnames.append(xls_sheet.cell(0, col).value)
+        # skip first 2 rows (column names and suggestions)
+        for row_id in range(2, xls_sheet.nrows):
+            row_dict = {}
+            for col_id in range(0, xls_sheet.ncols):
+                row_dict[colnames[col_id]] = xls_sheet.cell(row_id, col_id).value
+            # should check for duplicate alias/ids?
+            sheet_contents_dict[row_id] = row_dict
+        content_dict[sheet_name] = sheet_contents_dict
+    yaml.dump(content_dict, sys.stdout)
+    print('YAML -------------')
+
+
 parser = argparse.ArgumentParser()
 parser.add_argument('--form', dest='xlsx_path', required=True)
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()
 
 xl_workbook = xlrd.open_workbook(args.xlsx_path)
@@ -77,9 +107,11 @@
 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
 if xl_sheet.nrows < 3:
     raise ValueError('No experiments found in experiments sheet')
-exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name',
-               'library_strategy', 'library_source', 'library_selection', 'library_layout',
-               'insert_size', 'library_construction_protocol', 'platform', 'instrument_model']
+exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description',
+               'library_name', 'library_strategy', 'library_source', 'library_selection',
+               'library_layout', 'insert_size', 'library_construction_protocol',
+               'platform', 'instrument_model']
+
 experiments_dict = extract_data(xl_sheet, exp_columns)
 
 # PARSE RUNS SHEET
@@ -123,6 +155,8 @@
 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
 # dt_oobj = datetime.now(tz=None)
 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
+runs_included = []
+exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
     studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
@@ -162,12 +196,37 @@
                                                exp['library_construction_protocol'],
                                                exp['platform'], exp['instrument_model'],
                                                'submission_date_ENA']) + '\n')
+            exp_included.append(exp_alias)
             for run_alias, run in runs_dict.items():
-                if run['experiment_alias'] == exp_alias:
-                    runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
-                                                run['file_name'], FILE_FORMAT, 'file_checksum',
-                                                'submission_date_ENA']) + '\n')
+                # check that the experiments library_layout is set to paired
+                # when multiple entries are associated with the same run alias
+                if not isinstance(run, list):
+                    runs_list = [run]
+                else:
+                    runs_list = run
+                for run_entry in runs_list:
+                    if run_entry['experiment_alias'] == exp_alias:
+                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                                                    exp_alias, run_entry['file_name'],
+                                                    FILE_FORMAT, 'file_checksum',
+                                                    'submission_date_ENA']) + '\n')
+                runs_included.append(run_alias)
+
+# check if any experiment or run was not associated with any sample
+for run in runs_dict.keys():
+    if run not in runs_included:
+        print(f'The run {run} is listed in the runs section but not associated with any \
+              used experiment')
+
+for exp in experiments_dict.keys():
+    if exp not in exp_included:
+        print(f'The experiment {exp} is listed in the experiments section but not associated \
+              with any used sample')
+
 studies_table.close()
 samples_table.close()
 experiments_table.close()
 runs_table.close()
+
+if args.verbose:
+    paste_xls2yaml(args.xlsx_path)