annotate fetch_cbioportal_data.py @ 0:9c949eca5d72 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
author bgruening
date Mon, 14 Apr 2025 09:56:31 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
2
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
3 import argparse
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
4 import os
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
5
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
6 from flexynesis.utils import CBioPortalData
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
7
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
8
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
def main(argv=None):
    """Fetch a cBioPortal study, split it into train/test sets and write CSVs.

    Parses the command line, decides which study file backs each requested
    data type, fetches the study through ``CBioPortalData``, splits it with
    ``split_data`` and writes ``<data_type>_train.csv`` and
    ``<data_type>_test.csv`` into the output directory.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is the previous
            behavior.

    Raises:
        ValueError: If 'clin' is not among the data types, a data type is not
            supported, the split ratio is outside [0.0, 1.0], the number of
            mapped files differs from the number of data types, a mapped file
            does not end with '.txt', or a data type has no default file and
            no mapped files were given.
    """
    parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.")
    parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')")
    parser.add_argument("--data_types", required=True, help="Comma-separated list of data types (e.g., 'clin,mut,omics')")
    parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)")
    parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)")
    parser.add_argument("--output_dir", required=True, help="Output directory for datasets")

    args = parser.parse_args(argv)

    # Everything below up to the CBioPortalData call is pure validation, so
    # bad input is rejected before any study data is fetched.

    # Strip tokens so that input such as "clin, mut" is accepted.
    data_types = [dt.strip() for dt in args.data_types.split(",")]
    if "clin" not in data_types:
        raise ValueError("Clinical data ('clin') is required for splitting the dataset.")

    # Default study file used for each data type when no explicit mapping is given.
    file_mapping = {
        "clin": "data_clinical_patient.txt",  # can be any with 'clinical' in file name
        "mut": "data_mutations.txt",  # any with 'mutations' in file name
        "omics": "data_cna.txt",
        "other": None,  # no default; must be supplied through --mapped_files
    }

    invalid_types = set(data_types) - set(file_mapping)
    if invalid_types:
        raise ValueError(f"Invalid data types: {invalid_types}. Supported types: {list(file_mapping)}")

    # The help text documents the ratio as 0.0 to 1.0; the negated comparison
    # also rejects NaN.
    if not 0.0 <= args.split_ratio <= 1.0:
        raise ValueError(f"Split ratio must be between 0.0 and 1.0, got {args.split_ratio}.")

    if args.mapped_files:
        mapped_files = [mf.strip() for mf in args.mapped_files.split(",")]
        if len(mapped_files) != len(data_types):
            raise ValueError(f"Number of mapped files ({len(mapped_files)}) must match number of data types ({len(data_types)}).")
        for mf in mapped_files:
            if not mf.endswith(".txt"):
                raise ValueError(f"Mapped file '{mf}' must end with '.txt'.")
        # Files are paired with data types by position.
        files_to_fetch = dict(zip(data_types, mapped_files))
    else:
        files_to_fetch = {dt: file_mapping[dt] for dt in data_types}
        # A None file name would otherwise be handed to get_cbioportal_data.
        # NOTE(review): assumed to be unusable there - confirm against flexynesis.
        missing_defaults = sorted(dt for dt, name in files_to_fetch.items() if name is None)
        if missing_defaults:
            raise ValueError(f"No default file for data types: {missing_defaults}. Provide --mapped_files.")

    cbioportal = CBioPortalData(study_id=args.study_id)
    cbioportal.get_cbioportal_data(study_id=args.study_id, files=files_to_fetch)
    dataset = cbioportal.split_data(ratio=args.split_ratio)

    os.makedirs(args.output_dir, exist_ok=True)

    # dataset is indexed as dataset['train'|'test'][data_type]; a data type
    # missing from a split is skipped rather than treated as an error.
    for data_type in data_types:
        for split in ("train", "test"):
            if data_type in dataset[split]:
                out_file = os.path.join(args.output_dir, f"{data_type}_{split}.csv")
                dataset[split][data_type].to_csv(out_file, index=True)
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
58
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
59
9c949eca5d72 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff changeset
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()