Mercurial > repos > bgruening > flexynesis
comparison flexynesis_utils.py @ 8:9c91d13827ef draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 6b520305ec30e6dc37eba92c67a5368cea0fc5ad
| author | bgruening |
|---|---|
| date | Wed, 23 Jul 2025 07:50:31 +0000 |
| parents | 9450286c42ab |
| children | |
comparison
equal
deleted
inserted
replaced
| 7:9450286c42ab | 8:9c91d13827ef |
|---|---|
| 161 except Exception as e: | 161 except Exception as e: |
| 162 print(f"Error saving {key}: {e}") | 162 print(f"Error saving {key}: {e}") |
| 163 continue | 163 continue |
| 164 | 164 |
| 165 | 165 |
| | 166 def validate_numeric_column(df, column_names, require_integer=False): |
| | 167 """ Validate that a column(s) in the DataFrame contains numeric values. """ |
| | 168 if isinstance(column_names, str): |
| | 169 # Handle comma-separated string: "col1,col2,col3" |
| | 170 if ',' in column_names: |
| | 171 column_names = [col.strip() for col in column_names.split(',')] |
| | 172 else: |
| | 173 # Single column name |
| | 174 column_names = [column_names] |
| | 175 |
| | 176 # Validate each column |
| | 177 for column_name in column_names: |
| | 178 if column_name not in df.columns: |
| | 179 raise ValueError(f"Column '{column_name}' not found in DataFrame.") |
| | 180 |
| | 181 try: |
| | 182 numeric_col = pd.to_numeric(df[column_name], errors='raise') |
| | 183 except Exception as e: |
| | 184 raise ValueError(f"Non-numeric values found in column '{column_name}': {e}") |
| | 185 |
| | 186 if require_integer: |
| | 187 # Check if all non-null values are equivalent to integers |
| | 188 non_null_values = numeric_col.dropna() |
| | 189 if not (non_null_values == non_null_values.round()).all(): |
| | 190 raise ValueError(f"Column '{column_name}' contains non-integer numeric values.") |
| | 191 print(f"Column '{column_name}': All values are integers or integer-equivalent floats.") |
| | 192 else: |
| | 193 print(f"Column '{column_name}': All values are numeric (integers and floats accepted).") |
| | 194 |
| | 195 |
| | 196 def validate_survival(df, column_names): |
| | 197 """Validate survival column(s) (integer).""" |
| | 198 validate_numeric_column(df, column_names, require_integer=True) |
| | 199 |
| | 200 |
| | 201 def validate_covariate(df, column_names): |
| | 202 """Validate covariate column(s) (numeric).""" |
| | 203 validate_numeric_column(df, column_names, require_integer=False) |
| | 204 |
| | 205 |
| 166 def main(): | 206 def main(): |
| 167 parser = argparse.ArgumentParser(description='Flexynesis extra utilities') | 207 parser = argparse.ArgumentParser(description='Flexynesis extra utilities') |
| 168 | 208 |
| 169 parser.add_argument("--util", type=str, required=True, | 209 parser.add_argument("--util", type=str, required=True, |
| 170 choices=['split', 'binarize'], | 210 choices=['split', 'binarize', 'validate_survival', 'validate_covariate'], |
| 171 help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data") | 211 help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data, 'validate_survival' for validating survival data.") |
| 172 | 212 |
| 173 # Arguments for split | 213 # Arguments for split (clin also for validate_survival and validate_covariate) |
| 174 parser.add_argument('--clin', required=False, | 214 parser.add_argument('--clin', required=False, |
| 175 help='Path to clinical data CSV file (samples in rows)') | 215 help='Path to clinical data CSV file (samples in rows)') |
| 176 parser.add_argument('--omics', required=False, | 216 parser.add_argument('--omics', required=False, |
| 177 help='Comma-separated list of omics CSV files (samples in columns)') | 217 help='Comma-separated list of omics CSV files (samples in columns)') |
| 178 parser.add_argument('--split', type=float, default=0.7, | 218 parser.add_argument('--split', type=float, default=0.7, |
| 184 parser.add_argument('--gene_idx', type=int, default=0, | 224 parser.add_argument('--gene_idx', type=int, default=0, |
| 185 help='Column index for genes in mutation data (default: 0)') | 225 help='Column index for genes in mutation data (default: 0)') |
| 186 parser.add_argument('--sample_idx', type=int, default=1, | 226 parser.add_argument('--sample_idx', type=int, default=1, |
| 187 help='Column index for samples in mutation data (default: 1)') | 227 help='Column index for samples in mutation data (default: 1)') |
| 188 | 228 |
| 189 # common arguments | 229 # Arguments for validate_survival and validate_covariate |
| | 230 parser.add_argument('--clin_variable', type=str, required=False, |
| | 231 help='Column name for clinical variable (e.g., death, SEX, ...)') |
| | 232 |
| | 233 # common arguments (binarize and split) |
| 190 parser.add_argument('--out', default='.', | 234 parser.add_argument('--out', default='.', |
| 191 help='Output directory (default: current directory)') | 235 help='Output directory (default: current directory)') |
| 192 | 236 |
| 193 args = parser.parse_args() | 237 args = parser.parse_args() |
| 194 | 238 |
| 195 try: | 239 try: |
| 196 # validate utility function | 240 # validate utility function |
| 197 if not args.util: | 241 if not args.util: |
| 198 raise ValueError("Utility function must be specified") | 242 raise ValueError("Utility function must be specified") |
| 199 if args.util not in ['split', 'binarize']: | 243 if args.util not in ['split', 'binarize', 'validate_survival', 'validate_covariate']: |
| 200 raise ValueError(f"Invalid utility function: {args.util}") | 244 raise ValueError(f"Invalid utility function: {args.util}") |
| 201 | 245 |
| 202 if args.util == 'split': | 246 if args.util == 'split': |
| 203 # Validate inputs | 247 # Validate inputs |
| 204 if not args.clin: | 248 if not args.clin: |
| 219 raise FileNotFoundError(f"Mutation data file not found: {args.mutations}") | 263 raise FileNotFoundError(f"Mutation data file not found: {args.mutations}") |
| 220 # Validate gene and sample indices | 264 # Validate gene and sample indices |
| 221 if args.gene_idx < 0 or args.sample_idx < 0: | 265 if args.gene_idx < 0 or args.sample_idx < 0: |
| 222 raise ValueError("Gene and sample indices must be non-negative integers") | 266 raise ValueError("Gene and sample indices must be non-negative integers") |
| 223 | 267 |
| | 268 elif args.util == 'validate_survival' or args.util == 'validate_covariate': |
| | 269 # Validate clinical data file |
| | 270 if not args.clin: |
| | 271 raise ValueError("Clinical data file must be provided") |
| | 272 if not os.path.isfile(args.clin): |
| | 273 raise FileNotFoundError(f"Clinical file not found: {args.clin}") |
| | 274 # Validate survival event variable |
| | 275 if not args.clin_variable: |
| | 276 raise ValueError("Survival event variable must be specified") |
| | 277 |
| 224 # Create output directory if it doesn't exist | 278 # Create output directory if it doesn't exist |
| 225 if not os.path.exists(args.out): | 279 if not os.path.exists(args.out): |
| 226 os.makedirs(args.out) | 280 os.makedirs(args.out) |
| 227 | 281 |
| 228 if args.util == 'split': | 282 if args.util == 'split': |
| 246 # Save binarized matrix | 300 # Save binarized matrix |
| 247 output_file = os.path.join(args.out, 'binarized_mutations.tabular') | 301 output_file = os.path.join(args.out, 'binarized_mutations.tabular') |
| 248 binarized_matrix.to_csv(output_file, sep='\t') | 302 binarized_matrix.to_csv(output_file, sep='\t') |
| 249 print(f"Binarized mutation matrix saved to {output_file}") | 303 print(f"Binarized mutation matrix saved to {output_file}") |
| 250 | 304 |
| | 305 elif args.util == 'validate_survival': |
| | 306 clin_df = read_data(args.clin, index=False) |
| | 307 if clin_df.empty: |
| | 308 raise ValueError("Clinical data file is empty") |
| | 309 |
| | 310 # Validate survival event variable |
| | 311 validate_survival(clin_df, args.clin_variable) |
| | 312 |
| | 313 elif args.util == 'validate_covariate': |
| | 314 clin_df = read_data(args.clin, index=False) |
| | 315 if clin_df.empty: |
| | 316 raise ValueError("Clinical data file is empty") |
| | 317 |
| | 318 # Validate clinical variable |
| | 319 validate_covariate(clin_df, args.clin_variable) |
| | 320 |
| 251 except Exception as e: | 321 except Exception as e: |
| 252 print(f"Error: {e}", file=sys.stderr) | 322 print(f"Error: {e}", file=sys.stderr) |
| 253 sys.exit(1) | 323 sys.exit(1) |
| 254 | 324 |
| 255 | 325 |
