recetox / target_screen: comparison of target_screen.py @ 1:6d51be3d7bb5 (draft, default, tip)
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/misc commit d6102c60e41d91adf1c7a876f84ef420a69262e2
author:   recetox
date:     Mon, 12 May 2025 14:05:37 +0000
parents:  d4c2d5bc0524
children: (none)
comparing revision 0:d4c2d5bc0524 (old) with revision 1:6d51be3d7bb5 (new); the comparison is shown below as a unified diff, with lines removed in the new revision prefixed by "-" and lines added prefixed by "+".
 import argparse
+from typing import Tuple

 import numpy as np
 import pandas as pd


+class LoadDataAction(argparse.Action):
+    """
+    Custom argparse action to load data from a file into a pandas DataFrame.
+    Supports CSV, TSV, and Parquet file formats.
+    """
+    def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Tuple[str, str], option_string: str = None) -> None:
+        file_path, file_extension = values
+        file_extension = file_extension.lower()
+        if file_extension == "csv":
+            df = pd.read_csv(file_path)
+        elif file_extension in ["tsv", "tabular"]:
+            df = pd.read_csv(file_path, sep="\t")
+        elif file_extension == "parquet":
+            df = pd.read_parquet(file_path)
+        else:
+            raise ValueError(f"Unsupported file format: {file_extension}")
+        setattr(namespace, self.dest, df)
+
+
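For orientation, the LoadDataAction added above pairs with nargs=2 so that each option receives two tokens, a path and its format, and the parsed attribute is already a pandas DataFrame. A minimal sketch of that behaviour, assuming the script is importable as target_screen and using a throwaway markers.tsv written on the spot:

import argparse

import pandas as pd

from target_screen import LoadDataAction

# Write a tiny placeholder markers table so the sketch is runnable end to end.
pd.DataFrame({"formula": ["C6H12O6"], "mz": [180.0634], "rt": [120.0]}).to_csv(
    "markers.tsv", sep="\t", index=False
)

parser = argparse.ArgumentParser()
parser.add_argument("--markers", required=True, nargs=2, action=LoadDataAction)

# nargs=2 hands both tokens (path, format) to LoadDataAction.__call__, which reads
# the file and stores the resulting DataFrame on the namespace.
args = parser.parse_args(["--markers", "markers.tsv", "tsv"])
print(type(args.markers))  # <class 'pandas.core.frame.DataFrame'>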
-def mz_match(marker, peak, ppm):
+def mz_match(marker: np.ndarray, peak: np.ndarray, ppm: int) -> np.ndarray:
+    """
+    Check if the mass-to-charge ratio (m/z) of markers and peaks match within a given PPM tolerance.
+
+    Args:
+        marker (np.ndarray): Array of marker m/z values.
+        peak (np.ndarray): Array of peak m/z values.
+        ppm (int): PPM tolerance for matching.
+
+    Returns:
+        np.ndarray: Boolean array indicating matches.
+    """
     return np.abs(marker - peak) <= ((peak + marker) / 2) * ppm * 1e-06


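The ppm criterion above scales the allowed m/z difference by the mean of the two masses, so at m/z 200 a 5 ppm window is roughly 0.001. A small numeric sketch of that behaviour, again assuming the script is importable as target_screen:

import numpy as np

from target_screen import mz_match

# At m/z ~200, 5 ppm of the mean mass is about 200 * 5e-6 = 0.001.
marker = np.array([200.0000, 200.0000])
peak = np.array([200.0008, 200.0030])

# First pair differs by 0.0008 (~4 ppm) -> True; second by 0.0030 (~15 ppm) -> False.
print(mz_match(marker, peak, ppm=5))  # [ True False]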
-def rt_match(marker, peak, tol):
+def rt_match(marker: np.ndarray, peak: np.ndarray, tol: int) -> np.ndarray:
+    """
+    Check if the retention time (rt) of markers and peaks match within a given tolerance.
+
+    Args:
+        marker (np.ndarray): Array of marker retention times.
+        peak (np.ndarray): Array of peak retention times.
+        tol (int): Retention time tolerance for matching.
+
+    Returns:
+        np.ndarray: Boolean array indicating matches.
+    """
     return np.abs(marker - peak) <= tol


-def find_matches(peaks, markers, ppm, rt_tol):
+def find_matches(peaks: pd.DataFrame, markers: pd.DataFrame, ppm: int, rt_tol: int) -> pd.DataFrame:
+    """
+    Find matches between peaks and markers based on m/z and retention time tolerances.
+
+    Args:
+        peaks (pd.DataFrame): DataFrame containing peak data with 'mz' and 'rt' columns.
+        markers (pd.DataFrame): DataFrame containing marker data with 'mz' and 'rt' columns.
+        ppm (int): PPM tolerance for m/z matching.
+        rt_tol (int): Retention time tolerance for rt matching.
+
+    Returns:
+        pd.DataFrame: DataFrame containing matched rows with all columns from peaks and markers.
+    """
     # Create a meshgrid of all combinations of mz and rt values
     marker_mz = markers['mz'].values[:, np.newaxis]
     peak_mz = peaks['mz'].values
     marker_rt = markers['rt'].values[:, np.newaxis]
     peak_rt = peaks['rt'].values

[... unchanged lines collapsed in the comparison view ...]

     match_indices = np.where(mz_matches & rt_matches)

     # Create a DataFrame of hits
     matched_markers = markers.iloc[match_indices[0]].reset_index(drop=True)
     matched_peaks = peaks.iloc[match_indices[1]].reset_index(drop=True)
-    hits = pd.concat([matched_markers[['formula']].reset_index(drop=True), matched_peaks], axis=1)

     # Calculate mz and rt differences
-    hits['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values)
-    hits['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values)
+    matched_markers['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values)
+    matched_markers['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values)

+    # Drop mz and rt columns from the marker table
+    matched_markers = matched_markers.drop(columns=['mz', 'rt'])
+
+    # Combine all columns from peaks and markers
+    hits = pd.concat([matched_markers.reset_index(drop=True), matched_peaks.reset_index(drop=True)], axis=1)
     return hits


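find_matches relies on NumPy broadcasting rather than an explicit loop: reshaping the marker columns with np.newaxis turns each comparison into a markers-by-peaks boolean matrix, and np.where returns the (marker, peak) index pairs that satisfy both tolerances. A minimal sketch with made-up values, assuming the script is importable as target_screen and that the collapsed lines compute mz_matches and rt_matches from the arrays set up above:

import pandas as pd

from target_screen import find_matches

# Toy tables; column names follow the script's expectations ('mz' and 'rt' in both,
# plus marker annotation columns such as 'formula').
markers = pd.DataFrame({
    "formula": ["C6H12O6", "C9H11NO2"],
    "mz": [180.0634, 165.0790],
    "rt": [120.0, 240.0],
})
peaks = pd.DataFrame({
    "mz": [180.0639, 300.1000],
    "rt": [125.0, 500.0],
    "intensity": [1.2e5, 3.4e4],
})

# Only the first marker is within 5 ppm and 10 rt units of the first peak, so the
# result has one row carrying the peak columns plus 'formula', 'mz_diff' and 'rt_diff'.
hits = find_matches(peaks, markers, ppm=5, rt_tol=10)
print(hits)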
-def main():
+def main() -> None:
+    """
+    Main function to parse arguments, find matches between peaks and markers, and save the results.
+    """
     parser = argparse.ArgumentParser(description='Find matches between peaks and markers.')
-    parser.add_argument('--peaks', required=True, help='Path to the peaks parquet file.')
-    parser.add_argument('--markers', required=True, help='Path to the markers CSV file.')
+    parser.add_argument('--peaks', required=True, nargs=2, action=LoadDataAction, help='Path to the peaks file and its format (e.g., "file.parquet parquet").')
+    parser.add_argument('--markers', required=True, nargs=2, action=LoadDataAction, help='Path to the markers file and its format (e.g., "file.tsv tsv").')
     parser.add_argument('--output', required=True, help='Path to the output TSV file.')
     parser.add_argument('--ppm', type=int, default=5, help='PPM tolerance for mz matching.')
     parser.add_argument('--rt_tol', type=int, default=10, help='RT tolerance for rt matching.')
     args = parser.parse_args()

-    peaks = pd.read_parquet(args.peaks)
-    markers = pd.read_csv(args.markers, sep='\t')
-
-    hits = find_matches(peaks, markers, args.ppm, args.rt_tol)
+    hits = find_matches(args.peaks, args.markers, args.ppm, args.rt_tol)

     hits.to_csv(args.output, sep='\t', index=False)


 if __name__ == "__main__":
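Taken together, the new revision moves file loading out of main and into argument parsing, so --peaks and --markers each take a path followed by its format instead of a bare path. With placeholder file names, an invocation of the new revision would look like:

python target_screen.py --peaks peaks.parquet parquet --markers markers.tsv tsv --output hits.tsv --ppm 5 --rt_tol 10

The old revision instead read --peaks directly with pd.read_parquet and --markers as a tab-separated file with pd.read_csv inside main.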