Mercurial > repos > recetox > target_screen
changeset 1:6d51be3d7bb5 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/misc commit d6102c60e41d91adf1c7a876f84ef420a69262e2
author | recetox |
---|---|
date | Mon, 12 May 2025 14:05:37 +0000 |
parents | d4c2d5bc0524 |
children | |
files | target_screen.py target_screen.xml test-data/target_screen/out.tsv |
diffstat | 3 files changed, 157 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- a/target_screen.py Thu Sep 26 13:03:05 2024 +0000 +++ b/target_screen.py Mon May 12 14:05:37 2025 +0000 @@ -1,18 +1,72 @@ import argparse +from typing import Tuple import numpy as np import pandas as pd -def mz_match(marker, peak, ppm): +class LoadDataAction(argparse.Action): + """ + Custom argparse action to load data from a file into a pandas DataFrame. + Supports CSV, TSV, and Parquet file formats. + """ + def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Tuple[str, str], option_string: str = None) -> None: + file_path, file_extension = values + file_extension = file_extension.lower() + if file_extension == "csv": + df = pd.read_csv(file_path) + elif file_extension in ["tsv", "tabular"]: + df = pd.read_csv(file_path, sep="\t") + elif file_extension == "parquet": + df = pd.read_parquet(file_path) + else: + raise ValueError(f"Unsupported file format: {file_extension}") + setattr(namespace, self.dest, df) + + +def mz_match(marker: np.ndarray, peak: np.ndarray, ppm: int) -> np.ndarray: + """ + Check if the mass-to-charge ratio (m/z) of markers and peaks match within a given PPM tolerance. + + Args: + marker (np.ndarray): Array of marker m/z values. + peak (np.ndarray): Array of peak m/z values. + ppm (int): PPM tolerance for matching. + + Returns: + np.ndarray: Boolean array indicating matches. + """ return np.abs(marker - peak) <= ((peak + marker) / 2) * ppm * 1e-06 -def rt_match(marker, peak, tol): +def rt_match(marker: np.ndarray, peak: np.ndarray, tol: int) -> np.ndarray: + """ + Check if the retention time (rt) of markers and peaks match within a given tolerance. + + Args: + marker (np.ndarray): Array of marker retention times. + peak (np.ndarray): Array of peak retention times. + tol (int): Retention time tolerance for matching. + + Returns: + np.ndarray: Boolean array indicating matches. + """ return np.abs(marker - peak) <= tol -def find_matches(peaks, markers, ppm, rt_tol): +def find_matches(peaks: pd.DataFrame, markers: pd.DataFrame, ppm: int, rt_tol: int) -> pd.DataFrame: + """ + Find matches between peaks and markers based on m/z and retention time tolerances. + + Args: + peaks (pd.DataFrame): DataFrame containing peak data with 'mz' and 'rt' columns. + markers (pd.DataFrame): DataFrame containing marker data with 'mz' and 'rt' columns. + ppm (int): PPM tolerance for m/z matching. + rt_tol (int): Retention time tolerance for rt matching. + + Returns: + pd.DataFrame: DataFrame containing matched rows with all columns from peaks and markers. + """ # Create a meshgrid of all combinations of mz and rt values marker_mz = markers['mz'].values[:, np.newaxis] peak_mz = peaks['mz'].values @@ -29,28 +83,32 @@ # Create a DataFrame of hits matched_markers = markers.iloc[match_indices[0]].reset_index(drop=True) matched_peaks = peaks.iloc[match_indices[1]].reset_index(drop=True) - hits = pd.concat([matched_markers[['formula']].reset_index(drop=True), matched_peaks], axis=1) # Calculate mz and rt differences - hits['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values) - hits['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values) + matched_markers['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values) + matched_markers['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values) + # Drop mz and rt columns from the marker table + matched_markers = matched_markers.drop(columns=['mz', 'rt']) + + # Combine all columns from peaks and markers + hits = pd.concat([matched_markers.reset_index(drop=True), matched_peaks.reset_index(drop=True)], axis=1) return hits -def main(): +def main() -> None: + """ + Main function to parse arguments, find matches between peaks and markers, and save the results. + """ parser = argparse.ArgumentParser(description='Find matches between peaks and markers.') - parser.add_argument('--peaks', required=True, help='Path to the peaks parquet file.') - parser.add_argument('--markers', required=True, help='Path to the markers CSV file.') + parser.add_argument('--peaks', required=True, nargs=2, action=LoadDataAction, help='Path to the peaks file and its format (e.g., "file.parquet parquet").') + parser.add_argument('--markers', required=True, nargs=2, action=LoadDataAction, help='Path to the markers file and its format (e.g., "file.tsv tsv").') parser.add_argument('--output', required=True, help='Path to the output TSV file.') parser.add_argument('--ppm', type=int, default=5, help='PPM tolerance for mz matching.') parser.add_argument('--rt_tol', type=int, default=10, help='RT tolerance for rt matching.') args = parser.parse_args() - peaks = pd.read_parquet(args.peaks) - markers = pd.read_csv(args.markers, sep='\t') - - hits = find_matches(peaks, markers, args.ppm, args.rt_tol) + hits = find_matches(args.peaks, args.markers, args.ppm, args.rt_tol) hits.to_csv(args.output, sep='\t', index=False)
--- a/target_screen.xml Thu Sep 26 13:03:05 2024 +0000 +++ b/target_screen.xml Mon May 12 14:05:37 2025 +0000 @@ -1,8 +1,8 @@ -<tool id="target_screen" name="MS target screening" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT"> +<tool id="target_screen" name="MS Target Screening" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT"> <description>Extract peaks from recetox-aplcms tables using a list of target ions</description> <macros> <token name="@TOOL_VERSION@">0.1.0</token> - <token name="@VERSION_SUFFIX@">0</token> + <token name="@VERSION_SUFFIX@">1</token> </macros> <requirements> @@ -11,18 +11,18 @@ </requirements> <command detect_errors="exit_code"><![CDATA[ - python3 '${__tool_directory__}/target_screen.py' --peaks '$peaks' --markers '$markers' --output '$hits' --ppm $ppm --rt_tol $rt + python3 '${__tool_directory__}/target_screen.py' --peaks '$peaks' '$peaks.ext' --markers '$markers' '$markers.ext' --output '$hits' --ppm $ppm --rt_tol $rt ]]></command> <inputs> - <param name="peaks" type="data" format="parquet"/> - <param name="markers" type="data" format="tabular"/> - <param name="ppm" type="integer" min="0" max="1000" value="10" label="ppm" help="Tolerance for peak filtering in ppm." /> - <param name="rt" type="integer" min="0" max="100" value="10" label="rt tolerance" help="Toelrance regarding retention time to filter out peaks" /> + <param name="peaks" type="data" format="parquet,tabular,tsv,csv" label="Peaks Table" help="Input table containing detected peaks in Parquet format. Each row should represent a peak with columns for m/z, retention time, and intensity." /> + <param name="markers" type="data" format="parquet,tabular,tsv,csv" label="Target Markers Table" help="Input table containing target markers in tabular format. Each row should represent a marker with columns for m/z and retention time." /> + <param name="ppm" type="integer" min="0" max="1000" value="10" label="Mass Tolerance (ppm)" help="Tolerance for filtering peaks based on mass-to-charge ratio (m/z) in parts per million (ppm). The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 ppm of the target m/z." /> + <param name="rt" type="integer" min="0" max="100" value="10" label="Retention Time Tolerance" help="Tolerance for filtering peaks based on retention time in the same units as the input data. The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 units of the target retention time." /> </inputs> <outputs> - <data name="hits" format="tabular" label="${tool.name} on ${on_string}" /> + <data name="hits" format="tabular" label="Filtered Peaks (${tool.name} on ${on_string})" /> </outputs> <tests> @@ -38,8 +38,66 @@ **What it does** -This tool pulls out peaks from a table given a list of markers. -The markers are matched based on m/z values with a specified ppm tolerance and matched based on retention time with a tolerance in units of retention time. +This tool extracts peaks from a table of detected peaks based on a list of target markers. Peaks are matched to markers using a specified tolerance for mass-to-charge ratio (ppm) and retention time. Both tolerances are applied symmetrically (±). For example, if the retention time tolerance is set to 10, peaks within ±10 units of the target retention time will be matched. + +**Inputs** + +1. **Peaks Table (Parquet Format)**: + A table containing detected peaks. The table should be in Parquet format and include the following columns: + + - `mz`: Mass-to-charge ratio (m/z) of the peak. + - `rt`: Retention time of the peak. + - `intensity`: Intensity of the peak. + + Example Peaks Table: + + .. list-table:: Example Peaks Table + :header-rows: 1 + + * - mz + - rt + - intensity + * - 100.123 + - 5.2 + - 1500 + * - 200.456 + - 10.5 + - 3000 + +2. **Target Markers Table (Tabular Format)**: + A table containing target markers. The table should be in tabular format and include the following columns: + + - `mz`: Mass-to-charge ratio (m/z) of the marker. + - `rt`: Retention time of the marker. + + Example Markers Table: + + .. list-table:: Example Markers Table + :header-rows: 1 + + * - mz + - rt + * - 100.123 + - 5.2 + * - 200.456 + - 10.5 + +**Parameters** + +- **Mass Tolerance (ppm)**: + The tolerance for matching peaks to markers based on their mass-to-charge ratio (m/z). The value is specified in parts per million (ppm). The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 ppm of the target m/z. + +- **Retention Time Tolerance**: + The tolerance for matching peaks to markers based on their retention time. The value is specified in the same units as the input data. The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 units of the target retention time. + +**Outputs** + +- **Filtered Peaks Table**: + A tabular file containing the peaks that matched the target markers based on the specified tolerances. + +**Example Usage** + +This tool can be used to filter peaks from a mass spectrometry dataset based on a list of known target markers, enabling targeted analysis of specific compounds. ]]></help> <citations> <citation type="doi">10.25080/Majora-92bf1922-00a</citation>
--- a/test-data/target_screen/out.tsv Thu Sep 26 13:03:05 2024 +0000 +++ b/test-data/target_screen/out.tsv Mon May 12 14:05:37 2025 +0000 @@ -1,18 +1,18 @@ -formula mz rt sd1 sd2 area mz_diff rt_diff -C8H6Cl2O3 218.9619738108278 473.4840709352675 0.6057217022739683 2.7706017478506073 1239147.63695882 0.00012618917219242576 1.1159290647325406 -C9H15N3O1 180.11422341297595 450.9460162486645 0.4692965104502825 4.727634916193644 1100073.3285644436 2.341297593488889e-05 7.2839837513355405 -C5H2Cl3N1O1 195.91267599889463 487.37949630118806 0.8695685392506757 2.8811688054510127 734461.0596300099 0.00022400110538001172 0.7205036988119673 -C13H10O3 213.05556658306853 508.4123751384482 2.9585968043814983 3.226731392934289 1787580.264815322 0.00013341693147594924 0.01237513844824889 -C13H9FO3 231.04576243564085 521.2436784813573 0.9930695671903609 2.469013097815558 1316270.081622402 0.0005375643591492008 0.23632151864273965 -C10H12N2O3S1 239.04945126090132 311.8317362000094 0.5578277726641567 3.57063615115722 3042462.634739455 0.0001487390986767423 0.7182637999906092 -C14H17Cl2NO2 300.0561299922103 685.3731548839577 0.8491884774374224 2.8491999009146074 1021277.4141378121 0.0002700077897088704 4.416845116042282 -C12H4Cl2F6N4OS 434.93037227267905 766.6610671335172 0.6265405149641161 3.55175113250731 43923382.478327975 0.0010277273209453597 1.198932866482778 -C12H4Cl2F6N4O2S 450.9259113906124 789.7479646306683 0.5765707513162325 3.4834377486718897 35843894.74749327 0.00038860938764173625 1.5420353693316429 -C16H22ClN3O2 322.13274143359513 705.9176130811956 0.765497607933695 2.9798451004946203 7686414.229962895 5.8566404845805664e-05 0.5823869188044455 -C16H11ClF6N2O 395.0387483584033 741.1840034426168 0.9150873601266857 2.396923077539685 692605.613740076 0.0003516415966942077 0.7459965573831369 -C10H11Cl1O3 213.03219616261535 532.8368925687558 0.8335128693984499 2.548404631638127 1231177.7029795102 0.00020383738464602175 1.0631074312441342 -C7H9NO2S 170.0280487596005 363.28514725405876 0.8844811055327363 2.7876246329523737 915161.3987675996 5.124039950032966e-05 0.3048527459412185 -C12H7Cl3O2 286.9434413572324 831.0018611928409 0.32058179843066653 1.7667251294853705 19934.364712896095 0.00045864276756901745 0.03186119284089273 -C18H15Cl3O8 462.97625391610677 662.6552310211961 0.9093786171678189 2.128435471267278 1209160.0005544876 0.00025391610677161225 0.3347689788039361 -C12H7Cl3O5S 366.90097256680355 699.9403505546061 0.8393755187990459 2.354260942300286 9578789.63215569 0.0002725668035736817 0.5796494453938976 -C9H9N4Cl 207.04420254367005 402.95120970553893 1.2647033563807812 2.594410018631832 40475158.16355405 9.74563299394049e-05 0.4187902944610755 +formula mz_diff rt_diff mz rt sd1 sd2 area +C8H6Cl2O3 0.00012618917219242576 1.1159290647325406 218.9619738108278 473.4840709352675 0.6057217022739683 2.7706017478506073 1239147.63695882 +C9H15N3O1 2.341297593488889e-05 7.2839837513355405 180.11422341297595 450.9460162486645 0.4692965104502825 4.727634916193644 1100073.3285644436 +C5H2Cl3N1O1 0.00022400110538001172 0.7205036988119673 195.91267599889463 487.37949630118806 0.8695685392506757 2.8811688054510127 734461.0596300099 +C13H10O3 0.00013341693147594924 0.01237513844824889 213.05556658306853 508.4123751384482 2.9585968043814983 3.226731392934289 1787580.264815322 +C13H9FO3 0.0005375643591492008 0.23632151864273965 231.04576243564085 521.2436784813573 0.9930695671903609 2.469013097815558 1316270.081622402 +C10H12N2O3S1 0.0001487390986767423 0.7182637999906092 239.04945126090132 311.8317362000094 0.5578277726641567 3.57063615115722 3042462.634739455 +C14H17Cl2NO2 0.0002700077897088704 4.416845116042282 300.0561299922103 685.3731548839577 0.8491884774374224 2.8491999009146074 1021277.4141378121 +C12H4Cl2F6N4OS 0.0010277273209453597 1.198932866482778 434.93037227267905 766.6610671335172 0.6265405149641161 3.55175113250731 43923382.478327975 +C12H4Cl2F6N4O2S 0.00038860938764173625 1.5420353693316429 450.9259113906124 789.7479646306683 0.5765707513162325 3.4834377486718897 35843894.74749327 +C16H22ClN3O2 5.8566404845805664e-05 0.5823869188044455 322.13274143359513 705.9176130811956 0.765497607933695 2.9798451004946203 7686414.229962895 +C16H11ClF6N2O 0.0003516415966942077 0.7459965573831369 395.0387483584033 741.1840034426168 0.9150873601266857 2.396923077539685 692605.613740076 +C10H11Cl1O3 0.00020383738464602175 1.0631074312441342 213.03219616261535 532.8368925687558 0.8335128693984499 2.548404631638127 1231177.7029795102 +C7H9NO2S 5.124039950032966e-05 0.3048527459412185 170.0280487596005 363.28514725405876 0.8844811055327363 2.7876246329523737 915161.3987675996 +C12H7Cl3O2 0.00045864276756901745 0.03186119284089273 286.9434413572324 831.0018611928409 0.32058179843066653 1.7667251294853705 19934.364712896095 +C18H15Cl3O8 0.00025391610677161225 0.3347689788039361 462.97625391610677 662.6552310211961 0.9093786171678189 2.128435471267278 1209160.0005544876 +C12H7Cl3O5S 0.0002725668035736817 0.5796494453938976 366.90097256680355 699.9403505546061 0.8393755187990459 2.354260942300286 9578789.63215569 +C9H9N4Cl 9.74563299394049e-05 0.4187902944610755 207.04420254367005 402.95120970553893 1.2647033563807812 2.594410018631832 40475158.16355405