changeset 1:6d51be3d7bb5 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/misc commit d6102c60e41d91adf1c7a876f84ef420a69262e2
author recetox
date Mon, 12 May 2025 14:05:37 +0000
parents d4c2d5bc0524
children
files target_screen.py target_screen.xml test-data/target_screen/out.tsv
diffstat 3 files changed, 157 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/target_screen.py	Thu Sep 26 13:03:05 2024 +0000
+++ b/target_screen.py	Mon May 12 14:05:37 2025 +0000
@@ -1,18 +1,72 @@
 import argparse
+from typing import Tuple
 
 import numpy as np
 import pandas as pd
 
 
-def mz_match(marker, peak, ppm):
+class LoadDataAction(argparse.Action):
+    """
+    Custom argparse action to load data from a file into a pandas DataFrame.
+    Supports CSV, TSV, and Parquet file formats.
+    """
+    def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Tuple[str, str], option_string: str = None) -> None:
+        file_path, file_extension = values
+        file_extension = file_extension.lower()
+        if file_extension == "csv":
+            df = pd.read_csv(file_path)
+        elif file_extension in ["tsv", "tabular"]:
+            df = pd.read_csv(file_path, sep="\t")
+        elif file_extension == "parquet":
+            df = pd.read_parquet(file_path)
+        else:
+            raise ValueError(f"Unsupported file format: {file_extension}")
+        setattr(namespace, self.dest, df)
+
+
+def mz_match(marker: np.ndarray, peak: np.ndarray, ppm: int) -> np.ndarray:
+    """
+    Check if the mass-to-charge ratio (m/z) of markers and peaks match within a given PPM tolerance.
+
+    Args:
+        marker (np.ndarray): Array of marker m/z values.
+        peak (np.ndarray): Array of peak m/z values.
+        ppm (int): PPM tolerance for matching.
+
+    Returns:
+        np.ndarray: Boolean array indicating matches.
+    """
     return np.abs(marker - peak) <= ((peak + marker) / 2) * ppm * 1e-06
 
 
-def rt_match(marker, peak, tol):
+def rt_match(marker: np.ndarray, peak: np.ndarray, tol: int) -> np.ndarray:
+    """
+    Check if the retention time (rt) of markers and peaks match within a given tolerance.
+
+    Args:
+        marker (np.ndarray): Array of marker retention times.
+        peak (np.ndarray): Array of peak retention times.
+        tol (int): Retention time tolerance for matching.
+
+    Returns:
+        np.ndarray: Boolean array indicating matches.
+    """
     return np.abs(marker - peak) <= tol
 
 
-def find_matches(peaks, markers, ppm, rt_tol):
+def find_matches(peaks: pd.DataFrame, markers: pd.DataFrame, ppm: int, rt_tol: int) -> pd.DataFrame:
+    """
+    Find matches between peaks and markers based on m/z and retention time tolerances.
+
+    Args:
+        peaks (pd.DataFrame): DataFrame containing peak data with 'mz' and 'rt' columns.
+        markers (pd.DataFrame): DataFrame containing marker data with 'mz' and 'rt' columns.
+        ppm (int): PPM tolerance for m/z matching.
+        rt_tol (int): Retention time tolerance for rt matching.
+
+    Returns:
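+        pd.DataFrame: One row per marker/peak match, containing the marker columns (without the marker 'mz' and 'rt'), the absolute differences 'mz_diff' and 'rt_diff', and all columns from peaks.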
+    """
     # Create a meshgrid of all combinations of mz and rt values
     marker_mz = markers['mz'].values[:, np.newaxis]
     peak_mz = peaks['mz'].values
@@ -29,28 +83,32 @@
     # Create a DataFrame of hits
     matched_markers = markers.iloc[match_indices[0]].reset_index(drop=True)
     matched_peaks = peaks.iloc[match_indices[1]].reset_index(drop=True)
-    hits = pd.concat([matched_markers[['formula']].reset_index(drop=True), matched_peaks], axis=1)
 
     # Calculate mz and rt differences
-    hits['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values)
-    hits['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values)
+    matched_markers['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values)
+    matched_markers['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values)
 
+    # Drop mz and rt columns from the marker table
+    matched_markers = matched_markers.drop(columns=['mz', 'rt'])
+
+    # Combine all columns from peaks and markers
+    hits = pd.concat([matched_markers.reset_index(drop=True), matched_peaks.reset_index(drop=True)], axis=1)
     return hits
 
 
-def main():
+def main() -> None:
+    """
+    Main function to parse arguments, find matches between peaks and markers, and save the results.
+    """
     parser = argparse.ArgumentParser(description='Find matches between peaks and markers.')
-    parser.add_argument('--peaks', required=True, help='Path to the peaks parquet file.')
-    parser.add_argument('--markers', required=True, help='Path to the markers CSV file.')
+    parser.add_argument('--peaks', required=True, nargs=2, action=LoadDataAction, help='Path to the peaks file and its format (e.g., "file.parquet parquet").')
+    parser.add_argument('--markers', required=True, nargs=2, action=LoadDataAction, help='Path to the markers file and its format (e.g., "file.tsv tsv").')
     parser.add_argument('--output', required=True, help='Path to the output TSV file.')
     parser.add_argument('--ppm', type=int, default=5, help='PPM tolerance for mz matching.')
     parser.add_argument('--rt_tol', type=int, default=10, help='RT tolerance for rt matching.')
     args = parser.parse_args()
 
-    peaks = pd.read_parquet(args.peaks)
-    markers = pd.read_csv(args.markers, sep='\t')
-
-    hits = find_matches(peaks, markers, args.ppm, args.rt_tol)
+    hits = find_matches(args.peaks, args.markers, args.ppm, args.rt_tol)
 
     hits.to_csv(args.output, sep='\t', index=False)
 
--- a/target_screen.xml	Thu Sep 26 13:03:05 2024 +0000
+++ b/target_screen.xml	Mon May 12 14:05:37 2025 +0000
@@ -1,8 +1,8 @@
-<tool id="target_screen" name="MS target screening" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+<tool id="target_screen" name="MS Target Screening" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
     <description>Extract peaks from recetox-aplcms tables using a list of target ions</description>
     <macros>
         <token name="@TOOL_VERSION@">0.1.0</token>
-        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@VERSION_SUFFIX@">1</token>
     </macros>
 
     <requirements>
@@ -11,18 +11,18 @@
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
-        python3 '${__tool_directory__}/target_screen.py' --peaks '$peaks' --markers '$markers' --output '$hits' --ppm $ppm --rt_tol $rt
+        python3 '${__tool_directory__}/target_screen.py' --peaks '$peaks' '$peaks.ext' --markers '$markers' '$markers.ext' --output '$hits' --ppm $ppm --rt_tol $rt
     ]]></command>
 
     <inputs>
-        <param name="peaks" type="data" format="parquet"/>
-        <param name="markers" type="data" format="tabular"/>
-        <param name="ppm" type="integer" min="0" max="1000" value="10" label="ppm" help="Tolerance for peak filtering in ppm." />
-        <param name="rt" type="integer" min="0" max="100" value="10" label="rt tolerance" help="Toelrance regarding retention time to filter out peaks" />
+        <param name="peaks" type="data" format="parquet,tabular,tsv,csv" label="Peaks Table" help="Input table containing detected peaks in Parquet, tabular (TSV), or CSV format. Each row should represent a peak with columns for m/z (mz) and retention time (rt); any further columns (e.g. intensity) are kept in the output." />
+        <param name="markers" type="data" format="parquet,tabular,tsv,csv" label="Target Markers Table" help="Input table containing target markers in Parquet, tabular (TSV), or CSV format. Each row should represent a marker with columns for m/z (mz) and retention time (rt)." />
+        <param name="ppm" type="integer" min="0" max="1000" value="10" label="Mass Tolerance (ppm)" help="Tolerance for filtering peaks based on mass-to-charge ratio (m/z) in parts per million (ppm). The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 ppm of the target m/z." />
+        <param name="rt" type="integer" min="0" max="100" value="10" label="Retention Time Tolerance" help="Tolerance for filtering peaks based on retention time in the same units as the input data. The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 units of the target retention time." />
     </inputs>
 
     <outputs>
-        <data name="hits" format="tabular" label="${tool.name} on ${on_string}" />
+        <data name="hits" format="tabular" label="Filtered Peaks (${tool.name} on ${on_string})" />
     </outputs>
 
     <tests>
@@ -38,8 +38,66 @@
 
 **What it does**
 
-This tool pulls out peaks from a table given a list of markers.
-The markers are matched based on m/z values with a specified ppm tolerance and matched based on retention time with a tolerance in units of retention time.
+This tool extracts peaks from a table of detected peaks based on a list of target markers. Peaks are matched to markers using a specified tolerance for mass-to-charge ratio (ppm) and retention time. Both tolerances are applied symmetrically (±). For example, if the retention time tolerance is set to 10, peaks within ±10 units of the target retention time will be matched.
+
+**Inputs**
+
+1. **Peaks Table (Parquet, Tabular/TSV, or CSV Format)**:
+   A table containing detected peaks. The table can be in Parquet, tabular (TSV), or CSV format and should include the following columns:
+   
+   - `mz`: Mass-to-charge ratio (m/z) of the peak.
+   - `rt`: Retention time of the peak.
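+   - `intensity`: Intensity of the peak (optional; only `mz` and `rt` are used for matching, and any additional columns are carried through to the output unchanged).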
+
+   Example Peaks Table:
+
+   .. list-table:: Example Peaks Table
+      :header-rows: 1
+
+      * - mz
+        - rt
+        - intensity
+      * - 100.123
+        - 5.2
+        - 1500
+      * - 200.456
+        - 10.5
+        - 3000
+
+2. **Target Markers Table (Parquet, Tabular/TSV, or CSV Format)**:
+   A table containing target markers. The table can be in Parquet, tabular (TSV), or CSV format and should include the following columns:
+   
+   - `mz`: Mass-to-charge ratio (m/z) of the marker.
+   - `rt`: Retention time of the marker.
+
+   Example Markers Table:
+
+   .. list-table:: Example Markers Table
+      :header-rows: 1
+
+      * - mz
+        - rt
+      * - 100.123
+        - 5.2
+      * - 200.456
+        - 10.5
+
+**Parameters**
+
+- **Mass Tolerance (ppm)**:
+  The tolerance for matching peaks to markers based on their mass-to-charge ratio (m/z). The value is specified in parts per million (ppm). The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 ppm of the target m/z.
+
+- **Retention Time Tolerance**:
+  The tolerance for matching peaks to markers based on their retention time. The value is specified in the same units as the input data. The tolerance is applied symmetrically (±), so a value of 10 will match peaks within ±10 units of the target retention time.
+
+**Outputs**
+
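+- **Filtered Peaks Table**:
+  A tabular file with one row per marker/peak match within the specified tolerances. Each row contains the additional marker columns (e.g. `formula`; the marker `mz` and `rt` are dropped), the absolute differences `mz_diff` and `rt_diff` between marker and peak, and all columns of the matched peak.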
+
+**Example Usage**
+
+This tool can be used to filter peaks from a mass spectrometry dataset based on a list of known target markers, enabling targeted analysis of specific compounds.
     ]]></help>
     <citations>
         <citation type="doi">10.25080/Majora-92bf1922-00a</citation>
--- a/test-data/target_screen/out.tsv	Thu Sep 26 13:03:05 2024 +0000
+++ b/test-data/target_screen/out.tsv	Mon May 12 14:05:37 2025 +0000
@@ -1,18 +1,18 @@
-formula	mz	rt	sd1	sd2	area	mz_diff	rt_diff
-C8H6Cl2O3	218.9619738108278	473.4840709352675	0.6057217022739683	2.7706017478506073	1239147.63695882	0.00012618917219242576	1.1159290647325406
-C9H15N3O1	180.11422341297595	450.9460162486645	0.4692965104502825	4.727634916193644	1100073.3285644436	2.341297593488889e-05	7.2839837513355405
-C5H2Cl3N1O1	195.91267599889463	487.37949630118806	0.8695685392506757	2.8811688054510127	734461.0596300099	0.00022400110538001172	0.7205036988119673
-C13H10O3	213.05556658306853	508.4123751384482	2.9585968043814983	3.226731392934289	1787580.264815322	0.00013341693147594924	0.01237513844824889
-C13H9FO3	231.04576243564085	521.2436784813573	0.9930695671903609	2.469013097815558	1316270.081622402	0.0005375643591492008	0.23632151864273965
-C10H12N2O3S1	239.04945126090132	311.8317362000094	0.5578277726641567	3.57063615115722	3042462.634739455	0.0001487390986767423	0.7182637999906092
-C14H17Cl2NO2	300.0561299922103	685.3731548839577	0.8491884774374224	2.8491999009146074	1021277.4141378121	0.0002700077897088704	4.416845116042282
-C12H4Cl2F6N4OS	434.93037227267905	766.6610671335172	0.6265405149641161	3.55175113250731	43923382.478327975	0.0010277273209453597	1.198932866482778
-C12H4Cl2F6N4O2S	450.9259113906124	789.7479646306683	0.5765707513162325	3.4834377486718897	35843894.74749327	0.00038860938764173625	1.5420353693316429
-C16H22ClN3O2	322.13274143359513	705.9176130811956	0.765497607933695	2.9798451004946203	7686414.229962895	5.8566404845805664e-05	0.5823869188044455
-C16H11ClF6N2O	395.0387483584033	741.1840034426168	0.9150873601266857	2.396923077539685	692605.613740076	0.0003516415966942077	0.7459965573831369
-C10H11Cl1O3	213.03219616261535	532.8368925687558	0.8335128693984499	2.548404631638127	1231177.7029795102	0.00020383738464602175	1.0631074312441342
-C7H9NO2S	170.0280487596005	363.28514725405876	0.8844811055327363	2.7876246329523737	915161.3987675996	5.124039950032966e-05	0.3048527459412185
-C12H7Cl3O2	286.9434413572324	831.0018611928409	0.32058179843066653	1.7667251294853705	19934.364712896095	0.00045864276756901745	0.03186119284089273
-C18H15Cl3O8	462.97625391610677	662.6552310211961	0.9093786171678189	2.128435471267278	1209160.0005544876	0.00025391610677161225	0.3347689788039361
-C12H7Cl3O5S	366.90097256680355	699.9403505546061	0.8393755187990459	2.354260942300286	9578789.63215569	0.0002725668035736817	0.5796494453938976
-C9H9N4Cl	207.04420254367005	402.95120970553893	1.2647033563807812	2.594410018631832	40475158.16355405	9.74563299394049e-05	0.4187902944610755
+formula	mz_diff	rt_diff	mz	rt	sd1	sd2	area
+C8H6Cl2O3	0.00012618917219242576	1.1159290647325406	218.9619738108278	473.4840709352675	0.6057217022739683	2.7706017478506073	1239147.63695882
+C9H15N3O1	2.341297593488889e-05	7.2839837513355405	180.11422341297595	450.9460162486645	0.4692965104502825	4.727634916193644	1100073.3285644436
+C5H2Cl3N1O1	0.00022400110538001172	0.7205036988119673	195.91267599889463	487.37949630118806	0.8695685392506757	2.8811688054510127	734461.0596300099
+C13H10O3	0.00013341693147594924	0.01237513844824889	213.05556658306853	508.4123751384482	2.9585968043814983	3.226731392934289	1787580.264815322
+C13H9FO3	0.0005375643591492008	0.23632151864273965	231.04576243564085	521.2436784813573	0.9930695671903609	2.469013097815558	1316270.081622402
+C10H12N2O3S1	0.0001487390986767423	0.7182637999906092	239.04945126090132	311.8317362000094	0.5578277726641567	3.57063615115722	3042462.634739455
+C14H17Cl2NO2	0.0002700077897088704	4.416845116042282	300.0561299922103	685.3731548839577	0.8491884774374224	2.8491999009146074	1021277.4141378121
+C12H4Cl2F6N4OS	0.0010277273209453597	1.198932866482778	434.93037227267905	766.6610671335172	0.6265405149641161	3.55175113250731	43923382.478327975
+C12H4Cl2F6N4O2S	0.00038860938764173625	1.5420353693316429	450.9259113906124	789.7479646306683	0.5765707513162325	3.4834377486718897	35843894.74749327
+C16H22ClN3O2	5.8566404845805664e-05	0.5823869188044455	322.13274143359513	705.9176130811956	0.765497607933695	2.9798451004946203	7686414.229962895
+C16H11ClF6N2O	0.0003516415966942077	0.7459965573831369	395.0387483584033	741.1840034426168	0.9150873601266857	2.396923077539685	692605.613740076
+C10H11Cl1O3	0.00020383738464602175	1.0631074312441342	213.03219616261535	532.8368925687558	0.8335128693984499	2.548404631638127	1231177.7029795102
+C7H9NO2S	5.124039950032966e-05	0.3048527459412185	170.0280487596005	363.28514725405876	0.8844811055327363	2.7876246329523737	915161.3987675996
+C12H7Cl3O2	0.00045864276756901745	0.03186119284089273	286.9434413572324	831.0018611928409	0.32058179843066653	1.7667251294853705	19934.364712896095
+C18H15Cl3O8	0.00025391610677161225	0.3347689788039361	462.97625391610677	662.6552310211961	0.9093786171678189	2.128435471267278	1209160.0005544876
+C12H7Cl3O5S	0.0002725668035736817	0.5796494453938976	366.90097256680355	699.9403505546061	0.8393755187990459	2.354260942300286	9578789.63215569
+C9H9N4Cl	9.74563299394049e-05	0.4187902944610755	207.04420254367005	402.95120970553893	1.2647033563807812	2.594410018631832	40475158.16355405