Mercurial > repos > iuc > endorspy

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/endorS.py	Wed Mar 18 23:06:08 2026 +0000
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# Written by Aida Andrades Valtueña and released under MIT license.
+# See git repository (https://github.com/aidaanva/endorS.py) for full license text.
+"""Script to calculate the percent on target (aka endogenous DNA), clonality, and percent of duplicates in a sample from samtools flag stats.
+It can accept up to three files: pre-quality filtering, post-quality filtering and post-dedup. We recommend
+using all three files, but you can also use it with a combination of any of those samtools flagstats.
+"""
+import argparse
+import json
+import re
+import sys
+import textwrap
+
+parser = argparse.ArgumentParser(prog='endorS.py',
+                                 usage='python %(prog)s [-h] [--version] [-r <samplefile>.stats] [-q <samplefile>.stats] [-d <samplefile>.stats] [-o json] [-n NAME] [-e]',
+                                 formatter_class=argparse.RawDescriptionHelpFormatter,
+                                 description=textwrap.dedent('''\
+        author:
+        Aida Andrades Valtueña (aida.andrades[at]gmail.com)
+
+        description:
+        %(prog)s calculates percent on target (aka Endogenous DNA) from samtools flagstat files and print to screen.
+        The percent on target reported will be different depending on the combination of samtools flagstat provided.
+        This program also calculates clonality (aka cluster factor) and percent duplicates when the flagstat file after duplicate removal is provided
+        Use --output flag to write results to a file
+        '''))
+parser.add_argument('--raw', '-r',
+                    metavar='<samplefile>.stats',
+                    type=str, nargs='?',
+                    help='output of samtools flagstat in a txt file, assumes no quality filtering nor duplicate removal performed')
+parser.add_argument('--qualityfiltered', '-q',
+                    metavar='<samplefile>.stats',
+                    type=str, nargs='?',
+                    help='output of samtools flagstat in a txt file, assumes some form of quality or length filtering has been performed, must be provided with at least one of the options -r or -d')
+parser.add_argument('--deduplicated', '-d',
+                    metavar='<samplefile>.stats',
+                    type=str, nargs='?',
+                    help='output of samtools flagstat in a txt file, whereby duplicate removal has been performed on the input reads')
+# NOTE: nargs='?' means a flag given without a value is stored as None, i.e. it is
+# treated the same as an omitted flag by the checks that follow argument parsing.
+parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.3')
+parser.add_argument('--output', '-o', nargs='?', help='specify a file format for an output file. Options: <json> for a MultiQC json output. Default: none')
+parser.add_argument('--name', '-n', nargs='?', help='specify name for the output file. Default: extracted from the first samtools flagstat file provided')
+parser.add_argument('--verbose', '-e', help='increase output verbosity', action='store_true')
+# Parse the command line once; the rest of the script reads the module-level `args`.
+args = parser.parse_args()
+
+
+# print(args.verbose)
+# Check if at least one of the samtools flagstat provided
+
+if ((args.raw is None) and (args.qualityfiltered is None) and (args.deduplicated is None)):
+    print("ERROR: no samtools flagstat provided, please provide at least one samtools flagstat file with the flags --raw, --qualityfiltered or --deduplicated.\nRun:\npython endorS.py --help \nfor more information on how to run this script", file=sys.stderr)
+    sys.exit(2)
+
+if ((args.raw is None) and (args.deduplicated is None)):
+    print("ERROR: only --qualityfiltered samtools flagstat file provided. No stats can be calculated", file=sys.stderr)
+    sys.exit(2)
+
+if ((args.raw is None) and (args.qualityfiltered is None)):
+    print("ERROR: only --deduplicated samtools flagstat file provided. No stats can be calculated", file=sys.stderr)
+    sys.exit(2)
+
+try:
+    with open(args.raw, 'r') as raw:
+        contentsRaw = raw.read()
+    # Total reads: the leading "<n> + <m> in total" count (pattern is anchored to the start of the file)
+    totalReads = float((re.findall(r'^([0-9]+) \+ [0-9]+ in total', contentsRaw))[0])
+    # Mapped reads pre-quality filtering: first "<n> + <m> mapped " match in the file
+    mappedRaw = float((re.findall(r'([0-9]+) \+ [0-9]+ mapped ', contentsRaw))[0])
+
+    # Percent on target raw (aka endogenous raw) = mappedRaw / totalReads * 100, kept to 6 decimals
+    if totalReads == 0.0:
+        endogenousRaw = 0.000000
+        print("WARNING: no reads in the fastq input, percent on target raw (%) set to 0.000000")
+    elif mappedRaw == 0.0:
+        endogenousRaw = 0.000000
+        print("WARNING: no mapped reads, Percent on target raw (%) set to 0.000000")
+    else:
+        endogenousRaw = float("{0:.6f}".format(round((mappedRaw / totalReads * 100), 6)))
+except TypeError:  # open(None): --raw was not supplied; unreadable or malformed files now raise instead of being hidden
+    if args.verbose:
+        print("No samtools flagstat --raw provided. \nWARNING: none of the percent on target stats will be calculated")
+
+try:
+    with open(args.qualityfiltered, 'r') as qF:
+        contentsqF = qF.read()
+    # Mapped reads post-quality filtering: first "<n> + <m> mapped" match in the file
+    mappedPost = float((re.findall(r'([0-9]+) \+ [0-9]+ mapped', contentsqF))[0])
+    # Percent on target quality filtered needs totalReads, which only the --raw file provides
+    if args.raw is not None:
+        if totalReads == 0.0:
+            endogenousQF = 0.000000
+            print("WARNING: no reads in the fastq input, percent on target modified (%) set to 0.000000")
+        elif mappedPost == 0.0:
+            endogenousQF = 0.000000
+            print("WARNING: no mapped reads, percent on target modified (%) set to 0.000000")
+        else:
+            endogenousQF = float("{0:.6f}".format(round((mappedPost / totalReads * 100), 6)))
+except TypeError:  # open(None): --qualityfiltered was not supplied; unreadable or malformed files now raise instead of being hidden
+    if args.verbose:
+        print("No samtools flagstat --qualityfiltered provided. \nWARNING: Percent on target modified stat will not be calculated")
+
+try:
+    with open(args.deduplicated, 'r') as deDup:
+        contentsdeDup = deDup.read()
+
+    # Mapped reads post-dedup: first "<n> + <m> mapped " match in the file
+    mappedDedup = float((re.findall(r'([0-9]+) \+ [0-9]+ mapped ', contentsdeDup))[0])
+
+    # Reads pre-dedup: taken from the quality-filtered file when given, otherwise from the raw file
+    if args.qualityfiltered is not None:
+        totalPreDedup = mappedPost
+    elif args.raw is not None:
+        totalPreDedup = mappedRaw
+
+    # Percent on target post-dedup needs totalReads, which only the --raw file provides
+    if args.raw is not None:
+        if totalReads == 0.0:
+            endogenousDeDup = 0.000000
+            print("WARNING: no reads in the fastq input, percent on target post-deduplication (%) set to 0.000000")
+        elif mappedDedup == 0.0:
+            endogenousDeDup = 0.000000
+            print("WARNING: no mapped reads, Percent on target post-deduplication (%) set to 0.000000")
+        else:
+            endogenousDeDup = float("{0:.6f}".format(round((mappedDedup / totalReads * 100), 6)))
+    # Clonality (aka cluster factor) = reads pre-dedup / reads post-dedup
+    try:
+        clonality = float("{0:.6f}".format(round((totalPreDedup / mappedDedup), 6)))
+    except ZeroDivisionError:
+        clonality = 0
+        print("WARNING: non mapped reads post- and/or pre-deduplication, check your BAM file. Clonality set to 0.000000")
+    # Percent duplicates = (pre-dedup - post-dedup) / pre-dedup * 100
+    try:
+        percentDuplicates = float("{0:.6f}".format(round((((totalPreDedup - mappedDedup) / totalPreDedup) * 100), 6)))
+    except ZeroDivisionError:
+        percentDuplicates = 0
+        print("WARNING: non mapped reads post- and/or pre-deduplication, check your BAM file. Percent duplicates set to 0.000000")
+except TypeError:  # open(None): --deduplicated was not supplied; unreadable or malformed files now raise instead of being hidden
+    if args.verbose:
+        print("No samtools flagstat --deduplicated provided. \nWARNING: Percent on target post-deduplication, clonality and percent of duplicates stats will not be calculated!")
+
+# Setting the name depending on the -name flag:
+if args.name is not None:
+    name = args.name
+else:
+    # Set up the name based on the first samtools flagstats:
+    if args.raw is not None:
+        name = str(((args.raw.rsplit(".", 1)[0]).rsplit("/"))[-1])
+    elif args.qualityfiltered is not None:
+        name = str(((args.qualityfiltered.rsplit(".", 1)[0]).rsplit("/"))[-1])
+    else:
+        name = str(((args.deduplicated.rsplit(".", 1)[0]).rsplit("/"))[-1])
+# print(name)
+
+# Creating output
+
+if args.raw is not None:
+    if args.qualityfiltered is not None:
+        # All samtools flagstat provided: Percent target Raw, Percent target Modified, Percent target postDedup, Clonality, Percent duplicates
+        if args.deduplicated is not None:
+            jsonOutput = {
+                "id": "endorSpy",
+                "plot_type": "generalstats",
+                "pconfig": {
+                    "percent_on_target": {"max": 100, "min": 0, "title": "Percent on target (%)", "format": '{:,.2f}'},
+                    "percent_on_target_quality_filtered": {"max": 100, "min": 0, "title": "Percent on target quality filtered (%)", "format": '{:,.2f}'},
+                    "percent_on_target_postdedup": {"max": 100, "min": 0, "title": "Percent on target postdedup (%)", "format": '{:,.2f}'},
+                    "clonality": {"max": 100, "min": 0, "title": "Clonality", "format": '{:,.2f}'},
+                    "percent_duplicates": {"max": 100, "min": 0, "title": "Percent Duplicates (%)", "format": '{:,.2f}'}
+                },
+                "data": {
+                    name: {
+                        "percent_on_target": endogenousRaw,
+                        "percent_on_target_quality_filtered": endogenousQF,
+                        "percent_on_target_postdedup": endogenousDeDup,
+                        "clonality": clonality,
+                        "percent_duplicates": percentDuplicates
+                    }
+                },
+            }
+            print("Percent on target raw (%):", endogenousRaw)
+            print("Percent on target quality filtered (%):", endogenousQF)
+            print("Percent on target post-deduplication (%):", endogenousDeDup)
+            print("Clonality:", clonality)
+            print("Percent duplicates (%):", percentDuplicates)
+        # Raw + QF: Percent target Raw, Percent target quality filtered
+        else:
+            jsonOutput = {
+                "id": "endorSpy",
+                "plot_type": "generalstats",
+                "pconfig": {
+                    "percent_on_target": {"max": 100, "min": 0, "title": "Percent on target (%)", "format": '{:,.2f}'},
+                    "percent_on_target_quality_filtered": {"max": 100, "min": 0, "title": "Percent on target quality filtered (%)", "format": '{:,.2f}'}
+                },
+                "data": {
+                    name: {"percent_on_target": endogenousRaw, "percent_on_target_quality_filtered": endogenousQF}
+                },
+            }
+            print("Percent on target raw (%):", endogenousRaw)
+            print("Percent on target quality filtered (%):", endogenousQF)
+    # Raw + Dedup: Percent target Raw, Percent target postDedup, Clonality, Percent duplicates
+    elif args.deduplicated is not None:
+        jsonOutput = {
+            "id": "endorSpy",
+            "plot_type": "generalstats",
+            "pconfig": {
+                "percent_on_target": {"max": 100, "min": 0, "title": "Percent on target (%)", "format": '{:,.2f}'},
+                "percent_on_target_postdedup": {"max": 100, "min": 0, "title": "Percent on target postdedup (%)", "format": '{:,.2f}'},
+                "clonality": {"max": 100, "min": 0, "title": "Clonality", "format": '{:,.2f}'},
+                "percent_duplicates": {"max": 100, "min": 0, "title": "Percent duplicates (%)", "format": '{:,.2f}'}
+            },
+            "data": {
+                name: {
+                    "percent_on_target": endogenousRaw,
+                    "percent_on_target_postdedup": endogenousDeDup,
+                    "clonality": clonality,
+                    "percent_duplicates": percentDuplicates
+                }
+            },
+        }
+        print("Percent on target raw (%):", endogenousRaw)
+        print("Percent on target post-deduplication (%):", endogenousDeDup)
+        print("Clonality:", clonality)
+        print("Percent Duplicates (%):", percentDuplicates)
+
+    # Only raw: Percent target Raw
+    else:
+        jsonOutput = {
+            "id": "endorSpy",
+            "plot_type": "generalstats",
+            "pconfig":
+            {
+                "percent_on_target": {"max": 100, "min": 0, "title": "Percent on target (%)", "format": '{:,.2f}'},
+            },
+            "data": {
+                name: {"percent_on_target": endogenousRaw}
+            },
+        }
+        print("Percent on target raw (%):", endogenousRaw)
+# Only Dedup or Dedup + QF provided: clonality and percent duplicates reported
+else:
+    jsonOutput = {
+        "id": "endorSpy",
+        "plot_type": "generalstats",
+        "pconfig": {
+            "clonality": {"max": 100, "min": 0, "title": "Clonality", "format": '{:,.2f}'},
+            "percent_duplicates": {"max": 100, "min": 0, "title": "Percent Duplicates (%)", "format": '{:,.2f}'}
+        },
+        "data": {
+            name: {"clonality": clonality, "percent_duplicates": percentDuplicates}
+        },
+    }
+    print("Clonality:", clonality)
+    print("Percent Duplicates (%):", percentDuplicates)
+
+
+# Checking for print to screen argument:
+if args.output is not None:
+    # Creating file with the named after the name variable:
+    # #Writing the json output:
+    fileName = name + "_percent_on_target_mqc.json"
+    # print(fileName)
+    with open(fileName, "w+") as outfile:
+        json.dump(jsonOutput, outfile)
+    print(fileName, "has been generated")
+print("All done!")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/endorspy.xml	Wed Mar 18 23:06:08 2026 +0000
@@ -0,0 +1,95 @@
+<tool id="endorspy" name="EndorSpy" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Calculate endogenous DNA percentage, clonality, and duplicates from flagstats</description>
+    <macros>
+        <token name="@TOOL_VERSION@">1.3</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@PROFILE@">25.1</token>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="3.12">python</requirement>
+    </requirements>
+    <required_files>
+        <include path="endorS.py"/>
+    </required_files>
+    <command detect_errors="exit_code"><![CDATA[
+        ## 1. Links
+        ln -s '$pre_flagstat' raw.stats &&
+        #if $post_flagstat
+            ln -s '$post_flagstat' filtered.stats &&
+        #end if
+        #if $dedup_flagstat
+            ln -s '$dedup_flagstat' dedup.stats &&
+        #end if
+
+        ## 2. Run EndorS.py
+        python '$__tool_directory__/endorS.py'
+            -r raw.stats
+            #if $post_flagstat
+                -q filtered.stats
+            #end if
+            #if $dedup_flagstat
+                -d dedup.stats
+            #end if
+            -o json &&
+
+        ## 3. Output
+        mv *.json '$output_json'
+    ]]></command>
+
+    <inputs>
+        <param argument="-r" name="pre_flagstat" type="data" format="txt" label="Raw Flagstat" help="Output of samtools flagstat before any filtering or deduplication." />
+        <param argument="-q" name="post_flagstat" type="data" format="txt" optional="true" label="Quality-Filtered Flagstat" help="Output of samtools flagstat after quality filtering." />
+        <param argument="-d" name="dedup_flagstat" type="data" format="txt" optional="true" label="Deduplicated Flagstat" help="Output of samtools flagstat after duplicate removal." />
+    </inputs>
+
+    <outputs>
+        <data name="output_json" format="json" label="${tool.name} on ${on_string}: JSON Report" />
+    </outputs>
+
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="pre_flagstat" value="raw_flagstat.txt" ftype="txt" />
+            <param name="post_flagstat" value="filtered_flagstat.txt" ftype="txt" />
+            <output name="output_json">
+                <assert_contents>
+                    <has_text text="percent_on_target" />
+                    <has_text text="percent_on_target_quality_filtered" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+**EndorSpy: Endogenous DNA, Clonality, and Duplication Calculator**
+
+This tool calculates the percent on target (Endogenous DNA), cluster factor, and percent of duplicates in a sample from `samtools flagstat` outputs.
+
+**Inputs:**
+You can provide a combination of samtools flagstat files:
+* **Raw Flagstat**: Assumes no quality filtering nor duplicate removal performed.
+* **Quality-Filtered Flagstat**: Assumes some form of quality filtering has been performed.
+* **Deduplicated Flagstat**: Whereby duplicate removal has been performed on the input reads.
+
+*Note: The Raw flagstat is required by this wrapper; the Quality-filtered and Deduplicated flagstats are optional and add further statistics.*
+
+**Outputs:**
+A JSON file containing the calculated statistics.
+    ]]></help>
+
+    <citations>
+        <citation type="bibtex">
+@misc{githubendorspy,
+  author = {Andrades Valtuena, Aida},
+  title = {EndorSpy},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  url = {https://github.com/aidaanva/endorS.py}
+}
+        </citation>
+    </citations>
+    <creator>
+        <person givenName="Ali Mert" familyName="Aydin" url="https://github.com/mertydn"/>
+        <organization name="Galaxy Europe" url="https://galaxyproject.eu/" />
+    </creator>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_flagstat.txt	Wed Mar 18 23:06:08 2026 +0000
@@ -0,0 +1,16 @@
+65951 + 0 in total (QC-passed reads + QC-failed reads)
+65951 + 0 primary
+0 + 0 secondary
+0 + 0 supplementary
+0 + 0 duplicates
+0 + 0 primary duplicates
+65951 + 0 mapped (100.00% : N/A)
+65951 + 0 primary mapped (100.00% : N/A)
+0 + 0 paired in sequencing
+0 + 0 read1
+0 + 0 read2
+0 + 0 properly paired (N/A : N/A)
+0 + 0 with itself and mate mapped
+0 + 0 singletons (N/A : N/A)
+0 + 0 with mate mapped to a different chr
+0 + 0 with mate mapped to a different chr (mapQ>=5)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/raw_flagstat.txt	Wed Mar 18 23:06:08 2026 +0000
@@ -0,0 +1,16 @@
+123406 + 0 in total (QC-passed reads + QC-failed reads)
+123406 + 0 primary
+0 + 0 secondary
+0 + 0 supplementary
+0 + 0 duplicates
+0 + 0 primary duplicates
+94254 + 0 mapped (76.38% : N/A)
+94254 + 0 primary mapped (76.38% : N/A)
+0 + 0 paired in sequencing
+0 + 0 read1
+0 + 0 read2
+0 + 0 properly paired (N/A : N/A)
+0 + 0 with itself and mate mapped
+0 + 0 singletons (N/A : N/A)
+0 + 0 with mate mapped to a different chr
+0 + 0 with mate mapped to a different chr (mapQ>=5)