view htsc_create_sliding_windows.py @ 0:94a987a7da69 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author bgruening
date Tue, 11 Oct 2022 16:09:23 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env python3

import argparse
import os
import subprocess

"""

Install deseq-clip
==================

conda install -c bioconda pysam
conda install -c bioconda htseq
pip install htseq-clip

Or directly by:
conda install -c bioconda htseq-clip


Test call
=========

python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper

Compare:
diff test_compare_out/windows_mapped_to_ids.txt test-data/windows.exp.txt
diff test_compare_out/windows.bed test-data/windows.exp.bed
diff test_compare_out/annotation.bed test-data/annotation.exp.bed

This corresponds to:
htseq-clip annotation -g test-data/paper_tus.Synechocystis_pSYSM.gff3 -o test-data/annotation.exp.bed
htseq-clip createSlidingWindows -i test-data/annotation.exp.bed -w 50 -s 20 -o test-data/windows.exp.bed
htseq-clip mapToId -a test-data/windows.exp.bed -o test-data/windows.exp.txt

More tests:
python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper --hca-unsorted

DEWSeq input files:
test_compare_out/windows_mapped_to_ids.txt

"""


################################################################################

def setup_argument_parser():
    """Setup argparse parser."""
    help_description = """
    Based on genomic annotations GFF file (--gff), create sliding window
    annotations with htseq-clip.

    """
    # Define argument parser.
    p = argparse.ArgumentParser(add_help=False,
                                prog="htsc_create_sliding_windows.py",
                                description=help_description,
                                formatter_class=argparse.MetavarTypeHelpFormatter)

    # Required arguments.
    p.add_argument("-h", "--help",
                   action="help",
                   help="Print help message")
    p.add_argument("--gff",
                   dest="in_gff",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Annotation file GFF (so far tested with hg38 GENCODE format). Also accepts gff.gz as well")
    p.add_argument("--out",
                   dest="out_folder",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Results output folder")
    # htseq-clip annotation.
    p.add_argument("--hca-unsorted",
                   dest="hca_unsorted",
                   default=False,
                   action="store_true",
                   help="htseq-clip annotation --unsorted parameter. Use this flag if the GFF file is unsorted (default: False)")
    # htseq-clip createSlidingWindows.
    p.add_argument("--hcw-w",
                   dest="hcw_w",
                   type=int,
                   metavar='int',
                   default=50,
                   help="htseq-clip createSlidingWindows -w parameter. Sliding window size. If unsure, try 75-100 (default: 50)")
    p.add_argument("--hcw-s",
                   dest="hcw_s",
                   type=int,
                   metavar='int',
                   default=20,
                   help="htseq-clip createSlidingWindows -s parameter. Step size for sliding window (default: 20)")
    # More.
    p.add_argument("--no-zipper",
                   dest="no_zipper",
                   default=False,
                   action="store_true",
                   help="Do not gzip output files (default: False)")
    return p


################################################################################

if __name__ == '__main__':

    parser = setup_argument_parser()
    args = parser.parse_args()

    assert os.path.exists(args.in_gff), "--gff file \"%s\" not found" % (args.in_gff)

    # Output folder.
    if not os.path.exists(args.out_folder):
        os.makedirs(args.out_folder)

    """
    1) Flatten annotation.
    htseq-clip annotation -g args.in_gff -o annotation.bed

    -o content example:
    Synechocystis	5	105	TU1@TU1@protein_coding@exon@1/1@TU1:exon0001	2	-
    Synechocystis	576	990	TU2@TU2@protein_coding@exon@1/1@TU2:exon0001	2	+
    Synechocystis	809	909	TU3@TU3@protein_coding@exon@1/1@TU3:exon0001	2	-
    Synechocystis	1531	2150	TU4@TU4@protein_coding@exon@1/1@TU4:exon0001	2	+
    Synechocystis	2150	2701	TU6@TU6@protein_coding@exon@1/1@TU6:exon0001	2	+
    ...

    """

    annot_bed = args.out_folder + "/annotation.bed.gz"
    if args.no_zipper:
        annot_bed = args.out_folder + "/annotation.bed"

    print("Convert --gff to BED ... ")
    check_cmd = "htseq-clip annotation -g " + args.in_gff + " -o " + annot_bed
    if args.hca_unsorted:
        check_cmd += " --unsorted"
    output = subprocess.getoutput(check_cmd)

    print(check_cmd)
    print(output)

    assert os.path.exists(annot_bed), "htseq-clip annotation -o file \"%s\" not found" % (annot_bed)

    """
    2) Create sliding windows.
    htseq-clip createSlidingWindows -i annotation.bed -w hcw_w -s hcw_s -o windows.bed.gz

    -o content example:
    Synechocystis	5	80	TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00001@1	2	-
    Synechocystis	15	90	TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00002@2	2	-
    Synechocystis	25	100	TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00003@3	2	-
    Synechocystis	35	105	TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00004@4	2	-
    Synechocystis	576	651	TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00001@1	2	+
    Synechocystis	586	661	TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00002@2	2	+
    Synechocystis	596	671	TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00003@3	2	+
    Synechocystis	606	681	TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00004@4	2	+
    Synechocystis	616	691	TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00005@5	2	+
    ...

    """

    win_bed = args.out_folder + "/windows.bed.gz"
    if args.no_zipper:
        win_bed = args.out_folder + "/windows.bed"

    print("Create sliding windows BED ... ")
    win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s)
    check_cmd = "htseq-clip createSlidingWindows -i " + annot_bed + win_params + " -o " + win_bed
    output = subprocess.getoutput(check_cmd)

    print(check_cmd)
    print(output)

    assert os.path.exists(annot_bed), "htseq-clip createSlidingWindows -o file \"%s\" not found" % (win_bed)

    """
    3) Create mapping file for DEWSeq.
    mapToId: extract "name" column from the annotation file and map the entries
    to unique id and print out in tab separated format.
    htseq-clip mapToId -a windows.bed.gz -o windows.txt.gz

    -o content example:
    unique_id	chromosome	begin	end	strand	gene_id	gene_name	gene_type	gene_region	Nr_of_region	Total_nr_of_region	window_number
    TU1:exon0001W00001	Synechocystis	5	80	-	TU1	TU1	protein_coding	exon	1	1	1
    TU1:exon0001W00002	Synechocystis	15	90	-	TU1	TU1	protein_coding	exon	1	1	2
    TU1:exon0001W00003	Synechocystis	25	100	-	TU1	TU1	protein_coding	exon	1	1	3
    TU1:exon0001W00004	Synechocystis	35	105	-	TU1	TU1	protein_coding	exon	1	1	4
    TU2:exon0001W00001	Synechocystis	576	651	+	TU2	TU2	protein_coding	exon	1	1	1
    TU2:exon0001W00002	Synechocystis	586	661	+	TU2	TU2	protein_coding	exon	1	1	2
    TU2:exon0001W00003	Synechocystis	596	671	+	TU2	TU2	protein_coding	exon	1	1	3
    TU2:exon0001W00004	Synechocystis	606	681	+	TU2	TU2	protein_coding	exon	1	1	4
    TU2:exon0001W00005	Synechocystis	616	691	+	TU2	TU2	protein_coding	exon	1	1	5
    ...

    """

    mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt.gz"
    if args.no_zipper:
        mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt"

    print("Create DEWSeq input annotation file ... ")
    win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s)
    check_cmd = "htseq-clip mapToId -a " + win_bed + " -o " + mapped2ids_txt
    output = subprocess.getoutput(check_cmd)

    print(check_cmd)
    print(output)

    assert os.path.exists(mapped2ids_txt), "htseq-clip mapToId -o file \"%s\" not found" % (mapped2ids_txt)

    """
    Report.

    """

    print("")
    print("OUTPUT FILES")
    print("============")
    print("Annotation BED:\n%s" % (annot_bed))
    print("Windows BED:\n%s" % (win_bed))
    print("Windows mapped to IDs TXT (DEWseq annotation file):\n%s" % (mapped2ids_txt))
    print("")