Mercurial > repos > bgruening > htseq_clip
diff htsc_create_sliding_windows.py @ 0:94a987a7da69 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author | bgruening |
---|---|
date | Tue, 11 Oct 2022 16:09:23 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htsc_create_sliding_windows.py Tue Oct 11 16:09:23 2022 +0000 @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 + +import argparse +import os +import subprocess + +""" + +Install deseq-clip +================== + +conda install -c bioconda pysam +conda install -c bioconda htseq +pip install htseq-clip + +Or directly by: +conda install -c bioconda htseq-clip + + +Test call +========= + +python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper + +Compare: +diff test_compare_out/windows_mapped_to_ids.txt test-data/windows.exp.txt +diff test_compare_out/windows.bed test-data/windows.exp.bed +diff test_compare_out/annotation.bed test-data/annotation.exp.bed + +This corresponds to: +htseq-clip annotation -g test-data/paper_tus.Synechocystis_pSYSM.gff3 -o test-data/annotation.exp.bed +htseq-clip createSlidingWindows -i test-data/annotation.exp.bed -w 50 -s 20 -o test-data/windows.exp.bed +htseq-clip mapToId -a test-data/windows.exp.bed -o test-data/windows.exp.txt + +More tests: +python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper --hca-unsorted + +DEWSeq input files: +test_compare_out/windows_mapped_to_ids.txt + +""" + + +################################################################################ + +def setup_argument_parser(): + """Setup argparse parser.""" + help_description = """ + Based on genomic annotations GFF file (--gff), create sliding window + annotations with htseq-clip. + + """ + # Define argument parser. + p = argparse.ArgumentParser(add_help=False, + prog="htsc_create_sliding_windows.py", + description=help_description, + formatter_class=argparse.MetavarTypeHelpFormatter) + + # Required arguments. + p.add_argument("-h", "--help", + action="help", + help="Print help message") + p.add_argument("--gff", + dest="in_gff", + type=str, + metavar='str', + required=True, + help="Annotation file GFF (so far tested with hg38 GENCODE format). Also accepts gff.gz as well") + p.add_argument("--out", + dest="out_folder", + type=str, + metavar='str', + required=True, + help="Results output folder") + # htseq-clip annotation. + p.add_argument("--hca-unsorted", + dest="hca_unsorted", + default=False, + action="store_true", + help="htseq-clip annotation --unsorted parameter. Use this flag if the GFF file is unsorted (default: False)") + # htseq-clip createSlidingWindows. + p.add_argument("--hcw-w", + dest="hcw_w", + type=int, + metavar='int', + default=50, + help="htseq-clip createSlidingWindows -w parameter. Sliding window size. If unsure, try 75-100 (default: 50)") + p.add_argument("--hcw-s", + dest="hcw_s", + type=int, + metavar='int', + default=20, + help="htseq-clip createSlidingWindows -s parameter. Step size for sliding window (default: 20)") + # More. + p.add_argument("--no-zipper", + dest="no_zipper", + default=False, + action="store_true", + help="Do not gzip output files (default: False)") + return p + + +################################################################################ + +if __name__ == '__main__': + + parser = setup_argument_parser() + args = parser.parse_args() + + assert os.path.exists(args.in_gff), "--gff file \"%s\" not found" % (args.in_gff) + + # Output folder. + if not os.path.exists(args.out_folder): + os.makedirs(args.out_folder) + + """ + 1) Flatten annotation. + htseq-clip annotation -g args.in_gff -o annotation.bed + + -o content example: + Synechocystis 5 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001 2 - + Synechocystis 576 990 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001 2 + + Synechocystis 809 909 TU3@TU3@protein_coding@exon@1/1@TU3:exon0001 2 - + Synechocystis 1531 2150 TU4@TU4@protein_coding@exon@1/1@TU4:exon0001 2 + + Synechocystis 2150 2701 TU6@TU6@protein_coding@exon@1/1@TU6:exon0001 2 + + ... + + """ + + annot_bed = args.out_folder + "/annotation.bed.gz" + if args.no_zipper: + annot_bed = args.out_folder + "/annotation.bed" + + print("Convert --gff to BED ... ") + check_cmd = "htseq-clip annotation -g " + args.in_gff + " -o " + annot_bed + if args.hca_unsorted: + check_cmd += " --unsorted" + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(annot_bed), "htseq-clip annotation -o file \"%s\" not found" % (annot_bed) + + """ + 2) Create sliding windows. + htseq-clip createSlidingWindows -i annotation.bed -w hcw_w -s hcw_s -o windows.bed.gz + + -o content example: + Synechocystis 5 80 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00001@1 2 - + Synechocystis 15 90 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00002@2 2 - + Synechocystis 25 100 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00003@3 2 - + Synechocystis 35 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00004@4 2 - + Synechocystis 576 651 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00001@1 2 + + Synechocystis 586 661 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00002@2 2 + + Synechocystis 596 671 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00003@3 2 + + Synechocystis 606 681 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00004@4 2 + + Synechocystis 616 691 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00005@5 2 + + ... + + """ + + win_bed = args.out_folder + "/windows.bed.gz" + if args.no_zipper: + win_bed = args.out_folder + "/windows.bed" + + print("Create sliding windows BED ... ") + win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s) + check_cmd = "htseq-clip createSlidingWindows -i " + annot_bed + win_params + " -o " + win_bed + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(annot_bed), "htseq-clip createSlidingWindows -o file \"%s\" not found" % (win_bed) + + """ + 3) Create mapping file for DEWSeq. + mapToId: extract "name" column from the annotation file and map the entries + to unique id and print out in tab separated format. + htseq-clip mapToId -a windows.bed.gz -o windows.txt.gz + + -o content example: + unique_id chromosome begin end strand gene_id gene_name gene_type gene_region Nr_of_region Total_nr_of_region window_number + TU1:exon0001W00001 Synechocystis 5 80 - TU1 TU1 protein_coding exon 1 1 1 + TU1:exon0001W00002 Synechocystis 15 90 - TU1 TU1 protein_coding exon 1 1 2 + TU1:exon0001W00003 Synechocystis 25 100 - TU1 TU1 protein_coding exon 1 1 3 + TU1:exon0001W00004 Synechocystis 35 105 - TU1 TU1 protein_coding exon 1 1 4 + TU2:exon0001W00001 Synechocystis 576 651 + TU2 TU2 protein_coding exon 1 1 1 + TU2:exon0001W00002 Synechocystis 586 661 + TU2 TU2 protein_coding exon 1 1 2 + TU2:exon0001W00003 Synechocystis 596 671 + TU2 TU2 protein_coding exon 1 1 3 + TU2:exon0001W00004 Synechocystis 606 681 + TU2 TU2 protein_coding exon 1 1 4 + TU2:exon0001W00005 Synechocystis 616 691 + TU2 TU2 protein_coding exon 1 1 5 + ... + + """ + + mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt.gz" + if args.no_zipper: + mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt" + + print("Create DEWSeq input annotation file ... ") + win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s) + check_cmd = "htseq-clip mapToId -a " + win_bed + " -o " + mapped2ids_txt + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(mapped2ids_txt), "htseq-clip mapToId -o file \"%s\" not found" % (mapped2ids_txt) + + """ + Report. + + """ + + print("") + print("OUTPUT FILES") + print("============") + print("Annotation BED:\n%s" % (annot_bed)) + print("Windows BED:\n%s" % (win_bed)) + print("Windows mapped to IDs TXT (DEWseq annotation file):\n%s" % (mapped2ids_txt)) + print("")