Mercurial > repos > rnateam > htseq_clip
comparison htsc_create_sliding_windows.py @ 0:c82c41550f82 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a"
| author | rnateam |
|---|---|
| date | Sat, 15 Oct 2022 21:37:15 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c82c41550f82 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import argparse | |
| 4 import os | |
| 5 import subprocess | |
| 6 | |
| 7 """ | |
| 8 | |
| 9 Install deseq-clip | |
| 10 ================== | |
| 11 | |
| 12 conda install -c bioconda pysam | |
| 13 conda install -c bioconda htseq | |
| 14 pip install htseq-clip | |
| 15 | |
| 16 Or directly by: | |
| 17 conda install -c bioconda htseq-clip | |
| 18 | |
| 19 | |
| 20 Test call | |
| 21 ========= | |
| 22 | |
| 23 python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper | |
| 24 | |
| 25 Compare: | |
| 26 diff test_compare_out/windows_mapped_to_ids.txt test-data/windows.exp.txt | |
| 27 diff test_compare_out/windows.bed test-data/windows.exp.bed | |
| 28 diff test_compare_out/annotation.bed test-data/annotation.exp.bed | |
| 29 | |
| 30 This corresponds to: | |
| 31 htseq-clip annotation -g test-data/paper_tus.Synechocystis_pSYSM.gff3 -o test-data/annotation.exp.bed | |
| 32 htseq-clip createSlidingWindows -i test-data/annotation.exp.bed -w 50 -s 20 -o test-data/windows.exp.bed | |
| 33 htseq-clip mapToId -a test-data/windows.exp.bed -o test-data/windows.exp.txt | |
| 34 | |
| 35 More tests: | |
| 36 python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper --hca-unsorted | |
| 37 | |
| 38 DEWSeq input files: | |
| 39 test_compare_out/windows_mapped_to_ids.txt | |
| 40 | |
| 41 """ | |
| 42 | |
| 43 | |
| 44 ################################################################################ | |
| 45 | |
| 46 def setup_argument_parser(): | |
| 47 """Setup argparse parser.""" | |
| 48 help_description = """ | |
| 49 Based on genomic annotations GFF file (--gff), create sliding window | |
| 50 annotations with htseq-clip. | |
| 51 | |
| 52 """ | |
| 53 # Define argument parser. | |
| 54 p = argparse.ArgumentParser(add_help=False, | |
| 55 prog="htsc_create_sliding_windows.py", | |
| 56 description=help_description, | |
| 57 formatter_class=argparse.MetavarTypeHelpFormatter) | |
| 58 | |
| 59 # Required arguments. | |
| 60 p.add_argument("-h", "--help", | |
| 61 action="help", | |
| 62 help="Print help message") | |
| 63 p.add_argument("--gff", | |
| 64 dest="in_gff", | |
| 65 type=str, | |
| 66 metavar='str', | |
| 67 required=True, | |
| 68 help="Annotation file GFF (so far tested with hg38 GENCODE format). Also accepts gff.gz as well") | |
| 69 p.add_argument("--out", | |
| 70 dest="out_folder", | |
| 71 type=str, | |
| 72 metavar='str', | |
| 73 required=True, | |
| 74 help="Results output folder") | |
| 75 # htseq-clip annotation. | |
| 76 p.add_argument("--hca-unsorted", | |
| 77 dest="hca_unsorted", | |
| 78 default=False, | |
| 79 action="store_true", | |
| 80 help="htseq-clip annotation --unsorted parameter. Use this flag if the GFF file is unsorted (default: False)") | |
| 81 # htseq-clip createSlidingWindows. | |
| 82 p.add_argument("--hcw-w", | |
| 83 dest="hcw_w", | |
| 84 type=int, | |
| 85 metavar='int', | |
| 86 default=50, | |
| 87 help="htseq-clip createSlidingWindows -w parameter. Sliding window size. If unsure, try 75-100 (default: 50)") | |
| 88 p.add_argument("--hcw-s", | |
| 89 dest="hcw_s", | |
| 90 type=int, | |
| 91 metavar='int', | |
| 92 default=20, | |
| 93 help="htseq-clip createSlidingWindows -s parameter. Step size for sliding window (default: 20)") | |
| 94 # More. | |
| 95 p.add_argument("--no-zipper", | |
| 96 dest="no_zipper", | |
| 97 default=False, | |
| 98 action="store_true", | |
| 99 help="Do not gzip output files (default: False)") | |
| 100 return p | |
| 101 | |
| 102 | |
| 103 ################################################################################ | |
| 104 | |
| 105 if __name__ == '__main__': | |
| 106 | |
| 107 parser = setup_argument_parser() | |
| 108 args = parser.parse_args() | |
| 109 | |
| 110 assert os.path.exists(args.in_gff), "--gff file \"%s\" not found" % (args.in_gff) | |
| 111 | |
| 112 # Output folder. | |
| 113 if not os.path.exists(args.out_folder): | |
| 114 os.makedirs(args.out_folder) | |
| 115 | |
| 116 """ | |
| 117 1) Flatten annotation. | |
| 118 htseq-clip annotation -g args.in_gff -o annotation.bed | |
| 119 | |
| 120 -o content example: | |
| 121 Synechocystis 5 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001 2 - | |
| 122 Synechocystis 576 990 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001 2 + | |
| 123 Synechocystis 809 909 TU3@TU3@protein_coding@exon@1/1@TU3:exon0001 2 - | |
| 124 Synechocystis 1531 2150 TU4@TU4@protein_coding@exon@1/1@TU4:exon0001 2 + | |
| 125 Synechocystis 2150 2701 TU6@TU6@protein_coding@exon@1/1@TU6:exon0001 2 + | |
| 126 ... | |
| 127 | |
| 128 """ | |
| 129 | |
| 130 annot_bed = args.out_folder + "/annotation.bed.gz" | |
| 131 if args.no_zipper: | |
| 132 annot_bed = args.out_folder + "/annotation.bed" | |
| 133 | |
| 134 print("Convert --gff to BED ... ") | |
| 135 check_cmd = "htseq-clip annotation -g " + args.in_gff + " -o " + annot_bed | |
| 136 if args.hca_unsorted: | |
| 137 check_cmd += " --unsorted" | |
| 138 output = subprocess.getoutput(check_cmd) | |
| 139 | |
| 140 print(check_cmd) | |
| 141 print(output) | |
| 142 | |
| 143 assert os.path.exists(annot_bed), "htseq-clip annotation -o file \"%s\" not found" % (annot_bed) | |
| 144 | |
| 145 """ | |
| 146 2) Create sliding windows. | |
| 147 htseq-clip createSlidingWindows -i annotation.bed -w hcw_w -s hcw_s -o windows.bed.gz | |
| 148 | |
| 149 -o content example: | |
| 150 Synechocystis 5 80 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00001@1 2 - | |
| 151 Synechocystis 15 90 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00002@2 2 - | |
| 152 Synechocystis 25 100 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00003@3 2 - | |
| 153 Synechocystis 35 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00004@4 2 - | |
| 154 Synechocystis 576 651 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00001@1 2 + | |
| 155 Synechocystis 586 661 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00002@2 2 + | |
| 156 Synechocystis 596 671 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00003@3 2 + | |
| 157 Synechocystis 606 681 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00004@4 2 + | |
| 158 Synechocystis 616 691 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00005@5 2 + | |
| 159 ... | |
| 160 | |
| 161 """ | |
| 162 | |
| 163 win_bed = args.out_folder + "/windows.bed.gz" | |
| 164 if args.no_zipper: | |
| 165 win_bed = args.out_folder + "/windows.bed" | |
| 166 | |
| 167 print("Create sliding windows BED ... ") | |
| 168 win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s) | |
| 169 check_cmd = "htseq-clip createSlidingWindows -i " + annot_bed + win_params + " -o " + win_bed | |
| 170 output = subprocess.getoutput(check_cmd) | |
| 171 | |
| 172 print(check_cmd) | |
| 173 print(output) | |
| 174 | |
| 175 assert os.path.exists(annot_bed), "htseq-clip createSlidingWindows -o file \"%s\" not found" % (win_bed) | |
| 176 | |
| 177 """ | |
| 178 3) Create mapping file for DEWSeq. | |
| 179 mapToId: extract "name" column from the annotation file and map the entries | |
| 180 to unique id and print out in tab separated format. | |
| 181 htseq-clip mapToId -a windows.bed.gz -o windows.txt.gz | |
| 182 | |
| 183 -o content example: | |
| 184 unique_id chromosome begin end strand gene_id gene_name gene_type gene_region Nr_of_region Total_nr_of_region window_number | |
| 185 TU1:exon0001W00001 Synechocystis 5 80 - TU1 TU1 protein_coding exon 1 1 1 | |
| 186 TU1:exon0001W00002 Synechocystis 15 90 - TU1 TU1 protein_coding exon 1 1 2 | |
| 187 TU1:exon0001W00003 Synechocystis 25 100 - TU1 TU1 protein_coding exon 1 1 3 | |
| 188 TU1:exon0001W00004 Synechocystis 35 105 - TU1 TU1 protein_coding exon 1 1 4 | |
| 189 TU2:exon0001W00001 Synechocystis 576 651 + TU2 TU2 protein_coding exon 1 1 1 | |
| 190 TU2:exon0001W00002 Synechocystis 586 661 + TU2 TU2 protein_coding exon 1 1 2 | |
| 191 TU2:exon0001W00003 Synechocystis 596 671 + TU2 TU2 protein_coding exon 1 1 3 | |
| 192 TU2:exon0001W00004 Synechocystis 606 681 + TU2 TU2 protein_coding exon 1 1 4 | |
| 193 TU2:exon0001W00005 Synechocystis 616 691 + TU2 TU2 protein_coding exon 1 1 5 | |
| 194 ... | |
| 195 | |
| 196 """ | |
| 197 | |
| 198 mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt.gz" | |
| 199 if args.no_zipper: | |
| 200 mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt" | |
| 201 | |
| 202 print("Create DEWSeq input annotation file ... ") | |
| 203 win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s) | |
| 204 check_cmd = "htseq-clip mapToId -a " + win_bed + " -o " + mapped2ids_txt | |
| 205 output = subprocess.getoutput(check_cmd) | |
| 206 | |
| 207 print(check_cmd) | |
| 208 print(output) | |
| 209 | |
| 210 assert os.path.exists(mapped2ids_txt), "htseq-clip mapToId -o file \"%s\" not found" % (mapped2ids_txt) | |
| 211 | |
| 212 """ | |
| 213 Report. | |
| 214 | |
| 215 """ | |
| 216 | |
| 217 print("") | |
| 218 print("OUTPUT FILES") | |
| 219 print("============") | |
| 220 print("Annotation BED:\n%s" % (annot_bed)) | |
| 221 print("Windows BED:\n%s" % (win_bed)) | |
| 222 print("Windows mapped to IDs TXT (DEWseq annotation file):\n%s" % (mapped2ids_txt)) | |
| 223 print("") |
