Mercurial > repos > rnateam > rbpbench
comparison batch_table_wrapper.py @ 0:7dd2835ce566 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rbpbench commit 0e21bd630200c1f199db8ba5d83b81d4214fc59f
| author | rnateam |
|---|---|
| date | Sun, 03 Dec 2023 12:51:54 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7dd2835ce566 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import argparse | |
| 4 import os | |
| 5 import re | |
| 6 import subprocess | |
| 7 | |
| 8 | |
| 9 ############################################################################### | |
| 10 | |
def setup_argument_parser():
    """Build the command line parser of the RBPBench batch table wrapper.

    The parser takes the batch table (--table), the Galaxy dataset paths
    (--paths) with their element identifiers (--ids), the genome FASTA
    (--genome) and the output folder (--out) as required arguments, plus a
    set of optional settings that are handed on to ``rbpbench batch``.

    Returns:
        argparse.ArgumentParser: the configured parser. Help is registered
        manually (add_help=False) via the -h / --help option.
    """
    help_description = """
    Python wrapper for RBPBench Galaxy wrapper to work with collections of
    input BED files (i.e. to process them with rbpbench batch).
    """
    # Define argument parser.
    p = argparse.ArgumentParser(add_help=False,
                                prog="batch_table_wrapper.py",
                                description=help_description,
                                formatter_class=argparse.MetavarTypeHelpFormatter)

    # Help option (added by hand since add_help=False).
    p.add_argument("-h", "--help",
                   action="help",
                   help="Print help message")
    # Required arguments.
    p.add_argument("--table",
                   dest="in_table",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Input table file with data ID, method ID, RBP ID and file name (Galaxy element identifier in dataset collection) for each to be processed dataset by rbpbench batch")
    # NOTE: help text used to advertise a non-existent "--files" option.
    p.add_argument("--paths",
                   dest="in_paths",
                   type=str,
                   metavar='str',
                   nargs='+',
                   required=True,
                   help="List of Galaxy BED file paths (--paths path1 path2 .. )")
    p.add_argument("--ids",
                   dest="in_ids",
                   type=str,
                   metavar='str',
                   nargs='+',
                   required=True,
                   help="List of Galaxy element identifiers, equal to the BED dataset names in the dataset collection (--ids id1 id2 .. )")
    p.add_argument("--genome",
                   dest="in_genome",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Genomic sequences file (currently supported formats: FASTA)")
    p.add_argument("--out",
                   dest="out_folder",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Batch results output folder")
    # Optional batch arguments.
    p.add_argument("--ext",
                   dest="ext_up_down",
                   type=str,
                   metavar='str',
                   default="0",
                   help="Up- and downstream extension of --in sites in nucleotides (nt). Set e.g. --ext 30 for 30 nt on both sides, or --ext 20,10 for different up- and downstream extension (default: 0)")
    p.add_argument("--motif-db",
                   dest="motif_db",
                   type=int,
                   default=1,
                   choices=[1, 2, 3],
                   help="Motif database to use. 1: human RBP motifs full (259 RBPs, 605 motifs, human_v0.1), 2: human RBP motifs full (low frequencies not rounded, human_v0.1_no_round), 3: human RBP motifs eCLIP (107 RBPs, 316 motifs, human_eclip_v0.1) (default: 1)")
    # default=False (not a str) is kept on purpose: it is only ever tested
    # for truthiness before being added to the rbpbench call.
    p.add_argument("--fimo-nt-freqs",
                   dest="fimo_nt_freqs",
                   type=str,
                   metavar='str',
                   default=False,
                   help="Provide FIMO nucleotide frequencies (FIMO option: --bifile) file (default: use internal frequencies file optimized for human transcripts)")
    p.add_argument("--fimo-pval",
                   dest="fimo_pval",
                   type=float,
                   metavar='float',
                   default=0.001,
                   help="FIMO p-value threshold (FIMO option: --thresh) (default: 0.001)")
    p.add_argument("--bed-score-col",
                   dest="bed_score_col",
                   type=int,
                   metavar='int',
                   default=5,
                   help="--in BED score column used for p-value calculations. BED score can be e.g. log2 fold change or -log10 p-value of the region (default: 5)")
    p.add_argument("--unstranded",
                   dest="unstranded",
                   default=False,
                   action="store_true",
                   help="Set if --in BED regions are NOT strand-specific, i.e., to look for motifs on both strands of the provided regions. Note that the two strands of a region will still be counted as one region (change with --unstranded-ct) (default: False)")
    p.add_argument("--unstranded-ct",
                   dest="unstranded_ct",
                   default=False,
                   action="store_true",
                   help="Count each --in region twice for RBP hit statistics when --unstranded is enabled. By default, two strands of one region are counted as one region for RBP hit statistics")
    return p
| 101 | |
| 102 | |
| 103 ############################################################################### | |
| 104 | |
def require(condition, message):
    """Raise AssertionError(message) unless condition is truthy.

    Used instead of the assert statement so that input validation is still
    performed when Python runs with -O (which strips assert statements).
    """
    if not condition:
        raise AssertionError(message)


def map_ids_to_paths(in_paths, in_ids):
    """Pair each Galaxy element identifier with its dataset file path.

    Args:
        in_paths: list of file paths (--paths), each must exist and be unique.
        in_ids: list of element identifiers (--ids), each must be unique.
            The i-th identifier belongs to the i-th path.

    Returns:
        dict mapping element identifier -> file path.

    Raises:
        AssertionError: if the list lengths differ, a path does not exist,
            or a path / identifier is given more than once.
    """
    c_paths = len(in_paths)
    c_ids = len(in_ids)
    require(c_paths == c_ids, "given # paths (--paths) != # ids (--ids) (%i != %i). Please provide one ID for each path" % (c_paths, c_ids))

    # Paths.
    seen_paths = set()
    for path in in_paths:
        require(os.path.exists(path), "--paths %s file not found" % (path))
        require(path not in seen_paths, "--paths %s given > 1. Please provide unique paths" % (path))
        seen_paths.add(path)

    # IDs.
    id2path_dic = {}
    for elem_id, path in zip(in_ids, in_paths):
        require(elem_id not in id2path_dic, "--ids \"%s\" given > 1. Please provide unique element identifiers (dataset names) inside the dataset collection, in order to unambiguously assign element ID to file path" % (elem_id))
        id2path_dic[elem_id] = path
    return id2path_dic


def parse_batch_table(lines, id2path_dic):
    """Parse the --table rows and collect the per-dataset settings.

    Column format (tab-separated):
    rbp_id method_id data_id dataset_name

    Lines starting with "#", blank lines and the header row (first column
    equal to "rbp_id") are skipped.

    Args:
        lines: iterable of table lines (e.g. an open file handle).
        id2path_dic: dict mapping dataset name (element identifier) -> path.

    Returns:
        dict with the keys "rbp_id", "method_id", "data_id", "set_name" and
        "path", each holding a list with one entry per table row (same order
        as in the table).

    Raises:
        AssertionError: on rows without exactly 4 columns, duplicated rows,
            dataset names missing from id2path_dic, or if no row was read.
    """
    id_collect_dic = {
        "rbp_id": [],
        "method_id": [],
        "data_id": [],
        "set_name": [],
        "path": [],  # Galaxy file path.
    }
    seen_comb_ids = set()

    for line in lines:

        if line.startswith("#"):
            continue
        # Tolerate empty lines (e.g. a trailing newline at the end of file).
        if not line.strip():
            continue

        cols = line.strip().split("\t")

        require(len(cols) == 4, "line in --table with # cols != 4 (%i) encountered:%s" % (len(cols), line))

        rbp_id, method_id, data_id, set_name = cols

        # Skip header row.
        if rbp_id == "rbp_id":
            continue

        comb_id = "%s,%s,%s,%s" % (rbp_id, method_id, data_id, set_name)

        require(comb_id not in seen_comb_ids, "data combination (\"%s\") appears > 1 in --table file. Please provide unique combinations for rbpbench batch calculation" % (comb_id))
        seen_comb_ids.add(comb_id)

        require(set_name in id2path_dic, "given dataset name \"%s\" from --table not part of given --ids. Please provide dataset names present in dataset collection" % (set_name))

        id_collect_dic["rbp_id"].append(rbp_id)
        id_collect_dic["method_id"].append(method_id)
        id_collect_dic["data_id"].append(data_id)
        id_collect_dic["set_name"].append(set_name)
        id_collect_dic["path"].append(id2path_dic[set_name])

    require(id_collect_dic["rbp_id"], "nothing read in from --table. Please provide non-empty table in correct format (columns: rbp_id method_id data_id dataset_name)")
    return id_collect_dic


def build_batch_command(args, id_collect_dic):
    """Assemble the ``rbpbench batch`` call as an argument list.

    Returning a list (instead of one shell string) lets the call be executed
    without a shell, so IDs or paths containing spaces or shell
    metacharacters are passed through unchanged.

    Args:
        args: parsed command line arguments (see setup_argument_parser).
        id_collect_dic: per-dataset lists as returned by parse_batch_table.

    Returns:
        list of str: the program name followed by its arguments.
    """
    batch_call = [
        "rbpbench", "batch",
        "--out", args.out_folder,
        "--genome", args.in_genome,
        "--ext", args.ext_up_down,
        "--motif-db", str(args.motif_db),
    ]
    if args.fimo_nt_freqs:
        batch_call += ["--fimo-nt-freqs", args.fimo_nt_freqs]
    batch_call += ["--fimo-pval", str(args.fimo_pval)]
    batch_call += ["--bed-score-col", str(args.bed_score_col)]
    if args.unstranded:
        batch_call.append("--unstranded")
    if args.unstranded_ct:
        batch_call.append("--unstranded-ct")

    batch_call += ["--rbp-list"] + list(id_collect_dic["rbp_id"])
    batch_call += ["--method-list"] + list(id_collect_dic["method_id"])
    batch_call += ["--data-list"] + list(id_collect_dic["data_id"])
    batch_call += ["--bed"] + list(id_collect_dic["path"])
    return batch_call


if __name__ == '__main__':

    parser = setup_argument_parser()
    args = parser.parse_args()

    require(os.path.exists(args.in_table), "--table file \"%s\" not found" % (args.in_table))
    require(os.path.exists(args.in_genome), "--genome file \"%s\" not found" % (args.in_genome))

    # Check given paths and IDs.
    id2path_dic = map_ids_to_paths(args.in_paths, args.in_ids)

    # Read in table.
    print("Read in --table ... ")
    with open(args.in_table) as f:
        id_collect_dic = parse_batch_table(f, id2path_dic)

    # Construct RBPBench batch call.
    batch_call = build_batch_command(args, id_collect_dic)

    # Execute RBPBench batch call (no shell; stderr merged into stdout).
    print("")
    print("EXECUTING CALL:\n%s" % (" ".join(batch_call)))
    proc = subprocess.run(batch_call,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          universal_newlines=True)
    output = proc.stdout
    # Drop one trailing newline, like subprocess.getoutput() does.
    if output.endswith("\n"):
        output = output[:-1]
    print("")
    print("RUN OUTPUT:\n%s" % (output))
    print("")
    # Hand a failed rbpbench run on to the caller instead of reporting DONE.
    if proc.returncode != 0:
        raise SystemExit(proc.returncode)
    print("DONE.")
