comparison batch_table_wrapper.py @ 0:7dd2835ce566 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rbpbench commit 0e21bd630200c1f199db8ba5d83b81d4214fc59f
author rnateam
date Sun, 03 Dec 2023 12:51:54 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7dd2835ce566
1 #!/usr/bin/env python3
2
3 import argparse
4 import os
5 import re
6 import subprocess
7
8
9 ###############################################################################
10
def setup_argument_parser() -> argparse.ArgumentParser:
    """Build the command line parser of the RBPBench batch table wrapper.

    The parser is created with ``add_help=False`` and registers its own
    ``-h/--help`` option. Required options are ``--table``, ``--paths``,
    ``--ids``, ``--genome`` and ``--out``; all remaining options have
    defaults and are forwarded to the ``rbpbench batch`` call by the script.

    Returns:
        The configured ``argparse.ArgumentParser``. Parsed values are stored
        under the ``dest`` names given below (e.g. ``in_table``,
        ``in_paths``, ``in_ids``).
    """
    help_description = """
    Python wrapper for RBPBench Galaxy wrapper to work with collections of
    input BED files (i.e. to process them with rbpbench batch).
    """
    # Define argument parser.
    p = argparse.ArgumentParser(add_help=False,
                                prog="batch_table_wrapper.py",
                                description=help_description,
                                formatter_class=argparse.MetavarTypeHelpFormatter)

    # Required arguments.
    p.add_argument("-h", "--help",
                   action="help",
                   help="Print help message")
    p.add_argument("--table",
                   dest="in_table",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Input table file with data ID, method ID, RBP ID and file name (Galaxy element identifier in dataset collection) for each to be processed dataset by rbpbench batch")
    # One or more values; the example in the help text names this option
    # itself (the original text referred to a non-existent --files option).
    p.add_argument("--paths",
                   dest="in_paths",
                   type=str,
                   metavar='str',
                   nargs='+',
                   required=True,
                   help="List of Galaxy BED file paths (--paths path1 path2 .. )")
    p.add_argument("--ids",
                   dest="in_ids",
                   type=str,
                   metavar='str',
                   nargs='+',
                   required=True,
                   help="List of Galaxy element identifiers, equal to the BED dataset names in the dataset collection (--ids id1 id2 .. )")
    p.add_argument("--genome",
                   dest="in_genome",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Genomic sequences file (currently supported formats: FASTA)")
    p.add_argument("--out",
                   dest="out_folder",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Batch results output folder")

    # Optional batch arguments.
    p.add_argument("--ext",
                   dest="ext_up_down",
                   type=str,
                   metavar='str',
                   default="0",
                   help="Up- and downstream extension of --in sites in nucleotides (nt). Set e.g. --ext 30 for 30 nt on both sides, or --ext 20,10 for different up- and downstream extension (default: 0)")
    p.add_argument("--motif-db",
                   dest="motif_db",
                   type=int,
                   default=1,
                   choices=[1, 2, 3],
                   help="Motif database to use. 1: human RBP motifs full (259 RBPs, 605 motifs, human_v0.1), 2: human RBP motifs full (low frequencies not rounded, human_v0.1_no_round), 3: human RBP motifs eCLIP (107 RBPs, 316 motifs, human_eclip_v0.1) (default: 1)")
    # Default is False (not a path) so the script can test it for truthiness.
    p.add_argument("--fimo-nt-freqs",
                   dest="fimo_nt_freqs",
                   type=str,
                   metavar='str',
                   default=False,
                   help="Provide FIMO nucleotide frequencies (FIMO option: --bifile) file (default: use internal frequencies file optimized for human transcripts)")
    p.add_argument("--fimo-pval",
                   dest="fimo_pval",
                   type=float,
                   metavar='float',
                   default=0.001,
                   help="FIMO p-value threshold (FIMO option: --thresh) (default: 0.001)")
    p.add_argument("--bed-score-col",
                   dest="bed_score_col",
                   type=int,
                   metavar='int',
                   default=5,
                   help="--in BED score column used for p-value calculations. BED score can be e.g. log2 fold change or -log10 p-value of the region (default: 5)")
    p.add_argument("--unstranded",
                   dest="unstranded",
                   default=False,
                   action="store_true",
                   help="Set if --in BED regions are NOT strand-specific, i.e., to look for motifs on both strands of the provided regions. Note that the two strands of a region will still be counted as one region (change with --unstranded-ct) (default: False)")
    p.add_argument("--unstranded-ct",
                   dest="unstranded_ct",
                   default=False,
                   action="store_true",
                   help="Count each --in region twice for RBP hit statistics when --unstranded is enabled. By default, two strands of one region are counted as one region for RBP hit statistics")
    return p
101
102
103 ###############################################################################
104
def map_ids_to_paths(in_ids, in_paths):
    """Pair each Galaxy element identifier with its BED file path.

    Identifiers and paths are matched by position (first ID with first
    path, and so on).

    Args:
        in_ids: Element identifiers as given via --ids.
        in_paths: File paths as given via --paths.

    Returns:
        Dictionary mapping element identifier -> file path.

    Raises:
        ValueError: If the two lists differ in length, or if a path or an
            identifier is given more than once.
        FileNotFoundError: If one of the paths does not exist.
    """
    c_paths = len(in_paths)
    c_ids = len(in_ids)
    if c_paths != c_ids:
        raise ValueError("given # paths (--paths) != # ids (--ids) (%i != %i). Please provide one ID for each path" % (c_paths, c_ids))

    # Paths.
    seen_paths = set()
    for path in in_paths:
        if not os.path.exists(path):
            raise FileNotFoundError("--paths %s file not found" % (path))
        if path in seen_paths:
            raise ValueError("--paths %s given > 1. Please provide unique paths" % (path))
        seen_paths.add(path)

    # IDs.
    id2path_dic = {}
    for element_id, path in zip(in_ids, in_paths):
        if element_id in id2path_dic:
            raise ValueError("--ids \"%s\" given > 1. Please provide unique element identifiers (dataset names) inside the dataset collection, in order to unambiguously assign element ID to file path" % (element_id))
        id2path_dic[element_id] = path
    return id2path_dic


def read_batch_table(table_path, id2path_dic):
    """Read the --table file and resolve dataset names to file paths.

    Column format (tab-separated):
    rbp_id method_id data_id dataset_name

    Lines starting with "#", blank lines and a header line (first column
    equal to "rbp_id") are skipped.

    Args:
        table_path: Path to the table file.
        id2path_dic: Mapping of dataset name (element identifier) -> path.

    Returns:
        List of (rbp_id, method_id, data_id, path) tuples in file order.

    Raises:
        ValueError: If a line does not have four columns, a combination of
            the four columns appears twice, a dataset name is not part of
            id2path_dic, or no data rows were found.
    """
    rows = []
    seen_comb_ids = set()

    with open(table_path) as table_file:
        for line in table_file:

            if line.startswith("#"):
                continue
            # A whitespace-only line carries no data; skip it instead of
            # failing the column count check below.
            if not line.strip():
                continue

            cols = line.strip().split("\t")
            if len(cols) != 4:
                raise ValueError("line in --table with # cols != 4 (%i) encountered:%s" % (len(cols), line))

            rbp_id, method_id, data_id, set_name = cols

            # Header line.
            if rbp_id == "rbp_id":
                continue

            comb_id = "%s,%s,%s,%s" % (rbp_id, method_id, data_id, set_name)
            if comb_id in seen_comb_ids:
                raise ValueError("data combination (\"%s\") appears > 1 in --table file. Please provide unique combinations for rbpbench batch calculation" % (comb_id))
            seen_comb_ids.add(comb_id)

            if set_name not in id2path_dic:
                raise ValueError("given dataset name \"%s\" from --table not part of given --ids. Please provide dataset names present in dataset collection" % (set_name))

            rows.append((rbp_id, method_id, data_id, id2path_dic[set_name]))

    if not rows:
        raise ValueError("nothing read in from --table. Please provide non-empty table in correct format (columns: rbp_id method_id data_id dataset_name)")
    return rows


def build_batch_call(args, rows):
    """Construct the rbpbench batch call as an argument list.

    Args:
        args: Parsed command line arguments (namespace providing out_folder,
            in_genome, ext_up_down, motif_db, fimo_nt_freqs, fimo_pval,
            bed_score_col, unstranded and unstranded_ct).
        rows: (rbp_id, method_id, data_id, path) tuples, e.g. as returned by
            read_batch_table().

    Returns:
        List of strings, one per command line token, starting with
        "rbpbench", "batch".
    """
    cmd = ["rbpbench", "batch",
           "--out", str(args.out_folder),
           "--genome", str(args.in_genome),
           "--ext", str(args.ext_up_down),
           "--motif-db", "%i" % (args.motif_db)]
    if args.fimo_nt_freqs:
        cmd += ["--fimo-nt-freqs", str(args.fimo_nt_freqs)]
    cmd += ["--fimo-pval", str(args.fimo_pval),
            "--bed-score-col", "%i" % (args.bed_score_col)]
    if args.unstranded:
        cmd.append("--unstranded")
    if args.unstranded_ct:
        cmd.append("--unstranded-ct")

    # The four list options receive their values in the same (table) order.
    list_options = ("--rbp-list", "--method-list", "--data-list", "--bed")
    for col_idx, option in enumerate(list_options):
        cmd.append(option)
        cmd.extend(row[col_idx] for row in rows)
    return cmd


def format_batch_call(cmd):
    """Return cmd (list of tokens) as one shell-quoted string for display."""
    import shlex  # Only needed here; keeps the file-level imports untouched.
    return " ".join(shlex.quote(token) for token in cmd)


def run_batch_call(cmd):
    """Run cmd (list of tokens) without a shell and return its output.

    stdout and stderr are captured together and one trailing newline is
    removed. The exit status of the command is not checked.

    Raises:
        OSError: If the executable cannot be started (e.g. FileNotFoundError
            if it is not installed).
    """
    completed = subprocess.run(cmd,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               universal_newlines=True)
    output = completed.stdout
    if output.endswith("\n"):
        output = output[:-1]
    return output


if __name__ == '__main__':

    parser = setup_argument_parser()
    args = parser.parse_args()

    # Explicit raises instead of assert, so the checks survive "python -O".
    if not os.path.exists(args.in_table):
        raise FileNotFoundError("--table file \"%s\" not found" % (args.in_table))
    if not os.path.exists(args.in_genome):
        raise FileNotFoundError("--genome file \"%s\" not found" % (args.in_genome))

    # Check given paths and IDs.
    id2path_dic = map_ids_to_paths(args.in_ids, args.in_paths)

    # Read in table.
    print("Read in --table ... ")
    table_rows = read_batch_table(args.in_table, id2path_dic)

    # Construct RBPBench batch call.
    batch_call = build_batch_call(args, table_rows)

    # Execute RBPBench batch call.
    print("")
    print("EXECUTING CALL:\n%s" % (format_batch_call(batch_call)))
    output = run_batch_call(batch_call)
    print("")
    print("RUN OUTPUT:\n%s" % (output))
    print("")
    print("DONE.")